--- /dev/null
+PROTOCOL = 'GPU_VIPER'
+TARGET_ISA = 'x86'
+TARGET_GPU_ISA = 'gcn3'
+BUILD_GPU = True
+CPU_MODELS = 'AtomicSimpleCPU,O3CPU,TimingSimpleCPU'
maxOutstandingReqs = options.L%(level)dMaxOutstandingReqs,\
accessDistance = options.L%(level)dAccessDistanceStat,\
clk_domain = SrcClockDomain(\
- clock = options.GPUClock,\
+ clock = options.gpu_clock,\
voltage_domain = VoltageDomain(\
voltage = options.gpu_voltage)))" % locals()
return constructor_call
coalescingWindow = options.L%(level)dCoalescingWindow,\
disableCoalescing = options.L%(level)dDisableCoalescing,\
clk_domain = SrcClockDomain(\
- clock = options.GPUClock,\
+ clock = options.gpu_clock,\
voltage_domain = VoltageDomain(\
voltage = options.gpu_voltage)))" % locals()
return constructor_call
-def create_TLB_Coalescer(options, my_level, my_index, TLB_name, Coalescer_name):
- # arguments: options, TLB level, number of private structures for this Level,
- # TLB name and Coalescer name
+def create_TLB_Coalescer(options, my_level, my_index, tlb_name,
+ coalescer_name):
+ # arguments: options, TLB level, number of private structures for this
+ # Level, TLB name and Coalescer name
for i in range(my_index):
- TLB_name.append(eval(TLB_constructor(my_level)))
- Coalescer_name.append(eval(Coalescer_constructor(my_level)))
+ tlb_name.append(eval(TLB_constructor(my_level)))
+ coalescer_name.append(eval(Coalescer_constructor(my_level)))
def config_tlb_hierarchy(options, system, shader_idx):
- n_cu = options.num_compute_units
- # Make this configurable now, instead of the hard coded val. The dispatcher
- # is always the last item in the system.cpu list.
- dispatcher_idx = len(system.cpu) - 1
+ n_cu = options.cu_per_sa * options.sa_per_complex * \
+ options.num_gpu_complexes
if options.TLB_config == "perLane":
num_TLBs = 64 * n_cu
print("Bad option for TLB Configuration.")
sys.exit(1)
- #----------------------------------------------------------------------------------------
+ #-------------------------------------------------------------------------
# A visual representation of the TLB hierarchy
# for ease of configuration
- # < Modify here the width and the number of levels if you want a different configuration >
- # width is the number of TLBs of the given type (i.e., D-TLB, I-TLB etc) for this level
- L1 = [{'name': 'sqc', 'width': options.num_sqc, 'TLBarray': [], 'CoalescerArray': []},
- {'name': 'dispatcher', 'width': 1, 'TLBarray': [], 'CoalescerArray': []},
- {'name': 'l1', 'width': num_TLBs, 'TLBarray': [], 'CoalescerArray': []}]
+ # < Modify here the width and the number of levels if you want a different
+ # configuration >
+ # width is the number of TLBs of the given type (i.e., D-TLB, I-TLB etc)
+ # for this level
+ L1 = [{'name': 'sqc', 'width': options.num_sqc, 'TLBarray': [],
+ 'CoalescerArray': []},
+ {'name': 'scalar', 'width' : options.num_scalar_cache,
+ 'TLBarray': [], 'CoalescerArray': []},
+ {'name': 'l1', 'width': num_TLBs, 'TLBarray': [],
+ 'CoalescerArray': []}]
L2 = [{'name': 'l2', 'width': 1, 'TLBarray': [], 'CoalescerArray': []}]
L3 = [{'name': 'l3', 'width': 1, 'TLBarray': [], 'CoalescerArray': []}]
TLB_hierarchy = [L1, L2, L3]
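
As an editorial aside (a sketch, not part of this change): the Python below shows how the per-level widths above work out under assumed option values. Only the perLane case appears in this hunk, so only that width is reproduced.

# Hypothetical option values, chosen only to make the arithmetic concrete.
cu_per_sa, sa_per_complex, num_gpu_complexes = 4, 1, 1
num_sqc, num_scalar_cache = 1, 1

n_cu = cu_per_sa * sa_per_complex * num_gpu_complexes   # 4 CUs in total
num_TLBs = 64 * n_cu                                    # perLane: one D-TLB per lane

# Width per TLB type, mirroring the L1/L2/L3 lists above.
l1_widths = {'sqc': num_sqc, 'scalar': num_scalar_cache, 'l1': num_TLBs}
l2_widths = {'l2': 1}
l3_widths = {'l3': 1}
print(l1_widths, l2_widths, l3_widths)  # {'sqc': 1, 'scalar': 1, 'l1': 256} ...
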
- #----------------------------------------------------------------------------------------
+ #-------------------------------------------------------------------------
    # Create the hierarchy
# Call the appropriate constructors and add objects to the system
for tlb in range(tlb_per_cu):
exec('system.cpu[%d].CUs[%d].translation_port[%d] = \
system.l1_coalescer[%d].slave[%d]' % \
- (shader_idx, cu_idx, tlb, cu_idx*tlb_per_cu+tlb, 0))
+ (shader_idx, cu_idx, tlb,
+ cu_idx*tlb_per_cu+tlb, 0))
else:
exec('system.cpu[%d].CUs[%d].translation_port[%d] = \
system.l1_coalescer[%d].slave[%d]' % \
- (shader_idx, cu_idx, tlb_per_cu, cu_idx / (n_cu / num_TLBs), cu_idx % (n_cu / num_TLBs)))
-
- elif name == 'dispatcher': # Dispatcher TLB
- for index in range(TLB_type['width']):
- exec('system.cpu[%d].translation_port = \
- system.dispatcher_coalescer[%d].slave[0]' % \
- (dispatcher_idx, index))
+ (shader_idx, cu_idx, tlb_per_cu,
+ cu_idx / (n_cu / num_TLBs),
+ cu_idx % (n_cu / num_TLBs)))
elif name == 'sqc': # I-TLB
for index in range(n_cu):
sqc_tlb_index = index / options.cu_per_sqc
exec('system.cpu[%d].CUs[%d].sqc_tlb_port = \
system.sqc_coalescer[%d].slave[%d]' % \
(shader_idx, index, sqc_tlb_index, sqc_tlb_port_id))
-
+ elif name == 'scalar': # Scalar D-TLB
+ for index in range(n_cu):
+ scalar_tlb_index = index / options.cu_per_scalar_cache
+ scalar_tlb_port_id = index % options.cu_per_scalar_cache
+ exec('system.cpu[%d].CUs[%d].scalar_tlb_port = \
+ system.scalar_coalescer[%d].slave[%d]' % \
+ (shader_idx, index, scalar_tlb_index,
+ scalar_tlb_port_id))
# Connect the memSidePorts (masters) of all the TLBs with the
# cpuSidePorts (slaves) of the Coalescers of the next level
DPRINTF(GPUExec, "CU%d: decrease ref ctr WG[%d] to [%d]\n",
wf->computeUnit->cu_id, wf->wgId, refCount);
- wf->computeUnit->registerManager.freeRegisters(wf);
+ wf->computeUnit->registerManager->freeRegisters(wf);
wf->computeUnit->completedWfs++;
wf->computeUnit->activeWaves--;
*/
bool misaligned_acc = split_addr > vaddr;
- RequestPtr req = new Request(0, vaddr, req_size, 0,
+ RequestPtr req = std::make_shared<Request>(vaddr, req_size, 0,
gpuDynInst->computeUnit()->masterId(), 0,
gpuDynInst->wfDynId);
pkt2->dataStatic(gpuDynInst->scalar_data + req1->getSize());
gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt1);
gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt2);
- delete req;
} else {
gpuDynInst->numScalarReqs = 1;
gpuDynInst->setRequestFlags(req);
*/
bool misaligned_acc = split_addr > vaddr;
- RequestPtr req = new Request(0, vaddr, req_size, 0,
+ RequestPtr req = std::make_shared<Request>(vaddr, req_size, 0,
gpuDynInst->computeUnit()->masterId(), 0,
gpuDynInst->wfDynId);
pkt2->dataStatic(gpuDynInst->scalar_data + req1->getSize());
gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt1);
gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt2);
- delete req;
} else {
gpuDynInst->numScalarReqs = 1;
gpuDynInst->setRequestFlags(req);
if (gpuDynInst->exec_mask[lane]) {
Addr vaddr = gpuDynInst->addr[lane];
- RequestPtr req = new Request(0, vaddr, sizeof(T), 0,
+ RequestPtr req = std::make_shared<Request>(vaddr,
+ sizeof(T), 0,
gpuDynInst->computeUnit()->masterId(), 0,
gpuDynInst->wfDynId);
if (gpuDynInst->exec_mask[lane]) {
Addr vaddr = gpuDynInst->addr[lane];
- RequestPtr req = new Request(0, vaddr, sizeof(T), 0,
+ RequestPtr req = std::make_shared<Request>(vaddr,
+ sizeof(T), 0,
gpuDynInst->computeUnit()->masterId(),
0, gpuDynInst->wfDynId);
{
// create request and set flags
gpuDynInst->statusBitVector = VectorMask(1);
- Request *req = new Request(0, 0, 0, 0,
+ RequestPtr req = std::make_shared<Request>(0, 0, 0,
gpuDynInst->computeUnit()->
masterId(), 0,
gpuDynInst->wfDynId);
if (gpuDynInst->exec_mask[lane]) {
Addr vaddr = gpuDynInst->addr[lane];
- RequestPtr req = new Request(0, vaddr, sizeof(T), 0,
+ RequestPtr req = std::make_shared<Request>(vaddr,
+ sizeof(T), 0,
gpuDynInst->computeUnit()->masterId(), 0,
gpuDynInst->wfDynId);
if (gpuDynInst->exec_mask[lane]) {
Addr vaddr = gpuDynInst->addr[lane];
- RequestPtr req = new Request(0, vaddr, req_size, 0,
+ RequestPtr req = std::make_shared<Request>(vaddr, req_size,
+ 0,
gpuDynInst->computeUnit()->masterId(), 0,
gpuDynInst->wfDynId);
if (gpuDynInst->exec_mask[lane]) {
Addr vaddr = gpuDynInst->addr[lane];
- RequestPtr req = new Request(0, vaddr, sizeof(T), 0,
+ RequestPtr req = std::make_shared<Request>(vaddr,
+ sizeof(T), 0,
gpuDynInst->computeUnit()->masterId(),
0, gpuDynInst->wfDynId);
if (gpuDynInst->exec_mask[lane]) {
Addr vaddr = gpuDynInst->addr[lane];
- RequestPtr req = new Request(0, vaddr, req_size, 0,
+ RequestPtr req = std::make_shared<Request>(vaddr, req_size,
+ 0,
gpuDynInst->computeUnit()->masterId(),
0, gpuDynInst->wfDynId);
if (gpuDynInst->exec_mask[lane]) {
Addr vaddr = gpuDynInst->addr[lane];
- RequestPtr req = new Request(0, vaddr, sizeof(T), 0,
+ RequestPtr req = std::make_shared<Request>(vaddr,
+ sizeof(T), 0,
gpuDynInst->computeUnit()->masterId(), 0,
gpuDynInst->wfDynId,
gpuDynInst->makeAtomicOpFunctor<T>(
ComputeUnit *cu = _gpuDynInst->computeUnit();
for (auto i = 0; i < NumDwords; ++i) {
- int vgprIdx = cu->registerManager.mapVgpr(wf, _opIdx + i);
+ int vgprIdx = cu->registerManager->mapVgpr(wf, _opIdx + i);
vrfData[i] = &cu->vrf[wf->simdId]->readWriteable(vgprIdx);
DPRINTF(GPUVRF, "Read v[%d]\n", vgprIdx);
? _gpuDynInst->exec_mask : wf->execMask();
if (NumDwords == 1) {
- int vgprIdx = cu->registerManager.mapVgpr(wf, _opIdx);
+ int vgprIdx = cu->registerManager->mapVgpr(wf, _opIdx);
vrfData[0] = &cu->vrf[wf->simdId]->readWriteable(vgprIdx);
assert(vrfData[0]);
auto reg_file_vgpr = vrfData[0]->template as<VecElemU32>();
DPRINTF(GPUVRF, "Write v[%d]\n", vgprIdx);
cu->vrf[wf->simdId]->printReg(wf, vgprIdx);
} else if (NumDwords == 2) {
- int vgprIdx0 = cu->registerManager.mapVgpr(wf, _opIdx);
- int vgprIdx1 = cu->registerManager.mapVgpr(wf, _opIdx + 1);
+ int vgprIdx0 = cu->registerManager->mapVgpr(wf, _opIdx);
+ int vgprIdx1 = cu->registerManager->mapVgpr(wf, _opIdx + 1);
vrfData[0] = &cu->vrf[wf->simdId]->readWriteable(vgprIdx0);
vrfData[1] = &cu->vrf[wf->simdId]->readWriteable(vgprIdx1);
assert(vrfData[0]);
if (_opIdx == REG_VCC_LO) {
sgprIdx = cu->registerManager
- .mapSgpr(wf, wf->reservedScalarRegs - 2 + dword);
+ ->mapSgpr(wf, wf->reservedScalarRegs - 2 + dword);
} else if (_opIdx == REG_FLAT_SCRATCH_HI) {
sgprIdx = cu->registerManager
- .mapSgpr(wf, wf->reservedScalarRegs - 3 + dword);
+ ->mapSgpr(wf, wf->reservedScalarRegs - 3 + dword);
} else if (_opIdx == REG_FLAT_SCRATCH_LO) {
assert(NumDwords == 1);
sgprIdx = cu->registerManager
- .mapSgpr(wf, wf->reservedScalarRegs - 4 + dword);
+ ->mapSgpr(wf, wf->reservedScalarRegs - 4 + dword);
} else {
- sgprIdx = cu->registerManager.mapSgpr(wf, _opIdx + dword);
+ sgprIdx = cu->registerManager->mapSgpr(wf, _opIdx + dword);
}
assert(sgprIdx > -1);
* with new extensions, it will likely be wrong to just arbitrarily
* grab context zero.
*/
- auto process = sys->getThreadContext(0)->getProcessPtr();
+ auto process = sys->threads[0]->getProcessPtr();
if (!process->pTable->translate(vaddr, paddr)) {
fatal("failed translation: vaddr 0x%x\n", vaddr);
DPRINTF(HSADriver, "amdkfd doorbell mapped to %xp\n", start);
return start;
}
+
+/**
+ * Forward relevant parameters to the packet processor; the queue ID
+ * is used to link the doorbell. Queue IDs are not re-used in the
+ * current implementation, and we allocate only one page
+ * (4096 bytes) for doorbells, so check that this queue ID can
+ * be mapped into that page.
+ */
+void
+HSADriver::allocateQueue(PortProxy &mem_proxy, Addr ioc_buf)
+{
+ TypedBufferArg<kfd_ioctl_create_queue_args> args(ioc_buf);
+ args.copyIn(mem_proxy);
+
+ if (queueId >= 0x1000) {
+ fatal("%s: Exceeded maximum number of HSA queues allowed\n", name());
+ }
+
+ args->queue_id = queueId++;
+ auto &hsa_pp = device->hsaPacketProc();
+ hsa_pp.setDeviceQueueDesc(args->read_pointer_address,
+ args->ring_base_address, args->queue_id,
+ args->ring_size);
+ args.copyOut(mem_proxy);
+}
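
As a quick illustration (a sketch with assumed constants, not gem5 code) of the bound the fatal() above enforces: one 4096-byte page is reserved for doorbells, so queue IDs are handed out monotonically and must stay below 0x1000.

DOORBELL_PAGE_BYTES = 4096  # one page is allocated for doorbells (see comment above)
MAX_QUEUE_IDS = 0x1000      # IDs at or above this would fall outside that page

def allocate_queue_id(next_id):
    """Hand out the next HSA queue ID, or fail once the page is exhausted."""
    if next_id >= MAX_QUEUE_IDS:
        raise RuntimeError("Exceeded maximum number of HSA queues allowed")
    return next_id, next_id + 1

qid, next_id = allocate_queue_id(0)   # first queue gets ID 0
assert (qid, next_id) == (0, 1)
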
struct HSADriverParams;
class HSADevice;
-class SETranslatingPortProxy;
+class PortProxy;
class ThreadContext;
class HSADriver : public EmulatedDriver
HSADevice *device;
uint32_t queueId;
- void allocateQueue(const SETranslatingPortProxy &mem_proxy,
- Addr ioc_buf_addr);
+ void allocateQueue(PortProxy &mem_proxy, Addr ioc_buf);
};
#endif // __DEV_HSA_HSA_DRIVER_HH__
// Grab the process and try to translate the virtual address with it; with
// new extensions, it will likely be wrong to just arbitrarily grab context
// zero.
- auto process = sys->getThreadContext(0)->getProcessPtr();
+ auto process = sys->threads[0]->getProcessPtr();
if (!process->pTable->translate(vaddr, paddr))
fatal("failed translation: vaddr 0x%x\n", vaddr);
* The reason for this is that the DMASequencer does
* not support atomic operations.
*/
- auto tc = sys->getThreadContext(0);
+ auto tc = sys->threads[0];
auto &virt_proxy = tc->getVirtProxy();
TypedBufferArg<uint64_t> prev_signal(signal_addr);
prev_signal.copyIn(virt_proxy);
// We use the same mapping function used by hsa runtime to do this mapping
//
// Originally
- // #define VOID_PTR_ADD32(ptr,n) \
+ // #define VOID_PTR_ADD32(ptr,n)
// (void*)((uint32_t*)(ptr) + n)/*ptr + offset*/
// (Addr)VOID_PTR_ADD32(0, queue_id)
Addr db_offset = queue_id;
// `(Addr)(VOID_PRT_ADD32(0, queue_id))`
//
// Originally
- // #define VOID_PTR_ADD32(ptr,n) \
+ // #define VOID_PTR_ADD32(ptr,n)
// (void*)((uint32_t*)(ptr) + n)/*ptr + offset*/
// (Addr)VOID_PTR_ADD32(0, queue_id)
Addr db_offset = queue_id;
+# Copyright (c) 2015-2018 Advanced Micro Devices, Inc.
+# All rights reserved.
#
-# Copyright (c) 2015 Advanced Micro Devices, Inc.
-# All rights reserved.
+# For use for simulation and test purposes only
#
-# For use for simulation and test purposes only
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
#
-# 1. Redistributions of source code must retain the above copyright notice,
-# this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-# this list of conditions and the following disclaimer in the documentation
-# and/or other materials provided with the distribution.
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from this
+# software without specific prior written permission.
#
-# 3. Neither the name of the copyright holder nor the names of its contributors
-# may be used to endorse or promote products derived from this software
-# without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
-# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-# POSSIBILITY OF SUCH DAMAGE.
-#
-# Author: Steve Reinhardt
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
#
+# Authors: Steve Reinhardt
from m5.defines import buildEnv
from m5.params import *
from m5.proxy import *
from m5.SimObject import SimObject
+from m5.objects.Bridge import Bridge
from m5.objects.ClockedObject import ClockedObject
from m5.objects.Device import DmaDevice
-from m5.objects.Process import EmulatedDriver
-from m5.objects.Bridge import Bridge
+from m5.objects.HSADevice import HSADevice
+from m5.objects.HSADriver import HSADriver
from m5.objects.LdsState import LdsState
+from m5.objects.Process import EmulatedDriver
class PrefetchType(Enum): vals = [
'PF_CU',
'PF_END',
]
-class VectorRegisterFile(SimObject):
+class PoolManager(SimObject):
+ type = 'PoolManager'
+ abstract = True
+ cxx_header = "gpu-compute/pool_manager.hh"
+
+ min_alloc = Param.Int(4, 'min number of VGPRs allocated per WF')
+ pool_size = Param.Int(2048, 'number of vector registers per SIMD')
+
+# The simple pool manager only allows one workgroup to
+# be executing on a CU at any given time.
+class SimplePoolManager(PoolManager):
+ type = 'SimplePoolManager'
+ cxx_class = 'SimplePoolManager'
+ cxx_header = "gpu-compute/simple_pool_manager.hh"
+
+class RegisterFile(SimObject):
+ type = 'RegisterFile'
+ cxx_class = 'RegisterFile'
+ cxx_header = 'gpu-compute/register_file.hh'
+
+ simd_id = Param.Int(-1, 'SIMD ID associated with this Register File')
+ num_regs = Param.Int(2048, 'number of registers in this RF')
+ wf_size = Param.Int(64, 'Wavefront size (in work items)')
+
+class ScalarRegisterFile(RegisterFile):
+ type = 'ScalarRegisterFile'
+ cxx_class = 'ScalarRegisterFile'
+ cxx_header = 'gpu-compute/scalar_register_file.hh'
+
+class VectorRegisterFile(RegisterFile):
type = 'VectorRegisterFile'
cxx_class = 'VectorRegisterFile'
cxx_header = 'gpu-compute/vector_register_file.hh'
- simd_id = Param.Int(0, 'SIMD ID associated with this VRF')
- num_regs_per_simd = Param.Int(2048, 'number of vector registers per SIMD')
- wfSize = Param.Int(64, 'Wavefront size (in work items)')
- min_alloc = Param.Int(4, 'min number of VGPRs allocated per WF')
+class RegisterManager(SimObject):
+ type = 'RegisterManager'
+ cxx_class = 'RegisterManager'
+ cxx_header = 'gpu-compute/register_manager.hh'
+
+ policy = Param.String("static", "Register Manager Policy")
+ vrf_pool_managers = VectorParam.PoolManager('VRF Pool Managers')
+ srf_pool_managers = VectorParam.PoolManager('SRF Pool Managers')
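
The new register-file and register-manager SimObjects are meant to be composed per CU from a gem5 config script. The fragment below is a hypothetical wiring sketch only (counts and values are assumptions, and required ComputeUnit parameters such as wavefronts are omitted); it is not something this patch adds.

n_simds = 4  # assumed, matching the num_SIMDs default below
cu = ComputeUnit(cu_id=0, num_SIMDs=n_simds,
                 vector_register_file=[VectorRegisterFile(simd_id=i)
                                       for i in range(n_simds)],
                 scalar_register_file=[ScalarRegisterFile(simd_id=i)
                                       for i in range(n_simds)],
                 register_manager=RegisterManager(
                     policy="static",
                     vrf_pool_managers=[SimplePoolManager(pool_size=2048)
                                        for _ in range(n_simds)],
                     srf_pool_managers=[SimplePoolManager(pool_size=2048)
                                        for _ in range(n_simds)]))
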
class Wavefront(SimObject):
type = 'Wavefront'
simdId = Param.Int('SIMD id (0-ComputeUnit.num_SIMDs)')
wf_slot_id = Param.Int('wavefront id (0-ComputeUnit.max_wfs)')
- wfSize = Param.Int(64, 'Wavefront size (in work items)')
+ wf_size = Param.Int(64, 'Wavefront size (in work items)')
+ max_ib_size = Param.Int(13, 'Maximum size (in number of insts) of the '
+ 'instruction buffer (IB).')
+# Most of the default values here are obtained from the
+# AMD Graphics Core Next (GCN) Architecture whitepaper.
class ComputeUnit(ClockedObject):
type = 'ComputeUnit'
cxx_class = 'ComputeUnit'
cxx_header = 'gpu-compute/compute_unit.hh'
wavefronts = VectorParam.Wavefront('Number of wavefronts')
- wfSize = Param.Int(64, 'Wavefront size (in work items)')
+    # Wavefront size is 64. This is configurable; however, changing
+ # this value to anything other than 64 will likely cause errors.
+ wf_size = Param.Int(64, 'Wavefront size (in work items)')
num_SIMDs = Param.Int(4, 'number of SIMD units per CU')
+ num_scalar_cores = Param.Int(1, 'number of Scalar cores per CU')
+ num_scalar_mem_pipes = Param.Int(1, 'number of Scalar memory pipelines '\
+ 'per CU')
+ simd_width = Param.Int(16, 'width (number of lanes) per SIMD unit')
+
+ operand_network_length = Param.Int(1, 'number of pipe stages of operand '\
+ 'network')
spbypass_pipe_length = Param.Int(4, 'vector ALU Single Precision bypass '\
'latency')
- dpbypass_pipe_length = Param.Int(8, 'vector ALU Double Precision bypass '\
+ dpbypass_pipe_length = Param.Int(4, 'vector ALU Double Precision bypass '\
'latency')
-
+ scalar_pipe_length = Param.Int(1, 'number of pipe stages per scalar ALU')
issue_period = Param.Int(4, 'number of cycles per issue period')
+
+ vrf_gm_bus_latency = Param.Int(1, 'number of cycles per use of VRF to '\
+ 'GM bus')
+ srf_scm_bus_latency = Param.Int(1, 'number of cycles per use of SRF '\
+ 'to Scalar Mem bus')
+ vrf_lm_bus_latency = Param.Int(1, 'number of cycles per use of VRF to '\
+ 'LM bus')
+
num_global_mem_pipes = Param.Int(1,'number of global memory pipes per CU')
num_shared_mem_pipes = Param.Int(1,'number of shared memory pipes per CU')
- n_wf = Param.Int(1, 'Number of wavefront slots per SIMD')
- mem_req_latency = Param.Int(9, "Latency for request from the cu to ruby. "\
- "Represents the pipeline to reach the TCP and "\
- "specified in GPU clock cycles")
- mem_resp_latency = Param.Int(9, "Latency for responses from ruby to the "\
- "cu. Represents the pipeline between the TCP "\
- "and cu as well as TCP data array access. "\
- "Specified in GPU clock cycles")
+ n_wf = Param.Int(10, 'Number of wavefront slots per SIMD')
+ mem_req_latency = Param.Int(50, "Latency for request from the cu to ruby. "\
+ "Represents the pipeline to reach the TCP "\
+ "and specified in GPU clock cycles")
+ mem_resp_latency = Param.Int(50, "Latency for responses from ruby to the "\
+ "cu. Represents the pipeline between the "\
+ "TCP and cu as well as TCP data array "\
+ "access. Specified in GPU clock cycles")
system = Param.System(Parent.any, "system object")
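
The mem_req_latency and mem_resp_latency parameters above are given in GPU clock cycles; later in this patch (compute_unit.cc) they are multiplied by the CU's clock period to obtain tick latencies. A small arithmetic sketch, assuming a 1 GHz GPU clock (gem5's default tick is 1 ps):

gpu_clock_ghz = 1.0                              # assumed clock
clock_period_ticks = int(1e3 / gpu_clock_ghz)    # 1 ns == 1000 ticks at 1 ps/tick

mem_req_latency_cycles = 50
mem_resp_latency_cycles = 50

# mirrors req_tick_latency = mem_req_latency * clk_domain->clockPeriod()
req_tick_latency = mem_req_latency_cycles * clock_period_ticks    # 50000 ticks (50 ns)
resp_tick_latency = mem_resp_latency_cycles * clock_period_ticks  # 50000 ticks (50 ns)
print(req_tick_latency, resp_tick_latency)
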
cu_id = Param.Int('CU id')
- vrf_to_coalescer_bus_width = Param.Int(32, "VRF->Coalescer data bus width "\
- "in bytes")
- coalescer_to_vrf_bus_width = Param.Int(32, "Coalescer->VRF data bus width "\
- "in bytes")
+ vrf_to_coalescer_bus_width = Param.Int(64, "VRF->Coalescer data bus "\
+ "width in bytes")
+ coalescer_to_vrf_bus_width = Param.Int(64, "Coalescer->VRF data bus "\
+ "width in bytes")
memory_port = VectorMasterPort("Port to the memory system")
translation_port = VectorMasterPort('Port to the TLB hierarchy')
    sqc_port = MasterPort("Port to the SQC (I-cache)")
sqc_tlb_port = MasterPort("Port to the TLB for the SQC (I-cache)")
+ scalar_port = MasterPort("Port to the scalar data cache")
+ scalar_tlb_port = MasterPort("Port to the TLB for the scalar data cache")
perLaneTLB = Param.Bool(False, "enable per-lane TLB")
prefetch_depth = Param.Int(0, "Number of prefetches triggered at a time"\
"(0 turns off prefetching)")
"from last mem req in lane of "\
"CU|Phase|Wavefront")
execPolicy = Param.String("OLDEST-FIRST", "WF execution selection policy");
- xactCasMode = Param.Bool(False, "Behavior of xact_cas_load magic instr.");
debugSegFault = Param.Bool(False, "enable debugging GPU seg faults")
functionalTLB = Param.Bool(False, "Assume TLB causes no delay")
localMemBarrier = Param.Bool(False, "Assume Barriers do not wait on "\
"kernel end")
- countPages = Param.Bool(False, "Generate per-CU file of all pages touched "\
- "and how many times")
+ countPages = Param.Bool(False, "Generate per-CU file of all pages "\
+ "touched and how many times")
+ scalar_mem_queue_size = Param.Int(32, "Number of entries in scalar "\
+ "memory pipeline's queues")
global_mem_queue_size = Param.Int(256, "Number of entries in the global "
"memory pipeline's queues")
local_mem_queue_size = Param.Int(256, "Number of entries in the local "
"memory pipeline's queues")
+ max_wave_requests = Param.Int(64, "number of pending vector memory "\
+ "requests per wavefront")
max_cu_tokens = Param.Int(4, "Maximum number of tokens, i.e., the number"\
" of instructions that can be sent to coalescer")
ldsBus = Bridge() # the bridge between the CU and its LDS
vector_register_file = VectorParam.VectorRegisterFile("Vector register "\
"file")
+
+ scalar_register_file = VectorParam.ScalarRegisterFile("Scalar register "\
+ "file")
out_of_order_data_delivery = Param.Bool(False, "enable OoO data delivery"
" in the GM pipeline")
+ register_manager = Param.RegisterManager("Register Manager")
+ fetch_depth = Param.Int(2, 'number of i-cache lines that may be '
+ 'buffered in the fetch unit.')
class Shader(ClockedObject):
type = 'Shader'
cxx_class = 'Shader'
cxx_header = 'gpu-compute/shader.hh'
-
CUs = VectorParam.ComputeUnit('Number of compute units')
- n_wf = Param.Int(1, 'Number of wavefront slots per SIMD')
+ gpu_cmd_proc = Param.GPUCommandProcessor('Command processor for GPU')
+ dispatcher = Param.GPUDispatcher('GPU workgroup dispatcher')
+ n_wf = Param.Int(10, 'Number of wavefront slots per SIMD')
impl_kern_boundary_sync = Param.Bool(True, """Insert acq/rel packets into
- ruby at kernel boundaries""")
- separate_acquire_release = Param.Bool(False,
- """Do ld_acquire/st_release generate separate requests for the
- acquire and release?""")
+ ruby at kernel boundaries""")
globalmem = Param.MemorySize('64kB', 'Memory size')
timing = Param.Bool(False, 'timing memory accesses')
cpu_pointer = Param.BaseCPU(NULL, "pointer to base CPU")
translation = Param.Bool(False, "address translation");
+ timer_period = Param.Clock('10us', "system timer period")
+ idlecu_timeout = Param.Tick(0, "Idle CU watchdog timeout threshold")
+ max_valu_insts = Param.Int(0, "Maximum vALU insts before exiting")
-class ClDriver(EmulatedDriver):
- type = 'ClDriver'
- cxx_header = 'gpu-compute/cl_driver.hh'
- codefile = VectorParam.String('code file name(s)')
+class GPUComputeDriver(HSADriver):
+ type = 'GPUComputeDriver'
+ cxx_header = 'gpu-compute/gpu_compute_driver.hh'
-class GpuDispatcher(DmaDevice):
- type = 'GpuDispatcher'
+class GPUDispatcher(SimObject):
+ type = 'GPUDispatcher'
cxx_header = 'gpu-compute/dispatcher.hh'
- # put at 8GB line for now
- pio_addr = Param.Addr(0x200000000, "Device Address")
- pio_latency = Param.Latency('1ns', "Programmed IO latency")
- shader_pointer = Param.Shader('pointer to shader')
- translation_port = MasterPort('Port to the dispatcher TLB')
- cpu = Param.BaseCPU("CPU to wake up on kernel completion")
-
- cl_driver = Param.ClDriver('pointer to driver')
-
-class MemType(Enum): vals = [
- 'M_U8',
- 'M_U16',
- 'M_U32',
- 'M_U64',
- 'M_S8',
- 'M_S16',
- 'M_S32',
- 'M_S64',
- 'M_F16',
- 'M_F32',
- 'M_F64',
- ]
+
+class GPUCommandProcessor(HSADevice):
+ type = 'GPUCommandProcessor'
+ cxx_header = 'gpu-compute/gpu_command_processor.hh'
+ dispatcher = Param.GPUDispatcher('workgroup dispatcher for the GPU')
class StorageClassType(Enum): vals = [
'SC_SPILL',
'SC_GLOBAL',
- 'SC_SHARED',
+ 'SC_GROUP',
'SC_PRIVATE',
'SC_READONLY',
'SC_KERNARG',
+ 'SC_ARG',
'SC_NONE',
]
-
-class RegisterType(Enum): vals = [
- 'RT_VECTOR',
- 'RT_SCALAR',
- 'RT_CONDITION',
- 'RT_HARDWARE',
- 'RT_NONE',
- ]
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
-# 3. Neither the name of the copyright holder nor the names of its contributors
-# may be used to endorse or promote products derived from this software
-# without specific prior written permission.
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from this
+# software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# Op types
'ALU', # ALU op
'Branch', # Branch instruction
+    'CondBranch', # Conditional Branch instruction
'Nop', # No-op (no effect at all)
- 'Return', # Return instruction
+ 'Return', # Subroutine return instruction
+ 'EndOfKernel', # Kernel termination instruction
+ 'KernelLaunch', # Kernel launch inst
'UnconditionalJump', #
'SpecialOp', # Special op
'Waitcnt', # Is a waitcnt instruction
# Memory ops
'MemBarrier', # Barrier instruction
- 'MemFence', # Memory fence instruction
+ 'MemSync', # Synchronizing instruction
'MemoryRef', # References memory (load, store, or atomic)
'Flat', # Flat memory op
'Load', # Reads from memory
'WritesSCC', # The instruction writes SCC
'ReadsVCC', # The instruction reads VCC
'WritesVCC', # The instruction writes VCC
+ 'ReadsEXEC', # The instruction reads Exec Mask
+ 'WritesEXEC', # The instruction writes Exec Mask
+ 'ReadsMode', # The instruction reads Mode register
+ 'WritesMode', # The instruction writes Mode register
+ 'IgnoreExec', # The instruction ignores the Exec Mask
+ 'IsSDWA', # The instruction is a SDWA instruction
+ 'IsDPP', # The instruction is a DPP instruction
# Atomic OP types
'AtomicAnd',
'AtomicMax',
'AtomicMin',
- # Memory order flags
- 'RelaxedOrder',
- 'Acquire', # Has acquire semantics
- 'Release', # Has release semantics
- 'AcquireRelease', # Has acquire and release semantics
- 'NoOrder', # Has no ordering restrictions
-
# Segment access flags
'ArgSegment', # Accesses the arg segment
'GlobalSegment', # Accesses global memory
'SpillSegment', # Accesses the spill segment
'NoSegment', # Does not have an associated segment
- # Scope flags
- 'WorkitemScope',
- 'WavefrontScope',
- 'WorkgroupScope',
- 'DeviceScope',
- 'SystemScope',
- 'NoScope', # Does not have an associated scope
-
# Coherence flags
- 'GloballyCoherent', # Coherent with other workitems on same device
- 'SystemCoherent' # Coherent with a different device, or the host
+ 'GloballyCoherent', # Coherent with other work-items on same device
+ 'SystemCoherent', # Coherent with a different device, or the host
+
+ # Floating-point flags
+ 'F16', # F16 operation
+ 'F32', # F32 operation
+ 'F64', # F64 operation
+
+ # MAC, MAD, FMA
+ 'FMA', # FMA
+ 'MAC', # MAC
+ 'MAD' # MAD
]
SimObject('LdsState.py')
SimObject('X86GPUTLB.py')
-if env['TARGET_GPU_ISA'] == 'hsail':
- Source('brig_object.cc')
- Source('hsail_code.cc')
-
-Source('cl_driver.cc')
Source('compute_unit.cc')
-Source('condition_register_state.cc')
Source('dispatcher.cc')
Source('exec_stage.cc')
Source('fetch_stage.cc')
Source('fetch_unit.cc')
Source('global_memory_pipeline.cc')
+Source('gpu_command_processor.cc')
+Source('gpu_compute_driver.cc')
Source('gpu_dyn_inst.cc')
Source('gpu_exec_context.cc')
Source('gpu_static_inst.cc')
Source('gpu_tlb.cc')
-Source('hsa_object.cc')
-Source('kernel_cfg.cc')
Source('lds_state.cc')
Source('local_memory_pipeline.cc')
Source('pool_manager.cc')
+Source('register_file.cc')
+Source('register_manager.cc')
+Source('scalar_memory_pipeline.cc')
+Source('scalar_register_file.cc')
Source('schedule_stage.cc')
Source('scheduler.cc')
Source('scoreboard_check_stage.cc')
Source('shader.cc')
Source('simple_pool_manager.cc')
+Source('static_register_manager_policy.cc')
Source('tlb_coalescer.cc')
Source('vector_register_file.cc')
-Source('vector_register_state.cc')
Source('wavefront.cc')
-DebugFlag('BRIG')
DebugFlag('GPUCoalescer')
+DebugFlag('GPUCommandProc')
+DebugFlag('GPUDriver')
+DebugFlag('GPUInitAbi')
DebugFlag('GPUDisp')
DebugFlag('GPUExec')
DebugFlag('GPUFetch')
-DebugFlag('GPUHsailCFInfo')
+DebugFlag('GPUKernelInfo')
DebugFlag('GPUMem')
DebugFlag('GPUPort')
DebugFlag('GPUPrefetch')
DebugFlag('GPUReg')
+DebugFlag('GPURename')
+DebugFlag('GPURF')
+DebugFlag('GPURfState')
+DebugFlag('GPUSched')
+DebugFlag('GPUShader')
+DebugFlag('GPUSRF')
DebugFlag('GPUSync')
DebugFlag('GPUTLB')
DebugFlag('GPUVRF')
-DebugFlag('HSALoader')
-DebugFlag('HSAIL')
-DebugFlag('HSAILObject')
+DebugFlag('GPUVRFSched')
+DebugFlag('GPUWgLatency')
DebugFlag('Predictor')
DebugFlag('WavefrontStack')
CompoundFlag('GPUALL', ['GPUCoalescer', 'GPUDisp', 'GPUExec', 'GPUFetch',
- 'GPUMem', 'GPUPort', 'GPUSync', 'GPUTLB', 'HSAIL',
- 'GPUVRF'])
+ 'GPUMem', 'GPUPort', 'GPUSched', 'GPUSRF', 'GPUSync',
+ 'GPUTLB', 'GPUVRF', 'GPUWgLatency', 'GPUKernelInfo',
+ 'GPUInitAbi'])
#include "debug/GPUMem.hh"
#include "debug/GPUPort.hh"
#include "debug/GPUPrefetch.hh"
+#include "debug/GPUReg.hh"
+#include "debug/GPURename.hh"
#include "debug/GPUSync.hh"
#include "debug/GPUTLB.hh"
#include "gpu-compute/dispatcher.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/gpu_static_inst.hh"
-#include "gpu-compute/ndrange.hh"
+#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/simple_pool_manager.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"
#include "mem/page_table.hh"
#include "sim/process.hh"
-
-ComputeUnit::ComputeUnit(const Params *p) : ClockedObject(p), fetchStage(p),
- scoreboardCheckStage(p), scheduleStage(p), execStage(p),
- globalMemoryPipe(p), localMemoryPipe(p), rrNextMemID(0), rrNextALUWp(0),
- cu_id(p->cu_id), vrf(p->vector_register_file), numSIMDs(p->num_SIMDs),
+#include "sim/sim_exit.hh"
+
+ComputeUnit::ComputeUnit(const Params *p) : ClockedObject(p),
+ numVectorGlobalMemUnits(p->num_global_mem_pipes),
+ numVectorSharedMemUnits(p->num_shared_mem_pipes),
+ numScalarMemUnits(p->num_scalar_mem_pipes),
+ numVectorALUs(p->num_SIMDs),
+ numScalarALUs(p->num_scalar_cores),
+ vrfToCoalescerBusWidth(p->vrf_to_coalescer_bus_width),
+ coalescerToVrfBusWidth(p->coalescer_to_vrf_bus_width),
+ registerManager(p->register_manager), fetchStage(p),
+ scoreboardCheckStage(p), scheduleStage(p, this), execStage(p),
+ globalMemoryPipe(p), localMemoryPipe(p), scalarMemoryPipe(p),
+ tickEvent([this]{ exec(); }, "Compute unit tick event",
+ false, Event::CPU_Tick_Pri),
+ cu_id(p->cu_id),
+ vrf(p->vector_register_file), srf(p->scalar_register_file),
+ simdWidth(p->simd_width),
spBypassPipeLength(p->spbypass_pipe_length),
dpBypassPipeLength(p->dpbypass_pipe_length),
+ scalarPipeStages(p->scalar_pipe_length),
+ operandNetworkLength(p->operand_network_length),
issuePeriod(p->issue_period),
- numGlbMemUnits(p->num_global_mem_pipes),
- numLocMemUnits(p->num_shared_mem_pipes),
+ vrf_gm_bus_latency(p->vrf_gm_bus_latency),
+ srf_scm_bus_latency(p->srf_scm_bus_latency),
+ vrf_lm_bus_latency(p->vrf_lm_bus_latency),
perLaneTLB(p->perLaneTLB), prefetchDepth(p->prefetch_depth),
prefetchStride(p->prefetch_stride), prefetchType(p->prefetch_prev_type),
- xact_cas_mode(p->xactCasMode), debugSegFault(p->debugSegFault),
+ debugSegFault(p->debugSegFault),
functionalTLB(p->functionalTLB), localMemBarrier(p->localMemBarrier),
countPages(p->countPages), barrier_id(0),
- vrfToCoalescerBusWidth(p->vrf_to_coalescer_bus_width),
- coalescerToVrfBusWidth(p->coalescer_to_vrf_bus_width),
req_tick_latency(p->mem_req_latency * p->clk_domain->clockPeriod()),
resp_tick_latency(p->mem_resp_latency * p->clk_domain->clockPeriod()),
_masterId(p->system->getMasterId(this, "ComputeUnit")),
lds(*p->localDataStore), gmTokenPort(name() + ".gmTokenPort", this),
_cacheLineSize(p->system->cacheLineSize()), globalSeqNum(0),
- wavefrontSize(p->wfSize), kernelLaunchInst(new KernelLaunchStaticInst())
+ wavefrontSize(p->wf_size)
{
/**
* This check is necessary because std::bitset only provides conversion
* to unsigned long or unsigned long long via to_ulong() or to_ullong().
- * there are * a few places in the code where to_ullong() is used, however
- * if VSZ is larger than a value the host can support then bitset will
- * throw a runtime exception. we should remove all use of to_long() or
- * to_ullong() so we can have VSZ greater than 64b, however until that is
- * done this assert is required.
+     * There are a few places in the code where to_ullong() is used;
+     * however, if wavefrontSize is larger than a value the host can support,
+ * bitset will throw a runtime exception. We should remove all use of
+ * to_long() or to_ullong() so we can have wavefrontSize greater than 64b,
+ * however until that is done this assert is required.
*/
- fatal_if(p->wfSize > std::numeric_limits<unsigned long long>::digits ||
- p->wfSize <= 0,
+ fatal_if(p->wf_size > std::numeric_limits<unsigned long long>::digits ||
+ p->wf_size <= 0,
"WF size is larger than the host can support");
fatal_if(!isPowerOf2(wavefrontSize),
"Wavefront size should be a power of 2");
numCyclesPerLoadTransfer = (wfSize() * sizeof(uint32_t))
/ coalescerToVrfBusWidth;
- lastVaddrWF.resize(numSIMDs);
- wfList.resize(numSIMDs);
+ // Initialization: all WF slots are assumed STOPPED
+ idleWfs = p->n_wf * numVectorALUs;
+ lastVaddrWF.resize(numVectorALUs);
+ wfList.resize(numVectorALUs);
- for (int j = 0; j < numSIMDs; ++j) {
+ for (int j = 0; j < numVectorALUs; ++j) {
lastVaddrWF[j].resize(p->n_wf);
for (int i = 0; i < p->n_wf; ++i) {
}
}
- lastVaddrSimd.resize(numSIMDs);
+ lastVaddrSimd.resize(numVectorALUs);
- for (int i = 0; i < numSIMDs; ++i) {
+ for (int i = 0; i < numVectorALUs; ++i) {
lastVaddrSimd[i].resize(wfSize(), 0);
}
cuExitCallback = new CUExitCallback(this);
registerExitCallback(cuExitCallback);
- xactCasLoadMap.clear();
- lastExecCycle.resize(numSIMDs, 0);
+ lastExecCycle.resize(numVectorALUs, 0);
for (int i = 0; i < vrf.size(); ++i) {
vrf[i]->setParent(this);
}
-
+ for (int i = 0; i < srf.size(); ++i) {
+ srf[i]->setParent(this);
+ }
numVecRegsPerSimd = vrf[0]->numRegs();
+ numScalarRegsPerSimd = srf[0]->numRegs();
+
+ registerManager->setParent(this);
+
+ activeWaves = 0;
+
+ instExecPerSimd.resize(numVectorALUs, 0);
+
+ // Calculate the number of bits to address a cache line
+ panic_if(!isPowerOf2(_cacheLineSize),
+ "Cache line size should be a power of two.");
+ cacheLineBits = floorLog2(_cacheLineSize);
}
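
As a one-line worked example of the cacheLineBits computation above (assuming a 64-byte line, which is only an illustration): floorLog2(64) is 6, so the low 6 address bits index within a line.

import math

cache_line_size = 64                                # assumed; must be a power of two
cache_line_bits = int(math.log2(cache_line_size))   # floorLog2(64) == 6
# addr >> cache_line_bits gives the line-aligned block number.
assert cache_line_bits == 6 and (0x12345 >> cache_line_bits) == 0x48d
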
ComputeUnit::~ComputeUnit()
{
// Delete wavefront slots
- for (int j = 0; j < numSIMDs; ++j) {
+ for (int j = 0; j < numVectorALUs; ++j) {
for (int i = 0; i < shader->n_wf; ++i) {
delete wfList[j][i];
}
}
lastVaddrCU.clear();
readyList.clear();
- waveStatusList.clear();
dispatchList.clear();
- vectorAluInstAvail.clear();
delete cuExitCallback;
delete ldsPort;
}
-void
-ComputeUnit::fillKernelState(Wavefront *w, NDRange *ndr)
+int
+ComputeUnit::numExeUnits() const
+{
+ return numVectorALUs + numScalarALUs + numVectorGlobalMemUnits +
+ numVectorSharedMemUnits + numScalarMemUnits;
+}
+
+// index into readyList of the first memory unit
+int
+ComputeUnit::firstMemUnit() const
+{
+ return numVectorALUs + numScalarALUs;
+}
+
+// index into readyList of the last memory unit
+int
+ComputeUnit::lastMemUnit() const
{
- w->resizeRegFiles(ndr->q.cRegCount, ndr->q.sRegCount, ndr->q.dRegCount);
+ return numExeUnits() - 1;
+}
- w->workGroupSz[0] = ndr->q.wgSize[0];
- w->workGroupSz[1] = ndr->q.wgSize[1];
- w->workGroupSz[2] = ndr->q.wgSize[2];
+// index into scalarALUs vector of SALU used by the wavefront
+int
+ComputeUnit::mapWaveToScalarAlu(Wavefront *w) const
+{
+ if (numScalarALUs == 1) {
+ return 0;
+ } else {
+ return w->simdId % numScalarALUs;
+ }
+}
+
+// index into readyList of Scalar ALU unit used by wavefront
+int
+ComputeUnit::mapWaveToScalarAluGlobalIdx(Wavefront *w) const
+{
+ return numVectorALUs + mapWaveToScalarAlu(w);
+}
+
+// index into readyList of Global Memory unit used by wavefront
+int
+ComputeUnit::mapWaveToGlobalMem(Wavefront *w) const
+{
+ // TODO: FIXME if more than 1 GM pipe supported
+ return numVectorALUs + numScalarALUs;
+}
+
+// index into readyList of Local Memory unit used by wavefront
+int
+ComputeUnit::mapWaveToLocalMem(Wavefront *w) const
+{
+ // TODO: FIXME if more than 1 LM pipe supported
+ return numVectorALUs + numScalarALUs + numVectorGlobalMemUnits;
+}
+
+// index into readyList of Scalar Memory unit used by wavefront
+int
+ComputeUnit::mapWaveToScalarMem(Wavefront *w) const
+{
+ // TODO: FIXME if more than 1 ScM pipe supported
+ return numVectorALUs + numScalarALUs + numVectorGlobalMemUnits +
+ numVectorSharedMemUnits;
+}
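
The mapWaveTo* helpers above carve the flat readyList index space into contiguous per-resource ranges. The sketch below models only that arithmetic (unit counts taken from the GPU.py defaults in this patch):

num_vector_alus, num_scalar_alus = 4, 1
num_gm_pipes, num_lm_pipes, num_scalar_mem_pipes = 1, 1, 1

num_exe_units = (num_vector_alus + num_scalar_alus + num_gm_pipes +
                 num_lm_pipes + num_scalar_mem_pipes)              # 8 ready lists

def map_wave_to_scalar_alu(simd_id):
    # same round-robin as mapWaveToScalarAlu()
    return 0 if num_scalar_alus == 1 else simd_id % num_scalar_alus

# Index layout: [0, 4) vector ALUs, 4 scalar ALU, 5 global mem, 6 local mem,
# 7 scalar mem -- so firstMemUnit() == 5 and lastMemUnit() == 7.
first_mem_unit = num_vector_alus + num_scalar_alus
last_mem_unit = num_exe_units - 1
assert (first_mem_unit, last_mem_unit) == (5, 7)
assert map_wave_to_scalar_alu(3) == 0
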
+
+void
+ComputeUnit::fillKernelState(Wavefront *w, HSAQueueEntry *task)
+{
+ w->resizeRegFiles(task->numVectorRegs(), task->numScalarRegs());
+ w->workGroupSz[0] = task->wgSize(0);
+ w->workGroupSz[1] = task->wgSize(1);
+ w->workGroupSz[2] = task->wgSize(2);
w->wgSz = w->workGroupSz[0] * w->workGroupSz[1] * w->workGroupSz[2];
- w->gridSz[0] = ndr->q.gdSize[0];
- w->gridSz[1] = ndr->q.gdSize[1];
- w->gridSz[2] = ndr->q.gdSize[2];
- w->kernelArgs = ndr->q.args;
- w->privSizePerItem = ndr->q.privMemPerItem;
- w->spillSizePerItem = ndr->q.spillMemPerItem;
- w->roBase = ndr->q.roMemStart;
- w->roSize = ndr->q.roMemTotal;
- w->computeActualWgSz(ndr);
+ w->gridSz[0] = task->gridSize(0);
+ w->gridSz[1] = task->gridSize(1);
+ w->gridSz[2] = task->gridSize(2);
+ w->computeActualWgSz(task);
}
+// remove from the ready list any wavefronts marked ready at the SCB stage
+// but found to have empty instruction buffers at the SCH stage
void
-ComputeUnit::updateEvents() {
-
- if (!timestampVec.empty()) {
- uint32_t vecSize = timestampVec.size();
- uint32_t i = 0;
- while (i < vecSize) {
- if (timestampVec[i] <= shader->tick_cnt) {
- std::pair<uint32_t, uint32_t> regInfo = regIdxVec[i];
- vrf[regInfo.first]->markReg(regInfo.second, sizeof(uint32_t),
- statusVec[i]);
- timestampVec.erase(timestampVec.begin() + i);
- regIdxVec.erase(regIdxVec.begin() + i);
- statusVec.erase(statusVec.begin() + i);
- --vecSize;
- --i;
+ComputeUnit::updateReadyList(int unitId)
+{
+ if (!readyList[unitId].empty()) {
+ for (std::vector<Wavefront *>::iterator it = readyList[unitId].begin();
+ it != readyList[unitId].end();) {
+ if ((*it)->instructionBuffer.empty()) {
+ it = readyList[unitId].erase(it);
+ }
+ else {
+ ++it;
}
- ++i;
}
}
-
- for (int i = 0; i< numSIMDs; ++i) {
- vrf[i]->updateEvents();
- }
}
-
void
ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
- NDRange *ndr)
+ HSAQueueEntry *task, bool fetchContext)
{
static int _n_wave = 0;
init_mask[k] = 1;
}
- w->kernId = ndr->dispatchId;
+ w->execMask() = init_mask;
+
+ w->kernId = task->dispatchId();
w->wfId = waveId;
w->initMask = init_mask.to_ullong();
w->oldBarrierCnt = 0;
w->barrierCnt = 0;
- w->privBase = ndr->q.privMemStart;
- ndr->q.privMemStart += ndr->q.privMemPerItem * wfSize();
-
- w->spillBase = ndr->q.spillMemStart;
- ndr->q.spillMemStart += ndr->q.spillMemPerItem * wfSize();
-
- w->pushToReconvergenceStack(0, UINT32_MAX, init_mask.to_ulong());
-
// WG state
- w->wgId = ndr->globalWgId;
- w->dispatchId = ndr->dispatchId;
- w->workGroupId[0] = w->wgId % ndr->numWg[0];
- w->workGroupId[1] = (w->wgId / ndr->numWg[0]) % ndr->numWg[1];
- w->workGroupId[2] = w->wgId / (ndr->numWg[0] * ndr->numWg[1]);
+ w->wgId = task->globalWgId();
+ w->dispatchId = task->dispatchId();
+ w->workGroupId[0] = w->wgId % task->numWg(0);
+ w->workGroupId[1] = (w->wgId / task->numWg(0)) % task->numWg(1);
+ w->workGroupId[2] = w->wgId / (task->numWg(0) * task->numWg(1));
w->barrierId = barrier_id;
- w->stalledAtBarrier = false;
+ w->stalledAtBarrier = (w->oldBarrierCnt == w->barrierCnt) ? false : true;
// set the wavefront context to have a pointer to this section of the LDS
w->ldsChunk = ldsChunk;
int32_t refCount M5_VAR_USED =
- lds.increaseRefCounter(w->dispatchId, w->wgId);
+ lds.increaseRefCounter(w->dispatchId, w->wgId);
DPRINTF(GPUDisp, "CU%d: increase ref ctr wg[%d] to [%d]\n",
cu_id, w->wgId, refCount);
if (w->pendingFetch)
w->dropFetch = true;
- // is this the last wavefront in the workgroup
- // if set the spillWidth to be the remaining work-items
- // so that the vector access is correct
- if ((waveId + 1) * wfSize() >= w->actualWgSzTotal) {
- w->spillWidth = w->actualWgSzTotal - (waveId * wfSize());
- } else {
- w->spillWidth = wfSize();
- }
-
DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: "
- "WF[%d][%d]\n", _n_wave, barrier_id, cu_id, w->simdId, w->wfSlotId);
+ "WF[%d][%d]\n", _n_wave, w->barrierId, cu_id, w->simdId,
+ w->wfSlotId);
+
+ w->initRegState(task, w->actualWgSzTotal);
+ w->start(_n_wave++, task->codeAddr());
- w->start(++_n_wave, ndr->q.code_ptr);
+ waveLevelParallelism.sample(activeWaves);
+ activeWaves++;
+}
+
+/**
+ * trigger invalidate operation in the cu
+ *
+ * req: request initialized in shader, carrying the invalidate flags
+ */
+void
+ComputeUnit::doInvalidate(RequestPtr req, int kernId){
+ GPUDynInstPtr gpuDynInst
+ = std::make_shared<GPUDynInst>(this, nullptr,
+ new KernelLaunchStaticInst(), getAndIncSeqNum());
+
+ // kern_id will be used in inv responses
+ gpuDynInst->kern_id = kernId;
+ // update contextId field
+ req->setContext(gpuDynInst->wfDynId);
+
+ injectGlobalMemFence(gpuDynInst, true, req);
+}
+
+/**
+ * trigger flush operation in the cu
+ *
+ * gpuDynInst: inst passed to the request
+ */
+void
+ComputeUnit::doFlush(GPUDynInstPtr gpuDynInst) {
+ injectGlobalMemFence(gpuDynInst, true);
}
void
-ComputeUnit::StartWorkgroup(NDRange *ndr)
+ComputeUnit::dispWorkgroup(HSAQueueEntry *task, bool startFromScheduler)
{
- // reserve the LDS capacity allocated to the work group
- // disambiguated by the dispatch ID and workgroup ID, which should be
- // globally unique
- LdsChunk *ldsChunk = lds.reserveSpace(ndr->dispatchId, ndr->globalWgId,
- ndr->q.ldsSize);
-
- // Send L1 cache acquire
- // isKernel + isAcquire = Kernel Begin
- if (shader->impl_kern_boundary_sync) {
- GPUDynInstPtr gpuDynInst =
- std::make_shared<GPUDynInst>(this, nullptr, kernelLaunchInst,
- getAndIncSeqNum());
-
- gpuDynInst->useContinuation = false;
- injectGlobalMemFence(gpuDynInst, true);
+ // If we aren't ticking, start it up!
+ if (!tickEvent.scheduled()) {
+ DPRINTF(GPUDisp, "CU%d: Scheduling wakeup next cycle\n", cu_id);
+ schedule(tickEvent, nextCycle());
}
- // calculate the number of 32-bit vector registers required by wavefront
- int vregDemand = ndr->q.sRegCount + (2 * ndr->q.dRegCount);
- int wave_id = 0;
-
- // Assign WFs by spreading them across SIMDs, 1 WF per SIMD at a time
- for (int m = 0; m < shader->n_wf * numSIMDs; ++m) {
- Wavefront *w = wfList[m % numSIMDs][m / numSIMDs];
- // Check if this wavefront slot is available:
- // It must be stopped and not waiting
- // for a release to complete S_RETURNING
- if (w->status == Wavefront::S_STOPPED) {
- fillKernelState(w, ndr);
- // if we have scheduled all work items then stop
- // scheduling wavefronts
- if (wave_id * wfSize() >= w->actualWgSzTotal)
- break;
+ // the kernel's invalidate must have finished before any wg dispatch
+ assert(task->isInvDone());
- // reserve vector registers for the scheduled wavefront
- assert(vectorRegsReserved[m % numSIMDs] <= numVecRegsPerSimd);
- uint32_t normSize = 0;
+ // reserve the LDS capacity allocated to the work group
+ // disambiguated by the dispatch ID and workgroup ID, which should be
+ // globally unique
+ LdsChunk *ldsChunk = lds.reserveSpace(task->dispatchId(),
+ task->globalWgId(),
+ task->ldsSize());
- w->startVgprIndex = vrf[m % numSIMDs]->manager->
- allocateRegion(vregDemand, &normSize);
+ panic_if(!ldsChunk, "was not able to reserve space for this WG");
- w->reservedVectorRegs = normSize;
- vectorRegsReserved[m % numSIMDs] += w->reservedVectorRegs;
+ // calculate the number of 32-bit vector registers required
+ // by each work item
+ int vregDemand = task->numVectorRegs();
+ int sregDemand = task->numScalarRegs();
+ int wave_id = 0;
- startWavefront(w, wave_id, ldsChunk, ndr);
- ++wave_id;
+ // Assign WFs according to numWfsToSched vector, which is computed by
+ // hasDispResources()
+ for (int j = 0; j < shader->n_wf; ++j) {
+ for (int i = 0; i < numVectorALUs; ++i) {
+ Wavefront *w = wfList[i][j];
+ // Check if this wavefront slot is available and there are WFs
+ // remaining to be dispatched to current SIMD:
+ // WF slot must be stopped and not waiting
+ // for a release to complete S_RETURNING
+ if (w->getStatus() == Wavefront::S_STOPPED &&
+ numWfsToSched[i] > 0) {
+ // decrement number of WFs awaiting dispatch to current SIMD
+ numWfsToSched[i] -= 1;
+
+ fillKernelState(w, task);
+
+ DPRINTF(GPURename, "SIMD[%d] wfSlotId[%d] WF[%d] "
+ "vregDemand[%d] sregDemand[%d]\n", i, j, w->wfDynId,
+ vregDemand, sregDemand);
+
+ registerManager->allocateRegisters(w, vregDemand, sregDemand);
+
+ startWavefront(w, wave_id, ldsChunk, task);
+ ++wave_id;
+ }
}
}
++barrier_id;
}
-int
-ComputeUnit::ReadyWorkgroup(NDRange *ndr)
+void
+ComputeUnit::insertInPipeMap(Wavefront *w)
+{
+ panic_if(w->instructionBuffer.empty(),
+ "Instruction Buffer of WF%d can't be empty", w->wgId);
+ GPUDynInstPtr ii = w->instructionBuffer.front();
+ pipeMap.emplace(ii->seqNum());
+}
+
+void
+ComputeUnit::deleteFromPipeMap(Wavefront *w)
+{
+ panic_if(w->instructionBuffer.empty(),
+ "Instruction Buffer of WF%d can't be empty", w->wgId);
+ GPUDynInstPtr ii = w->instructionBuffer.front();
+ // delete the dynamic instruction from the pipeline map
+ auto it = pipeMap.find(ii->seqNum());
+ panic_if(it == pipeMap.end(), "Pipeline Map is empty\n");
+ pipeMap.erase(it);
+}
+
+bool
+ComputeUnit::hasDispResources(HSAQueueEntry *task)
{
- // Get true size of workgroup (after clamping to grid size)
- int trueWgSize[3];
+ // compute true size of workgroup (after clamping to grid size)
+ int trueWgSize[HSAQueueEntry::MAX_DIM];
int trueWgSizeTotal = 1;
- for (int d = 0; d < 3; ++d) {
- trueWgSize[d] = std::min(ndr->q.wgSize[d], ndr->q.gdSize[d] -
- ndr->wgId[d] * ndr->q.wgSize[d]);
+ for (int d = 0; d < HSAQueueEntry::MAX_DIM; ++d) {
+ trueWgSize[d] = std::min(task->wgSize(d), task->gridSize(d) -
+ task->wgId(d) * task->wgSize(d));
trueWgSizeTotal *= trueWgSize[d];
DPRINTF(GPUDisp, "trueWgSize[%d] = %d\n", d, trueWgSize[d]);
DPRINTF(GPUDisp, "trueWgSizeTotal = %d\n", trueWgSizeTotal);
+ // calculate the number of WFs in this WG
+ int numWfs = (trueWgSizeTotal + wfSize() - 1) / wfSize();
+
// calculate the number of 32-bit vector registers required by each
// work item of the work group
- int vregDemandPerWI = ndr->q.sRegCount + (2 * ndr->q.dRegCount);
- bool vregAvail = true;
- int numWfs = (trueWgSizeTotal + wfSize() - 1) / wfSize();
+ int vregDemandPerWI = task->numVectorRegs();
+ // calculate the number of 32-bit scalar registers required by each
+ // work item of the work group
+ int sregDemandPerWI = task->numScalarRegs();
+
+    // check if the total number of VGPRs and SGPRs required by all WFs
+ // of the WG fit in the VRFs of all SIMD units and the CU's SRF
+ panic_if((numWfs * vregDemandPerWI) > (numVectorALUs * numVecRegsPerSimd),
+ "WG with %d WFs and %d VGPRs per WI can not be allocated to CU "
+ "that has %d VGPRs\n",
+ numWfs, vregDemandPerWI, numVectorALUs * numVecRegsPerSimd);
+ panic_if((numWfs * sregDemandPerWI) > numScalarRegsPerSimd,
+ "WG with %d WFs and %d SGPRs per WI can not be scheduled to CU "
+ "with %d SGPRs\n",
+ numWfs, sregDemandPerWI, numScalarRegsPerSimd);
+
+ // number of WF slots that are not occupied
int freeWfSlots = 0;
- // check if the total number of VGPRs required by all WFs of the WG
- // fit in the VRFs of all SIMD units
- assert((numWfs * vregDemandPerWI) <= (numSIMDs * numVecRegsPerSimd));
+ // number of Wfs from WG that were successfully mapped to a SIMD
int numMappedWfs = 0;
- std::vector<int> numWfsPerSimd;
- numWfsPerSimd.resize(numSIMDs, 0);
- // find how many free WF slots we have across all SIMDs
+ numWfsToSched.clear();
+ numWfsToSched.resize(numVectorALUs, 0);
+
+ // attempt to map WFs to the SIMDs, based on WF slot availability
+ // and register file availability
for (int j = 0; j < shader->n_wf; ++j) {
- for (int i = 0; i < numSIMDs; ++i) {
- if (wfList[i][j]->status == Wavefront::S_STOPPED) {
- // count the number of free WF slots
+ for (int i = 0; i < numVectorALUs; ++i) {
+ if (wfList[i][j]->getStatus() == Wavefront::S_STOPPED) {
++freeWfSlots;
- if (numMappedWfs < numWfs) {
- // count the WFs to be assigned per SIMD
- numWfsPerSimd[i]++;
+ // check if current WF will fit onto current SIMD/VRF
+ // if all WFs have not yet been mapped to the SIMDs
+ if (numMappedWfs < numWfs &&
+ registerManager->canAllocateSgprs(i, numWfsToSched[i] + 1,
+ sregDemandPerWI) &&
+ registerManager->canAllocateVgprs(i, numWfsToSched[i] + 1,
+ vregDemandPerWI)) {
+ numWfsToSched[i]++;
+ numMappedWfs++;
}
- numMappedWfs++;
}
}
}
- // if there are enough free WF slots then find if there are enough
- // free VGPRs per SIMD based on the WF->SIMD mapping
- if (freeWfSlots >= numWfs) {
- for (int j = 0; j < numSIMDs; ++j) {
- // find if there are enough free VGPR regions in the SIMD's VRF
- // to accommodate the WFs of the new WG that would be mapped to
- // this SIMD unit
- vregAvail = vrf[j]->manager->canAllocate(numWfsPerSimd[j],
- vregDemandPerWI);
-
- // stop searching if there is at least one SIMD
- // whose VRF does not have enough free VGPR pools.
- // This is because a WG is scheduled only if ALL
- // of its WFs can be scheduled
- if (!vregAvail)
- break;
+ // check that the number of mapped WFs is not greater
+ // than the actual number of WFs
+ assert(numMappedWfs <= numWfs);
+
+ bool vregAvail = true;
+ bool sregAvail = true;
+ // if a WF to SIMD mapping was not found, find the limiting resource
+ if (numMappedWfs < numWfs) {
+
+ for (int j = 0; j < numVectorALUs; ++j) {
+ // find if there are enough free VGPRs in the SIMD's VRF
+            // to accommodate the WFs of the new WG that would be mapped
+ // to this SIMD unit
+ vregAvail &= registerManager->
+ canAllocateVgprs(j, numWfsToSched[j], vregDemandPerWI);
+ // find if there are enough free SGPRs in the SIMD's SRF
+            // to accommodate the WFs of the new WG that would be mapped
+ // to this SIMD unit
+ sregAvail &= registerManager->
+ canAllocateSgprs(j, numWfsToSched[j], sregDemandPerWI);
}
}
- DPRINTF(GPUDisp, "Free WF slots = %d, VGPR Availability = %d\n",
- freeWfSlots, vregAvail);
+ DPRINTF(GPUDisp, "Free WF slots = %d, Mapped WFs = %d, \
+ VGPR Availability = %d, SGPR Availability = %d\n",
+ freeWfSlots, numMappedWfs, vregAvail, sregAvail);
if (!vregAvail) {
++numTimesWgBlockedDueVgprAlloc;
}
+ if (!sregAvail) {
+ ++numTimesWgBlockedDueSgprAlloc;
+ }
+
// Return true if enough WF slots to submit workgroup and if there are
// enough VGPRs to schedule all WFs to their SIMD units
- if (!lds.canReserve(ndr->q.ldsSize)) {
+ bool ldsAvail = lds.canReserve(task->ldsSize());
+ if (!ldsAvail) {
wgBlockedDueLdsAllocation++;
}
- // Return true if (a) there are enough free WF slots to submit
- // workgrounp and (b) if there are enough VGPRs to schedule all WFs to their
- // SIMD units and (c) if there is enough space in LDS
- return freeWfSlots >= numWfs && vregAvail && lds.canReserve(ndr->q.ldsSize);
+ // Return true if the following are all true:
+ // (a) all WFs of the WG were mapped to free WF slots
+ // (b) there are enough VGPRs to schedule all WFs to their SIMD units
+ // (c) there are enough SGPRs on the CU to schedule all WFs
+ // (d) there is enough space in LDS to allocate for all WFs
+ bool can_dispatch = numMappedWfs == numWfs && vregAvail && sregAvail
+ && ldsAvail;
+ return can_dispatch;
}
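
A compact, simplified stand-in for the feasibility test above (illustrative assumptions only; the real code queries the register manager per SIMD): dispatch succeeds only if every WF of the WG gets a free slot with enough VGPRs and SGPRs, and the WG's LDS demand fits.

import math

def can_dispatch(wg_size, wf_size, free_slots_per_simd, free_vgprs_per_simd,
                 free_sgprs_per_simd, vregs_per_wf, sregs_per_wf,
                 lds_free, lds_demand):
    """Greedy first-fit mapping of WFs to SIMDs."""
    num_wfs = math.ceil(wg_size / wf_size)   # same as (total + wfSize() - 1) / wfSize()
    mapped = 0
    for slots, vgprs, sgprs in zip(free_slots_per_simd, free_vgprs_per_simd,
                                   free_sgprs_per_simd):
        while (mapped < num_wfs and slots > 0 and
               vgprs >= vregs_per_wf and sgprs >= sregs_per_wf):
            slots, vgprs, sgprs = slots - 1, vgprs - vregs_per_wf, sgprs - sregs_per_wf
            mapped += 1
    return mapped == num_wfs and lds_demand <= lds_free

# 4 SIMDs, 10 slots each, 2048 VGPRs/SGPRs free per SIMD (assumed numbers).
print(can_dispatch(256, 64, [10] * 4, [2048] * 4, [2048] * 4,
                   vregs_per_wf=64, sregs_per_wf=32,
                   lds_free=65536, lds_demand=4096))  # True
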
int
DPRINTF(GPUSync, "CU%d: Checking for All At Barrier\n", cu_id);
int ccnt = 0;
- for (int i_simd = 0; i_simd < numSIMDs; ++i_simd) {
+ for (int i_simd = 0; i_simd < numVectorALUs; ++i_simd) {
for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf) {
Wavefront *w = wfList[i_simd][i_wf];
- if (w->status == Wavefront::S_RUNNING) {
+ if (w->getStatus() == Wavefront::S_RUNNING) {
DPRINTF(GPUSync, "Checking WF[%d][%d]\n", i_simd, i_wf);
DPRINTF(GPUSync, "wf->barrier_id = %d, _barrier_id = %d\n",
w->barrierId, _barrier_id);
- DPRINTF(GPUSync, "wf->barrier_cnt %d, bcnt = %d\n",
+ DPRINTF(GPUSync, "wf->barrierCnt %d, bcnt = %d\n",
w->barrierCnt, bcnt);
+
+ DPRINTF(GPUSync, "outstanding Reqs = %d\n",
+ w->outstandingReqs);
}
- if (w->status == Wavefront::S_RUNNING &&
+ if (w->getStatus() == Wavefront::S_RUNNING &&
w->barrierId == _barrier_id && w->barrierCnt == bcnt &&
!w->outstandingReqs) {
++ccnt;
return ccnt == bslots;
}
-// Check if the current wavefront is blocked on additional resources.
-bool
-ComputeUnit::cedeSIMD(int simdId, int wfSlotId)
-{
- bool cede = false;
-
- // If --xact-cas-mode option is enabled in run.py, then xact_cas_ld
- // magic instructions will impact the scheduling of wavefronts
- if (xact_cas_mode) {
- /*
- * When a wavefront calls xact_cas_ld, it adds itself to a per address
- * queue. All per address queues are managed by the xactCasLoadMap.
- *
- * A wavefront is not blocked if: it is not in ANY per address queue or
- * if it is at the head of a per address queue.
- */
- for (auto itMap : xactCasLoadMap) {
- std::list<waveIdentifier> curWaveIDQueue = itMap.second.waveIDQueue;
-
- if (!curWaveIDQueue.empty()) {
- for (auto it : curWaveIDQueue) {
- waveIdentifier cur_wave = it;
-
- if (cur_wave.simdId == simdId &&
- cur_wave.wfSlotId == wfSlotId) {
- // 2 possibilities
- // 1: this WF has a green light
- // 2: another WF has a green light
- waveIdentifier owner_wave = curWaveIDQueue.front();
-
- if (owner_wave.simdId != cur_wave.simdId ||
- owner_wave.wfSlotId != cur_wave.wfSlotId) {
- // possibility 2
- cede = true;
- break;
- } else {
- // possibility 1
- break;
- }
- }
- }
- }
- }
- }
-
- return cede;
-}
-
// Execute one clock worth of work on the ComputeUnit.
void
ComputeUnit::exec()
{
- updateEvents();
+ // process reads and writes in the RFs
+ for (auto &vecRegFile : vrf) {
+ vecRegFile->exec();
+ }
+
+ for (auto &scRegFile : srf) {
+ scRegFile->exec();
+ }
+
// Execute pipeline stages in reverse order to simulate
// the pipeline latency
+ scalarMemoryPipe.exec();
globalMemoryPipe.exec();
localMemoryPipe.exec();
execStage.exec();
fetchStage.exec();
totalCycles++;
+
+ // Put this CU to sleep if there is no more work to be done.
+ if (!isDone()) {
+ schedule(tickEvent, nextCycle());
+ } else {
+ shader->notifyCuSleep();
+ DPRINTF(GPUDisp, "CU%d: Going to sleep\n", cu_id);
+ }
}
void
ComputeUnit::init()
{
- // Initialize CU Bus models
- glbMemToVrfBus.init(&shader->tick_cnt, shader->ticks(1));
- locMemToVrfBus.init(&shader->tick_cnt, shader->ticks(1));
- nextGlbMemBus = 0;
- nextLocMemBus = 0;
- fatal_if(numGlbMemUnits > 1,
- "No support for multiple Global Memory Pipelines exists!!!");
- vrfToGlobalMemPipeBus.resize(numGlbMemUnits);
- for (int j = 0; j < numGlbMemUnits; ++j) {
- vrfToGlobalMemPipeBus[j] = WaitClass();
- vrfToGlobalMemPipeBus[j].init(&shader->tick_cnt, shader->ticks(1));
- }
+ // Initialize CU Bus models and execution resources
- fatal_if(numLocMemUnits > 1,
- "No support for multiple Local Memory Pipelines exists!!!");
- vrfToLocalMemPipeBus.resize(numLocMemUnits);
- for (int j = 0; j < numLocMemUnits; ++j) {
- vrfToLocalMemPipeBus[j] = WaitClass();
- vrfToLocalMemPipeBus[j].init(&shader->tick_cnt, shader->ticks(1));
+ // Vector ALUs
+ vectorALUs.clear();
+ for (int i = 0; i < numVectorALUs; i++) {
+ vectorALUs.emplace_back(this, clockPeriod());
}
- vectorRegsReserved.resize(numSIMDs, 0);
- aluPipe.resize(numSIMDs);
- wfWait.resize(numSIMDs + numLocMemUnits + numGlbMemUnits);
- for (int i = 0; i < numSIMDs + numLocMemUnits + numGlbMemUnits; ++i) {
- wfWait[i] = WaitClass();
- wfWait[i].init(&shader->tick_cnt, shader->ticks(1));
+ // Scalar ALUs
+ scalarALUs.clear();
+ for (int i = 0; i < numScalarALUs; i++) {
+ scalarALUs.emplace_back(this, clockPeriod());
}
- for (int i = 0; i < numSIMDs; ++i) {
- aluPipe[i] = WaitClass();
- aluPipe[i].init(&shader->tick_cnt, shader->ticks(1));
- }
+ // Vector Global Memory
+ fatal_if(numVectorGlobalMemUnits > 1,
+ "No support for multiple Global Memory Pipelines exists!!!");
+ vectorGlobalMemUnit.init(this, clockPeriod());
+ vrfToGlobalMemPipeBus.init(this, clockPeriod());
+ glbMemToVrfBus.init(this, clockPeriod());
- // Setup space for call args
- for (int j = 0; j < numSIMDs; ++j) {
- for (int i = 0; i < shader->n_wf; ++i) {
- wfList[j][i]->initCallArgMem(shader->funcargs_size, wavefrontSize);
- }
- }
+ // Vector Local/Shared Memory
+ fatal_if(numVectorSharedMemUnits > 1,
+ "No support for multiple Local Memory Pipelines exists!!!");
+ vectorSharedMemUnit.init(this, clockPeriod());
+ vrfToLocalMemPipeBus.init(this, clockPeriod());
+ locMemToVrfBus.init(this, clockPeriod());
- // Initializing pipeline resources
- readyList.resize(numSIMDs + numGlbMemUnits + numLocMemUnits);
- waveStatusList.resize(numSIMDs);
+ // Scalar Memory
+ fatal_if(numScalarMemUnits > 1,
+ "No support for multiple Scalar Memory Pipelines exists!!!");
+ scalarMemUnit.init(this, clockPeriod());
+ srfToScalarMemPipeBus.init(this, clockPeriod());
+ scalarMemToSrfBus.init(this, clockPeriod());
- for (int j = 0; j < numSIMDs; ++j) {
- for (int i = 0; i < shader->n_wf; ++i) {
- waveStatusList[j].push_back(
- std::make_pair(wfList[j][i], BLOCKED));
- }
- }
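+ // both reservation counters are tracked per SIMD: the VRFs and SRFs are
+ // instantiated one per SIMD unit, so both vectors are sized by numVectorALUs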
+ vectorRegsReserved.resize(numVectorALUs, 0);
+ scalarRegsReserved.resize(numVectorALUs, 0);
+
+ // Initializing pipeline resources
+ readyList.resize(numExeUnits());
- for (int j = 0; j < (numSIMDs + numGlbMemUnits + numLocMemUnits); ++j) {
- dispatchList.push_back(std::make_pair((Wavefront*)nullptr, EMPTY));
+ for (int j = 0; j < numExeUnits(); ++j) {
+ dispatchList.push_back(std::make_pair(nullptr, EMPTY));
}
fetchStage.init(this);
execStage.init(this);
globalMemoryPipe.init(this);
localMemoryPipe.init(this);
- // initialize state for statistics calculation
- vectorAluInstAvail.resize(numSIMDs, false);
- shrMemInstAvail = 0;
- glbMemInstAvail = 0;
+ scalarMemoryPipe.init(this);
gmTokenPort.setTokenManager(memPortTokens);
}
SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
int index = sender_state->port_index;
GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
+ GPUDispatcher &dispatcher = computeUnit->shader->dispatcher();
+
+ // MemSyncResp + WriteCompleteResp are handled completely here and we
+ // don't schedule a MemRespEvent to process the responses further
+ if (pkt->cmd == MemCmd::MemSyncResp) {
+ // This response is for 1 of the following request types:
+ // - kernel launch
+ // - kernel end
+ // - non-kernel mem sync
+
+ // Kernel Launch
+ // wavefront was nullptr when launching kernel, so it is meaningless
+ // here (simdId=-1, wfSlotId=-1)
+ if (gpuDynInst->isKernelLaunch()) {
+ // for kernel launch, the original request must be both kernel-type
+ // and acquire
+ assert(pkt->req->isKernel());
+ assert(pkt->req->isAcquire());
+
+ // one D-Cache inv is done, decrement counter
+ dispatcher.updateInvCounter(gpuDynInst->kern_id);
+
+ delete pkt->senderState;
+ delete pkt;
+ return true;
+ }
- // Is the packet returned a Kernel End or Barrier
- if (pkt->req->isKernel() && pkt->req->isRelease()) {
- Wavefront *w =
- computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId];
+ // retrieve wavefront from inst
+ Wavefront *w = gpuDynInst->wavefront();
// Check if we are waiting on Kernel End Release
- if (w->status == Wavefront::S_RETURNING) {
- DPRINTF(GPUDisp, "CU%d: WF[%d][%d][wv=%d]: WG id completed %d\n",
+ if (w->getStatus() == Wavefront::S_RETURNING
+ && gpuDynInst->isEndOfKernel()) {
+ // for kernel end, the original request must be both kernel-type
+ // and release
+ assert(pkt->req->isKernel());
+ assert(pkt->req->isRelease());
+
+ // one wb done, decrement counter, and return whether all wbs are
+ // done for the kernel
+ bool isWbDone = dispatcher.updateWbCounter(gpuDynInst->kern_id);
+
+ // not all wbs are done for the kernel, just release pkt
+ // resources
+ if (!isWbDone) {
+ delete pkt->senderState;
+ delete pkt;
+ return true;
+ }
+
+ // all wbs are completed for the kernel, do retirement work
+ // for the workgroup
+ DPRINTF(GPUDisp, "CU%d: WF[%d][%d][wv=%d]: WG %d completed\n",
computeUnit->cu_id, w->simdId, w->wfSlotId,
- w->wfDynId, w->kernId);
+ w->wfDynId, w->wgId);
- computeUnit->shader->dispatcher->notifyWgCompl(w);
- w->status = Wavefront::S_STOPPED;
- } else {
- w->outstandingReqs--;
+ dispatcher.notifyWgCompl(w);
+ w->setStatus(Wavefront::S_STOPPED);
}
- DPRINTF(GPUSync, "CU%d: WF[%d][%d]: barrier_cnt = %d\n",
+ if (!pkt->req->isKernel()) {
+ w = computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId];
+ DPRINTF(GPUExec, "MemSyncResp: WF[%d][%d] WV%d %s decrementing "
+ "outstanding reqs %d => %d\n", gpuDynInst->simdId,
+ gpuDynInst->wfSlotId, gpuDynInst->wfDynId,
+ gpuDynInst->disassemble(), w->outstandingReqs,
+ w->outstandingReqs - 1);
+ computeUnit->globalMemoryPipe.handleResponse(gpuDynInst);
+ }
+
+ DPRINTF(GPUSync, "CU%d: WF[%d][%d]: barrierCnt = %d\n",
computeUnit->cu_id, gpuDynInst->simdId,
gpuDynInst->wfSlotId, w->barrierCnt);
- if (gpuDynInst->useContinuation) {
- assert(!gpuDynInst->isNoScope());
- gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
- gpuDynInst);
- }
-
delete pkt->senderState;
delete pkt;
return true;
- } else if (pkt->req->isKernel() && pkt->req->isAcquire()) {
- if (gpuDynInst->useContinuation) {
- assert(!gpuDynInst->isNoScope());
- gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
- gpuDynInst);
+ } else if (pkt->cmd == MemCmd::WriteCompleteResp) {
+ // this is for writeComplete callback
+ // we simply decrement the write-related wait counters
+ assert(gpuDynInst);
+ Wavefront *w M5_VAR_USED =
+ computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId];
+ assert(w);
+ DPRINTF(GPUExec, "WriteCompleteResp: WF[%d][%d] WV%d %s decrementing "
+ "outstanding reqs %d => %d\n", gpuDynInst->simdId,
+ gpuDynInst->wfSlotId, gpuDynInst->wfDynId,
+ gpuDynInst->disassemble(), w->outstandingReqs,
+ w->outstandingReqs - 1);
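+ // once no lanes have outstanding requests for this instruction, hand it
+ // to the global memory pipeline for retirement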
+ if (gpuDynInst->statusBitVector.none()) {
+ // ask gm pipe to decrement request counters, instead of directly
+ // performing here, to avoid asynchronous counter update and
+ // instruction retirement (which may hurt waitcnt effects)
+ computeUnit->globalMemoryPipe.handleResponse(gpuDynInst);
+
+ DPRINTF(GPUMem, "CU%d: WF[%d][%d]: write totally complete\n",
+ computeUnit->cu_id, gpuDynInst->simdId,
+ gpuDynInst->wfSlotId);
}
delete pkt->senderState;
delete pkt;
+
return true;
}
EventFunctionWrapper *mem_resp_event =
computeUnit->memPort[index]->createMemRespEvent(pkt);
- DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x received!\n",
+ DPRINTF(GPUPort,
+ "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x received!\n",
computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
- index, pkt->req->getPaddr());
+ gpuDynInst->seqNum(), index, pkt->req->getPaddr());
computeUnit->schedule(mem_resp_event,
curTick() + computeUnit->resp_tick_latency);
+
+ return true;
+}
+
+bool
+ComputeUnit::ScalarDataPort::recvTimingResp(PacketPtr pkt)
+{
+ assert(!pkt->req->isKernel());
+
+ // retrieve sender state
+ SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
+ GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
+
+ assert(pkt->isRead() || pkt->isWrite());
+ assert(gpuDynInst->numScalarReqs > 0);
+
+ gpuDynInst->numScalarReqs--;
+
+ /**
+ * for each returned scalar request we decrement the
+ * numScalarReqs counter that is associated with this
+ * gpuDynInst, which should have been set to correspond
+ * to the number of packets sent for the memory op.
+ * once all packets return, the memory op is finished
+ * and we can push it into the response queue.
+ */
+ if (!gpuDynInst->numScalarReqs) {
+ if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
+ computeUnit->scalarMemoryPipe.getGMLdRespFIFO().push(
+ gpuDynInst);
+ } else {
+ computeUnit->scalarMemoryPipe.getGMStRespFIFO().push(
+ gpuDynInst);
+ }
+ }
+
+ delete pkt->senderState;
+ delete pkt;
+
return true;
}
+void
+ComputeUnit::ScalarDataPort::recvReqRetry()
+{
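+ // resend the queued packets in order; stop at the first packet that is
+ // rejected again so request ordering is preserved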
+ for (const auto &pkt : retries) {
+ if (!sendTimingReq(pkt)) {
+ break;
+ } else {
+ retries.pop_front();
+ }
+ }
+}
+
void
ComputeUnit::DataPort::recvReqRetry()
{
ComputeUnit::SQCPort::recvTimingResp(PacketPtr pkt)
{
computeUnit->fetchStage.processFetchReturn(pkt);
-
return true;
}
BaseTLB::Mode TLB_mode;
assert(pkt->isRead() || pkt->isWrite());
+ // remember whether this packet carries data; used below to decide if the
+ // rebuilt packet needs a data pointer
+ bool isDataAccess = pkt->isWrite() || pkt->isRead();
+
// Check write before read for atomic operations
// since atomic operations should use BaseTLB::Write
- if (pkt->isWrite()){
+ if (pkt->isWrite()) {
TLB_mode = BaseTLB::Write;
} else if (pkt->isRead()) {
TLB_mode = BaseTLB::Read;
assert(pkt->req->hasPaddr());
assert(pkt->req->hasSize());
- uint8_t *tmpData = pkt->getPtr<uint8_t>();
-
// this is necessary because the GPU TLB receives packets instead
// of requests. When the translation is complete, all relevant
// fields in the request will be populated, but not in the packet.
// and proper flags.
PacketPtr oldPkt = pkt;
pkt = new Packet(oldPkt->req, oldPkt->cmd);
+ if (isDataAccess) {
+ uint8_t *tmpData = oldPkt->getPtr<uint8_t>();
+ pkt->dataStatic(tmpData);
+ }
delete oldPkt;
- pkt->dataStatic(tmpData);
// New SenderState for the memory access
- pkt->senderState = new ComputeUnit::DataPort::SenderState(gpuDynInst,
- index, nullptr);
+ pkt->senderState =
+ new ComputeUnit::DataPort::SenderState(gpuDynInst, index,
+ nullptr);
gpuDynInst->memStatusVector[pkt->getAddr()].push_back(index);
gpuDynInst->tlbHitLevel[index] = hit_level;
assert(tlbPort[tlbPort_index]->retries.size() > 0);
DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
- "failed!\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
- tmp_vaddr);
+ "failed!\n", cu_id, gpuDynInst->simdId,
+ gpuDynInst->wfSlotId, tmp_vaddr);
tlbPort[tlbPort_index]->retries.push_back(pkt);
} else if (!tlbPort[tlbPort_index]->sendTimingReq(pkt)) {
tlbPort[tlbPort_index]->stallPort();
DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
- "failed!\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
- tmp_vaddr);
+ "failed!\n", cu_id, gpuDynInst->simdId,
+ gpuDynInst->wfSlotId, tmp_vaddr);
tlbPort[tlbPort_index]->retries.push_back(pkt);
} else {
cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, tmp_vaddr);
}
} else {
- if (pkt->cmd == MemCmd::MemFenceReq) {
+ if (pkt->cmd == MemCmd::MemSyncReq) {
gpuDynInst->statusBitVector = VectorMask(0);
} else {
gpuDynInst->statusBitVector &= (~(1ll << index));
// Translation is done. It is safe to send the packet to memory.
memPort[0]->sendFunctional(new_pkt);
+ DPRINTF(GPUMem, "Functional sendRequest\n");
DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index %d: addr %#x\n", cu_id,
gpuDynInst->simdId, gpuDynInst->wfSlotId, index,
new_pkt->req->getPaddr());
}
void
-ComputeUnit::sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt)
+ComputeUnit::sendScalarRequest(GPUDynInstPtr gpuDynInst, PacketPtr pkt)
{
- EventFunctionWrapper *mem_req_event =
- memPort[index]->createMemReqEvent(pkt);
-
+ assert(pkt->isWrite() || pkt->isRead());
- // New SenderState for the memory access
- pkt->senderState = new ComputeUnit::DataPort::SenderState(gpuDynInst, index,
- nullptr);
+ BaseTLB::Mode tlb_mode = pkt->isRead() ? BaseTLB::Read : BaseTLB::Write;
- DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x sync scheduled\n",
- cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, index,
- pkt->req->getPaddr());
+ pkt->senderState =
+ new ComputeUnit::ScalarDTLBPort::SenderState(gpuDynInst);
- schedule(mem_req_event, curTick() + req_tick_latency);
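+ // wrap the original sender state inside the TLB translation state; the
+ // scalar DTLB port restores it from translation_state->saved when the
+ // translation response returns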
+ pkt->senderState =
+ new TheISA::GpuTLB::TranslationState(tlb_mode, shader->gpuTc, false,
+ pkt->senderState);
+
+ if (scalarDTLBPort->isStalled()) {
+ assert(scalarDTLBPort->retries.size());
+ scalarDTLBPort->retries.push_back(pkt);
+ } else if (!scalarDTLBPort->sendTimingReq(pkt)) {
+ scalarDTLBPort->stallPort();
+ scalarDTLBPort->retries.push_back(pkt);
+ } else {
+ DPRINTF(GPUTLB, "sent scalar %s translation request for addr %#x\n",
+ tlb_mode == BaseTLB::Read ? "read" : "write",
+ pkt->req->getVaddr());
+ }
}
void
-ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst, bool kernelLaunch,
+ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
+ bool kernelMemSync,
RequestPtr req)
{
- assert(gpuDynInst->isGlobalSeg());
+ assert(gpuDynInst->isGlobalSeg() ||
+ gpuDynInst->executedAs() == Enums::SC_GLOBAL);
if (!req) {
req = std::make_shared<Request>(
0, 0, 0, masterId(), 0, gpuDynInst->wfDynId);
}
+
+ // all mem sync requests have Paddr == 0
req->setPaddr(0);
- if (kernelLaunch) {
- req->setFlags(Request::KERNEL);
- }
- // for non-kernel MemFence operations, memorder flags are set depending
- // on which type of request is currently being sent, so this
- // should be set by the caller (e.g. if an inst has acq-rel
- // semantics, it will send one acquire req an one release req)
- gpuDynInst->setRequestFlags(req, kernelLaunch);
+ PacketPtr pkt = nullptr;
- // a mem fence must correspond to an acquire/release request
- assert(req->isAcquire() || req->isRelease());
+ if (kernelMemSync) {
+ if (gpuDynInst->isKernelLaunch()) {
+ req->setCacheCoherenceFlags(Request::ACQUIRE);
+ req->setReqInstSeqNum(gpuDynInst->seqNum());
+ req->setFlags(Request::KERNEL);
+ pkt = new Packet(req, MemCmd::MemSyncReq);
+ pkt->pushSenderState(
+ new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr));
- // create packet
- PacketPtr pkt = new Packet(req, MemCmd::MemFenceReq);
+ EventFunctionWrapper *mem_req_event =
+ memPort[0]->createMemReqEvent(pkt);
- // set packet's sender state
- pkt->senderState =
- new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr);
+ DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling "
+ "an acquire\n", cu_id, gpuDynInst->simdId,
+ gpuDynInst->wfSlotId, 0, pkt->req->getPaddr());
+
+ schedule(mem_req_event, curTick() + req_tick_latency);
+ } else {
+ assert(gpuDynInst->isEndOfKernel());
+
+ req->setCacheCoherenceFlags(Request::RELEASE);
+ req->setReqInstSeqNum(gpuDynInst->seqNum());
+ req->setFlags(Request::KERNEL);
+ pkt = new Packet(req, MemCmd::MemSyncReq);
+ pkt->pushSenderState(
+ new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr));
+
+ EventFunctionWrapper *mem_req_event =
+ memPort[0]->createMemReqEvent(pkt);
+
+ DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling "
+ "a release\n", cu_id, gpuDynInst->simdId,
+ gpuDynInst->wfSlotId, 0, pkt->req->getPaddr());
+
+ schedule(mem_req_event, curTick() + req_tick_latency);
+ }
+ } else {
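+ // non-kernel mem sync: the memory order/scope flags come from the
+ // instruction itself via setRequestFlags()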
+ gpuDynInst->setRequestFlags(req);
+
+ req->setReqInstSeqNum(gpuDynInst->seqNum());
+
+ pkt = new Packet(req, MemCmd::MemSyncReq);
+ pkt->pushSenderState(
+ new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr));
- // send the packet
- sendSyncRequest(gpuDynInst, 0, pkt);
+ EventFunctionWrapper *mem_req_event =
+ memPort[0]->createMemReqEvent(pkt);
+
+ DPRINTF(GPUPort,
+ "CU%d: WF[%d][%d]: index %d, addr %#x sync scheduled\n",
+ cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, 0,
+ pkt->req->getPaddr());
+
+ schedule(mem_req_event, curTick() + req_tick_latency);
+ }
}
void
Addr paddr = pkt->req->getPaddr();
- if (pkt->cmd != MemCmd::MemFenceResp) {
- int index = gpuDynInst->memStatusVector[paddr].back();
-
- DPRINTF(GPUMem, "Response for addr %#x, index %d\n",
- pkt->req->getPaddr(), index);
+ // mem sync resp and write-complete callback must be handled already in
+ // DataPort::recvTimingResp
+ assert(pkt->cmd != MemCmd::MemSyncResp);
+ assert(pkt->cmd != MemCmd::WriteCompleteResp);
- gpuDynInst->memStatusVector[paddr].pop_back();
- gpuDynInst->pAddr = pkt->req->getPaddr();
+ // this is for read, write and atomic
+ int index = gpuDynInst->memStatusVector[paddr].back();
- if (pkt->isRead() || pkt->isWrite()) {
-
- if (gpuDynInst->n_reg <= MAX_REGS_FOR_NON_VEC_MEM_INST) {
- gpuDynInst->statusBitVector &= (~(1ULL << index));
- } else {
- assert(gpuDynInst->statusVector[index] > 0);
- gpuDynInst->statusVector[index]--;
+ DPRINTF(GPUMem, "Response for addr %#x, index %d\n",
+ pkt->req->getPaddr(), index);
- if (!gpuDynInst->statusVector[index])
- gpuDynInst->statusBitVector &= (~(1ULL << index));
- }
+ gpuDynInst->memStatusVector[paddr].pop_back();
+ gpuDynInst->pAddr = pkt->req->getPaddr();
- DPRINTF(GPUMem, "bitvector is now %#x\n",
- gpuDynInst->statusBitVector);
+ gpuDynInst->statusBitVector &= (~(1ULL << index));
- if (gpuDynInst->statusBitVector == VectorMask(0)) {
- auto iter = gpuDynInst->memStatusVector.begin();
- auto end = gpuDynInst->memStatusVector.end();
+ DPRINTF(GPUMem, "bitvector is now %#x\n",
+ gpuDynInst->statusBitVector);
- while (iter != end) {
- assert(iter->second.empty());
- ++iter;
- }
+ if (gpuDynInst->statusBitVector == VectorMask(0)) {
+ auto iter = gpuDynInst->memStatusVector.begin();
+ auto end = gpuDynInst->memStatusVector.end();
- gpuDynInst->memStatusVector.clear();
+ while (iter != end) {
+ assert(iter->second.empty());
+ ++iter;
+ }
- if (gpuDynInst->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST)
- gpuDynInst->statusVector.clear();
+ // Calculate the difference between the arrival times of the first and
+ // last cache blocks, provided the arrival time of the first block was
+ // recorded.
+ if (compute_unit->headTailMap.count(gpuDynInst)) {
+ Tick headTick = compute_unit->headTailMap.at(gpuDynInst);
+ compute_unit->headTailLatency.sample(curTick() - headTick);
+ compute_unit->headTailMap.erase(gpuDynInst);
+ }
- compute_unit->globalMemoryPipe.handleResponse(gpuDynInst);
+ gpuDynInst->memStatusVector.clear();
- DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n",
- compute_unit->cu_id, gpuDynInst->simdId,
- gpuDynInst->wfSlotId);
+ // note: only handle read response here; for write, the response
+ // is separately handled when writeComplete callback is received
+ if (pkt->isRead()) {
+ gpuDynInst->
+ profileRoundTripTime(curTick(), InstMemoryHop::GMEnqueue);
+ compute_unit->globalMemoryPipe.handleResponse(gpuDynInst);
- // after clearing the status vectors,
- // see if there is a continuation to perform
- // the continuation may generate more work for
- // this memory request
- if (gpuDynInst->useContinuation) {
- assert(!gpuDynInst->isNoScope());
- gpuDynInst->execContinuation(
- gpuDynInst->staticInstruction(),
- gpuDynInst);
- }
- }
+ DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n",
+ compute_unit->cu_id, gpuDynInst->simdId,
+ gpuDynInst->wfSlotId);
}
} else {
- gpuDynInst->statusBitVector = VectorMask(0);
-
- if (gpuDynInst->useContinuation) {
- assert(!gpuDynInst->isNoScope());
- gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
- gpuDynInst);
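+ // this is the first cache block to return for the instruction; record
+ // its arrival tick so headTailLatency can be sampled when the last
+ // block arrives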
+ if (!compute_unit->headTailMap.count(gpuDynInst)) {
+ compute_unit->headTailMap.insert(
+ std::make_pair(gpuDynInst, curTick()));
}
}
// Because it's atomic operation, only need TLB translation state
prefetch_pkt->senderState =
new TheISA::GpuTLB::TranslationState(TLB_mode,
- computeUnit->shader->gpuTc,
- true);
+ computeUnit->shader->gpuTc, true);
// Currently prefetches are zero-latency, hence the sendFunctional
sendFunctional(prefetch_pkt);
pkt->req->getPaddr());
} else {
DPRINTF(GPUPort,
- "CU%d: WF[%d][%d]: index %d, addr %#x data req sent!\n",
+ "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x data "
+ "req sent!\n", compute_unit->cu_id, gpuDynInst->simdId,
+ gpuDynInst->wfSlotId, gpuDynInst->seqNum(), index,
+ pkt->req->getPaddr());
+ }
+}
+
+const char*
+ComputeUnit::ScalarDataPort::MemReqEvent::description() const
+{
+ return "ComputeUnit scalar memory request event";
+}
+
+void
+ComputeUnit::ScalarDataPort::MemReqEvent::process()
+{
+ SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
+ GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
+ ComputeUnit *compute_unit M5_VAR_USED = scalarDataPort->computeUnit;
+
+ if (!(scalarDataPort->sendTimingReq(pkt))) {
+ scalarDataPort->retries.push_back(pkt);
+
+ DPRINTF(GPUPort,
+ "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n",
compute_unit->cu_id, gpuDynInst->simdId,
- gpuDynInst->wfSlotId, index,
+ gpuDynInst->wfSlotId, scalarDataPort->index,
pkt->req->getPaddr());
+ } else {
+ DPRINTF(GPUPort,
+ "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x data "
+ "req sent!\n", compute_unit->cu_id, gpuDynInst->simdId,
+ gpuDynInst->wfSlotId, gpuDynInst->seqNum(),
+ scalarDataPort->index, pkt->req->getPaddr());
}
}
}
}
+bool
+ComputeUnit::ScalarDTLBPort::recvTimingResp(PacketPtr pkt)
+{
+ assert(pkt->senderState);
+
+ TheISA::GpuTLB::TranslationState *translation_state =
+ safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
+
+ // Page faults are not allowed
+ fatal_if(!translation_state->tlbEntry,
+ "Translation of vaddr %#x failed\n", pkt->req->getVaddr());
+
+ delete translation_state->tlbEntry;
+ assert(!translation_state->ports.size());
+
+ pkt->senderState = translation_state->saved;
+ delete translation_state;
+
+ ScalarDTLBPort::SenderState *sender_state =
+ safe_cast<ScalarDTLBPort::SenderState*>(pkt->senderState);
+
+ GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
+ delete pkt->senderState;
+
+ Wavefront *w M5_VAR_USED = gpuDynInst->wavefront();
+
+ DPRINTF(GPUTLB, "CU%d: WF[%d][%d][wv=%d]: scalar DTLB port received "
+ "translation: PA %#x -> %#x\n", computeUnit->cu_id, w->simdId,
+ w->wfSlotId, w->kernId, pkt->req->getVaddr(), pkt->req->getPaddr());
+
+ MemCmd mem_cmd;
+
+ if (pkt->cmd == MemCmd::ReadResp) {
+ mem_cmd = MemCmd::ReadReq;
+ } else if (pkt->cmd == MemCmd::WriteResp) {
+ mem_cmd = MemCmd::WriteReq;
+ } else {
+ fatal("Scalar DTLB receieved unexpected MemCmd response %s\n",
+ pkt->cmd.toString());
+ }
+
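+ // build a fresh request packet for the scalar data cache; the request
+ // now carries the translated physical address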
+ PacketPtr req_pkt = new Packet(pkt->req, mem_cmd);
+ req_pkt->dataStatic(pkt->getPtr<uint8_t>());
+ delete pkt;
+
+ req_pkt->senderState =
+ new ComputeUnit::ScalarDataPort::SenderState(gpuDynInst);
+
+ if (!computeUnit->scalarDataPort->sendTimingReq(req_pkt)) {
+ computeUnit->scalarDataPort->retries.push_back(req_pkt);
+ DPRINTF(GPUMem, "send scalar req failed for: %s\n",
+ gpuDynInst->disassemble());
+ } else {
+ DPRINTF(GPUMem, "send scalar req for: %s\n",
+ gpuDynInst->disassemble());
+ }
+
+ return true;
+}
+
bool
ComputeUnit::ITLBPort::recvTimingResp(PacketPtr pkt)
{
assert(pkt->senderState);
// pop off the TLB translation state
- TheISA::GpuTLB::TranslationState *translation_state =
- safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
+ TheISA::GpuTLB::TranslationState *translation_state
+ = safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
bool success = translation_state->tlbEntry != nullptr;
delete translation_state->tlbEntry;
scalarMemWritesPerWF = scalarMemWrites / completedWfs;
scalarMemReadsPerWF = scalarMemReads / completedWfs;
+ vectorMemReadsPerKiloInst
+ .name(name() + ".vector_mem_reads_per_kilo_inst")
+ .desc("Number of vector mem reads per kilo-instruction")
+ ;
+ vectorMemReadsPerKiloInst = (vectorMemReads / numInstrExecuted) * 1000;
+ vectorMemWritesPerKiloInst
+ .name(name() + ".vector_mem_writes_per_kilo_inst")
+ .desc("Number of vector mem writes per kilo-instruction")
+ ;
+ vectorMemWritesPerKiloInst = (vectorMemWrites / numInstrExecuted) * 1000;
+ vectorMemInstsPerKiloInst
+ .name(name() + ".vector_mem_insts_per_kilo_inst")
+ .desc("Number of vector mem insts per kilo-instruction")
+ ;
+ vectorMemInstsPerKiloInst =
+ ((vectorMemReads + vectorMemWrites) / numInstrExecuted) * 1000;
+ scalarMemReadsPerKiloInst
+ .name(name() + ".scalar_mem_reads_per_kilo_inst")
+ .desc("Number of scalar mem reads per kilo-instruction")
+ ;
+ scalarMemReadsPerKiloInst = (scalarMemReads / numInstrExecuted) * 1000;
+ scalarMemWritesPerKiloInst
+ .name(name() + ".scalar_mem_writes_per_kilo_inst")
+ .desc("Number of scalar mem writes per kilo-instruction")
+ ;
+ scalarMemWritesPerKiloInst = (scalarMemWrites / numInstrExecuted) * 1000;
+ scalarMemInstsPerKiloInst
+ .name(name() + ".scalar_mem_insts_per_kilo_inst")
+ .desc("Number of scalar mem insts per kilo-instruction")
+ ;
+ scalarMemInstsPerKiloInst =
+ ((scalarMemReads + scalarMemWrites) / numInstrExecuted) * 1000;
+
+ instCyclesVMemPerSimd
+ .init(numVectorALUs)
+ .name(name() + ".inst_cycles_vector_memory")
+ .desc("Number of cycles to send address, command, data from VRF to "
+ "vector memory unit, per SIMD")
+ ;
+
+ instCyclesScMemPerSimd
+ .init(numVectorALUs)
+ .name(name() + ".inst_cycles_scalar_memory")
+ .desc("Number of cycles to send address, command, data from SRF to "
+ "scalar memory unit, per SIMD")
+ ;
+
+ instCyclesLdsPerSimd
+ .init(numVectorALUs)
+ .name(name() + ".inst_cycles_lds")
+ .desc("Number of cycles to send address, command, data from VRF to "
+ "LDS unit, per SIMD")
+ ;
+
+ globalReads
+ .name(name() + ".global_mem_reads")
+ .desc("Number of reads to the global segment")
+ ;
+ globalWrites
+ .name(name() + ".global_mem_writes")
+ .desc("Number of writes to the global segment")
+ ;
+ globalMemInsts
+ .name(name() + ".global_mem_insts")
+ .desc("Number of memory instructions sent to the global segment")
+ ;
+ globalMemInsts = globalReads + globalWrites;
+ argReads
+ .name(name() + ".arg_reads")
+ .desc("Number of reads to the arg segment")
+ ;
+ argWrites
+ .name(name() + ".arg_writes")
+ .desc("NUmber of writes to the arg segment")
+ ;
+ argMemInsts
+ .name(name() + ".arg_mem_insts")
+ .desc("Number of memory instructions sent to the arg segment")
+ ;
+ argMemInsts = argReads + argWrites;
+ spillReads
+ .name(name() + ".spill_reads")
+ .desc("Number of reads to the spill segment")
+ ;
+ spillWrites
+ .name(name() + ".spill_writes")
+ .desc("Number of writes to the spill segment")
+ ;
+ spillMemInsts
+ .name(name() + ".spill_mem_insts")
+ .desc("Number of memory instructions sent to the spill segment")
+ ;
+ spillMemInsts = spillReads + spillWrites;
+ groupReads
+ .name(name() + ".group_reads")
+ .desc("Number of reads to the group segment")
+ ;
+ groupWrites
+ .name(name() + ".group_writes")
+ .desc("Number of writes to the group segment")
+ ;
+ groupMemInsts
+ .name(name() + ".group_mem_insts")
+ .desc("Number of memory instructions sent to the group segment")
+ ;
+ groupMemInsts = groupReads + groupWrites;
+ privReads
+ .name(name() + ".private_reads")
+ .desc("Number of reads to the private segment")
+ ;
+ privWrites
+ .name(name() + ".private_writes")
+ .desc("Number of writes to the private segment")
+ ;
+ privMemInsts
+ .name(name() + ".private_mem_insts")
+ .desc("Number of memory instructions sent to the private segment")
+ ;
+ privMemInsts = privReads + privWrites;
+ readonlyReads
+ .name(name() + ".readonly_reads")
+ .desc("Number of reads to the readonly segment")
+ ;
+ readonlyWrites
+ .name(name() + ".readonly_writes")
+ .desc("Number of memory instructions sent to the readonly segment")
+ ;
+ readonlyMemInsts
+ .name(name() + ".readonly_mem_insts")
+ .desc("Number of memory instructions sent to the readonly segment")
+ ;
+ readonlyMemInsts = readonlyReads + readonlyWrites;
+ kernargReads
+ .name(name() + ".kernarg_reads")
+ .desc("Number of reads sent to the kernarg segment")
+ ;
+ kernargWrites
+ .name(name() + ".kernarg_writes")
+ .desc("Number of memory instructions sent to the kernarg segment")
+ ;
+ kernargMemInsts
+ .name(name() + ".kernarg_mem_insts")
+ .desc("Number of memory instructions sent to the kernarg segment")
+ ;
+ kernargMemInsts = kernargReads + kernargWrites;
+
tlbCycles
.name(name() + ".tlb_cycles")
.desc("total number of cycles for all uncoalesced requests")
.desc("number of vec ops executed (e.g. WF size/inst)")
;
+ numVecOpsExecutedF16
+ .name(name() + ".num_vec_ops_f16_executed")
+ .desc("number of f16 vec ops executed (e.g. WF size/inst)")
+ ;
+
+ numVecOpsExecutedF32
+ .name(name() + ".num_vec_ops_f32_executed")
+ .desc("number of f32 vec ops executed (e.g. WF size/inst)")
+ ;
+
+ numVecOpsExecutedF64
+ .name(name() + ".num_vec_ops_f64_executed")
+ .desc("number of f64 vec ops executed (e.g. WF size/inst)")
+ ;
+
+ numVecOpsExecutedFMA16
+ .name(name() + ".num_vec_ops_fma16_executed")
+ .desc("number of fma16 vec ops executed (e.g. WF size/inst)")
+ ;
+
+ numVecOpsExecutedFMA32
+ .name(name() + ".num_vec_ops_fma32_executed")
+ .desc("number of fma32 vec ops executed (e.g. WF size/inst)")
+ ;
+
+ numVecOpsExecutedFMA64
+ .name(name() + ".num_vec_ops_fma64_executed")
+ .desc("number of fma64 vec ops executed (e.g. WF size/inst)")
+ ;
+
+ numVecOpsExecutedMAD16
+ .name(name() + ".num_vec_ops_mad16_executed")
+ .desc("number of mad16 vec ops executed (e.g. WF size/inst)")
+ ;
+
+ numVecOpsExecutedMAD32
+ .name(name() + ".num_vec_ops_mad32_executed")
+ .desc("number of mad32 vec ops executed (e.g. WF size/inst)")
+ ;
+
+ numVecOpsExecutedMAD64
+ .name(name() + ".num_vec_ops_mad64_executed")
+ .desc("number of mad64 vec ops executed (e.g. WF size/inst)")
+ ;
+
+ numVecOpsExecutedMAC16
+ .name(name() + ".num_vec_ops_mac16_executed")
+ .desc("number of mac16 vec ops executed (e.g. WF size/inst)")
+ ;
+
+ numVecOpsExecutedMAC32
+ .name(name() + ".num_vec_ops_mac32_executed")
+ .desc("number of mac32 vec ops executed (e.g. WF size/inst)")
+ ;
+
+ numVecOpsExecutedMAC64
+ .name(name() + ".num_vec_ops_mac64_executed")
+ .desc("number of mac64 vec ops executed (e.g. WF size/inst)")
+ ;
+
+ numVecOpsExecutedTwoOpFP
+ .name(name() + ".num_vec_ops_two_op_fp_executed")
+ .desc("number of two op FP vec ops executed (e.g. WF size/inst)")
+ ;
+
totalCycles
.name(name() + ".num_total_cycles")
.desc("number of cycles the CU ran for")
.desc("Vector Operations per cycle (this CU only)")
;
+ vpc_f16
+ .name(name() + ".vpc_f16")
+ .desc("F16 Vector Operations per cycle (this CU only)")
+ ;
+
+ vpc_f32
+ .name(name() + ".vpc_f32")
+ .desc("F32 Vector Operations per cycle (this CU only)")
+ ;
+
+ vpc_f64
+ .name(name() + ".vpc_f64")
+ .desc("F64 Vector Operations per cycle (this CU only)")
+ ;
+
numALUInstsExecuted
.name(name() + ".num_alu_insts_executed")
.desc("Number of dynamic non-GM memory insts executed")
ipc = numInstrExecuted / totalCycles;
vpc = numVecOpsExecuted / totalCycles;
+ vpc_f16 = numVecOpsExecutedF16 / totalCycles;
+ vpc_f32 = numVecOpsExecutedF32 / totalCycles;
+ vpc_f64 = numVecOpsExecutedF64 / totalCycles;
numTimesWgBlockedDueVgprAlloc
.name(name() + ".times_wg_blocked_due_vgpr_alloc")
- .desc("Number of times WGs are blocked due to VGPR allocation per SIMD")
+ .desc("Number of times WGs are blocked due to VGPR allocation per "
+ "SIMD")
+ ;
+
+ numTimesWgBlockedDueSgprAlloc
+ .name(name() + ".times_wg_blocked_due_sgpr_alloc")
+ .desc("Number of times WGs are blocked due to SGPR allocation per "
+ "SIMD")
;
dynamicGMemInstrCnt
.name(name() + ".global_mem_instr_cnt")
- .desc("dynamic global memory instructions count")
+ .desc("dynamic non-flat global memory instruction count")
+ ;
+
+ dynamicFlatMemInstrCnt
+ .name(name() + ".flat_global_mem_instr_cnt")
+ .desc("dynamic flat global memory instruction count")
;
dynamicLMemInstrCnt
.desc("number of completed wavefronts")
;
+ completedWGs
+ .name(name() + ".num_completed_wgs")
+ .desc("number of completed workgroups")
+ ;
+
numCASOps
.name(name() + ".num_CAS_ops")
.desc("number of compare and swap operations")
.desc("number of compare and swap operations that failed")
;
+ headTailLatency
+ .init(0, 1000000, 10000)
+ .name(name() + ".head_tail_latency")
+ .desc("ticks between first and last cache block arrival at coalescer")
+ .flags(Stats::pdf | Stats::oneline)
+ ;
+
+ waveLevelParallelism
+ .init(0, shader->n_wf * numVectorALUs, 1)
+ .name(name() + ".wlp")
+ .desc("wave level parallelism: count of active waves at wave launch")
+ ;
+
+ instInterleave
+ .init(numVectorALUs, 0, 20, 1)
+ .name(name() + ".interleaving")
+ .desc("Measure of instruction interleaving per SIMD")
+ ;
+
// register stats of pipeline stages
fetchStage.regStats();
scoreboardCheckStage.regStats();
scheduleStage.regStats();
execStage.regStats();
- // register stats of memory pipeline
+ // register stats of memory pipelines
globalMemoryPipe.regStats();
localMemoryPipe.regStats();
+ scalarMemoryPipe.regStats();
+
+ registerManager->regStats();
}
void
}
} else {
if (gpuDynInst->isALU()) {
+ shader->total_valu_insts++;
+ if (shader->total_valu_insts == shader->max_valu_insts) {
+ exitSimLoop("max vALU insts");
+ }
vALUInsts++;
instCyclesVALU++;
threadCyclesVALU += gpuDynInst->wavefront()->execMask().count();
} else if (gpuDynInst->isStore()) {
vectorMemWrites++;
}
+
+ if (gpuDynInst->isLoad()) {
+ switch (gpuDynInst->executedAs()) {
+ case Enums::SC_SPILL:
+ spillReads++;
+ break;
+ case Enums::SC_GLOBAL:
+ globalReads++;
+ break;
+ case Enums::SC_GROUP:
+ groupReads++;
+ break;
+ case Enums::SC_PRIVATE:
+ privReads++;
+ break;
+ case Enums::SC_READONLY:
+ readonlyReads++;
+ break;
+ case Enums::SC_KERNARG:
+ kernargReads++;
+ break;
+ case Enums::SC_ARG:
+ argReads++;
+ break;
+ case Enums::SC_NONE:
+ /**
+ * this case can occur for flat mem insts
+ * that execute with EXEC = 0
+ */
+ break;
+ default:
+ fatal("%s has no valid segment\n", gpuDynInst->disassemble());
+ break;
+ }
+ } else if (gpuDynInst->isStore()) {
+ switch (gpuDynInst->executedAs()) {
+ case Enums::SC_SPILL:
+ spillWrites++;
+ break;
+ case Enums::SC_GLOBAL:
+ globalWrites++;
+ break;
+ case Enums::SC_GROUP:
+ groupWrites++;
+ break;
+ case Enums::SC_PRIVATE:
+ privWrites++;
+ break;
+ case Enums::SC_READONLY:
+ readonlyWrites++;
+ break;
+ case Enums::SC_KERNARG:
+ kernargWrites++;
+ break;
+ case Enums::SC_ARG:
+ argWrites++;
+ break;
+ case Enums::SC_NONE:
+ /**
+ * this case can occur for flat mem insts
+ * that execute with EXEC = 0
+ */
+ break;
+ default:
+ fatal("%s has no valid segment\n", gpuDynInst->disassemble());
+ break;
+ }
+ }
}
}
*page_stat_file << std::dec << iter.second.second << std::endl;
}
}
- }
+}
bool
ComputeUnit::isDone() const
{
- for (int i = 0; i < numSIMDs; ++i) {
- if (!isSimdDone(i)) {
+ for (int i = 0; i < numVectorALUs; ++i) {
+ if (!isVectorAluIdle(i)) {
return false;
}
}
- bool glbMemBusRdy = true;
- for (int j = 0; j < numGlbMemUnits; ++j) {
- glbMemBusRdy &= vrfToGlobalMemPipeBus[j].rdy();
+ // TODO: FIXME if more than 1 of any memory pipe supported
+ if (!srfToScalarMemPipeBus.rdy()) {
+ return false;
+ }
+ if (!vrfToGlobalMemPipeBus.rdy()) {
+ return false;
}
- bool locMemBusRdy = true;
- for (int j = 0; j < numLocMemUnits; ++j) {
- locMemBusRdy &= vrfToLocalMemPipeBus[j].rdy();
+ if (!vrfToLocalMemPipeBus.rdy()) {
+ return false;
}
- if (!globalMemoryPipe.isGMLdRespFIFOWrRdy() ||
- !globalMemoryPipe.isGMStRespFIFOWrRdy() ||
- !globalMemoryPipe.isGMReqFIFOWrRdy() || !localMemoryPipe.isLMReqFIFOWrRdy()
+ if (!globalMemoryPipe.isGMReqFIFOWrRdy()
+ || !localMemoryPipe.isLMReqFIFOWrRdy()
|| !localMemoryPipe.isLMRespFIFOWrRdy() || !locMemToVrfBus.rdy() ||
- !glbMemToVrfBus.rdy() || !locMemBusRdy || !glbMemBusRdy) {
+ !glbMemToVrfBus.rdy() || !scalarMemToSrfBus.rdy()) {
return false;
}
}
int32_t
-ComputeUnit::getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const
+ComputeUnit::getRefCounter(const uint32_t dispatchId,
+ const uint32_t wgId) const
{
return lds.getRefCounter(dispatchId, wgId);
}
bool
-ComputeUnit::isSimdDone(uint32_t simdId) const
+ComputeUnit::isVectorAluIdle(uint32_t simdId) const
{
- assert(simdId < numSIMDs);
-
- for (int i=0; i < numGlbMemUnits; ++i) {
- if (!vrfToGlobalMemPipeBus[i].rdy())
- return false;
- }
- for (int i=0; i < numLocMemUnits; ++i) {
- if (!vrfToLocalMemPipeBus[i].rdy())
- return false;
- }
- if (!aluPipe[simdId].rdy()) {
- return false;
- }
+ assert(simdId < numVectorALUs);
for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf){
- if (wfList[simdId][i_wf]->status != Wavefront::S_STOPPED) {
+ if (wfList[simdId][i_wf]->getStatus() != Wavefront::S_STOPPED) {
return false;
}
}
#include <deque>
#include <map>
-#include <unordered_map>
+#include <unordered_set>
#include <vector>
#include "base/callback.hh"
#include "base/statistics.hh"
#include "base/types.hh"
+#include "config/the_gpu_isa.hh"
#include "enums/PrefetchType.hh"
#include "gpu-compute/exec_stage.hh"
#include "gpu-compute/fetch_stage.hh"
#include "gpu-compute/global_memory_pipeline.hh"
+#include "gpu-compute/hsa_queue_entry.hh"
#include "gpu-compute/local_memory_pipeline.hh"
-#include "gpu-compute/qstruct.hh"
+#include "gpu-compute/register_manager.hh"
+#include "gpu-compute/scalar_memory_pipeline.hh"
#include "gpu-compute/schedule_stage.hh"
#include "gpu-compute/scoreboard_check_stage.hh"
#include "mem/port.hh"
#include "mem/token_port.hh"
#include "sim/clocked_object.hh"
-static const int MAX_REGS_FOR_NON_VEC_MEM_INST = 1;
-static const int MAX_WIDTH_FOR_MEM_INST = 32;
-
-class NDRange;
+class HSAQueueEntry;
+class LdsChunk;
+class ScalarRegisterFile;
class Shader;
class VectorRegisterFile;
RR
};
-// List of execution units
-enum EXEC_UNIT
-{
- SIMD0 = 0,
- SIMD1,
- SIMD2,
- SIMD3,
- GLBMEM_PIPE,
- LDSMEM_PIPE,
- NUM_UNITS
-};
-
enum TLB_CACHE
{
TLB_MISS_CACHE_MISS = 0,
class ComputeUnit : public ClockedObject
{
public:
- FetchStage fetchStage;
- ScoreboardCheckStage scoreboardCheckStage;
- ScheduleStage scheduleStage;
- ExecStage execStage;
- GlobalMemPipeline globalMemoryPipe;
- LocalMemPipeline localMemoryPipe;
+
+
+ // Execution resources
+ //
+ // The ordering of units is:
+ // Vector ALUs
+ // Scalar ALUs
+ // GM Pipe
+ // LM Pipe
+ // Scalar Mem Pipe
+ //
+ // Note: the ordering of units is important and the code assumes the
+ // above ordering. However, there may be more than one resource of
+ // each type (e.g., 4 VALUs or 2 SALUs)
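+ // With, e.g., 4 VALUs, 2 SALUs, and one unit per memory pipe, the
+ // ordering above yields unit indices: VALUs 0-3, SALUs 4-5, GM pipe 6,
+ // LM pipe 7, scalar memory pipe 8.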
+
+ int numVectorGlobalMemUnits;
+ // Resource control for global memory to VRF data/address bus
+ WaitClass glbMemToVrfBus;
+ // Resource control for Vector Register File->Global Memory pipe buses
+ WaitClass vrfToGlobalMemPipeBus;
+ // Resource control for Vector Global Memory execution unit
+ WaitClass vectorGlobalMemUnit;
+
+ int numVectorSharedMemUnits;
+ // Resource control for local memory to VRF data/address bus
+ WaitClass locMemToVrfBus;
+ // Resource control for Vector Register File->Local Memory pipe buses
+ WaitClass vrfToLocalMemPipeBus;
+ // Resource control for Vector Shared/Local Memory execution unit
+ WaitClass vectorSharedMemUnit;
+
+ int numScalarMemUnits;
+ // Resource control for scalar memory to SRF data/address bus
+ WaitClass scalarMemToSrfBus;
+ // Resource control for Scalar Register File->Scalar Memory pipe buses
+ WaitClass srfToScalarMemPipeBus;
+ // Resource control for Scalar Memory execution unit
+ WaitClass scalarMemUnit;
+
+ // vector ALU execution resources
+ int numVectorALUs;
+ std::vector<WaitClass> vectorALUs;
+
+ // scalar ALU execution resources
+ int numScalarALUs;
+ std::vector<WaitClass> scalarALUs;
+
+ // Return total number of execution units on this CU
+ int numExeUnits() const;
+ // index into readyList of the first memory unit
+ int firstMemUnit() const;
+ // index into readyList of the last memory unit
+ int lastMemUnit() const;
+ // index into scalarALUs vector of SALU used by the wavefront
+ int mapWaveToScalarAlu(Wavefront *w) const;
+ // index into readyList of SALU used by wavefront
+ int mapWaveToScalarAluGlobalIdx(Wavefront *w) const;
+ // index into readyList of Global Memory unit used by wavefront
+ int mapWaveToGlobalMem(Wavefront *w) const;
+ // index into readyList of Local Memory unit used by wavefront
+ int mapWaveToLocalMem(Wavefront *w) const;
+ // index into readyList of Scalar Memory unit used by wavefront
+ int mapWaveToScalarMem(Wavefront *w) const;
+
+ int vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes
+ int coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes
+ int numCyclesPerStoreTransfer; // number of cycles per vector store
+ int numCyclesPerLoadTransfer; // number of cycles per vector load
// Buffers used to communicate between various pipeline stages
+ // At a high level, the following intra-/inter-stage communication occurs:
+ // SCB to SCH: readyList provides per exec resource list of waves that
+ // passed dependency and readiness checks. If selected by
+ // scheduler, attempt to add wave to schList conditional on
+ // RF support.
+ // SCH: schList holds waves that are gathering operands or waiting
+ // for execution resource availability. Once ready, waves are
+ // placed on the dispatchList as candidates for execution. A wave
+ // may spend multiple cycles in SCH stage, on the schList due to
+ // RF access conflicts or execution resource contention.
+ // SCH to EX: dispatchList holds waves that are ready to be executed.
+ // LM/FLAT arbitration may remove an LM wave and place it
+ // back on the schList. RF model may also force a wave back
+ // to the schList if using the detailed model.
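+ // In short, a ready wave flows readyList -> schList -> dispatchList ->
+ // exec, and may bounce from the dispatchList back to the schList when
+ // arbitration or the register-file model rejects it.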
+
// List of waves which are ready to be scheduled.
// Each execution resource has a ready list. readyList is
// used to communicate between scoreboardCheck stage and
// schedule stage
- // TODO: make enum to index readyList
std::vector<std::vector<Wavefront*>> readyList;
- // Stores the status of waves. A READY implies the
- // wave is ready to be scheduled this cycle and
- // is already present in the readyList. waveStatusList is
- // used to communicate between scoreboardCheck stage and
- // schedule stage
- // TODO: convert std::pair to a class to increase readability
- std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>> waveStatusList;
-
// List of waves which will be dispatched to
- // each execution resource. A FILLED implies
+ // each execution resource. An EXREADY implies
// dispatch list is non-empty and
// execution unit has something to execute
// this cycle. Currently, the dispatch list of
// and exec stage
// TODO: convert std::pair to a class to increase readability
std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> dispatchList;
+ // track presence of dynamic instructions in the Schedule pipeline
+ // stage. This is used to check the readiness of the oldest,
+ // non-dispatched instruction of every WF in the Scoreboard stage.
+ std::unordered_set<uint64_t> pipeMap;
+
+ RegisterManager* registerManager;
+
+ FetchStage fetchStage;
+ ScoreboardCheckStage scoreboardCheckStage;
+ ScheduleStage scheduleStage;
+ ExecStage execStage;
+ GlobalMemPipeline globalMemoryPipe;
+ LocalMemPipeline localMemoryPipe;
+ ScalarMemPipeline scalarMemoryPipe;
+
+ EventFunctionWrapper tickEvent;
- int rrNextMemID; // used by RR WF exec policy to cycle through WF's
- int rrNextALUWp;
typedef ComputeUnitParams Params;
std::vector<std::vector<Wavefront*>> wfList;
int cu_id;
// array of vector register files, one per SIMD
std::vector<VectorRegisterFile*> vrf;
- // Number of vector ALU units (SIMDs) in CU
- int numSIMDs;
+ // array of scalar register files, one per SIMD
+ std::vector<ScalarRegisterFile*> srf;
+
+ // Width per VALU/SIMD unit: number of work items that can be executed
+ // on the vector ALU simultaneously in a SIMD unit
+ int simdWidth;
// number of pipe stages for bypassing data to next dependent single
// precision vector instruction inside the vector ALU pipeline
int spBypassPipeLength;
// number of pipe stages for bypassing data to next dependent double
// precision vector instruction inside the vector ALU pipeline
int dpBypassPipeLength;
- // number of cycles per issue period
- int issuePeriod;
+ // number of pipe stages for scalar ALU
+ int scalarPipeStages;
+ // number of pipe stages for operand collection & distribution network
+ int operandNetworkLength;
+ // number of cycles per instruction issue period
+ Cycles issuePeriod;
+
+ // VRF to GM Bus latency
+ Cycles vrf_gm_bus_latency;
+ // SRF to Scalar Mem Bus latency
+ Cycles srf_scm_bus_latency;
+ // VRF to LM Bus latency
+ Cycles vrf_lm_bus_latency;
- // Number of global and local memory execution resources in CU
- int numGlbMemUnits;
- int numLocMemUnits;
// tracks the last cycle a vector instruction was executed on a SIMD
std::vector<uint64_t> lastExecCycle;
+ // Track the amount of interleaving between wavefronts on each SIMD.
+ // This stat is sampled using instExecPerSimd to compute the number of
+ // instructions that have been executed on a SIMD between a WF executing
+ // two successive instructions.
+ Stats::VectorDistribution instInterleave;
+
+ // tracks the number of dyn inst executed per SIMD
+ std::vector<uint64_t> instExecPerSimd;
+
// true if we allow a separate TLB per lane
bool perLaneTLB;
// if 0, TLB prefetching is off.
Enums::PrefetchType prefetchType;
EXEC_POLICY exec_policy;
- bool xact_cas_mode;
bool debugSegFault;
+ // Idle CU timeout in ticks
+ Tick idleCUTimeout;
+ int idleWfs;
bool functionalTLB;
bool localMemBarrier;
Shader *shader;
uint32_t barrier_id;
- // vector of Vector ALU (MACC) pipelines
- std::vector<WaitClass> aluPipe;
- // minimum issue period per SIMD unit (in cycles)
- std::vector<WaitClass> wfWait;
-
- // Resource control for Vector Register File->Global Memory pipe buses
- std::vector<WaitClass> vrfToGlobalMemPipeBus;
- // Resource control for Vector Register File->Local Memory pipe buses
- std::vector<WaitClass> vrfToLocalMemPipeBus;
- int nextGlbMemBus;
- int nextLocMemBus;
- // Resource control for global memory to VRF data/address bus
- WaitClass glbMemToVrfBus;
- // Resource control for local memory to VRF data/address bus
- WaitClass locMemToVrfBus;
-
- uint32_t vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes
- uint32_t coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes
- uint32_t numCyclesPerStoreTransfer; // number of cycles per vector store
- uint32_t numCyclesPerLoadTransfer; // number of cycles per vector load
Tick req_tick_latency;
Tick resp_tick_latency;
- // number of vector registers being reserved for each SIMD unit
+ /**
+ * Number of WFs to schedule to each SIMD. This vector is populated
+ * by hasDispResources(), and consumed by the subsequent call to
+ * dispWorkgroup(), to schedule the specified number of WFs to the
+ * SIMD units. Entry I provides the number of WFs to schedule to SIMD I.
+ */
+ std::vector<int> numWfsToSched;
+
+ // number of currently reserved vector registers per SIMD unit
std::vector<int> vectorRegsReserved;
+ // number of currently reserved scalar registers per SIMD unit
+ std::vector<int> scalarRegsReserved;
// number of vector registers per SIMD unit
- uint32_t numVecRegsPerSimd;
- // Support for scheduling VGPR status update events
- std::vector<std::pair<uint32_t, uint32_t> > regIdxVec;
- std::vector<uint64_t> timestampVec;
- std::vector<uint8_t> statusVec;
+ int numVecRegsPerSimd;
+ // number of available scalar registers per SIMD unit
+ int numScalarRegsPerSimd;
- void
- registerEvent(uint32_t simdId,
- uint32_t regIdx,
- uint32_t operandSize,
- uint64_t when,
- uint8_t newStatus) {
- regIdxVec.push_back(std::make_pair(simdId, regIdx));
- timestampVec.push_back(when);
- statusVec.push_back(newStatus);
- if (operandSize > 4) {
- regIdxVec.push_back(std::make_pair(simdId,
- ((regIdx + 1) %
- numVecRegsPerSimd)));
- timestampVec.push_back(when);
- statusVec.push_back(newStatus);
- }
- }
-
- void updateEvents();
+ void updateReadyList(int unitId);
// this hash map will keep track of page divergence
// per memory instruction per wavefront. The hash map
// is cleared in GPUDynInst::updateStats() in gpu_dyn_inst.cc.
std::map<Addr, int> pagesTouched;
+ void insertInPipeMap(Wavefront *w);
+ void deleteFromPipeMap(Wavefront *w);
+
ComputeUnit(const Params *p);
~ComputeUnit();
- int spBypassLength() { return spBypassPipeLength; };
- int dpBypassLength() { return dpBypassPipeLength; };
- int storeBusLength() { return numCyclesPerStoreTransfer; };
- int loadBusLength() { return numCyclesPerLoadTransfer; };
- int wfSize() const { return wavefrontSize; };
- void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs);
+ // Timing Functions
+ int oprNetPipeLength() const { return operandNetworkLength; }
+ int simdUnitWidth() const { return simdWidth; }
+ int spBypassLength() const { return spBypassPipeLength; }
+ int dpBypassLength() const { return dpBypassPipeLength; }
+ int scalarPipeLength() const { return scalarPipeStages; }
+ int storeBusLength() const { return numCyclesPerStoreTransfer; }
+ int loadBusLength() const { return numCyclesPerLoadTransfer; }
+ int wfSize() const { return wavefrontSize; }
+
void exec();
void initiateFetch(Wavefront *wavefront);
void fetch(PacketPtr pkt, Wavefront *wavefront);
- void fillKernelState(Wavefront *w, NDRange *ndr);
+ void fillKernelState(Wavefront *w, HSAQueueEntry *task);
void startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
- NDRange *ndr);
-
- void StartWorkgroup(NDRange *ndr);
- int ReadyWorkgroup(NDRange *ndr);
-
- bool isVecAlu(int unitId) { return unitId >= SIMD0 && unitId <= SIMD3; }
- bool isGlbMem(int unitId) { return unitId == GLBMEM_PIPE; }
- bool isShrMem(int unitId) { return unitId == LDSMEM_PIPE; }
- int GlbMemUnitId() { return GLBMEM_PIPE; }
- int ShrMemUnitId() { return LDSMEM_PIPE; }
- int nextGlbRdBus() { return (++nextGlbMemBus) % numGlbMemUnits; }
- int nextLocRdBus() { return (++nextLocMemBus) % numLocMemUnits; }
+ HSAQueueEntry *task, bool fetchContext=false);
+
+ void doInvalidate(RequestPtr req, int kernId);
+ void doFlush(GPUDynInstPtr gpuDynInst);
+
+ void dispWorkgroup(HSAQueueEntry *task, bool startFromScheduler=false);
+ bool hasDispResources(HSAQueueEntry *task);
+
+ int cacheLineSize() const { return _cacheLineSize; }
+ int getCacheLineBits() const { return cacheLineBits; }
+
/* This function cycles through all the wavefronts in all the phases to see
* if all of the wavefronts which should be associated with one barrier
* (denoted with _barrier_id), are all at the same barrier in the program;
* if so, return true.
*/
int AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots);
- bool cedeSIMD(int simdId, int wfSlotId);
- template<typename c0, typename c1> void doSmReturn(GPUDynInstPtr gpuDynInst);
+ template<typename c0, typename c1>
+ void doSmReturn(GPUDynInstPtr gpuDynInst);
+
virtual void init() override;
void sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt);
- void sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt);
+ void sendScalarRequest(GPUDynInstPtr gpuDynInst, PacketPtr pkt);
void injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
- bool kernelLaunch=true,
+ bool kernelMemSync,
RequestPtr req=nullptr);
void handleMemPacket(PacketPtr pkt, int memport_index);
bool processTimingPacket(PacketPtr pkt);
MasterID masterId() { return _masterId; }
bool isDone() const;
- bool isSimdDone(uint32_t) const;
+ bool isVectorAluIdle(uint32_t simdId) const;
protected:
MasterID _masterId;
Stats::Scalar scalarMemReads;
Stats::Formula scalarMemReadsPerWF;
+ Stats::Formula vectorMemReadsPerKiloInst;
+ Stats::Formula vectorMemWritesPerKiloInst;
+ Stats::Formula vectorMemInstsPerKiloInst;
+ Stats::Formula scalarMemReadsPerKiloInst;
+ Stats::Formula scalarMemWritesPerKiloInst;
+ Stats::Formula scalarMemInstsPerKiloInst;
+
+ // Cycles required to send register source (addr and data) from
+ // register files to memory pipeline, per SIMD.
+ Stats::Vector instCyclesVMemPerSimd;
+ Stats::Vector instCyclesScMemPerSimd;
+ Stats::Vector instCyclesLdsPerSimd;
+
+ Stats::Scalar globalReads;
+ Stats::Scalar globalWrites;
+ Stats::Formula globalMemInsts;
+ Stats::Scalar argReads;
+ Stats::Scalar argWrites;
+ Stats::Formula argMemInsts;
+ Stats::Scalar spillReads;
+ Stats::Scalar spillWrites;
+ Stats::Formula spillMemInsts;
+ Stats::Scalar groupReads;
+ Stats::Scalar groupWrites;
+ Stats::Formula groupMemInsts;
+ Stats::Scalar privReads;
+ Stats::Scalar privWrites;
+ Stats::Formula privMemInsts;
+ Stats::Scalar readonlyReads;
+ Stats::Scalar readonlyWrites;
+ Stats::Formula readonlyMemInsts;
+ Stats::Scalar kernargReads;
+ Stats::Scalar kernargWrites;
+ Stats::Formula kernargMemInsts;
+
+ int activeWaves;
+ Stats::Distribution waveLevelParallelism;
+
void updateInstStats(GPUDynInstPtr gpuDynInst);
// the following stats compute the avg. TLB accesslatency per
// over all memory instructions executed over all wavefronts
// how many touched 0-4 pages, 4-8, ..., 60-64 pages
Stats::Distribution pageDivergenceDist;
+ // count of non-flat global memory vector instructions executed
Stats::Scalar dynamicGMemInstrCnt;
+ // count of flat global memory vector instructions executed
+ Stats::Scalar dynamicFlatMemInstrCnt;
Stats::Scalar dynamicLMemInstrCnt;
Stats::Scalar wgBlockedDueLdsAllocation;
- // Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are active
- // when the instruction is committed, this number is still incremented by 1
+ // Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are
+ // active when the instruction is committed, this number is still
+ // incremented by 1
Stats::Scalar numInstrExecuted;
// Number of cycles among successive instruction executions across all
// wavefronts of the same CU
Stats::Distribution execRateDist;
// number of individual vector operations executed
Stats::Scalar numVecOpsExecuted;
+ // number of individual f16 vector operations executed
+ Stats::Scalar numVecOpsExecutedF16;
+ // number of individual f32 vector operations executed
+ Stats::Scalar numVecOpsExecutedF32;
+ // number of individual f64 vector operations executed
+ Stats::Scalar numVecOpsExecutedF64;
+ // number of individual FMA 16,32,64 vector operations executed
+ Stats::Scalar numVecOpsExecutedFMA16;
+ Stats::Scalar numVecOpsExecutedFMA32;
+ Stats::Scalar numVecOpsExecutedFMA64;
+ // number of individual MAC 16,32,64 vector operations executed
+ Stats::Scalar numVecOpsExecutedMAC16;
+ Stats::Scalar numVecOpsExecutedMAC32;
+ Stats::Scalar numVecOpsExecutedMAC64;
+ // number of individual MAD 16,32,64 vector operations executed
+ Stats::Scalar numVecOpsExecutedMAD16;
+ Stats::Scalar numVecOpsExecutedMAD32;
+ Stats::Scalar numVecOpsExecutedMAD64;
+ // total number of two op FP vector operations executed
+ Stats::Scalar numVecOpsExecutedTwoOpFP;
// Total cycles that something is running on the GPU
Stats::Scalar totalCycles;
Stats::Formula vpc; // vector ops per cycle
+ Stats::Formula vpc_f16; // vector ops per cycle
+ Stats::Formula vpc_f32; // vector ops per cycle
+ Stats::Formula vpc_f64; // vector ops per cycle
Stats::Formula ipc; // vector instructions per cycle
Stats::Distribution controlFlowDivergenceDist;
Stats::Distribution activeLanesPerGMemInstrDist;
Stats::Formula numALUInstsExecuted;
// number of times a WG can not start due to lack of free VGPRs in SIMDs
Stats::Scalar numTimesWgBlockedDueVgprAlloc;
+ // number of times a WG can not start due to lack of free SGPRs in SIMDs
+ Stats::Scalar numTimesWgBlockedDueSgprAlloc;
Stats::Scalar numCASOps;
Stats::Scalar numFailedCASOps;
Stats::Scalar completedWfs;
- // flag per vector SIMD unit that is set when there is at least one
- // WV that has a vector ALU instruction as the oldest in its
- // Instruction Buffer: Defined in the Scoreboard stage, consumed
- // by the Execute stage.
- std::vector<bool> vectorAluInstAvail;
- // number of available (oldest) LDS instructions that could have
- // been issued to the LDS at a specific issue slot
- int shrMemInstAvail;
- // number of available Global memory instructions that could have
- // been issued to TCP at a specific issue slot
- int glbMemInstAvail;
+ Stats::Scalar completedWGs;
+
+ // distribution of the latency difference between the first and last
+ // cache block arrival ticks
+ Stats::Distribution headTailLatency;
void
regStats() override;
int32_t
getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const;
- int cacheLineSize() const { return _cacheLineSize; }
-
bool
sendToLds(GPUDynInstPtr gpuDynInst) __attribute__((warn_unused_result));
};
+ // Scalar data cache access port
+ class ScalarDataPort : public MasterPort
+ {
+ public:
+ ScalarDataPort(const std::string &_name, ComputeUnit *_cu,
+ PortID _index)
+ : MasterPort(_name, _cu, _index), computeUnit(_cu), index(_index)
+ {
+ (void)index;
+ }
+
+ bool recvTimingResp(PacketPtr pkt) override;
+ void recvReqRetry() override;
+
+ struct SenderState : public Packet::SenderState
+ {
+ SenderState(GPUDynInstPtr gpuDynInst,
+ Packet::SenderState *sender_state=nullptr)
+ : _gpuDynInst(gpuDynInst), saved(sender_state)
+ {
+ }
+
+ GPUDynInstPtr _gpuDynInst;
+ Packet::SenderState *saved;
+ };
+
+ class MemReqEvent : public Event
+ {
+ private:
+ ScalarDataPort *scalarDataPort;
+ PacketPtr pkt;
+
+ public:
+ MemReqEvent(ScalarDataPort *_scalar_data_port, PacketPtr _pkt)
+ : Event(), scalarDataPort(_scalar_data_port), pkt(_pkt)
+ {
+ setFlags(Event::AutoDelete);
+ }
+
+ void process();
+ const char *description() const;
+ };
+
+ std::deque<PacketPtr> retries;
+
+ private:
+ ComputeUnit *computeUnit;
+ PortID index;
+ };
+
// Instruction cache access port
class SQCPort : public MasterPort
{
{
Wavefront *wavefront;
Packet::SenderState *saved;
+ // kernel id to be used in handling I-Cache invalidate response
+ int kernId;
SenderState(Wavefront *_wavefront, Packet::SenderState
- *sender_state=nullptr)
- : wavefront(_wavefront), saved(sender_state) { }
+ *sender_state=nullptr, int _kernId=-1)
+ : wavefront(_wavefront), saved(sender_state),
+ kernId(_kernId) { }
};
std::deque<std::pair<PacketPtr, Wavefront*>> retries;
virtual void recvReqRetry();
};
+ class ScalarDTLBPort : public MasterPort
+ {
+ public:
+ ScalarDTLBPort(const std::string &_name, ComputeUnit *_cu)
+ : MasterPort(_name, _cu), computeUnit(_cu), stalled(false)
+ {
+ }
+
+ struct SenderState : public Packet::SenderState
+ {
+ SenderState(GPUDynInstPtr gpuDynInst) : _gpuDynInst(gpuDynInst) { }
+ GPUDynInstPtr _gpuDynInst;
+ };
+
+ bool recvTimingResp(PacketPtr pkt) override;
+ void recvReqRetry() override { assert(false); }
+
+ bool isStalled() const { return stalled; }
+ void stallPort() { stalled = true; }
+ void unstallPort() { stalled = false; }
+
+ std::deque<PacketPtr> retries;
+
+ private:
+ ComputeUnit *computeUnit;
+ bool stalled;
+ };
+
class ITLBPort : public MasterPort
{
public:
std::vector<DataPort*> memPort;
// port to the TLB hierarchy (i.e., the L1 TLB)
std::vector<DTLBPort*> tlbPort;
+ // port to the scalar data cache
+ ScalarDataPort *scalarDataPort;
+ // port to the scalar data TLB
+ ScalarDTLBPort *scalarDTLBPort;
// port to the SQC (i.e. the I-cache)
SQCPort *sqcPort;
// port to the SQC TLB (there's a separate TLB for each I-cache)
tlbPort[idx] = new DTLBPort(csprintf("%s-port%d", name(), idx),
this, idx);
return *tlbPort[idx];
+ } else if (if_name == "scalar_port") {
+ scalarDataPort = new ScalarDataPort(csprintf("%s-port%d", name(),
+ idx), this, idx);
+ return *scalarDataPort;
+ } else if (if_name == "scalar_tlb_port") {
+ scalarDTLBPort = new ScalarDTLBPort(csprintf("%s-port", name()),
+ this);
+ return *scalarDTLBPort;
} else if (if_name == "sqc_port") {
sqcPort = new SQCPort(csprintf("%s-port%d", name(), idx),
this, idx);
}
}
- // xact_cas_load()
- class waveIdentifier
- {
- public:
- waveIdentifier() { }
- waveIdentifier(int _simdId, int _wfSlotId)
- : simdId(_simdId), wfSlotId(_wfSlotId) { }
-
- int simdId;
- int wfSlotId;
- };
-
- class waveQueue
- {
- public:
- std::list<waveIdentifier> waveIDQueue;
- };
- std::map<unsigned, waveQueue> xactCasLoadMap;
-
- uint64_t getAndIncSeqNum() { return globalSeqNum++; }
+ InstSeqNum getAndIncSeqNum() { return globalSeqNum++; }
private:
const int _cacheLineSize;
- uint64_t globalSeqNum;
+ int cacheLineBits;
+ InstSeqNum globalSeqNum;
int wavefrontSize;
- GPUStaticInst *kernelLaunchInst;
+
+ // holds the arrival time of the first cache block related to a
+ // particular GPUDynInst. This is used to calculate the difference
+ // between the first and last cache block arrival times.
+ std::map<GPUDynInstPtr, Tick> headTailMap;
};
#endif // __COMPUTE_UNIT_HH__
#include "gpu-compute/dispatcher.hh"
-#include "cpu/base.hh"
#include "debug/GPUDisp.hh"
-#include "gpu-compute/cl_driver.hh"
-#include "gpu-compute/cl_event.hh"
+#include "debug/GPUKernelInfo.hh"
+#include "debug/GPUWgLatency.hh"
+#include "gpu-compute/gpu_command_processor.hh"
+#include "gpu-compute/hsa_queue_entry.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/wavefront.hh"
-#include "mem/packet_access.hh"
-
-GpuDispatcher *GpuDispatcher::instance = nullptr;
-
-GpuDispatcher::GpuDispatcher(const Params *p)
- : DmaDevice(p), _masterId(p->system->getMasterId(this, "disp")),
- pioAddr(p->pio_addr), pioSize(4096), pioDelay(p->pio_latency),
- dispatchCount(0), dispatchActive(false), cpu(p->cpu),
- shader(p->shader_pointer), driver(p->cl_driver),
- tickEvent([this]{ exec(); }, "GPU Dispatcher tick",
- false, Event::CPU_Tick_Pri)
+#include "sim/syscall_emul_buf.hh"
+#include "sim/system.hh"
+
+GPUDispatcher::GPUDispatcher(const Params *p)
+ : SimObject(p), shader(nullptr), gpuCmdProc(nullptr),
+ tickEvent([this]{ exec(); },
+ "GPU Dispatcher tick", false, Event::CPU_Tick_Pri),
+ dispatchActive(false)
{
- shader->handshake(this);
- driver->handshake(this);
-
- ndRange.wg_disp_rem = false;
- ndRange.globalWgId = 0;
-
schedule(&tickEvent, 0);
+}
- // translation port for the dispatcher
- tlbPort = new TLBPort(csprintf("%s-port%d", name()), this);
+GPUDispatcher::~GPUDispatcher()
+{
+}
- num_kernelLaunched
+void
+GPUDispatcher::regStats()
+{
+ numKernelLaunched
.name(name() + ".num_kernel_launched")
.desc("number of kernel launched")
;
+
+ cyclesWaitingForDispatch
+ .name(name() + ".cycles_wait_dispatch")
+ .desc("number of cycles with outstanding wavefronts "
+ "that are waiting to be dispatched")
+ ;
+}
+
+HSAQueueEntry*
+GPUDispatcher::hsaTask(int disp_id)
+{
+ assert(hsaQueueEntries.find(disp_id) != hsaQueueEntries.end());
+ return hsaQueueEntries[disp_id];
}
-GpuDispatcher *GpuDispatcherParams::create()
+void
+GPUDispatcher::setCommandProcessor(GPUCommandProcessor *gpu_cmd_proc)
{
- GpuDispatcher *dispatcher = new GpuDispatcher(this);
- GpuDispatcher::setInstance(dispatcher);
+ gpuCmdProc = gpu_cmd_proc;
+}
- return GpuDispatcher::getInstance();
+void
+GPUDispatcher::setShader(Shader *new_shader)
+{
+ shader = new_shader;
}
void
-GpuDispatcher::serialize(CheckpointOut &cp) const
+GPUDispatcher::serialize(CheckpointOut &cp) const
{
Tick event_tick = 0;
- if (ndRange.wg_disp_rem)
- fatal("Checkpointing not supported during active workgroup execution");
-
if (tickEvent.scheduled())
event_tick = tickEvent.when();
SERIALIZE_SCALAR(event_tick);
-
}
void
-GpuDispatcher::unserialize(CheckpointIn &cp)
+GPUDispatcher::unserialize(CheckpointIn &cp)
{
Tick event_tick;
UNSERIALIZE_SCALAR(event_tick);
- if (event_tick)
+ if (event_tick) {
schedule(&tickEvent, event_tick);
+ }
}
-AddrRangeList
-GpuDispatcher::getAddrRanges() const
-{
- AddrRangeList ranges;
-
- DPRINTF(GPUDisp, "dispatcher registering addr range at %#x size %#x\n",
- pioAddr, pioSize);
-
- ranges.push_back(RangeSize(pioAddr, pioSize));
-
- return ranges;
-}
-
-Tick
-GpuDispatcher::read(PacketPtr pkt)
+/**
+ * After all relevant HSA data structures have been traversed/extracted
+ * from memory by the CP, dispatch() is called on the dispatcher. This will
+ * schedule a dispatch event that, when triggered, will attempt to dispatch
+ * the WGs associated with the given task to the CUs.
+ */
+void
+GPUDispatcher::dispatch(HSAQueueEntry *task)
{
- assert(pkt->getAddr() >= pioAddr);
- assert(pkt->getAddr() < pioAddr + pioSize);
-
- int offset = pkt->getAddr() - pioAddr;
- pkt->allocate();
+ ++numKernelLaunched;
- DPRINTF(GPUDisp, " read register %#x size=%d\n", offset, pkt->getSize());
+ DPRINTF(GPUDisp, "launching kernel: %s, dispatch ID: %d\n",
+ task->kernelName(), task->dispatchId());
- if (offset < 8) {
- assert(!offset);
- assert(pkt->getSize() == 8);
+ execIds.push(task->dispatchId());
+ dispatchActive = true;
+ hsaQueueEntries.emplace(task->dispatchId(), task);
- uint64_t retval = dispatchActive;
- pkt->setLE(retval);
- } else {
- offset -= 8;
- assert(offset + pkt->getSize() < sizeof(HsaQueueEntry));
- char *curTaskPtr = (char*)&curTask;
-
- memcpy(pkt->getPtr<const void*>(), curTaskPtr + offset, pkt->getSize());
+ if (!tickEvent.scheduled()) {
+ schedule(&tickEvent, curTick() + shader->clockPeriod());
}
-
- pkt->makeAtomicResponse();
-
- return pioDelay;
}
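The dispatch() flow above boils down to: enqueue the task's dispatch ID, mark dispatch as active, and arm the tick event only if one is not already pending. A minimal standalone sketch of that pattern (MiniDispatcher and its members are made-up names; this is not gem5 code):

#include <iostream>
#include <queue>

// Minimal, self-contained illustration of the dispatch() flow above: enqueue
// the task, mark dispatch active, and arm the tick event only if one is not
// already pending.
class MiniDispatcher
{
  public:
    void dispatch(int dispatch_id)
    {
        execIds.push(dispatch_id);
        dispatchActive = true;
        if (!tickScheduled) {
            tickScheduled = true;   // stands in for schedule(&tickEvent, ...)
            std::cout << "tick event armed by task " << dispatch_id << "\n";
        }
    }

  private:
    std::queue<int> execIds;
    bool dispatchActive = false;
    bool tickScheduled = false;
};

int main()
{
    MiniDispatcher d;
    d.dispatch(0);   // arms the tick event
    d.dispatch(1);   // queued; the event is already pending
}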
-Tick
-GpuDispatcher::write(PacketPtr pkt)
+void
+GPUDispatcher::exec()
{
- assert(pkt->getAddr() >= pioAddr);
- assert(pkt->getAddr() < pioAddr + pioSize);
-
- int offset = pkt->getAddr() - pioAddr;
-
-#if TRACING_ON
- uint64_t data_val = 0;
-
- switch (pkt->getSize()) {
- case 1:
- data_val = pkt->getLE<uint8_t>();
- break;
- case 2:
- data_val = pkt->getLE<uint16_t>();
- break;
- case 4:
- data_val = pkt->getLE<uint32_t>();
- break;
- case 8:
- data_val = pkt->getLE<uint64_t>();
- break;
- default:
- DPRINTF(GPUDisp, "bad size %d\n", pkt->getSize());
- }
+ int fail_count(0);
- DPRINTF(GPUDisp, "write register %#x value %#x size=%d\n", offset, data_val,
- pkt->getSize());
-#endif
- if (!offset) {
- static int nextId = 0;
-
- // The depends field of the qstruct, which was previously unused, is
- // used to communicate with simulated application.
- if (curTask.depends) {
- HostState hs;
- shader->ReadMem((uint64_t)(curTask.depends), &hs,
- sizeof(HostState), 0);
+ /**
+ * There are potentially multiple outstanding kernel launches.
+ * It is possible that the workgroups in a different kernel
+ * can fit on the GPU even if another kernel's workgroups cannot
+ */
+ DPRINTF(GPUDisp, "Launching %d Kernels\n", execIds.size());
- // update event start time (in nano-seconds)
- uint64_t start = curTick() / 1000;
+ if (execIds.size() > 0) {
+ ++cyclesWaitingForDispatch;
+ }
- shader->WriteMem((uint64_t)(&((_cl_event*)hs.event)->start),
- &start, sizeof(uint64_t), 0);
+ /**
+ * dispatch work cannot start until the kernel's invalidate is
+ * completely finished; hence, the kernel always initiates the
+ * invalidate first and waits until the invalidate is done
+ */
+ while (execIds.size() > fail_count) {
+ int exec_id = execIds.front();
+ auto task = hsaQueueEntries[exec_id];
+ bool launched(false);
+
+ // invalidate is needed before starting dispatch
+ if (shader->impl_kern_boundary_sync) {
+ // try to invalidate cache
+ shader->prepareInvalidate(task);
+ } else {
+ // kern boundary sync is not set, skip invalidate
+ task->markInvDone();
}
- // launch kernel
- ++num_kernelLaunched;
-
- NDRange *ndr = &(ndRangeMap[nextId]);
- // copy dispatch info
- ndr->q = curTask;
-
- // update the numDispTask polled by the runtime
- accessUserVar(cpu, (uint64_t)(curTask.numDispLeft), 0, 1);
+ /**
+ * invalidate is still ongoing, put the kernel on the queue to
+ * retry later
+ */
+ if (!task->isInvDone()) {
+ execIds.push(exec_id);
+ ++fail_count;
- ndr->numWgTotal = 1;
+ DPRINTF(GPUDisp, "kernel %d failed to launch, due to [%d] pending"
+ " invalidate requests\n", exec_id, task->outstandingInvs());
- for (int i = 0; i < 3; ++i) {
- ndr->wgId[i] = 0;
- ndr->numWg[i] = divCeil(curTask.gdSize[i], curTask.wgSize[i]);
- ndr->numWgTotal *= ndr->numWg[i];
+ // try the next kernel_id
+ execIds.pop();
+ continue;
}
- ndr->numWgCompleted = 0;
- ndr->globalWgId = 0;
- ndr->wg_disp_rem = true;
- ndr->execDone = false;
- ndr->addrToNotify = (volatile bool*)curTask.addrToNotify;
- ndr->numDispLeft = (volatile uint32_t*)curTask.numDispLeft;
- ndr->dispatchId = nextId;
- ndr->curCid = pkt->req->contextId();
- DPRINTF(GPUDisp, "launching kernel %d\n",nextId);
- execIds.push(nextId);
- ++nextId;
-
- dispatchActive = true;
-
- if (!tickEvent.scheduled()) {
- schedule(&tickEvent, curTick() + shader->ticks(1));
- }
- } else {
- // populate current task struct
- // first 64 bits are launch reg
- offset -= 8;
- assert(offset < sizeof(HsaQueueEntry));
- char *curTaskPtr = (char*)&curTask;
- memcpy(curTaskPtr + offset, pkt->getPtr<const void*>(), pkt->getSize());
- }
-
- pkt->makeAtomicResponse();
-
- return pioDelay;
-}
-
-
-Port &
-GpuDispatcher::getPort(const std::string &if_name, PortID idx)
-{
- if (if_name == "translation_port") {
- return *tlbPort;
- }
-
- return DmaDevice::getPort(if_name, idx);
-}
-
-void
-GpuDispatcher::exec()
-{
- int fail_count = 0;
-
- // There are potentially multiple outstanding kernel launches.
- // It is possible that the workgroups in a different kernel
- // can fit on the GPU even if another kernel's workgroups cannot
- DPRINTF(GPUDisp, "Launching %d Kernels\n", execIds.size());
-
- while (execIds.size() > fail_count) {
- int execId = execIds.front();
-
- while (ndRangeMap[execId].wg_disp_rem) {
- //update the thread context
- shader->updateContext(ndRangeMap[execId].curCid);
-
- // attempt to dispatch_workgroup
- if (!shader->dispatch_workgroups(&ndRangeMap[execId])) {
- // if we failed try the next kernel,
- // it may have smaller workgroups.
- // put it on the queue to rety latter
- DPRINTF(GPUDisp, "kernel %d failed to launch\n", execId);
- execIds.push(execId);
+ // kernel invalidate is done, start workgroup dispatch
+ while (!task->dispComplete()) {
+ // update the thread context
+ shader->updateContext(task->contextId());
+
+ // attempt to dispatch workgroup
+ DPRINTF(GPUWgLatency, "Attempt Kernel Launch cycle:%d kernel:%d\n",
+ curTick(), exec_id);
+
+ if (!shader->dispatchWorkgroups(task)) {
+ /**
+ * if we failed, try the next kernel;
+ * it may have smaller workgroups.
+ * put it on the queue to retry later
+ */
+ DPRINTF(GPUDisp, "kernel %d failed to launch\n", exec_id);
+ execIds.push(exec_id);
++fail_count;
break;
+ } else if (!launched) {
+ launched = true;
+ DPRINTF(GPUKernelInfo, "Launched kernel %d\n", exec_id);
}
}
- // let's try the next kernel_id
+
+ // try the next kernel_id
execIds.pop();
}
DPRINTF(GPUDisp, "Returning %d Kernels\n", doneIds.size());
- if (doneIds.size() && cpu) {
- shader->hostWakeUp(cpu);
- }
-
while (doneIds.size()) {
- // wakeup the CPU if any Kernels completed this cycle
- DPRINTF(GPUDisp, "WorkGroup %d completed\n", doneIds.front());
+ DPRINTF(GPUDisp, "Kernel %d completed\n", doneIds.front());
doneIds.pop();
}
}
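The exec() loop above uses a retry-queue idiom: a kernel that cannot make progress this cycle is pushed back onto execIds and fail_count is bumped, so the loop terminates once every remaining entry has already failed this pass. A standalone sketch of that idiom, with a hypothetical tryLaunch() standing in for the invalidate check plus workgroup dispatch:

#include <cstddef>
#include <iostream>
#include <queue>

// Illustration of the exec() retry idiom: pop a kernel ID and, if it cannot
// be launched, push it back and bump fail_count. The loop ends when every
// entry still in the queue has already failed this pass.
bool tryLaunch(int id) { return id % 2 == 0; }   // hypothetical stand-in

int main()
{
    std::queue<int> execIds;
    for (int id = 0; id < 4; ++id) {
        execIds.push(id);
    }

    std::size_t fail_count = 0;
    while (execIds.size() > fail_count) {
        int id = execIds.front();
        execIds.pop();
        if (!tryLaunch(id)) {
            execIds.push(id);   // retry on a later pass
            ++fail_count;
        } else {
            std::cout << "launched kernel " << id << "\n";
        }
    }
    std::cout << execIds.size() << " kernel(s) deferred to the next cycle\n";
}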
-void
-GpuDispatcher::notifyWgCompl(Wavefront *w)
+bool
+GPUDispatcher::isReachingKernelEnd(Wavefront *wf)
{
- int kern_id = w->kernId;
- DPRINTF(GPUDisp, "notify WgCompl %d\n",kern_id);
- assert(ndRangeMap[kern_id].dispatchId == kern_id);
- ndRangeMap[kern_id].numWgCompleted++;
-
- if (ndRangeMap[kern_id].numWgCompleted == ndRangeMap[kern_id].numWgTotal) {
- ndRangeMap[kern_id].execDone = true;
- doneIds.push(kern_id);
-
- if (ndRangeMap[kern_id].addrToNotify) {
- accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].addrToNotify), 1,
- 0);
- }
-
- accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].numDispLeft), 0, -1);
+ int kern_id = wf->kernId;
+ assert(hsaQueueEntries.find(kern_id) != hsaQueueEntries.end());
+ auto task = hsaQueueEntries[kern_id];
+ assert(task->dispatchId() == kern_id);
+
+ /**
+ * whether the next workgroup to dispatch is the final one in the kernel;
+ * the +1 accounts for checking before the workgroup is actually dispatched
+ */
+ return (task->numWgCompleted() + 1 == task->numWgTotal());
+}
- // update event end time (in nano-seconds)
- if (ndRangeMap[kern_id].q.depends) {
- HostState *host_state = (HostState*)ndRangeMap[kern_id].q.depends;
- uint64_t event;
- shader->ReadMem((uint64_t)(&host_state->event), &event,
- sizeof(uint64_t), 0);
+/**
+ * update the counter of outstanding inv requests for the kernel
+ * kern_id: kernel id
+ * val: +1/-1, increment or decrement the counter (default: -1)
+ */
+void
+GPUDispatcher::updateInvCounter(int kern_id, int val)
+{
+ assert(val == -1 || val == 1);
- uint64_t end = curTick() / 1000;
+ auto task = hsaQueueEntries[kern_id];
+ task->updateOutstandingInvs(val);
- shader->WriteMem((uint64_t)(&((_cl_event*)event)->end), &end,
- sizeof(uint64_t), 0);
- }
+ // kernel invalidate is done, schedule dispatch work
+ if (task->isInvDone() && !tickEvent.scheduled()) {
+ schedule(&tickEvent, curTick() + shader->clockPeriod());
}
+}
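A toy version of the gating described above (assumed names, not the gem5 API): each issued invalidate increments a per-kernel counter, each acknowledgement decrements it, and dispatch is only re-armed once the counter reaches zero.

#include <cassert>
#include <iostream>

// Toy model of the outstanding-invalidate gate: dispatch work for a kernel
// may only be (re)scheduled once all of its invalidate requests have been
// acknowledged. Illustrative only.
struct ToyTask
{
    int outstandingInvs = 0;
    bool invDone() const { return outstandingInvs == 0; }
};

void updateInvCounter(ToyTask &task, bool &dispatchScheduled, int val = -1)
{
    assert(val == -1 || val == 1);
    task.outstandingInvs += val;

    if (task.invDone() && !dispatchScheduled) {
        dispatchScheduled = true;   // stands in for schedule(&tickEvent, ...)
        std::cout << "invalidate done, dispatch scheduled\n";
    }
}

int main()
{
    ToyTask task;
    bool dispatchScheduled = false;

    updateInvCounter(task, dispatchScheduled, 1);   // invalidate issued
    updateInvCounter(task, dispatchScheduled, 1);   // another issued
    updateInvCounter(task, dispatchScheduled);      // first ack
    updateInvCounter(task, dispatchScheduled);      // last ack -> schedule
}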
- if (!tickEvent.scheduled()) {
- schedule(&tickEvent, curTick() + shader->ticks(1));
- }
+/**
+ * update the counter of outstanding wb requests for the kernel
+ * kern_id: kernel id
+ * val: +1/-1, increment or decrement the counter (default: -1)
+ *
+ * return true if all wbs are done for the kernel
+ */
+bool
+GPUDispatcher::updateWbCounter(int kern_id, int val)
+{
+ assert(val == -1 || val == 1);
+
+ auto task = hsaQueueEntries[kern_id];
+ task->updateOutstandingWbs(val);
+
+ // true: WB is done, false: WB is still ongoing
+ return (task->outstandingWbs() == 0);
}
-void
-GpuDispatcher::scheduleDispatch()
-{
- if (!tickEvent.scheduled())
- schedule(&tickEvent, curTick() + shader->ticks(1));
+/**
+ * get kernel's outstanding cache writeback requests
+ */
+int
+GPUDispatcher::getOutstandingWbs(int kernId)
+{
+ auto task = hsaQueueEntries[kernId];
+
+ return task->outstandingWbs();
}
+/**
+ * When an end program instruction detects that the last WF in
+ * a WG has completed, it will call this method on the dispatcher.
+ * If we detect that this is the last WG for the given task, then
+ * we ring the completion signal, which is used by the CPU to
+ * synchronize with the GPU. The HSAPP is also notified that the
+ * task has completed so it can be removed from its task queues.
+ */
void
-GpuDispatcher::accessUserVar(BaseCPU *cpu, uint64_t addr, int val, int off)
+GPUDispatcher::notifyWgCompl(Wavefront *wf)
{
- if (cpu) {
- if (off) {
- shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::ReadReq,
- true);
- val += off;
+ int kern_id = wf->kernId;
+ DPRINTF(GPUDisp, "notify WgCompl %d\n", wf->wgId);
+ auto task = hsaQueueEntries[kern_id];
+ assert(task->dispatchId() == kern_id);
+ task->notifyWgCompleted();
+
+ DPRINTF(GPUWgLatency, "WG Complete cycle:%d wg:%d kernel:%d cu:%d\n",
+ curTick(), wf->wgId, kern_id, wf->computeUnit->cu_id);
+
+ if (task->numWgCompleted() == task->numWgTotal()) {
+ // Notify the HSA PP that this kernel is complete
+ gpuCmdProc->hsaPacketProc()
+ .finishPkt(task->dispPktPtr(), task->queueId());
+ if (task->completionSignal()) {
+ // The signal value is aligned 8 bytes from
+ // the actual handle in the runtime
+ Addr signal_addr = task->completionSignal() + sizeof(Addr);
+ DPRINTF(GPUDisp, "HSA AQL Kernel Complete! Triggering "
+ "completion signal: %x!\n", signal_addr);
+
+ /**
+ * HACK: The semantics of the HSA signal are to decrement
+ * the current signal value. We cheat here and read out
+ * the value from main memory using a functional access and
+ * then just DMA the decremented value. This is because
+ * the DMA controller does not currently support GPU
+ * atomics.
+ */
+ auto *tc = gpuCmdProc->system()->threads[0];
+ auto &virt_proxy = tc->getVirtProxy();
+ TypedBufferArg<Addr> prev_signal(signal_addr);
+ prev_signal.copyIn(virt_proxy);
+
+ Addr *new_signal = new Addr;
+ *new_signal = (Addr)*prev_signal - 1;
+
+ gpuCmdProc->dmaWriteVirt(signal_addr, sizeof(Addr), nullptr,
+ new_signal, 0);
+ } else {
+ DPRINTF(GPUDisp, "HSA AQL Kernel Complete! No completion "
+ "signal\n");
}
- shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::WriteReq, true);
- } else {
- panic("Cannot find host");
+ DPRINTF(GPUWgLatency, "Kernel Complete ticks:%d kernel:%d\n",
+ curTick(), kern_id);
+ DPRINTF(GPUKernelInfo, "Completed kernel %d\n", kern_id);
}
-}
-
-// helper functions for driver to retrieve GPU attributes
-int
-GpuDispatcher::getNumCUs()
-{
- return shader->cuList.size();
-}
-int
-GpuDispatcher::wfSize() const
-{
- return shader->cuList[0]->wfSize();
+ if (!tickEvent.scheduled()) {
+ schedule(&tickEvent, curTick() + shader->clockPeriod());
+ }
}
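The completion-signal workaround in notifyWgCompl() is a read-modify-write: read the current signal value with a functional access, then write back value-1 over DMA because the DMA controller has no atomics. A standalone sketch of that sequence against a simulated address space (the std::map stands in for guest memory; everything here is illustrative, not the gem5 proxy/DMA API):

#include <cstdint>
#include <iostream>
#include <map>

// Standalone illustration of the signal-decrement workaround: read the
// current signal value, decrement it, and write the new value back.
using Addr = std::uint64_t;
std::map<Addr, std::uint64_t> memory;   // fake guest memory

void completeKernel(Addr signal_handle)
{
    // the signal value lives one Addr-sized slot past the handle
    Addr signal_addr = signal_handle + sizeof(Addr);

    std::uint64_t prev = memory[signal_addr];   // "functional" read
    memory[signal_addr] = prev - 1;             // "DMA" write of prev - 1

    std::cout << "signal @0x" << std::hex << signal_addr
              << " decremented to " << std::dec << memory[signal_addr] << "\n";
}

int main()
{
    Addr handle = 0x1000;
    memory[handle + sizeof(Addr)] = 1;   // one kernel outstanding
    completeKernel(handle);              // signal drops to 0
}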
void
-GpuDispatcher::setFuncargsSize(int funcargs_size)
+GPUDispatcher::scheduleDispatch()
{
- shader->funcargs_size = funcargs_size;
+ if (!tickEvent.scheduled()) {
+ schedule(&tickEvent, curTick() + shader->clockPeriod());
+ }
}
-uint32_t
-GpuDispatcher::getStaticContextSize() const
+GPUDispatcher *GPUDispatcherParams::create()
{
- return shader->cuList[0]->wfList[0][0]->getStaticContextSize();
+ return new GPUDispatcher(this);
}
* POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __GPU_DISPATCHER_HH__
-#define __GPU_DISPATCHER_HH__
+/**
+ * @file
+ * The GPUDispatcher is the component of the shader that is responsible
+ * for creating and dispatching WGs to the compute units. If all WGs in
+ * a kernel cannot be dispatched simultaneously, then the dispatcher will
+ * keep track of all pending WGs and dispatch them as resources become
+ * available.
+ */
+
+#ifndef __GPU_COMPUTE_DISPATCHER_HH__
+#define __GPU_COMPUTE_DISPATCHER_HH__
#include <queue>
+#include <unordered_map>
#include <vector>
#include "base/statistics.hh"
-#include "dev/dma_device.hh"
-#include "gpu-compute/compute_unit.hh"
-#include "gpu-compute/ndrange.hh"
-#include "gpu-compute/qstruct.hh"
-#include "mem/port.hh"
-#include "params/GpuDispatcher.hh"
+#include "dev/hsa/hsa_packet.hh"
+#include "params/GPUDispatcher.hh"
+#include "sim/sim_object.hh"
-class BaseCPU;
+class GPUCommandProcessor;
+class HSAQueueEntry;
class Shader;
+class Wavefront;
-class GpuDispatcher : public DmaDevice
+class GPUDispatcher : public SimObject
{
- public:
- typedef GpuDispatcherParams Params;
-
- MasterID masterId() { return _masterId; }
-
- protected:
- MasterID _masterId;
-
- // Base and length of PIO register space
- Addr pioAddr;
- Addr pioSize;
- Tick pioDelay;
-
- HsaQueueEntry curTask;
-
- std::unordered_map<int, NDRange> ndRangeMap;
- NDRange ndRange;
-
- // list of kernel_ids to launch
- std::queue<int> execIds;
- // list of kernel_ids that have finished
- std::queue<int> doneIds;
-
- uint64_t dispatchCount;
- // is there a kernel in execution?
- bool dispatchActive;
-
- BaseCPU *cpu;
- Shader *shader;
- ClDriver *driver;
- EventFunctionWrapper tickEvent;
-
-
- static GpuDispatcher *instance;
-
- // sycall emulation mode can have only 1 application running(?)
- // else we have to do some pid based tagging
- // unused
- typedef std::unordered_map<uint64_t, uint64_t> TranslationBuffer;
- TranslationBuffer tlb;
-
- public:
- /*statistics*/
- Stats::Scalar num_kernelLaunched;
- GpuDispatcher(const Params *p);
-
- ~GpuDispatcher() { }
-
- void exec();
- virtual void serialize(CheckpointOut &cp) const override;
- virtual void unserialize(CheckpointIn &cp) override;
- void notifyWgCompl(Wavefront *w);
- void scheduleDispatch();
- void accessUserVar(BaseCPU *cpu, uint64_t addr, int val, int off);
-
- // using singleton so that glue code can pass pointer locations
- // to the dispatcher. when there are multiple dispatchers, we can
- // call something like getInstance(index)
- static void
- setInstance(GpuDispatcher *_instance)
- {
- instance = _instance;
- }
-
- static GpuDispatcher* getInstance() { return instance; }
-
- class TLBPort : public MasterPort
- {
- public:
-
- TLBPort(const std::string &_name, GpuDispatcher *_dispatcher)
- : MasterPort(_name, _dispatcher), dispatcher(_dispatcher) { }
-
- protected:
- GpuDispatcher *dispatcher;
-
- virtual bool recvTimingResp(PacketPtr pkt) { return true; }
- virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
- virtual void recvFunctional(PacketPtr pkt) { }
- virtual void recvRangeChange() { }
- virtual void recvReqRetry() { }
-
- };
-
- TLBPort *tlbPort;
-
- Port &getPort(const std::string &if_name,
- PortID idx=InvalidPortID) override;
-
- AddrRangeList getAddrRanges() const override;
- Tick read(PacketPtr pkt) override;
- Tick write(PacketPtr pkt) override;
-
- // helper functions to retrieve/set GPU attributes
- int getNumCUs();
- int wfSize() const;
- void setFuncargsSize(int funcargs_size);
-
- /** Returns the size of the static hardware context of a wavefront */
- uint32_t getStaticContextSize() const;
+ public:
+ typedef GPUDispatcherParams Params;
+
+ GPUDispatcher(const Params *p);
+ ~GPUDispatcher();
+
+ void serialize(CheckpointOut &cp) const override;
+ void unserialize(CheckpointIn &cp) override;
+ void regStats() override;
+ void setCommandProcessor(GPUCommandProcessor *gpu_cmd_proc);
+ void setShader(Shader *new_shader);
+ void exec();
+ bool isReachingKernelEnd(Wavefront *wf);
+ void updateInvCounter(int kern_id, int val=-1);
+ bool updateWbCounter(int kern_id, int val=-1);
+ int getOutstandingWbs(int kern_id);
+ void notifyWgCompl(Wavefront *wf);
+ void scheduleDispatch();
+ void dispatch(HSAQueueEntry *task);
+ HSAQueueEntry* hsaTask(int disp_id);
+
+ private:
+ Shader *shader;
+ GPUCommandProcessor *gpuCmdProc;
+ EventFunctionWrapper tickEvent;
+ std::unordered_map<int, HSAQueueEntry*> hsaQueueEntries;
+ // list of kernel_ids to launch
+ std::queue<int> execIds;
+ // list of kernel_ids that have finished
+ std::queue<int> doneIds;
+ // is there a kernel in execution?
+ bool dispatchActive;
+ /*statistics*/
+ Stats::Scalar numKernelLaunched;
+ Stats::Scalar cyclesWaitingForDispatch;
};
-#endif // __GPU_DISPATCHER_HH__
+#endif // __GPU_COMPUTE_DISPATCHER_HH__
#include "gpu-compute/exec_stage.hh"
+#include <sstream>
+
+#include "base/trace.hh"
+#include "debug/GPUSched.hh"
#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"
-ExecStage::ExecStage(const ComputeUnitParams *p) : numSIMDs(p->num_SIMDs),
- numMemUnits(p->num_global_mem_pipes + p->num_shared_mem_pipes),
- vectorAluInstAvail(nullptr), glbMemInstAvail(nullptr),
- shrMemInstAvail(nullptr), lastTimeInstExecuted(false),
+ExecStage::ExecStage(const ComputeUnitParams *p) : lastTimeInstExecuted(false),
thisTimeInstExecuted(false), instrExecuted (false),
executionResourcesUsed(0)
{
computeUnit = cu;
_name = computeUnit->name() + ".ExecStage";
dispatchList = &computeUnit->dispatchList;
- vectorAluInstAvail = &(computeUnit->vectorAluInstAvail);
- glbMemInstAvail= &(computeUnit->glbMemInstAvail);
- shrMemInstAvail= &(computeUnit->shrMemInstAvail);
idle_dur = 0;
}
void
ExecStage::collectStatistics(enum STAT_STATUS stage, int unitId) {
if (stage == IdleExec) {
- // count cycles of no vector ALU instruction executed
- // even if one was the oldest in a WV of that vector SIMD unit
- if (computeUnit->isVecAlu(unitId) && vectorAluInstAvail->at(unitId)) {
- numCyclesWithNoInstrTypeIssued[unitId]++;
- }
-
- // count cycles of no global memory (vector) instruction executed
- // even if one was the oldest in a WV of that vector SIMD unit
- if (computeUnit->isGlbMem(unitId) && *glbMemInstAvail > 0) {
- numCyclesWithNoInstrTypeIssued[unitId]++;
- (*glbMemInstAvail)--;
- }
-
- // count cycles of no shared memory (vector) instruction executed
- // even if one was the oldest in a WV of that vector SIMD unit
- if (computeUnit->isShrMem(unitId) && *shrMemInstAvail > 0) {
- numCyclesWithNoInstrTypeIssued[unitId]++;
- (*shrMemInstAvail)--;
- }
+ // count cycles when no instruction is issued to a specific
+ // execution resource
+ numCyclesWithNoInstrTypeIssued[unitId]++;
} else if (stage == BusyExec) {
- // count the number of cycles an instruction to a specific unit
- // was issued
+ // count the number of cycles an instruction to a specific execution
+ // resource type was issued
numCyclesWithInstrTypeIssued[unitId]++;
thisTimeInstExecuted = true;
instrExecuted = true;
}
lastTimeInstExecuted = thisTimeInstExecuted;
- // track the number of cycles we either issued one vector instruction
- // or issued no instructions at all
+ // track the number of cycles we either issued at least one
+ // instruction or issued no instructions at all
if (instrExecuted) {
numCyclesWithInstrIssued++;
} else {
numCyclesWithNoIssue++;
}
-
spc.sample(executionResourcesUsed);
}
}
thisTimeInstExecuted = false;
}
+std::string
+ExecStage::dispStatusToStr(int i)
+{
+ std::string s("INVALID");
+ switch (i) {
+ case EMPTY:
+ s = "EMPTY";
+ break;
+ case SKIP:
+ s = "SKIP";
+ break;
+ case EXREADY:
+ s = "EXREADY";
+ break;
+ }
+ return s;
+}
+
+void
+ExecStage::dumpDispList()
+{
+ std::stringstream ss;
+ bool empty = true;
+ for (int i = 0; i < computeUnit->numExeUnits(); i++) {
+ DISPATCH_STATUS s = dispatchList->at(i).second;
+ ss << i << ": " << dispStatusToStr(s);
+ if (s != EMPTY) {
+ empty = false;
+ Wavefront *w = dispatchList->at(i).first;
+ ss << " SIMD[" << w->simdId << "] WV[" << w->wfDynId << "]: ";
+ ss << (w->instructionBuffer.front())->seqNum() << ": ";
+ ss << (w->instructionBuffer.front())->disassemble();
+ }
+ ss << "\n";
+ }
+ if (!empty) {
+ DPRINTF(GPUSched, "Dispatch List:\n%s", ss.str());
+ }
+}
+
void
ExecStage::exec()
{
initStatistics();
-
- for (int unitId = 0; unitId < (numSIMDs + numMemUnits); ++unitId) {
- // if dispatch list for this execution resource is empty,
- // skip this execution resource this cycle
- if (dispatchList->at(unitId).second == EMPTY) {
- collectStatistics(IdleExec, unitId);
- continue;
- }
-
- collectStatistics(BusyExec, unitId);
- // execute an instruction for the WF
- dispatchList->at(unitId).first->exec();
- // clear the dispatch list entry
- dispatchList->at(unitId).second = EMPTY;
- dispatchList->at(unitId).first = (Wavefront*)nullptr;
+ if (Debug::GPUSched) {
+ dumpDispList();
+ }
+ for (int unitId = 0; unitId < computeUnit->numExeUnits(); ++unitId) {
+ DISPATCH_STATUS s = dispatchList->at(unitId).second;
+ switch (s) {
+ case EMPTY:
+ // Do not execute if empty, waiting for VRF reads,
+ // or LM tied to GM waiting for VRF reads
+ collectStatistics(IdleExec, unitId);
+ break;
+ case EXREADY:
+ {
+ collectStatistics(BusyExec, unitId);
+ Wavefront *w = dispatchList->at(unitId).first;
+ DPRINTF(GPUSched, "Exec[%d]: SIMD[%d] WV[%d]: %s\n",
+ unitId, w->simdId, w->wfDynId,
+ (w->instructionBuffer.front())->disassemble());
+ DPRINTF(GPUSched, "dispatchList[%d] EXREADY->EMPTY\n", unitId);
+ dispatchList->at(unitId).first->exec();
+ (computeUnit->scheduleStage).deleteFromSch(w);
+ dispatchList->at(unitId).second = EMPTY;
+ dispatchList->at(unitId).first->freeResources();
+ dispatchList->at(unitId).first = nullptr;
+ break;
+ }
+ case SKIP:
+ collectStatistics(BusyExec, unitId);
+ DPRINTF(GPUSched, "dispatchList[%d] SKIP->EMPTY\n", unitId);
+ dispatchList->at(unitId).second = EMPTY;
+ dispatchList->at(unitId).first->freeResources();
+ dispatchList->at(unitId).first = nullptr;
+ break;
+ default:
+ panic("Unknown dispatch status in exec()\n");
+ }
}
collectStatistics(PostExec, 0);
;
spc
- .init(0, numSIMDs + numMemUnits, 1)
+ .init(0, computeUnit->numExeUnits(), 1)
.name(name() + ".spc")
.desc("Execution units active per cycle (Exec unit=SIMD,MemPipe)")
;
;
numCyclesWithInstrTypeIssued
- .init(numSIMDs + numMemUnits)
- .name(name() + ".num_cycles_with_instrtype_issue")
- .desc("Number of cycles at least one instruction of specific type "
- "issued")
+ .init(computeUnit->numExeUnits())
+ .name(name() + ".num_cycles_issue_exec_rsrc")
+ .desc("Number of cycles at least one instruction issued to "
+ "execution resource type")
;
numCyclesWithNoInstrTypeIssued
- .init(numSIMDs + numMemUnits)
- .name(name() + ".num_cycles_with_instr_type_no_issue")
- .desc("Number of cycles no instruction of specific type issued")
+ .init(computeUnit->numExeUnits())
+ .name(name() + ".num_cycles_no_issue_exec_rsrc")
+ .desc("Number of clks no instructions issued to execution "
+ "resource type")
;
- for (int i = 0; i < numSIMDs; ++i) {
- numCyclesWithInstrTypeIssued.subname(i, csprintf("ALU%d",i));
- numCyclesWithNoInstrTypeIssued.subname(i, csprintf("ALU%d",i));
+ int c = 0;
+ for (int i = 0; i < computeUnit->numVectorALUs; i++,c++) {
+ std::string s = "VectorALU" + std::to_string(i);
+ numCyclesWithNoInstrTypeIssued.subname(c, s);
+ numCyclesWithInstrTypeIssued.subname(c, s);
+ }
+ for (int i = 0; i < computeUnit->numScalarALUs; i++,c++) {
+ std::string s = "ScalarALU" + std::to_string(i);
+ numCyclesWithNoInstrTypeIssued.subname(c, s);
+ numCyclesWithInstrTypeIssued.subname(c, s);
}
+ numCyclesWithNoInstrTypeIssued.subname(c, "VectorMemPipe");
+ numCyclesWithInstrTypeIssued.subname(c++, "VectorMemPipe");
+
+ numCyclesWithNoInstrTypeIssued.subname(c, "SharedMemPipe");
+ numCyclesWithInstrTypeIssued.subname(c++, "SharedMemPipe");
- numCyclesWithInstrTypeIssued.subname(numSIMDs, csprintf("GM"));
- numCyclesWithNoInstrTypeIssued.subname(numSIMDs, csprintf("GM"));
- numCyclesWithInstrTypeIssued.subname(numSIMDs + 1, csprintf("LM"));
- numCyclesWithNoInstrTypeIssued.subname(numSIMDs + 1, csprintf("LM"));
+ numCyclesWithNoInstrTypeIssued.subname(c, "ScalarMemPipe");
+ numCyclesWithInstrTypeIssued.subname(c++, "ScalarMemPipe");
}
#define __EXEC_STAGE_HH__
#include <string>
+#include <unordered_map>
#include <utility>
#include <vector>
enum DISPATCH_STATUS
{
- EMPTY = 0,
- FILLED
+ EMPTY = 0, // no wave present in dispatchList slot
+ EXREADY, // wave ready for execution
+ SKIP, // extra memory resource needed, Shared Mem. only
};
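A compact standalone sketch of how a dispatchList slot in one of these states is drained each cycle, mirroring the exec() switch earlier in this change: EXREADY slots execute and are cleared, SKIP slots are cleared without executing, EMPTY slots are left alone. The Wave type and all names are illustrative, not gem5 code.

#include <iostream>
#include <utility>
#include <vector>

// Minimal model of a dispatchList slot being drained by the exec stage.
enum DISPATCH_STATUS { EMPTY = 0, EXREADY, SKIP };

struct Wave { int id; void exec() { std::cout << "exec wave " << id << "\n"; } };

int main()
{
    Wave w0{0}, w1{1};
    std::vector<std::pair<Wave*, DISPATCH_STATUS>> dispatchList = {
        {&w0, EXREADY}, {nullptr, EMPTY}, {&w1, SKIP}
    };

    for (auto &slot : dispatchList) {
        switch (slot.second) {
          case EXREADY:
            slot.first->exec();        // issue the instruction
            [[fallthrough]];
          case SKIP:                   // resource held, nothing issued
            slot.second = EMPTY;       // slot is free for the scheduler
            slot.first = nullptr;
            break;
          case EMPTY:
            break;                     // idle this cycle
        }
    }
}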
// Execution stage.
void init(ComputeUnit *cu);
void exec();
+ std::string dispStatusToStr(int j);
+ void dumpDispList();
+
std::string name() { return _name; }
void regStats();
// number of idle cycles
Stats::Scalar numCyclesWithNoIssue;
// number of busy cycles
Stats::Scalar numCyclesWithInstrIssued;
- // number of cycles (per execution unit) during which at least one
- // instruction was issued to that unit
+ // number of cycles during which at least one
+ // instruction was issued to an execution resource type
Stats::Vector numCyclesWithInstrTypeIssued;
- // number of idle cycles (per execution unit) during which the unit issued
- // no instruction targeting that unit, even though there is at least one
- // Wavefront with such an instruction as the oldest
+ // number of idle cycles during which the scheduler
+ // issued no instructions targeting a specific
+ // execution resource type
Stats::Vector numCyclesWithNoInstrTypeIssued;
// SIMDs active per cycle
Stats::Distribution spc;
void collectStatistics(enum STAT_STATUS stage, int unitId);
void initStatistics();
ComputeUnit *computeUnit;
- uint32_t numSIMDs;
-
- // Number of memory execution resources;
- // both global and local memory execution resources in CU
- uint32_t numMemUnits;
// List of waves which will be dispatched to
// each execution resource. dispatchList is used to communicate
// between the schedule and exec stages
std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> *dispatchList;
- // flag per vector SIMD unit that is set when there is at least one
- // WV that has a vector ALU instruction as the oldest in its
- // Instruction Buffer
- std::vector<bool> *vectorAluInstAvail;
- int *glbMemInstAvail;
- int *shrMemInstAvail;
bool lastTimeInstExecuted;
bool thisTimeInstExecuted;
bool instrExecuted;
Stats::Scalar numTransActiveIdle;
Stats::Distribution idleDur;
- uint32_t executionResourcesUsed;
+ int executionResourcesUsed;
uint64_t idle_dur;
std::string _name;
};
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/wavefront.hh"
-FetchStage::FetchStage(const ComputeUnitParams* p) : numSIMDs(p->num_SIMDs),
- computeUnit(nullptr)
+FetchStage::FetchStage(const ComputeUnitParams* p) :
+ numVectorALUs(p->num_SIMDs), computeUnit(nullptr)
{
- for (int j = 0; j < numSIMDs; ++j) {
+ for (int j = 0; j < numVectorALUs; ++j) {
FetchUnit newFetchUnit(p);
- fetchUnit.push_back(newFetchUnit);
+ _fetchUnit.push_back(newFetchUnit);
}
}
FetchStage::~FetchStage()
{
- fetchUnit.clear();
+ _fetchUnit.clear();
}
void
computeUnit = cu;
_name = computeUnit->name() + ".FetchStage";
- for (int j = 0; j < numSIMDs; ++j) {
- fetchUnit[j].bindWaveList(&computeUnit->wfList[j]);
- fetchUnit[j].init(computeUnit);
+ for (int j = 0; j < numVectorALUs; ++j) {
+ _fetchUnit[j].bindWaveList(&computeUnit->wfList[j]);
+ _fetchUnit[j].init(computeUnit);
}
}
void
FetchStage::exec()
{
- for (int j = 0; j < numSIMDs; ++j) {
- fetchUnit[j].exec();
+ for (int j = 0; j < numVectorALUs; ++j) {
+ _fetchUnit[j].exec();
}
}
instFetchInstReturned.sample(num_instructions);
uint32_t simdId = wavefront->simdId;
- fetchUnit[simdId].processFetchReturn(pkt);
+ _fetchUnit[simdId].processFetchReturn(pkt);
}
void
FetchStage::fetch(PacketPtr pkt, Wavefront *wavefront)
{
- fetchUnit[wavefront->simdId].fetch(pkt, wavefront);
+ _fetchUnit[wavefront->simdId].fetch(pkt, wavefront);
}
void
std::string name() { return _name; }
void regStats();
Stats::Distribution instFetchInstReturned;
+ FetchUnit &fetchUnit(int simdId) { return _fetchUnit.at(simdId); }
private:
- uint32_t numSIMDs;
+ int numVectorALUs;
ComputeUnit *computeUnit;
// List of fetch units. A fetch unit is
- // instantiated per SIMD
- std::vector<FetchUnit> fetchUnit;
+ // instantiated per VALU/SIMD
+ std::vector<FetchUnit> _fetchUnit;
std::string _name;
};
uint32_t FetchUnit::globalFetchUnitID;
-FetchUnit::FetchUnit(const ComputeUnitParams* params) :
- timingSim(true),
- computeUnit(nullptr),
- fetchScheduler(params),
- waveList(nullptr)
+FetchUnit::FetchUnit(const ComputeUnitParams* params)
+ : timingSim(true), computeUnit(nullptr), fetchScheduler(params),
+ waveList(nullptr), fetchDepth(params->fetch_depth)
{
}
timingSim = computeUnit->shader->timingSim;
fetchQueue.clear();
fetchStatusQueue.resize(computeUnit->shader->n_wf);
-
- for (int j = 0; j < computeUnit->shader->n_wf; ++j) {
- fetchStatusQueue[j] = std::make_pair(waveList->at(j), false);
+ fetchBuf.resize(computeUnit->shader->n_wf, FetchBufDesc());
+
+ for (int i = 0; i < computeUnit->shader->n_wf; ++i) {
+ Wavefront *wf = waveList->at(i);
+ assert(wf->wfSlotId == i);
+ fetchStatusQueue[i] = std::make_pair(wf, false);
+ fetchBuf[i].allocateBuf(fetchDepth, computeUnit->cacheLineSize(), wf);
+ fetchBuf[i].decoder(&decoder);
}
fetchScheduler.bindList(&fetchQueue);
void
FetchUnit::exec()
{
+ /**
+ * now we check if any of the fetch buffers have
+ * buffered instruction data that can be decoded
+ * and sent to its wavefront's instruction buffer.
+ * then we check if any of the fetch buffer entries
+ * can be released. we only check for releasable
+ * entries when the buffer has no free space
+ */
+ for (auto &fetch_buf : fetchBuf) {
+ if (!fetch_buf.hasFreeSpace()) {
+ fetch_buf.checkWaveReleaseBuf();
+ }
+ if (fetch_buf.hasFetchDataToProcess()) {
+ fetch_buf.decodeInsts();
+ }
+ }
+
// re-evaluate waves which are marked as not ready for fetch
for (int j = 0; j < computeUnit->shader->n_wf; ++j) {
// Following code assumes 64-bit operation and all insts are
// 4 or less instructions and it can not have any branches to
// prevent speculative instruction fetches
if (!fetchStatusQueue[j].second) {
- if (curWave->status == Wavefront::S_RUNNING &&
- curWave->instructionBuffer.size() <= 4 &&
- !curWave->instructionBufferHasBranch() &&
+ if ((curWave->getStatus() == Wavefront::S_RUNNING ||
+ curWave->getStatus() == Wavefront::S_WAITCNT) &&
+ fetchBuf[j].hasFreeSpace() &&
+ !curWave->stopFetch() &&
!curWave->pendingFetch) {
fetchQueue.push_back(curWave);
fetchStatusQueue[j].second = true;
void
FetchUnit::initiateFetch(Wavefront *wavefront)
{
- // calculate the virtual address to fetch from the SQC
- Addr vaddr = wavefront->pc();
+ assert(fetchBuf.at(wavefront->wfSlotId).hasFreeSpace());
/**
- * the instruction buffer holds one instruction per entry, regardless
- * of the underlying instruction's size. the PC, however, addresses
- * instrutions on a 32b granularity so we must account for that here.
- */
- for (int i = 0; i < wavefront->instructionBuffer.size(); ++i) {
- vaddr +=
- wavefront->instructionBuffer.at(i)->staticInstruction()->instSize();
- }
- vaddr = wavefront->basePtr + vaddr;
+ * calculate the virtual address to fetch from the SQC. the fetch
+ * buffer holds a configurable number of cache lines. we start
+ * fetching at the address of the cache line immediately following
+ * the buffered line(s).
+ */
+ Addr vaddr = fetchBuf.at(wavefront->wfSlotId).nextFetchAddr();
- DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Initiating fetch translation: %#x\n",
- computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, vaddr);
+ // this should already be aligned to a cache line
+ assert(vaddr == makeLineAddress(vaddr,
+ computeUnit->getCacheLineBits()));
- // Since this is an instruction prefetch, if you're split then just finish
- // out the current line.
- int block_size = computeUnit->cacheLineSize();
- // check for split accesses
- Addr split_addr = roundDown(vaddr + block_size - 1, block_size);
- int size = block_size;
+ // shouldn't be fetching a line that is already buffered
+ assert(!fetchBuf.at(wavefront->wfSlotId).pcBuffered(vaddr));
- if (split_addr > vaddr) {
- // misaligned access, just grab the rest of the line
- size = split_addr - vaddr;
- }
+ fetchBuf.at(wavefront->wfSlotId).reserveBuf(vaddr);
+
+ DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Id%d: Initiate fetch "
+ "from pc: %d %#x\n", computeUnit->cu_id, wavefront->simdId,
+ wavefront->wfSlotId, wavefront->wfDynId, wavefront->pc(), vaddr);
+
+ DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Initiating fetch translation: %#x\n",
+ computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, vaddr);
// set up virtual request
RequestPtr req = std::make_shared<Request>(
- vaddr, size, Request::INST_FETCH,
+ vaddr, computeUnit->cacheLineSize(), Request::INST_FETCH,
computeUnit->masterId(), 0, 0, nullptr);
PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
- // This fetchBlock is kind of faux right now - because the translations so
- // far don't actually return Data
- uint64_t fetchBlock;
- pkt->dataStatic(&fetchBlock);
if (timingSim) {
// SenderState needed on Return
computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
pkt->req->getPaddr());
- // this is necessary because the GPU TLB receives packets instead of
- // requests. when the translation is complete, all relevent fields in the
- // request will be populated, but not in the packet. here we create the
- // new packet so we can set the size, addr, and proper flags.
+ /**
+ * this is necessary because the GPU TLB receives packets instead of
+ * requests. when the translation is complete, all relevant fields in
+ * the request will be populated, but not in the packet. here we create
+ * the new packet so we can set the size, addr, and proper flags.
+ */
PacketPtr oldPkt = pkt;
pkt = new Packet(oldPkt->req, oldPkt->cmd);
delete oldPkt;
- TheGpuISA::RawMachInst *data =
- new TheGpuISA::RawMachInst[pkt->req->getSize() /
- sizeof(TheGpuISA::RawMachInst)];
-
- pkt->dataDynamic<TheGpuISA::RawMachInst>(data);
+ /**
+ * we should have reserved an entry in the fetch buffer
+ * for this cache line. here we get the pointer to the
+ * entry used to buffer this request's line data.
+ */
+ pkt->dataStatic(fetchBuf.at(wavefront->wfSlotId)
+ .reservedBuf(pkt->req->getVaddr()));
// New SenderState for the memory access
pkt->senderState = new ComputeUnit::SQCPort::SenderState(wavefront);
Wavefront *wavefront = sender_state->wavefront;
DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch addr %#x returned "
- "%d bytes, %d instructions!\n", computeUnit->cu_id,
- wavefront->simdId, wavefront->wfSlotId, pkt->req->getPaddr(),
- pkt->req->getSize(), pkt->req->getSize() /
- sizeof(TheGpuISA::RawMachInst));
+ "%d bytes!\n", computeUnit->cu_id, wavefront->simdId,
+ wavefront->wfSlotId, pkt->req->getPaddr(), pkt->req->getSize());
if (wavefront->dropFetch) {
assert(wavefront->instructionBuffer.empty());
+ assert(!fetchBuf.at(wavefront->wfSlotId).hasFetchDataToProcess());
wavefront->dropFetch = false;
} else {
- TheGpuISA::RawMachInst *inst_index_ptr =
- (TheGpuISA::RawMachInst*)pkt->getPtr<uint8_t>();
-
- assert(wavefront->instructionBuffer.size() <= 4);
-
- for (int i = 0; i < pkt->req->getSize() /
- sizeof(TheGpuISA::RawMachInst); ++i) {
- GPUStaticInst *inst_ptr = decoder.decode(inst_index_ptr[i]);
-
- assert(inst_ptr);
-
- if (inst_ptr->instSize() == 8) {
- /**
- * this instruction occupies 2 consecutive
- * entries in the instruction array, the
- * second of which contains a nullptr. so if
- * this inst is 8 bytes we advance two entries
- * instead of 1
- */
- ++i;
- }
-
- DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: added %s\n",
- computeUnit->cu_id, wavefront->simdId,
- wavefront->wfSlotId, inst_ptr->disassemble());
-
- GPUDynInstPtr gpuDynInst =
- std::make_shared<GPUDynInst>(computeUnit, wavefront, inst_ptr,
- computeUnit->getAndIncSeqNum());
-
- wavefront->instructionBuffer.push_back(gpuDynInst);
- }
+ fetchBuf.at(wavefront->wfSlotId).fetchDone(pkt->req->getVaddr());
}
wavefront->pendingFetch = false;
delete pkt;
}
+void
+FetchUnit::flushBuf(int wfSlotId)
+{
+ fetchBuf.at(wfSlotId).flushBuf();
+}
+
void
FetchUnit::bindWaveList(std::vector<Wavefront*> *wave_list)
{
waveList = wave_list;
}
+
+/** FetchBufDesc */
+void
+FetchUnit::FetchBufDesc::allocateBuf(int fetch_depth, int cache_line_size,
+ Wavefront *wf)
+{
+ wavefront = wf;
+ fetchDepth = fetch_depth;
+ maxIbSize = wavefront->maxIbSize;
+ cacheLineSize = cache_line_size;
+ maxFbSize = cacheLineSize * fetchDepth;
+
+ // Calculate the number of bits to address a cache line
+ panic_if(!isPowerOf2(cacheLineSize),
+ "Cache line size should be a power of two.");
+ cacheLineBits = floorLog2(cacheLineSize);
+
+ bufStart = new uint8_t[maxFbSize];
+ readPtr = bufStart;
+ bufEnd = bufStart + maxFbSize;
+
+ for (int i = 0; i < fetchDepth; ++i) {
+ freeList.emplace_back(readPtr + i * cacheLineSize);
+ }
+}
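allocateBuf() above carves one contiguous allocation into cache-line-sized entries and seeds the free list with a pointer to each entry. A stripped-down standalone version of that layout (sizes chosen arbitrarily; not gem5 code):

#include <cassert>
#include <cstdint>
#include <deque>
#include <iostream>

// Standalone sketch of the fetch buffer layout built by allocateBuf():
// one backing allocation of fetchDepth * cacheLineSize bytes, with the
// free list holding a pointer to the start of each line-sized entry.
int main()
{
    const int fetchDepth = 2;        // lines the buffer may hold
    const int cacheLineSize = 64;    // must be a power of two
    assert((cacheLineSize & (cacheLineSize - 1)) == 0);

    const int maxFbSize = fetchDepth * cacheLineSize;
    uint8_t *bufStart = new uint8_t[maxFbSize];
    uint8_t *bufEnd = bufStart + maxFbSize;

    std::deque<uint8_t*> freeList;
    for (int i = 0; i < fetchDepth; ++i) {
        freeList.push_back(bufStart + i * cacheLineSize);
    }

    std::cout << "buffer spans " << (bufEnd - bufStart) << " bytes, "
              << freeList.size() << " free line entries\n";
    delete[] bufStart;
}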
+
+void
+FetchUnit::FetchBufDesc::flushBuf()
+{
+ restartFromBranch = true;
+ /**
+ * the free list may still hold entries, so we clear it
+ * here to avoid duplicates when it is repopulated below
+ */
+ freeList.clear();
+ bufferedPCs.clear();
+ reservedPCs.clear();
+ readPtr = bufStart;
+
+ for (int i = 0; i < fetchDepth; ++i) {
+ freeList.push_back(bufStart + i * cacheLineSize);
+ }
+
+ DPRINTF(GPUFetch, "WF[%d][%d]: Id%d Fetch dropped, flushing fetch "
+ "buffer\n", wavefront->simdId, wavefront->wfSlotId,
+ wavefront->wfDynId);
+}
+
+Addr
+FetchUnit::FetchBufDesc::nextFetchAddr()
+{
+ Addr next_line = 0;
+
+ if (bufferedAndReservedLines()) {
+ Addr last_line_fetched = 0;
+ if (!reservedLines()) {
+ /**
+ * get the PC of the most recently fetched cache line,
+ * then return the address of the next line.
+ */
+ last_line_fetched = bufferedPCs.rbegin()->first;
+ } else {
+ last_line_fetched = reservedPCs.rbegin()->first;
+ }
+
+ next_line = last_line_fetched + cacheLineSize;
+
+ /**
+ * should not be trying to fetch a line that has already
+ * been fetched.
+ */
+ assert(bufferedPCs.find(next_line) == bufferedPCs.end());
+ assert(reservedPCs.find(next_line) == reservedPCs.end());
+ } else {
+ /**
+ * we do not have any buffered cache lines yet, so we
+ * assume this is the initial fetch, or the first fetch
+ * after a branch, and get the PC directly from the WF.
+ * in the case of a branch, we may not start at the
+ * beginning of a cache line, so we adjust the readPtr by
+ * the current PC's offset from the start of the line.
+ */
+ next_line = makeLineAddress(wavefront->pc(), cacheLineBits);
+ readPtr = bufStart;
+
+ /**
+ * if we are here we have no buffered lines. in the case we flushed
+ * the buffer due to a branch, we may need to start fetching from
+ * some offset from the start of the fetch buffer, so we adjust for
+ * that here.
+ */
+ if (restartFromBranch) {
+ restartFromBranch = false;
+ int byte_offset
+ = wavefront->pc() - makeLineAddress(wavefront->pc(),
+ cacheLineBits);
+ readPtr += byte_offset;
+ }
+ }
+
+ return next_line;
+}
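nextFetchAddr() either continues one line past the newest buffered/reserved line or, when nothing is buffered (initial fetch or a post-branch flush), restarts at the line containing the wavefront's PC. A small standalone model of that decision, simplified to a single map of buffered lines; lineAddr() plays the role of makeLineAddress() and all names are illustrative:

#include <cstdint>
#include <iostream>
#include <map>

// Standalone model of the next-fetch-address choice: continue one line past
// the newest buffered line, or restart at the line holding the wave's PC
// when nothing is buffered.
using Addr = std::uint64_t;
constexpr Addr lineSize = 64;
Addr lineAddr(Addr a) { return a & ~(lineSize - 1); }

Addr nextFetchAddr(const std::map<Addr, int> &bufferedLines, Addr wavePc)
{
    if (!bufferedLines.empty()) {
        // newest (highest) buffered line address, then the line after it
        return bufferedLines.rbegin()->first + lineSize;
    }
    // initial fetch or restart after a branch: align the PC down to its line
    return lineAddr(wavePc);
}

int main()
{
    std::map<Addr, int> buffered;   // line address -> buffer entry index
    std::cout << std::hex
              << nextFetchAddr(buffered, 0x1234) << "\n";   // 1200
    buffered[0x1200] = 0;
    std::cout << nextFetchAddr(buffered, 0x1234) << "\n";   // 1240
}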
+
+void
+FetchUnit::FetchBufDesc::reserveBuf(Addr vaddr)
+{
+ // we should have free buffer space, and the line
+ // at vaddr should not already be cached.
+ assert(hasFreeSpace());
+ assert(bufferedPCs.find(vaddr) == bufferedPCs.end());
+ assert(reservedPCs.find(vaddr) == reservedPCs.end());
+ assert(bufferedAndReservedLines() < fetchDepth);
+
+ DPRINTF(GPUFetch, "WF[%d][%d]: Id%d reserved fetch buffer entry "
+ "for PC = %#x\n", wavefront->simdId, wavefront->wfSlotId,
+ wavefront->wfDynId, vaddr);
+
+ /**
+ * we reserve buffer space by moving it out of the
+ * free list; however, we do not mark the buffered
+ * line as valid until the fetch unit for this buffer
+ * has received the response from the memory system.
+ */
+ uint8_t *inst_buf = freeList.front();
+ reservedPCs.emplace(vaddr, inst_buf);
+ freeList.pop_front();
+}
+
+void
+FetchUnit::FetchBufDesc::fetchDone(Addr vaddr)
+{
+ assert(bufferedPCs.find(vaddr) == bufferedPCs.end());
+ DPRINTF(GPUFetch, "WF[%d][%d]: Id%d done fetching for addr %#x\n",
+ wavefront->simdId, wavefront->wfSlotId,
+ wavefront->wfDynId, vaddr);
+
+ /**
+ * this address should have an entry reserved in the
+ * fetch buffer already; however, it remains invalid
+ * until the fetch completes.
+ */
+ auto reserved_pc = reservedPCs.find(vaddr);
+ assert(reserved_pc != reservedPCs.end());
+ bufferedPCs.emplace(vaddr, reserved_pc->second);
+
+ if (readPtr == bufEnd) {
+ readPtr = bufStart;
+ }
+
+ reserved_pc->second = nullptr;
+ reservedPCs.erase(reserved_pc);
+}
+
+bool
+FetchUnit::FetchBufDesc::hasFetchDataToProcess() const
+{
+ return fetchBytesRemaining() >= sizeof(TheGpuISA::RawMachInst);
+}
+
+void
+FetchUnit::FetchBufDesc::checkWaveReleaseBuf()
+{
+ Addr cur_wave_pc = roundDown(wavefront->pc(),
+ wavefront->computeUnit->cacheLineSize());
+ if (reservedPCs.find(cur_wave_pc) != reservedPCs.end()) {
+ DPRINTF(GPUFetch, "WF[%d][%d]: Id%d current wave PC(%#x) still "
+ "being fetched.\n", wavefront->simdId, wavefront->wfSlotId,
+ wavefront->wfDynId, cur_wave_pc);
+
+ // should be reserved, but not buffered yet
+ assert(bufferedPCs.find(cur_wave_pc) == bufferedPCs.end());
+
+ return;
+ }
+
+ auto current_buffered_pc = bufferedPCs.find(cur_wave_pc);
+ auto oldest_buffered_pc = bufferedPCs.begin();
+
+ DPRINTF(GPUFetch, "WF[%d][%d]: Id%d checking if PC block addr = %#x"
+ "(PC = %#x) can be released.\n", wavefront->simdId,
+ wavefront->wfSlotId, wavefront->wfDynId, cur_wave_pc,
+ wavefront->pc());
+
+#ifdef DEBUG
+ int idx = 0;
+ for (const auto &buf_pc : bufferedPCs) {
+ DPRINTF(GPUFetch, "PC[%d] = %#x\n", idx, buf_pc.first);
+ ++idx;
+ }
+#endif
+
+ // if we haven't buffered data for this PC, we shouldn't
+ // be fetching from it.
+ assert(current_buffered_pc != bufferedPCs.end());
+
+ /**
+ * we're using a std::map so the addresses are sorted. if this
+ * PC is not the oldest one in the map, we must be fetching from
+ * a newer block, and we can release the oldest PC's fetch buffer
+ * entry back to the free list.
+ */
+ if (current_buffered_pc != oldest_buffered_pc) {
+ DPRINTF(GPUFetch, "WF[%d][%d]: Id%d done fetching for PC = %#x, "
+ "removing it from the fetch buffer.\n", wavefront->simdId,
+ wavefront->wfSlotId, wavefront->wfDynId,
+ oldest_buffered_pc->first);
+
+ freeList.emplace_back(oldest_buffered_pc->second);
+ oldest_buffered_pc->second = nullptr;
+ bufferedPCs.erase(oldest_buffered_pc);
+ DPRINTF(GPUFetch, "WF[%d][%d]: Id%d has %d lines buffered.\n",
+ wavefront->simdId, wavefront->wfSlotId, wavefront->wfDynId,
+ bufferedLines());
+ }
+}
+
+void
+FetchUnit::FetchBufDesc::decodeInsts()
+{
+ assert(readPtr);
+
+ if (splitDecode()) {
+ decodeSplitInst();
+ }
+
+ while (wavefront->instructionBuffer.size() < maxIbSize
+ && hasFetchDataToProcess()) {
+ if (splitDecode()) {
+ decodeSplitInst();
+ } else {
+ TheGpuISA::MachInst mach_inst
+ = reinterpret_cast<TheGpuISA::MachInst>(readPtr);
+ GPUStaticInst *gpu_static_inst = _decoder->decode(mach_inst);
+ readPtr += gpu_static_inst->instSize();
+
+ assert(readPtr <= bufEnd);
+
+ GPUDynInstPtr gpu_dyn_inst
+ = std::make_shared<GPUDynInst>(wavefront->computeUnit,
+ wavefront, gpu_static_inst,
+ wavefront->computeUnit->
+ getAndIncSeqNum());
+ wavefront->instructionBuffer.push_back(gpu_dyn_inst);
+
+ DPRINTF(GPUFetch, "WF[%d][%d]: Id%ld decoded %s (%d bytes). "
+ "%d bytes remain.\n", wavefront->simdId,
+ wavefront->wfSlotId, wavefront->wfDynId,
+ gpu_static_inst->disassemble(),
+ gpu_static_inst->instSize(),
+ fetchBytesRemaining());
+ }
+ }
+}
+
+void
+FetchUnit::FetchBufDesc::decodeSplitInst()
+{
+ TheGpuISA::RawMachInst split_inst = 0;
+ int dword_size = sizeof(uint32_t);
+ int num_dwords = sizeof(TheGpuISA::RawMachInst) / dword_size;
+
+ for (int i = 0; i < num_dwords; ++i) {
+ ((uint32_t*)(&split_inst))[i] = *reinterpret_cast<uint32_t*>(readPtr);
+ if (readPtr + dword_size >= bufEnd) {
+ readPtr = bufStart;
+ }
+ }
+
+ assert(readPtr == bufStart);
+
+ TheGpuISA::MachInst mach_inst
+ = reinterpret_cast<TheGpuISA::MachInst>(&split_inst);
+ GPUStaticInst *gpu_static_inst = _decoder->decode(mach_inst);
+ readPtr += (gpu_static_inst->instSize() - dword_size);
+ assert(readPtr < bufEnd);
+
+ GPUDynInstPtr gpu_dyn_inst
+ = std::make_shared<GPUDynInst>(wavefront->computeUnit,
+ wavefront, gpu_static_inst,
+ wavefront->computeUnit->
+ getAndIncSeqNum());
+ wavefront->instructionBuffer.push_back(gpu_dyn_inst);
+
+ DPRINTF(GPUFetch, "WF[%d][%d]: Id%d decoded split inst %s (%#x) "
+ "(%d bytes). %d bytes remain in %d buffered lines.\n",
+ wavefront->simdId, wavefront->wfSlotId, wavefront->wfDynId,
+ gpu_static_inst->disassemble(), split_inst,
+ gpu_static_inst->instSize(), fetchBytesRemaining(),
+ bufferedLines());
+}
+
+bool
+FetchUnit::FetchBufDesc::splitDecode() const
+{
+ /**
+ * if a read of a raw instruction would go beyond the end
+ * of the fetch buffer, then we must perform a split decode.
+ */
+ bool is_split = (readPtr + sizeof(TheGpuISA::RawMachInst)) > bufEnd;
+
+ return is_split;
+}
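When the next raw instruction would run past bufEnd, decodeSplitInst() assembles it dword by dword and wraps the read pointer back to bufStart. A self-contained sketch of that wrap-around assembly for an 8-byte instruction built from two 4-byte dwords (assumes a little-endian host; not the gem5 decoder):

#include <cstdint>
#include <cstring>
#include <iostream>

// Standalone sketch of a split decode: an 8-byte raw instruction whose two
// 4-byte dwords straddle the end of a circular fetch buffer. The first dword
// is read at the end of the buffer, the second after wrapping to the start.
int main()
{
    uint8_t buf[16] = {};
    // pretend instruction 0x1122334455667788 straddles the wrap point:
    // low dword in the last 4 bytes, high dword in the first 4 bytes.
    uint32_t lo = 0x55667788, hi = 0x11223344;
    std::memcpy(buf + 12, &lo, sizeof(lo));
    std::memcpy(buf + 0, &hi, sizeof(hi));

    uint8_t *bufStart = buf, *bufEnd = buf + sizeof(buf), *readPtr = buf + 12;

    uint64_t inst = 0;
    for (int i = 0; i < 2; ++i) {                       // two dwords
        uint32_t dword;
        std::memcpy(&dword, readPtr, sizeof(dword));
        std::memcpy(reinterpret_cast<uint8_t*>(&inst) + i * sizeof(dword),
                    &dword, sizeof(dword));
        readPtr += sizeof(dword);
        if (readPtr >= bufEnd) readPtr = bufStart;      // wrap to the start
    }

    std::cout << std::hex << "assembled raw inst: 0x" << inst << "\n";
}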
+
+int
+FetchUnit::FetchBufDesc::fetchBytesRemaining() const
+{
+ int bytes_remaining = 0;
+
+ if (bufferedLines() && readPtr != bufEnd) {
+ auto last_buf_pc = bufferedPCs.rbegin();
+ uint8_t *end_ptr = last_buf_pc->second + cacheLineSize;
+ int byte_diff = end_ptr - readPtr;
+
+ if (end_ptr > readPtr) {
+ bytes_remaining = byte_diff;
+ } else if (end_ptr < readPtr) {
+ bytes_remaining = bufferedBytes() + byte_diff;
+ }
+ }
+
+ assert(bytes_remaining <= bufferedBytes());
+ return bytes_remaining;
+}
#include <string>
#include <utility>
-#include <vector>
#include "arch/gpu_decoder.hh"
#include "base/statistics.hh"
void initiateFetch(Wavefront *wavefront);
void fetch(PacketPtr pkt, Wavefront *wavefront);
void processFetchReturn(PacketPtr pkt);
+ void flushBuf(int wfSlotId);
static uint32_t globalFetchUnitID;
private:
+ /**
+ * fetch buffer descriptor. holds buffered
+ * instruction data in the fetch unit.
+ */
+ class FetchBufDesc
+ {
+ public:
+ FetchBufDesc() : bufStart(nullptr), bufEnd(nullptr),
+ readPtr(nullptr), fetchDepth(0), maxIbSize(0), maxFbSize(0),
+ cacheLineSize(0), restartFromBranch(false), wavefront(nullptr),
+ _decoder(nullptr)
+ {
+ }
+
+ ~FetchBufDesc()
+ {
+ delete[] bufStart;
+ }
+
+ /**
+ * allocate the fetch buffer space, and set the fetch depth
+ * (number of lines that may be buffered), fetch size
+ * (cache line size), and parent WF for this fetch buffer.
+ */
+ void allocateBuf(int fetch_depth, int cache_line_size, Wavefront *wf);
+
+ int
+ bufferedAndReservedLines() const
+ {
+ return bufferedLines() + reservedLines();
+ }
+
+ int bufferedLines() const { return bufferedPCs.size(); }
+ int bufferedBytes() const { return bufferedLines() * cacheLineSize; }
+ int reservedLines() const { return reservedPCs.size(); }
+ bool hasFreeSpace() const { return !freeList.empty(); }
+ void flushBuf();
+ Addr nextFetchAddr();
+
+ /**
+ * reserve an entry in the fetch buffer for PC = vaddr.
+ */
+ void reserveBuf(Addr vaddr);
+
+ /**
+ * return a pointer to the raw fetch buffer data.
+ * this allows the fetch pkt to use this data directly
+ * to avoid unnecessary memcpy and malloc/new.
+ */
+ uint8_t*
+ reservedBuf(Addr vaddr) const
+ {
+ auto reserved_pc = reservedPCs.find(vaddr);
+ assert(reserved_pc != reservedPCs.end());
+ assert(reserved_pc == reservedPCs.begin());
+
+ return reserved_pc->second;
+ }
+
+ void fetchDone(Addr vaddr);
+
+ /**
+ * checks if the buffer contains valid data. this essentially
+ * tells fetch when there is data remaining that needs to be
+ * decoded into the WF's IB.
+ */
+ bool hasFetchDataToProcess() const;
+
+ /**
+ * each time the fetch stage is ticked, we check if there
+ * are any data in the fetch buffer that may be decoded and
+ * sent to the IB. because we are modeling the fetch buffer
+ * as a circular buffer, it is possible that an instruction
+ * can straddle the end/beginning of the fetch buffer, so
+ * decodeSplitInst() handles that case.
+ */
+ void decodeInsts();
+
+ /**
+ * checks if the wavefront can release any of its fetch
+ * buffer entries. this will occur when the WF's PC goes
+ * beyond any of the currently buffered cache lines.
+ */
+ void checkWaveReleaseBuf();
+
+ void
+ decoder(TheGpuISA::Decoder *dec)
+ {
+ _decoder = dec;
+ }
+
+ bool
+ pcBuffered(Addr pc) const
+ {
+ bool buffered = bufferedPCs.find(pc) != bufferedPCs.end()
+ && reservedPCs.find(pc) != reservedPCs.end();
+
+ return buffered;
+ }
+
+ /**
+ * calculates the number of fetched bytes that have yet
+ * to be decoded.
+ */
+ int fetchBytesRemaining() const;
+
+ private:
+ void decodeSplitInst();
+
+ /**
+ * check if the next instruction to be processed out of
+ * the fetch buffer is split across the end/beginning of
+ * the fetch buffer.
+ */
+ bool splitDecode() const;
+
+ /**
+ * the set of PCs (fetch addresses) that are currently
+ * buffered. bufferedPCs are valid, reservedPCs are
+ * waiting for their buffers to be filled with valid
+ * fetch data.
+ */
+ std::map<Addr, uint8_t*> bufferedPCs;
+ std::map<Addr, uint8_t*> reservedPCs;
+
+ /**
+ * represents the fetch buffer free list. holds buffer space
+ * that is currently free. each pointer in this array must
+ * have enough space to hold a cache line. in reality we
+         * have one actual fetch buffer: 'bufStart'; these pointers
+ * point to addresses within bufStart that are aligned to the
+ * cache line size.
+ */
+ std::deque<uint8_t*> freeList;
+
+ /**
+ * raw instruction buffer. holds cache line data associated with
+ * the set of PCs (fetch addresses) that are buffered here.
+ */
+ uint8_t *bufStart;
+ uint8_t *bufEnd;
+ /**
+ * pointer that points to the next chunk of inst data to be
+ * decoded.
+ */
+ uint8_t *readPtr;
+ // how many lines the fetch unit may buffer
+ int fetchDepth;
+ // maximum size (in number of insts) of the WF's IB
+ int maxIbSize;
+ // maximum size (in bytes) of this fetch buffer
+ int maxFbSize;
+ int cacheLineSize;
+ int cacheLineBits;
+ bool restartFromBranch;
+ // wavefront whose IB is serviced by this fetch buffer
+ Wavefront *wavefront;
+ TheGpuISA::Decoder *_decoder;
+ };
+
bool timingSim;
ComputeUnit *computeUnit;
TheGpuISA::Decoder decoder;
// Pointer to list of waves dispatched on to this SIMD unit
std::vector<Wavefront*> *waveList;
+ // holds the fetch buffers. each wave has 1 entry.
+ std::vector<FetchBufDesc> fetchBuf;
+ /**
+ * number of cache lines we can fetch and buffer.
+ * this includes the currently fetched line (i.e., the
+ * line that corresponds to the WF's current PC), as
+ * well as any lines that may be prefetched.
+ */
+ int fetchDepth;
};
#endif // __FETCH_UNIT_HH__
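To make the FetchBufDesc life cycle easier to follow, here is a stripped-down sketch (not part of this patch, all names hypothetical) of the reserve -> fill -> consume -> release flow that reserveBuf(), fetchDone(), and checkWaveReleaseBuf() implement over the free list and the two PC maps.

#include <cstdint>
#include <deque>
#include <map>

struct MiniFetchBuf
{
    std::deque<uint8_t*> freeList;         // cache-line-sized slots
    std::map<uint64_t, uint8_t*> reserved; // awaiting fetch data
    std::map<uint64_t, uint8_t*> buffered; // valid, decodable data

    void reserve(uint64_t vaddr)
    {
        // take a free slot and park it until the fetch response arrives;
        // assumes the caller already checked that a free slot exists
        reserved.emplace(vaddr, freeList.front());
        freeList.pop_front();
    }

    void fetchDone(uint64_t vaddr)
    {
        // the response filled the reserved slot; it is now decodable
        auto it = reserved.find(vaddr);
        buffered.emplace(vaddr, it->second);
        reserved.erase(it);
    }

    void release(uint64_t vaddr)
    {
        // the wavefront's PC moved past this line; recycle the slot
        auto it = buffered.find(vaddr);
        freeList.push_back(it->second);
        buffered.erase(it);
    }
};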
* POSSIBILITY OF SUCH DAMAGE.
*/
-#include "gpu-compute/global_memory_pipeline.hh"
-
+#define __STDC_FORMAT_MACROS
+#include <cinttypes>
#include "debug/GPUCoalescer.hh"
#include "debug/GPUMem.hh"
#include "debug/GPUReg.hh"
#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/global_memory_pipeline.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/vector_register_file.hh"
GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams* p) :
computeUnit(nullptr), gmQueueSize(p->global_mem_queue_size),
- outOfOrderDataDelivery(p->out_of_order_data_delivery), inflightStores(0),
+ maxWaveRequests(p->max_wave_requests), inflightStores(0),
inflightLoads(0)
{
}
return true;
}
+void
+GlobalMemPipeline::acqCoalescerToken(GPUDynInstPtr mp)
+{
+ // We require one token from the coalescer's uncoalesced table to
+ // proceed
+ int token_count = 1;
+
+ DPRINTF(GPUCoalescer, "Acquiring %d token(s)\n", token_count);
+ assert(mp->computeUnit()->getTokenManager()->haveTokens(token_count));
+ mp->computeUnit()->getTokenManager()->acquireTokens(token_count);
+}
+
+bool
+GlobalMemPipeline::outstandingReqsCheck(GPUDynInstPtr mp) const
+{
+ // Ensure we haven't exceeded the maximum number of vmem requests
+ // for this wavefront
+ if ((mp->wavefront()->outstandingReqsRdGm
+ + mp->wavefront()->outstandingReqsWrGm) >= maxWaveRequests) {
+ return false;
+ }
+
+ return true;
+}
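A sketch of how a caller might combine the two gating checks with the token acquisition added here; the free function is hypothetical, while the member functions are the ones declared in this patch.

#include "gpu-compute/global_memory_pipeline.hh"
#include "gpu-compute/gpu_dyn_inst.hh"

// Gate issue of a vector memory instruction on the per-wavefront request
// cap and on coalescer space, acquiring the coalescer token only when
// both checks pass.
bool
tryIssueVmem(GlobalMemPipeline &gm_pipe, GPUDynInstPtr mp)
{
    if (!gm_pipe.outstandingReqsCheck(mp) || !gm_pipe.coalescerReady(mp)) {
        return false;
    }

    gm_pipe.acqCoalescerToken(mp);
    return true;
}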
+
void
GlobalMemPipeline::exec()
{
// check the VRF to see if the operands of a load (or load component
// of an atomic) are accessible
- if ((m) && (m->isLoad() || m->isAtomicRet())) {
+ if (m && (m->isLoad() || m->isAtomicRet())) {
w = m->wavefront();
- accessVrf =
- w->computeUnit->vrf[w->simdId]->
- vrfOperandAccessReady(m->seqNum(), w, m, VrfAccessType::WRITE);
+ accessVrf = w->computeUnit->vrf[w->simdId]->
+ canScheduleWriteOperandsFromLoad(w, m);
+
}
if (m && m->latency.rdy() && computeUnit->glbMemToVrfBus.rdy() &&
- accessVrf && m->statusBitVector == VectorMask(0) &&
- (computeUnit->shader->coissue_return ||
- computeUnit->wfWait.at(m->pipeId).rdy())) {
+ accessVrf && (computeUnit->shader->coissue_return ||
+ computeUnit->vectorGlobalMemUnit.rdy())) {
w = m->wavefront();
+ DPRINTF(GPUMem, "CU%d: WF[%d][%d]: Completing global mem instr %s\n",
+ m->cu_id, m->simdId, m->wfSlotId, m->disassemble());
m->completeAcc(m);
+ if (m->isLoad() || m->isAtomicRet()) {
+ w->computeUnit->vrf[w->simdId]->
+ scheduleWriteOperandsFromLoad(w, m);
+ }
+
completeRequest(m);
- // Decrement outstanding register count
- computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
+ Tick accessTime = curTick() - m->getAccessTime();
- if (m->isStore() || m->isAtomic()) {
+ // Decrement outstanding requests count
+ computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
+ if (m->isStore() || m->isAtomic() || m->isMemSync()) {
+ computeUnit->shader->sampleStore(accessTime);
computeUnit->shader->ScheduleAdd(&w->outstandingReqsWrGm,
m->time, -1);
}
- if (m->isLoad() || m->isAtomic()) {
+ if (m->isLoad() || m->isAtomic() || m->isMemSync()) {
+ computeUnit->shader->sampleLoad(accessTime);
computeUnit->shader->ScheduleAdd(&w->outstandingReqsRdGm,
m->time, -1);
}
+ w->validateRequestCounters();
+
+            // Generate stats for round-trip time for vector memory insts
+ // going all the way to memory and stats for individual cache
+ // blocks generated by the instruction.
+ m->profileRoundTripTime(curTick(), InstMemoryHop::Complete);
+ computeUnit->shader->sampleInstRoundTrip(m->getRoundTripTime());
+ computeUnit->shader->sampleLineRoundTrip(m->getLineAddressTime());
+
// Mark write bus busy for appropriate amount of time
computeUnit->glbMemToVrfBus.set(m->time);
if (!computeUnit->shader->coissue_return)
- w->computeUnit->wfWait.at(m->pipeId).set(m->time);
+ w->computeUnit->vectorGlobalMemUnit.set(m->time);
}
// If pipeline has executed a global memory instruction
mp->disassemble(), mp->seqNum());
// Memfences will not return tokens and must be issued so we should
// not request one as this will deplete the token count until deadlock
- if (!mp->isMemFence()) {
+ if (!mp->isMemSync()) {
assert(mp->computeUnit()->getTokenManager()->haveTokens(1));
mp->computeUnit()->getTokenManager()->acquireTokens(1);
}
mp->initiateAcc(mp);
- if (!outOfOrderDataDelivery && !mp->isMemFence()) {
+ if (((mp->isMemSync() && !mp->isEndOfKernel()) || !mp->isMemSync())) {
/**
         * data delivery is always in order, so we keep the
         * responses sorted in program order.
GPUDynInstPtr
GlobalMemPipeline::getNextReadyResp()
{
- if (outOfOrderDataDelivery) {
- if (!gmReturnedLoads.empty()) {
- return gmReturnedLoads.front();
- } else if (!gmReturnedStores.empty()) {
- return gmReturnedStores.front();
- }
- } else {
- if (!gmOrderedRespBuffer.empty()) {
- auto mem_req = gmOrderedRespBuffer.begin();
+ if (!gmOrderedRespBuffer.empty()) {
+ auto mem_req = gmOrderedRespBuffer.begin();
- if (mem_req->second.second) {
- return mem_req->second.first;
- }
+ if (mem_req->second.second) {
+ return mem_req->second.first;
}
}
--inflightStores;
}
- if (outOfOrderDataDelivery) {
- if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
- assert(!gmReturnedLoads.empty());
- gmReturnedLoads.pop();
- } else if (gpuDynInst->isStore()) {
- assert(!gmReturnedStores.empty());
- gmReturnedStores.pop();
- }
- } else {
- // we should only pop the oldest requst, and it
- // should be marked as done if we are here
- assert(gmOrderedRespBuffer.begin()->first == gpuDynInst->seqNum());
- assert(gmOrderedRespBuffer.begin()->second.first == gpuDynInst);
- assert(gmOrderedRespBuffer.begin()->second.second);
- // remove this instruction from the buffer by its
- // unique seq ID
- gmOrderedRespBuffer.erase(gpuDynInst->seqNum());
- }
+    // we should only pop the oldest request, and it
+ // should be marked as done if we are here
+ assert(gmOrderedRespBuffer.begin()->first == gpuDynInst->seqNum());
+ assert(gmOrderedRespBuffer.begin()->second.first == gpuDynInst);
+ assert(gmOrderedRespBuffer.begin()->second.second);
+ // remove this instruction from the buffer by its
+ // unique seq ID
+ gmOrderedRespBuffer.erase(gpuDynInst->seqNum());
}
void
GlobalMemPipeline::issueRequest(GPUDynInstPtr gpuDynInst)
{
+ gpuDynInst->setAccessTime(curTick());
+ gpuDynInst->profileRoundTripTime(curTick(), InstMemoryHop::Initiate);
gmIssuedRequests.push(gpuDynInst);
}
void
GlobalMemPipeline::handleResponse(GPUDynInstPtr gpuDynInst)
{
- if (outOfOrderDataDelivery) {
- if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
- assert(isGMLdRespFIFOWrRdy());
- gmReturnedLoads.push(gpuDynInst);
- } else {
- assert(isGMStRespFIFOWrRdy());
- gmReturnedStores.push(gpuDynInst);
- }
- } else {
- auto mem_req = gmOrderedRespBuffer.find(gpuDynInst->seqNum());
- // if we are getting a response for this mem request,
- // then it ought to already be in the ordered response
- // buffer
- assert(mem_req != gmOrderedRespBuffer.end());
- mem_req->second.second = true;
- }
+ auto mem_req = gmOrderedRespBuffer.find(gpuDynInst->seqNum());
+ // if we are getting a response for this mem request,
+ // then it ought to already be in the ordered response
+ // buffer
+ assert(mem_req != gmOrderedRespBuffer.end());
+ mem_req->second.second = true;
}
void
void init(ComputeUnit *cu);
void exec();
- std::queue<GPUDynInstPtr> &getGMStRespFIFO() { return gmReturnedStores; }
- std::queue<GPUDynInstPtr> &getGMLdRespFIFO() { return gmReturnedLoads; }
-
/**
- * find the next ready response to service. for OoO mode we
- * simply pop the oldest (based on when the response was
- * received) response in the response FIFOs. for in-order mode
- * we pop the oldest (in program order) response, and only if
- * it is marked as done.
+ * Find the next ready response to service. In order to ensure
+ * that no waitcnts are violated, we pop the oldest (in program order)
+ * response, and only if it is marked as done. This is because waitcnt
+ * values expect memory operations to complete and decrement their
+ * counter values in program order.
*/
GPUDynInstPtr getNextReadyResp();
/**
* once a memory request is finished we remove it from the
- * buffer. this method determines which response buffer
- * we're using based on the mode (in-order vs. OoO).
+ * buffer.
*/
void completeRequest(GPUDynInstPtr gpuDynInst);
/**
- * issues a request to the pipeline - i.e., enqueue it
- * in the request buffer.
+ * Issues a request to the pipeline (i.e., enqueue it
+ * in the request buffer).
*/
void issueRequest(GPUDynInstPtr gpuDynInst);
/**
- * this method handles responses sent to this GM pipeline by the
- * CU. in the case of in-order delivery it simply marks the reqeust
- * as done in the ordered buffer to indicate that the requst is
- * finished. for out-of-order data delivery, the requests are enqueued
- * (in the order in which they are received) in the response FIFOs.
+ * This method handles responses sent to this GM pipeline by the
+     * CU. It simply marks the request as done in the ordered buffer to
+     * indicate that the request is finished.
*/
void handleResponse(GPUDynInstPtr gpuDynInst);
- bool
- isGMLdRespFIFOWrRdy() const
- {
- return gmReturnedLoads.size() < gmQueueSize;
- }
-
- bool
- isGMStRespFIFOWrRdy() const
- {
- return gmReturnedStores.size() < gmQueueSize;
- }
-
bool
isGMReqFIFOWrRdy(uint32_t pendReqs=0) const
{
const std::string &name() const { return _name; }
void regStats();
-
void
incLoadVRFBankConflictCycles(int num_cycles)
{
}
bool coalescerReady(GPUDynInstPtr mp) const;
+ bool outstandingReqsCheck(GPUDynInstPtr mp) const;
+
+ void acqCoalescerToken(GPUDynInstPtr mp);
private:
ComputeUnit *computeUnit;
std::string _name;
int gmQueueSize;
- bool outOfOrderDataDelivery;
+ int maxWaveRequests;
// number of cycles of delaying the update of a VGPR that is the
// target of a load instruction (or the load component of an atomic)
int globalMemSize;
/*
- * this buffer holds the memory responses when in-order data
- * deilvery is used - the responses are ordered by their unique
- * sequence number, which is monotonically increasing. when a
- * memory request returns its "done" flag is set to true. during
- * each tick the the GM pipeline will check if the oldest request
- * is finished, and if so it will be removed from the queue.
+     * This buffer holds the memory responses in program order - the
+     * responses are ordered by their unique sequence number, which is
+     * monotonically increasing. When a memory request returns, its "done"
+     * flag is set to true. During each tick the GM pipeline will check if
+     * the oldest request is finished, and if so it will be removed from
+     * the queue.
*
* key: memory instruction's sequence ID
*
// Global Memory Request FIFO: all global memory requests
// are issued to this FIFO from the memory pipelines
std::queue<GPUDynInstPtr> gmIssuedRequests;
-
- // Globa Store Response FIFO: all responses of global memory
- // stores are sent to this FIFO from TCP
- std::queue<GPUDynInstPtr> gmReturnedStores;
-
- // Global Load Response FIFO: all responses of global memory
- // loads are sent to this FIFO from TCP
- std::queue<GPUDynInstPtr> gmReturnedLoads;
};
#endif // __GLOBAL_MEMORY_PIPELINE_HH__
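The ordered response buffer boils down to a small pattern; a self-contained sketch (not part of this patch, generic types) of the same insert / mark-done / retire-oldest discipline:

#include <cassert>
#include <cstdint>
#include <map>
#include <utility>

// Key: monotonically increasing sequence number.
// Value: (payload, done flag).
template <typename T>
struct OrderedRespBuffer
{
    std::map<uint64_t, std::pair<T, bool>> buf;

    void issue(uint64_t seq, const T &payload)
    {
        buf.emplace(seq, std::make_pair(payload, false));
    }

    void markDone(uint64_t seq)
    {
        auto it = buf.find(seq);
        assert(it != buf.end());
        it->second.second = true;
    }

    // Only the oldest entry may retire, and only once it is done; this
    // preserves program order for waitcnt-style bookkeeping.
    bool retireOldest(T &out)
    {
        if (buf.empty() || !buf.begin()->second.second)
            return false;
        out = buf.begin()->second.first;
        buf.erase(buf.begin());
        return true;
    }
};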
--- /dev/null
+/*
+ * Copyright (c) 2018 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Anthony Gutierrez
+ */
+
+#include "gpu-compute/gpu_command_processor.hh"
+
+#include "debug/GPUCommandProc.hh"
+#include "debug/GPUKernelInfo.hh"
+#include "gpu-compute/dispatcher.hh"
+#include "params/GPUCommandProcessor.hh"
+
+GPUCommandProcessor::GPUCommandProcessor(const Params *p)
+ : HSADevice(p), dispatcher(*p->dispatcher)
+{
+ dispatcher.setCommandProcessor(this);
+}
+
+/**
+ * submitDispatchPkt() is the entry point into the CP from the HSAPP
+ * and is only meant to be used with AQL kernel dispatch packets.
+ * After the HSAPP receives and extracts an AQL packet, it sends
+ * it to the CP, which is responsible for gathering all relevant
+ * information about a task, initializing CU state, and sending
+ * it to the dispatcher for WG creation and dispatch.
+ *
+ * First we need to capture all information from the AQL pkt and
+ * the code object, then store it in an HSAQueueEntry. Once the
+ * packet and code are extracted, we extract information from the
+ * queue descriptor that the CP needs to perform state initialization
+ * on the CU. Finally we call dispatch() to send the task to the
+ * dispatcher. When the task completely finishes, we call finishPkt()
+ * on the HSA packet processor in order to remove the packet from the
+ * queue, and notify the runtime that the task has completed.
+ */
+void
+GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
+ Addr host_pkt_addr)
+{
+ static int dynamic_task_id = 0;
+ _hsa_dispatch_packet_t *disp_pkt = (_hsa_dispatch_packet_t*)raw_pkt;
+
+ /**
+ * we need to read a pointer in the application's address
+ * space to pull out the kernel code descriptor.
+ */
+ auto *tc = sys->threads[0];
+ auto &virt_proxy = tc->getVirtProxy();
+
+ /**
+ * The kernel_object is a pointer to the machine code, whose entry
+ * point is an 'amd_kernel_code_t' type, which is included in the
+ * kernel binary, and describes various aspects of the kernel. The
+ * desired entry is the 'kernel_code_entry_byte_offset' field,
+ * which provides the byte offset (positive or negative) from the
+ * address of the amd_kernel_code_t to the start of the machine
+ * instructions.
+ */
+ AMDKernelCode akc;
+ virt_proxy.readBlob(disp_pkt->kernel_object, (uint8_t*)&akc,
+ sizeof(AMDKernelCode));
+
+ DPRINTF(GPUCommandProc, "GPU machine code is %lli bytes from start of the "
+ "kernel object\n", akc.kernel_code_entry_byte_offset);
+
+ Addr machine_code_addr = (Addr)disp_pkt->kernel_object
+ + akc.kernel_code_entry_byte_offset;
+
+ DPRINTF(GPUCommandProc, "Machine code starts at addr: %#x\n",
+ machine_code_addr);
+
+ Addr kern_name_addr(0);
+ virt_proxy.readBlob(akc.runtime_loader_kernel_symbol + 0x10,
+ (uint8_t*)&kern_name_addr, 0x8);
+
+ std::string kernel_name;
+ virt_proxy.readString(kernel_name, kern_name_addr);
+
+ DPRINTF(GPUKernelInfo, "Kernel name: %s\n", kernel_name.c_str());
+
+ HSAQueueEntry *task = new HSAQueueEntry(kernel_name, queue_id,
+ dynamic_task_id, raw_pkt, &akc, host_pkt_addr, machine_code_addr);
+
+ DPRINTF(GPUCommandProc, "Task ID: %i Got AQL: wg size (%dx%dx%d), "
+ "grid size (%dx%dx%d) kernarg addr: %#x, completion "
+ "signal addr:%#x\n", dynamic_task_id, disp_pkt->workgroup_size_x,
+ disp_pkt->workgroup_size_y, disp_pkt->workgroup_size_z,
+ disp_pkt->grid_size_x, disp_pkt->grid_size_y,
+ disp_pkt->grid_size_z, disp_pkt->kernarg_address,
+ disp_pkt->completion_signal);
+
+ DPRINTF(GPUCommandProc, "Extracted code object: %s (num vector regs: %d, "
+ "num scalar regs: %d, code addr: %#x, kernarg size: %d, "
+ "LDS size: %d)\n", kernel_name, task->numVectorRegs(),
+ task->numScalarRegs(), task->codeAddr(), 0, 0);
+
+ initABI(task);
+ ++dynamic_task_id;
+}
+
+/**
+ * submitVendorPkt() is for accepting vendor-specific packets from
+ * the HSAPP. Vendor-specific packets may be used by the runtime to
+ * send commands to the HSA device that are specific to a particular
+ * vendor. The vendor-specific packets should be defined by the vendor
+ * in the runtime.
+ */
+
+/**
+ * TODO: For now we simply tell the HSAPP to finish the packet,
+ * however a future patch will update this method to provide
+ * the proper handling of any required vendor-specific packets.
+ * In the version of ROCm that is currently supported (1.6)
+ * the runtime will send packets that direct the CP to
+ * invalidate the GPUs caches. We do this automatically on
+ * each kernel launch in the CU, so this is safe for now.
+ */
+void
+GPUCommandProcessor::submitVendorPkt(void *raw_pkt, uint32_t queue_id,
+ Addr host_pkt_addr)
+{
+ hsaPP->finishPkt(raw_pkt, queue_id);
+}
+
+/**
+ * Once the CP has finished extracting all relevant information about
+ * a task and has initialized the ABI state, we send a description of
+ * the task to the dispatcher. The dispatcher will create and dispatch
+ * WGs to the CUs.
+ */
+void
+GPUCommandProcessor::dispatchPkt(HSAQueueEntry *task)
+{
+ dispatcher.dispatch(task);
+}
+
+/**
+ * The CP is responsible for traversing all HSA-ABI-related data
+ * structures from memory and initializing the ABI state.
+ * Information provided by the MQD, AQL packet, and code object
+ * metadata will be used to initialize register file state.
+ */
+void
+GPUCommandProcessor::initABI(HSAQueueEntry *task)
+{
+ auto *readDispIdOffEvent = new ReadDispIdOffsetDmaEvent(*this, task);
+
+ Addr hostReadIdxPtr
+ = hsaPP->getQueueDesc(task->queueId())->hostReadIndexPtr;
+
+ dmaReadVirt(hostReadIdxPtr + sizeof(hostReadIdxPtr),
+ sizeof(readDispIdOffEvent->readDispIdOffset), readDispIdOffEvent,
+ &readDispIdOffEvent->readDispIdOffset);
+}
+
+System*
+GPUCommandProcessor::system()
+{
+ return sys;
+}
+
+AddrRangeList
+GPUCommandProcessor::getAddrRanges() const
+{
+ AddrRangeList ranges;
+ return ranges;
+}
+
+void
+GPUCommandProcessor::setShader(Shader *shader)
+{
+ _shader = shader;
+}
+
+Shader*
+GPUCommandProcessor::shader()
+{
+ return _shader;
+}
+
+GPUCommandProcessor*
+GPUCommandProcessorParams::create()
+{
+ return new GPUCommandProcessor(this);
+}
--- /dev/null
+/*
+ * Copyright (c) 2018 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Anthony Gutierrez
+ */
+
+/**
+ * @file
+ * The GPUCommandProcessor (CP) is responsible for accepting commands, in
+ * the form of HSA AQL packets, from the HSA packet processor (HSAPP). The CP
+ * works with several components, including the HSAPP and the dispatcher.
+ * When the HSAPP sends a ready task to the CP, it will perform the necessary
+ * operations to extract relevant data structures from memory, such as the
+ * AQL queue descriptor and AQL packet, and initializes register state for the
+ * task's wavefronts.
+ */
+
+#ifndef __DEV_HSA_GPU_COMMAND_PROCESSOR_HH__
+#define __DEV_HSA_GPU_COMMAND_PROCESSOR_HH__
+
+#include "dev/hsa/hsa_device.hh"
+#include "gpu-compute/hsa_queue_entry.hh"
+
+struct GPUCommandProcessorParams;
+class GPUDispatcher;
+class Shader;
+
+class GPUCommandProcessor : public HSADevice
+{
+ public:
+ typedef GPUCommandProcessorParams Params;
+
+ GPUCommandProcessor() = delete;
+ GPUCommandProcessor(const Params *p);
+
+ void setShader(Shader *shader);
+ Shader* shader();
+
+ void submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
+ Addr host_pkt_addr) override;
+ void submitVendorPkt(void *raw_pkt, uint32_t queue_id,
+ Addr host_pkt_addr) override;
+ void dispatchPkt(HSAQueueEntry *task);
+
+ Tick write(PacketPtr pkt) override { return 0; }
+ Tick read(PacketPtr pkt) override { return 0; }
+ AddrRangeList getAddrRanges() const override;
+ System *system();
+
+ private:
+ Shader *_shader;
+ GPUDispatcher &dispatcher;
+
+ void initABI(HSAQueueEntry *task);
+
+ /**
+ * Perform a DMA read of the read_dispatch_id_field_base_byte_offset
+ * field, which follows directly after the read_dispatch_id (the read
+ * pointer) in the amd_hsa_queue_t struct (aka memory queue descriptor
+ * (MQD)), to find the base address of the MQD. The MQD is the runtime's
+ * soft representation of a HW queue descriptor (HQD).
+ *
+ * Any fields below the read dispatch ID in the amd_hsa_queue_t should
+ * not change according to the HSA standard, therefore we should be able
+ * to get them based on their known relative position to the read dispatch
+ * ID.
+ */
+ class ReadDispIdOffsetDmaEvent : public DmaCallback
+ {
+ public:
+ ReadDispIdOffsetDmaEvent(GPUCommandProcessor &gpu_cmd_proc,
+ HSAQueueEntry *task)
+ : DmaCallback(), readDispIdOffset(0), gpuCmdProc(gpu_cmd_proc),
+ _task(task)
+ {
+ }
+
+ void
+ process() override
+ {
+ /**
+ * Now that the read pointer's offset from the base of
+             * the MQD is known, we can use that to calculate the
+             * address of the MQD itself. The dispatcher will
+             * DMA that into the HSAQueueEntry when a kernel is
+ * launched.
+ */
+ _task->hostAMDQueueAddr
+ = gpuCmdProc.hsaPP->getQueueDesc(_task->queueId())
+ ->hostReadIndexPtr - readDispIdOffset;
+
+ /**
+ * DMA a copy of the MQD into the task. Some fields of
+ * the MQD will be used to initialize register state.
+ */
+ auto *mqdDmaEvent = new MQDDmaEvent(gpuCmdProc, _task);
+ gpuCmdProc.dmaReadVirt(_task->hostAMDQueueAddr,
+ sizeof(_amd_queue_t), mqdDmaEvent,
+ &_task->amdQueue);
+ }
+
+ uint32_t readDispIdOffset;
+
+ private:
+ GPUCommandProcessor &gpuCmdProc;
+ HSAQueueEntry *_task;
+ };
+
+ /**
+ * Perform a DMA read of the MQD that corresponds to a hardware
+ * queue descriptor (HQD). We store a copy of the MQD in the
+ * HSAQueueEntry object so we can send a copy of it along with
+ * a dispatch packet, which is needed to initialize register
+ * state.
+ */
+ class MQDDmaEvent : public DmaCallback
+ {
+ public:
+ MQDDmaEvent(GPUCommandProcessor &gpu_cmd_proc, HSAQueueEntry *task)
+ : DmaCallback(), gpuCmdProc(gpu_cmd_proc), _task(task)
+ {
+ }
+
+ void
+ process() override
+ {
+ gpuCmdProc.dispatchPkt(_task);
+ }
+
+ private:
+ GPUCommandProcessor &gpuCmdProc;
+ HSAQueueEntry *_task;
+ };
+};
+
+#endif // __DEV_HSA_GPU_COMMAND_PROCESSOR_HH__
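The two DmaCallback classes above form a chain: the first read learns the MQD base, the second copies the MQD, and only then is the task dispatched. A stand-alone sketch of that chaining, with the DMA engine stubbed out (all names hypothetical, not part of this patch):

#include <cstdint>
#include <functional>

using DmaDone = std::function<void()>;

// Stand-in for dmaReadVirt(): in the real model the callback fires when
// the DMA completes; here it fires immediately.
void
fakeDmaRead(uint64_t /*vaddr*/, void * /*dst*/, DmaDone done)
{
    done();
}

void
launchTask(uint64_t host_read_index_ptr)
{
    static uint32_t read_disp_id_offset = 0;
    static uint8_t mqd[64];

    // step 1: read the byte offset of the read dispatch id field from the
    // MQD base; it is stored directly after the read index pointer field
    fakeDmaRead(host_read_index_ptr + sizeof(host_read_index_ptr),
                &read_disp_id_offset, [host_read_index_ptr]() {
        uint64_t mqd_base = host_read_index_ptr - read_disp_id_offset;

        // step 2: copy the MQD itself, then hand the task to the
        // dispatcher once the copy is resident
        fakeDmaRead(mqd_base, mqd, []() {
            // dispatcher.dispatch(task) would run here
        });
    });
}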
--- /dev/null
+/*
+ * Copyright (c) 2015-2018 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Sooraj Puthoor
+ * Anthony Gutierrez
+ */
+
+#include "gpu-compute/gpu_compute_driver.hh"
+
+#include "cpu/thread_context.hh"
+#include "debug/GPUDriver.hh"
+#include "dev/hsa/hsa_device.hh"
+#include "dev/hsa/hsa_packet_processor.hh"
+#include "dev/hsa/kfd_ioctl.h"
+#include "params/GPUComputeDriver.hh"
+#include "sim/syscall_emul_buf.hh"
+
+GPUComputeDriver::GPUComputeDriver(Params *p)
+ : HSADriver(p)
+{
+ DPRINTF(GPUDriver, "Constructing KFD: device\n");
+}
+
+int
+GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf)
+{
+ auto &virt_proxy = tc->getVirtProxy();
+
+ switch (req) {
+ case AMDKFD_IOC_GET_VERSION:
+ {
+ DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_GET_VERSION\n");
+
+ TypedBufferArg<kfd_ioctl_get_version_args> args(ioc_buf);
+ args->major_version = 1;
+ args->minor_version = 0;
+
+ args.copyOut(virt_proxy);
+ }
+ break;
+ case AMDKFD_IOC_CREATE_QUEUE:
+ {
+ DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_CREATE_QUEUE\n");
+
+ allocateQueue(virt_proxy, ioc_buf);
+
+ DPRINTF(GPUDriver, "Creating queue %d\n", queueId);
+ }
+ break;
+ case AMDKFD_IOC_DESTROY_QUEUE:
+ {
+ TypedBufferArg<kfd_ioctl_destroy_queue_args> args(ioc_buf);
+ args.copyIn(virt_proxy);
+ DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_DESTROY_QUEUE;" \
+ "queue offset %d\n", args->queue_id);
+ device->hsaPacketProc().unsetDeviceQueueDesc(args->queue_id);
+ }
+ break;
+ case AMDKFD_IOC_SET_MEMORY_POLICY:
+ {
+ warn("unimplemented ioctl: AMDKFD_IOC_SET_MEMORY_POLICY\n");
+ }
+ break;
+ case AMDKFD_IOC_GET_CLOCK_COUNTERS:
+ {
+ DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_GET_CLOCK_COUNTERS\n");
+
+ TypedBufferArg<kfd_ioctl_get_clock_counters_args> args(ioc_buf);
+ args.copyIn(virt_proxy);
+
+ // Set nanosecond resolution
+ args->system_clock_freq = 1000000000;
+
+ /**
+ * Derive all clock counters based on the tick. All
+ * device clocks are identical and perfectly in sync.
+ */
+ uint64_t elapsed_nsec = curTick() / SimClock::Int::ns;
+ args->gpu_clock_counter = elapsed_nsec;
+ args->cpu_clock_counter = elapsed_nsec;
+ args->system_clock_counter = elapsed_nsec;
+
+ args.copyOut(virt_proxy);
+ }
+ break;
+ case AMDKFD_IOC_GET_PROCESS_APERTURES:
+ {
+ DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_GET_PROCESS_APERTURES\n");
+
+ TypedBufferArg<kfd_ioctl_get_process_apertures_args> args(ioc_buf);
+ args->num_of_nodes = 1;
+
+ /**
+ * Set the GPUVM/LDS/Scratch APEs exactly as they
+ * are in the real driver, see the KFD driver
+ * in the ROCm Linux kernel source:
+ * drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
+ */
+ for (int i = 0; i < args->num_of_nodes; ++i) {
+ /**
+ * While the GPU node numbers start at 0, we add 1
+ * to force the count to start at 1. This is to
+ * ensure that the base/limit addresses are
+ * calculated correctly.
+ */
+ args->process_apertures[i].scratch_base
+ = scratchApeBase(i + 1);
+ args->process_apertures[i].scratch_limit =
+ scratchApeLimit(args->process_apertures[i].scratch_base);
+
+ args->process_apertures[i].lds_base = ldsApeBase(i + 1);
+ args->process_apertures[i].lds_limit =
+ ldsApeLimit(args->process_apertures[i].lds_base);
+
+ args->process_apertures[i].gpuvm_base = gpuVmApeBase(i + 1);
+ args->process_apertures[i].gpuvm_limit =
+ gpuVmApeLimit(args->process_apertures[i].gpuvm_base);
+
+ // NOTE: Must match ID populated by hsaTopology.py
+ args->process_apertures[i].gpu_id = 2765;
+
+ DPRINTF(GPUDriver, "GPUVM base for node[%i] = %#x\n", i,
+ args->process_apertures[i].gpuvm_base);
+ DPRINTF(GPUDriver, "GPUVM limit for node[%i] = %#x\n", i,
+ args->process_apertures[i].gpuvm_limit);
+
+ DPRINTF(GPUDriver, "LDS base for node[%i] = %#x\n", i,
+ args->process_apertures[i].lds_base);
+ DPRINTF(GPUDriver, "LDS limit for node[%i] = %#x\n", i,
+ args->process_apertures[i].lds_limit);
+
+ DPRINTF(GPUDriver, "Scratch base for node[%i] = %#x\n", i,
+ args->process_apertures[i].scratch_base);
+ DPRINTF(GPUDriver, "Scratch limit for node[%i] = %#x\n", i,
+ args->process_apertures[i].scratch_limit);
+
+ /**
+ * The CPU's 64b address space can only use the
+ * areas with VA[63:47] == 0x1ffff or VA[63:47] == 0,
+ * therefore we must ensure that the apertures do not
+ * fall in the CPU's address space.
+ */
+ assert(bits<Addr>(args->process_apertures[i].scratch_base, 63,
+ 47) != 0x1ffff);
+ assert(bits<Addr>(args->process_apertures[i].scratch_base, 63,
+ 47) != 0);
+ assert(bits<Addr>(args->process_apertures[i].scratch_limit, 63,
+ 47) != 0x1ffff);
+ assert(bits<Addr>(args->process_apertures[i].scratch_limit, 63,
+ 47) != 0);
+ assert(bits<Addr>(args->process_apertures[i].lds_base, 63,
+ 47) != 0x1ffff);
+ assert(bits<Addr>(args->process_apertures[i].lds_base, 63,
+ 47) != 0);
+ assert(bits<Addr>(args->process_apertures[i].lds_limit, 63,
+ 47) != 0x1ffff);
+ assert(bits<Addr>(args->process_apertures[i].lds_limit, 63,
+ 47) != 0);
+ assert(bits<Addr>(args->process_apertures[i].gpuvm_base, 63,
+ 47) != 0x1ffff);
+ assert(bits<Addr>(args->process_apertures[i].gpuvm_base, 63,
+ 47) != 0);
+ assert(bits<Addr>(args->process_apertures[i].gpuvm_limit, 63,
+ 47) != 0x1ffff);
+ assert(bits<Addr>(args->process_apertures[i].gpuvm_limit, 63,
+ 47) != 0);
+ }
+
+ args.copyOut(virt_proxy);
+ }
+ break;
+ case AMDKFD_IOC_UPDATE_QUEUE:
+ {
+ warn("unimplemented ioctl: AMDKFD_IOC_UPDATE_QUEUE\n");
+ }
+ break;
+ case AMDKFD_IOC_CREATE_EVENT:
+ {
+ warn("unimplemented ioctl: AMDKFD_IOC_CREATE_EVENT\n");
+ }
+ break;
+ case AMDKFD_IOC_DESTROY_EVENT:
+ {
+ warn("unimplemented ioctl: AMDKFD_IOC_DESTROY_EVENT\n");
+ }
+ break;
+ case AMDKFD_IOC_SET_EVENT:
+ {
+ warn("unimplemented ioctl: AMDKFD_IOC_SET_EVENT\n");
+ }
+ break;
+ case AMDKFD_IOC_RESET_EVENT:
+ {
+ warn("unimplemented ioctl: AMDKFD_IOC_RESET_EVENT\n");
+ }
+ break;
+ case AMDKFD_IOC_WAIT_EVENTS:
+ {
+ warn("unimplemented ioctl: AMDKFD_IOC_WAIT_EVENTS\n");
+ }
+ break;
+ case AMDKFD_IOC_DBG_REGISTER:
+ {
+ warn("unimplemented ioctl: AMDKFD_IOC_DBG_REGISTER\n");
+ }
+ break;
+ case AMDKFD_IOC_DBG_UNREGISTER:
+ {
+ warn("unimplemented ioctl: AMDKFD_IOC_DBG_UNREGISTER\n");
+ }
+ break;
+ case AMDKFD_IOC_DBG_ADDRESS_WATCH:
+ {
+ warn("unimplemented ioctl: AMDKFD_IOC_DBG_ADDRESS_WATCH\n");
+ }
+ break;
+ case AMDKFD_IOC_DBG_WAVE_CONTROL:
+ {
+ warn("unimplemented ioctl: AMDKFD_IOC_DBG_WAVE_CONTROL\n");
+ }
+ break;
+ case AMDKFD_IOC_ALLOC_MEMORY_OF_GPU:
+ {
+ warn("unimplemented ioctl: AMDKFD_IOC_ALLOC_MEMORY_OF_GPU\n");
+ }
+ break;
+ case AMDKFD_IOC_FREE_MEMORY_OF_GPU:
+ {
+ warn("unimplemented ioctl: AMDKFD_IOC_FREE_MEMORY_OF_GPU\n");
+ }
+ break;
+ case AMDKFD_IOC_MAP_MEMORY_TO_GPU:
+ {
+ warn("unimplemented ioctl: AMDKFD_IOC_MAP_MEMORY_TO_GPU\n");
+ }
+ break;
+ case AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU:
+ {
+ warn("unimplemented ioctl: AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU\n");
+        }
+        break;
+ case AMDKFD_IOC_ALLOC_MEMORY_OF_SCRATCH:
+ {
+ warn("unimplemented ioctl: AMDKFD_IOC_ALLOC_MEMORY_OF_SCRATCH\n");
+ }
+ break;
+ case AMDKFD_IOC_SET_CU_MASK:
+ {
+ warn("unimplemented ioctl: AMDKFD_IOC_SET_CU_MASK\n");
+ }
+ break;
+ case AMDKFD_IOC_SET_PROCESS_DGPU_APERTURE:
+ {
+ warn("unimplemented ioctl: AMDKFD_IOC_SET_PROCESS_DGPU_APERTURE"
+ "\n");
+ }
+ break;
+ case AMDKFD_IOC_SET_TRAP_HANDLER:
+ {
+ warn("unimplemented ioctl: AMDKFD_IOC_SET_TRAP_HANDLER\n");
+ }
+ break;
+ case AMDKFD_IOC_GET_PROCESS_APERTURES_NEW:
+ {
+ DPRINTF(GPUDriver,
+ "ioctl: AMDKFD_IOC_GET_PROCESS_APERTURES_NEW\n");
+
+ TypedBufferArg<kfd_ioctl_get_process_apertures_new_args>
+ ioc_args(ioc_buf);
+
+ ioc_args.copyIn(virt_proxy);
+ ioc_args->num_of_nodes = 1;
+
+ for (int i = 0; i < ioc_args->num_of_nodes; ++i) {
+ TypedBufferArg<kfd_process_device_apertures> ape_args
+ (ioc_args->kfd_process_device_apertures_ptr);
+
+ ape_args->scratch_base = scratchApeBase(i + 1);
+ ape_args->scratch_limit =
+ scratchApeLimit(ape_args->scratch_base);
+ ape_args->lds_base = ldsApeBase(i + 1);
+ ape_args->lds_limit = ldsApeLimit(ape_args->lds_base);
+ ape_args->gpuvm_base = gpuVmApeBase(i + 1);
+ ape_args->gpuvm_limit = gpuVmApeLimit(ape_args->gpuvm_base);
+
+ ape_args->gpu_id = 2765;
+
+ assert(bits<Addr>(ape_args->scratch_base, 63, 47) != 0x1ffff);
+ assert(bits<Addr>(ape_args->scratch_base, 63, 47) != 0);
+ assert(bits<Addr>(ape_args->scratch_limit, 63, 47) != 0x1ffff);
+ assert(bits<Addr>(ape_args->scratch_limit, 63, 47) != 0);
+ assert(bits<Addr>(ape_args->lds_base, 63, 47) != 0x1ffff);
+ assert(bits<Addr>(ape_args->lds_base, 63, 47) != 0);
+ assert(bits<Addr>(ape_args->lds_limit, 63, 47) != 0x1ffff);
+ assert(bits<Addr>(ape_args->lds_limit, 63, 47) != 0);
+ assert(bits<Addr>(ape_args->gpuvm_base, 63, 47) != 0x1ffff);
+ assert(bits<Addr>(ape_args->gpuvm_base, 63, 47) != 0);
+ assert(bits<Addr>(ape_args->gpuvm_limit, 63, 47) != 0x1ffff);
+ assert(bits<Addr>(ape_args->gpuvm_limit, 63, 47) != 0);
+
+ ape_args.copyOut(virt_proxy);
+ }
+
+ ioc_args.copyOut(virt_proxy);
+ }
+ break;
+ case AMDKFD_IOC_GET_DMABUF_INFO:
+ {
+ warn("unimplemented ioctl: AMDKFD_IOC_GET_DMABUF_INFO\n");
+ }
+ break;
+ case AMDKFD_IOC_IMPORT_DMABUF:
+ {
+ warn("unimplemented ioctl: AMDKFD_IOC_IMPORT_DMABUF\n");
+ }
+ break;
+ case AMDKFD_IOC_GET_TILE_CONFIG:
+ {
+ warn("unimplemented ioctl: AMDKFD_IOC_GET_TILE_CONFIG\n");
+ }
+ break;
+ case AMDKFD_IOC_IPC_IMPORT_HANDLE:
+ {
+ warn("unimplemented ioctl: AMDKFD_IOC_IPC_IMPORT_HANDLE\n");
+ }
+ break;
+ case AMDKFD_IOC_IPC_EXPORT_HANDLE:
+ {
+ warn("unimplemented ioctl: AMDKFD_IOC_IPC_EXPORT_HANDLE\n");
+ }
+ break;
+ case AMDKFD_IOC_CROSS_MEMORY_COPY:
+ {
+ warn("unimplemented ioctl: AMDKFD_IOC_CROSS_MEMORY_COPY\n");
+ }
+ break;
+ case AMDKFD_IOC_OPEN_GRAPHIC_HANDLE:
+ {
+ warn("unimplemented ioctl: AMDKFD_IOC_OPEN_GRAPHIC_HANDLE\n");
+ }
+ break;
+ default:
+        fatal("GPUComputeDriver: bad ioctl %d\n", req);
+ break;
+ }
+ return 0;
+}
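For context, a minimal user-space sketch (not part of this patch) of how a runtime might exercise the AMDKFD_IOC_GET_CLOCK_COUNTERS path emulated above. It assumes the usual /dev/kfd device node and the kfd_ioctl.h definitions this driver model already includes, and is a shape sketch rather than a verified call sequence.

#include <fcntl.h>
#include <sys/ioctl.h>
#include <unistd.h>

#include "kfd_ioctl.h"   // same header the emulated driver includes

int main()
{
    int kfd = open("/dev/kfd", O_RDWR);
    if (kfd < 0)
        return 1;

    struct kfd_ioctl_get_clock_counters_args args = {};

    // the emulated driver derives every counter from the current tick
    // and reports a 1 GHz (nanosecond-resolution) system clock
    int ret = ioctl(kfd, AMDKFD_IOC_GET_CLOCK_COUNTERS, &args);
    if (ret == 0) {
        // args.gpu_clock_counter / args.system_clock_freq are now valid
    }

    close(kfd);
    return ret;
}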
+
+Addr
+GPUComputeDriver::gpuVmApeBase(int gpuNum) const
+{
+ return ((Addr)gpuNum << 61) + 0x1000000000000L;
+}
+
+Addr
+GPUComputeDriver::gpuVmApeLimit(Addr apeBase) const
+{
+ return (apeBase & 0xFFFFFF0000000000UL) | 0xFFFFFFFFFFL;
+}
+
+Addr
+GPUComputeDriver::scratchApeBase(int gpuNum) const
+{
+ return ((Addr)gpuNum << 61) + 0x100000000L;
+}
+
+Addr
+GPUComputeDriver::scratchApeLimit(Addr apeBase) const
+{
+ return (apeBase & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;
+}
+
+Addr
+GPUComputeDriver::ldsApeBase(int gpuNum) const
+{
+ return ((Addr)gpuNum << 61) + 0x0;
+}
+
+Addr
+GPUComputeDriver::ldsApeLimit(Addr apeBase) const
+{
+ return (apeBase & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;
+}
+
+GPUComputeDriver*
+GPUComputeDriverParams::create()
+{
+ return new GPUComputeDriver(this);
+}
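As a quick sanity check of the aperture math (illustrative values only, computed from the functions above for gpuNum = 1):

// For gpuNum = 1:
//   gpuVmApeBase(1)                    = 0x2001000000000000
//   gpuVmApeLimit(gpuVmApeBase(1))     = 0x200100FFFFFFFFFF
//   ldsApeBase(1)                      = 0x2000000000000000
//   ldsApeLimit(ldsApeBase(1))         = 0x20000000FFFFFFFF
//   scratchApeBase(1)                  = 0x2000000100000000
//   scratchApeLimit(scratchApeBase(1)) = 0x20000001FFFFFFFF
// In each case VA[63:47] is neither 0 nor 0x1ffff, so the apertures stay
// clear of the canonical address ranges an x86-64 CPU can generate.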
--- /dev/null
+/*
+ * Copyright (c) 2015-2018 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Sooraj Puthoor
+ * Anthony Gutierrez
+ */
+
+/**
+ * @file
+ * The GPUComputeDriver implements an HSADriver for an HSA AMD GPU
+ * agent. Other GPU devices, or other HSA agents, should not derive
+ * from this class. Instead device-specific implementations of an
+ * HSADriver should be provided for each unique device.
+ */
+
+#ifndef __GPU_COMPUTE_GPU_COMPUTE_DRIVER_HH__
+#define __GPU_COMPUTE_GPU_COMPUTE_DRIVER_HH__
+
+#include "dev/hsa/hsa_driver.hh"
+
+struct GPUComputeDriverParams;
+
+class GPUComputeDriver final : public HSADriver
+{
+ public:
+ typedef GPUComputeDriverParams Params;
+ GPUComputeDriver(Params *p);
+ int ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf) override;
+
+ private:
+ /**
+ * The aperture (APE) base/limit pairs are set
+ * statically at startup by the real KFD. AMD
+ * x86_64 CPUs only use the areas in the 64b
+ * address space where VA[63:47] == 0x1ffff or
+ * VA[63:47] = 0. These methods generate the APE
+ * base/limit pairs in exactly the same way as
+ * the real KFD does, which ensures these APEs do
+     * not fall into the CPU's address space.
+ *
+ * see the macros in the KFD driver in the ROCm
+ * Linux kernel source:
+ *
+ * drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
+ */
+ Addr gpuVmApeBase(int gpuNum) const;
+ Addr gpuVmApeLimit(Addr apeBase) const;
+ Addr scratchApeBase(int gpuNum) const;
+ Addr scratchApeLimit(Addr apeBase) const;
+ Addr ldsApeBase(int gpuNum) const;
+ Addr ldsApeLimit(Addr apeBase) const;
+};
+
+#endif // __GPU_COMPUTE_GPU_COMPUTE_DRIVER_HH__
#include "debug/GPUMem.hh"
#include "gpu-compute/gpu_static_inst.hh"
+#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/wavefront.hh"
GPUDynInst::GPUDynInst(ComputeUnit *_cu, Wavefront *_wf,
- GPUStaticInst *static_inst, uint64_t instSeqNum)
- : GPUExecContext(_cu, _wf), addr(computeUnit()->wfSize(), (Addr)0),
- n_reg(0), useContinuation(false),
- statusBitVector(0), _staticInst(static_inst), _seqNum(instSeqNum)
+ GPUStaticInst *static_inst, InstSeqNum instSeqNum)
+ : GPUExecContext(_cu, _wf), scalarAddr(0), addr(computeUnit()->wfSize(),
+ (Addr)0), statusBitVector(0), numScalarReqs(0), isSaveRestore(false),
+ _staticInst(static_inst), _seqNum(instSeqNum)
{
tlbHitLevel.assign(computeUnit()->wfSize(), -1);
- d_data = new uint8_t[computeUnit()->wfSize() * 16];
+ // vector instructions can have up to 4 source/destination operands
+ d_data = new uint8_t[computeUnit()->wfSize() * 4 * sizeof(double)];
a_data = new uint8_t[computeUnit()->wfSize() * 8];
x_data = new uint8_t[computeUnit()->wfSize() * 8];
+ // scalar loads can read up to 16 Dwords of data (see publicly
+ // available GCN3 ISA manual)
+ scalar_data = new uint8_t[16 * sizeof(uint32_t)];
+ for (int i = 0; i < (16 * sizeof(uint32_t)); ++i) {
+ scalar_data[i] = 0;
+ }
for (int i = 0; i < (computeUnit()->wfSize() * 8); ++i) {
a_data[i] = 0;
x_data[i] = 0;
}
- for (int i = 0; i < (computeUnit()->wfSize() * 16); ++i) {
+ for (int i = 0; i < (computeUnit()->wfSize() * 4 * sizeof(double)); ++i) {
d_data[i] = 0;
}
+ time = 0;
+
+ cu_id = _cu->cu_id;
+ if (_wf) {
+ simdId = _wf->simdId;
+ wfDynId = _wf->wfDynId;
+ kern_id = _wf->kernId;
+ wg_id = _wf->wgId;
+ wfSlotId = _wf->wfSlotId;
+ } else {
+ simdId = -1;
+ wfDynId = -1;
+ kern_id = -1;
+ wg_id = -1;
+ wfSlotId = -1;
+ }
}
GPUDynInst::~GPUDynInst()
delete[] d_data;
delete[] a_data;
delete[] x_data;
+ delete[] scalar_data;
+ delete _staticInst;
}
void
return _staticInst->numDstRegOperands();
}
+int
+GPUDynInst::numSrcVecOperands()
+{
+ return _staticInst->numSrcVecOperands();
+}
+
+int
+GPUDynInst::numDstVecOperands()
+{
+ return _staticInst->numDstVecOperands();
+}
+
+int
+GPUDynInst::numSrcVecDWORDs()
+{
+ return _staticInst->numSrcVecDWORDs();
+}
+
+int
+GPUDynInst::numDstVecDWORDs()
+{
+ return _staticInst->numDstVecDWORDs();
+}
+
+int
+GPUDynInst::numOpdDWORDs(int operandIdx)
+{
+ return _staticInst->numOpdDWORDs(operandIdx);
+}
+
int
GPUDynInst::getNumOperands()
{
return _staticInst->isScalarRegister(operandIdx);
}
-bool
-GPUDynInst::isCondRegister(int operandIdx)
-{
- return _staticInst->isCondRegister(operandIdx);
-}
-
int
GPUDynInst::getRegisterIndex(int operandIdx, GPUDynInstPtr gpuDynInst)
{
return _staticInst->isSrcOperand(operandIdx);
}
+bool
+GPUDynInst::hasSourceSgpr() const
+{
+ for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
+ if (_staticInst->isScalarRegister(i) && _staticInst->isSrcOperand(i)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+bool
+GPUDynInst::hasSourceVgpr() const
+{
+ for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
+ if (_staticInst->isVectorRegister(i) && _staticInst->isSrcOperand(i)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+bool
+GPUDynInst::hasDestinationSgpr() const
+{
+ for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
+ if (_staticInst->isScalarRegister(i) && _staticInst->isDstOperand(i)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+bool
+GPUDynInst::srcIsVgpr(int index) const
+{
+ assert(index >= 0 && index < _staticInst->getNumOperands());
+ if (_staticInst->isVectorRegister(index) &&
+ _staticInst->isSrcOperand(index)) {
+ return true;
+ }
+ return false;
+}
+
+bool
+GPUDynInst::hasDestinationVgpr() const
+{
+ for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
+ if (_staticInst->isVectorRegister(i) && _staticInst->isDstOperand(i)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+bool
+GPUDynInst::isOpcode(const std::string& opcodeStr,
+ const std::string& extStr) const
+{
+ return _staticInst->opcode().find(opcodeStr) != std::string::npos &&
+ _staticInst->opcode().find(extStr) != std::string::npos;
+}
+
+bool
+GPUDynInst::isOpcode(const std::string& opcodeStr) const
+{
+ return _staticInst->opcode().find(opcodeStr) != std::string::npos;
+}
+
const std::string&
GPUDynInst::disassemble() const
{
return _staticInst->disassemble();
}
-uint64_t
+InstSeqNum
GPUDynInst::seqNum() const
{
return _seqNum;
return _staticInst->executed_as;
}
+bool
+GPUDynInst::hasVgprRawDependence(GPUDynInstPtr s)
+{
+ assert(s);
+ for (int i = 0; i < getNumOperands(); ++i) {
+ if (isVectorRegister(i) && isSrcOperand(i)) {
+ for (int j = 0; j < s->getNumOperands(); ++j) {
+ if (s->isVectorRegister(j) && s->isDstOperand(j)) {
+ if (i == j)
+ return true;
+ }
+ }
+ }
+ }
+ return false;
+}
+
+bool
+GPUDynInst::hasSgprRawDependence(GPUDynInstPtr s)
+{
+ assert(s);
+ for (int i = 0; i < getNumOperands(); ++i) {
+ if (isScalarRegister(i) && isSrcOperand(i)) {
+ for (int j = 0; j < s->getNumOperands(); ++j) {
+ if (s->isScalarRegister(j) && s->isDstOperand(j)) {
+ if (i == j)
+ return true;
+ }
+ }
+ }
+ }
+ return false;
+}
+
// Process a memory instruction and (if necessary) submit timing request
void
GPUDynInst::initiateAcc(GPUDynInstPtr gpuDynInst)
cu->cu_id, simdId, wfSlotId, exec_mask);
_staticInst->initiateAcc(gpuDynInst);
- time = 0;
}
void
GPUDynInst::completeAcc(GPUDynInstPtr gpuDynInst)
{
+ DPRINTF(GPUMem, "CU%d: WF[%d][%d]: mempacket status bitvector="
+            "%#x complete\n",
+ cu->cu_id, simdId, wfSlotId, exec_mask);
+
_staticInst->completeAcc(gpuDynInst);
}
return _staticInst->isBranch();
}
+bool
+GPUDynInst::isCondBranch() const
+{
+ return _staticInst->isCondBranch();
+}
+
bool
GPUDynInst::isNop() const
{
return _staticInst->isNop();
}
+bool
+GPUDynInst::isEndOfKernel() const
+{
+ return _staticInst->isEndOfKernel();
+}
+
+bool
+GPUDynInst::isKernelLaunch() const
+{
+ return _staticInst->isKernelLaunch();
+}
+
+bool
+GPUDynInst::isSDWAInst() const
+{
+ return _staticInst->isSDWAInst();
+}
+
+bool
+GPUDynInst::isDPPInst() const
+{
+ return _staticInst->isDPPInst();
+}
+
bool
GPUDynInst::isReturn() const
{
}
bool
-GPUDynInst::isMemFence() const
+GPUDynInst::isMemSync() const
{
- return _staticInst->isMemFence();
+ return _staticInst->isMemSync();
}
bool
return _staticInst->isAtomicRet();
}
+bool
+GPUDynInst::isVector() const
+{
+ return !_staticInst->isScalar();
+}
+
bool
GPUDynInst::isScalar() const
{
return _staticInst->writesVCC();
}
+bool
+GPUDynInst::readsMode() const
+{
+ return _staticInst->readsMode();
+}
+
+bool
+GPUDynInst::writesMode() const
+{
+ return _staticInst->writesMode();
+}
+
+bool
+GPUDynInst::readsEXEC() const
+{
+ return _staticInst->readsEXEC();
+}
+
+bool
+GPUDynInst::writesEXEC() const
+{
+ return _staticInst->writesEXEC();
+}
+
+bool
+GPUDynInst::ignoreExec() const
+{
+ return _staticInst->ignoreExec();
+}
+
+bool
+GPUDynInst::writesExecMask() const
+{
+    for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
+        if (_staticInst->isDstOperand(i) &&
+            _staticInst->isExecMaskRegister(i)) {
+            return true;
+        }
+    }
+ return false;
+}
+
+bool
+GPUDynInst::readsExecMask() const
+{
+    for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
+        if (_staticInst->isSrcOperand(i) &&
+            _staticInst->isExecMaskRegister(i)) {
+            return true;
+        }
+    }
+ return false;
+}
+
+bool
+GPUDynInst::writesFlatScratch() const
+{
+ for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
+ if (_staticInst->isScalarRegister(i) && _staticInst->isDstOperand(i)) {
+ return _staticInst->isFlatScratchRegister(i);
+ }
+ }
+ return false;
+}
+
+bool
+GPUDynInst::readsFlatScratch() const
+{
+ for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
+ if (_staticInst->isScalarRegister(i) && _staticInst->isSrcOperand(i)) {
+ return _staticInst->isFlatScratchRegister(i);
+ }
+ }
+ return false;
+}
+
bool
GPUDynInst::isAtomicAnd() const
{
}
bool
-GPUDynInst::isWorkitemScope() const
-{
- return _staticInst->isWorkitemScope();
-}
-
-bool
-GPUDynInst::isWavefrontScope() const
+GPUDynInst::isGloballyCoherent() const
{
- return _staticInst->isWavefrontScope();
+ return _staticInst->isGloballyCoherent();
}
bool
-GPUDynInst::isWorkgroupScope() const
+GPUDynInst::isSystemCoherent() const
{
- return _staticInst->isWorkgroupScope();
+ return _staticInst->isSystemCoherent();
}
bool
-GPUDynInst::isDeviceScope() const
+GPUDynInst::isF16() const
{
- return _staticInst->isDeviceScope();
+ return _staticInst->isF16();
}
bool
-GPUDynInst::isSystemScope() const
+GPUDynInst::isF32() const
{
- return _staticInst->isSystemScope();
+ return _staticInst->isF32();
}
bool
-GPUDynInst::isNoScope() const
+GPUDynInst::isF64() const
{
- return _staticInst->isNoScope();
+ return _staticInst->isF64();
}
bool
-GPUDynInst::isRelaxedOrder() const
+GPUDynInst::isFMA() const
{
- return _staticInst->isRelaxedOrder();
+ return _staticInst->isFMA();
}
bool
-GPUDynInst::isAcquire() const
+GPUDynInst::isMAC() const
{
- return _staticInst->isAcquire();
+ return _staticInst->isMAC();
}
bool
-GPUDynInst::isRelease() const
+GPUDynInst::isMAD() const
{
- return _staticInst->isRelease();
+ return _staticInst->isMAD();
}
-bool
-GPUDynInst::isAcquireRelease() const
-{
- return _staticInst->isAcquireRelease();
-}
+void
+GPUDynInst::doApertureCheck(const VectorMask &mask)
+{
+ assert(mask.any());
+ // find the segment of the first active address, after
+ // that we check that all other active addresses also
+ // fall within the same APE
+ for (int lane = 0; lane < computeUnit()->wfSize(); ++lane) {
+ if (mask[lane]) {
+ if (computeUnit()->shader->isLdsApe(addr[lane])) {
+ // group segment
+ staticInstruction()->executed_as = Enums::SC_GROUP;
+ break;
+ } else if (computeUnit()->shader->isScratchApe(addr[lane])) {
+ // private segment
+ staticInstruction()->executed_as = Enums::SC_PRIVATE;
+ break;
+ } else if (computeUnit()->shader->isGpuVmApe(addr[lane])) {
+ // we won't support GPUVM
+ fatal("flat access is in GPUVM APE\n");
+ } else if (bits(addr[lane], 63, 47) != 0x1FFFF &&
+ bits(addr[lane], 63, 47)) {
+ // we are in the "hole", this is a memory violation
+ fatal("flat access at addr %#x has a memory violation\n",
+ addr[lane]);
+ } else {
+ // global memory segment
+ staticInstruction()->executed_as = Enums::SC_GLOBAL;
+ break;
+ }
+ }
+ }
-bool
-GPUDynInst::isNoOrder() const
-{
- return _staticInst->isNoOrder();
+ // we should have found the segment
+ assert(executedAs() != Enums::SC_NONE);
+
+ // flat accesses should not straddle multiple APEs so we
+ // must check that all addresses fall within the same APE
+ if (executedAs() == Enums::SC_GROUP) {
+ for (int lane = 0; lane < computeUnit()->wfSize(); ++lane) {
+ if (mask[lane]) {
+ // if the first valid addr we found above was LDS,
+ // all the rest should be
+ assert(computeUnit()->shader->isLdsApe(addr[lane]));
+ }
+ }
+ } else if (executedAs() == Enums::SC_PRIVATE) {
+ for (int lane = 0; lane < computeUnit()->wfSize(); ++lane) {
+ if (mask[lane]) {
+ // if the first valid addr we found above was private,
+ // all the rest should be
+ assert(computeUnit()->shader->isScratchApe(addr[lane]));
+ }
+ }
+ } else {
+ for (int lane = 0; lane < computeUnit()->wfSize(); ++lane) {
+ if (mask[lane]) {
+ // if the first valid addr we found above was global,
+ // all the rest should be. because we don't have an
+ // explicit range of the global segment, we just make
+                // sure that the address falls in no other APE and that
+ // it is not a memory violation
+ assert(!computeUnit()->shader->isLdsApe(addr[lane]));
+ assert(!computeUnit()->shader->isScratchApe(addr[lane]));
+ assert(!computeUnit()->shader->isGpuVmApe(addr[lane]));
+ assert(!(bits(addr[lane], 63, 47) != 0x1FFFF
+ && bits(addr[lane], 63, 47)));
+ }
+ }
+ }
}
-bool
-GPUDynInst::isGloballyCoherent() const
-{
- return _staticInst->isGloballyCoherent();
+void
+GPUDynInst::resolveFlatSegment(const VectorMask &mask)
+{
+ doApertureCheck(mask);
+
+
+    // Now that we know the aperture, do the following:
+    // 1. Transform the flat address to its segmented equivalent.
+    // 2. Set the execUnitId based on the aperture check.
+ // 3. Decrement any extra resources that were reserved. Other
+ // resources are released as normal, below.
+ if (executedAs() == Enums::SC_GLOBAL) {
+        // no transformation for global segment
+ wavefront()->execUnitId = wavefront()->flatGmUnitId;
+ if (isLoad()) {
+ wavefront()->rdLmReqsInPipe--;
+ } else if (isStore()) {
+ wavefront()->wrLmReqsInPipe--;
+ } else if (isAtomic() || isMemSync()) {
+ wavefront()->wrLmReqsInPipe--;
+ wavefront()->rdLmReqsInPipe--;
+ } else {
+ panic("Invalid memory operation!\n");
+ }
+ } else if (executedAs() == Enums::SC_GROUP) {
+ for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) {
+ if (mask[lane]) {
+ // flat address calculation goes here.
+ // addr[lane] = segmented address
+ panic("Flat group memory operation is unimplemented!\n");
+ }
+ }
+ wavefront()->execUnitId = wavefront()->flatLmUnitId;
+ if (isLoad()) {
+ wavefront()->rdGmReqsInPipe--;
+ } else if (isStore()) {
+ wavefront()->wrGmReqsInPipe--;
+ } else if (isAtomic() || isMemSync()) {
+ wavefront()->rdGmReqsInPipe--;
+ wavefront()->wrGmReqsInPipe--;
+ } else {
+ panic("Invalid memory operation!\n");
+ }
+ } else if (executedAs() == Enums::SC_PRIVATE) {
+ /**
+ * Flat instructions may resolve to the private segment (scratch),
+ * which is backed by main memory and provides per-lane scratch
+ * memory. Flat addressing uses apertures - registers that specify
+ * the address range in the VA space where LDS/private memory is
+         * mapped; their values are set by the kernel mode driver.
+ * These apertures use addresses that are not used by x86 CPUs.
+ * When the address of a Flat operation falls into one of the
+ * apertures, the Flat operation is redirected to either LDS or
+ * to the private memory segment.
+ *
+ * For private memory the SW runtime will allocate some space in
+         * the VA space for each AQL queue, whose base address is
+         * stored in scalar registers per the AMD GPU ABI. The amd_queue_t
+ * scratch_backing_memory_location provides the base address in
+ * memory for the queue's private segment. Various other fields
+ * loaded into register state during kernel launch specify per-WF
+ * and per-work-item offsets so that individual lanes may access
+ * their private segment allocation.
+ *
+ * For more details about flat addressing see:
+ * http://rocm-documentation.readthedocs.io/en/latest/
+ * ROCm_Compiler_SDK/ROCm-Native-ISA.html#flat-scratch
+ *
+ * https://github.com/ROCm-Developer-Tools/
+ * ROCm-ComputeABI-Doc/blob/master/AMDGPU-ABI.md
+ * #flat-addressing
+ */
+
+ uint32_t numSgprs = wavefront()->maxSgprs;
+ uint32_t physSgprIdx =
+ wavefront()->computeUnit->registerManager->mapSgpr(wavefront(),
+ numSgprs - 3);
+ uint32_t offset =
+ wavefront()->computeUnit->srf[simdId]->read(physSgprIdx);
+ physSgprIdx =
+ wavefront()->computeUnit->registerManager->mapSgpr(wavefront(),
+ numSgprs - 4);
+ uint32_t size =
+ wavefront()->computeUnit->srf[simdId]->read(physSgprIdx);
+ for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) {
+ if (mask[lane]) {
+ addr[lane] = addr[lane] + lane * size + offset +
+ wavefront()->computeUnit->shader->getHiddenPrivateBase() -
+ wavefront()->computeUnit->shader->getScratchBase();
+ }
+ }
+ wavefront()->execUnitId = wavefront()->flatLmUnitId;
+ if (isLoad()) {
+ wavefront()->rdGmReqsInPipe--;
+ } else if (isStore()) {
+ wavefront()->wrGmReqsInPipe--;
+ } else if (isAtomic() || isMemSync()) {
+ wavefront()->rdGmReqsInPipe--;
+ wavefront()->wrGmReqsInPipe--;
+ } else {
+ panic("Invalid memory operation!\n");
+ }
+ } else {
+ for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) {
+ if (mask[lane]) {
+ panic("flat addr %#llx maps to bad segment %d\n",
+ addr[lane], executedAs());
+ }
+ }
+ }
}
-bool
-GPUDynInst::isSystemCoherent() const
+TheGpuISA::ScalarRegU32
+GPUDynInst::srcLiteral() const
{
- return _staticInst->isSystemCoherent();
+ return _staticInst->srcLiteral();
}
void
if (_staticInst->isLocalMem()) {
// access to LDS (shared) memory
cu->dynamicLMemInstrCnt++;
+ } else if (_staticInst->isFlat()) {
+ cu->dynamicFlatMemInstrCnt++;
} else {
// access to global memory
cu->dynamicGMemInstrCnt++;
}
}
+
+void
+GPUDynInst::profileRoundTripTime(Tick currentTime, int hopId)
+{
+ // Only take the first measurement in the case of coalescing
+ if (roundTripTime.size() > hopId)
+ return;
+
+ roundTripTime.push_back(currentTime);
+}
+
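+// Record the tick at which the cache line at 'addr' reached the hop
+// identified by hopId. As with profileRoundTripTime() above, only the
+// first sample per hop is kept, so coalesced accesses are not counted
+// twice.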
+void
+GPUDynInst::profileLineAddressTime(Addr addr, Tick currentTime, int hopId)
+{
+ if (lineAddressTime.count(addr)) {
+ if (lineAddressTime[addr].size() > hopId) {
+ return;
+ }
+
+ lineAddressTime[addr].push_back(currentTime);
+ } else if (hopId == 0) {
+ auto addressTimeVec = std::vector<Tick> { currentTime };
+ lineAddressTime.insert(std::make_pair(addr, addressTimeVec));
+ }
+}
#include "base/amo.hh"
#include "base/logging.hh"
-#include "enums/MemType.hh"
#include "enums/StorageClassType.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_exec_context.hh"
} else {
computeUnit->numFailedCASOps++;
}
-
- if (computeUnit->xact_cas_mode) {
- computeUnit->xactCasLoadMap.clear();
- }
}
AtomicOpFunctor* clone () { return new AtomicOpCAS(c, s, computeUnit); }
};
-typedef enum
-{
- VT_32,
- VT_64,
-} vgpr_type;
-
class GPUDynInst : public GPUExecContext
{
public:
void execute(GPUDynInstPtr gpuDynInst);
int numSrcRegOperands();
int numDstRegOperands();
+ int numDstVecOperands();
+ int numSrcVecOperands();
+ int numSrcVecDWORDs();
+ int numDstVecDWORDs();
+ int numOpdDWORDs(int operandIdx);
int getNumOperands();
bool isVectorRegister(int operandIdx);
bool isScalarRegister(int operandIdx);
- bool isCondRegister(int operandIdx);
int getRegisterIndex(int operandIdx, GPUDynInstPtr gpuDynInst);
int getOperandSize(int operandIdx);
bool isDstOperand(int operandIdx);
bool isSrcOperand(int operandIdx);
+ bool hasDestinationSgpr() const;
+ bool hasSourceSgpr() const;
+ bool hasDestinationVgpr() const;
+ bool hasSourceVgpr() const;
+
+ bool hasSgprRawDependence(GPUDynInstPtr s);
+ bool hasVgprRawDependence(GPUDynInstPtr s);
+
+ // returns true if the string "opcodeStr" is found in the
+ // opcode of the instruction
+ bool isOpcode(const std::string& opcodeStr) const;
+ bool isOpcode(const std::string& opcodeStr,
+ const std::string& extStr) const;
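+    // e.g., isOpcode("flat_load") would match a flat_load_dword
+    // instruction (illustrative; matching is by substring, per the
+    // comment above)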
+ // returns true if source operand at "index" is a vector register
+ bool srcIsVgpr(int index) const;
+
const std::string &disassemble() const;
- uint64_t seqNum() const;
+ InstSeqNum seqNum() const;
Enums::StorageClassType executedAs();
- // The address of the memory operation
+ // virtual address for scalar memory operations
+ Addr scalarAddr;
+    // virtual addresses for vector memory operations
std::vector<Addr> addr;
Addr pAddr;
- // The data to get written
+ // vector data to get written
uint8_t *d_data;
+ // scalar data to be transferred
+ uint8_t *scalar_data;
// Additional data (for atomics)
uint8_t *a_data;
// Additional data (for atomics)
// The execution mask
VectorMask exec_mask;
- // The memory type (M_U32, M_S32, ...)
- Enums::MemType m_type;
-
- // The equivalency class
- int equiv;
- // The return VGPR type (VT_32 or VT_64)
- vgpr_type v_type;
- // Number of VGPR's accessed (1, 2, or 4)
- int n_reg;
- // The return VGPR index
- int dst_reg;
- // There can be max 4 dest regs>
- int dst_reg_vec[4];
// SIMD where the WF of the memory instruction has been mapped to
int simdId;
// unique id of the WF where the memory instruction belongs to
int kern_id;
// The CU id of the requesting wf
int cu_id;
+ // The workgroup id of the requesting wf
+ int wg_id;
// HW slot id where the WF is mapped to inside a SIMD unit
int wfSlotId;
// execution pipeline id where the memory instruction has been scheduled
- int pipeId;
+ int execUnitId;
// The execution time of this operation
Tick time;
// The latency of this operation
WaitClass latency;
- // A list of bank conflicts for the 4 cycles.
- uint32_t bc[4];
-
- // A pointer to ROM
- uint8_t *rom;
- // The size of the READONLY segment
- int sz_rom;
// Initiate the specified memory operation, by creating a
// memory request and sending it off to the memory system.
GPUStaticInst* staticInstruction() { return _staticInst; }
+ TheGpuISA::ScalarRegU32 srcLiteral() const;
+
bool isALU() const;
bool isBranch() const;
+ bool isCondBranch() const;
bool isNop() const;
bool isReturn() const;
+ bool isEndOfKernel() const;
+ bool isKernelLaunch() const;
+ bool isSDWAInst() const;
+ bool isDPPInst() const;
bool isUnconditionalJump() const;
bool isSpecialOp() const;
bool isWaitcnt() const;
bool isBarrier() const;
- bool isMemFence() const;
+ bool isMemSync() const;
bool isMemRef() const;
bool isFlat() const;
bool isLoad() const;
bool isAtomicRet() const;
bool isScalar() const;
+ bool isVector() const;
bool readsSCC() const;
bool writesSCC() const;
bool readsVCC() const;
bool writesVCC() const;
+ bool readsEXEC() const;
+ bool writesEXEC() const;
+ bool readsMode() const;
+ bool writesMode() const;
+ bool ignoreExec() const;
+ bool readsFlatScratch() const;
+ bool writesFlatScratch() const;
+ bool readsExecMask() const;
+ bool writesExecMask() const;
bool isAtomicAnd() const;
bool isAtomicOr() const;
bool isReadOnlySeg() const;
bool isSpillSeg() const;
- bool isWorkitemScope() const;
- bool isWavefrontScope() const;
- bool isWorkgroupScope() const;
- bool isDeviceScope() const;
- bool isSystemScope() const;
- bool isNoScope() const;
-
- bool isRelaxedOrder() const;
- bool isAcquire() const;
- bool isRelease() const;
- bool isAcquireRelease() const;
- bool isNoOrder() const;
-
bool isGloballyCoherent() const;
bool isSystemCoherent() const;
- /*
- * Loads/stores/atomics may have acquire/release semantics associated
- * withthem. Some protocols want to see the acquire/release as separate
- * requests from the load/store/atomic. We implement that separation
- * using continuations (i.e., a function pointer with an object associated
- * with it). When, for example, the front-end generates a store with
- * release semantics, we will first issue a normal store and set the
- * continuation in the GPUDynInst to a function that generate a
- * release request. That continuation will be called when the normal
- * store completes (in ComputeUnit::DataPort::recvTimingResponse). The
- * continuation will be called in the context of the same GPUDynInst
- * that generated the initial store.
- */
- std::function<void(GPUStaticInst*, GPUDynInstPtr)> execContinuation;
-
- // when true, call execContinuation when response arrives
- bool useContinuation;
+ bool isF16() const;
+ bool isF32() const;
+ bool isF64() const;
+
+ bool isFMA() const;
+ bool isMAC() const;
+ bool isMAD() const;
+
+    // For FLAT memory ops, check the segment address against the APE
+    // registers to see if it falls within one of the APE ranges for
+    // LDS/SCRATCH/GPUVM. If it does not fall into one of the three
+    // APEs, it will be a regular global access.
+ void doApertureCheck(const VectorMask &mask);
+    // Function to resolve flat accesses during the execution stage.
+ void resolveFlatSegment(const VectorMask &mask);
template<typename c0> AtomicOpFunctorPtr
makeAtomicOpFunctor(c0 *reg0, c0 *reg1)
}
void
- setRequestFlags(RequestPtr req, bool setMemOrder=true)
+ setRequestFlags(RequestPtr req) const
{
- // currently these are the easy scopes to deduce
- if (isPrivateSeg()) {
- req->setMemSpaceConfigFlags(Request::PRIVATE_SEGMENT);
- } else if (isSpillSeg()) {
- req->setMemSpaceConfigFlags(Request::SPILL_SEGMENT);
- } else if (isGlobalSeg()) {
- req->setMemSpaceConfigFlags(Request::GLOBAL_SEGMENT);
- } else if (isReadOnlySeg()) {
- req->setMemSpaceConfigFlags(Request::READONLY_SEGMENT);
- } else if (isGroupSeg()) {
- req->setMemSpaceConfigFlags(Request::GROUP_SEGMENT);
- } else if (isFlat()) {
- panic("TODO: translate to correct scope");
- } else {
- fatal("%s has bad segment type\n", disassemble());
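+        // map the instruction's coherence attributes onto the GLC/SLC
+        // cache coherence bits carried by the request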
+ if (isGloballyCoherent()) {
+ req->setCacheCoherenceFlags(Request::GLC_BIT);
}
- if (isWavefrontScope()) {
- req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
- Request::WAVEFRONT_SCOPE);
- } else if (isWorkgroupScope()) {
- req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
- Request::WORKGROUP_SCOPE);
- } else if (isDeviceScope()) {
- req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
- Request::DEVICE_SCOPE);
- } else if (isSystemScope()) {
- req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
- Request::SYSTEM_SCOPE);
- } else if (!isNoScope() && !isWorkitemScope()) {
- fatal("%s has bad scope type\n", disassemble());
+ if (isSystemCoherent()) {
+ req->setCacheCoherenceFlags(Request::SLC_BIT);
}
- if (setMemOrder) {
- // set acquire and release flags
- if (isAcquire()) {
- req->setFlags(Request::ACQUIRE);
- } else if (isRelease()) {
- req->setFlags(Request::RELEASE);
- } else if (isAcquireRelease()) {
- req->setFlags(Request::ACQUIRE | Request::RELEASE);
- } else if (!isNoOrder()) {
- fatal("%s has bad memory order\n", disassemble());
- }
- }
-
- // set atomic type
- // currently, the instruction genenerator only produces atomic return
- // but a magic instruction can produce atomic no return
if (isAtomicRet()) {
req->setFlags(Request::ATOMIC_RETURN_OP);
} else if (isAtomicNoRet()) {
req->setFlags(Request::ATOMIC_NO_RETURN_OP);
}
+
+ if (isMemSync()) {
+ // the path for kernel launch and kernel end is different
+ // from non-kernel mem sync.
+ assert(!isKernelLaunch());
+ assert(!isEndOfKernel());
+
+ // must be wbinv inst if not kernel launch/end
+ req->setCacheCoherenceFlags(Request::ACQUIRE);
+ }
}
// Map returned packets and the addresses they satisfy with which lane they
// Track the status of memory requests per lane, a bit per lane
VectorMask statusBitVector;
// for ld_v# or st_v#
- std::vector<int> statusVector;
std::vector<int> tlbHitLevel;
+ // for misaligned scalar ops we track the number
+ // of outstanding reqs here
+ int numScalarReqs;
+
+ Tick getAccessTime() const { return accessTime; }
+
+ void setAccessTime(Tick currentTime) { accessTime = currentTime; }
+
+ void profileRoundTripTime(Tick currentTime, int hopId);
+ std::vector<Tick> getRoundTripTime() const { return roundTripTime; }
+
+ void profileLineAddressTime(Addr addr, Tick currentTime, int hopId);
+ const std::map<Addr, std::vector<Tick>>& getLineAddressTime() const
+ { return lineAddressTime; }
+
+ // inst used to save/restore a wavefront context
+ bool isSaveRestore;
private:
GPUStaticInst *_staticInst;
- uint64_t _seqNum;
+ const InstSeqNum _seqNum;
+
+ // the time the request was started
+ Tick accessTime = -1;
+
+ // hold the tick when the instruction arrives at certain hop points
+    // on its way to main memory
+ std::vector<Tick> roundTripTime;
+
+ // hold each cache block address for the instruction and a vector
+ // to hold the tick when the block arrives at certain hop points
+ std::map<Addr, std::vector<Tick>> lineAddressTime;
};
#endif // __GPU_DYN_INST_HH__
}
void
-GPUExecContext::writeMiscReg(int opIdx, RegVal operandVal)
+GPUExecContext::writeMiscReg(int opIdx, RegVal val)
{
assert(gpuISA);
- gpuISA->writeMiscReg(opIdx, operandVal);
+ gpuISA->writeMiscReg(opIdx, val);
}
#include "gpu-compute/gpu_static_inst.hh"
GPUStaticInst::GPUStaticInst(const std::string &opcode)
- : executed_as(Enums::SC_NONE), opcode(opcode),
- _instNum(0), _instAddr(0)
+ : executed_as(Enums::SC_NONE), _opcode(opcode),
+ _instNum(0), _instAddr(0), srcVecOperands(-1), dstVecOperands(-1),
+ srcVecDWORDs(-1), dstVecDWORDs(-1)
{
- setFlag(NoOrder);
}
const std::string&
return disassembly;
}
+
+int
+GPUStaticInst::numSrcVecOperands()
+{
+ if (srcVecOperands > -1)
+ return srcVecOperands;
+
+ srcVecOperands = 0;
+ if (!isScalar()) {
+ for (int k = 0; k < getNumOperands(); ++k) {
+ if (isVectorRegister(k) && isSrcOperand(k))
+ srcVecOperands++;
+ }
+ }
+ return srcVecOperands;
+}
+
+int
+GPUStaticInst::numDstVecOperands()
+{
+ if (dstVecOperands > -1)
+ return dstVecOperands;
+
+ dstVecOperands = 0;
+ if (!isScalar()) {
+ for (int k = 0; k < getNumOperands(); ++k) {
+ if (isVectorRegister(k) && isDstOperand(k))
+ dstVecOperands++;
+ }
+ }
+ return dstVecOperands;
+}
+
+int
+GPUStaticInst::numSrcVecDWORDs()
+{
+ if (srcVecDWORDs > -1) {
+ return srcVecDWORDs;
+ }
+
+ srcVecDWORDs = 0;
+ if (!isScalar()) {
+ for (int i = 0; i < getNumOperands(); i++) {
+ if (isVectorRegister(i) && isSrcOperand(i)) {
+ int dwords = numOpdDWORDs(i);
+ srcVecDWORDs += dwords;
+ }
+ }
+ }
+ return srcVecDWORDs;
+}
+
+int
+GPUStaticInst::numDstVecDWORDs()
+{
+ if (dstVecDWORDs > -1) {
+ return dstVecDWORDs;
+ }
+
+ dstVecDWORDs = 0;
+ if (!isScalar()) {
+ for (int i = 0; i < getNumOperands(); i++) {
+ if (isVectorRegister(i) && isDstOperand(i)) {
+ int dwords = numOpdDWORDs(i);
+ dstVecDWORDs += dwords;
+ }
+ }
+ }
+ return dstVecDWORDs;
+}
+
+int
+GPUStaticInst::numOpdDWORDs(int operandIdx)
+{
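+    // operands smaller than a DWORD (4 bytes) still count as one DWORD;
+    // e.g., a 2-byte operand returns 1 while a 16-byte operand returns 4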
+ return getOperandSize(operandIdx) <= 4 ? 1
+ : getOperandSize(operandIdx) / 4;
+}
{
public:
GPUStaticInst(const std::string &opcode);
+ virtual ~GPUStaticInst() { }
void instAddr(int inst_addr) { _instAddr = inst_addr; }
int instAddr() const { return _instAddr; }
int nextInstAddr() const { return _instAddr + instSize(); }
int ipdInstNum() const { return _ipdInstNum; }
+ virtual TheGpuISA::ScalarRegU32 srcLiteral() const { return 0; }
+
virtual void execute(GPUDynInstPtr gpuDynInst) = 0;
virtual void generateDisassembly() = 0;
const std::string& disassemble();
virtual int getNumOperands() = 0;
- virtual bool isCondRegister(int operandIndex) = 0;
virtual bool isScalarRegister(int operandIndex) = 0;
virtual bool isVectorRegister(int operandIndex) = 0;
virtual bool isSrcOperand(int operandIndex) = 0;
virtual bool isDstOperand(int operandIndex) = 0;
+ virtual bool isFlatScratchRegister(int opIdx) = 0;
+ virtual bool isExecMaskRegister(int opIdx) = 0;
virtual int getOperandSize(int operandIndex) = 0;
virtual int getRegisterIndex(int operandIndex,
virtual int numDstRegOperands() = 0;
virtual int numSrcRegOperands() = 0;
- virtual bool isValid() const = 0;
+ virtual int coalescerTokenCount() const { return 0; }
+
+ int numDstVecOperands();
+ int numSrcVecOperands();
+ int numDstVecDWORDs();
+ int numSrcVecDWORDs();
+
+ int numOpdDWORDs(int operandIdx);
bool isALU() const { return _flags[ALU]; }
bool isBranch() const { return _flags[Branch]; }
+ bool isCondBranch() const { return _flags[CondBranch]; }
bool isNop() const { return _flags[Nop]; }
bool isReturn() const { return _flags[Return]; }
+ bool isEndOfKernel() const { return _flags[EndOfKernel]; }
+ bool isKernelLaunch() const { return _flags[KernelLaunch]; }
+ bool isSDWAInst() const { return _flags[IsSDWA]; }
+ bool isDPPInst() const { return _flags[IsDPP]; }
bool
isUnconditionalJump() const
bool isWaitcnt() const { return _flags[Waitcnt]; }
bool isBarrier() const { return _flags[MemBarrier]; }
- bool isMemFence() const { return _flags[MemFence]; }
+ bool isMemSync() const { return _flags[MemSync]; }
bool isMemRef() const { return _flags[MemoryRef]; }
bool isFlat() const { return _flags[Flat]; }
bool isLoad() const { return _flags[Load]; }
bool writesSCC() const { return _flags[WritesSCC]; }
bool readsVCC() const { return _flags[ReadsVCC]; }
bool writesVCC() const { return _flags[WritesVCC]; }
+    // Identify instructions that implicitly read the Execute mask as
+    // a source operand (i.e., as data), not merely to dictate which
+    // threads execute.
+ bool readsEXEC() const { return _flags[ReadsEXEC]; }
+ bool writesEXEC() const { return _flags[WritesEXEC]; }
+ bool readsMode() const { return _flags[ReadsMode]; }
+ bool writesMode() const { return _flags[WritesMode]; }
+ bool ignoreExec() const { return _flags[IgnoreExec]; }
bool isAtomicAnd() const { return _flags[AtomicAnd]; }
bool isAtomicOr() const { return _flags[AtomicOr]; }
bool isReadOnlySeg() const { return _flags[ReadOnlySegment]; }
bool isSpillSeg() const { return _flags[SpillSegment]; }
- bool isWorkitemScope() const { return _flags[WorkitemScope]; }
- bool isWavefrontScope() const { return _flags[WavefrontScope]; }
- bool isWorkgroupScope() const { return _flags[WorkgroupScope]; }
- bool isDeviceScope() const { return _flags[DeviceScope]; }
- bool isSystemScope() const { return _flags[SystemScope]; }
- bool isNoScope() const { return _flags[NoScope]; }
-
- bool isRelaxedOrder() const { return _flags[RelaxedOrder]; }
- bool isAcquire() const { return _flags[Acquire]; }
- bool isRelease() const { return _flags[Release]; }
- bool isAcquireRelease() const { return _flags[AcquireRelease]; }
- bool isNoOrder() const { return _flags[NoOrder]; }
-
/**
- * Coherence domain of a memory instruction. Only valid for
- * machine ISA. The coherence domain specifies where it is
- * possible to perform memory synchronization, e.g., acquire
- * or release, from the shader kernel.
+ * Coherence domain of a memory instruction. The coherence domain
+ * specifies where it is possible to perform memory synchronization
+ * (e.g., acquire or release) from the shader kernel.
*
- * isGloballyCoherent(): returns true if kernel is sharing memory
- * with other work-items on the same device (GPU)
+     * isGloballyCoherent(): returns true if WIs share the same device
+     * isSystemCoherent(): returns true if WIs or threads on different
+     * devices share memory
*
- * isSystemCoherent(): returns true if kernel is sharing memory
- * with other work-items on a different device (GPU) or the host (CPU)
*/
bool isGloballyCoherent() const { return _flags[GloballyCoherent]; }
bool isSystemCoherent() const { return _flags[SystemCoherent]; }
+ // Floating-point instructions
+ bool isF16() const { return _flags[F16]; }
+ bool isF32() const { return _flags[F32]; }
+ bool isF64() const { return _flags[F64]; }
+
+ // FMA, MAC, MAD instructions
+ bool isFMA() const { return _flags[FMA]; }
+ bool isMAC() const { return _flags[MAC]; }
+ bool isMAD() const { return _flags[MAD]; }
+
virtual int instSize() const = 0;
// only used for memory instructions
// For flat memory accesses
Enums::StorageClassType executed_as;
- void setFlag(Flags flag) { _flags[flag] = true; }
-
- virtual void
- execLdAcq(GPUDynInstPtr gpuDynInst)
- {
- fatal("calling execLdAcq() on a non-load instruction.\n");
- }
-
- virtual void
- execSt(GPUDynInstPtr gpuDynInst)
- {
- fatal("calling execLdAcq() on a non-load instruction.\n");
- }
-
- virtual void
- execAtomic(GPUDynInstPtr gpuDynInst)
- {
- fatal("calling execAtomic() on a non-atomic instruction.\n");
- }
-
- virtual void
- execAtomicAcq(GPUDynInstPtr gpuDynInst)
- {
- fatal("calling execAtomicAcq() on a non-atomic instruction.\n");
+ void setFlag(Flags flag) {
+ _flags[flag] = true;
+
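+        // once a segment flag is set, record the corresponding storage
+        // class so executedAs() can report it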
+ if (isGroupSeg()) {
+ executed_as = Enums::SC_GROUP;
+ } else if (isGlobalSeg()) {
+ executed_as = Enums::SC_GLOBAL;
+ } else if (isPrivateSeg()) {
+ executed_as = Enums::SC_PRIVATE;
+ } else if (isSpillSeg()) {
+ executed_as = Enums::SC_SPILL;
+ } else if (isReadOnlySeg()) {
+ executed_as = Enums::SC_READONLY;
+ } else if (isKernArgSeg()) {
+ executed_as = Enums::SC_KERNARG;
+ } else if (isArgSeg()) {
+ executed_as = Enums::SC_ARG;
+ }
}
+ const std::string& opcode() const { return _opcode; }
protected:
- const std::string opcode;
+ const std::string _opcode;
std::string disassembly;
int _instNum;
int _instAddr;
+ int srcVecOperands;
+ int dstVecOperands;
+ int srcVecDWORDs;
+ int dstVecDWORDs;
/**
* Identifier of the immediate post-dominator instruction.
*/
KernelLaunchStaticInst() : GPUStaticInst("kernel_launch")
{
setFlag(Nop);
+ setFlag(KernelLaunch);
+ setFlag(MemSync);
setFlag(Scalar);
- setFlag(Acquire);
- setFlag(SystemScope);
setFlag(GlobalSegment);
}
void
generateDisassembly() override
{
- disassembly = opcode;
+ disassembly = _opcode;
}
int getNumOperands() override { return 0; }
- bool isCondRegister(int operandIndex) override { return false; }
+ bool isFlatScratchRegister(int opIdx) override { return false; }
+ // return true if the Execute mask is explicitly used as a source
+ // register operand
+ bool isExecMaskRegister(int opIdx) override { return false; }
bool isScalarRegister(int operandIndex) override { return false; }
bool isVectorRegister(int operandIndex) override { return false; }
bool isSrcOperand(int operandIndex) override { return false; }
int numDstRegOperands() override { return 0; }
int numSrcRegOperands() override { return 0; }
- bool isValid() const override { return true; }
int instSize() const override { return 0; }
};
allocationPolicy = p->allocationPolicy;
hasMemSidePort = false;
accessDistance = p->accessDistance;
- clock = p->clk_domain->clockPeriod();
tlb.assign(size, TlbEntry());
{
bool delayedResponse;
- return GpuTLB::translate(req, tc, nullptr, mode, delayedResponse, false,
- latency);
+ return GpuTLB::translate(req, tc, nullptr, mode, delayedResponse,
+ false, latency);
}
void
}
/*
- * We now know the TLB lookup outcome (if it's a hit or a miss), as well
- * as the TLB access latency.
+ * We now know the TLB lookup outcome (if it's a hit or a miss), as
+ * well as the TLB access latency.
*
* We create and schedule a new TLBEvent which will help us take the
- * appropriate actions (e.g., update TLB on a hit, send request to lower
- * level TLB on a miss, or start a page walk if this was the last-level
- * TLB)
+ * appropriate actions (e.g., update TLB on a hit, send request to
+ * lower level TLB on a miss, or start a page walk if this was the
+ * last-level TLB)
*/
TLBEvent *tlb_event =
new TLBEvent(this, virt_page_addr, lookup_outcome, pkt);
assert(tlb_event);
DPRINTF(GPUTLB, "schedule translationReturnEvent @ curTick %d\n",
- curTick() + this->ticks(hitLatency));
+ curTick() + cyclesToTicks(Cycles(hitLatency)));
- schedule(tlb_event, curTick() + this->ticks(hitLatency));
+ schedule(tlb_event, curTick() + cyclesToTicks(Cycles(hitLatency)));
}
- GpuTLB::TLBEvent::TLBEvent(GpuTLB* _tlb, Addr _addr, tlbOutcome tlb_outcome,
- PacketPtr _pkt)
- : Event(CPU_Tick_Pri), tlb(_tlb), virtPageAddr(_addr),
- outcome(tlb_outcome), pkt(_pkt)
+ GpuTLB::TLBEvent::TLBEvent(GpuTLB* _tlb, Addr _addr,
+ tlbOutcome tlb_outcome, PacketPtr _pkt)
+ : Event(CPU_Tick_Pri), tlb(_tlb), virtPageAddr(_addr),
+ outcome(tlb_outcome), pkt(_pkt)
{
}
bool storeCheck = flags & (StoreCheck << FlagShift);
// Do paging protection checks.
- bool inUser = (m5Reg.cpl == 3 && !(flags & (CPL0FlagBit << FlagShift)));
+ bool inUser
+ = (m5Reg.cpl == 3 && !(flags & (CPL0FlagBit << FlagShift)));
CR0 cr0 = tc->readMiscRegNoEffect(MISCREG_CR0);
bool badWrite = (!tlb_entry->writable && (inUser || cr0.wp));
* The latter calls handelHit with TLB miss as tlbOutcome.
*/
void
- GpuTLB::handleTranslationReturn(Addr virt_page_addr, tlbOutcome tlb_outcome,
- PacketPtr pkt)
+ GpuTLB::handleTranslationReturn(Addr virt_page_addr,
+ tlbOutcome tlb_outcome, PacketPtr pkt)
{
-
assert(pkt);
Addr vaddr = pkt->req->getVaddr();
TlbEntry *local_entry, *new_entry;
if (tlb_outcome == TLB_HIT) {
- DPRINTF(GPUTLB, "Translation Done - TLB Hit for addr %#x\n", vaddr);
+ DPRINTF(GPUTLB, "Translation Done - TLB Hit for addr %#x\n",
+ vaddr);
local_entry = sender_state->tlbEntry;
} else {
DPRINTF(GPUTLB, "Translation Done - TLB Miss for addr %#x\n",
vaddr);
- // We are returning either from a page walk or from a hit at a lower
- // TLB level. The senderState should be "carrying" a pointer to the
- // correct TLBEntry.
+ /**
+ * We are returning either from a page walk or from a hit at a
+ * lower TLB level. The senderState should be "carrying" a pointer
+ * to the correct TLBEntry.
+ */
new_entry = sender_state->tlbEntry;
assert(new_entry);
local_entry = new_entry;
TLBEvent *tlb_event = translationReturnEvent[virtPageAddr];
assert(tlb_event);
tlb_event->updateOutcome(PAGE_WALK);
- schedule(tlb_event, curTick() + ticks(missLatency2));
+ schedule(tlb_event,
+ curTick() + cyclesToTicks(Cycles(missLatency2)));
}
} else if (outcome == PAGE_WALK) {
if (update_stats)
return virtPageAddr;
}
- /*
+ /**
* recvTiming receives a coalesced timing request from a TLBCoalescer
* and it calls issueTLBLookup()
* It only rejects the packet if we have exceeded the max
DPRINTF(GPUTLB, "Functional Translation Done - TLB miss for addr "
"%#x\n", vaddr);
- // We are returning either from a page walk or from a hit at a lower
- // TLB level. The senderState should be "carrying" a pointer to the
- // correct TLBEntry.
+ /**
+ * We are returning either from a page walk or from a hit at a
+ * lower TLB level. The senderState should be "carrying" a pointer
+ * to the correct TLBEntry.
+ */
new_entry = sender_state->tlbEntry;
assert(new_entry);
local_entry = new_entry;
} else {
// If this was a prefetch, then do the normal thing if it
// was a successful translation. Otherwise, send an empty
- // TLB entry back so that it can be figured out as empty and
- // handled accordingly.
+ // TLB entry back so that it can be figured out as empty
+ // and handled accordingly.
if (pte) {
DPRINTF(GPUTLB, "Mapping %#x to %#x\n", alignedVaddr,
pte->paddr);
assert(virt_page_addr == tlb_event->getTLBEventVaddr());
tlb_event->updateOutcome(MISS_RETURN);
- tlb->schedule(tlb_event, curTick()+tlb->ticks(1));
+ tlb->schedule(tlb_event, curTick()+tlb->clockPeriod());
return true;
}
tmp_access_info.sumDistance = 0;
tmp_access_info.meanDistance = 0;
- ret = TLBFootprint.insert(AccessPatternTable::value_type(virt_page_addr,
- tmp_access_info));
+ ret = TLBFootprint.insert(
+ AccessPatternTable::value_type(virt_page_addr, tmp_access_info));
bool first_page_access = ret.second;
page_stat_file = simout.create(name().c_str())->stream();
// print header
- *page_stat_file << "page,max_access_distance,mean_access_distance, "
- << "stddev_distance" << std::endl;
+ *page_stat_file
+ << "page,max_access_distance,mean_access_distance, "
+ << "stddev_distance" << std::endl;
}
// update avg. reuse distance footprint
- AccessPatternTable::iterator iter, iter_begin, iter_end;
unsigned int sum_avg_reuse_distance_per_page = 0;
// iterate through all pages seen by this TLB
- for (iter = TLBFootprint.begin(); iter != TLBFootprint.end(); iter++) {
- sum_avg_reuse_distance_per_page += iter->second.totalReuseDistance /
- iter->second.accessesPerPage;
+ for (auto &iter : TLBFootprint) {
+ sum_avg_reuse_distance_per_page += iter.second.totalReuseDistance /
+ iter.second.accessesPerPage;
if (accessDistance) {
- unsigned int tmp = iter->second.localTLBAccesses[0];
+ unsigned int tmp = iter.second.localTLBAccesses[0];
unsigned int prev = tmp;
- for (int i = 0; i < iter->second.localTLBAccesses.size(); ++i) {
+ for (int i = 0; i < iter.second.localTLBAccesses.size(); ++i) {
if (i) {
tmp = prev + 1;
}
- prev = iter->second.localTLBAccesses[i];
+ prev = iter.second.localTLBAccesses[i];
// update the localTLBAccesses value
// with the actual differece
- iter->second.localTLBAccesses[i] -= tmp;
+ iter.second.localTLBAccesses[i] -= tmp;
// compute the sum of AccessDistance per page
// used later for mean
- iter->second.sumDistance +=
- iter->second.localTLBAccesses[i];
+ iter.second.sumDistance +=
+ iter.second.localTLBAccesses[i];
}
- iter->second.meanDistance =
- iter->second.sumDistance / iter->second.accessesPerPage;
+ iter.second.meanDistance =
+ iter.second.sumDistance / iter.second.accessesPerPage;
// compute std_dev and max (we need a second round because we
// need to know the mean value
unsigned int max_distance = 0;
unsigned int stddev_distance = 0;
- for (int i = 0; i < iter->second.localTLBAccesses.size(); ++i) {
+ for (int i = 0; i < iter.second.localTLBAccesses.size(); ++i) {
unsigned int tmp_access_distance =
- iter->second.localTLBAccesses[i];
+ iter.second.localTLBAccesses[i];
if (tmp_access_distance > max_distance) {
max_distance = tmp_access_distance;
}
unsigned int diff =
- tmp_access_distance - iter->second.meanDistance;
+ tmp_access_distance - iter.second.meanDistance;
stddev_distance += pow(diff, 2);
}
stddev_distance =
- sqrt(stddev_distance/iter->second.accessesPerPage);
+ sqrt(stddev_distance/iter.second.accessesPerPage);
if (page_stat_file) {
- *page_stat_file << std::hex << iter->first << ",";
+ *page_stat_file << std::hex << iter.first << ",";
*page_stat_file << std::dec << max_distance << ",";
- *page_stat_file << std::dec << iter->second.meanDistance
+ *page_stat_file << std::dec << iter.second.meanDistance
<< ",";
*page_stat_file << std::dec << stddev_distance;
*page_stat_file << std::endl;
}
// erase the localTLBAccesses array
- iter->second.localTLBAccesses.clear();
+ iter.second.localTLBAccesses.clear();
}
}
uint32_t configAddress;
- // TLB clock: will inherit clock from shader's clock period in terms
- // of nuber of ticks of curTime (aka global simulation clock)
- // The assignment of TLB clock from shader clock is done in the python
- // config files.
- int clock;
-
public:
- // clock related functions ; maps to-and-from Simulation ticks and
- // object clocks.
- Tick frequency() const { return SimClock::Frequency / clock; }
-
- Tick
- ticks(int numCycles) const
- {
- return (Tick)clock * numCycles;
- }
-
- Tick curCycle() const { return curTick() / clock; }
- Tick tickToCycles(Tick val) const { return val / clock;}
-
typedef X86GPUTLBParams Params;
GpuTLB(const Params *p);
~GpuTLB();
--- /dev/null
+/*
+ * Copyright (c) 2017-2018 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Anthony Gutierrez
+ */
+
+/**
+ * @file
+ * HSAQueueEntry is the simulator's internal representation of an
+ * AQL queue entry (task). It encapsulates all of the relevant info
+ * about a task, which is gathered from various runtime data
+ * structures including: the AQL MQD, the AQL packet, and the code
+ * object.
+ */
+
+#ifndef __GPU_COMPUTE_HSA_QUEUE_ENTRY__
+#define __GPU_COMPUTE_HSA_QUEUE_ENTRY__
+
+#include <bitset>
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include <vector>
+
+#include "base/intmath.hh"
+#include "base/types.hh"
+#include "dev/hsa/hsa_packet.hh"
+#include "dev/hsa/hsa_queue.hh"
+#include "gpu-compute/kernel_code.hh"
+
+class HSAQueueEntry
+{
+ public:
+ HSAQueueEntry(std::string kernel_name, uint32_t queue_id,
+ int dispatch_id, void *disp_pkt, AMDKernelCode *akc,
+ Addr host_pkt_addr, Addr code_addr)
+ : kernName(kernel_name),
+ _wgSize{{(int)((_hsa_dispatch_packet_t*)disp_pkt)->workgroup_size_x,
+ (int)((_hsa_dispatch_packet_t*)disp_pkt)->workgroup_size_y,
+ (int)((_hsa_dispatch_packet_t*)disp_pkt)->workgroup_size_z}},
+ _gridSize{{(int)((_hsa_dispatch_packet_t*)disp_pkt)->grid_size_x,
+ (int)((_hsa_dispatch_packet_t*)disp_pkt)->grid_size_y,
+ (int)((_hsa_dispatch_packet_t*)disp_pkt)->grid_size_z}},
+ numVgprs(akc->workitem_vgpr_count),
+ numSgprs(akc->wavefront_sgpr_count),
+ _queueId(queue_id), _dispatchId(dispatch_id), dispPkt(disp_pkt),
+ _hostDispPktAddr(host_pkt_addr),
+ _completionSignal(((_hsa_dispatch_packet_t*)disp_pkt)
+ ->completion_signal),
+ codeAddress(code_addr),
+ kernargAddress(((_hsa_dispatch_packet_t*)disp_pkt)->kernarg_address),
+ _outstandingInvs(-1), _outstandingWbs(0),
+ _ldsSize((int)((_hsa_dispatch_packet_t*)disp_pkt)->
+ group_segment_size),
+ _privMemPerItem((int)((_hsa_dispatch_packet_t*)disp_pkt)->
+ private_segment_size),
+ _contextId(0), _wgId{{ 0, 0, 0 }},
+ _numWgTotal(1), numWgArrivedAtBarrier(0), _numWgCompleted(0),
+ _globalWgId(0), dispatchComplete(false)
+
+ {
+ initialVgprState.reset();
+ initialSgprState.reset();
+
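+        // each dimension needs ceil(gridSize / wgSize) workgroups; e.g.,
+        // a 70x1x1 grid with 64x1x1 workgroups (illustrative sizes)
+        // yields 2 x 1 x 1 = 2 workgroups in total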
+ for (int i = 0; i < MAX_DIM; ++i) {
+ _numWg[i] = divCeil(_gridSize[i], _wgSize[i]);
+ _numWgTotal *= _numWg[i];
+ }
+
+ parseKernelCode(akc);
+ }
+
+ const std::string&
+ kernelName() const
+ {
+ return kernName;
+ }
+
+ int
+ wgSize(int dim) const
+ {
+ assert(dim < MAX_DIM);
+ return _wgSize[dim];
+ }
+
+ int
+ gridSize(int dim) const
+ {
+ assert(dim < MAX_DIM);
+ return _gridSize[dim];
+ }
+
+ int
+ numVectorRegs() const
+ {
+ return numVgprs;
+ }
+
+ int
+ numScalarRegs() const
+ {
+ return numSgprs;
+ }
+
+ uint32_t
+ queueId() const
+ {
+ return _queueId;
+ }
+
+ int
+ dispatchId() const
+ {
+ return _dispatchId;
+ }
+
+ void*
+ dispPktPtr()
+ {
+ return dispPkt;
+ }
+
+ Addr
+ hostDispPktAddr() const
+ {
+ return _hostDispPktAddr;
+ }
+
+ Addr
+ completionSignal() const
+ {
+ return _completionSignal;
+ }
+
+ Addr
+ codeAddr() const
+ {
+ return codeAddress;
+ }
+
+ Addr
+ kernargAddr() const
+ {
+ return kernargAddress;
+ }
+
+ int
+ ldsSize() const
+ {
+ return _ldsSize;
+ }
+
+ int privMemPerItem() const { return _privMemPerItem; }
+
+ int
+ contextId() const
+ {
+ return _contextId;
+ }
+
+ bool
+ dispComplete() const
+ {
+ return dispatchComplete;
+ }
+
+ int
+ wgId(int dim) const
+ {
+ assert(dim < MAX_DIM);
+ return _wgId[dim];
+ }
+
+ void
+ wgId(int dim, int val)
+ {
+ assert(dim < MAX_DIM);
+ _wgId[dim] = val;
+ }
+
+ int
+ globalWgId() const
+ {
+ return _globalWgId;
+ }
+
+ void
+ globalWgId(int val)
+ {
+ _globalWgId = val;
+ }
+
+ int
+ numWg(int dim) const
+ {
+ assert(dim < MAX_DIM);
+ return _numWg[dim];
+ }
+
+ void
+ notifyWgCompleted()
+ {
+ ++_numWgCompleted;
+ }
+
+ int
+ numWgCompleted() const
+ {
+ return _numWgCompleted;
+ }
+
+ int
+ numWgTotal() const
+ {
+ return _numWgTotal;
+ }
+
+ void
+ markWgDispatch()
+ {
+ ++_wgId[0];
+ ++_globalWgId;
+
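+        // advance through the grid in row-major order, carrying into Y
+        // and then Z when a dimension wraps; e.g., for a 2x2x1 grid of
+        // workgroups the dispatch order is (0,0,0), (1,0,0), (0,1,0),
+        // (1,1,0), after which dispatchComplete is set (illustrative)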
+ if (wgId(0) * wgSize(0) >= gridSize(0)) {
+ _wgId[0] = 0;
+ ++_wgId[1];
+
+ if (wgId(1) * wgSize(1) >= gridSize(1)) {
+ _wgId[1] = 0;
+ ++_wgId[2];
+
+ if (wgId(2) * wgSize(2) >= gridSize(2)) {
+ dispatchComplete = true;
+ }
+ }
+ }
+ }
+
+ int
+ numWgAtBarrier() const
+ {
+ return numWgArrivedAtBarrier;
+ }
+
+ bool vgprBitEnabled(int bit) const
+ {
+ return initialVgprState.test(bit);
+ }
+
+ bool sgprBitEnabled(int bit) const
+ {
+ return initialSgprState.test(bit);
+ }
+
+ /**
+ * Host-side addr of the amd_queue_t on which
+ * this task was queued.
+ */
+ Addr hostAMDQueueAddr;
+
+ /**
+ * Keep a copy of the AMD HSA queue because we
+ * need info from some of its fields to initialize
+ * register state.
+ */
+ _amd_queue_t amdQueue;
+
+ // the maximum number of dimensions for a grid or workgroup
+ const static int MAX_DIM = 3;
+
+ /* getter */
+ int
+ outstandingInvs() {
+ return _outstandingInvs;
+ }
+
+ /**
+     * Whether the invalidate has started or finished. -1 is the
+     * initial value, indicating that the invalidate has not started
+     * for the kernel.
+ */
+ bool
+ isInvStarted()
+ {
+ return (_outstandingInvs != -1);
+ }
+
+ /**
+ * update the number of pending invalidate requests
+ *
+ * val: negative to decrement, positive to increment
+ */
+ void
+ updateOutstandingInvs(int val)
+ {
+ _outstandingInvs += val;
+ assert(_outstandingInvs >= 0);
+ }
+
+ /**
+ * Forcefully change the state to be inv done.
+ */
+ void
+ markInvDone()
+ {
+ _outstandingInvs = 0;
+ }
+
+ /**
+ * Is invalidate done?
+ */
+ bool
+ isInvDone() const
+ {
+ assert(_outstandingInvs >= 0);
+ return (_outstandingInvs == 0);
+ }
+
+ int
+ outstandingWbs() const
+ {
+ return _outstandingWbs;
+ }
+
+ /**
+ * Update the number of pending writeback requests.
+ *
+ * val: negative to decrement, positive to increment
+ */
+ void
+ updateOutstandingWbs(int val)
+ {
+ _outstandingWbs += val;
+ assert(_outstandingWbs >= 0);
+ }
+
+ private:
+ void
+ parseKernelCode(AMDKernelCode *akc)
+ {
+ /** set the enable bits for the initial SGPR state */
+ initialSgprState.set(PrivateSegBuf,
+ akc->enable_sgpr_private_segment_buffer);
+ initialSgprState.set(DispatchPtr,
+ akc->enable_sgpr_dispatch_ptr);
+ initialSgprState.set(QueuePtr,
+ akc->enable_sgpr_queue_ptr);
+ initialSgprState.set(KernargSegPtr,
+ akc->enable_sgpr_kernarg_segment_ptr);
+ initialSgprState.set(DispatchId,
+ akc->enable_sgpr_dispatch_id);
+ initialSgprState.set(FlatScratchInit,
+ akc->enable_sgpr_flat_scratch_init);
+ initialSgprState.set(PrivateSegSize,
+ akc->enable_sgpr_private_segment_size);
+ initialSgprState.set(GridWorkgroupCountX,
+ akc->enable_sgpr_grid_workgroup_count_x);
+ initialSgprState.set(GridWorkgroupCountY,
+ akc->enable_sgpr_grid_workgroup_count_y);
+ initialSgprState.set(GridWorkgroupCountZ,
+ akc->enable_sgpr_grid_workgroup_count_z);
+ initialSgprState.set(WorkgroupIdX,
+ akc->enable_sgpr_workgroup_id_x);
+ initialSgprState.set(WorkgroupIdY,
+ akc->enable_sgpr_workgroup_id_y);
+ initialSgprState.set(WorkgroupIdZ,
+ akc->enable_sgpr_workgroup_id_z);
+ initialSgprState.set(WorkgroupInfo,
+ akc->enable_sgpr_workgroup_info);
+ initialSgprState.set(PrivSegWaveByteOffset,
+ akc->enable_sgpr_private_segment_wave_byte_offset);
+
+ /**
+ * set the enable bits for the initial VGPR state. the
+ * workitem Id in the X dimension is always initialized.
+ */
+ initialVgprState.set(WorkitemIdX, true);
+ initialVgprState.set(WorkitemIdY, akc->enable_vgpr_workitem_id_y);
+ initialVgprState.set(WorkitemIdZ, akc->enable_vgpr_workitem_id_z);
+ }
+
+ // name of the kernel associated with the AQL entry
+ std::string kernName;
+ // workgroup Size (3 dimensions)
+ std::array<int, MAX_DIM> _wgSize;
+ // grid Size (3 dimensions)
+ std::array<int, MAX_DIM> _gridSize;
+ // total number of VGPRs per work-item
+ int numVgprs;
+ // total number of SGPRs per wavefront
+ int numSgprs;
+ // id of AQL queue in which this entry is placed
+ uint32_t _queueId;
+ int _dispatchId;
+ // raw AQL packet pointer
+ void *dispPkt;
+ // host-side addr of the dispatch packet
+ Addr _hostDispPktAddr;
+    // address of the kernel's completion signal
+ Addr _completionSignal;
+ // base address of the raw machine code
+ Addr codeAddress;
+ // base address of the kernel args
+ Addr kernargAddress;
+ /**
+ * Number of outstanding invs for the kernel.
+ * values:
+ * -1: initial value, invalidate has not started for the kernel
+ * 0: 1)-1->0, about to start (a transient state, added in the same cycle)
+ * 2)+1->0, all inv requests are finished, i.e., invalidate done
+ * ?: positive value, indicating the number of pending inv requests
+ */
+ int _outstandingInvs;
+ /**
+ * Number of outstanding wbs for the kernel
+ * values:
+ * 0: 1)initial value, flush has not started for the kernel
+ * 2)+1->0: all wb requests are finished, i.e., flush done
+ * ?: positive value, indicating the number of pending wb requests
+ */
+ int _outstandingWbs;
+ int _ldsSize;
+ int _privMemPerItem;
+ int _contextId;
+ std::array<int, MAX_DIM> _wgId;
+ std::array<int, MAX_DIM> _numWg;
+ int _numWgTotal;
+ int numWgArrivedAtBarrier;
+ // The number of completed work groups
+ int _numWgCompleted;
+ int _globalWgId;
+ bool dispatchComplete;
+
+ std::bitset<NumVectorInitFields> initialVgprState;
+ std::bitset<NumScalarInitFields> initialSgprState;
+};
+
+#endif // __GPU_COMPUTE_HSA_QUEUE_ENTRY__
--- /dev/null
+/*
+ * Copyright (c) 2015-2017 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Anthony Gutierrez
+ */
+
+#ifndef __GPU_COMPUTE_KERNEL_CODE_HH__
+#define __GPU_COMPUTE_KERNEL_CODE_HH__
+
+#include <bitset>
+#include <cstdint>
+
+/**
+ * these enums represent the indices into the
+ * initialRegState bitfields in HsaKernelInfo.
+ * each bit specifies whether or not the
+ * particular piece of state that the bit
+ * corresponds to should be initialized into
+ * the VGPRs/SGPRs. the order in which the
+ * fields are placed matters, as all enabled
+ * pieces of state will be initialized into
+ * contiguous registers in the same order
+ * as their position in the bitfield - which
+ * is specified in the HSA ABI.
+ */
+enum ScalarRegInitFields : int
+{
+ PrivateSegBuf = 0,
+ DispatchPtr = 1,
+ QueuePtr = 2,
+ KernargSegPtr = 3,
+ DispatchId = 4,
+ FlatScratchInit = 5,
+ PrivateSegSize = 6,
+ GridWorkgroupCountX = 7,
+ GridWorkgroupCountY = 8,
+ GridWorkgroupCountZ = 9,
+ WorkgroupIdX = 10,
+ WorkgroupIdY = 11,
+ WorkgroupIdZ = 12,
+ WorkgroupInfo = 13,
+ PrivSegWaveByteOffset = 14,
+ NumScalarInitFields = 15
+};
+
+enum VectorRegInitFields : int
+{
+ WorkitemIdX = 0,
+ WorkitemIdY = 1,
+ WorkitemIdZ = 2,
+ NumVectorInitFields = 3
+};
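+/**
+ * Illustrative example: WorkitemIdX is always enabled, so if
+ * enable_vgpr_workitem_id_y is also set (and _z is not), two
+ * contiguous VGPRs are initialized at launch, the X work-item id
+ * first and the Y id second, following the ordering described above.
+ */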
+
+struct AMDKernelCode
+{
+ uint32_t amd_kernel_code_version_major;
+ uint32_t amd_kernel_code_version_minor;
+ uint16_t amd_machine_kind;
+ uint16_t amd_machine_version_major;
+ uint16_t amd_machine_version_minor;
+ uint16_t amd_machine_version_stepping;
+ int64_t kernel_code_entry_byte_offset;
+ int64_t kernel_code_prefetch_byte_offset;
+ uint64_t kernel_code_prefetch_byte_size;
+ uint64_t max_scratch_backing_memory_byte_size;
+
+ /**
+ * The fields below are used to set program settings for
+ * compute shaders. Here they are primarily used to setup
+ * initial register state. See the following for full details
+ * about kernel launch, state initialization, and the AMD kernel
+ * code object: https://github.com/RadeonOpenCompute/ROCm_Documentation/
+ * blob/master/ROCm_Compiler_SDK/ROCm-Codeobj-format.rst
+ * #initial-kernel-register-state
+ */
+
+ // the 32b below here represent the fields of
+ // the COMPUTE_PGM_RSRC1 register
+ uint32_t granulated_workitem_vgpr_count : 6;
+ uint32_t granulated_wavefront_sgpr_count : 4;
+ uint32_t priority : 2;
+ uint32_t float_mode_round_32 : 2;
+ uint32_t float_mode_round_16_64 : 2;
+ uint32_t float_mode_denorm_32 : 2;
+ uint32_t float_mode_denorm_16_64 : 2;
+ uint32_t priv : 1;
+ uint32_t enable_dx10_clamp : 1;
+ uint32_t debug_mode : 1;
+ uint32_t enable_ieee_mode : 1;
+ uint32_t bulky : 1;
+ uint32_t cdbg_user : 1;
+ uint32_t compute_pgm_rsrc1_reserved : 6;
+ // end COMPUTE_PGM_RSRC1 register
+
+ // the 32b below here represent the fields of
+ // the COMPUTE_PGM_RSRC2 register
+ uint32_t enable_sgpr_private_segment_wave_byte_offset : 1;
+ uint32_t user_sgpr_count : 5;
+ uint32_t enable_trap_handler : 1;
+ uint32_t enable_sgpr_workgroup_id_x : 1;
+ uint32_t enable_sgpr_workgroup_id_y : 1;
+ uint32_t enable_sgpr_workgroup_id_z : 1;
+ uint32_t enable_sgpr_workgroup_info : 1;
+ uint32_t enable_vgpr_workitem_id_y : 1;
+ uint32_t enable_vgpr_workitem_id_z : 1;
+ uint32_t enable_exception_address_watch : 1;
+ uint32_t enable_exception_memory_violation : 1;
+ uint32_t granulated_lds_size : 9;
+ uint32_t enable_exception_ieee_754_fp_invalid_operation : 1;
+ uint32_t enable_exception_fp_denormal_source : 1;
+ uint32_t enable_exception_ieee_754_fp_division_by_zero : 1;
+ uint32_t enable_exception_ieee_754_fp_overflow : 1;
+ uint32_t enable_exception_ieee_754_fp_underflow : 1;
+ uint32_t enable_exception_ieee_754_fp_inexact : 1;
+ uint32_t enable_exception_int_divide_by_zero : 1;
+ uint32_t compute_pgm_rsrc2_reserved : 1;
+ // end COMPUTE_PGM_RSRC2
+
+ // the 32b below here represent the fields of
+ // KERNEL_CODE_PROPERTIES
+ uint32_t enable_sgpr_private_segment_buffer : 1;
+ uint32_t enable_sgpr_dispatch_ptr : 1;
+ uint32_t enable_sgpr_queue_ptr : 1;
+ uint32_t enable_sgpr_kernarg_segment_ptr : 1;
+ uint32_t enable_sgpr_dispatch_id : 1;
+ uint32_t enable_sgpr_flat_scratch_init : 1;
+ uint32_t enable_sgpr_private_segment_size : 1;
+ uint32_t enable_sgpr_grid_workgroup_count_x : 1;
+ uint32_t enable_sgpr_grid_workgroup_count_y : 1;
+ uint32_t enable_sgpr_grid_workgroup_count_z : 1;
+ uint32_t kernel_code_properties_reserved1 : 6;
+ uint32_t enable_ordered_append_gds : 1;
+ uint32_t private_element_size : 2;
+ uint32_t is_ptr64 : 1;
+ uint32_t is_dynamic_callstack : 1;
+ uint32_t is_debug_enabled : 1;
+ uint32_t is_xnack_enabled : 1;
+ uint32_t kernel_code_properties_reserved2 : 9;
+ // end KERNEL_CODE_PROPERTIES
+
+ uint32_t workitem_private_segment_byte_size;
+ uint32_t workgroup_group_segment_byte_size;
+ uint32_t gds_segment_byte_size;
+ uint64_t kernarg_segment_byte_size;
+ uint32_t workgroup_fbarrier_count;
+ uint16_t wavefront_sgpr_count;
+ uint16_t workitem_vgpr_count;
+ uint16_t reserved_vgpr_first;
+ uint16_t reserved_vgpr_count;
+ uint16_t reserved_sgpr_first;
+ uint16_t reserved_sgpr_count;
+ uint16_t debug_wavefront_private_segment_offset_sgpr;
+ uint16_t debug_private_segment_buffer_sgpr;
+ uint8_t kernarg_segment_alignment;
+ uint8_t group_segment_alignment;
+ uint8_t private_segment_alignment;
+ uint8_t wavefront_size;
+ int32_t call_convention;
+ uint8_t reserved[12];
+ uint64_t runtime_loader_kernel_symbol;
+ uint64_t control_directives[16];
+};
+
+#endif // __GPU_COMPUTE_KERNEL_CODE_HH__
parent->loadBusLength();
// delay for accessing the LDS
Tick processingTime =
- parent->shader->ticks(bankConflicts * bankConflictPenalty) +
- parent->shader->ticks(busLength);
+ parent->cyclesToTicks(Cycles(bankConflicts * bankConflictPenalty)) +
+ parent->cyclesToTicks(Cycles(busLength));
// choose (delay + last packet in queue) or (now + delay) as the time to
// return this
Tick doneAt = earliestReturnTime() + processingTime;
#include <utility>
#include <vector>
-#include "enums/MemType.hh"
#include "gpu-compute/misc.hh"
#include "mem/port.hh"
#include "params/LdsState.hh"
class ComputeUnit;
/**
- * this represents a slice of the overall LDS, intended to be associated with an
- * individual workgroup
+ * this represents a slice of the overall LDS, intended to be associated with
+ * an individual workgroup
*/
class LdsChunk
{
read(const uint32_t index)
{
fatal_if(!chunk.size(), "cannot read from an LDS chunk of size 0");
- fatal_if(index >= chunk.size(), "out-of-bounds access to an LDS chunk");
+ fatal_if(index >= chunk.size(), "out-of-bounds access to an LDS "
+ "chunk");
T *p0 = (T *) (&(chunk.at(index)));
return *p0;
}
write(const uint32_t index, const T value)
{
fatal_if(!chunk.size(), "cannot write to an LDS chunk of size 0");
- fatal_if(index >= chunk.size(), "out-of-bounds access to an LDS chunk");
+ fatal_if(index >= chunk.size(), "out-of-bounds access to an LDS "
+ "chunk");
T *p0 = (T *) (&(chunk.at(index)));
*p0 = value;
}
protected:
- // the lds reference counter
- // The key is the workgroup ID and dispatch ID
- // The value is the number of wavefronts that reference this LDS, as
- // wavefronts are launched, the counter goes up for that workgroup and when
- // they return it decreases, once it reaches 0 then this chunk of the LDS is
- // returned to the available pool. However,it is deallocated on the 1->0
- // transition, not whenever the counter is 0 as it always starts with 0 when
- // the workgroup asks for space
+ /**
+ * the lds reference counter
+ * The key is the workgroup ID and dispatch ID
+     * The value is the number of wavefronts that reference this LDS. As
+     * wavefronts are launched, the counter goes up for that workgroup,
+     * and when they return it decreases. Once it reaches 0, this chunk
+     * of the LDS is returned to the available pool. However, it is
+     * deallocated on the 1->0 transition, not whenever the counter is 0,
+     * as it always starts at 0 when the workgroup asks for space.
+ */
std::unordered_map<uint32_t,
std::unordered_map<uint32_t, int32_t>> refCounter;
const uint32_t size)
{
if (chunkMap.find(dispatchId) != chunkMap.end()) {
- fatal_if(
+ panic_if(
chunkMap[dispatchId].find(wgId) != chunkMap[dispatchId].end(),
"duplicate workgroup ID asking for space in the LDS "
"did[%d] wgid[%d]", dispatchId, wgId);
}
- fatal_if(bytesAllocated + size > maximumSize,
- "request would ask for more space than is available");
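+        // if the request does not fit, return nullptr so the caller
+        // can handle the failed reservation rather than aborting the
+        // simulation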
+ if (bytesAllocated + size > maximumSize) {
+ return nullptr;
+ } else {
+ bytesAllocated += size;
+
+ auto value = chunkMap[dispatchId].emplace(wgId, LdsChunk(size));
+ panic_if(!value.second, "was unable to allocate a new chunkMap");
+
+ // make an entry for this workgroup
+ refCounter[dispatchId][wgId] = 0;
- bytesAllocated += size;
+ return &chunkMap[dispatchId][wgId];
+ }
+ }
+
+ /*
+ * return pointer to lds chunk for wgid
+ */
+ LdsChunk *
+ getLdsChunk(const uint32_t dispatchId, const uint32_t wgId)
+ {
+ fatal_if(chunkMap.find(dispatchId) == chunkMap.end(),
+ "fetch for unknown dispatch ID did[%d]", dispatchId);
- chunkMap[dispatchId].emplace(wgId, LdsChunk(size));
- // make an entry for this workgroup
- refCounter[dispatchId][wgId] = 0;
+ fatal_if(chunkMap[dispatchId].find(wgId) == chunkMap[dispatchId].end(),
+ "fetch for unknown workgroup ID wgid[%d] in dispatch ID did[%d]",
+ wgId, dispatchId);
- return &chunkMap[dispatchId][wgId];
+ return &chunkMap[dispatchId][wgId];
}
bool
#include "gpu-compute/local_memory_pipeline.hh"
+#include "debug/GPUMem.hh"
#include "debug/GPUPort.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
bool accessVrf = true;
Wavefront *w = nullptr;
- if ((m) && (m->isLoad() || m->isAtomicRet())) {
+ if ((m) && m->latency.rdy() && (m->isLoad() || m->isAtomicRet())) {
w = m->wavefront();
- accessVrf =
- w->computeUnit->vrf[w->simdId]->
- vrfOperandAccessReady(m->seqNum(), w, m,
- VrfAccessType::WRITE);
+ accessVrf = w->computeUnit->vrf[w->simdId]->
+ canScheduleWriteOperandsFromLoad(w, m);
+
}
if (!lmReturnedRequests.empty() && m->latency.rdy() && accessVrf &&
- computeUnit->locMemToVrfBus.rdy() && (computeUnit->shader->coissue_return
- || computeUnit->wfWait.at(m->pipeId).rdy())) {
+ computeUnit->locMemToVrfBus.rdy()
+ && (computeUnit->shader->coissue_return
+ || computeUnit->vectorSharedMemUnit.rdy())) {
lmReturnedRequests.pop();
w = m->wavefront();
+ DPRINTF(GPUMem, "CU%d: WF[%d][%d]: Completing local mem instr %s\n",
+ m->cu_id, m->simdId, m->wfSlotId, m->disassemble());
m->completeAcc(m);
+ if (m->isLoad() || m->isAtomicRet()) {
+ w->computeUnit->vrf[w->simdId]->
+ scheduleWriteOperandsFromLoad(w, m);
+ }
+
// Decrement outstanding request count
computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
// Mark write bus busy for appropriate amount of time
computeUnit->locMemToVrfBus.set(m->time);
if (computeUnit->shader->coissue_return == 0)
- w->computeUnit->wfWait.at(m->pipeId).set(m->time);
+ w->computeUnit->vectorSharedMemUnit.set(m->time);
}
// If pipeline has executed a local memory instruction
}
}
+void
+LocalMemPipeline::issueRequest(GPUDynInstPtr gpuDynInst)
+{
+ gpuDynInst->setAccessTime(curTick());
+ lmIssuedRequests.push(gpuDynInst);
+}
+
void
LocalMemPipeline::regStats()
{
LocalMemPipeline(const ComputeUnitParams *params);
void init(ComputeUnit *cu);
void exec();
-
- std::queue<GPUDynInstPtr> &getLMReqFIFO() { return lmIssuedRequests; }
std::queue<GPUDynInstPtr> &getLMRespFIFO() { return lmReturnedRequests; }
+ void issueRequest(GPUDynInstPtr gpuDynInst);
+
+
bool
isLMRespFIFOWrRdy() const
{
#include <memory>
#include "base/logging.hh"
+#include "sim/clocked_object.hh"
class GPUDynInst;
-typedef std::bitset<std::numeric_limits<unsigned long long>::digits> VectorMask;
+typedef std::bitset<std::numeric_limits<unsigned long long>::digits>
+ VectorMask;
typedef std::shared_ptr<GPUDynInst> GPUDynInstPtr;
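+// Hop points at which a memory instruction can be timestamped on its
+// way to memory; these are assumed to be the hopId values passed to
+// GPUDynInst::profileRoundTripTime()/profileLineAddressTime().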
+enum InstMemoryHop : int {
+ Initiate = 0,
+ CoalsrSend = 1,
+ CoalsrRecv = 2,
+ GMEnqueue = 3,
+ Complete = 4,
+ InstMemoryHopMax = 5
+};
+
+enum BlockMemoryHop : int {
+ BlockSend = 0,
+ BlockRecv = 1
+};
+
class WaitClass
{
public:
- WaitClass() : nxtAvail(0), lookAheadAvail(0), tcnt(0) { }
- void init(uint64_t *_tcnt, uint32_t _numStages=0)
+ WaitClass() : nxtAvail(0), lookAheadAvail(0), clockedObject(nullptr) { }
+
+ WaitClass(ClockedObject *_clockedObject, uint64_t _numStages=0)
+ : nxtAvail(0), lookAheadAvail(0), clockedObject(_clockedObject),
+ numStages(_numStages) { }
+
+ void init(ClockedObject *_clockedObject, uint64_t _numStages=0)
{
- tcnt = _tcnt;
+ clockedObject = _clockedObject;
numStages = _numStages;
}
- void set(uint32_t i)
+ void set(uint64_t i)
{
- fatal_if(nxtAvail > *tcnt,
+ fatal_if(nxtAvail > clockedObject->clockEdge(),
"Can't allocate resource because it is busy!!!");
- nxtAvail = *tcnt + i;
+ nxtAvail = clockedObject->clockEdge() + i;
+ }
+ void preset(uint64_t delay)
+ {
+ lookAheadAvail = std::max(lookAheadAvail, delay +
+ (clockedObject->clockEdge()) - numStages);
+ }
+ bool rdy(Cycles cycles = Cycles(0)) const
+ {
+ return clockedObject->clockEdge(cycles) >= nxtAvail;
}
- void preset(uint32_t delay)
+ bool prerdy() const
{
- lookAheadAvail = std::max(lookAheadAvail, delay + (*tcnt) - numStages);
+ return clockedObject->clockEdge() >= lookAheadAvail;
}
- bool rdy() const { return *tcnt >= nxtAvail; }
- bool prerdy() const { return *tcnt >= lookAheadAvail; }
private:
// timestamp indicating when resource will be available
// pending uses of the resource (when there is a cycle gap between
// rdy() and set()
uint64_t lookAheadAvail;
- // current timestamp
- uint64_t *tcnt;
+ // clockedObject for current timestamp
+ ClockedObject *clockedObject;
// number of stages between checking if a resource is ready and
// setting the resource's utilization
- uint32_t numStages;
+ uint64_t numStages;
};
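+// Minimal usage sketch (illustrative; 'bus' and 'issue()' are
+// hypothetical names): a stage that owns a WaitClass checks rdy()
+// before issuing, then reserves the resource for some number of ticks:
+//     if (bus.rdy()) { issue(); bus.set(clockPeriod()); }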
class Float16
Float16(float x)
{
- uint32_t ai = *(uint32_t *)&x;
+ uint32_t ai = *(reinterpret_cast<uint32_t *>(&x));
uint32_t s = (ai >> 31) & 0x1;
uint32_t exp = (ai >> 23) & 0xff;
val1 |= (exp << 23);
val1 |= (mant << 13);
- return *(float*)&val1;
+ return *(reinterpret_cast<float *>(&val1));
}
};
#include "gpu-compute/pool_manager.hh"
-PoolManager::PoolManager(uint32_t minAlloc, uint32_t poolSize)
- : _minAllocation(minAlloc), _poolSize(poolSize)
+PoolManager::PoolManager(const PoolManagerParams *p)
+ : SimObject(p), _minAllocation(p->min_alloc), _poolSize(p->pool_size)
{
- assert(poolSize > 0);
+ assert(_poolSize > 0);
}
#include <cstdint>
#include <string>
+#include "params/PoolManager.hh"
+#include "sim/sim_object.hh"
+
// Pool Manager Logic
-class PoolManager
+class PoolManager : public SimObject
{
public:
- PoolManager(uint32_t minAlloc, uint32_t poolSize);
+ PoolManager(const PoolManagerParams *p);
+ virtual ~PoolManager() { _poolSize = 0; }
uint32_t minAllocation() { return _minAllocation; }
virtual std::string printRegion() = 0;
virtual uint32_t regionSize(std::pair<uint32_t,uint32_t> ®ion) = 0;
--- /dev/null
+/*
+ * Copyright (c) 2015-2017 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: John Kalamatianos,
+ * Mark Wyse
+ */
+
+#include "gpu-compute/register_file.hh"
+
+#include <sstream>
+#include <string>
+
+#include "base/intmath.hh"
+#include "base/logging.hh"
+#include "debug/GPURF.hh"
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/gpu_dyn_inst.hh"
+#include "gpu-compute/shader.hh"
+#include "gpu-compute/wavefront.hh"
+#include "params/RegisterFile.hh"
+
+RegisterFile::RegisterFile(const RegisterFileParams *p)
+ : SimObject(p), simdId(p->simd_id), _numRegs(p->num_regs)
+{
+ fatal_if((_numRegs % 2) != 0, "VRF size is illegal\n");
+ fatal_if(simdId < 0, "Illegal SIMD id for VRF");
+
+ busy.clear();
+ busy.resize(_numRegs, 0);
+}
+
+RegisterFile::~RegisterFile()
+{
+}
+
+void
+RegisterFile::setParent(ComputeUnit *_computeUnit)
+{
+ computeUnit = _computeUnit;
+}
+
+std::string
+RegisterFile::dump() const
+{
+ std::stringstream ss;
+ ss << "Busy: ";
+ for (int i = 0; i < busy.size(); i++) {
+ ss << (int)busy[i];
+ }
+ ss << "\n";
+ return ss.str();
+}
+
+// Scoreboard functions
+
+bool
+RegisterFile::operandsReady(Wavefront *w, GPUDynInstPtr ii) const
+{
+ return true;
+}
+
+bool
+RegisterFile::regBusy(int idx) const
+{
+ return busy.at(idx);
+}
+
+void
+RegisterFile::markReg(int regIdx, bool value)
+{
+ DPRINTF(GPURF, "SIMD[%d] markReg(): physReg[%d] = %d\n",
+ simdId, regIdx, (int)value);
+ busy.at(regIdx) = value;
+}
+
+void
+RegisterFile::enqRegFreeEvent(uint32_t regIdx, uint64_t delay)
+{
+ DPRINTF(GPURF, "SIMD[%d] enqRegFreeEvent physReg[%d] at %llu\n",
+ simdId, regIdx, curTick() + delay);
+ schedule(new MarkRegFreeScbEvent(this, regIdx),
+ curTick() + delay);
+}
+
+void
+RegisterFile::enqRegBusyEvent(uint32_t regIdx, uint64_t delay)
+{
+ DPRINTF(GPURF, "SIMD[%d] enqRegBusyEvent physReg[%d] at %llu\n",
+ simdId, regIdx, curTick() + delay);
+ schedule(new MarkRegBusyScbEvent(this, regIdx),
+ curTick() + delay);
+}
+
+// Schedule functions
+bool
+RegisterFile::canScheduleReadOperands(Wavefront *w, GPUDynInstPtr ii)
+{
+ return true;
+}
+
+void
+RegisterFile::scheduleReadOperands(Wavefront *w, GPUDynInstPtr ii)
+{
+}
+
+bool
+RegisterFile::canScheduleWriteOperands(Wavefront *w, GPUDynInstPtr ii)
+{
+ return true;
+}
+
+void
+RegisterFile::scheduleWriteOperands(Wavefront *w, GPUDynInstPtr ii)
+{
+}
+
+bool
+RegisterFile::canScheduleWriteOperandsFromLoad(Wavefront *w, GPUDynInstPtr ii)
+{
+ return true;
+}
+
+void
+RegisterFile::scheduleWriteOperandsFromLoad(Wavefront *w, GPUDynInstPtr ii)
+{
+}
+
+bool
+RegisterFile::operandReadComplete(Wavefront *w, GPUDynInstPtr ii)
+{
+ return true;
+}
+
+// Exec functions
+void
+RegisterFile::exec()
+{
+}
+
+void
+RegisterFile::waveExecuteInst(Wavefront *w, GPUDynInstPtr ii)
+{
+}
+
+RegisterFile*
+RegisterFileParams::create()
+{
+ return new RegisterFile(this);
+}
+
+// Events
+
+// Mark a register as free in the scoreboard/busy vector
+void
+RegisterFile::MarkRegFreeScbEvent::process()
+{
+ rf->markReg(regIdx, false);
+}
+
+// Mark a register as busy in the scoreboard/busy vector
+void
+RegisterFile::MarkRegBusyScbEvent::process()
+{
+ rf->markReg(regIdx, true);
+}
+
+void
+RegisterFile::dispatchInstruction(GPUDynInstPtr ii)
+{
+}
+
+void
+RegisterFile::regStats()
+{
+ registerReads
+ .name(name() + ".register_reads")
+ .desc("Total number of DWORDs read from register file")
+ ;
+
+ registerWrites
+ .name(name() + ".register_writes")
+        .desc("Total number of DWORDs written to register file")
+ ;
+
+ sramReads
+ .name(name() + ".sram_reads")
+ .desc("Total number of register file bank SRAM activations for reads")
+ ;
+
+ sramWrites
+ .name(name() + ".sram_writes")
+ .desc("Total number of register file bank SRAM activations for writes")
+ ;
+}
--- /dev/null
+/*
+ * Copyright (c) 2015-2017 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: John Kalamatianos,
+ * Mark Wyse
+ */
+
+#ifndef __REGISTER_FILE_HH__
+#define __REGISTER_FILE_HH__
+
+#include <limits>
+#include <vector>
+
+#include "base/statistics.hh"
+#include "base/types.hh"
+#include "gpu-compute/misc.hh"
+#include "sim/sim_object.hh"
+
+class ComputeUnit;
+class Shader;
+class PoolManager;
+class Wavefront;
+
+struct RegisterFileParams;
+
+// Abstract Register File
+// This register file class can be inherited from to create both
+// scalar and vector register files.
+class RegisterFile : public SimObject
+{
+ public:
+ RegisterFile(const RegisterFileParams *p);
+ virtual ~RegisterFile();
+ virtual void setParent(ComputeUnit *_computeUnit);
+ int numRegs() const { return _numRegs; }
+ virtual void regStats() override;
+
+ // State functions
+
+ // Scoreboard functions
+ virtual bool operandsReady(Wavefront *w, GPUDynInstPtr ii) const;
+ virtual bool regBusy(int idx) const;
+ virtual void markReg(int regIdx, bool value);
+
+ // Abstract Register Event
+ class RegisterEvent : public Event
+ {
+ protected:
+ RegisterFile *rf;
+ int regIdx;
+
+ public:
+ RegisterEvent(RegisterFile *_rf, int _regIdx)
+ : rf(_rf), regIdx(_regIdx) { setFlags(AutoDelete); }
+ };
+
+ // Register Event to mark a register as free in the scoreboard/busy vector
+ class MarkRegFreeScbEvent : public RegisterEvent
+ {
+ public:
+ MarkRegFreeScbEvent(RegisterFile *_rf, int _regIdx)
+ : RegisterEvent(_rf, _regIdx) { }
+ void process();
+ };
+
+ // Register Event to mark a register as busy in the scoreboard/busy vector
+ class MarkRegBusyScbEvent : public RegisterEvent
+ {
+ public:
+ MarkRegBusyScbEvent(RegisterFile *_rf, int _regIdx)
+ : RegisterEvent(_rf, _regIdx) { }
+ void process();
+ };
+
+ // Schedule an event to mark a register as free/busy in
+ // the scoreboard/busy vector. Delay is already in Ticks
+ virtual void enqRegFreeEvent(uint32_t regIdx, uint64_t delay);
+ virtual void enqRegBusyEvent(uint32_t regIdx, uint64_t delay);
+
+ // Schedule functions
+
+ // The following functions are called by the SCH stage when attempting
+ // to move a wave from the readyList to the schList.
+ // canSchedule* checks if the RF is ready to provide operands for
+ // the instruction, while schedule* requests the RF to begin reading
+ // and writing of operands. Calling schedule* may only occur
+ // immediately after canSchedule* was called and returned True
+ virtual bool canScheduleReadOperands(Wavefront *w, GPUDynInstPtr ii);
+ virtual bool canScheduleWriteOperands(Wavefront *w, GPUDynInstPtr ii);
+ virtual void scheduleReadOperands(Wavefront *w, GPUDynInstPtr ii);
+ virtual void scheduleWriteOperands(Wavefront *w, GPUDynInstPtr ii);
+
+ // The following function is called to check if all operands
+ // have been read for the given instruction
+ virtual bool operandReadComplete(Wavefront *w, GPUDynInstPtr ii);
+
+ // The following two functions are only called by returning loads to
+ // check if the register file can support the incoming writes
+ virtual bool canScheduleWriteOperandsFromLoad(Wavefront *w,
+ GPUDynInstPtr ii);
+ // Queue the register writes. Assumes canScheduleWriteOperandsFromLoad
+ // was called immediately prior and returned True
+ virtual void scheduleWriteOperandsFromLoad(Wavefront *w,
+ GPUDynInstPtr ii);
+
+    // exec() is invoked every cycle by the compute unit and may be
+ // used to model detailed timing of the register file.
+ virtual void exec();
+
+ // Called to inform RF that an instruction is executing
+ // to schedule events for writeback, etc., as needed
+ virtual void waveExecuteInst(Wavefront *w, GPUDynInstPtr ii);
+
+ // Debug functions
+ virtual std::string dump() const;
+
+ virtual void dispatchInstruction(GPUDynInstPtr ii);
+
+ protected:
+ ComputeUnit* computeUnit;
+ int simdId;
+
+ // flag indicating if a register is busy
+ std::vector<bool> busy;
+
+    // number of registers in this register file
+ int _numRegs;
+ // Stats
+ // Total number of register reads, incremented once per DWORD per thread
+ Stats::Scalar registerReads;
+ // Total number of register writes, incremented once per DWORD per thread
+ Stats::Scalar registerWrites;
+
+ // Number of register file SRAM activations for reads.
+ // The register file may be implemented with multiple SRAMs. This stat
+ // tracks how many times the SRAMs are accessed for reads.
+ Stats::Scalar sramReads;
+ // Number of register file SRAM activations for writes
+ Stats::Scalar sramWrites;
+};
+
+#endif // __REGISTER_FILE_HH__
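As the header comments above state, schedule* may only be called immediately after the matching canSchedule* returned true. Below is a hedged sketch of that check-then-commit pattern for operand reads; the helper name and its arguments are illustrative and assume the surrounding gem5 headers, they are not part of the patch.

// Schedule operand reads only if the register file accepted the request;
// otherwise leave the wave in the SCH stage to retry on a later cycle.
bool tryScheduleReadOperands(RegisterFile *rf, Wavefront *w, GPUDynInstPtr ii)
{
    if (!rf->canScheduleReadOperands(w, ii)) {
        return false;
    }
    // Must follow immediately after canScheduleReadOperands() returned true.
    rf->scheduleReadOperands(w, ii);
    return true;
}

Write operands follow the same pairing through canScheduleWriteOperands() and scheduleWriteOperands().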
--- /dev/null
+/*
+ * Copyright (c) 2016, 2017 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Mark Wyse
+ */
+
+#include "gpu-compute/register_manager.hh"
+
+#include "config/the_gpu_isa.hh"
+#include "debug/GPURename.hh"
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/scalar_register_file.hh"
+#include "gpu-compute/static_register_manager_policy.hh"
+#include "gpu-compute/vector_register_file.hh"
+#include "gpu-compute/wavefront.hh"
+#include "params/RegisterManager.hh"
+
+RegisterManager::RegisterManager(const RegisterManagerParams *p)
+ : SimObject(p), srfPoolMgrs(p->srf_pool_managers),
+ vrfPoolMgrs(p->vrf_pool_managers)
+{
+ if (p->policy == "static") {
+ policy = new StaticRegisterManagerPolicy();
+ } else {
+ fatal("Unimplemented Register Manager Policy");
+ }
+
+}
+
+RegisterManager::~RegisterManager()
+{
+ for (auto mgr : srfPoolMgrs) {
+ delete mgr;
+ }
+ for (auto mgr : vrfPoolMgrs) {
+ delete mgr;
+ }
+}
+
+void
+RegisterManager::exec()
+{
+ policy->exec();
+}
+
+void
+RegisterManager::setParent(ComputeUnit *cu)
+{
+ computeUnit = cu;
+ policy->setParent(computeUnit);
+ for (int i = 0; i < srfPoolMgrs.size(); i++) {
+ fatal_if(computeUnit->srf[i]->numRegs() %
+ srfPoolMgrs[i]->minAllocation(),
+                 "Min SGPR allocation is not multiple of SRF size\n");
+ }
+ for (int i = 0; i < vrfPoolMgrs.size(); i++) {
+ fatal_if(computeUnit->vrf[i]->numRegs() %
+ vrfPoolMgrs[i]->minAllocation(),
+                 "Min VGPR allocation is not multiple of VRF size\n");
+ }
+}
+
+// compute mapping for vector register
+int
+RegisterManager::mapVgpr(Wavefront* w, int vgprIndex)
+{
+ return policy->mapVgpr(w, vgprIndex);
+}
+
+// compute mapping for scalar register
+int
+RegisterManager::mapSgpr(Wavefront* w, int sgprIndex)
+{
+ return policy->mapSgpr(w, sgprIndex);
+}
+
+// check if we can allocate registers
+bool
+RegisterManager::canAllocateVgprs(int simdId, int nWfs, int demandPerWf)
+{
+ return policy->canAllocateVgprs(simdId, nWfs, demandPerWf);
+}
+
+bool
+RegisterManager::canAllocateSgprs(int simdId, int nWfs, int demandPerWf)
+{
+ return policy->canAllocateSgprs(simdId, nWfs, demandPerWf);
+}
+
+// allocate registers
+void
+RegisterManager::allocateRegisters(Wavefront *w, int vectorDemand,
+ int scalarDemand)
+{
+ policy->allocateRegisters(w, vectorDemand, scalarDemand);
+}
+
+void
+RegisterManager::freeRegisters(Wavefront* w)
+{
+ policy->freeRegisters(w);
+}
+
+void
+RegisterManager::regStats()
+{
+ policy->regStats();
+}
+
+RegisterManager*
+RegisterManagerParams::create()
+{
+ return new RegisterManager(this);
+}
--- /dev/null
+/*
+ * Copyright (c) 2016, 2017 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Mark Wyse
+ */
+
+#ifndef __REGISTER_MANAGER_HH__
+#define __REGISTER_MANAGER_HH__
+
+#include <cstdint>
+#include <map>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "gpu-compute/pool_manager.hh"
+#include "gpu-compute/register_manager_policy.hh"
+#include "sim/sim_object.hh"
+#include "sim/stats.hh"
+
+class ComputeUnit;
+class Wavefront;
+
+struct RegisterManagerParams;
+
+/*
+ * Register manager (rename stage): maps virtual registers to physical
+ * registers and manages allocation from the per-SIMD register pools.
+ */
+class RegisterManager : public SimObject
+{
+ public:
+ RegisterManager(const RegisterManagerParams* params);
+ ~RegisterManager();
+ void setParent(ComputeUnit *cu);
+ void exec();
+
+ // Stats related variables and methods
+ void regStats();
+
+ // lookup virtual to physical register translation
+ int mapVgpr(Wavefront* w, int vgprIndex);
+ int mapSgpr(Wavefront* w, int sgprIndex);
+
+ // check if we can allocate registers
+ bool canAllocateVgprs(int simdId, int nWfs, int demandPerWf);
+ bool canAllocateSgprs(int simdId, int nWfs, int demandPerWf);
+
+ // allocate registers
+ void allocateRegisters(Wavefront *w, int vectorDemand, int scalarDemand);
+
+ // free all registers used by the WF
+ void freeRegisters(Wavefront *w);
+
+ std::vector<PoolManager*> srfPoolMgrs;
+ std::vector<PoolManager*> vrfPoolMgrs;
+
+ private:
+ RegisterManagerPolicy *policy;
+
+ ComputeUnit *computeUnit;
+
+ std::string _name;
+};
+
+#endif // __REGISTER_MANAGER_HH__
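The allocation interface above is meant to be used in the same check-then-commit way: a dispatcher asks canAllocateVgprs()/canAllocateSgprs() before committing with allocateRegisters(). The sketch below is illustrative only; the helper name and the single-wave demand arguments are assumptions, not code from the patch.

// Reserve registers for one wavefront only if both pools can satisfy it.
bool tryReserveWaveRegisters(RegisterManager *rm, Wavefront *w, int simdId,
                             int vgprDemandPerWf, int sgprDemandPerWf)
{
    // A single wave is being considered here, so nWfs is 1.
    if (rm->canAllocateVgprs(simdId, 1, vgprDemandPerWf) &&
        rm->canAllocateSgprs(simdId, 1, sgprDemandPerWf)) {
        rm->allocateRegisters(w, vgprDemandPerWf, sgprDemandPerWf);
        return true;
    }
    return false;    // caller retries dispatch on a later cycle
}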
--- /dev/null
+/*
+ * Copyright (c) 2016 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Mark Wyse
+ */
+
+#ifndef __REGISTER_MANAGER_POLICY_HH__
+#define __REGISTER_MANAGER_POLICY_HH__
+
+#include <cstdint>
+
+class ComputeUnit;
+class HSAQueueEntry;
+class Wavefront;
+
+/**
+ * Register Manager Policy abstract class
+ *
+ * A Register Manager Policy implements all of the functionality
+ * of the Register Manager, including register mapping, allocation,
+ * and freeing. Different policies may be implemented that support
+ * different architectures or different methods of mapping and
+ * allocation.
+ */
+class RegisterManagerPolicy
+{
+ public:
+ virtual void setParent(ComputeUnit *_cu) { cu = _cu; }
+
+ // Execute: called by RenameStage::execute()
+ virtual void exec() = 0;
+
+ // provide virtual to physical register mapping
+ virtual int mapVgpr(Wavefront* w, int vgprIndex) = 0;
+ virtual int mapSgpr(Wavefront* w, int sgprIndex) = 0;
+
+ // check if requested number of vector registers can be allocated
+ virtual bool canAllocateVgprs(int simdId, int nWfs, int demandPerWf) = 0;
+ // check if requested number of scalar registers can be allocated
+ // machine ISA only
+ virtual bool canAllocateSgprs(int simdId, int nWfs, int demandPerWf) = 0;
+
+ // allocate vector registers and reserve from register pool
+ virtual void allocateRegisters(Wavefront *w, int vectorDemand,
+ int scalarDemand) = 0;
+
+ // free all remaining registers held by specified WF
+ virtual void freeRegisters(Wavefront *w) = 0;
+
+ // stats
+ virtual void regStats() = 0;
+
+ protected:
+ ComputeUnit *cu;
+};
+
+#endif // __REGISTER_MANAGER_POLICY_HH__
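Only the "static" policy is wired up by the RegisterManager in this patch; the skeleton below is a hypothetical example (the class name and its trivial identity mapping are invented here) of the overrides any new policy would have to supply.

// Hypothetical pass-through policy: virtual registers map to themselves and
// allocation always succeeds. It only illustrates the required overrides.
class IdentityRegisterManagerPolicy : public RegisterManagerPolicy
{
  public:
    void exec() override { }
    int mapVgpr(Wavefront *w, int vgprIndex) override { return vgprIndex; }
    int mapSgpr(Wavefront *w, int sgprIndex) override { return sgprIndex; }
    bool canAllocateVgprs(int simdId, int nWfs, int demandPerWf) override
    { return true; }
    bool canAllocateSgprs(int simdId, int nWfs, int demandPerWf) override
    { return true; }
    void allocateRegisters(Wavefront *w, int vectorDemand,
                           int scalarDemand) override { }
    void freeRegisters(Wavefront *w) override { }
    void regStats() override { }
};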
#include <vector>
+#include "base/logging.hh"
#include "gpu-compute/scheduling_policy.hh"
#include "gpu-compute/wavefront.hh"
--- /dev/null
+/*
+ * Copyright (c) 2016-2017 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: John Kalamatianos
+ */
+
+#include "gpu-compute/scalar_memory_pipeline.hh"
+
+#include "debug/GPUMem.hh"
+#include "debug/GPUReg.hh"
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/gpu_dyn_inst.hh"
+#include "gpu-compute/scalar_register_file.hh"
+#include "gpu-compute/shader.hh"
+#include "gpu-compute/wavefront.hh"
+
+ScalarMemPipeline::ScalarMemPipeline(const ComputeUnitParams* p) :
+ computeUnit(nullptr), queueSize(p->scalar_mem_queue_size),
+ inflightStores(0), inflightLoads(0)
+{
+}
+
+void
+ScalarMemPipeline::init(ComputeUnit *cu)
+{
+ computeUnit = cu;
+ _name = computeUnit->name() + ".ScalarMemPipeline";
+}
+
+void
+ScalarMemPipeline::exec()
+{
+    // find oldest scalar request whose data has arrived
+ GPUDynInstPtr m = !returnedLoads.empty() ? returnedLoads.front() :
+ !returnedStores.empty() ? returnedStores.front() : nullptr;
+
+ Wavefront *w = nullptr;
+
+ bool accessSrf = true;
+ // check the SRF to see if the operands of a load (or load component
+ // of an atomic) are accessible
+ if ((m) && (m->isLoad() || m->isAtomicRet())) {
+ w = m->wavefront();
+
+ accessSrf =
+ w->computeUnit->srf[w->simdId]->
+ canScheduleWriteOperandsFromLoad(w, m);
+ }
+
+ if ((!returnedStores.empty() || !returnedLoads.empty()) &&
+ m->latency.rdy() && computeUnit->scalarMemToSrfBus.rdy() &&
+ accessSrf &&
+ (computeUnit->shader->coissue_return ||
+ computeUnit->scalarMemUnit.rdy())) {
+
+ w = m->wavefront();
+
+ if (m->isLoad() || m->isAtomicRet()) {
+ w->computeUnit->srf[w->simdId]->
+ scheduleWriteOperandsFromLoad(w, m);
+ }
+
+ m->completeAcc(m);
+
+ if (m->isLoad() || m->isAtomic()) {
+ returnedLoads.pop();
+ assert(inflightLoads > 0);
+ --inflightLoads;
+ } else {
+ returnedStores.pop();
+ assert(inflightStores > 0);
+ --inflightStores;
+ }
+
+        // Decrement outstanding request count
+ computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
+
+ if (m->isStore() || m->isAtomic()) {
+ computeUnit->shader->ScheduleAdd(&w->scalarOutstandingReqsWrGm,
+ m->time, -1);
+ }
+
+ if (m->isLoad() || m->isAtomic()) {
+ computeUnit->shader->ScheduleAdd(&w->scalarOutstandingReqsRdGm,
+ m->time, -1);
+ }
+
+ // Mark write bus busy for appropriate amount of time
+ computeUnit->scalarMemToSrfBus.set(m->time);
+ if (!computeUnit->shader->coissue_return)
+ w->computeUnit->scalarMemUnit.set(m->time);
+ }
+
+ // If pipeline has executed a global memory instruction
+ // execute global memory packets and issue global
+ // memory packets to DTLB
+ if (!issuedRequests.empty()) {
+ GPUDynInstPtr mp = issuedRequests.front();
+ if (mp->isLoad() || mp->isAtomic()) {
+
+ if (inflightLoads >= queueSize) {
+ return;
+ } else {
+ ++inflightLoads;
+ }
+ } else {
+ if (inflightStores >= queueSize) {
+ return;
+ } else {
+ ++inflightStores;
+ }
+ }
+ mp->initiateAcc(mp);
+ issuedRequests.pop();
+
+ DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping scalar mem_op\n",
+ computeUnit->cu_id, mp->simdId, mp->wfSlotId);
+ }
+}
+
+void
+ScalarMemPipeline::regStats()
+{
+}
--- /dev/null
+/*
+ * Copyright (c) 2016-2017 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: John Kalamatianos
+ */
+
+#ifndef __GPU_COMPUTE_SCALAR_MEMORY_PIPELINE_HH__
+#define __GPU_COMPUTE_SCALAR_MEMORY_PIPELINE_HH__
+
+#include <queue>
+#include <string>
+
+#include "gpu-compute/misc.hh"
+#include "params/ComputeUnit.hh"
+#include "sim/stats.hh"
+
+/*
+ * @file scalar_memory_pipeline.hh
+ *
+ * The scalar memory pipeline issues global memory packets
+ * from the scalar ALU to the DTLB and L1 Scalar Data Cache.
+ * The exec() method issues a pending memory packet to the DTLB when there
+ * is space available in the return FIFO, and retires previously issued
+ * loads and stores that have returned from the memory sub-system.
+ */
+
+class ComputeUnit;
+
+class ScalarMemPipeline
+{
+ public:
+ ScalarMemPipeline(const ComputeUnitParams *params);
+ void init(ComputeUnit *cu);
+ void exec();
+
+ std::queue<GPUDynInstPtr> &getGMReqFIFO() { return issuedRequests; }
+ std::queue<GPUDynInstPtr> &getGMStRespFIFO() { return returnedStores; }
+ std::queue<GPUDynInstPtr> &getGMLdRespFIFO() { return returnedLoads; }
+
+ bool
+ isGMLdRespFIFOWrRdy() const
+ {
+ return returnedLoads.size() < queueSize;
+ }
+
+ bool
+ isGMStRespFIFOWrRdy() const
+ {
+ return returnedStores.size() < queueSize;
+ }
+
+ bool
+ isGMReqFIFOWrRdy(uint32_t pendReqs=0) const
+ {
+ return (issuedRequests.size() + pendReqs) < queueSize;
+ }
+
+ const std::string &name() const { return _name; }
+ void regStats();
+
+ private:
+ ComputeUnit *computeUnit;
+ std::string _name;
+ int queueSize;
+
+ // Counters to track and limit the inflight scalar loads and stores
+ // generated by this memory pipeline.
+ int inflightStores;
+ int inflightLoads;
+
+ // Scalar Memory Request FIFO: all global memory scalar requests
+ // are issued to this FIFO from the scalar memory pipelines
+ std::queue<GPUDynInstPtr> issuedRequests;
+
+ // Scalar Store Response FIFO: all responses of global memory
+ // scalar stores are sent to this FIFO from L1 Scalar Data Cache
+ std::queue<GPUDynInstPtr> returnedStores;
+
+ // Scalar Load Response FIFO: all responses of global memory
+ // scalar loads are sent to this FIFO from L1 Scalar Data Cache
+ std::queue<GPUDynInstPtr> returnedLoads;
+};
+
+#endif // __GPU_COMPUTE_SCALAR_MEMORY_PIPELINE_HH__
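The request FIFO above is guarded by isGMReqFIFOWrRdy(): a producer checks it (optionally passing the number of requests it already has pending) before pushing onto the queue returned by getGMReqFIFO(). A minimal sketch of that producer-side pattern follows; the helper name and its pending-count argument are illustrative, not code from the patch.

// Enqueue a scalar memory request only when the pipeline has room for it,
// counting requests the caller already has in flight.
bool trySendScalarMemReq(ScalarMemPipeline &pipe, GPUDynInstPtr req,
                         uint32_t pendingReqs)
{
    if (!pipe.isGMReqFIFOWrRdy(pendingReqs)) {
        return false;       // back off and retry on a later cycle
    }
    pipe.getGMReqFIFO().push(req);
    return true;
}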
--- /dev/null
+/*
+ * Copyright (c) 2015-2017 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: John Kalamatianos,
+ * Mark Wyse
+ */
+
+#include "gpu-compute/scalar_register_file.hh"
+
+#include "base/logging.hh"
+#include "debug/GPUSRF.hh"
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/gpu_dyn_inst.hh"
+#include "gpu-compute/shader.hh"
+#include "gpu-compute/wavefront.hh"
+#include "params/ScalarRegisterFile.hh"
+
+ScalarRegisterFile::ScalarRegisterFile(const ScalarRegisterFileParams *p)
+ : RegisterFile(p)
+{
+ regFile.resize(numRegs(), 0);
+}
+
+bool
+ScalarRegisterFile::operandsReady(Wavefront *w, GPUDynInstPtr ii) const
+{
+ for (int i = 0; i < ii->getNumOperands(); ++i) {
+ if (ii->isScalarRegister(i) && ii->isSrcOperand(i)) {
+
+ int sgprIdx = ii->getRegisterIndex(i, ii);
+ int nRegs = ii->getOperandSize(i) <= 4 ? 1 :
+ ii->getOperandSize(i) / 4;
+
+ for (int j = 0; j < nRegs; ++j) {
+ int pSgpr =
+ computeUnit->registerManager->mapSgpr(w, sgprIdx + j);
+
+ if (regBusy(pSgpr)) {
+ if (ii->isDstOperand(i)) {
+ w->numTimesBlockedDueWAXDependencies++;
+ } else if (ii->isSrcOperand(i)) {
+ DPRINTF(GPUSRF, "RAW stall: WV[%d]: %s: physReg[%d]\n",
+ w->wfDynId, ii->disassemble(), pSgpr);
+ w->numTimesBlockedDueRAWDependencies++;
+ }
+ return false;
+ }
+ } // nRegs
+ } // isScalar
+ } // operand
+ return true;
+}
+
+void
+ScalarRegisterFile::scheduleWriteOperands(Wavefront *w, GPUDynInstPtr ii)
+{
+ // iterate over all register destination operands
+ for (int i = 0; i < ii->getNumOperands(); ++i) {
+ if (ii->isScalarRegister(i) && ii->isDstOperand(i)) {
+
+ int sgprIdx = ii->getRegisterIndex(i, ii);
+ int nRegs = ii->getOperandSize(i) <= 4 ? 1 :
+ ii->getOperandSize(i) / 4;
+
+ for (int j = 0; j < nRegs; ++j) {
+ int physReg =
+ computeUnit->registerManager->mapSgpr(w, sgprIdx + j);
+
+ // mark the destination scalar register as busy
+ markReg(physReg, true);
+ }
+ }
+ }
+}
+
+void
+ScalarRegisterFile::waveExecuteInst(Wavefront *w, GPUDynInstPtr ii)
+{
+ for (int i = 0; i < ii->getNumOperands(); i++) {
+ if (ii->isScalarRegister(i) && ii->isSrcOperand(i)) {
+ int DWORDs = ii->getOperandSize(i) <= 4 ? 1
+ : ii->getOperandSize(i) / 4;
+ registerReads += DWORDs;
+ }
+ }
+
+ if (!ii->isLoad() && !(ii->isAtomic() || ii->isMemSync())) {
+ Cycles delay(computeUnit->scalarPipeLength());
+ Tick tickDelay = computeUnit->cyclesToTicks(delay);
+
+ for (int i = 0; i < ii->getNumOperands(); i++) {
+ if (ii->isScalarRegister(i) && ii->isDstOperand(i)) {
+ int sgprIdx = ii->getRegisterIndex(i, ii);
+ int nRegs = ii->getOperandSize(i) <= 4 ? 1
+ : ii->getOperandSize(i) / 4;
+ for (int j = 0; j < nRegs; j++) {
+ int physReg = computeUnit->registerManager->
+ mapSgpr(w, sgprIdx + j);
+ enqRegFreeEvent(physReg, tickDelay);
+ }
+
+ registerWrites += nRegs;
+ }
+ }
+ }
+}
+
+void
+ScalarRegisterFile::scheduleWriteOperandsFromLoad(Wavefront *w,
+ GPUDynInstPtr ii)
+{
+ assert(ii->isLoad() || ii->isAtomicRet());
+ for (int i = 0; i < ii->getNumOperands(); ++i) {
+ if (ii->isScalarRegister(i) && ii->isDstOperand(i)) {
+
+ int sgprIdx = ii->getRegisterIndex(i, ii);
+ int nRegs = ii->getOperandSize(i) <= 4 ? 1 :
+ ii->getOperandSize(i) / 4;
+
+ for (int j = 0; j < nRegs; ++j) {
+ int physReg = computeUnit->registerManager->
+ mapSgpr(w, sgprIdx + j);
+ enqRegFreeEvent(physReg, computeUnit->clockPeriod());
+ }
+
+ registerWrites += nRegs;
+ }
+ }
+}
+
+ScalarRegisterFile*
+ScalarRegisterFileParams::create()
+{
+ return new ScalarRegisterFile(this);
+}
--- /dev/null
+/*
+ * Copyright (c) 2015-2017 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: John Kalamatianos,
+ * Mark Wyse
+ */
+
+#ifndef __GPU_COMPUTE_SCALAR_REGISTER_FILE_HH__
+#define __GPU_COMPUTE_SCALAR_REGISTER_FILE_HH__
+
+#include "arch/gpu_isa.hh"
+#include "base/statistics.hh"
+#include "base/trace.hh"
+#include "base/types.hh"
+#include "debug/GPUSRF.hh"
+#include "gpu-compute/register_file.hh"
+#include "gpu-compute/wavefront.hh"
+
+struct ScalarRegisterFileParams;
+
+// Scalar Register File
+class ScalarRegisterFile : public RegisterFile
+{
+ public:
+ using ScalarRegU32 = TheGpuISA::ScalarRegU32;
+
+ ScalarRegisterFile(const ScalarRegisterFileParams *p);
+ ~ScalarRegisterFile() { }
+
+ virtual bool operandsReady(Wavefront *w, GPUDynInstPtr ii) const override;
+ virtual void scheduleWriteOperands(Wavefront *w,
+ GPUDynInstPtr ii) override;
+ virtual void scheduleWriteOperandsFromLoad(Wavefront *w,
+ GPUDynInstPtr ii) override;
+ virtual void waveExecuteInst(Wavefront *w, GPUDynInstPtr ii) override;
+
+ void
+ setParent(ComputeUnit *_computeUnit) override
+ {
+ RegisterFile::setParent(_computeUnit);
+ }
+
+ // Read a register that is writeable (e.g., a DST operand)
+ ScalarRegU32&
+ readWriteable(int regIdx)
+ {
+ return regFile[regIdx];
+ }
+
+ // Read a register that is not writeable (e.g., src operand)
+ ScalarRegU32
+ read(int regIdx) const
+ {
+ return regFile[regIdx];
+ }
+
+ // Write a register
+ void
+ write(int regIdx, ScalarRegU32 value)
+ {
+ regFile[regIdx] = value;
+ }
+
+ void
+ printReg(Wavefront *wf, int regIdx) const
+ {
+ DPRINTF(GPUSRF, "WF[%d][%d]: Id%d s[%d] = %#x\n", wf->simdId,
+ wf->wfSlotId, wf->wfDynId, regIdx, regFile[regIdx]);
+ }
+
+ private:
+ std::vector<ScalarRegU32> regFile;
+};
+
+#endif // __GPU_COMPUTE_SCALAR_REGISTER_FILE_HH__
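read(), readWriteable(), and write() above index physical SGPRs, so callers first translate an architectural index through the RegisterManager's mapSgpr(), as the .cc file does in operandsReady() and scheduleWriteOperands(). A hedged sketch of that lookup-then-access step; the helper and variable names are illustrative.

// Translate an architectural SGPR index to its physical slot, then read it.
ScalarRegisterFile::ScalarRegU32
readArchSgpr(ComputeUnit *cu, ScalarRegisterFile *srf, Wavefront *w,
             int archSgprIdx)
{
    int physSgpr = cu->registerManager->mapSgpr(w, archSgprIdx);
    return srf->read(physSgpr);
}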
#include "gpu-compute/schedule_stage.hh"
+#include <unordered_set>
+
+#include "debug/GPUSched.hh"
+#include "debug/GPUVRF.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_static_inst.hh"
+#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"
-ScheduleStage::ScheduleStage(const ComputeUnitParams *p)
- : numSIMDs(p->num_SIMDs),
- numMemUnits(p->num_global_mem_pipes + p->num_shared_mem_pipes)
+ScheduleStage::ScheduleStage(const ComputeUnitParams *p, ComputeUnit *cu)
+ : vectorAluRdy(false), scalarAluRdy(false), scalarMemBusRdy(false),
+ scalarMemIssueRdy(false), glbMemBusRdy(false), glbMemIssueRdy(false),
+ locMemBusRdy(false), locMemIssueRdy(false)
{
- for (int j = 0; j < numSIMDs + numMemUnits; ++j) {
+ for (int j = 0; j < cu->numExeUnits(); ++j) {
scheduler.emplace_back(p);
}
+ wavesInSch.clear();
+ schList.resize(cu->numExeUnits());
+ for (auto &dq : schList) {
+ dq.clear();
+ }
}
ScheduleStage::~ScheduleStage()
{
scheduler.clear();
- waveStatusList.clear();
+ wavesInSch.clear();
+ schList.clear();
}
void
computeUnit = cu;
_name = computeUnit->name() + ".ScheduleStage";
- for (int j = 0; j < numSIMDs + numMemUnits; ++j) {
+ fatal_if(scheduler.size() != computeUnit->readyList.size(),
+ "Scheduler should have same number of entries as CU's readyList");
+ for (int j = 0; j < computeUnit->numExeUnits(); ++j) {
scheduler[j].bindList(&computeUnit->readyList[j]);
}
- for (int j = 0; j < numSIMDs; ++j) {
- waveStatusList.push_back(&computeUnit->waveStatusList[j]);
+ dispatchList = &computeUnit->dispatchList;
+
+ assert(computeUnit->numVectorGlobalMemUnits == 1);
+ assert(computeUnit->numVectorSharedMemUnits == 1);
+}
+
+void
+ScheduleStage::exec()
+{
+ // Update readyList
+ for (int j = 0; j < computeUnit->numExeUnits(); ++j) {
+ // delete all ready wavefronts whose instruction buffers are now
+ // empty because the last instruction was executed
+ computeUnit->updateReadyList(j);
+ /**
+ * Remove any wave that already has an instruction present in SCH
+ * waiting for RF reads to complete. This prevents out of order
+ * execution within a wave.
+ */
+ for (auto wIt = computeUnit->readyList.at(j).begin();
+ wIt != computeUnit->readyList.at(j).end();) {
+ if (wavesInSch.find((*wIt)->wfDynId) != wavesInSch.end()) {
+ *wIt = nullptr;
+ wIt = computeUnit->readyList.at(j).erase(wIt);
+ } else {
+ wIt++;
+ }
+ }
+ }
+
+ // Attempt to add another wave for each EXE type to schList queues
+ // VMEM resources are iterated first, effectively giving priority
+ // to VMEM over VALU for scheduling read of operands to the RFs.
+ // Scalar Memory are iterated after VMEM
+    // Scalar memory units are iterated after VMEM.
+ // Iterate VMEM and SMEM
+ int firstMemUnit = computeUnit->firstMemUnit();
+ int lastMemUnit = computeUnit->lastMemUnit();
+ for (int j = firstMemUnit; j <= lastMemUnit; j++) {
+ int readyListSize = computeUnit->readyList[j].size();
+ // If no wave is ready to be scheduled on the execution resource
+ // then skip scheduling for this execution resource
+ if (!readyListSize) {
+ rdyListEmpty[j]++;
+ continue;
+ }
+ rdyListNotEmpty[j]++;
+
+ // Pick a wave and attempt to add it to schList
+ Wavefront *w = scheduler[j].chooseWave();
+ if (!addToSchList(j, w)) {
+ // For waves not added to schList, increment count of cycles
+ // this wave spends in SCH stage.
+ w->schCycles++;
+ addToSchListStalls[j]++;
+ }
}
- dispatchList = &computeUnit->dispatchList;
+ // Iterate everything else
+ for (int j = 0; j < computeUnit->numExeUnits(); ++j) {
+ // skip the VMEM resources
+ if (j >= firstMemUnit && j <= lastMemUnit) {
+ continue;
+ }
+ int readyListSize = computeUnit->readyList[j].size();
+ // If no wave is ready to be scheduled on the execution resource
+ // then skip scheduling for this execution resource
+ if (!readyListSize) {
+ rdyListEmpty[j]++;
+ continue;
+ }
+ rdyListNotEmpty[j]++;
+
+ // Pick a wave and attempt to add it to schList
+ Wavefront *w = scheduler[j].chooseWave();
+ if (!addToSchList(j, w)) {
+ // For waves not added to schList, increment count of cycles
+ // this wave spends in SCH stage.
+ w->schCycles++;
+ addToSchListStalls[j]++;
+ }
+ }
+
+ // At this point, the schList queue per EXE type may contain
+ // multiple waves, in order of age (oldest to youngest).
+    // Waves may be in RFBUSY, indicating they are waiting for registers
+    // to be read, or in RFREADY, indicating they are candidates for
+    // the dispatchList and execution.
+
+ // Iterate schList queues and check if any of the waves have finished
+ // reading their operands, moving those waves to RFREADY status
+ checkRfOperandReadComplete();
+
+ // Fill the dispatch list with the oldest wave of each EXE type that
+ // is ready to execute
+ // Wave is picked if status in schList is RFREADY and it passes resource
+ // ready checks similar to those currently in SCB
+ fillDispatchList();
+
+ // Resource arbitration on waves in dispatchList
+ // Losing waves are re-inserted to the schList at a location determined
+ // by wave age
+
+ // Arbitrate access to the VRF->LDS bus
+ arbitrateVrfToLdsBus();
+
+ // Schedule write operations to the register files
+ scheduleRfDestOperands();
+
+ // Lastly, reserve resources for waves that are ready to execute.
+ reserveResources();
+}
+
+void
+ScheduleStage::doDispatchListTransition(int unitId, DISPATCH_STATUS s,
+ Wavefront *w)
+{
+ dispatchList->at(unitId).first = w;
+ dispatchList->at(unitId).second = s;
+}
+
+bool
+ScheduleStage::schedRfWrites(int exeType, Wavefront *w)
+{
+ GPUDynInstPtr ii = w->instructionBuffer.front();
+ assert(ii);
+ bool accessVrfWr = true;
+ if (!ii->isScalar()) {
+ accessVrfWr =
+ computeUnit->vrf[w->simdId]->canScheduleWriteOperands(w, ii);
+ }
+ bool accessSrfWr =
+ computeUnit->srf[w->simdId]->canScheduleWriteOperands(w, ii);
+ bool accessRf = accessVrfWr && accessSrfWr;
+ if (accessRf) {
+ if (!ii->isScalar()) {
+ computeUnit->vrf[w->simdId]->scheduleWriteOperands(w, ii);
+ }
+ computeUnit->srf[w->simdId]->scheduleWriteOperands(w, ii);
+ return true;
+ } else {
+ rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
+ if (!accessSrfWr) {
+ rfAccessStalls[SCH_SRF_WR_ACCESS_NRDY]++;
+ }
+ if (!accessVrfWr) {
+ rfAccessStalls[SCH_VRF_WR_ACCESS_NRDY]++;
+ }
+
+ // Increment stall counts for WF
+ w->schStalls++;
+ w->schRfAccessStalls++;
+ }
+ return false;
+}
+
+void
+ScheduleStage::scheduleRfDestOperands()
+{
+ for (int j = 0; j < computeUnit->numExeUnits(); ++j) {
+ if (!dispatchList->at(j).first) {
+ continue;
+ }
+ // get the wave on dispatch list and attempt to allocate write
+ // resources in the RFs
+ Wavefront *w = dispatchList->at(j).first;
+ if (!schedRfWrites(j, w)) {
+ reinsertToSchList(j, w);
+ doDispatchListTransition(j, EMPTY);
+ // if this is a flat inst, also transition the LM pipe to empty
+ // Note: since FLAT/LM arbitration occurs before scheduling
+ // destination operands to the RFs, it is possible that a LM
+ // instruction lost arbitration, but would have been able to
+ // pass the RF destination operand check here, and execute
+ // instead of the FLAT.
+ if (w->instructionBuffer.front()->isFlat()) {
+ assert(dispatchList->at(w->localMem).second == SKIP);
+ doDispatchListTransition(w->localMem, EMPTY);
+ }
+ }
+ }
+}
+
+bool
+ScheduleStage::addToSchList(int exeType, Wavefront *w)
+{
+ // Attempt to add the wave to the schList if the VRF can support the
+ // wave's next instruction
+ GPUDynInstPtr ii = w->instructionBuffer.front();
+ assert(ii);
+ bool accessVrf = true;
+ if (!ii->isScalar()) {
+ accessVrf =
+ computeUnit->vrf[w->simdId]->canScheduleReadOperands(w, ii);
+ }
+ bool accessSrf =
+ computeUnit->srf[w->simdId]->canScheduleReadOperands(w, ii);
+ // If RFs can support instruction, add to schList in RFBUSY state,
+ // place wave in wavesInSch and pipeMap, and schedule Rd/Wr operands
+ // to the VRF
+ bool accessRf = accessVrf && accessSrf;
+ if (accessRf) {
+ DPRINTF(GPUSched, "schList[%d]: Adding: SIMD[%d] WV[%d]: %d: %s\n",
+ exeType, w->simdId, w->wfDynId,
+ ii->seqNum(), ii->disassemble());
+
+ computeUnit->insertInPipeMap(w);
+ wavesInSch.emplace(w->wfDynId);
+ schList.at(exeType).push_back(std::make_pair(w, RFBUSY));
+ if (w->isOldestInstWaitcnt()) {
+ w->setStatus(Wavefront::S_WAITCNT);
+ }
+ if (!ii->isScalar()) {
+ computeUnit->vrf[w->simdId]->scheduleReadOperands(w, ii);
+ }
+ computeUnit->srf[w->simdId]->scheduleReadOperands(w, ii);
+
+ DPRINTF(GPUSched, "schList[%d]: Added: SIMD[%d] WV[%d]: %d: %s\n",
+ exeType, w->simdId, w->wfDynId,
+ ii->seqNum(), ii->disassemble());
+ return true;
+ } else {
+ // Number of stall cycles due to RF access denied
+ rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
+ // Count number of denials due to each reason
+ // Multiple items may contribute to the denied request
+ if (!accessVrf) {
+ rfAccessStalls[SCH_VRF_RD_ACCESS_NRDY]++;
+ }
+ if (!accessSrf) {
+ rfAccessStalls[SCH_SRF_RD_ACCESS_NRDY]++;
+ }
+
+ // Increment stall counts for WF
+ w->schStalls++;
+ w->schRfAccessStalls++;
+ DPRINTF(GPUSched, "schList[%d]: Could not add: "
+ "SIMD[%d] WV[%d]: %d: %s\n",
+ exeType, w->simdId, w->wfDynId,
+ ii->seqNum(), ii->disassemble());
+ }
+ return false;
+}
+
+void
+ScheduleStage::reinsertToSchList(int exeType, Wavefront *w)
+{
+ // Insert wave w into schList for specified exeType.
+ // Wave is inserted in age order, with oldest wave being at the
+ // front of the schList
+ auto schIter = schList.at(exeType).begin();
+ while (schIter != schList.at(exeType).end()
+ && schIter->first->wfDynId < w->wfDynId) {
+ schIter++;
+ }
+ schList.at(exeType).insert(schIter, std::make_pair(w, RFREADY));
+}
+
+void
+ScheduleStage::checkMemResources()
+{
+ // Check for resource availability in the next cycle
+ scalarMemBusRdy = false;
+ scalarMemIssueRdy = false;
+ // check if there is a SRF->Global Memory bus available and
+ if (computeUnit->srfToScalarMemPipeBus.rdy(Cycles(1))) {
+ scalarMemBusRdy = true;
+ }
+ // check if we can issue a scalar memory instruction
+ if (computeUnit->scalarMemUnit.rdy(Cycles(1))) {
+ scalarMemIssueRdy = true;
+ }
+
+ glbMemBusRdy = false;
+ glbMemIssueRdy = false;
+ // check if there is a VRF->Global Memory bus available
+ if (computeUnit->vrfToGlobalMemPipeBus.rdy(Cycles(1))) {
+ glbMemBusRdy = true;
+ }
+ // check if we can issue a Global memory instruction
+ if (computeUnit->vectorGlobalMemUnit.rdy(Cycles(1))) {
+ glbMemIssueRdy = true;
+ }
+
+ locMemBusRdy = false;
+ locMemIssueRdy = false;
+ // check if there is a VRF->LDS bus available
+ if (computeUnit->vrfToLocalMemPipeBus.rdy(Cycles(1))) {
+ locMemBusRdy = true;
+ }
+ // check if we can issue a LDS instruction
+ if (computeUnit->vectorSharedMemUnit.rdy(Cycles(1))) {
+ locMemIssueRdy = true;
+ }
+}
+
+bool
+ScheduleStage::dispatchReady(Wavefront *w)
+{
+ vectorAluRdy = false;
+ scalarAluRdy = false;
+ // check for available vector/scalar ALUs in the next cycle
+ if (computeUnit->vectorALUs[w->simdId].rdy(Cycles(1))) {
+ vectorAluRdy = true;
+ }
+ if (computeUnit->scalarALUs[w->scalarAlu].rdy(Cycles(1))) {
+ scalarAluRdy = true;
+ }
+ GPUDynInstPtr ii = w->instructionBuffer.front();
+
+ if (ii->isNop()) {
+ // S_NOP requires SALU. V_NOP requires VALU.
+ // TODO: Scalar NOP does not require SALU in hardware,
+ // and is executed out of IB directly.
+ if (ii->isScalar() && !scalarAluRdy) {
+ dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
+ return false;
+ } else if (!ii->isScalar() && !vectorAluRdy) {
+ dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
+ return false;
+ }
+ } else if (ii->isEndOfKernel()) {
+ // EndPgm instruction
+ if (ii->isScalar() && !scalarAluRdy) {
+ dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
+ return false;
+ }
+ } else if (ii->isBarrier() || ii->isBranch() || ii->isALU()) {
+ // Barrier, Branch, or ALU instruction
+ if (ii->isScalar() && !scalarAluRdy) {
+ dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
+ return false;
+ } else if (!ii->isScalar() && !vectorAluRdy) {
+ dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
+ return false;
+ }
+ } else if (!ii->isScalar() && ii->isGlobalMem()) {
+ // Vector Global Memory instruction
+ bool rdy = true;
+ if (!glbMemIssueRdy) {
+ rdy = false;
+ dispNrdyStalls[SCH_VECTOR_MEM_ISSUE_NRDY]++;
+ }
+ if (!glbMemBusRdy) {
+ rdy = false;
+ dispNrdyStalls[SCH_VECTOR_MEM_BUS_BUSY_NRDY]++;
+ }
+ if (!computeUnit->globalMemoryPipe.coalescerReady(ii)) {
+ rdy = false;
+ dispNrdyStalls[SCH_VECTOR_MEM_COALESCER_NRDY]++;
+ }
+ if (!computeUnit->globalMemoryPipe.outstandingReqsCheck(ii)) {
+ rdy = false;
+ dispNrdyStalls[SCH_VECTOR_MEM_REQS_NRDY]++;
+ }
+ if (!rdy) {
+ return false;
+ }
+ } else if (ii->isScalar() && ii->isGlobalMem()) {
+ // Scalar Global Memory instruction
+ bool rdy = true;
+ if (!scalarMemIssueRdy) {
+ rdy = false;
+ dispNrdyStalls[SCH_SCALAR_MEM_ISSUE_NRDY]++;
+ }
+ if (!scalarMemBusRdy) {
+ rdy = false;
+ dispNrdyStalls[SCH_SCALAR_MEM_BUS_BUSY_NRDY]++;
+ }
+ if (!computeUnit->scalarMemoryPipe.
+ isGMReqFIFOWrRdy(w->scalarRdGmReqsInPipe +
+ w->scalarWrGmReqsInPipe)) {
+ rdy = false;
+ dispNrdyStalls[SCH_SCALAR_MEM_FIFO_NRDY]++;
+ }
+ if (!rdy) {
+ return false;
+ }
+ } else if (!ii->isScalar() && ii->isLocalMem()) {
+ // Vector Local Memory instruction
+ bool rdy = true;
+ if (!locMemIssueRdy) {
+ rdy = false;
+ dispNrdyStalls[SCH_LOCAL_MEM_ISSUE_NRDY]++;
+ }
+ if (!locMemBusRdy) {
+ rdy = false;
+ dispNrdyStalls[SCH_LOCAL_MEM_BUS_BUSY_NRDY]++;
+ }
+ if (!computeUnit->localMemoryPipe.
+ isLMReqFIFOWrRdy(w->rdLmReqsInPipe + w->wrLmReqsInPipe)) {
+ rdy = false;
+ dispNrdyStalls[SCH_LOCAL_MEM_FIFO_NRDY]++;
+ }
+ if (!rdy) {
+ return false;
+ }
+ } else if (!ii->isScalar() && ii->isFlat()) {
+ // Vector Flat memory instruction
+ bool rdy = true;
+ if (!glbMemIssueRdy || !locMemIssueRdy) {
+ rdy = false;
+ dispNrdyStalls[SCH_FLAT_MEM_ISSUE_NRDY]++;
+ }
+ if (!glbMemBusRdy || !locMemBusRdy) {
+ rdy = false;
+ dispNrdyStalls[SCH_FLAT_MEM_BUS_BUSY_NRDY]++;
+ }
+ if (!computeUnit->globalMemoryPipe.coalescerReady(ii)) {
+ rdy = false;
+ dispNrdyStalls[SCH_FLAT_MEM_COALESCER_NRDY]++;
+ }
+ if (!computeUnit->globalMemoryPipe.outstandingReqsCheck(ii)) {
+ rdy = false;
+ dispNrdyStalls[SCH_FLAT_MEM_REQS_NRDY]++;
+ }
+ if (!computeUnit->localMemoryPipe.
+ isLMReqFIFOWrRdy(w->rdLmReqsInPipe + w->wrLmReqsInPipe)) {
+ rdy = false;
+ dispNrdyStalls[SCH_FLAT_MEM_FIFO_NRDY]++;
+ }
+ if (!rdy) {
+ return false;
+ }
+ } else {
+ panic("%s: unknown instr checked for readiness", ii->disassemble());
+ return false;
+ }
+ dispNrdyStalls[SCH_RDY]++;
+ return true;
}
void
-ScheduleStage::arbitrate()
-{
- // iterate over all Memory pipelines
- for (int j = numSIMDs; j < numSIMDs + numMemUnits; ++j) {
- if (dispatchList->at(j).first) {
- Wavefront *waveToMemPipe = dispatchList->at(j).first;
- // iterate over all execution pipelines
- for (int i = 0; i < numSIMDs + numMemUnits; ++i) {
- if ((i != j) && (dispatchList->at(i).first)) {
- Wavefront *waveToExePipe = dispatchList->at(i).first;
- // if the two selected wavefronts are mapped to the same
- // SIMD unit then they share the VRF
- if (waveToMemPipe->simdId == waveToExePipe->simdId) {
- int simdId = waveToMemPipe->simdId;
- // Read VRF port arbitration:
- // If there are read VRF port conflicts between the
- // a memory and another instruction we drop the other
- // instruction. We don't need to check for write VRF
- // port conflicts because the memory instruction either
- // does not need to write to the VRF (store) or will
- // write to the VRF when the data comes back (load) in
- // which case the arbiter of the memory pipes will
- // resolve any conflicts
- if (computeUnit->vrf[simdId]->
- isReadConflict(waveToMemPipe->wfSlotId,
- waveToExePipe->wfSlotId)) {
- // FIXME: The "second" member variable is never
- // used in the model. I am setting it to READY
- // simply to follow the protocol of setting it
- // when the WF has an instruction ready to issue
- waveStatusList[simdId]->at(waveToExePipe->wfSlotId)
- .second = READY;
-
- dispatchList->at(i).first = nullptr;
- dispatchList->at(i).second = EMPTY;
- break;
- }
+ScheduleStage::fillDispatchList()
+{
+ // update execution resource status
+ checkMemResources();
+ // iterate execution resources
+ for (int j = 0; j < computeUnit->numExeUnits(); j++) {
+ assert(dispatchList->at(j).second == EMPTY);
+
+ // iterate waves in schList to pick one for dispatch
+ auto schIter = schList.at(j).begin();
+ bool dispatched = false;
+ while (schIter != schList.at(j).end()) {
+ // only attempt to dispatch if status is RFREADY
+ if (schIter->second == RFREADY) {
+ // Check if this wave is ready for dispatch
+ bool dispRdy = dispatchReady(schIter->first);
+ if (!dispatched && dispRdy) {
+ // No other wave has been dispatched for this exe
+ // resource, and this wave is ready. Place this wave
+ // on dispatchList and make it ready for execution
+ // next cycle.
+
+ // Acquire a coalescer token if it is a global mem
+ // operation.
+ GPUDynInstPtr mp = schIter->first->
+ instructionBuffer.front();
+ if (!mp->isMemSync() && !mp->isScalar() &&
+ (mp->isGlobalMem() || mp->isFlat())) {
+ computeUnit->globalMemoryPipe.acqCoalescerToken(mp);
+ }
+
+ doDispatchListTransition(j, EXREADY, schIter->first);
+ DPRINTF(GPUSched, "dispatchList[%d]: fillDispatchList: "
+ "EMPTY->EXREADY\n", j);
+ schIter->first = nullptr;
+ schIter = schList.at(j).erase(schIter);
+ dispatched = true;
+ } else {
+ // Either another wave has been dispatched, or this wave
+ // was not ready, so it is stalled this cycle
+ schIter->first->schStalls++;
+ if (!dispRdy) {
+ // not ready for dispatch, increment stall stat
+ schIter->first->schResourceStalls++;
}
+ // Examine next wave for this resource
+ schIter++;
}
+ } else {
+ // Wave not in RFREADY, try next wave
+ schIter++;
}
}
+
+ // Increment stall count if no wave sent to dispatchList for
+ // current execution resource
+ if (!dispatched) {
+ schListToDispListStalls[j]++;
+ } else {
+ schListToDispList[j]++;
+ }
}
}
void
-ScheduleStage::exec()
+ScheduleStage::arbitrateVrfToLdsBus()
{
- for (int j = 0; j < numSIMDs + numMemUnits; ++j) {
- uint32_t readyListSize = computeUnit->readyList[j].size();
+ // Arbitrate the VRF->GM and VRF->LDS buses for Flat memory ops
+ // Note: a Flat instruction in GFx8 reserves both VRF->Glb memory bus
+ // and a VRF->LDS bus. In GFx9, this is not the case.
- // If no wave is ready to be scheduled on the execution resource
- // then skip scheduling for this execution resource
- if (!readyListSize) {
- continue;
- }
+ // iterate the GM pipelines
+ for (int i = 0; i < computeUnit->numVectorGlobalMemUnits; i++) {
+ // get the GM pipe index in the dispatchList
+ int gm_exe_unit = computeUnit->firstMemUnit() + i;
+ // get the wave in the dispatchList
+ Wavefront *w = dispatchList->at(gm_exe_unit).first;
+ // If the WF is valid, ready to execute, and the instruction
+ // is a flat access, arbitrate with the WF's assigned LM pipe
+ if (w && dispatchList->at(gm_exe_unit).second == EXREADY &&
+ w->instructionBuffer.front()->isFlat()) {
+ // If the associated LM pipe also has a wave selected, block
+ // that wave and let the Flat instruction issue. The WF in the
+ // LM pipe is added back to the schList for consideration next
+ // cycle.
+ if (dispatchList->at(w->localMem).second == EXREADY) {
+ reinsertToSchList(w->localMem,
+ dispatchList->at(w->localMem).first);
+ // Increment stall stats for LDS-VRF arbitration
+ ldsBusArbStalls++;
+ dispatchList->at(w->localMem).first->schLdsArbStalls++;
+ }
+ // With arbitration of LM pipe complete, transition the
+ // LM pipe to SKIP state in the dispatchList to inform EX stage
+ // that a Flat instruction is executing next cycle
+ doDispatchListTransition(w->localMem, SKIP, w);
+ DPRINTF(GPUSched, "dispatchList[%d]: arbVrfLds: "
+ "EXREADY->SKIP\n", w->localMem);
+ }
+ }
+}
+
+void
+ScheduleStage::checkRfOperandReadComplete()
+{
+ // Iterate the schList queues and check if operand reads
+ // have completed in the RFs. If so, mark the wave as ready for
+ // selection for dispatchList
+ for (int j = 0; j < computeUnit->numExeUnits(); ++j) {
+ for (auto &p : schList.at(j)) {
+ Wavefront *w = p.first;
+ assert(w);
- Wavefront *waveToBeDispatched = scheduler[j].chooseWave();
- dispatchList->at(j).first = waveToBeDispatched;
- waveToBeDispatched->updateResources();
- dispatchList->at(j).second = FILLED;
+ // Increment the number of cycles the wave spends in the
+ // SCH stage, since this loop visits every wave in SCH.
+ w->schCycles++;
- waveStatusList[waveToBeDispatched->simdId]->at(
- waveToBeDispatched->wfSlotId).second = BLOCKED;
+ GPUDynInstPtr ii = w->instructionBuffer.front();
+ bool vrfRdy = true;
+ if (!ii->isScalar()) {
+ vrfRdy =
+ computeUnit->vrf[w->simdId]->operandReadComplete(w, ii);
+ }
+ bool srfRdy =
+ computeUnit->srf[w->simdId]->operandReadComplete(w, ii);
+ bool operandsReady = vrfRdy && srfRdy;
+ if (operandsReady) {
+ DPRINTF(GPUSched,
+ "schList[%d]: WV[%d] operands ready for: %d: %s\n",
+ j, w->wfDynId, ii->seqNum(), ii->disassemble());
+ DPRINTF(GPUSched, "schList[%d]: WV[%d] RFBUSY->RFREADY\n",
+ j, w->wfDynId);
+ p.second = RFREADY;
+ } else {
+ DPRINTF(GPUSched,
+ "schList[%d]: WV[%d] operands not ready for: %d: %s\n",
+ j, w->wfDynId, ii->seqNum(), ii->disassemble());
+
+ // operands not ready yet, increment SCH stage stats
+ // aggregate to all wavefronts on the CU
+ p.second = RFBUSY;
+
+ // Increment stall stats
+ w->schStalls++;
+ w->schOpdNrdyStalls++;
- assert(computeUnit->readyList[j].size() == readyListSize - 1);
+ opdNrdyStalls[SCH_RF_OPD_NRDY]++;
+ if (!vrfRdy) {
+ opdNrdyStalls[SCH_VRF_OPD_NRDY]++;
+ }
+ if (!srfRdy) {
+ opdNrdyStalls[SCH_SRF_OPD_NRDY]++;
+ }
+ }
+ }
}
- // arbitrate over all shared resources among instructions being issued
- // simultaneously
- arbitrate();
+}
+
+void
+ScheduleStage::reserveResources()
+{
+ std::vector<bool> exeUnitReservations;
+ exeUnitReservations.resize(computeUnit->numExeUnits(), false);
+
+ for (int j = 0; j < computeUnit->numExeUnits(); ++j) {
+ Wavefront *dispatchedWave = dispatchList->at(j).first;
+ if (dispatchedWave) {
+ DISPATCH_STATUS s = dispatchList->at(j).second;
+ if (s == EMPTY) {
+ continue;
+ } else if (s == EXREADY) {
+ // Wave is ready for execution
+ std::vector<int> execUnitIds =
+ dispatchedWave->reserveResources();
+ GPUDynInstPtr ii = dispatchedWave->instructionBuffer.front();
+
+ if (!ii->isScalar()) {
+ computeUnit->vrf[dispatchedWave->simdId]->
+ dispatchInstruction(ii);
+ }
+ computeUnit->srf[dispatchedWave->simdId]->
+ dispatchInstruction(ii);
+
+ std::stringstream ss;
+ for (auto id : execUnitIds) {
+ ss << id << " ";
+ }
+ DPRINTF(GPUSched, "dispatchList[%d]: SIMD[%d] WV[%d]: %d: %s"
+ " Reserving ExeRes[ %s]\n",
+ j, dispatchedWave->simdId, dispatchedWave->wfDynId,
+ ii->seqNum(), ii->disassemble(), ss.str());
+ // mark the resources as reserved for this cycle
+ for (auto execUnitId : execUnitIds) {
+ panic_if(exeUnitReservations.at(execUnitId),
+ "Execution unit %d is reserved!!!\n"
+ "SIMD[%d] WV[%d]: %d: %s",
+ execUnitId, dispatchedWave->simdId,
+ dispatchedWave->wfDynId,
+ ii->seqNum(), ii->disassemble());
+ exeUnitReservations.at(execUnitId) = true;
+ }
+
+ // If wavefront::reserveResources reserved multiple resources,
+ // then we're executing a flat memory instruction. This means
+ // that we've reserved a global and local memory unit. Thus,
+ // we need to mark the latter execution unit as not available.
+ if (execUnitIds.size() > 1) {
+ int lm_exec_unit M5_VAR_USED = dispatchedWave->localMem;
+ assert(dispatchList->at(lm_exec_unit).second == SKIP);
+ }
+ } else if (s == SKIP) {
+ // Shared Memory pipe reserved for FLAT instruction.
+ // Verify the GM pipe for this wave is ready to execute
+ // and the wave in the GM pipe is the same as the wave
+ // in the LM pipe
+ int gm_exec_unit M5_VAR_USED = dispatchedWave->globalMem;
+ assert(dispatchList->at(gm_exec_unit).first->wfDynId ==
+ dispatchedWave->wfDynId);
+ assert(dispatchList->at(gm_exec_unit).second == EXREADY);
+ }
+ }
+ }
+}
+
+void
+ScheduleStage::deleteFromSch(Wavefront *w)
+{
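+ // The wave's instruction has left the schedule stage; remove it from
+ // wavesInSch so the wave's next instruction may enter SCH.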
+ wavesInSch.erase(w->wfDynId);
}
void
ScheduleStage::regStats()
{
+ rdyListNotEmpty
+ .init(computeUnit->numExeUnits())
+ .name(name() + ".rdy_list_not_empty")
+ .desc("number of cycles one or more wave on ready list per "
+ "execution resource")
+ ;
+
+ rdyListEmpty
+ .init(computeUnit->numExeUnits())
+ .name(name() + ".rdy_list_empty")
+ .desc("number of cycles no wave on ready list per "
+ "execution resource")
+ ;
+
+ addToSchListStalls
+ .init(computeUnit->numExeUnits())
+ .name(name() + ".sch_list_add_stalls")
+ .desc("number of cycles a wave is not added to schList per "
+ "execution resource when ready list is not empty")
+ ;
+
+ schListToDispList
+ .init(computeUnit->numExeUnits())
+ .name(name() + ".sch_list_to_disp_list")
+ .desc("number of cycles a wave is added to dispatchList per "
+ "execution resource")
+ ;
+
+ schListToDispListStalls
+ .init(computeUnit->numExeUnits())
+ .name(name() + ".sch_list_to_disp_list_stalls")
+ .desc("number of cycles no wave is added to dispatchList per "
+ "execution resource")
+ ;
+
+ // Operand Readiness Stall Cycles
+ opdNrdyStalls
+ .init(SCH_RF_OPD_NRDY_CONDITIONS)
+ .name(name() + ".opd_nrdy_stalls")
+ .desc("number of stalls in SCH due to operands not ready")
+ ;
+ opdNrdyStalls.subname(SCH_VRF_OPD_NRDY, csprintf("VRF"));
+ opdNrdyStalls.subname(SCH_SRF_OPD_NRDY, csprintf("SRF"));
+ opdNrdyStalls.subname(SCH_RF_OPD_NRDY, csprintf("RF"));
+
+ // dispatchReady Stall Cycles
+ dispNrdyStalls
+ .init(SCH_NRDY_CONDITIONS)
+ .name(name() + ".disp_nrdy_stalls")
+ .desc("number of stalls in SCH due to resource not ready")
+ ;
+ dispNrdyStalls.subname(SCH_SCALAR_ALU_NRDY, csprintf("ScalarAlu"));
+ dispNrdyStalls.subname(SCH_VECTOR_ALU_NRDY, csprintf("VectorAlu"));
+ dispNrdyStalls.subname(SCH_VECTOR_MEM_ISSUE_NRDY,
+ csprintf("VectorMemIssue"));
+ dispNrdyStalls.subname(SCH_VECTOR_MEM_BUS_BUSY_NRDY,
+ csprintf("VectorMemBusBusy"));
+ dispNrdyStalls.subname(SCH_VECTOR_MEM_COALESCER_NRDY,
+ csprintf("VectorMemCoalescer"));
+ dispNrdyStalls.subname(SCH_CEDE_SIMD_NRDY, csprintf("CedeSimd"));
+ dispNrdyStalls.subname(SCH_SCALAR_MEM_ISSUE_NRDY,
+ csprintf("ScalarMemIssue"));
+ dispNrdyStalls.subname(SCH_SCALAR_MEM_BUS_BUSY_NRDY,
+ csprintf("ScalarMemBusBusy"));
+ dispNrdyStalls.subname(SCH_SCALAR_MEM_FIFO_NRDY,
+ csprintf("ScalarMemFIFO"));
+ dispNrdyStalls.subname(SCH_LOCAL_MEM_ISSUE_NRDY,
+ csprintf("LocalMemIssue"));
+ dispNrdyStalls.subname(SCH_LOCAL_MEM_BUS_BUSY_NRDY,
+ csprintf("LocalMemBusBusy"));
+ dispNrdyStalls.subname(SCH_LOCAL_MEM_FIFO_NRDY,
+ csprintf("LocalMemFIFO"));
+ dispNrdyStalls.subname(SCH_FLAT_MEM_ISSUE_NRDY,
+ csprintf("FlatMemIssue"));
+ dispNrdyStalls.subname(SCH_FLAT_MEM_BUS_BUSY_NRDY,
+ csprintf("FlatMemBusBusy"));
+ dispNrdyStalls.subname(SCH_FLAT_MEM_COALESCER_NRDY,
+ csprintf("FlatMemCoalescer"));
+ dispNrdyStalls.subname(SCH_FLAT_MEM_FIFO_NRDY,
+ csprintf("FlatMemFIFO"));
+ dispNrdyStalls.subname(SCH_RDY, csprintf("Ready"));
+
+ // RF Access Stall Cycles
+ rfAccessStalls
+ .init(SCH_RF_ACCESS_NRDY_CONDITIONS)
+ .name(name() + ".rf_access_stalls")
+ .desc("number of stalls due to RF access denied")
+ ;
+ rfAccessStalls.subname(SCH_VRF_RD_ACCESS_NRDY, csprintf("VrfRd"));
+ rfAccessStalls.subname(SCH_VRF_WR_ACCESS_NRDY, csprintf("VrfWr"));
+ rfAccessStalls.subname(SCH_SRF_RD_ACCESS_NRDY, csprintf("SrfRd"));
+ rfAccessStalls.subname(SCH_SRF_WR_ACCESS_NRDY, csprintf("SrfWr"));
+ rfAccessStalls.subname(SCH_RF_ACCESS_NRDY, csprintf("Any"));
+
+ // Stall cycles due to wave losing LDS bus arbitration
+ ldsBusArbStalls
+ .name(name() + ".lds_bus_arb_stalls")
+ .desc("number of stalls due to VRF->LDS bus conflicts")
+ ;
}
#ifndef __SCHEDULE_STAGE_HH__
#define __SCHEDULE_STAGE_HH__
+#include <deque>
+#include <unordered_map>
+#include <unordered_set>
#include <utility>
#include <vector>
class ScheduleStage
{
public:
- ScheduleStage(const ComputeUnitParams *params);
+ ScheduleStage(const ComputeUnitParams *params, ComputeUnit *cu);
~ScheduleStage();
void init(ComputeUnit *cu);
void exec();
- void arbitrate();
+
// Stats related variables and methods
std::string name() { return _name; }
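+ // Reasons an instruction in SCH may not be ready for dispatch; these
+ // values index dispNrdyStalls (SCH_RDY counts successful checks).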
+ enum SchNonRdyType {
+ SCH_SCALAR_ALU_NRDY,
+ SCH_VECTOR_ALU_NRDY,
+ SCH_VECTOR_MEM_ISSUE_NRDY,
+ SCH_VECTOR_MEM_BUS_BUSY_NRDY,
+ SCH_VECTOR_MEM_COALESCER_NRDY,
+ SCH_VECTOR_MEM_REQS_NRDY,
+ SCH_CEDE_SIMD_NRDY,
+ SCH_SCALAR_MEM_ISSUE_NRDY,
+ SCH_SCALAR_MEM_BUS_BUSY_NRDY,
+ SCH_SCALAR_MEM_FIFO_NRDY,
+ SCH_LOCAL_MEM_ISSUE_NRDY,
+ SCH_LOCAL_MEM_BUS_BUSY_NRDY,
+ SCH_LOCAL_MEM_FIFO_NRDY,
+ SCH_FLAT_MEM_ISSUE_NRDY,
+ SCH_FLAT_MEM_BUS_BUSY_NRDY,
+ SCH_FLAT_MEM_COALESCER_NRDY,
+ SCH_FLAT_MEM_REQS_NRDY,
+ SCH_FLAT_MEM_FIFO_NRDY,
+ SCH_RDY,
+ SCH_NRDY_CONDITIONS
+ };
+ enum schopdnonrdytype_e {
+ SCH_VRF_OPD_NRDY,
+ SCH_SRF_OPD_NRDY,
+ SCH_RF_OPD_NRDY,
+ SCH_RF_OPD_NRDY_CONDITIONS
+ };
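+ // Reasons an RF read/write could not be scheduled when adding a wave
+ // to schList; these values index rfAccessStalls.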
+ enum schrfaccessnonrdytype_e {
+ SCH_VRF_RD_ACCESS_NRDY,
+ SCH_VRF_WR_ACCESS_NRDY,
+ SCH_SRF_RD_ACCESS_NRDY,
+ SCH_SRF_WR_ACCESS_NRDY,
+ SCH_RF_ACCESS_NRDY,
+ SCH_RF_ACCESS_NRDY_CONDITIONS
+ };
+
void regStats();
+ // Called by ExecStage to inform SCH of instruction execution
+ void deleteFromSch(Wavefront *w);
+
+ // Schedule List status
+ enum SCH_STATUS
+ {
+ RFBUSY = 0, // RF busy reading operands
+ RFREADY, // ready for exec
+ };
+
private:
ComputeUnit *computeUnit;
- uint32_t numSIMDs;
- uint32_t numMemUnits;
-
// Each execution resource will have its own
// scheduler and a dispatch list
std::vector<Scheduler> scheduler;
- // Stores the status of waves. A READY implies the
- // wave is ready to be scheduled this cycle and
- // is already present in the readyList
- std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>*>
- waveStatusList;
-
// List of waves which will be dispatched to
- // each execution resource. A FILLED implies
- // dispatch list is non-empty and
- // execution unit has something to execute
- // this cycle. Currently, the dispatch list of
+ // each execution resource.
+ // Currently, the dispatch list of
// an execution resource can hold only one wave because
// an execution resource can execute only one wave in a cycle.
std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> *dispatchList;
+ // Stats
+
+ // Number of cycles with empty (or not empty) readyList, per execution
+ // resource, when the CU is active (not sleeping)
+ Stats::Vector rdyListEmpty;
+ Stats::Vector rdyListNotEmpty;
+
+ // Number of cycles, per execution resource, when at least one wave
+ // was on the readyList and picked by scheduler, but was unable to be
+ // added to the schList, when the CU is active (not sleeping)
+ Stats::Vector addToSchListStalls;
+
+ // Number of cycles, per execution resource, when a wave is selected
+ // as candidate for dispatchList from schList
+ // Note: may be arbitrated off dispatchList (e.g., LDS arbitration)
+ Stats::Vector schListToDispList;
+
+ // Per execution resource stat, incremented once per cycle if no wave
+ // was selected as candidate for dispatch and moved to dispatchList
+ Stats::Vector schListToDispListStalls;
+
+ // Number of times a wave is selected by the scheduler but cannot
+ // be added to the schList due to register files not being able to
+ // support reads or writes of operands. RF_ACCESS_NRDY condition is always
+ // incremented if at least one read/write is not supported; the other
+ // conditions are incremented independently of each other.
+ Stats::Vector rfAccessStalls;
+
+ // Number of times a wave is executing FLAT instruction and
+ // forces another wave occupying its required local memory resource
+ // to be deselected for execution, and placed back on schList
+ Stats::Scalar ldsBusArbStalls;
+
+ // Count of times VRF and/or SRF blocks waves on schList from
+ // performing RFBUSY->RFREADY transition
+ Stats::Vector opdNrdyStalls;
+
+ // Count of times resource required for dispatch is not ready and
+ // blocks wave in RFREADY state on schList from potentially moving
+ // to dispatchList
+ Stats::Vector dispNrdyStalls;
+
std::string _name;
+
+ // called by exec() to add a wave to schList if the RFs can support it
+ bool addToSchList(int exeType, Wavefront *w);
+ // re-insert a wave to schList if wave lost arbitration
+ // wave is inserted such that age order (oldest to youngest) is preserved
+ void reinsertToSchList(int exeType, Wavefront *w);
+ // check waves in schList to see if RF reads complete
+ void checkRfOperandReadComplete();
+ // check execution resources for readiness
+ bool vectorAluRdy;
+ bool scalarAluRdy;
+ bool scalarMemBusRdy;
+ bool scalarMemIssueRdy;
+ bool glbMemBusRdy;
+ bool glbMemIssueRdy;
+ bool locMemBusRdy;
+ bool locMemIssueRdy;
+ // check status of memory pipes and RF to Mem buses
+ void checkMemResources();
+ // resource ready check called by fillDispatchList
+ bool dispatchReady(Wavefront *w);
+ // pick waves from schList and populate dispatchList with one wave
+ // per EXE resource type
+ void fillDispatchList();
+ // arbitrate Shared Mem Pipe VRF/LDS bus for waves in dispatchList
+ void arbitrateVrfToLdsBus();
+ // schedule destination operand writes to register files for waves in
+ // dispatchList
+ void scheduleRfDestOperands();
+ // invoked by scheduleRfDestOperands to schedule RF writes for a wave
+ bool schedRfWrites(int exeType, Wavefront *w);
+ // reserve resources for waves surviving arbitration in dispatchList
+ void reserveResources();
+
+ void doDispatchListTransition(int unitId, DISPATCH_STATUS s,
+ Wavefront *w = nullptr);
+
+ // Set tracking wfDynId for each wave present in schedule stage
+ // Used to allow only one instruction per wave in schedule
+ std::unordered_set<uint64_t> wavesInSch;
+
+ // List of waves (one list per exe resource) that are in schedule
+ // stage. Waves are added to this list after being selected by the
+ // scheduler from readyList. Waves are removed from this list and placed
+ // on dispatchList once they are RFREADY and pass the dispatch readiness
+ // checks.
+ // Waves are kept ordered by age for each resource, always favoring
+ // forward progress for the oldest wave.
+ // The maximum number of waves per resource can be determined by either
+ // the VRF/SRF availability or limits imposed by parameters (to be added)
+ // of the SCH stage or CU.
+ std::vector<std::deque<std::pair<Wavefront*, SCH_STATUS>>> schList;
};
#endif // __SCHEDULE_STAGE_HH__
#include "gpu-compute/scoreboard_check_stage.hh"
+#include "debug/GPUExec.hh"
+#include "debug/GPUSched.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_static_inst.hh"
+#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/shader.hh"
+#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"
#include "params/ComputeUnit.hh"
ScoreboardCheckStage::ScoreboardCheckStage(const ComputeUnitParams *p)
- : numSIMDs(p->num_SIMDs),
- numMemUnits(p->num_global_mem_pipes + p->num_shared_mem_pipes),
- numShrMemPipes(p->num_shared_mem_pipes),
- vectorAluInstAvail(nullptr),
- lastGlbMemSimd(-1),
- lastShrMemSimd(-1), glbMemInstAvail(nullptr),
- shrMemInstAvail(nullptr)
{
}
ScoreboardCheckStage::~ScoreboardCheckStage()
{
readyList.clear();
- waveStatusList.clear();
- shrMemInstAvail = nullptr;
- glbMemInstAvail = nullptr;
}
void
computeUnit = cu;
_name = computeUnit->name() + ".ScoreboardCheckStage";
- for (int unitId = 0; unitId < numSIMDs + numMemUnits; ++unitId) {
+ for (int unitId = 0; unitId < computeUnit->numExeUnits(); ++unitId) {
readyList.push_back(&computeUnit->readyList[unitId]);
}
-
- for (int unitId = 0; unitId < numSIMDs; ++unitId) {
- waveStatusList.push_back(&computeUnit->waveStatusList[unitId]);
- }
-
- vectorAluInstAvail = &computeUnit->vectorAluInstAvail;
- glbMemInstAvail= &computeUnit->glbMemInstAvail;
- shrMemInstAvail= &computeUnit->shrMemInstAvail;
}
void
-ScoreboardCheckStage::initStatistics()
+ScoreboardCheckStage::collectStatistics(nonrdytype_e rdyStatus)
{
- lastGlbMemSimd = -1;
- lastShrMemSimd = -1;
- *glbMemInstAvail = 0;
- *shrMemInstAvail = 0;
-
- for (int unitId = 0; unitId < numSIMDs; ++unitId)
- vectorAluInstAvail->at(unitId) = false;
+ panic_if(rdyStatus == NRDY_ILLEGAL || rdyStatus >= NRDY_CONDITIONS,
+ "Instruction ready status %d is illegal!!!", rdyStatus);
+ stallCycles[rdyStatus]++;
}
-void
-ScoreboardCheckStage::collectStatistics(Wavefront *curWave, int unitId)
+// Return true if this wavefront is ready
+// to execute an instruction of the specified type.
+// It also returns the reason (in rdyStatus) if the instruction is not
+// ready. Finally it sets the execution resource type (in exesResType)
+// of the instruction, only if it ready.
+bool
+ScoreboardCheckStage::ready(Wavefront *w, nonrdytype_e *rdyStatus,
+ int *exeResType, int wfSlot)
{
- if (curWave->instructionBuffer.empty())
- return;
-
- // track which vector SIMD unit has at least one WV with a vector
- // ALU as the oldest instruction in its Instruction buffer
- vectorAluInstAvail->at(unitId) = vectorAluInstAvail->at(unitId) ||
- curWave->isOldestInstALU();
-
- // track how many vector SIMD units have at least one WV with a
- // vector Global memory instruction as the oldest instruction
- // in its Instruction buffer
- if ((curWave->isOldestInstGMem() || curWave->isOldestInstPrivMem() ||
- curWave->isOldestInstFlatMem()) && lastGlbMemSimd != unitId &&
- *glbMemInstAvail <= 1) {
- (*glbMemInstAvail)++;
- lastGlbMemSimd = unitId;
+ /**
+ * The waitCnt checks have to be done BEFORE checking for Instruction
+ * buffer empty condition. Otherwise, it will result in a deadlock if
+ * the last instruction in the Instruction buffer is a waitCnt: after
+ * executing the waitCnt, the Instruction buffer would be empty and the
+ * ready check logic will exit BEFORE checking for wait counters being
+ * satisfied.
+ */
+
+ // waitCnt instruction has been dispatched or executed: next
+ // instruction should be blocked until waitCnts are satisfied.
+ if (w->getStatus() == Wavefront::S_WAITCNT) {
+ if (!w->waitCntsSatisfied()) {
+ *rdyStatus = NRDY_WAIT_CNT;
+ return false;
+ }
+ }
+
+ // Is the wave waiting at a barrier? Check this condition BEFORE checking
+ // for instruction buffer occupancy to avoid a deadlock when the barrier is
+ // the last instruction in the instruction buffer.
+ if (w->stalledAtBarrier) {
+ if (!computeUnit->AllAtBarrier(w->barrierId,w->barrierCnt,
+ computeUnit->getRefCounter(w->dispatchId, w->wgId))) {
+ // Are all threads at barrier?
+ *rdyStatus = NRDY_BARRIER_WAIT;
+ return false;
+ }
+ w->oldBarrierCnt = w->barrierCnt;
+ w->stalledAtBarrier = false;
+ }
+
+ // Check WF status: it has to be running
+ if (w->getStatus() == Wavefront::S_STOPPED ||
+ w->getStatus() == Wavefront::S_RETURNING ||
+ w->getStatus() == Wavefront::S_STALLED) {
+ *rdyStatus = NRDY_WF_STOP;
+ return false;
+ }
+
+ // is the Instruction buffer empty
+ if (w->instructionBuffer.empty()) {
+ *rdyStatus = NRDY_IB_EMPTY;
+ return false;
+ }
+
+ // Check next instruction from instruction buffer
+ GPUDynInstPtr ii = w->nextInstr();
+ // The only instruction in the instruction buffer has already been
+ // dispatched, so there is no need to check its readiness again.
+ if (!ii) {
+ *rdyStatus = NRDY_IB_EMPTY;
+ return false;
+ }
+
+ // The following code is very error prone and the entire process for
+ // checking readiness will be fixed eventually. In the meantime, let's
+ // make sure that we do not silently let an instruction type slip
+ // through this logic and always return not ready.
+ if (!(ii->isBarrier() || ii->isNop() || ii->isReturn() || ii->isBranch() ||
+ ii->isALU() || ii->isLoad() || ii->isStore() || ii->isAtomic() ||
+ ii->isEndOfKernel() || ii->isMemSync() || ii->isFlat())) {
+ panic("next instruction: %s is of unknown type\n", ii->disassemble());
}
- // track how many vector SIMD units have at least one WV with a
- // vector shared memory (LDS) instruction as the oldest instruction
- // in its Instruction buffer
- // TODO: parametrize the limit of the LDS units
- if (curWave->isOldestInstLMem() && (*shrMemInstAvail <= numShrMemPipes) &&
- lastShrMemSimd != unitId) {
- (*shrMemInstAvail)++;
- lastShrMemSimd = unitId;
+ DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Checking Ready for Inst : %s\n",
+ computeUnit->cu_id, w->simdId, w->wfSlotId, ii->disassemble());
+
+ // Non-scalar (i.e., vector) instructions may use VGPRs
+ if (!ii->isScalar()) {
+ if (!computeUnit->vrf[w->simdId]->operandsReady(w, ii)) {
+ *rdyStatus = NRDY_VGPR_NRDY;
+ return false;
+ }
}
+ // Scalar and non-scalar instructions may use SGPR
+ if (!computeUnit->srf[w->simdId]->operandsReady(w, ii)) {
+ *rdyStatus = NRDY_SGPR_NRDY;
+ return false;
+ }
+
+ // The hardware implicitly executes S_WAITCNT 0 before executing
+ // the S_ENDPGM instruction. Implementing this implicit S_WAITCNT.
+ // isEndOfKernel() is used to identify the S_ENDPGM instruction.
+ // On identifying it, we do the following:
+ // 1. Wait for all older instructions to execute
+ // 2. Once all the older instructions have executed, we add a wait
+ // count for the executed instruction(s) to complete.
+ if (ii->isEndOfKernel()) {
+ // Waiting for older instructions to execute
+ if (w->instructionBuffer.front()->seqNum() != ii->seqNum()) {
+ *rdyStatus = NRDY_WAIT_CNT;
+ return false;
+ }
+ // Older instructions have executed, adding implicit wait count
+ w->setStatus(Wavefront::S_WAITCNT);
+ w->setWaitCnts(0, 0, 0);
+ if (!w->waitCntsSatisfied()) {
+ *rdyStatus = NRDY_WAIT_CNT;
+ return false;
+ }
+ }
+ DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Ready Inst : %s\n", computeUnit->cu_id,
+ w->simdId, w->wfSlotId, ii->disassemble());
+ *exeResType = mapWaveToExeUnit(w);
+ *rdyStatus = INST_RDY;
+ return true;
+}
+
+int
+ScoreboardCheckStage::mapWaveToExeUnit(Wavefront *w)
+{
+ GPUDynInstPtr ii = w->nextInstr();
+ assert(ii);
+ if (ii->isFlat()) {
+ /**
+ * NOTE: Flat memory ops require both GM and LM resources.
+ * The simulator models consumption of both GM and LM
+ * resources in the schedule stage. At instruction execution time,
+ * after the aperture check is performed, only the GM or LM pipe
+ * is actually reserved by the timing model. The GM unit is returned
+ * here since Flat ops occupy the GM slot in the ready and dispatch
+ * lists. They also consume the LM slot in the dispatch list.
+ */
+ return w->globalMem;
+ } else if (ii->isLocalMem()) {
+ return w->localMem;
+ } else if (ii->isGlobalMem()) {
+ if (!ii->isScalar()) {
+ return w->globalMem;
+ } else {
+ return w->scalarMem;
+ }
+ } else if (ii->isBranch() ||
+ ii->isALU() ||
+ (ii->isKernArgSeg() && ii->isLoad()) ||
+ ii->isArgSeg() ||
+ ii->isReturn() ||
+ ii->isEndOfKernel() ||
+ ii->isNop() ||
+ ii->isBarrier()) {
+ if (!ii->isScalar()) {
+ return w->simdId;
+ } else {
+ return w->scalarAluGlobalIdx;
+ }
+ }
+ panic("%s: unmapped to an execution resource", ii->disassemble());
+ return computeUnit->numExeUnits();
}
void
ScoreboardCheckStage::exec()
{
- initStatistics();
-
// reset the ready list for all execution units; it will be
// constructed every cycle since resource availability may change
- for (int unitId = 0; unitId < numSIMDs + numMemUnits; ++unitId) {
+ for (int unitId = 0; unitId < computeUnit->numExeUnits(); ++unitId) {
+ // Reset wavefront pointers to nullptr so clear() on the vector
+ // does not accidentally destruct the wavefront object
+ for (int i = 0; i < readyList[unitId]->size(); i++) {
+ readyList[unitId]->at(i) = nullptr;
+ }
readyList[unitId]->clear();
}
-
- // iterate over the Wavefronts of all SIMD units
- for (int unitId = 0; unitId < numSIMDs; ++unitId) {
- for (int wvId = 0; wvId < computeUnit->shader->n_wf; ++wvId) {
+ // iterate over all WF slots across all vector ALUs
+ for (int simdId = 0; simdId < computeUnit->numVectorALUs; ++simdId) {
+ for (int wfSlot = 0; wfSlot < computeUnit->shader->n_wf; ++wfSlot) {
// reset the ready status of each wavefront
- waveStatusList[unitId]->at(wvId).second = BLOCKED;
- Wavefront *curWave = waveStatusList[unitId]->at(wvId).first;
- collectStatistics(curWave, unitId);
-
- if (curWave->ready(Wavefront::I_ALU)) {
- readyList[unitId]->push_back(curWave);
- waveStatusList[unitId]->at(wvId).second = READY;
- } else if (curWave->ready(Wavefront::I_GLOBAL)) {
- if (computeUnit->cedeSIMD(unitId, wvId)) {
- continue;
- }
-
- readyList[computeUnit->GlbMemUnitId()]->push_back(curWave);
- waveStatusList[unitId]->at(wvId).second = READY;
- } else if (curWave->ready(Wavefront::I_SHARED)) {
- readyList[computeUnit->ShrMemUnitId()]->push_back(curWave);
- waveStatusList[unitId]->at(wvId).second = READY;
- } else if (curWave->ready(Wavefront::I_FLAT)) {
- readyList[computeUnit->GlbMemUnitId()]->push_back(curWave);
- waveStatusList[unitId]->at(wvId).second = READY;
- } else if (curWave->ready(Wavefront::I_PRIVATE)) {
- readyList[computeUnit->GlbMemUnitId()]->push_back(curWave);
- waveStatusList[unitId]->at(wvId).second = READY;
+ Wavefront *curWave = computeUnit->wfList[simdId][wfSlot];
+ nonrdytype_e rdyStatus = NRDY_ILLEGAL;
+ int exeResType = -1;
+ // check WF readiness: If the WF's oldest
+ // instruction is ready to issue then add the WF to the ready list
+ if (ready(curWave, &rdyStatus, &exeResType, wfSlot)) {
+ assert(curWave->simdId == simdId);
+ DPRINTF(GPUSched,
+ "Adding to readyList[%d]: SIMD[%d] WV[%d]: %d: %s\n",
+ exeResType,
+ curWave->simdId, curWave->wfDynId,
+ curWave->nextInstr()->seqNum(),
+ curWave->nextInstr()->disassemble());
+ readyList.at(exeResType)->push_back(curWave);
}
+ collectStatistics(rdyStatus);
}
}
}
void
ScoreboardCheckStage::regStats()
{
+ stallCycles
+ .init(NRDY_CONDITIONS)
+ .name(name() + ".stall_cycles")
+ .desc("number of cycles wave stalled in SCB")
+ ;
+ stallCycles.subname(NRDY_WF_STOP, csprintf("WFStop"));
+ stallCycles.subname(NRDY_IB_EMPTY, csprintf("IBEmpty"));
+ stallCycles.subname(NRDY_WAIT_CNT, csprintf("WaitCnt"));
+ stallCycles.subname(NRDY_BARRIER_WAIT, csprintf("BarrierWait"));
+ stallCycles.subname(NRDY_VGPR_NRDY, csprintf("VgprBusy"));
+ stallCycles.subname(NRDY_SGPR_NRDY, csprintf("SgprBusy"));
+ stallCycles.subname(INST_RDY, csprintf("InstrReady"));
}
#include <cstdint>
#include <string>
+#include <unordered_map>
#include <utility>
#include <vector>
+#include "sim/stats.hh"
+
class ComputeUnit;
class Wavefront;
struct ComputeUnitParams;
-enum WAVE_STATUS
-{
- BLOCKED = 0,
- READY
-};
-
/*
* Scoreboard check stage.
* All wavefronts are analyzed to see if they are ready
class ScoreboardCheckStage
{
public:
+ enum nonrdytype_e {
+ NRDY_ILLEGAL,
+ NRDY_WF_STOP,
+ NRDY_IB_EMPTY,
+ NRDY_WAIT_CNT,
+ NRDY_BARRIER_WAIT,
+ NRDY_VGPR_NRDY,
+ NRDY_SGPR_NRDY,
+ INST_RDY,
+ NRDY_CONDITIONS
+ };
+
ScoreboardCheckStage(const ComputeUnitParams* params);
~ScoreboardCheckStage();
void init(ComputeUnit *cu);
void regStats();
private:
- void collectStatistics(Wavefront *curWave, int unitId);
- void initStatistics();
+ void collectStatistics(nonrdytype_e rdyStatus);
+ int mapWaveToExeUnit(Wavefront *w);
+ bool ready(Wavefront *w, nonrdytype_e *rdyStatus,
+ int *exeResType, int wfSlot);
ComputeUnit *computeUnit;
- uint32_t numSIMDs;
- uint32_t numMemUnits;
- uint32_t numShrMemPipes;
-
- // flag per vector SIMD unit that is set when there is at least one
- // WF that has a vector ALU instruction as the oldest in its
- // Instruction Buffer
- std::vector<bool> *vectorAluInstAvail;
- int lastGlbMemSimd;
- int lastShrMemSimd;
- int *glbMemInstAvail;
- int *shrMemInstAvail;
// List of waves which are ready to be scheduled.
// Each execution resource has a ready list
std::vector<std::vector<Wavefront*>*> readyList;
- // Stores the status of waves. A READY implies the
- // wave is ready to be scheduled this cycle and
- // is already present in the readyList
- std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>*>
- waveStatusList;
+ // Stats
+ Stats::Vector stallCycles;
std::string _name;
};
#include "base/chunk_generator.hh"
#include "debug/GPUDisp.hh"
#include "debug/GPUMem.hh"
-#include "debug/HSAIL.hh"
+#include "debug/GPUShader.hh"
+#include "debug/GPUWgLatency.hh"
#include "gpu-compute/dispatcher.hh"
+#include "gpu-compute/gpu_command_processor.hh"
#include "gpu-compute/gpu_static_inst.hh"
-#include "gpu-compute/qstruct.hh"
+#include "gpu-compute/hsa_queue_entry.hh"
#include "gpu-compute/wavefront.hh"
#include "mem/packet.hh"
#include "mem/ruby/system/RubySystem.hh"
#include "sim/sim_exit.hh"
-Shader::Shader(const Params *p)
- : ClockedObject(p), clock(p->clk_domain->clockPeriod()),
- cpuThread(nullptr), gpuTc(nullptr), cpuPointer(p->cpu_pointer),
- tickEvent([this]{ processTick(); }, "Shader tick",
- false, Event::CPU_Tick_Pri),
- timingSim(p->timing), hsail_mode(SIMT),
- impl_kern_boundary_sync(p->impl_kern_boundary_sync),
- separate_acquire_release(p->separate_acquire_release), coissue_return(1),
- trace_vgpr_all(1), n_cu((p->CUs).size()), n_wf(p->n_wf),
- globalMemSize(p->globalmem), nextSchedCu(0), sa_n(0), tick_cnt(0),
- box_tick_cnt(0), start_tick_cnt(0)
+Shader::Shader(const Params *p) : ClockedObject(p),
+ _activeCus(0), _lastInactiveTick(0), cpuThread(nullptr),
+ gpuTc(nullptr), cpuPointer(p->cpu_pointer),
+ tickEvent([this]{ execScheduledAdds(); }, "Shader scheduled adds event",
+ false, Event::CPU_Tick_Pri),
+ timingSim(p->timing), hsail_mode(SIMT),
+ impl_kern_boundary_sync(p->impl_kern_boundary_sync),
+ coissue_return(1),
+ trace_vgpr_all(1), n_cu((p->CUs).size()), n_wf(p->n_wf),
+ globalMemSize(p->globalmem),
+ nextSchedCu(0), sa_n(0), gpuCmdProc(*p->gpu_cmd_proc),
+ _dispatcher(*p->dispatcher),
+ max_valu_insts(p->max_valu_insts), total_valu_insts(0)
{
+ gpuCmdProc.setShader(this);
+ _dispatcher.setShader(this);
+
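+ // Set up the GPUVM, LDS, and scratch apertures. Each base is placed in
+ // the high canonical address range (bit 61 set), and each limit is
+ // derived by masking the base and filling in the low-order bits.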
+ _gpuVmApe.base = ((Addr)1 << 61) + 0x1000000000000L;
+ _gpuVmApe.limit = (_gpuVmApe.base & 0xFFFFFF0000000000UL) | 0xFFFFFFFFFFL;
+
+ _ldsApe.base = ((Addr)1 << 61) + 0x0;
+ _ldsApe.limit = (_ldsApe.base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;
+
+ _scratchApe.base = ((Addr)1 << 61) + 0x100000000L;
+ _scratchApe.limit = (_scratchApe.base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;
+
+ shHiddenPrivateBaseVmid = 0;
cuList.resize(n_cu);
+ panic_if(n_wf <= 0, "Must have at least 1 WF Slot per SIMD");
+
for (int i = 0; i < n_cu; ++i) {
cuList[i] = p->CUs[i];
assert(i == cuList[i]->cu_id);
cuList[i]->shader = this;
+ cuList[i]->idleCUTimeout = p->idlecu_timeout;
}
}
+GPUDispatcher&
+Shader::dispatcher()
+{
+ return _dispatcher;
+}
+
Addr
Shader::mmap(int length)
{
auto mem_state = proc->memState;
if (proc->mmapGrowsDown()) {
- DPRINTF(HSAIL, "GROWS DOWN");
+ DPRINTF(GPUShader, "GROWS DOWN");
start = mem_state->getMmapEnd() - length;
mem_state->setMmapEnd(start);
} else {
- DPRINTF(HSAIL, "GROWS UP");
+ DPRINTF(GPUShader, "GROWS UP");
start = mem_state->getMmapEnd();
mem_state->setMmapEnd(start + length);
mem_state->getMmapEnd());
}
- DPRINTF(HSAIL,"Shader::mmap start= %#x, %#x\n", start, length);
+ DPRINTF(GPUShader, "Shader::mmap start= %#x, %#x\n", start, length);
proc->allocateMem(start, length);
}
void
-Shader::exec()
+Shader::execScheduledAdds()
{
- tick_cnt = curTick();
- box_tick_cnt = curTick() - start_tick_cnt;
+ assert(!sa_when.empty());
// apply any scheduled adds
for (int i = 0; i < sa_n; ++i) {
- if (sa_when[i] <= tick_cnt) {
+ if (sa_when[i] <= curTick()) {
*sa_val[i] += sa_x[i];
+ panic_if(*sa_val[i] < 0, "Negative counter value\n");
sa_val.erase(sa_val.begin() + i);
sa_x.erase(sa_x.begin() + i);
sa_when.erase(sa_when.begin() + i);
--i;
}
}
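+ // If adds are still pending, schedule the next shader wakeup based on
+ // the remaining entries in sa_when; otherwise the shader stays asleep
+ // until a new add is scheduled.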
+ if (!sa_when.empty()) {
+ Tick shader_wakeup = *std::max_element(sa_when.begin(),
+ sa_when.end());
+ DPRINTF(GPUDisp, "Scheduling shader wakeup at %lu\n", shader_wakeup);
+ schedule(tickEvent, shader_wakeup);
+ } else {
+ DPRINTF(GPUDisp, "sa_when empty, shader going to sleep!\n");
+ }
+}
+
+/*
+ * dispatcher/shader arranges invalidate requests to the CUs
+ */
+void
+Shader::prepareInvalidate(HSAQueueEntry *task)
+{
+ // if invalidate has already started/finished, then do nothing
+ if (task->isInvStarted()) return;
+
+ // invalidate has never started; it is performed only once, at kernel launch
+ assert(task->outstandingInvs() == -1);
+ int kernId = task->dispatchId();
+ // counter value is 0 now, indicating the inv is about to start
+ _dispatcher.updateInvCounter(kernId, +1);
+
+ // iterate all cus managed by the shader, to perform invalidate.
+ for (int i_cu = 0; i_cu < n_cu; ++i_cu) {
+ // create a request to hold INV info; the request's fields will
+ // be updated in cu before use
+ auto req = std::make_shared<Request>(0, 0, 0,
+ cuList[i_cu]->masterId(),
+ 0, -1);
+
+ _dispatcher.updateInvCounter(kernId, +1);
+ // all necessary INV flags are all set now, call cu to execute
+ cuList[i_cu]->doInvalidate(req, task->dispatchId());
+ }
+}
- // clock all of the cu's
- for (int i = 0; i < n_cu; ++i)
- cuList[i]->exec();
+/**
+ * dispatcher/shader arranges flush requests to the CUs
+ */
+void
+Shader::prepareFlush(GPUDynInstPtr gpuDynInst)
+{
+ int kernId = gpuDynInst->kern_id;
+ // flush has never been started, performed only once at kernel end
+ assert(_dispatcher.getOutstandingWbs(kernId) == 0);
+
+ // iterate all cus, managed by the shader, to perform flush.
+ for (int i_cu = 0; i_cu < n_cu; ++i_cu) {
+ _dispatcher.updateWbCounter(kernId, +1);
+ cuList[i_cu]->doFlush(gpuDynInst);
+ }
}
bool
-Shader::dispatch_workgroups(NDRange *ndr)
+Shader::dispatchWorkgroups(HSAQueueEntry *task)
{
bool scheduledSomething = false;
int cuCount = 0;
// dispatch workgroup iff the following two conditions are met:
// (a) wg_rem is true - there are unassigned workgroups in the grid
// (b) there are enough free slots in cu cuList[i] for this wg
- if (ndr->wg_disp_rem && cuList[curCu]->ReadyWorkgroup(ndr)) {
+ if (!task->dispComplete() && cuList[curCu]->hasDispResources(task)) {
scheduledSomething = true;
- DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d\n", curCu);
-
- // ticks() member function translates cycles to simulation ticks.
- if (!tickEvent.scheduled()) {
- schedule(tickEvent, curTick() + this->ticks(1));
+ DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d: WG %d\n",
+ curCu, task->globalWgId());
+ DPRINTF(GPUWgLatency, "WG Begin cycle:%d wg:%d cu:%d\n",
+ curTick(), task->globalWgId(), curCu);
+
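+ // An idle CU is about to become active: bump the active-CU count and,
+ // if this is the first active CU, mark the start of the active period
+ // used to accumulate shaderActiveTicks.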
+ if (!cuList[curCu]->tickEvent.scheduled()) {
+ if (!_activeCus)
+ _lastInactiveTick = curTick();
+ _activeCus++;
}
- cuList[curCu]->StartWorkgroup(ndr);
- ndr->wgId[0]++;
- ndr->globalWgId++;
- if (ndr->wgId[0] * ndr->q.wgSize[0] >= ndr->q.gdSize[0]) {
- ndr->wgId[0] = 0;
- ndr->wgId[1]++;
-
- if (ndr->wgId[1] * ndr->q.wgSize[1] >= ndr->q.gdSize[1]) {
- ndr->wgId[1] = 0;
- ndr->wgId[2]++;
-
- if (ndr->wgId[2] * ndr->q.wgSize[2] >= ndr->q.gdSize[2]) {
- ndr->wg_disp_rem = false;
- break;
- }
- }
- }
+ panic_if(_activeCus <= 0 || _activeCus > cuList.size(),
+ "Invalid activeCu size\n");
+ cuList[curCu]->dispWorkgroup(task);
+
+ task->markWgDispatch();
}
++cuCount;
}
void
-Shader::handshake(GpuDispatcher *_dispatcher)
+Shader::regStats()
{
- dispatcher = _dispatcher;
+ ClockedObject::regStats();
+
+ shaderActiveTicks
+ .name(name() + ".shader_active_ticks")
+ .desc("Total ticks that any CU attached to this shader is active")
+ ;
+ allLatencyDist
+ .init(0, 1600000, 10000)
+ .name(name() + ".allLatencyDist")
+ .desc("delay distribution for all")
+ .flags(Stats::pdf | Stats::oneline);
+
+ loadLatencyDist
+ .init(0, 1600000, 10000)
+ .name(name() + ".loadLatencyDist")
+ .desc("delay distribution for loads")
+ .flags(Stats::pdf | Stats::oneline);
+
+ storeLatencyDist
+ .init(0, 1600000, 10000)
+ .name(name() + ".storeLatencyDist")
+ .desc("delay distribution for stores")
+ .flags(Stats::pdf | Stats::oneline);
+
+ vectorInstSrcOperand
+ .init(4)
+ .name(name() + ".vec_inst_src_operand")
+ .desc("vector instruction source operand distribution");
+
+ vectorInstDstOperand
+ .init(4)
+ .name(name() + ".vec_inst_dst_operand")
+ .desc("vector instruction destination operand distribution");
+
+ initToCoalesceLatency
+ .init(0, 1600000, 10000)
+ .name(name() + ".initToCoalesceLatency")
+ .desc("Ticks from vmem inst initiateAcc to coalescer issue")
+ .flags(Stats::pdf | Stats::oneline);
+
+ rubyNetworkLatency
+ .init(0, 1600000, 10000)
+ .name(name() + ".rubyNetworkLatency")
+ .desc("Ticks from coalescer issue to coalescer hit callback")
+ .flags(Stats::pdf | Stats::oneline);
+
+ gmEnqueueLatency
+ .init(0, 1600000, 10000)
+ .name(name() + ".gmEnqueueLatency")
+ .desc("Ticks from coalescer hit callback to GM pipe enqueue")
+ .flags(Stats::pdf | Stats::oneline);
+
+ gmToCompleteLatency
+ .init(0, 1600000, 10000)
+ .name(name() + ".gmToCompleteLatency")
+ .desc("Ticks queued in GM pipes ordered response buffer")
+ .flags(Stats::pdf | Stats::oneline);
+
+ coalsrLineAddresses
+ .init(0, 20, 1)
+ .name(name() + ".coalsrLineAddresses")
+ .desc("Number of cache lines for coalesced request")
+ .flags(Stats::pdf | Stats::oneline);
+
+ int wfSize = cuList[0]->wfSize();
+ cacheBlockRoundTrip = new Stats::Distribution[wfSize];
+ for (int idx = 0; idx < wfSize; ++idx) {
+ std::stringstream namestr;
+ ccprintf(namestr, "%s.cacheBlockRoundTrip%d", name(), idx);
+ cacheBlockRoundTrip[idx]
+ .init(0, 1600000, 10000)
+ .name(namestr.str())
+ .desc("Coalsr-to-coalsr time for the Nth cache block in an inst")
+ .flags(Stats::pdf | Stats::oneline);
+ }
}
void
RequestPtr req1, req2;
req->splitOnVaddr(split_addr, req1, req2);
-
PacketPtr pkt1 = new Packet(req2, cmd);
PacketPtr pkt2 = new Packet(req1, cmd);
}
}
-bool
-Shader::busy()
-{
- for (int i_cu = 0; i_cu < n_cu; ++i_cu) {
- if (!cuList[i_cu]->isDone()) {
- return true;
- }
- }
-
- return false;
-}
-
void
-Shader::ScheduleAdd(uint32_t *val,Tick when,int x)
+Shader::ScheduleAdd(int *val, Tick when, int x)
{
sa_val.push_back(val);
- sa_when.push_back(tick_cnt + when);
+ when += curTick();
+ sa_when.push_back(when);
sa_x.push_back(x);
++sa_n;
-}
-
-
-void
-Shader::processTick()
-{
- if (busy()) {
- exec();
- schedule(tickEvent, curTick() + ticks(1));
+ if (!tickEvent.scheduled() || (when < tickEvent.when())) {
+ DPRINTF(GPUDisp, "New scheduled add; scheduling shader wakeup at "
+ "%lu\n", when);
+ reschedule(tickEvent, when, true);
+ } else {
+ assert(tickEvent.scheduled());
+ DPRINTF(GPUDisp, "New scheduled add; wakeup already scheduled at "
+ "%lu\n", when);
}
}
Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
bool suppress_func_errors)
{
- AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, suppress_func_errors);
+ AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq,
+ suppress_func_errors);
}
void
pkt->senderState =
new TheISA::GpuTLB::TranslationState(mode, gpuTc, false);
- if (cu_id == n_cu) {
- dispatcher->tlbPort->sendFunctional(pkt);
- } else {
- // even when the perLaneTLB flag is turned on
- // it's ok tp send all accesses through lane 0
- // since the lane # is not known here,
- // This isn't important since these are functional accesses.
- cuList[cu_id]->tlbPort[0]->sendFunctional(pkt);
- }
+ // even when the perLaneTLB flag is turned on
+ // it's ok to send all accesses through lane 0
+ // since the lane # is not known here,
+ // This isn't important since these are functional accesses.
+ cuList[cu_id]->tlbPort[0]->sendFunctional(pkt);
/* safe_cast the senderState */
TheISA::GpuTLB::TranslationState *sender_state =
delete sender_state->tlbEntry;
delete pkt->senderState;
}
+
+/*
+ * allow the shader to sample stats from constituent devices
+ */
+void
+Shader::sampleStore(const Tick accessTime)
+{
+ storeLatencyDist.sample(accessTime);
+ allLatencyDist.sample(accessTime);
+}
+
+/*
+ * allow the shader to sample stats from constituent devices
+ */
+void
+Shader::sampleLoad(const Tick accessTime)
+{
+ loadLatencyDist.sample(accessTime);
+ allLatencyDist.sample(accessTime);
+}
+
+void
+Shader::sampleInstRoundTrip(std::vector<Tick> roundTripTime)
+{
+ // Only sample instructions that go all the way to main memory
+ if (roundTripTime.size() != InstMemoryHop::InstMemoryHopMax) {
+ return;
+ }
+
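+ // The recorded timestamps correspond, in order, to: initiateAcc,
+ // coalescer issue, coalescer hit callback, GM pipe enqueue, and
+ // completion of the request.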
+ Tick t1 = roundTripTime[0];
+ Tick t2 = roundTripTime[1];
+ Tick t3 = roundTripTime[2];
+ Tick t4 = roundTripTime[3];
+ Tick t5 = roundTripTime[4];
+
+ initToCoalesceLatency.sample(t2-t1);
+ rubyNetworkLatency.sample(t3-t2);
+ gmEnqueueLatency.sample(t4-t3);
+ gmToCompleteLatency.sample(t5-t4);
+}
+
+void
+Shader::sampleLineRoundTrip(const std::map<Addr, std::vector<Tick>>& lineMap)
+{
+ coalsrLineAddresses.sample(lineMap.size());
+ std::vector<Tick> netTimes;
+
+ // For each cache block address generated by a vmem inst, calculate
+ // the round-trip time for that cache block.
+ for (auto& it : lineMap) {
+ const std::vector<Tick>& timeVec = it.second;
+ if (timeVec.size() == 2) {
+ netTimes.push_back(timeVec[1] - timeVec[0]);
+ }
+ }
+
+ // Sort the cache block round trip times so that the first
+ // distribution always measures the fastest and the last
+ // distribution always measures the slowest cache block.
+ std::sort(netTimes.begin(), netTimes.end());
+
+ // Sample the round trip time of the Nth-fastest cache block into
+ // the Nth distribution.
+ int idx = 0;
+ for (auto& time : netTimes) {
+ cacheBlockRoundTrip[idx].sample(time);
+ ++idx;
+ }
+}
+
+void
+Shader::notifyCuSleep()
+{
+ // If all CUs attached to this shader are asleep, update shaderActiveTicks
+ panic_if(_activeCus <= 0 || _activeCus > cuList.size(),
+ "Invalid activeCu size\n");
+ _activeCus--;
+ if (!_activeCus)
+ shaderActiveTicks += curTick() - _lastInactiveTick;
+}
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
- * 3. Neither the name of the copyright holder nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
- * Author: Steve Reinhardt
+ * Authors: Steve Reinhardt
*/
#ifndef __SHADER_HH__
#include "cpu/simple_thread.hh"
#include "cpu/thread_context.hh"
#include "cpu/thread_state.hh"
-#include "enums/MemType.hh"
#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/gpu_tlb.hh"
+#include "gpu-compute/hsa_queue_entry.hh"
#include "gpu-compute/lds_state.hh"
-#include "gpu-compute/qstruct.hh"
#include "mem/page_table.hh"
#include "mem/port.hh"
#include "mem/request.hh"
#include "sim/sim_object.hh"
class BaseTLB;
-class GpuDispatcher;
+class GPUCommandProcessor;
+class GPUDispatcher;
namespace TheISA
{
static const int LDS_SIZE = 65536;
+// Aperture (APE) registers define the base/limit
+// pair for the ATC-mapped memory space. Currently
+// the only APEs we consider are for GPUVM/LDS/scratch.
+// The APEs are assigned unique values on a
+// per-device basis.
+struct ApertureRegister
+{
+ Addr base;
+ Addr limit;
+};
+
// Class Shader: This describes a single shader instance. Most
// configurations will only have a single shader.
class Shader : public ClockedObject
{
- protected:
- // Shader's clock period in terms of number of ticks of curTime,
- // aka global simulation clock
- Tick clock;
+ private:
+ ApertureRegister _gpuVmApe;
+ ApertureRegister _ldsApe;
+ ApertureRegister _scratchApe;
+ Addr shHiddenPrivateBaseVmid;
+
+ // Number of active Cus attached to this shader
+ int _activeCus;
+
+ // Last tick that all CUs attached to this shader were inactive
+ Tick _lastInactiveTick;
+
+ // some stats for measuring latency
+ Stats::Distribution allLatencyDist;
+ Stats::Distribution loadLatencyDist;
+ Stats::Distribution storeLatencyDist;
+
+ // average ticks from vmem inst initiateAcc to coalescer issue,
+ // average ticks from coalescer issue to coalescer hit callback,
+ // average ticks from coalescer hit callback to GM pipe enqueue,
+ // and average ticks spent in GM pipe's ordered resp buffer.
+ Stats::Distribution initToCoalesceLatency;
+ Stats::Distribution rubyNetworkLatency;
+ Stats::Distribution gmEnqueueLatency;
+ Stats::Distribution gmToCompleteLatency;
+
+ // average number of cache blocks requested by vmem inst, and
+ // average ticks for cache blocks to main memory for the Nth
+ // cache block generated by a vmem inst.
+ Stats::Distribution coalsrLineAddresses;
+ Stats::Distribution *cacheBlockRoundTrip;
public:
typedef ShaderParams Params;
enum hsail_mode_e {SIMT,VECTOR_SCALAR};
- // clock related functions ; maps to-and-from
- // Simulation ticks and shader clocks.
- Tick frequency() const { return SimClock::Frequency / clock; }
-
- Tick ticks(int numCycles) const { return (Tick)clock * numCycles; }
-
- Tick getClock() const { return clock; }
- Tick curCycle() const { return curTick() / clock; }
- Tick tickToCycles(Tick val) const { return val / clock;}
-
+ GPUDispatcher &dispatcher();
+ void sampleLoad(const Tick accessTime);
+ void sampleStore(const Tick accessTime);
+ void sampleInstRoundTrip(std::vector<Tick> roundTripTime);
+ void sampleLineRoundTrip(const std::map<Addr,
+ std::vector<Tick>> &roundTripTime);
SimpleThread *cpuThread;
ThreadContext *gpuTc;
BaseCPU *cpuPointer;
- void processTick();
+ const ApertureRegister&
+ gpuVmApe() const
+ {
+ return _gpuVmApe;
+ }
+
+ const ApertureRegister&
+ ldsApe() const
+ {
+ return _ldsApe;
+ }
+
+ const ApertureRegister&
+ scratchApe() const
+ {
+ return _scratchApe;
+ }
+
+ bool
+ isGpuVmApe(Addr addr) const
+ {
+ bool is_gpu_vm = addr >= _gpuVmApe.base && addr <= _gpuVmApe.limit;
+
+ return is_gpu_vm;
+ }
+
+ bool
+ isLdsApe(Addr addr) const
+ {
+ bool is_lds = addr >= _ldsApe.base && addr <= _ldsApe.limit;
+
+ return is_lds;
+ }
+
+ bool
+ isScratchApe(Addr addr) const
+ {
+ bool is_scratch
+ = addr >= _scratchApe.base && addr <= _scratchApe.limit;
+
+ return is_scratch;
+ }
+
+ Addr
+ getScratchBase()
+ {
+ return _scratchApe.base;
+ }
+
+ Addr
+ getHiddenPrivateBase()
+ {
+ return shHiddenPrivateBaseVmid;
+ }
+
+ void
+ initShHiddenPrivateBase(Addr queueBase, uint32_t offset)
+ {
+ Addr sh_hidden_base_new = queueBase - offset;
+
+ // We are initializing sh_hidden_private_base_vmid from the
+ // amd queue descriptor from the first queue.
+ // The sh_hidden_private_base_vmid is supposed to be the same for
+ // all the queues from the same process
+ if (shHiddenPrivateBaseVmid != sh_hidden_base_new) {
+ // Do not panic if shHiddenPrivateBaseVmid == 0,
+ // that is if it is uninitialized. Panic only
+ // if the value is initialized and we get
+ // a different base later.
+ panic_if(shHiddenPrivateBaseVmid != 0,
+ "Currently we support only single process\n");
+ }
+ shHiddenPrivateBaseVmid = sh_hidden_base_new;
+ }
+
EventFunctionWrapper tickEvent;
// is this simulation going to be timing mode in the memory?
// If set, issue acq packet @ kernel launch
int impl_kern_boundary_sync;
- // If set, generate a separate packet for acquire/release on
- // ld_acquire/st_release/atomic operations
- int separate_acquire_release;
// If set, fetch returns may be coissued with instructions
int coissue_return;
// If set, always dump all 64 gprs to trace
int trace_vgpr_all;
// Number of cu units in the shader
int n_cu;
- // Number of wavefront slots per cu
+ // Number of wavefront slots per SIMD per CU
int n_wf;
+
// The size of global memory
int globalMemSize;
- /*
- * Bytes/work-item for call instruction
- * The number of arguments for an hsail function will
- * vary. We simply determine the maximum # of arguments
- * required by any hsail function up front before the
- * simulation (during parsing of the Brig) and record
- * that number here.
- */
- int funcargs_size;
-
// Tracks CU that rr dispatcher should attempt scheduling
int nextSchedCu;
uint32_t sa_n;
// Pointer to value to be increments
- std::vector<uint32_t*> sa_val;
+ std::vector<int*> sa_val;
// When to do the increment
std::vector<uint64_t> sa_when;
// Amount to increment by
// List of Compute Units (CU's)
std::vector<ComputeUnit*> cuList;
- uint64_t tick_cnt;
- uint64_t box_tick_cnt;
- uint64_t start_tick_cnt;
+ GPUCommandProcessor &gpuCmdProc;
+ GPUDispatcher &_dispatcher;
+
+ /**
+ * Statistics
+ */
+ Stats::Scalar shaderActiveTicks;
+ Stats::Vector vectorInstSrcOperand;
+ Stats::Vector vectorInstDstOperand;
+ void regStats();
- GpuDispatcher *dispatcher;
+ int max_valu_insts;
+ int total_valu_insts;
Shader(const Params *p);
~Shader();
virtual void init();
- // Run shader
- void exec();
-
- // Check to see if shader is busy
- bool busy();
+ // Run shader scheduled adds
+ void execScheduledAdds();
// Schedule a 32-bit value to be incremented some time in the future
- void ScheduleAdd(uint32_t *val, Tick when, int x);
+ void ScheduleAdd(int *val, Tick when, int x);
bool processTimingPacket(PacketPtr pkt);
void AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
cuList[cu_id] = compute_unit;
}
- void handshake(GpuDispatcher *dispatcher);
- bool dispatch_workgroups(NDRange *ndr);
+ void prepareInvalidate(HSAQueueEntry *task);
+ void prepareFlush(GPUDynInstPtr gpuDynInst);
+
+ bool dispatchWorkgroups(HSAQueueEntry *task);
Addr mmap(int length);
void functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode);
void updateContext(int cid);
void hostWakeUp(BaseCPU *cpu);
+ void notifyCuSleep();
};
#endif // __SHADER_HH__
#include "base/logging.hh"
+SimplePoolManager *
+SimplePoolManagerParams::create()
+{
+ return new SimplePoolManager(this);
+}
+
// return the min number of elements that the manager can reserve given
// a request for "size" elements
uint32_t
bool
SimplePoolManager::canAllocate(uint32_t numRegions, uint32_t size)
{
- assert(numRegions * minAllocatedElements(size) <= poolSize());
-
return _reservedGroups == 0;
}
#include <cstdint>
#include "gpu-compute/pool_manager.hh"
+#include "params/SimplePoolManager.hh"
// Simple Pool Manager: allows one region per pool. No region merging is
// supported.
class SimplePoolManager : public PoolManager
{
public:
- SimplePoolManager(uint32_t minAlloc, uint32_t poolSize)
- : PoolManager(minAlloc, poolSize), _regionSize(0), _nxtFreeIdx(0),
+ SimplePoolManager(const PoolManagerParams *p)
+ : PoolManager(p), _regionSize(0), _nxtFreeIdx(0),
_reservedGroups(0)
{
}
// be reserved)
uint32_t _regionSize;
// next index to allocate a region
- uint8_t _nxtFreeIdx;
+ int _nxtFreeIdx;
// number of groups that reserve a region
uint32_t _reservedGroups;
};
--- /dev/null
+/*
+ * Copyright (c) 2016 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Mark Wyse
+ */
+
+#include "gpu-compute/static_register_manager_policy.hh"
+
+#include "config/the_gpu_isa.hh"
+#include "debug/GPURename.hh"
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/pool_manager.hh"
+#include "gpu-compute/scalar_register_file.hh"
+#include "gpu-compute/shader.hh"
+#include "gpu-compute/vector_register_file.hh"
+#include "gpu-compute/wavefront.hh"
+
+StaticRegisterManagerPolicy::StaticRegisterManagerPolicy()
+{
+}
+
+void
+StaticRegisterManagerPolicy::exec()
+{
+}
+
+int
+StaticRegisterManagerPolicy::mapVgpr(Wavefront* w, int vgprIndex)
+{
+ panic_if((vgprIndex >= w->reservedVectorRegs)
+ || (w->reservedVectorRegs < 0),
+ "VGPR index %d is out of range: VGPR range=[0,%d]",
+ vgprIndex, w->reservedVectorRegs);
+
+ // add the offset from where the VGPRs of the wavefront have been assigned
+ int physicalVgprIndex = w->startVgprIndex + vgprIndex;
+
+ panic_if(!((w->startVgprIndex <= physicalVgprIndex) &&
+ (w->startVgprIndex + w->reservedVectorRegs - 1)
+ >= physicalVgprIndex),
+ "Invalid VGPR index %d\n", physicalVgprIndex);
+
+ // calculate physical VGPR index
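+ // (the wavefront's allocated region may wrap past the end of the VRF,
+ // so take the index modulo the physical register file size)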
+ return physicalVgprIndex % w->computeUnit->vrf[w->simdId]->numRegs();
+}
+
+int
+StaticRegisterManagerPolicy::mapSgpr(Wavefront* w, int sgprIndex)
+{
+ panic_if(!((sgprIndex < w->reservedScalarRegs)
+ && (w->reservedScalarRegs > 0)),
+ "SGPR index %d is out of range: SGPR range=[0,%d]\n",
+ sgprIndex, w->reservedScalarRegs);
+
+ // add the offset from where the SGPRs of the wavefront have been assigned
+ int physicalSgprIndex = w->startSgprIndex + sgprIndex;
+
+ panic_if(!((w->startSgprIndex <= physicalSgprIndex) &&
+ (w->startSgprIndex + w->reservedScalarRegs - 1)
+ >= physicalSgprIndex),
+ "Invalid SGPR index %d\n", physicalSgprIndex);
+
+ // calculate physical SGPR index
+ return physicalSgprIndex % w->computeUnit->srf[w->simdId]->numRegs();
+}
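/*
 * Editor's sketch: mapVgpr() and mapSgpr() above follow the same pattern,
 * so the mapping can be summarized by a small standalone helper. The
 * parameter names (start, reserved, rfSize) are stand-ins for the
 * wavefront's start index, its reserved register count, and the register
 * file size; they are not gem5 identifiers.
 */
#include <cassert>

static int
staticRegMap(int start, int reserved, int rfSize, int idx)
{
    assert(idx >= 0 && idx < reserved);
    // offset into the wave's contiguous region, wrapping around the
    // physical register file if the region straddles its end
    return (start + idx) % rfSize;
}
// e.g. staticRegMap(250, 16, 256, 10) == 4: the region wraps past the end
// of a 256-entry file.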
+
+bool
+StaticRegisterManagerPolicy::canAllocateVgprs(int simdId, int nWfs,
+ int demandPerWf)
+{
+ return cu->registerManager->vrfPoolMgrs[simdId]->
+ canAllocate(nWfs, demandPerWf);
+}
+
+bool
+StaticRegisterManagerPolicy::canAllocateSgprs(int simdId, int nWfs,
+ int demandPerWf)
+{
+ return cu->registerManager->srfPoolMgrs[simdId]->
+ canAllocate(nWfs, demandPerWf);
+}
+
+void
+StaticRegisterManagerPolicy::allocateRegisters(Wavefront *w, int vectorDemand,
+ int scalarDemand)
+{
+ uint32_t allocatedSize = 0;
+ w->startVgprIndex = cu->registerManager->vrfPoolMgrs[w->simdId]->
+ allocateRegion(vectorDemand, &allocatedSize);
+ w->reservedVectorRegs = allocatedSize;
+ cu->vectorRegsReserved[w->simdId] += w->reservedVectorRegs;
+ panic_if(cu->vectorRegsReserved[w->simdId] > cu->numVecRegsPerSimd,
+ "VRF[%d] has been overallocated %d > %d\n",
+ w->simdId, cu->vectorRegsReserved[w->simdId],
+ cu->numVecRegsPerSimd);
+
+ if (scalarDemand) {
+ w->startSgprIndex = cu->registerManager->srfPoolMgrs[w->simdId]->
+ allocateRegion(scalarDemand, &allocatedSize);
+ w->reservedScalarRegs = allocatedSize;
+ cu->scalarRegsReserved[w->simdId] += w->reservedScalarRegs;
+ panic_if(cu->scalarRegsReserved[w->simdId] > cu->numScalarRegsPerSimd,
+ "SRF[%d] has been overallocated %d > %d\n",
+ w->simdId, cu->scalarRegsReserved[w->simdId],
+ cu->numScalarRegsPerSimd);
+ }
+}
+
+void
+StaticRegisterManagerPolicy::freeRegisters(Wavefront *w)
+{
+ // free the vector registers of the completed wavefront
+ w->computeUnit->vectorRegsReserved[w->simdId] -= w->reservedVectorRegs;
+ // free the scalar registers of the completed wavefront
+ w->computeUnit->scalarRegsReserved[w->simdId] -= w->reservedScalarRegs;
+
+ panic_if(w->computeUnit->vectorRegsReserved[w->simdId] < 0,
+ "Freeing VRF[%d] registers left %d registers reserved\n",
+ w->simdId,
+ w->computeUnit->vectorRegsReserved[w->simdId]);
+ panic_if(w->computeUnit->scalarRegsReserved[w->simdId] < 0,
+ "Freeing SRF[%d] registers left %d registers reserved\n",
+ w->simdId,
+ w->computeUnit->scalarRegsReserved[w->simdId]);
+
+ int endIndex = (w->startVgprIndex + w->reservedVectorRegs - 1) %
+ w->computeUnit->vrf[w->simdId]->numRegs();
+
+ w->computeUnit->registerManager->vrfPoolMgrs[w->simdId]->
+ freeRegion(w->startVgprIndex, endIndex);
+
+ // mark/pre-mark all registers as not busy
+ for (int i = 0; i < w->reservedVectorRegs; i++) {
+ uint32_t physVgprIdx = mapVgpr(w, i);
+ w->computeUnit->vrf[w->simdId]->markReg(physVgprIdx, false);
+ }
+
+ w->reservedVectorRegs = 0;
+ w->startVgprIndex = 0;
+
+ endIndex = (w->startSgprIndex + w->reservedScalarRegs - 1) %
+ w->computeUnit->srf[w->simdId]->numRegs();
+ w->computeUnit->registerManager->srfPoolMgrs[w->simdId]->
+ freeRegion(w->startSgprIndex, endIndex);
+
+ // mark/pre-mark all registers as not busy
+ for (int i = 0; i < w->reservedScalarRegs; i++) {
+ uint32_t physSgprIdx = mapSgpr(w, i);
+ w->computeUnit->srf[w->simdId]->markReg(physSgprIdx, false);
+ }
+
+ w->reservedScalarRegs = 0;
+ w->startSgprIndex = 0;
+}
+
+void
+StaticRegisterManagerPolicy::regStats()
+{
+}
--- /dev/null
+/*
+ * Copyright (c) 2016 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Mark Wyse
+ */
+
+#ifndef __STATIC_REGISTER_MANAGER_POLICY_HH__
+#define __STATIC_REGISTER_MANAGER_POLICY_HH__
+
+#include "gpu-compute/register_manager_policy.hh"
+
+class HSAQueueEntry;
+
+class StaticRegisterManagerPolicy : public RegisterManagerPolicy
+{
+ public:
+
+ StaticRegisterManagerPolicy();
+
+ void exec() override;
+
+ int mapVgpr(Wavefront* w, int vgprIndex) override;
+ int mapSgpr(Wavefront* w, int sgprIndex) override;
+
+ bool canAllocateVgprs(int simdId, int nWfs, int demandPerWf) override;
+ bool canAllocateSgprs(int simdId, int nWfs, int demandPerWf) override;
+
+ void allocateRegisters(Wavefront *w, int vectorDemand,
+ int scalarDemand) override;
+
+ void freeRegisters(Wavefront *w) override;
+
+ void regStats() override;
+};
+
+#endif // __STATIC_REGISTER_MANAGER_POLICY_HH__
TLBCoalescer::TLBCoalescer(const Params *p)
: ClockedObject(p),
- clock(p->clk_domain->clockPeriod()),
TLBProbesPerCycle(p->probesPerCycle),
coalescingWindow(p->coalescingWindow),
disableCoalescing(p->disableCoalescing),
//coalesced requests to the TLB
if (!coalescer->probeTLBEvent.scheduled()) {
coalescer->schedule(coalescer->probeTLBEvent,
- curTick() + coalescer->ticks(1));
+ curTick() + coalescer->clockPeriod());
}
return true;
    //we've received a retry. Schedule a probeTLBEvent
if (!coalescer->probeTLBEvent.scheduled())
coalescer->schedule(coalescer->probeTLBEvent,
- curTick() + coalescer->ticks(1));
+ curTick() + coalescer->clockPeriod());
}
void
// send the coalesced request for virt_page_addr
if (!memSidePort[0]->sendTimingReq(first_packet)) {
- DPRINTF(GPUTLB, "Failed to send TLB request for page %#x",
+ DPRINTF(GPUTLB, "Failed to send TLB request for page %#x\n",
virt_page_addr);
// No need for a retries queue since we are already buffering
*/
class TLBCoalescer : public ClockedObject
{
- protected:
- // TLB clock: will inherit clock from shader's clock period in terms
- // of nuber of ticks of curTime (aka global simulation clock)
- // The assignment of TLB clock from shader clock is done in the
- // python config files.
- int clock;
-
public:
typedef TLBCoalescerParams Params;
TLBCoalescer(const Params *p);
* option is to change it to curTick(), so we coalesce based
* on the receive time.
*/
- typedef std::unordered_map<int64_t, std::vector<coalescedReq>> CoalescingFIFO;
+ typedef std::unordered_map<int64_t, std::vector<coalescedReq>>
+ CoalescingFIFO;
CoalescingFIFO coalescerFIFO;
void updatePhysAddresses(PacketPtr pkt);
void regStats() override;
- // Clock related functions. Maps to-and-from
- // Simulation ticks and object clocks.
- Tick frequency() const { return SimClock::Frequency / clock; }
- Tick ticks(int numCycles) const { return (Tick)clock * numCycles; }
- Tick curCycle() const { return curTick() / clock; }
- Tick tickToCycles(Tick val) const { return val / clock;}
-
class CpuSidePort : public SlavePort
{
public:
virtual void
recvRespRetry()
{
- fatal("recvRespRetry() is not implemented in the TLB coalescer.\n");
+ fatal("recvRespRetry() is not implemented in the TLB "
+ "coalescer.\n");
}
virtual AddrRangeList getAddrRanges() const;
#include <string>
#include "base/logging.hh"
+#include "base/trace.hh"
+#include "debug/GPUVRF.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
-#include "gpu-compute/shader.hh"
#include "gpu-compute/simple_pool_manager.hh"
#include "gpu-compute/wavefront.hh"
#include "params/VectorRegisterFile.hh"
VectorRegisterFile::VectorRegisterFile(const VectorRegisterFileParams *p)
- : SimObject(p),
- manager(new SimplePoolManager(p->min_alloc, p->num_regs_per_simd)),
- simdId(p->simd_id), numRegsPerSimd(p->num_regs_per_simd),
- vgprState(new VecRegisterState())
+ : RegisterFile(p)
{
- fatal_if(numRegsPerSimd % 2, "VRF size is illegal\n");
- fatal_if(simdId < 0, "Illegal SIMD id for VRF");
+ regFile.resize(numRegs(), VecRegContainer());
- fatal_if(numRegsPerSimd % p->min_alloc, "Min VGPR region allocation is not "
- "multiple of VRF size\n");
-
- busy.clear();
- busy.resize(numRegsPerSimd, 0);
- nxtBusy.clear();
- nxtBusy.resize(numRegsPerSimd, 0);
-
- vgprState->init(numRegsPerSimd, p->wfSize);
-}
-
-void
-VectorRegisterFile::setParent(ComputeUnit *_computeUnit)
-{
- computeUnit = _computeUnit;
- vgprState->setParent(computeUnit);
-}
-
-uint8_t
-VectorRegisterFile::regNxtBusy(int idx, uint32_t operandSize) const
-{
- uint8_t status = nxtBusy.at(idx);
-
- if (operandSize > 4) {
- status = status | (nxtBusy.at((idx + 1) % numRegs()));
- }
-
- return status;
-}
-
-uint8_t
-VectorRegisterFile::regBusy(int idx, uint32_t operandSize) const
-{
- uint8_t status = busy.at(idx);
-
- if (operandSize > 4) {
- status = status | (busy.at((idx + 1) % numRegs()));
- }
-
- return status;
-}
-
-void
-VectorRegisterFile::preMarkReg(int regIdx, uint32_t operandSize, uint8_t value)
-{
- nxtBusy.at(regIdx) = value;
-
- if (operandSize > 4) {
- nxtBusy.at((regIdx + 1) % numRegs()) = value;
- }
-}
-
-void
-VectorRegisterFile::markReg(int regIdx, uint32_t operandSize, uint8_t value)
-{
- busy.at(regIdx) = value;
-
- if (operandSize > 4) {
- busy.at((regIdx + 1) % numRegs()) = value;
+    for (auto &reg : regFile) {
+ reg.zero();
}
}
VectorRegisterFile::operandsReady(Wavefront *w, GPUDynInstPtr ii) const
{
for (int i = 0; i < ii->getNumOperands(); ++i) {
- if (ii->isVectorRegister(i)) {
- uint32_t vgprIdx = ii->getRegisterIndex(i, ii);
- uint32_t pVgpr = w->remap(vgprIdx, ii->getOperandSize(i), 1);
-
- if (regBusy(pVgpr, ii->getOperandSize(i)) == 1) {
- if (ii->isDstOperand(i)) {
- w->numTimesBlockedDueWAXDependencies++;
- } else if (ii->isSrcOperand(i)) {
- w->numTimesBlockedDueRAWDependencies++;
- }
-
- return false;
- }
-
- if (regNxtBusy(pVgpr, ii->getOperandSize(i)) == 1) {
- if (ii->isDstOperand(i)) {
- w->numTimesBlockedDueWAXDependencies++;
- } else if (ii->isSrcOperand(i)) {
- w->numTimesBlockedDueRAWDependencies++;
+ if (ii->isVectorRegister(i) && ii->isSrcOperand(i)) {
+ int vgprIdx = ii->getRegisterIndex(i, ii);
+
+ // determine number of registers
+ int nRegs =
+ ii->getOperandSize(i) <= 4 ? 1 : ii->getOperandSize(i) / 4;
+ for (int j = 0; j < nRegs; j++) {
+ int pVgpr = computeUnit->registerManager
+ ->mapVgpr(w, vgprIdx + j);
+ if (regBusy(pVgpr)) {
+ if (ii->isDstOperand(i)) {
+ w->numTimesBlockedDueWAXDependencies++;
+ } else if (ii->isSrcOperand(i)) {
+ DPRINTF(GPUVRF, "RAW stall: WV[%d]: %s: physReg[%d]\n",
+ w->wfDynId, ii->disassemble(), pVgpr);
+ w->numTimesBlockedDueRAWDependencies++;
+ }
+ return false;
}
-
- return false;
}
}
}
-
return true;
}
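/*
 * Editor's note: the nRegs computation above encodes the rule that an
 * operand occupies one 32-bit VGPR per DWORD, with sub-DWORD operands
 * still costing a full register. A minimal sketch (assuming, as the code
 * does, that getOperandSize() reports bytes):
 */
static int
numVgprsForOperand(int operandSizeBytes)
{
    return operandSizeBytes <= 4 ? 1 : operandSizeBytes / 4;
}
// numVgprsForOperand(4) == 1; numVgprsForOperand(8) == 2, i.e. a 64-bit
// operand spans two consecutive VGPRs that must all be scoreboard-checked.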
void
-VectorRegisterFile::exec(GPUDynInstPtr ii, Wavefront *w)
+VectorRegisterFile::scheduleWriteOperands(Wavefront *w, GPUDynInstPtr ii)
{
- bool loadInstr = ii->isLoad();
- bool atomicInstr = ii->isAtomic() || ii->isMemFence();
-
- bool loadNoArgInstr = loadInstr && !ii->isArgLoad();
-
// iterate over all register destination operands
for (int i = 0; i < ii->getNumOperands(); ++i) {
if (ii->isVectorRegister(i) && ii->isDstOperand(i)) {
- uint32_t physReg = w->remap(ii->getRegisterIndex(i, ii),
- ii->getOperandSize(i), 1);
-
- // mark the destination vector register as busy
- markReg(physReg, ii->getOperandSize(i), 1);
- // clear the in-flight status of the destination vector register
- preMarkReg(physReg, ii->getOperandSize(i), 0);
-
- // FIXME: if we ever model correct timing behavior
- // for load argument instructions then we should not
- // set the destination register as busy now but when
- // the data returns. Loads and Atomics should free
- // their destination registers when the data returns,
- // not now
- if (!atomicInstr && !loadNoArgInstr) {
- uint32_t pipeLen = ii->getOperandSize(i) <= 4 ?
- computeUnit->spBypassLength() :
- computeUnit->dpBypassLength();
-
- // schedule an event for marking the register as ready
- computeUnit->registerEvent(w->simdId, physReg,
- ii->getOperandSize(i),
- computeUnit->shader->tick_cnt +
- computeUnit->shader->ticks(pipeLen),
- 0);
+ int vgprIdx = ii->getRegisterIndex(i, ii);
+ int nRegs = ii->getOperandSize(i) <= 4 ? 1 :
+ ii->getOperandSize(i) / 4;
+
+ for (int j = 0; j < nRegs; ++j) {
+ int physReg = computeUnit->registerManager
+ ->mapVgpr(w, vgprIdx + j);
+
+                // If the instruction is an atomic that does not return
+                // a value, then do not mark this reg as busy.
+ if (!(ii->isAtomic() && !ii->isAtomicRet())) {
+ /**
+ * if the instruction is a load with EXEC = 0, then
+ * we do not mark the reg. we do this to avoid a
+ * deadlock that can occur because a load reserves
+ * its destination regs before checking its exec mask,
+ * and in the case it is 0, it will not send/recv any
+ * packets, and therefore it will never free its dest
+ * reg(s).
+ */
+ if (!ii->isLoad() || (ii->isLoad()
+ && ii->exec_mask.any())) {
+ markReg(physReg, true);
+ }
+ }
}
}
}
}
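/*
 * Editor's sketch of the busy-marking condition implemented above:
 * destination VGPRs are reserved unless the op is a non-returning atomic,
 * or a load whose exec mask is all zero (such a load never sends or
 * receives packets, so nothing would ever free the registers again). The
 * boolean parameters are illustrative stand-ins for the GPUDynInst
 * queries used in the loop.
 */
static bool
shouldMarkDstBusy(bool isLoad, bool isAtomic, bool atomicReturns,
                  bool anyLaneActive)
{
    if (isAtomic && !atomicReturns)
        return false;   // no value ever comes back to the VRF
    if (isLoad && !anyLaneActive)
        return false;   // EXEC == 0: no memory traffic, avoid the deadlock
    return true;
}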
-int
-VectorRegisterFile::exec(uint64_t dynamic_id, Wavefront *w,
- std::vector<uint32_t> ®Vec, uint32_t operandSize,
- uint64_t timestamp)
+void
+VectorRegisterFile::waveExecuteInst(Wavefront *w, GPUDynInstPtr ii)
{
- int delay = 0;
+ // increment count of number of DWORDs read from VRF
+ int DWORDs = ii->numSrcVecDWORDs();
+ registerReads += (DWORDs * w->execMask().count());
+
+ uint64_t mask = w->execMask().to_ullong();
+ int srams = w->execMask().size() / 4;
+ for (int i = 0; i < srams; i++) {
+ if (mask & 0xF) {
+ sramReads += DWORDs;
+ }
+ mask = mask >> 4;
+ }
- panic_if(regVec.size() <= 0, "Illegal VGPR vector size=%d\n",
- regVec.size());
+ if (!ii->isLoad()
+ && !(ii->isAtomic() || ii->isMemSync())) {
+ int opSize = 4;
+ for (int i = 0; i < ii->getNumOperands(); i++) {
+ if (ii->getOperandSize(i) > opSize) {
+ opSize = ii->getOperandSize(i);
+ }
+ }
+ Cycles delay(opSize <= 4 ? computeUnit->spBypassLength()
+ : computeUnit->dpBypassLength());
+ Tick tickDelay = computeUnit->cyclesToTicks(delay);
+
+ for (int i = 0; i < ii->getNumOperands(); i++) {
+ if (ii->isVectorRegister(i) && ii->isDstOperand(i)) {
+ int vgprIdx = ii->getRegisterIndex(i, ii);
+ int nRegs = ii->getOperandSize(i) <= 4 ? 1
+ : ii->getOperandSize(i) / 4;
+ for (int j = 0; j < nRegs; j++) {
+ int physReg = computeUnit->registerManager
+ ->mapVgpr(w, vgprIdx + j);
+ enqRegFreeEvent(physReg, tickDelay);
+ }
+ }
+ }
- for (int i = 0; i < regVec.size(); ++i) {
- // mark the destination VGPR as free when the timestamp expires
- computeUnit->registerEvent(w->simdId, regVec[i], operandSize,
- computeUnit->shader->tick_cnt + timestamp +
- computeUnit->shader->ticks(delay), 0);
- }
+ // increment count of number of DWORDs written to VRF
+ DWORDs = ii->numDstVecDWORDs();
+ registerWrites += (DWORDs * w->execMask().count());
- return delay;
+ mask = w->execMask().to_ullong();
+ srams = w->execMask().size() / 4;
+ for (int i = 0; i < srams; i++) {
+ if (mask & 0xF) {
+ sramWrites += DWORDs;
+ }
+ mask = mask >> 4;
+ }
+ }
}
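/*
 * Editor's sketch of the SRAM access accounting above (assuming a 64-lane
 * exec mask and 4 lanes per VRF bank, which is what execMask().size() / 4
 * implies): every 4-lane group with at least one active lane costs one
 * bank access per DWORD of the operand.
 */
#include <cstdint>

static int
countBankAccesses(uint64_t execMask, int dwords)
{
    int accesses = 0;
    for (int bank = 0; bank < 16; ++bank) {  // 64 lanes / 4 lanes per bank
        if (execMask & 0xF)
            accesses += dwords;
        execMask >>= 4;
    }
    return accesses;
}
// countBankAccesses(0x1, 2) == 2; countBankAccesses(~0ULL, 2) == 32.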
void
-VectorRegisterFile::updateResources(Wavefront *w, GPUDynInstPtr ii)
+VectorRegisterFile::scheduleWriteOperandsFromLoad(
+ Wavefront *w, GPUDynInstPtr ii)
{
- // iterate over all register destination operands
+ assert(ii->isLoad() || ii->isAtomicRet());
for (int i = 0; i < ii->getNumOperands(); ++i) {
if (ii->isVectorRegister(i) && ii->isDstOperand(i)) {
- uint32_t physReg = w->remap(ii->getRegisterIndex(i, ii),
- ii->getOperandSize(i), 1);
- // set the in-flight status of the destination vector register
- preMarkReg(physReg, ii->getOperandSize(i), 1);
+ int vgprIdx = ii->getRegisterIndex(i, ii);
+ int nRegs = ii->getOperandSize(i) <= 4 ? 1 :
+ ii->getOperandSize(i) / 4;
+
+ for (int j = 0; j < nRegs; ++j) {
+ int physReg = computeUnit->registerManager
+ ->mapVgpr(w, vgprIdx + j);
+ enqRegFreeEvent(physReg, computeUnit->clockPeriod());
+ }
}
}
-}
-
-bool
-VectorRegisterFile::vrfOperandAccessReady(uint64_t dynamic_id, Wavefront *w,
- GPUDynInstPtr ii,
- VrfAccessType accessType)
-{
- bool ready = true;
-
- return ready;
-}
-
-bool
-VectorRegisterFile::vrfOperandAccessReady(Wavefront *w, GPUDynInstPtr ii,
- VrfAccessType accessType)
-{
- bool ready = true;
-
- return ready;
+ // increment count of number of DWORDs written to VRF
+ int DWORDs = ii->numDstVecDWORDs();
+ registerWrites += (DWORDs * ii->exec_mask.count());
+
+ uint64_t mask = ii->exec_mask.to_ullong();
+ int srams = ii->exec_mask.size() / 4;
+ for (int i = 0; i < srams; i++) {
+ if (mask & 0xF) {
+ sramWrites += DWORDs;
+ }
+ mask = mask >> 4;
+ }
}
VectorRegisterFile*
#ifndef __VECTOR_REGISTER_FILE_HH__
#define __VECTOR_REGISTER_FILE_HH__
-#include <list>
-
-#include "base/statistics.hh"
-#include "base/trace.hh"
-#include "base/types.hh"
+#include "arch/gpu_isa.hh"
+#include "config/the_gpu_isa.hh"
#include "debug/GPUVRF.hh"
-#include "gpu-compute/vector_register_state.hh"
-#include "sim/sim_object.hh"
-
-class ComputeUnit;
-class Shader;
-class SimplePoolManager;
-class Wavefront;
+#include "gpu-compute/register_file.hh"
+#include "gpu-compute/wavefront.hh"
struct VectorRegisterFileParams;
-enum class VrfAccessType : uint8_t
-{
- READ = 0x01,
- WRITE = 0x02,
- RD_WR = READ | WRITE
-};
-
// Vector Register File
-class VectorRegisterFile : public SimObject
+class VectorRegisterFile : public RegisterFile
{
public:
+ using VecRegContainer = TheGpuISA::VecRegContainerU32;
+
VectorRegisterFile(const VectorRegisterFileParams *p);
+ ~VectorRegisterFile() { }
- void setParent(ComputeUnit *_computeUnit);
+ virtual bool operandsReady(Wavefront *w, GPUDynInstPtr ii) const override;
+ virtual void scheduleWriteOperands(Wavefront *w,
+ GPUDynInstPtr ii) override;
+ virtual void scheduleWriteOperandsFromLoad(Wavefront *w,
+ GPUDynInstPtr ii) override;
+ virtual void waveExecuteInst(Wavefront *w, GPUDynInstPtr ii) override;
- // Read a register
- template<typename T>
- T
- read(int regIdx, int threadId=0)
+ void
+ setParent(ComputeUnit *_computeUnit) override
{
- T p0 = vgprState->read<T>(regIdx, threadId);
- DPRINTF(GPUVRF, "reading vreg[%d][%d] = %u\n", regIdx, threadId, (uint64_t)p0);
-
- return p0;
+ RegisterFile::setParent(_computeUnit);
}
- // Write a register
- template<typename T>
- void
- write(int regIdx, T value, int threadId=0)
+ // Read a register that is writeable (e.g., a DST operand)
+ VecRegContainer&
+ readWriteable(int regIdx)
{
- DPRINTF(GPUVRF, "writing vreg[%d][%d] = %u\n", regIdx, threadId, (uint64_t)value);
- vgprState->write<T>(regIdx, value, threadId);
+ return regFile[regIdx];
}
- uint8_t regBusy(int idx, uint32_t operandSize) const;
- uint8_t regNxtBusy(int idx, uint32_t operandSize) const;
-
- int numRegs() const { return numRegsPerSimd; }
-
- void markReg(int regIdx, uint32_t operandSize, uint8_t value);
- void preMarkReg(int regIdx, uint32_t operandSize, uint8_t value);
-
- virtual void exec(GPUDynInstPtr ii, Wavefront *w);
-
- virtual int exec(uint64_t dynamic_id, Wavefront *w,
- std::vector<uint32_t> ®Vec, uint32_t operandSize,
- uint64_t timestamp);
-
- bool operandsReady(Wavefront *w, GPUDynInstPtr ii) const;
- virtual void updateEvents() { }
- virtual void updateResources(Wavefront *w, GPUDynInstPtr ii);
-
- virtual bool
- isReadConflict(int memWfId, int exeWfId) const
+ // Read a register that is not writeable (e.g., src operand)
+ const VecRegContainer&
+ read(int regIdx) const
{
- return false;
+ return regFile[regIdx];
}
- virtual bool
- isWriteConflict(int memWfId, int exeWfId) const
+ // Write a register
+ void
+ write(int regIdx, const VecRegContainer &value)
{
- return false;
+ regFile[regIdx] = value;
}
- virtual bool vrfOperandAccessReady(uint64_t dynamic_id, Wavefront *w,
- GPUDynInstPtr ii,
- VrfAccessType accessType);
-
- virtual bool vrfOperandAccessReady(Wavefront *w, GPUDynInstPtr ii,
- VrfAccessType accessType);
-
- SimplePoolManager *manager;
-
- protected:
- ComputeUnit* computeUnit;
- int simdId;
-
- // flag indicating if a register is busy
- std::vector<uint8_t> busy;
- // flag indicating if a register will be busy (by instructions
- // in the SIMD pipeline)
- std::vector<uint8_t> nxtBusy;
-
- // numer of registers (bank size) per simd unit (bank)
- int numRegsPerSimd;
+ void
+ printReg(Wavefront *wf, int regIdx) const
+ {
+#ifndef NDEBUG
+ const auto &vec_reg_cont = regFile[regIdx];
+ auto vgpr = vec_reg_cont.as<TheGpuISA::VecElemU32>();
+
+ for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) {
+ if (wf->execMask(lane)) {
+ DPRINTF(GPUVRF, "WF[%d][%d]: WV[%d] v[%d][%d] = %#x\n",
+ wf->simdId, wf->wfSlotId, wf->wfDynId, regIdx, lane,
+ vgpr[lane]);
+ }
+ }
+#endif
+ }
- // vector register state
- VecRegisterState *vgprState;
+ private:
+ std::vector<VecRegContainer> regFile;
};
#endif // __VECTOR_REGISTER_FILE_HH__
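/*
 * Editor's sketch of the lane-wise view that read(), readWriteable() and
 * printReg() rely on above: a vector register is a fixed array of 32-bit
 * lanes, and only lanes enabled by the exec mask are touched. ToyVecReg
 * and kLanes are illustrative stand-ins for TheGpuISA::VecRegContainerU32
 * and NumVecElemPerVecReg, not gem5 types.
 */
#include <array>
#include <cstdint>

constexpr int kLanes = 64;
using ToyVecReg = std::array<uint32_t, kLanes>;

static void
writeActiveLanes(ToyVecReg &dst, uint64_t execMask, uint32_t value)
{
    for (int lane = 0; lane < kLanes; ++lane) {
        if (execMask & (1ULL << lane))
            dst[lane] = value;  // inactive lanes keep their old contents
    }
}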
#include "gpu-compute/wavefront.hh"
#include "debug/GPUExec.hh"
+#include "debug/GPUInitAbi.hh"
#include "debug/WavefrontStack.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
+#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/shader.hh"
+#include "gpu-compute/simple_pool_manager.hh"
#include "gpu-compute/vector_register_file.hh"
Wavefront*
}
Wavefront::Wavefront(const Params *p)
- : SimObject(p), callArgMem(nullptr), _gpuISA()
+ : SimObject(p), wfSlotId(p->wf_slot_id), simdId(p->simdId),
+ maxIbSize(p->max_ib_size), _gpuISA(*this),
+ vmWaitCnt(-1), expWaitCnt(-1), lgkmWaitCnt(-1)
{
lastTrace = 0;
- simdId = p->simdId;
- wfSlotId = p->wf_slot_id;
+ execUnitId = -1;
status = S_STOPPED;
reservedVectorRegs = 0;
+ reservedScalarRegs = 0;
startVgprIndex = 0;
+ startSgprIndex = 0;
outstandingReqs = 0;
- memReqsInPipe = 0;
outstandingReqsWrGm = 0;
outstandingReqsWrLm = 0;
outstandingReqsRdGm = 0;
rdGmReqsInPipe = 0;
wrLmReqsInPipe = 0;
wrGmReqsInPipe = 0;
-
+ scalarRdGmReqsInPipe = 0;
+ scalarWrGmReqsInPipe = 0;
+ scalarOutstandingReqsRdGm = 0;
+ scalarOutstandingReqsWrGm = 0;
+ lastNonIdleTick = 0;
barrierCnt = 0;
oldBarrierCnt = 0;
stalledAtBarrier = false;
+ ldsChunk = nullptr;
memTraceBusy = 0;
oldVgprTcnt = 0xffffffffffffffffll;
oldDgprTcnt = 0xffffffffffffffffll;
- oldVgpr.resize(p->wfSize);
+ oldVgpr.resize(p->wf_size);
pendingFetch = false;
dropFetch = false;
- condRegState = new ConditionRegisterState();
- maxSpVgprs = 0;
- maxDpVgprs = 0;
- lastAddr.resize(p->wfSize);
- workItemFlatId.resize(p->wfSize);
- oldDgpr.resize(p->wfSize);
- barCnt.resize(p->wfSize);
+ maxVgprs = 0;
+ maxSgprs = 0;
+
+ lastAddr.resize(p->wf_size);
+ workItemFlatId.resize(p->wf_size);
+ oldDgpr.resize(p->wf_size);
+ barCnt.resize(p->wf_size);
for (int i = 0; i < 3; ++i) {
- workItemId[i].resize(p->wfSize);
+ workItemId[i].resize(p->wf_size);
}
+
+ _execMask.set();
+ rawDist.clear();
+ lastInstExec = 0;
+ vecReads.clear();
}
void
{
SimObject::regStats();
- srcRegOpDist
- .init(0, 4, 2)
- .name(name() + ".src_reg_operand_dist")
- .desc("number of executed instructions with N source register operands")
- ;
-
- dstRegOpDist
- .init(0, 3, 2)
- .name(name() + ".dst_reg_operand_dist")
- .desc("number of executed instructions with N destination register "
- "operands")
- ;
-
// FIXME: the name of the WF needs to be unique
numTimesBlockedDueWAXDependencies
.name(name() + ".timesBlockedDueWAXDependencies")
"dependencies")
;
- // FIXME: the name of the WF needs to be unique
- numTimesBlockedDueVrfPortAvail
- .name(name() + ".timesBlockedDueVrfPortAvail")
- .desc("number of times instructions are blocked due to VRF port "
- "availability")
+ numInstrExecuted
+ .name(name() + ".num_instr_executed")
+ .desc("number of instructions executed by this WF slot")
+ ;
+
+ schCycles
+ .name(name() + ".sch_cycles")
+ .desc("number of cycles spent in schedule stage")
+ ;
+
+ schStalls
+ .name(name() + ".sch_stalls")
+ .desc("number of cycles WF is stalled in SCH stage")
+ ;
+
+ schRfAccessStalls
+ .name(name() + ".sch_rf_access_stalls")
+ .desc("number of cycles wave selected in SCH but RF denied adding "
+ "instruction")
+ ;
+
+ schResourceStalls
+ .name(name() + ".sch_resource_stalls")
+ .desc("number of cycles stalled in sch by resource not available")
+ ;
+
+ schOpdNrdyStalls
+ .name(name() + ".sch_opd_nrdy_stalls")
+ .desc("number of cycles stalled in sch waiting for RF reads to "
+ "complete")
+ ;
+
+ schLdsArbStalls
+ .name(name() + ".sch_lds_arb_stalls")
+ .desc("number of cycles wave stalled due to LDS-VRF arbitration")
+ ;
+
+ vecRawDistance
+ .init(0,20,1)
+ .name(name() + ".vec_raw_distance")
+ .desc("Count of RAW distance in dynamic instructions for this WF")
+ ;
+
+ readsPerWrite
+ .init(0,4,1)
+ .name(name() + ".vec_reads_per_write")
+ .desc("Count of Vector reads per write for this WF")
;
}
Wavefront::init()
{
reservedVectorRegs = 0;
+ reservedScalarRegs = 0;
startVgprIndex = 0;
+ startSgprIndex = 0;
+
+ scalarAlu = computeUnit->mapWaveToScalarAlu(this);
+ scalarAluGlobalIdx = computeUnit->mapWaveToScalarAluGlobalIdx(this);
+ globalMem = computeUnit->mapWaveToGlobalMem(this);
+ localMem = computeUnit->mapWaveToLocalMem(this);
+ scalarMem = computeUnit->mapWaveToScalarMem(this);
+}
+
+void
+Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
+{
+ int regInitIdx = 0;
+
+ // iterate over all the init fields and check which
+ // bits are enabled
+ for (int en_bit = 0; en_bit < NumScalarInitFields; ++en_bit) {
+
+ if (task->sgprBitEnabled(en_bit)) {
+ int physSgprIdx = 0;
+ uint32_t wiCount = 0;
+ uint32_t firstWave = 0;
+ int orderedAppendTerm = 0;
+ int numWfsInWg = 0;
+ uint32_t finalValue = 0;
+ Addr host_disp_pkt_addr = task->hostDispPktAddr();
+ Addr kernarg_addr = task->kernargAddr();
+ Addr hidden_priv_base(0);
+
+ switch (en_bit) {
+ case PrivateSegBuf:
+ physSgprIdx =
+ computeUnit->registerManager->mapSgpr(this, regInitIdx);
+ computeUnit->srf[simdId]->write(physSgprIdx,
+ task->amdQueue.scratch_resource_descriptor[0]);
+ ++regInitIdx;
+ DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
+ "Setting PrivateSegBuffer: s[%d] = %x\n",
+ computeUnit->cu_id, simdId,
+ wfSlotId, wfDynId, physSgprIdx,
+ task->amdQueue.scratch_resource_descriptor[0]);
+
+ physSgprIdx =
+ computeUnit->registerManager->mapSgpr(this, regInitIdx);
+ computeUnit->srf[simdId]->write(physSgprIdx,
+ task->amdQueue.scratch_resource_descriptor[1]);
+ ++regInitIdx;
+ DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
+ "Setting PrivateSegBuffer: s[%d] = %x\n",
+ computeUnit->cu_id, simdId,
+ wfSlotId, wfDynId, physSgprIdx,
+ task->amdQueue.scratch_resource_descriptor[1]);
+
+ physSgprIdx =
+ computeUnit->registerManager->mapSgpr(this, regInitIdx);
+ computeUnit->srf[simdId]->write(physSgprIdx,
+ task->amdQueue.scratch_resource_descriptor[2]);
+ ++regInitIdx;
+ DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
+ "Setting PrivateSegBuffer: s[%d] = %x\n",
+ computeUnit->cu_id, simdId,
+ wfSlotId, wfDynId, physSgprIdx,
+ task->amdQueue.scratch_resource_descriptor[2]);
+
+ physSgprIdx =
+ computeUnit->registerManager->mapSgpr(this, regInitIdx);
+ computeUnit->srf[simdId]->write(physSgprIdx,
+ task->amdQueue.scratch_resource_descriptor[3]);
+
+ ++regInitIdx;
+ DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
+ "Setting PrivateSegBuffer: s[%d] = %x\n",
+ computeUnit->cu_id, simdId,
+ wfSlotId, wfDynId, physSgprIdx,
+ task->amdQueue.scratch_resource_descriptor[3]);
+ break;
+ case DispatchPtr:
+ physSgprIdx =
+ computeUnit->registerManager->mapSgpr(this, regInitIdx);
+ computeUnit->srf[simdId]->write(physSgprIdx,
+ ((uint32_t*)&host_disp_pkt_addr)[0]);
+ ++regInitIdx;
+ DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
+ "Setting DispatchPtr: s[%d] = %x\n",
+ computeUnit->cu_id, simdId,
+ wfSlotId, wfDynId, physSgprIdx,
+ ((uint32_t*)&host_disp_pkt_addr)[0]);
+
+ physSgprIdx =
+ computeUnit->registerManager->mapSgpr(this, regInitIdx);
+ computeUnit->srf[simdId]->write(physSgprIdx,
+ ((uint32_t*)&host_disp_pkt_addr)[1]);
+ DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
+ "Setting DispatchPtr: s[%d] = %x\n",
+ computeUnit->cu_id, simdId,
+ wfSlotId, wfDynId, physSgprIdx,
+ ((uint32_t*)&host_disp_pkt_addr)[1]);
+
+ ++regInitIdx;
+ break;
+ case QueuePtr:
+ physSgprIdx =
+ computeUnit->registerManager->mapSgpr(this, regInitIdx);
+ computeUnit->srf[simdId]->write(physSgprIdx,
+ ((uint32_t*)&task->hostAMDQueueAddr)[0]);
+ ++regInitIdx;
+ DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
+ "Setting QueuePtr: s[%d] = %x\n",
+ computeUnit->cu_id, simdId,
+ wfSlotId, wfDynId, physSgprIdx,
+ ((uint32_t*)&task->hostAMDQueueAddr)[0]);
+
+ physSgprIdx =
+ computeUnit->registerManager->mapSgpr(this, regInitIdx);
+ computeUnit->srf[simdId]->write(physSgprIdx,
+ ((uint32_t*)&task->hostAMDQueueAddr)[1]);
+ DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
+ "Setting QueuePtr: s[%d] = %x\n",
+ computeUnit->cu_id, simdId,
+ wfSlotId, wfDynId, physSgprIdx,
+ ((uint32_t*)&task->hostAMDQueueAddr)[1]);
+
+ ++regInitIdx;
+ break;
+ case KernargSegPtr:
+ physSgprIdx =
+ computeUnit->registerManager->mapSgpr(this, regInitIdx);
+ computeUnit->srf[simdId]->write(physSgprIdx,
+ ((uint32_t*)&kernarg_addr)[0]);
+ ++regInitIdx;
+ DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
+ "Setting KernargSegPtr: s[%d] = %x\n",
+ computeUnit->cu_id, simdId,
+ wfSlotId, wfDynId, physSgprIdx,
+                        ((uint32_t*)&kernarg_addr)[0]);
+
+ physSgprIdx =
+ computeUnit->registerManager->mapSgpr(this, regInitIdx);
+ computeUnit->srf[simdId]->write(physSgprIdx,
+ ((uint32_t*)&kernarg_addr)[1]);
+ DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
+ "Setting KernargSegPtr: s[%d] = %x\n",
+ computeUnit->cu_id, simdId,
+ wfSlotId, wfDynId, physSgprIdx,
+                        ((uint32_t*)&kernarg_addr)[1]);
+
+ ++regInitIdx;
+ break;
+ case FlatScratchInit:
+ physSgprIdx
+ = computeUnit->registerManager->mapSgpr(this, regInitIdx);
+ computeUnit->srf[simdId]->write(physSgprIdx,
+ (TheGpuISA::ScalarRegU32)(task->amdQueue
+ .scratch_backing_memory_location & 0xffffffff));
+ ++regInitIdx;
+ DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
+ "Setting FlatScratch Addr: s[%d] = %x\n",
+ computeUnit->cu_id, simdId,
+ wfSlotId, wfDynId, physSgprIdx,
+ (TheGpuISA::ScalarRegU32)(task->amdQueue
+ .scratch_backing_memory_location & 0xffffffff));
+
+ physSgprIdx =
+ computeUnit->registerManager->mapSgpr(this, regInitIdx);
+                // This value should be sizeof(DWORD) aligned, that is,
+                // 4-byte aligned
+ computeUnit->srf[simdId]->write(physSgprIdx,
+ task->amdQueue.scratch_workitem_byte_size);
+ ++regInitIdx;
+ DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
+ "Setting FlatScratch size: s[%d] = %x\n",
+ computeUnit->cu_id, simdId,
+ wfSlotId, wfDynId, physSgprIdx,
+ task->amdQueue.scratch_workitem_byte_size);
+ /**
+ * Since flat scratch init is needed for this kernel, this
+ * kernel is going to have flat memory instructions and we
+ * need to initialize the hidden private base for this queue.
+ * scratch_resource_descriptor[0] has this queue's scratch
+ * base address. scratch_backing_memory_location has the
+ * offset to this queue's scratch base address from the
+ * SH_HIDDEN_PRIVATE_BASE_VMID. Ideally, we only require this
+ * queue's scratch base address for address calculation
+ * (stored in scratch_resource_descriptor[0]). But that
+                 * address calculation should be done by first finding the
+ * queue's scratch base address using the calculation
+ * "SH_HIDDEN_PRIVATE_BASE_VMID + offset". So, we initialize
+ * SH_HIDDEN_PRIVATE_BASE_VMID.
+ *
+ * For more details see:
+ * http://rocm-documentation.readthedocs.io/en/latest/
+ * ROCm_Compiler_SDK/ROCm-Native-ISA.html#flat-scratch
+ *
+ * https://github.com/ROCm-Developer-Tools/
+ * ROCm-ComputeABI-Doc/blob/master/AMDGPU-ABI.md
+ * #flat-addressing
+ */
+ hidden_priv_base =
+ (uint64_t)task->amdQueue.scratch_resource_descriptor[0] |
+ (((uint64_t)task->amdQueue.scratch_resource_descriptor[1]
+ & 0x000000000000ffff) << 32);
+ computeUnit->shader->initShHiddenPrivateBase(
+ hidden_priv_base,
+ task->amdQueue.scratch_backing_memory_location);
+ break;
+ case GridWorkgroupCountX:
+ physSgprIdx =
+ computeUnit->registerManager->mapSgpr(this, regInitIdx);
+ wiCount = ((task->gridSize(0) +
+ task->wgSize(0) - 1) /
+ task->wgSize(0));
+ computeUnit->srf[simdId]->write(physSgprIdx, wiCount);
+
+ ++regInitIdx;
+ DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
+ "Setting num WG X: s[%d] = %x\n",
+ computeUnit->cu_id, simdId,
+ wfSlotId, wfDynId, physSgprIdx, wiCount);
+ break;
+ case GridWorkgroupCountY:
+ physSgprIdx =
+ computeUnit->registerManager->mapSgpr(this, regInitIdx);
+ wiCount = ((task->gridSize(1) +
+ task->wgSize(1) - 1) /
+ task->wgSize(1));
+ computeUnit->srf[simdId]->write(physSgprIdx, wiCount);
+
+ ++regInitIdx;
+ DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
+ "Setting num WG Y: s[%d] = %x\n",
+ computeUnit->cu_id, simdId,
+ wfSlotId, wfDynId, physSgprIdx, wiCount);
+ break;
+ case GridWorkgroupCountZ:
+ physSgprIdx =
+ computeUnit->registerManager->mapSgpr(this, regInitIdx);
+ wiCount = ((task->gridSize(2) +
+ task->wgSize(2) - 1) /
+ task->wgSize(2));
+ computeUnit->srf[simdId]->write(physSgprIdx, wiCount);
+
+ ++regInitIdx;
+ DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
+ "Setting num WG Z: s[%d] = %x\n",
+ computeUnit->cu_id, simdId,
+ wfSlotId, wfDynId, physSgprIdx, wiCount);
+ break;
+ case WorkgroupIdX:
+ physSgprIdx =
+ computeUnit->registerManager->mapSgpr(this, regInitIdx);
+ computeUnit->srf[simdId]->write(physSgprIdx,
+ workGroupId[0]);
+
+ ++regInitIdx;
+ DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
+ "Setting WG ID X: s[%d] = %x\n",
+ computeUnit->cu_id, simdId,
+ wfSlotId, wfDynId, physSgprIdx, workGroupId[0]);
+ break;
+ case WorkgroupIdY:
+ physSgprIdx =
+ computeUnit->registerManager->mapSgpr(this, regInitIdx);
+ computeUnit->srf[simdId]->write(physSgprIdx,
+ workGroupId[1]);
+
+ ++regInitIdx;
+ DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
+ "Setting WG ID Y: s[%d] = %x\n",
+ computeUnit->cu_id, simdId,
+ wfSlotId, wfDynId, physSgprIdx, workGroupId[1]);
+ break;
+ case WorkgroupIdZ:
+ physSgprIdx =
+ computeUnit->registerManager->mapSgpr(this, regInitIdx);
+ computeUnit->srf[simdId]->write(physSgprIdx,
+ workGroupId[2]);
+
+ ++regInitIdx;
+ DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
+ "Setting WG ID Z: s[%d] = %x\n",
+ computeUnit->cu_id, simdId,
+ wfSlotId, wfDynId, physSgprIdx, workGroupId[2]);
+ break;
+ case PrivSegWaveByteOffset:
+ physSgprIdx =
+ computeUnit->registerManager->mapSgpr(this, regInitIdx);
+ /**
+ * the compute_tmpring_size_wavesize specifies the number of
+ * kB allocated per wavefront, hence the multiplication by
+ * 1024.
+ *
+ * to get the per wavefront offset into the scratch
+ * memory, we also multiply this by the wfId. the wfId stored
+ * in the Wavefront class, however, is the wave ID within the
+ * WG, whereas here we need the global WFID because the
+ * scratch space will be divided amongst all waves in the
+ * kernel. to get the global ID we multiply the WGID by
+ * the WG size, then add the WFID of the wave within its WG.
+ */
+ computeUnit->srf[simdId]->write(physSgprIdx, 1024 *
+ (wgId * (wgSz / 64) + wfId) *
+ task->amdQueue.compute_tmpring_size_wavesize);
+
+ ++regInitIdx;
+ DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
+ "Setting Private Seg Offset: s[%d] = %x\n",
+ computeUnit->cu_id, simdId,
+ wfSlotId, wfDynId, physSgprIdx,
+ 1024 * (wgId * (wgSz / 64) + wfId) *
+ task->amdQueue.compute_tmpring_size_wavesize);
+ break;
+ case WorkgroupInfo:
+ firstWave = (wfId == 0) ? 1 : 0;
+ numWfsInWg = divCeil(wgSizeInWorkItems,
+ computeUnit->wfSize());
+ finalValue = firstWave << ((sizeof(uint32_t) * 8) - 1);
+ finalValue |= (orderedAppendTerm << 6);
+ finalValue |= numWfsInWg;
+ physSgprIdx =
+ computeUnit->registerManager->mapSgpr(this, regInitIdx);
+ computeUnit->srf[simdId]->
+ write(physSgprIdx, finalValue);
+
+ ++regInitIdx;
+ DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
+ "Setting WG Info: s[%d] = %x\n",
+ computeUnit->cu_id, simdId,
+ wfSlotId, wfDynId, physSgprIdx, finalValue);
+ break;
+ default:
+ fatal("SGPR enable bit %i not supported\n", en_bit);
+ break;
+ }
+ }
+ }
+
+ regInitIdx = 0;
+
+ // iterate over all the init fields and check which
+ // bits are enabled
+ for (int en_bit = 0; en_bit < NumVectorInitFields; ++en_bit) {
+ if (task->vgprBitEnabled(en_bit)) {
+ uint32_t physVgprIdx = 0;
+ TheGpuISA::VecRegContainerU32 raw_vgpr;
+
+ switch (en_bit) {
+ case WorkitemIdX:
+ {
+ physVgprIdx = computeUnit->registerManager
+ ->mapVgpr(this, regInitIdx);
+ TheGpuISA::VecRegU32 vgpr_x
+ = raw_vgpr.as<TheGpuISA::VecElemU32>();
+
+ for (int lane = 0; lane < workItemId[0].size(); ++lane) {
+ vgpr_x[lane] = workItemId[0][lane];
+ }
+
+ computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr);
+ rawDist[regInitIdx] = 0;
+ ++regInitIdx;
+ }
+ break;
+ case WorkitemIdY:
+ {
+ physVgprIdx = computeUnit->registerManager
+ ->mapVgpr(this, regInitIdx);
+ TheGpuISA::VecRegU32 vgpr_y
+ = raw_vgpr.as<TheGpuISA::VecElemU32>();
+
+ for (int lane = 0; lane < workItemId[1].size(); ++lane) {
+ vgpr_y[lane] = workItemId[1][lane];
+ }
+
+ computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr);
+ rawDist[regInitIdx] = 0;
+ ++regInitIdx;
+ }
+ break;
+ case WorkitemIdZ:
+ {
+ physVgprIdx = computeUnit->registerManager->
+ mapVgpr(this, regInitIdx);
+ TheGpuISA::VecRegU32 vgpr_z
+ = raw_vgpr.as<TheGpuISA::VecElemU32>();
+
+ for (int lane = 0; lane < workItemId[2].size(); ++lane) {
+ vgpr_z[lane] = workItemId[2][lane];
+ }
+
+ computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr);
+ rawDist[regInitIdx] = 0;
+ ++regInitIdx;
+ }
+ break;
+ }
+ }
+ }
}
void
-Wavefront::resizeRegFiles(int num_cregs, int num_sregs, int num_dregs)
+Wavefront::resizeRegFiles(int num_vregs, int num_sregs)
{
- condRegState->init(num_cregs);
- maxSpVgprs = num_sregs;
- maxDpVgprs = num_dregs;
+ maxVgprs = num_vregs;
+ maxSgprs = num_sregs;
}
Wavefront::~Wavefront()
{
- if (callArgMem)
- delete callArgMem;
- delete condRegState;
}
void
-Wavefront::start(uint64_t _wf_dyn_id,uint64_t _base_ptr)
+Wavefront::setStatus(status_e newStatus)
+{
+ if (computeUnit->idleCUTimeout > 0) {
+ // Wavefront's status transitions to stalled or stopped
+ if ((newStatus == S_STOPPED || newStatus == S_STALLED ||
+ newStatus == S_WAITCNT) &&
+ (status != newStatus)) {
+ computeUnit->idleWfs++;
+ assert(computeUnit->idleWfs <=
+ (computeUnit->shader->n_wf * computeUnit->numVectorALUs));
+ if (computeUnit->idleWfs ==
+ (computeUnit->shader->n_wf * computeUnit->numVectorALUs)) {
+ lastNonIdleTick = curTick();
+ }
+ // Wavefront's status transitions to an active state (from
+ // a stopped or stalled state)
+ } else if ((status == S_STOPPED || status == S_STALLED ||
+ status == S_WAITCNT) &&
+ (status != newStatus)) {
+ // if all WFs in the CU were idle then check if the idleness
+ // period exceeded the timeout threshold
+ if (computeUnit->idleWfs ==
+ (computeUnit->shader->n_wf * computeUnit->numVectorALUs)) {
+ panic_if((curTick() - lastNonIdleTick) >=
+ computeUnit->idleCUTimeout,
+ "CU%d has been idle for %d ticks at tick %d",
+ computeUnit->cu_id, computeUnit->idleCUTimeout,
+ curTick());
+ }
+ computeUnit->idleWfs--;
+ assert(computeUnit->idleWfs >= 0);
+ }
+ }
+ status = newStatus;
+}
+
+void
+Wavefront::start(uint64_t _wf_dyn_id, Addr init_pc)
{
wfDynId = _wf_dyn_id;
- basePtr = _base_ptr;
+ _pc = init_pc;
+
status = S_RUNNING;
+
+ vecReads.resize(maxVgprs, 0);
}
bool
Wavefront::isGmInstruction(GPUDynInstPtr ii)
{
- if (ii->isGlobalMem() || ii->isFlat())
+ if (ii->isGlobalMem() ||
+ (ii->isFlat() && ii->executedAs() == Enums::SC_GLOBAL)) {
return true;
+ }
return false;
}
bool
Wavefront::isLmInstruction(GPUDynInstPtr ii)
{
- if (ii->isLocalMem()) {
+ if (ii->isLocalMem() ||
+ (ii->isFlat() && ii->executedAs() == Enums::SC_GROUP)) {
+ return true;
+ }
+
+ return false;
+}
+
+bool
+Wavefront::isOldestInstWaitcnt()
+{
+ if (instructionBuffer.empty())
+ return false;
+
+ GPUDynInstPtr ii = instructionBuffer.front();
+
+ if (ii->isWaitcnt()) {
+ // waitcnt is a scalar
+ assert(ii->isScalar());
+ return true;
+ }
+
+ return false;
+}
+
+bool
+Wavefront::isOldestInstScalarALU()
+{
+ assert(!instructionBuffer.empty());
+ GPUDynInstPtr ii = instructionBuffer.front();
+
+ if (status != S_STOPPED && ii->isScalar() && (ii->isNop() || ii->isReturn()
+ || ii->isEndOfKernel() || ii->isBranch() || ii->isALU() ||
+ (ii->isKernArgSeg() && ii->isLoad()))) {
return true;
}
}
bool
-Wavefront::isOldestInstALU()
+Wavefront::isOldestInstVectorALU()
{
assert(!instructionBuffer.empty());
GPUDynInstPtr ii = instructionBuffer.front();
- if (status != S_STOPPED && (ii->isNop() ||
- ii->isReturn() || ii->isBranch() ||
- ii->isALU() || (ii->isKernArgSeg() && ii->isLoad()))) {
+ if (status != S_STOPPED && !ii->isScalar() && (ii->isNop() ||
+ ii->isReturn() || ii->isBranch() || ii->isALU() || ii->isEndOfKernel()
+ || (ii->isKernArgSeg() && ii->isLoad()))) {
return true;
}
assert(!instructionBuffer.empty());
GPUDynInstPtr ii = instructionBuffer.front();
- if (status != S_STOPPED && ii->isGlobalMem()) {
+ if (status != S_STOPPED && !ii->isScalar() && ii->isGlobalMem()) {
+ return true;
+ }
+
+ return false;
+}
+
+bool
+Wavefront::isOldestInstScalarMem()
+{
+ assert(!instructionBuffer.empty());
+ GPUDynInstPtr ii = instructionBuffer.front();
+
+ if (status != S_STOPPED && ii->isScalar() && ii->isGlobalMem()) {
return true;
}
return false;
}
-// Return true if the Wavefront's instruction
-// buffer has branch instruction.
bool
-Wavefront::instructionBufferHasBranch()
+Wavefront::stopFetch()
{
for (auto it : instructionBuffer) {
GPUDynInstPtr ii = it;
-
- if (ii->isReturn() || ii->isBranch()) {
+ if (ii->isReturn() || ii->isBranch() ||
+ ii->isEndOfKernel()) {
return true;
}
}
return false;
}
-// Remap HSAIL register to physical VGPR.
-// HSAIL register = virtual register assigned to an operand by HLC compiler
-uint32_t
-Wavefront::remap(uint32_t vgprIndex, uint32_t size, uint8_t mode)
+void
+Wavefront::freeResources()
{
- assert((vgprIndex < reservedVectorRegs) && (reservedVectorRegs > 0));
- // add the offset from where the VGPRs of the wavefront have been assigned
- uint32_t physicalVgprIndex = startVgprIndex + vgprIndex;
- // HSAIL double precision (DP) register: calculate the physical VGPR index
- // assuming that DP registers are placed after SP ones in the VRF. The DP
- // and SP VGPR name spaces in HSAIL mode are separate so we need to adjust
- // the DP VGPR index before mapping it to the physical VRF address space
- if (mode == 1 && size > 4) {
- physicalVgprIndex = startVgprIndex + maxSpVgprs + (2 * vgprIndex);
- }
-
- assert((startVgprIndex <= physicalVgprIndex) &&
- (startVgprIndex + reservedVectorRegs - 1) >= physicalVgprIndex);
-
- // calculate absolute physical VGPR index
- return physicalVgprIndex % computeUnit->vrf[simdId]->numRegs();
+ execUnitId = -1;
}
-// Return true if this wavefront is ready
-// to execute an instruction of the specified type.
-int
-Wavefront::ready(itype_e type)
+void
+Wavefront::validateRequestCounters()
{
- // Check to make sure wave is running
- if (status == S_STOPPED || status == S_RETURNING ||
- instructionBuffer.empty()) {
- return 0;
- }
-
- // Is the wave waiting at a barrier
- if (stalledAtBarrier) {
- if (!computeUnit->AllAtBarrier(barrierId,barrierCnt,
- computeUnit->getRefCounter(dispatchId, wgId))) {
- // Are all threads at barrier?
- return 0;
- }
- oldBarrierCnt = barrierCnt;
- stalledAtBarrier = false;
- }
-
- // Read instruction
- GPUDynInstPtr ii = instructionBuffer.front();
+ panic_if(wrGmReqsInPipe < 0 || rdGmReqsInPipe < 0 ||
+ wrLmReqsInPipe < 0 || rdLmReqsInPipe < 0 ||
+ outstandingReqs < 0,
+ "Negative requests in pipe for WF%d for slot%d"
+ " and SIMD%d: Rd GlobalMem Reqs=%d, Wr GlobalMem Reqs=%d,"
+ " Rd LocalMem Reqs=%d, Wr LocalMem Reqs=%d,"
+ " Outstanding Reqs=%d\n",
+ wfDynId, wfSlotId, simdId, rdGmReqsInPipe, wrGmReqsInPipe,
+ rdLmReqsInPipe, wrLmReqsInPipe, outstandingReqs);
+}
- bool ready_inst M5_VAR_USED = false;
- bool glbMemBusRdy = false;
- bool glbMemIssueRdy = false;
- if (type == I_GLOBAL || type == I_FLAT || type == I_PRIVATE) {
- for (int j=0; j < computeUnit->numGlbMemUnits; ++j) {
- if (computeUnit->vrfToGlobalMemPipeBus[j].prerdy())
- glbMemBusRdy = true;
- if (computeUnit->wfWait[j].prerdy())
- glbMemIssueRdy = true;
+void
+Wavefront::reserveGmResource(GPUDynInstPtr ii)
+{
+ if (!ii->isScalar()) {
+ if (ii->isLoad()) {
+ rdGmReqsInPipe++;
+ } else if (ii->isStore()) {
+ wrGmReqsInPipe++;
+ } else if (ii->isAtomic() || ii->isMemSync()) {
+ rdGmReqsInPipe++;
+ wrGmReqsInPipe++;
+ } else {
+ panic("Invalid memory operation!\n");
}
- }
- bool locMemBusRdy = false;
- bool locMemIssueRdy = false;
- if (type == I_SHARED || type == I_FLAT) {
- for (int j=0; j < computeUnit->numLocMemUnits; ++j) {
- if (computeUnit->vrfToLocalMemPipeBus[j].prerdy())
- locMemBusRdy = true;
- if (computeUnit->wfWait[j].prerdy())
- locMemIssueRdy = true;
+ execUnitId = globalMem;
+ } else {
+ if (ii->isLoad()) {
+ scalarRdGmReqsInPipe++;
+ } else if (ii->isStore()) {
+ scalarWrGmReqsInPipe++;
+ } else if (ii->isAtomic() || ii->isMemSync()) {
+ scalarWrGmReqsInPipe++;
+ scalarRdGmReqsInPipe++;
+ } else {
+ panic("Invalid memory operation!\n");
}
+ execUnitId = scalarMem;
}
+}
- // The following code is very error prone and the entire process for
- // checking readiness will be fixed eventually. In the meantime, let's
- // make sure that we do not silently let an instruction type slip
- // through this logic and always return not ready.
- if (!(ii->isBarrier() || ii->isNop() || ii->isReturn() || ii->isBranch() ||
- ii->isALU() || ii->isLoad() || ii->isStore() || ii->isAtomic() ||
- ii->isMemFence() || ii->isFlat())) {
- panic("next instruction: %s is of unknown type\n", ii->disassemble());
- }
-
- DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Checking Read for Inst : %s\n",
- computeUnit->cu_id, simdId, wfSlotId, ii->disassemble());
-
- if (type == I_ALU && ii->isBarrier()) {
- // Here for ALU instruction (barrier)
- if (!computeUnit->wfWait[simdId].prerdy()) {
- // Is wave slot free?
- return 0;
- }
-
- // Are there in pipe or outstanding memory requests?
- if ((outstandingReqs + memReqsInPipe) > 0) {
- return 0;
- }
-
- ready_inst = true;
- } else if (type == I_ALU && ii->isNop()) {
- // Here for ALU instruction (nop)
- if (!computeUnit->wfWait[simdId].prerdy()) {
- // Is wave slot free?
- return 0;
- }
-
- ready_inst = true;
- } else if (type == I_ALU && ii->isReturn()) {
- // Here for ALU instruction (return)
- if (!computeUnit->wfWait[simdId].prerdy()) {
- // Is wave slot free?
- return 0;
- }
-
- // Are there in pipe or outstanding memory requests?
- if ((outstandingReqs + memReqsInPipe) > 0) {
- return 0;
- }
-
- ready_inst = true;
- } else if (type == I_ALU && (ii->isBranch() ||
- ii->isALU() ||
- (ii->isKernArgSeg() && ii->isLoad()) ||
- ii->isArgSeg())) {
- // Here for ALU instruction (all others)
- if (!computeUnit->wfWait[simdId].prerdy()) {
- // Is alu slot free?
- return 0;
- }
- if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
- VrfAccessType::RD_WR)) {
- return 0;
- }
-
- if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
- return 0;
- }
- ready_inst = true;
- } else if (type == I_GLOBAL && ii->isGlobalMem()) {
- // Here Global memory instruction
- if (ii->isLoad() || ii->isAtomic() || ii->isMemFence()) {
- // Are there in pipe or outstanding global memory write requests?
- if ((outstandingReqsWrGm + wrGmReqsInPipe) > 0) {
- return 0;
- }
- }
-
- if (ii->isStore() || ii->isAtomic() || ii->isMemFence()) {
- // Are there in pipe or outstanding global memory read requests?
- if ((outstandingReqsRdGm + rdGmReqsInPipe) > 0)
- return 0;
- }
-
- if (!glbMemIssueRdy) {
- // Is WV issue slot free?
- return 0;
- }
-
- if (!glbMemBusRdy) {
- // Is there an available VRF->Global memory read bus?
- return 0;
- }
-
- // Does the coalescer have space for our instruction?
- if (!computeUnit->globalMemoryPipe.coalescerReady(ii)) {
- return 0;
- }
-
- if (!computeUnit->globalMemoryPipe.
- isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) {
- // Can we insert a new request to the Global Mem Request FIFO?
- return 0;
- }
- // can we schedule source & destination operands on the VRF?
- if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
- VrfAccessType::RD_WR)) {
- return 0;
- }
- if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
- return 0;
- }
- ready_inst = true;
- } else if (type == I_SHARED && ii->isLocalMem()) {
- // Here for Shared memory instruction
- if (ii->isLoad() || ii->isAtomic() || ii->isMemFence()) {
- if ((outstandingReqsWrLm + wrLmReqsInPipe) > 0) {
- return 0;
- }
- }
-
- if (ii->isStore() || ii->isAtomic() || ii->isMemFence()) {
- if ((outstandingReqsRdLm + rdLmReqsInPipe) > 0) {
- return 0;
- }
- }
-
- if (!locMemBusRdy) {
- // Is there an available VRF->LDS read bus?
- return 0;
- }
- if (!locMemIssueRdy) {
- // Is wave slot free?
- return 0;
- }
-
- if (!computeUnit->localMemoryPipe.
- isLMReqFIFOWrRdy(rdLmReqsInPipe + wrLmReqsInPipe)) {
- // Can we insert a new request to the LDS Request FIFO?
- return 0;
- }
- // can we schedule source & destination operands on the VRF?
- if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
- VrfAccessType::RD_WR)) {
- return 0;
- }
- if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
- return 0;
- }
- ready_inst = true;
- } else if (type == I_FLAT && ii->isFlat()) {
- if (!glbMemBusRdy) {
- // Is there an available VRF->Global memory read bus?
- return 0;
- }
-
- if (!locMemBusRdy) {
- // Is there an available VRF->LDS read bus?
- return 0;
- }
-
- if (!glbMemIssueRdy) {
- // Is wave slot free?
- return 0;
- }
-
- if (!locMemIssueRdy) {
- return 0;
- }
-
- // Does the coalescer have space for our instruction?
- if (!computeUnit->globalMemoryPipe.coalescerReady(ii)) {
- return 0;
- }
-
- if (!computeUnit->globalMemoryPipe.
- isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) {
- // Can we insert a new request to the Global Mem Request FIFO?
- return 0;
- }
-
- if (!computeUnit->localMemoryPipe.
- isLMReqFIFOWrRdy(rdLmReqsInPipe + wrLmReqsInPipe)) {
- // Can we insert a new request to the LDS Request FIFO?
- return 0;
- }
- // can we schedule source & destination operands on the VRF?
- if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
- VrfAccessType::RD_WR)) {
- return 0;
- }
- // are all the operands ready? (RAW, WAW and WAR depedencies met?)
- if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
- return 0;
- }
- ready_inst = true;
+void
+Wavefront::reserveLmResource(GPUDynInstPtr ii)
+{
+ fatal_if(ii->isScalar(),
+ "Scalar instructions can not access Shared memory!!!");
+ if (ii->isLoad()) {
+ rdLmReqsInPipe++;
+ } else if (ii->isStore()) {
+ wrLmReqsInPipe++;
+ } else if (ii->isAtomic() || ii->isMemSync()) {
+ wrLmReqsInPipe++;
+ rdLmReqsInPipe++;
} else {
- return 0;
+ panic("Invalid memory operation!\n");
}
-
- assert(ready_inst);
-
- DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Ready Inst : %s\n", computeUnit->cu_id,
- simdId, wfSlotId, ii->disassemble());
- return 1;
+ execUnitId = localMem;
}
-void
-Wavefront::updateResources()
+std::vector<int>
+Wavefront::reserveResources()
{
+ // vector of execution unit IDs to return to schedule stage
+ // this return is only used for debugging and an assertion...
+ std::vector<int> execUnitIds;
+
// Get current instruction
GPUDynInstPtr ii = instructionBuffer.front();
assert(ii);
- computeUnit->vrf[simdId]->updateResources(this, ii);
+
// Single precision ALU or Branch or Return or Special instruction
if (ii->isALU() || ii->isSpecialOp() ||
- ii->isBranch() ||
- // FIXME: Kernel argument loads are currently treated as ALU operations
- // since we don't send memory packets at execution. If we fix that then
- // we should map them to one of the memory pipelines
+ ii->isBranch() || ii->isNop() ||
(ii->isKernArgSeg() && ii->isLoad()) || ii->isArgSeg() ||
- ii->isReturn()) {
- computeUnit->aluPipe[simdId].preset(computeUnit->shader->
- ticks(computeUnit->spBypassLength()));
- // this is to enforce a fixed number of cycles per issue slot per SIMD
- computeUnit->wfWait[simdId].preset(computeUnit->shader->
- ticks(computeUnit->issuePeriod));
- } else if (ii->isBarrier()) {
- computeUnit->wfWait[simdId].preset(computeUnit->shader->
- ticks(computeUnit->issuePeriod));
- } else if (ii->isLoad() && ii->isFlat()) {
- assert(Enums::SC_NONE != ii->executedAs());
- memReqsInPipe++;
- rdGmReqsInPipe++;
- if ( Enums::SC_SHARED == ii->executedAs() ) {
- computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
- preset(computeUnit->shader->ticks(4));
- computeUnit->wfWait[computeUnit->ShrMemUnitId()].
- preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
- } else {
- computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
- preset(computeUnit->shader->ticks(4));
- computeUnit->wfWait[computeUnit->GlbMemUnitId()].
- preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
- }
- } else if (ii->isStore() && ii->isFlat()) {
- assert(Enums::SC_NONE != ii->executedAs());
- memReqsInPipe++;
- wrGmReqsInPipe++;
- if (Enums::SC_SHARED == ii->executedAs()) {
- computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
- preset(computeUnit->shader->ticks(8));
- computeUnit->wfWait[computeUnit->ShrMemUnitId()].
- preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
+ ii->isReturn() || ii->isEndOfKernel()) {
+ if (!ii->isScalar()) {
+ execUnitId = simdId;
} else {
- computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
- preset(computeUnit->shader->ticks(8));
- computeUnit->wfWait[computeUnit->GlbMemUnitId()].
- preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
+ execUnitId = scalarAluGlobalIdx;
}
- } else if (ii->isLoad() && ii->isGlobalMem()) {
- memReqsInPipe++;
- rdGmReqsInPipe++;
- computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
- preset(computeUnit->shader->ticks(4));
- computeUnit->wfWait[computeUnit->GlbMemUnitId()].
- preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
- } else if (ii->isStore() && ii->isGlobalMem()) {
- memReqsInPipe++;
- wrGmReqsInPipe++;
- computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
- preset(computeUnit->shader->ticks(8));
- computeUnit->wfWait[computeUnit->GlbMemUnitId()].
- preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
- } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isGlobalMem()) {
- memReqsInPipe++;
- wrGmReqsInPipe++;
- rdGmReqsInPipe++;
- computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
- preset(computeUnit->shader->ticks(8));
- computeUnit->wfWait[computeUnit->GlbMemUnitId()].
- preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
- } else if (ii->isLoad() && ii->isLocalMem()) {
- memReqsInPipe++;
- rdLmReqsInPipe++;
- computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
- preset(computeUnit->shader->ticks(4));
- computeUnit->wfWait[computeUnit->ShrMemUnitId()].
- preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
- } else if (ii->isStore() && ii->isLocalMem()) {
- memReqsInPipe++;
- wrLmReqsInPipe++;
- computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
- preset(computeUnit->shader->ticks(8));
- computeUnit->wfWait[computeUnit->ShrMemUnitId()].
- preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
- } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isLocalMem()) {
- memReqsInPipe++;
- wrLmReqsInPipe++;
- rdLmReqsInPipe++;
- computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
- preset(computeUnit->shader->ticks(8));
- computeUnit->wfWait[computeUnit->ShrMemUnitId()].
- preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
+ // this is to enforce a fixed number of cycles per issue slot per SIMD
+ } else if (ii->isBarrier()) {
+ execUnitId = ii->isScalar() ? scalarAluGlobalIdx : simdId;
+ } else if (ii->isFlat()) {
+ assert(!ii->isScalar());
+ reserveLmResource(ii);
+        // add the execUnitId reserved by reserveLmResource to the list
+        // before it is overwritten by reserveGmResource
+ execUnitIds.push_back(execUnitId);
+ flatLmUnitId = execUnitId;
+ reserveGmResource(ii);
+ flatGmUnitId = execUnitId;
+ execUnitIds.push_back(flatGmUnitId);
+ execUnitId = -1;
+ } else if (ii->isGlobalMem()) {
+ reserveGmResource(ii);
+ } else if (ii->isLocalMem()) {
+ reserveLmResource(ii);
+ } else if (ii->isPrivateSeg()) {
+ fatal_if(ii->isScalar(),
+ "Scalar instructions cannot access private memory!");
+ reserveGmResource(ii);
+ } else {
+ panic("reserveResources -> Couldn't process op!\n");
+ }
+
+ if (execUnitId != -1) {
+ execUnitIds.push_back(execUnitId);
}
+ assert(execUnitIds.size());
+ return execUnitIds;
}
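reserveResources() returns the set of execution-unit IDs the scheduler must mark busy; for a FLAT instruction both the local-memory and global-memory pipes are pushed. A minimal standalone sketch of a consumer of that vector, with hypothetical names not taken from this change:

    #include <cassert>
    #include <vector>

    // mark every reserved execution pipe busy (hypothetical helper)
    void markUnitsBusy(std::vector<bool> &unitBusy,
                       const std::vector<int> &reservedUnits)
    {
        assert(!reservedUnits.empty());
        for (int unit : reservedUnits) {
            assert(unit >= 0 && unit < (int)unitBusy.size());
            // a FLAT instruction contributes two IDs here: the LM pipe
            // followed by the GM pipe
            unitBusy[unit] = true;
        }
    }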
void
// ---- Exit if wavefront is inactive ----------------------------- //
if (status == S_STOPPED || status == S_RETURNING ||
- instructionBuffer.empty()) {
+ status == S_STALLED || instructionBuffer.empty()) {
return;
}
+ if (status == S_WAITCNT) {
+ /**
+ * If this wave is in the S_WAITCNT state, it should enter exec()
+ * exactly once before its waitcnts are satisfied, in order to
+ * execute the waitcnt instruction itself. We therefore assert
+ * that the waitcnt is the oldest instruction; entering exec()
+ * with active waitcnts while not executing the waitcnt
+ * instruction indicates a bug.
+ */
+ assert(isOldestInstWaitcnt());
+ }
+
// Get current instruction
GPUDynInstPtr ii = instructionBuffer.front();
- const uint32_t old_pc = pc();
+ const Addr old_pc = pc();
DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
- "(pc: %i)\n", computeUnit->cu_id, simdId, wfSlotId, wfDynId,
- ii->disassemble(), old_pc);
-
- // update the instruction stats in the CU
+ "(pc: %#x; seqNum: %d)\n", computeUnit->cu_id, simdId, wfSlotId,
+ wfDynId, ii->disassemble(), old_pc, ii->seqNum());
ii->execute(ii);
+ // delete the dynamic instruction from the pipeline map
+ computeUnit->deleteFromPipeMap(this);
+ // update the instruction stats in the CU
computeUnit->updateInstStats(ii);
- // access the VRF
- computeUnit->vrf[simdId]->exec(ii, this);
- srcRegOpDist.sample(ii->numSrcRegOperands());
- dstRegOpDist.sample(ii->numDstRegOperands());
+
+ // inform VRF of instruction execution to schedule write-back
+ // and scoreboard ready for registers
+ if (!ii->isScalar()) {
+ computeUnit->vrf[simdId]->waveExecuteInst(this, ii);
+ }
+ computeUnit->srf[simdId]->waveExecuteInst(this, ii);
+
+ computeUnit->shader->vectorInstSrcOperand[ii->numSrcVecOperands()]++;
+ computeUnit->shader->vectorInstDstOperand[ii->numDstVecOperands()]++;
computeUnit->numInstrExecuted++;
+ numInstrExecuted++;
+ computeUnit->instExecPerSimd[simdId]++;
computeUnit->execRateDist.sample(computeUnit->totalCycles.value() -
computeUnit->lastExecCycle[simdId]);
computeUnit->lastExecCycle[simdId] = computeUnit->totalCycles.value();
- if (pc() == old_pc) {
- uint32_t new_pc = _gpuISA.advancePC(old_pc, ii);
- // PC not modified by instruction, proceed to next or pop frame
- pc(new_pc);
- if (new_pc == rpc()) {
- popFromReconvergenceStack();
- discardFetch();
- } else {
- instructionBuffer.pop_front();
+
+ if (lastInstExec) {
+ computeUnit->instInterleave[simdId].
+ sample(computeUnit->instExecPerSimd[simdId] - lastInstExec);
+ }
+ lastInstExec = computeUnit->instExecPerSimd[simdId];
+
+ // want to track:
+ // number of reads that occur per value written
+
+ // vector RAW dependency tracking
+ for (int i = 0; i < ii->getNumOperands(); i++) {
+ if (ii->isVectorRegister(i)) {
+ int vgpr = ii->getRegisterIndex(i, ii);
+ int nReg = ii->getOperandSize(i) <= 4 ? 1 :
+ ii->getOperandSize(i) / 4;
+ for (int n = 0; n < nReg; n++) {
+ if (ii->isSrcOperand(i)) {
+ // This check should never fail, but to be safe we check
+ if (rawDist.find(vgpr+n) != rawDist.end()) {
+ vecRawDistance.
+ sample(numInstrExecuted.value() - rawDist[vgpr+n]);
+ }
+ // increment number of reads to this register
+ vecReads[vgpr+n]++;
+ } else if (ii->isDstOperand(i)) {
+ // rawDist is set on writes, but will not be set
+ // for the first write to each physical register
+ if (rawDist.find(vgpr+n) != rawDist.end()) {
+ // sample the number of reads that were performed
+ readsPerWrite.sample(vecReads[vgpr+n]);
+ }
+ // on a write, reset count of reads to 0
+ vecReads[vgpr+n] = 0;
+
+ rawDist[vgpr+n] = numInstrExecuted.value();
+ }
+ }
}
+ }
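To make the bookkeeping above concrete: if a VGPR is written by dynamic instruction 10 and read by instructions 12 and 15, vecRawDistance samples 2 and 5, and the next write to that VGPR samples 2 into readsPerWrite. A standalone sketch of that logic (illustration only, not part of the patch):

    #include <cstdint>
    #include <unordered_map>
    #include <vector>

    std::unordered_map<int, uint64_t> lastWrite; // reg -> dyn inst of last write
    std::vector<int> readsSinceWrite(256, 0);    // reads since last write, per reg

    void onSrcOperand(int reg, uint64_t dynInstId)
    {
        auto it = lastWrite.find(reg);
        if (it != lastWrite.end()) {
            uint64_t rawDistance = dynInstId - it->second; // -> vecRawDistance
            (void)rawDistance;
        }
        ++readsSinceWrite[reg];
    }

    void onDstOperand(int reg, uint64_t dynInstId)
    {
        if (lastWrite.count(reg)) {
            int reads = readsSinceWrite[reg]; // -> readsPerWrite
            (void)reads;
        }
        readsSinceWrite[reg] = 0;
        lastWrite[reg] = dynInstId;
    }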
+
+ if (pc() == old_pc) {
+ // PC not modified by instruction, proceed to next
+ _gpuISA.advancePC(ii);
+ instructionBuffer.pop_front();
} else {
+ DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave%d %s taken branch\n",
+ computeUnit->cu_id, simdId, wfSlotId, wfDynId,
+ ii->disassemble());
discardFetch();
}
+ DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] (pc: %#x)\n",
+ computeUnit->cu_id, simdId, wfSlotId, wfDynId, pc());
if (computeUnit->shader->hsail_mode==Shader::SIMT) {
const int num_active_lanes = execMask().count();
computeUnit->controlFlowDivergenceDist.sample(num_active_lanes);
computeUnit->numVecOpsExecuted += num_active_lanes;
+
+ if (ii->isF16() && ii->isALU()) {
+ if (ii->isF32() || ii->isF64()) {
+ fatal("Instruction is tagged as both (1) F16, and (2) "
+ "either F32 or F64.");
+ }
+ computeUnit->numVecOpsExecutedF16 += num_active_lanes;
+ if (ii->isFMA()) {
+ computeUnit->numVecOpsExecutedFMA16 += num_active_lanes;
+ computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
+ }
+ else if (ii->isMAC()) {
+ computeUnit->numVecOpsExecutedMAC16 += num_active_lanes;
+ computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
+ }
+ else if (ii->isMAD()) {
+ computeUnit->numVecOpsExecutedMAD16 += num_active_lanes;
+ computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
+ }
+ }
+ if (ii->isF32() && ii->isALU()) {
+ if (ii->isF16() || ii->isF64()) {
+ fatal("Instruction is tagged as both (1) F32, and (2) "
+ "either F16 or F64.");
+ }
+ computeUnit->numVecOpsExecutedF32 += num_active_lanes;
+ if (ii->isFMA()) {
+ computeUnit->numVecOpsExecutedFMA32 += num_active_lanes;
+ computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
+ }
+ else if (ii->isMAC()) {
+ computeUnit->numVecOpsExecutedMAC32 += num_active_lanes;
+ computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
+ }
+ else if (ii->isMAD()) {
+ computeUnit->numVecOpsExecutedMAD32 += num_active_lanes;
+ computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
+ }
+ }
+ if (ii->isF64() && ii->isALU()) {
+ if (ii->isF16() || ii->isF32()) {
+ fatal("Instruction is tagged as both (1) F64, and (2) "
+ "either F16 or F32.");
+ }
+ computeUnit->numVecOpsExecutedF64 += num_active_lanes;
+ if (ii->isFMA()) {
+ computeUnit->numVecOpsExecutedFMA64 += num_active_lanes;
+ computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
+ }
+ else if (ii->isMAC()) {
+ computeUnit->numVecOpsExecutedMAC64 += num_active_lanes;
+ computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
+ }
+ else if (ii->isMAD()) {
+ computeUnit->numVecOpsExecutedMAD64 += num_active_lanes;
+ computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
+ }
+ }
if (isGmInstruction(ii)) {
computeUnit->activeLanesPerGMemInstrDist.sample(num_active_lanes);
} else if (isLmInstruction(ii)) {
}
}
- // ---- Update Vector ALU pipeline and other resources ------------------ //
+ /**
+ * we return here to avoid spurious errors related to flat insts
+ * and their address segment resolution.
+ */
+ if (execMask().none() && ii->isFlat()) {
+ computeUnit->getTokenManager()->recvTokens(1);
+ return;
+ }
+
+ // Update Vector ALU pipeline and other resources
+ bool flat_as_gm = false;
+ bool flat_as_lm = false;
+ if (ii->isFlat()) {
+ flat_as_gm = (ii->executedAs() == Enums::SC_GLOBAL) ||
+ (ii->executedAs() == Enums::SC_PRIVATE);
+ flat_as_lm = (ii->executedAs() == Enums::SC_GROUP);
+ }
+
// Single precision ALU or Branch or Return or Special instruction
+ // Note, we use the same timing regardless of SP or DP ALU operation.
if (ii->isALU() || ii->isSpecialOp() ||
- ii->isBranch() ||
- // FIXME: Kernel argument loads are currently treated as ALU operations
- // since we don't send memory packets at execution. If we fix that then
- // we should map them to one of the memory pipelines
+ ii->isBranch() || ii->isNop() ||
(ii->isKernArgSeg() && ii->isLoad()) ||
- ii->isArgSeg() ||
- ii->isReturn()) {
- computeUnit->aluPipe[simdId].set(computeUnit->shader->
- ticks(computeUnit->spBypassLength()));
-
+ ii->isArgSeg() || ii->isEndOfKernel() || ii->isReturn()) {
// this is to enforce a fixed number of cycles per issue slot per SIMD
- computeUnit->wfWait[simdId].set(computeUnit->shader->
- ticks(computeUnit->issuePeriod));
+ if (!ii->isScalar()) {
+ computeUnit->vectorALUs[simdId].set(computeUnit->
+ cyclesToTicks(computeUnit->issuePeriod));
+ } else {
+ computeUnit->scalarALUs[scalarAlu].set(computeUnit->
+ cyclesToTicks(computeUnit->issuePeriod));
+ }
+ // Barrier on Scalar ALU
} else if (ii->isBarrier()) {
- computeUnit->wfWait[simdId].set(computeUnit->shader->
- ticks(computeUnit->issuePeriod));
- } else if (ii->isLoad() && ii->isFlat()) {
- assert(Enums::SC_NONE != ii->executedAs());
-
- if (Enums::SC_SHARED == ii->executedAs()) {
- computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
- set(computeUnit->shader->ticks(4));
- computeUnit->wfWait[computeUnit->ShrMemUnitId()].
- set(computeUnit->shader->ticks(computeUnit->issuePeriod));
+ computeUnit->scalarALUs[scalarAlu].set(computeUnit->
+ cyclesToTicks(computeUnit->issuePeriod));
+ // GM or Flat as GM Load
+ } else if (ii->isLoad() && (ii->isGlobalMem() || flat_as_gm)) {
+ if (!ii->isScalar()) {
+ computeUnit->vrfToGlobalMemPipeBus.set(
+ computeUnit->cyclesToTicks(computeUnit->vrf_gm_bus_latency));
+ computeUnit->vectorGlobalMemUnit.
+ set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
+ computeUnit->instCyclesVMemPerSimd[simdId] +=
+ computeUnit->vrf_gm_bus_latency;
} else {
- computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
- set(computeUnit->shader->ticks(4));
- computeUnit->wfWait[computeUnit->GlbMemUnitId()].
- set(computeUnit->shader->ticks(computeUnit->issuePeriod));
+ computeUnit->srfToScalarMemPipeBus.set(computeUnit->
+ cyclesToTicks(computeUnit->srf_scm_bus_latency));
+ computeUnit->scalarMemUnit.
+ set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
+ computeUnit->instCyclesScMemPerSimd[simdId] +=
+ computeUnit->srf_scm_bus_latency;
}
- } else if (ii->isStore() && ii->isFlat()) {
- assert(Enums::SC_NONE != ii->executedAs());
- if (Enums::SC_SHARED == ii->executedAs()) {
- computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
- set(computeUnit->shader->ticks(8));
- computeUnit->wfWait[computeUnit->ShrMemUnitId()].
- set(computeUnit->shader->ticks(computeUnit->issuePeriod));
+ // GM or Flat as GM Store
+ } else if (ii->isStore() && (ii->isGlobalMem() || flat_as_gm)) {
+ if (!ii->isScalar()) {
+ computeUnit->vrfToGlobalMemPipeBus.set(computeUnit->
+ cyclesToTicks(Cycles(2 * computeUnit->vrf_gm_bus_latency)));
+ computeUnit->vectorGlobalMemUnit.
+ set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
+ computeUnit->instCyclesVMemPerSimd[simdId] +=
+ (2 * computeUnit->vrf_gm_bus_latency);
} else {
- computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
- set(computeUnit->shader->ticks(8));
- computeUnit->wfWait[computeUnit->GlbMemUnitId()].
- set(computeUnit->shader->ticks(computeUnit->issuePeriod));
+ computeUnit->srfToScalarMemPipeBus.set(computeUnit->
+ cyclesToTicks(Cycles(2 * computeUnit->srf_scm_bus_latency)));
+ computeUnit->scalarMemUnit.
+ set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
+ computeUnit->instCyclesScMemPerSimd[simdId] +=
+ (2 * computeUnit->srf_scm_bus_latency);
}
- } else if (ii->isLoad() && ii->isGlobalMem()) {
- computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
- set(computeUnit->shader->ticks(4));
- computeUnit->wfWait[computeUnit->GlbMemUnitId()].
- set(computeUnit->shader->ticks(computeUnit->issuePeriod));
- } else if (ii->isStore() && ii->isGlobalMem()) {
- computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
- set(computeUnit->shader->ticks(8));
- computeUnit->wfWait[computeUnit->GlbMemUnitId()].
- set(computeUnit->shader->ticks(computeUnit->issuePeriod));
- } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isGlobalMem()) {
- computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
- set(computeUnit->shader->ticks(8));
- computeUnit->wfWait[computeUnit->GlbMemUnitId()].
- set(computeUnit->shader->ticks(computeUnit->issuePeriod));
- } else if (ii->isLoad() && ii->isLocalMem()) {
- computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
- set(computeUnit->shader->ticks(4));
- computeUnit->wfWait[computeUnit->ShrMemUnitId()].
- set(computeUnit->shader->ticks(computeUnit->issuePeriod));
- } else if (ii->isStore() && ii->isLocalMem()) {
- computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
- set(computeUnit->shader->ticks(8));
- computeUnit->wfWait[computeUnit->ShrMemUnitId()].
- set(computeUnit->shader->ticks(computeUnit->issuePeriod));
- } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isLocalMem()) {
- computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
- set(computeUnit->shader->ticks(8));
- computeUnit->wfWait[computeUnit->ShrMemUnitId()].
- set(computeUnit->shader->ticks(computeUnit->issuePeriod));
+ } else if ((ii->isAtomic() || ii->isMemSync()) &&
+ (ii->isGlobalMem() || flat_as_gm)) {
+ if (!ii->isScalar()) {
+ computeUnit->vrfToGlobalMemPipeBus.set(computeUnit->
+ cyclesToTicks(Cycles(2 * computeUnit->vrf_gm_bus_latency)));
+ computeUnit->vectorGlobalMemUnit.
+ set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
+ computeUnit->instCyclesVMemPerSimd[simdId] +=
+ (2 * computeUnit->vrf_gm_bus_latency);
+ } else {
+ computeUnit->srfToScalarMemPipeBus.set(computeUnit->
+ cyclesToTicks(Cycles(2 * computeUnit->srf_scm_bus_latency)));
+ computeUnit->scalarMemUnit.
+ set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
+ computeUnit->instCyclesScMemPerSimd[simdId] +=
+ (2 * computeUnit->srf_scm_bus_latency);
+ }
+ // LM or Flat as LM Load
+ } else if (ii->isLoad() && (ii->isLocalMem() || flat_as_lm)) {
+ computeUnit->vrfToLocalMemPipeBus.set(computeUnit->
+ cyclesToTicks(computeUnit->vrf_lm_bus_latency));
+ computeUnit->vectorSharedMemUnit.
+ set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
+ computeUnit->instCyclesLdsPerSimd[simdId] +=
+ computeUnit->vrf_lm_bus_latency;
+ // LM or Flat as LM Store
+ } else if (ii->isStore() && (ii->isLocalMem() || flat_as_lm)) {
+ computeUnit->vrfToLocalMemPipeBus.set(computeUnit->
+ cyclesToTicks(Cycles(2 * computeUnit->vrf_lm_bus_latency)));
+ computeUnit->vectorSharedMemUnit.
+ set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
+ computeUnit->instCyclesLdsPerSimd[simdId] +=
+ (2 * computeUnit->vrf_lm_bus_latency);
+ // LM or Flat as LM, Atomic or MemFence
+ } else if ((ii->isAtomic() || ii->isMemSync()) &&
+ (ii->isLocalMem() || flat_as_lm)) {
+ computeUnit->vrfToLocalMemPipeBus.set(computeUnit->
+ cyclesToTicks(Cycles(2 * computeUnit->vrf_lm_bus_latency)));
+ computeUnit->vectorSharedMemUnit.
+ set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
+ computeUnit->instCyclesLdsPerSimd[simdId] +=
+ (2 * computeUnit->vrf_lm_bus_latency);
+ } else {
+ panic("Bad instruction type!\n");
}
}
return barCnt[lane] < maxBarCnt;
}
-void
-Wavefront::pushToReconvergenceStack(uint32_t pc, uint32_t rpc,
- const VectorMask& mask)
+GPUDynInstPtr
+Wavefront::nextInstr()
{
- assert(mask.count());
- reconvergenceStack.emplace_back(new ReconvergenceStackEntry{pc, rpc, mask});
+ // Read next instruction from instruction buffer
+ GPUDynInstPtr ii = instructionBuffer.front();
+ // if the WF has been dispatched in the schedule stage then
+ // check the next oldest instruction for readiness
+ if (computeUnit->pipeMap.find(ii->seqNum()) !=
+ computeUnit->pipeMap.end()) {
+ if (instructionBuffer.size() > 1) {
+ auto it = instructionBuffer.begin() + 1;
+ return *it;
+ } else { // No new instructions to check
+ return nullptr;
+ }
+ }
+ return ii;
}
void
-Wavefront::popFromReconvergenceStack()
+Wavefront::discardFetch()
{
- assert(!reconvergenceStack.empty());
+ instructionBuffer.clear();
+ dropFetch |= pendingFetch;
- DPRINTF(WavefrontStack, "[%2d, %2d, %2d, %2d] %s %3i => ",
- computeUnit->cu_id, simdId, wfSlotId, wfDynId,
- execMask().to_string<char, std::string::traits_type,
- std::string::allocator_type>().c_str(), pc());
+ /**
+ * clear the fetch buffer for this wave in order to
+ * remove any stale inst data
+ */
+ computeUnit->fetchStage.fetchUnit(simdId).flushBuf(wfSlotId);
+}
- reconvergenceStack.pop_back();
+bool
+Wavefront::waitCntsSatisfied()
+{
+ // If vmWaitCnt, expWaitCnt, and lgkmWaitCnt are all uninitialized, the
+ // waitcnt instruction has been dispatched but not executed yet: the next
+ // instruction should be blocked until the waitcnt executes.
+ if (vmWaitCnt == -1 && expWaitCnt == -1 && lgkmWaitCnt == -1) {
+ return false;
+ }
- DPRINTF(WavefrontStack, "%3i %s\n", pc(),
- execMask().to_string<char, std::string::traits_type,
- std::string::allocator_type>().c_str());
+ // If we reach here, the waitcnt instruction has executed and the
+ // waitcnts were set by its execute method. Check whether the waitcnts
+ // are satisfied.
-}
+ // current number of vector memory ops in flight
+ int vm_cnt = outstandingReqsWrGm + outstandingReqsRdGm;
-void
-Wavefront::discardFetch()
-{
- instructionBuffer.clear();
- dropFetch |=pendingFetch;
-}
+ // current number of export insts or vector memory writes in flight
+ int exp_cnt = outstandingReqsWrGm;
-uint32_t
-Wavefront::pc() const
-{
- return reconvergenceStack.back()->pc;
+ // current number of scalar/LDS memory ops in flight
+ // we do not consider GDS/message ops
+ int lgkm_cnt = outstandingReqsWrLm + outstandingReqsRdLm +
+ scalarOutstandingReqsRdGm + scalarOutstandingReqsWrGm;
+
+ if (vmWaitCnt != -1) {
+ if (vm_cnt > vmWaitCnt) {
+ // vmWaitCnt not satisfied
+ return false;
+ }
+ }
+
+ if (expWaitCnt != -1) {
+ if (exp_cnt > expWaitCnt) {
+ // expWaitCnt not satisfied
+ return false;
+ }
+ }
+
+ if (lgkmWaitCnt != -1) {
+ if (lgkm_cnt > lgkmWaitCnt) {
+ // lgkmWaitCnt not satisfied
+ return false;
+ }
+ }
+
+ // if we get here all outstanding waitcnts must
+ // be satisfied, so we resume normal operation
+ clearWaitCnts();
+
+ return true;
}
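A worked example of the check above: after s_waitcnt vmcnt(0) executes, vmWaitCnt is 0, so the wave stays blocked while outstandingReqsRdGm + outstandingReqsWrGm > 0 and resumes once every vector memory request has returned. A standalone sketch of the per-counter test (illustration only; inside this function a counter of -1 simply means that counter is unused):

    // returns true when a single wait counter no longer blocks the wave
    bool counterSatisfied(int waitCnt, int inFlight)
    {
        if (waitCnt == -1) {
            return true;        // this counter was not set by the waitcnt
        }
        return inFlight <= waitCnt;
    }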
-uint32_t
-Wavefront::rpc() const
+void
+Wavefront::setWaitCnts(int vm_wait_cnt, int exp_wait_cnt, int lgkm_wait_cnt)
{
- return reconvergenceStack.back()->rpc;
+ // the scoreboard should have set the status
+ // to S_WAITCNT once a waitcnt instruction
+ // was marked as ready
+ assert(status == S_WAITCNT);
+
+ // waitcnt instruction shouldn't be sending
+ // negative counts
+ assert(vm_wait_cnt >= 0);
+ assert(exp_wait_cnt >= 0);
+ assert(lgkm_wait_cnt >= 0);
+ // the encoded count fields are limited in width: vm_cnt is
+ // 4 bits, exp_cnt is 3 bits, and lgkm_cnt is 5 bits
+ assert(vm_wait_cnt <= 0xf);
+ assert(exp_wait_cnt <= 0x7);
+ assert(lgkm_wait_cnt <= 0x1f);
+
+ /**
+ * prior waitcnts should be satisfied,
+ * at which time the WF resets them
+ * back to -1, indicating they are no
+ * longer active
+ */
+ assert(vmWaitCnt == -1);
+ assert(expWaitCnt == -1);
+ assert(lgkmWaitCnt == -1);
+
+ /**
+ * if the instruction encoding
+ * indicates a waitcnt of 0xf,
+ * that means the waitcnt is
+ * not being used
+ */
+ if (vm_wait_cnt != 0xf)
+ vmWaitCnt = vm_wait_cnt;
+
+ if (exp_wait_cnt != 0x7)
+ expWaitCnt = exp_wait_cnt;
+
+ if (lgkm_wait_cnt != 0x1f)
+ lgkmWaitCnt = lgkm_wait_cnt;
}
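The sentinel comparisons above (0xf, 0x7, 0x1f) correspond to the maximum value of each count field; a field left at its maximum means that counter is unused. A minimal decode sketch follows; the packed bit layout shown is an assumption for illustration and is not taken from this change:

    struct WaitCnts { int vm; int exp; int lgkm; };

    // hypothetical decode of a packed waitcnt immediate (bit positions assumed)
    WaitCnts decodeWaitCnt(unsigned imm)
    {
        WaitCnts w;
        w.vm   = imm & 0xf;          // 4-bit field; 0xf means "unused"
        w.exp  = (imm >> 4) & 0x7;   // 3-bit field; 0x7 means "unused"
        w.lgkm = (imm >> 8) & 0x1f;  // 5-bit field; 0x1f means "unused"
        return w;
    }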
-VectorMask
-Wavefront::execMask() const
+void
+Wavefront::clearWaitCnts()
{
- return reconvergenceStack.back()->execMask;
+ // reset the waitcnts back to
+ // -1, indicating they are no
+ // longer valid
+ vmWaitCnt = -1;
+ expWaitCnt = -1;
+ lgkmWaitCnt = -1;
+
+ // resume running normally
+ status = S_RUNNING;
}
-bool
-Wavefront::execMask(int lane) const
+Addr
+Wavefront::pc() const
{
- return reconvergenceStack.back()->execMask[lane];
+ return _pc;
}
-
void
-Wavefront::pc(uint32_t new_pc)
+Wavefront::pc(Addr new_pc)
{
- reconvergenceStack.back()->pc = new_pc;
+ _pc = new_pc;
}
-uint32_t
-Wavefront::getStaticContextSize() const
+VectorMask&
+Wavefront::execMask()
{
- return barCnt.size() * sizeof(int) + sizeof(wfId) + sizeof(maxBarCnt) +
- sizeof(oldBarrierCnt) + sizeof(barrierCnt) + sizeof(wgId) +
- sizeof(computeUnit->cu_id) + sizeof(barrierId) + sizeof(initMask) +
- sizeof(privBase) + sizeof(spillBase) + sizeof(ldsChunk) +
- computeUnit->wfSize() * sizeof(ReconvergenceStackEntry);
+ return _execMask;
}
-void
-Wavefront::getContext(const void *out)
-{
- uint8_t *iter = (uint8_t *)out;
- for (int i = 0; i < barCnt.size(); i++) {
- *(int *)iter = barCnt[i]; iter += sizeof(barCnt[i]);
- }
- *(int *)iter = wfId; iter += sizeof(wfId);
- *(int *)iter = maxBarCnt; iter += sizeof(maxBarCnt);
- *(int *)iter = oldBarrierCnt; iter += sizeof(oldBarrierCnt);
- *(int *)iter = barrierCnt; iter += sizeof(barrierCnt);
- *(int *)iter = computeUnit->cu_id; iter += sizeof(computeUnit->cu_id);
- *(uint32_t *)iter = wgId; iter += sizeof(wgId);
- *(uint32_t *)iter = barrierId; iter += sizeof(barrierId);
- *(uint64_t *)iter = initMask.to_ullong(); iter += sizeof(initMask.to_ullong());
- *(Addr *)iter = privBase; iter += sizeof(privBase);
- *(Addr *)iter = spillBase; iter += sizeof(spillBase);
-
- int stackSize = reconvergenceStack.size();
- ReconvergenceStackEntry empty = {std::numeric_limits<uint32_t>::max(),
- std::numeric_limits<uint32_t>::max(),
- std::numeric_limits<uint64_t>::max()};
- for (int i = 0; i < workItemId[0].size(); i++) {
- if (i < stackSize) {
- *(ReconvergenceStackEntry *)iter = *reconvergenceStack.back();
- iter += sizeof(ReconvergenceStackEntry);
- reconvergenceStack.pop_back();
- } else {
- *(ReconvergenceStackEntry *)iter = empty;
- iter += sizeof(ReconvergenceStackEntry);
- }
- }
-
- int wf_size = computeUnit->wfSize();
- for (int i = 0; i < maxSpVgprs; i++) {
- uint32_t vgprIdx = remap(i, sizeof(uint32_t), 1);
- for (int lane = 0; lane < wf_size; lane++) {
- uint32_t regVal = computeUnit->vrf[simdId]->
- read<uint32_t>(vgprIdx,lane);
- *(uint32_t *)iter = regVal; iter += sizeof(regVal);
- }
- }
-
- for (int i = 0; i < maxDpVgprs; i++) {
- uint32_t vgprIdx = remap(i, sizeof(uint64_t), 1);
- for (int lane = 0; lane < wf_size; lane++) {
- uint64_t regVal = computeUnit->vrf[simdId]->
- read<uint64_t>(vgprIdx,lane);
- *(uint64_t *)iter = regVal; iter += sizeof(regVal);
- }
- }
-
- for (int i = 0; i < condRegState->numRegs(); i++) {
- for (int lane = 0; lane < wf_size; lane++) {
- uint64_t regVal = condRegState->read<uint64_t>(i, lane);
- *(uint64_t *)iter = regVal; iter += sizeof(regVal);
- }
- }
-
- /* saving LDS content */
- if (ldsChunk)
- for (int i = 0; i < ldsChunk->size(); i++) {
- char val = ldsChunk->read<char>(i);
- *(char *) iter = val; iter += sizeof(val);
- }
+bool
+Wavefront::execMask(int lane) const
+{
+ return _execMask[lane];
}
void
-Wavefront::setContext(const void *in)
-{
- uint8_t *iter = (uint8_t *)in;
- for (int i = 0; i < barCnt.size(); i++) {
- barCnt[i] = *(int *)iter; iter += sizeof(barCnt[i]);
- }
- wfId = *(int *)iter; iter += sizeof(wfId);
- maxBarCnt = *(int *)iter; iter += sizeof(maxBarCnt);
- oldBarrierCnt = *(int *)iter; iter += sizeof(oldBarrierCnt);
- barrierCnt = *(int *)iter; iter += sizeof(barrierCnt);
- computeUnit->cu_id = *(int *)iter; iter += sizeof(computeUnit->cu_id);
- wgId = *(uint32_t *)iter; iter += sizeof(wgId);
- barrierId = *(uint32_t *)iter; iter += sizeof(barrierId);
- initMask = VectorMask(*(uint64_t *)iter); iter += sizeof(initMask);
- privBase = *(Addr *)iter; iter += sizeof(privBase);
- spillBase = *(Addr *)iter; iter += sizeof(spillBase);
-
- for (int i = 0; i < workItemId[0].size(); i++) {
- ReconvergenceStackEntry newEntry = *(ReconvergenceStackEntry *)iter;
- iter += sizeof(ReconvergenceStackEntry);
- if (newEntry.pc != std::numeric_limits<uint32_t>::max()) {
- pushToReconvergenceStack(newEntry.pc, newEntry.rpc,
- newEntry.execMask);
- }
- }
- int wf_size = computeUnit->wfSize();
-
- for (int i = 0; i < maxSpVgprs; i++) {
- uint32_t vgprIdx = remap(i, sizeof(uint32_t), 1);
- for (int lane = 0; lane < wf_size; lane++) {
- uint32_t regVal = *(uint32_t *)iter; iter += sizeof(regVal);
- computeUnit->vrf[simdId]->write<uint32_t>(vgprIdx, regVal, lane);
- }
- }
-
- for (int i = 0; i < maxDpVgprs; i++) {
- uint32_t vgprIdx = remap(i, sizeof(uint64_t), 1);
- for (int lane = 0; lane < wf_size; lane++) {
- uint64_t regVal = *(uint64_t *)iter; iter += sizeof(regVal);
- computeUnit->vrf[simdId]->write<uint64_t>(vgprIdx, regVal, lane);
- }
+Wavefront::freeRegisterFile()
+{
+ /* clear busy registers */
+ for (int i = 0; i < maxVgprs; i++) {
+ int vgprIdx = computeUnit->registerManager->mapVgpr(this, i);
+ computeUnit->vrf[simdId]->markReg(vgprIdx, false);
}
- for (int i = 0; i < condRegState->numRegs(); i++) {
- for (int lane = 0; lane < wf_size; lane++) {
- uint64_t regVal = *(uint64_t *)iter; iter += sizeof(regVal);
- condRegState->write<uint64_t>(i, lane, regVal);
- }
- }
- /** Restoring LDS contents */
- if (ldsChunk)
- for (int i = 0; i < ldsChunk->size(); i++) {
- char val = *(char *) iter; iter += sizeof(val);
- ldsChunk->write<char>(i, val);
- }
+ /* Free registers used by this wavefront */
+ uint32_t endIndex = (startVgprIndex + reservedVectorRegs - 1) %
+ computeUnit->vrf[simdId]->numRegs();
+ computeUnit->registerManager->vrfPoolMgrs[simdId]->
+ freeRegion(startVgprIndex, endIndex);
}
void
-Wavefront::computeActualWgSz(NDRange *ndr)
+Wavefront::computeActualWgSz(HSAQueueEntry *task)
{
actualWgSzTotal = 1;
- for (int d = 0; d < 3; ++d) {
- actualWgSz[d] = std::min(workGroupSz[d],
- gridSz[d] - ndr->wgId[d] * workGroupSz[d]);
+ for (int d = 0; d < HSAQueueEntry::MAX_DIM; ++d) {
+ actualWgSz[d] = std::min(workGroupSz[d], gridSz[d]
+ - task->wgId(d) * workGroupSz[d]);
actualWgSzTotal *= actualWgSz[d];
}
}
* POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __WAVEFRONT_HH__
-#define __WAVEFRONT_HH__
+#ifndef __GPU_COMPUTE_WAVEFRONT_HH__
+#define __GPU_COMPUTE_WAVEFRONT_HH__
#include <cassert>
#include <deque>
+#include <list>
#include <memory>
-#include <stack>
+#include <unordered_map>
#include <vector>
#include "arch/gpu_isa.hh"
#include "base/logging.hh"
#include "base/types.hh"
#include "config/the_gpu_isa.hh"
-#include "gpu-compute/condition_register_state.hh"
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/dispatcher.hh"
+#include "gpu-compute/gpu_dyn_inst.hh"
+#include "gpu-compute/hsa_queue_entry.hh"
#include "gpu-compute/lds_state.hh"
#include "gpu-compute/misc.hh"
-#include "gpu-compute/ndrange.hh"
#include "params/Wavefront.hh"
#include "sim/sim_object.hh"
-static const int MAX_NUM_INSTS_PER_WF = 12;
-
-/**
- * A reconvergence stack entry conveys the necessary state to implement
- * control flow divergence.
- */
-struct ReconvergenceStackEntry {
- /**
- * PC of current instruction.
- */
- uint32_t pc;
- /**
- * PC of the immediate post-dominator instruction, i.e., the value of
- * @a pc for the first instruction that will be executed by the wavefront
- * when a reconvergence point is reached.
- */
- uint32_t rpc;
- /**
- * Execution mask.
- */
- VectorMask execMask;
-};
-
-/*
- * Arguments for the hsail opcode call, are user defined and variable length.
- * The hardware/finalizer can support arguments in hardware or use memory to
- * pass arguments. For now, let's assume that an unlimited number of arguments
- * are supported in hardware (the compiler inlines functions whenver it can
- * anyways, so unless someone is interested in the implications of linking/
- * library functions, I think this is a reasonable assumption given the typical
- * size of an OpenCL kernel).
- *
- * Note that call args are different than kernel arguments:
- * * All work-items in a kernel refer the same set of kernel arguments
- * * Each work-item has it's on set of call args. So a call argument at
- * address 0x4 is different for work-item 0 and work-item 1.
- *
- * Ok, the table below shows an example of how we organize the call arguments in
- * the CallArgMem class.
- *
- * int foo(int arg1, double arg2)
- * ___________________________________________________
- * | 0: return.0 | 4: return.1 | ... | 252: return.63 |
- * |---------------------------------------------------|
- * | 256: arg1.0 | 260: arg1.1 | ... | 508: arg1.63 |
- * |---------------------------------------------------|
- * | 512: arg2.0 | 520: arg2.1 | ... | 1016: arg2.63 |
- * ___________________________________________________
- */
-class CallArgMem
-{
- public:
- // pointer to buffer for storing function arguments
- uint8_t *mem;
- int wfSize;
- // size of function args
- int funcArgsSizePerItem;
-
- template<typename CType>
- int
- getLaneOffset(int lane, int addr)
- {
- return addr * wfSize + sizeof(CType) * lane;
- }
-
- CallArgMem(int func_args_size_per_item, int wf_size)
- : wfSize(wf_size), funcArgsSizePerItem(func_args_size_per_item)
- {
- mem = (uint8_t*)malloc(funcArgsSizePerItem * wfSize);
- }
-
- ~CallArgMem()
- {
- free(mem);
- }
-
- template<typename CType>
- uint8_t*
- getLaneAddr(int lane, int addr)
- {
- return mem + getLaneOffset<CType>(lane, addr);
- }
-
- template<typename CType>
- void
- setLaneAddr(int lane, int addr, CType val)
- {
- *((CType*)(mem + getLaneOffset<CType>(lane, addr))) = val;
- }
-};
-
class Wavefront : public SimObject
{
public:
- enum itype_e {I_ALU,I_GLOBAL,I_SHARED,I_FLAT,I_PRIVATE};
- enum status_e {S_STOPPED,S_RETURNING,S_RUNNING};
-
- // Base pointer for array of instruction pointers
- uint64_t basePtr;
+ enum status_e {
+ // wavefront is stalled
+ S_STOPPED,
+ // wavefront is returning from a kernel
+ S_RETURNING,
+ // wavefront is running normally
+ S_RUNNING,
+ // wavefront is stalled
+ S_STALLED,
+ /**
+ * wavefront has unsatisfied wait counts
+ *
+ * While in this state the WF will only execute if the
+ * oldest instruction is the waitcnt itself. While in
+ * S_WAITCNT, the wavefront will not be ready until
+ * all of its waitcnts have been satisfied. The
+ * scoreboard ready() function checks the status
+ * of the waitcnts whenever the WF is in S_WAITCNT,
+ * and once they are satisfied, the WF resumes normal
+ * operation.
+ */
+ S_WAITCNT
+ };
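A minimal sketch of the readiness gating described for S_WAITCNT above (simplified; the real check lives in the scoreboard stage and handles the other states as well):

    // hypothetical helper: a wave in S_WAITCNT only becomes ready once its
    // wait counts are satisfied; otherwise it must simply be running
    bool waveIsReady(Wavefront &wf)
    {
        if (wf.getStatus() == Wavefront::S_WAITCNT) {
            return wf.waitCntsSatisfied();
        }
        return wf.getStatus() == Wavefront::S_RUNNING;
    }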
uint32_t oldBarrierCnt;
uint32_t barrierCnt;
uint32_t barrierId;
uint32_t barrierSlots;
- status_e status;
// HW slot id where the WF is mapped to inside a SIMD unit
- int wfSlotId;
+ const int wfSlotId;
int kernId;
// SIMD unit where the WV has been scheduled
- int simdId;
+ const int simdId;
+ // id of the execution unit (or pipeline) where the oldest instruction
+ // of the WF is scheduled
+ int execUnitId;
+ int flatLmUnitId;
+ int flatGmUnitId;
// pointer to parent CU
ComputeUnit *computeUnit;
+ int maxIbSize;
std::deque<GPUDynInstPtr> instructionBuffer;
bool pendingFetch;
bool dropFetch;
-
- // Condition Register State (for HSAIL simulations only)
- class ConditionRegisterState *condRegState;
- // number of single precision VGPRs required by WF
- uint32_t maxSpVgprs;
- // number of double precision VGPRs required by WF
- uint32_t maxDpVgprs;
- // map virtual to physical vector register
- uint32_t remap(uint32_t vgprIndex, uint32_t size, uint8_t mode=0);
- void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs);
+ // last tick during which all WFs in the CU are not idle
+ Tick lastNonIdleTick;
+
+ // Execution unit resource ID's associated with this WF
+ // These are static mappings set at WF slot construction and
+ // based off of the simdId and wfSlotId.
+
+ // Index to scalarALUs resource vector in CU
+ int scalarAlu;
+
+ // Indices into readyList/dispatchList of resources used by this
+ // wavefront
+ int scalarAluGlobalIdx;
+ int globalMem;
+ int localMem;
+ int scalarMem;
+
+ // number of VGPRs required by WF
+ uint32_t maxVgprs;
+ // number of SGPRs required by WF
+ uint32_t maxSgprs;
+ void freeResources();
+ GPUDynInstPtr nextInstr();
+ void setStatus(status_e newStatus);
+ status_e getStatus() { return status; }
+ void resizeRegFiles(int num_vregs, int num_sregs);
bool isGmInstruction(GPUDynInstPtr ii);
bool isLmInstruction(GPUDynInstPtr ii);
+ bool isOldestInstWaitcnt();
bool isOldestInstGMem();
bool isOldestInstLMem();
bool isOldestInstPrivMem();
bool isOldestInstFlatMem();
- bool isOldestInstALU();
+ bool isOldestInstVectorALU();
+ bool isOldestInstScalarALU();
+ bool isOldestInstScalarMem();
bool isOldestInstBarrier();
+
// used for passing spill address to DDInstGPU
std::vector<Addr> lastAddr;
std::vector<uint32_t> workItemId[3];
/* the actual WG size can differ than the maximum size */
uint32_t actualWgSz[3];
uint32_t actualWgSzTotal;
- void computeActualWgSz(NDRange *ndr);
+ void computeActualWgSz(HSAQueueEntry *task);
// wavefront id within a workgroup
uint32_t wfId;
uint32_t maxDynWaveId;
uint32_t dispatchId;
- // outstanding global+local memory requests
- uint32_t outstandingReqs;
- // memory requests between scoreboard
- // and execute stage not yet executed
- uint32_t memReqsInPipe;
+ // vector and scalar memory requests pending in memory system
+ int outstandingReqs;
// outstanding global memory write requests
- uint32_t outstandingReqsWrGm;
+ int outstandingReqsWrGm;
// outstanding local memory write requests
- uint32_t outstandingReqsWrLm;
+ int outstandingReqsWrLm;
// outstanding global memory read requests
- uint32_t outstandingReqsRdGm;
+ int outstandingReqsRdGm;
// outstanding local memory read requests
- uint32_t outstandingReqsRdLm;
- uint32_t rdLmReqsInPipe;
- uint32_t rdGmReqsInPipe;
- uint32_t wrLmReqsInPipe;
- uint32_t wrGmReqsInPipe;
+ int outstandingReqsRdLm;
+ // outstanding scalar memory read requests
+ int scalarOutstandingReqsRdGm;
+ // outstanding scalar memory write requests
+ int scalarOutstandingReqsWrGm;
+ int rdLmReqsInPipe;
+ int rdGmReqsInPipe;
+ int wrLmReqsInPipe;
+ int wrGmReqsInPipe;
+ int scalarRdGmReqsInPipe;
+ int scalarWrGmReqsInPipe;
int memTraceBusy;
uint64_t lastTrace;
- // number of vector registers reserved by WF
+ // number of virtual vector registers reserved by WF
int reservedVectorRegs;
+ // number of virtual scalar registers reserved by WF
+ int reservedScalarRegs;
// Index into the Vector Register File's namespace where the WF's registers
// will live while the WF is executed
uint32_t startVgprIndex;
+ // Index into the Scalar Register File's namespace where the WF's registers
+ // will live while the WF is executed
+ uint32_t startSgprIndex;
// Old value of destination gpr (for trace)
std::vector<uint32_t> oldVgpr;
// to this workgroup (thus this wavefront)
LdsChunk *ldsChunk;
- // A pointer to the spill area
- Addr spillBase;
- // The size of the spill area
- uint32_t spillSizePerItem;
- // The vector width of the spill area
- uint32_t spillWidth;
-
- // A pointer to the private memory area
- Addr privBase;
- // The size of the private memory area
- uint32_t privSizePerItem;
-
- // A pointer ot the read-only memory area
- Addr roBase;
- // size of the read-only memory area
- uint32_t roSize;
-
- // pointer to buffer for storing kernel arguments
- uint8_t *kernelArgs;
// unique WF id over all WFs executed across all CUs
uint64_t wfDynId;
- // number of times instruction issue for this wavefront is blocked
- // due to VRF port availability
- Stats::Scalar numTimesBlockedDueVrfPortAvail;
+ // Wavefront slot stats
+
+ // Number of instructions executed by this wavefront slot across all
+ // dynamic wavefronts
+ Stats::Scalar numInstrExecuted;
+
+ // Number of cycles this WF spends in SCH stage
+ Stats::Scalar schCycles;
+
+ // Number of stall cycles encountered by this WF in SCH stage
+ Stats::Scalar schStalls;
+
+ // The following stats sum to the value of schStalls, and record, per
+ // WF slot, what the cause of each stall was at a coarse granularity.
+
+ // Cycles WF is selected by scheduler, but RFs cannot support instruction
+ Stats::Scalar schRfAccessStalls;
+ // Cycles spent waiting for execution resources
+ Stats::Scalar schResourceStalls;
+ // cycles spent waiting for RF reads to complete in SCH stage
+ Stats::Scalar schOpdNrdyStalls;
+ // LDS arbitration stall cycles. WF attempts to execute LM instruction,
+ // but another wave is executing FLAT, which requires LM and GM and forces
+ // this WF to stall.
+ Stats::Scalar schLdsArbStalls;
+
// number of times an instruction of a WF is blocked from being issued
// due to WAR and WAW dependencies
Stats::Scalar numTimesBlockedDueWAXDependencies;
// number of times an instruction of a WF is blocked from being issued
// due to WAR and WAW dependencies
Stats::Scalar numTimesBlockedDueRAWDependencies;
- // distribution of executed instructions based on their register
- // operands; this is used to highlight the load on the VRF
- Stats::Distribution srcRegOpDist;
- Stats::Distribution dstRegOpDist;
-
- // Functions to operate on call argument memory
- // argument memory for hsail call instruction
- CallArgMem *callArgMem;
- void
- initCallArgMem(int func_args_size_per_item, int wf_size)
- {
- callArgMem = new CallArgMem(func_args_size_per_item, wf_size);
- }
- template<typename CType>
- CType
- readCallArgMem(int lane, int addr)
- {
- return *((CType*)(callArgMem->getLaneAddr<CType>(lane, addr)));
- }
+ // dyn inst id (per SIMD) of last instruction exec from this wave
+ uint64_t lastInstExec;
- template<typename CType>
- void
- writeCallArgMem(int lane, int addr, CType val)
- {
- callArgMem->setLaneAddr<CType>(lane, addr, val);
- }
+ // Distribution to track the distance between producer and consumer
+ // for vector register values
+ Stats::Distribution vecRawDistance;
+ // Map to track the dyn instruction id of each vector register value
+ // produced, indexed by physical vector register ID
+ std::unordered_map<int,uint64_t> rawDist;
+
+ // Distribution to track the number of times every vector register
+ // value produced is consumed.
+ Stats::Distribution readsPerWrite;
+ // Counts the number of reads performed to each physical register
+ // - counts are reset to 0 for each dynamic wavefront launched
+ std::vector<int> vecReads;
+
+ void initRegState(HSAQueueEntry *task, int wgSizeInWorkItems);
+
+ // context for save/restore
+ uint8_t *context;
typedef WavefrontParams Params;
Wavefront(const Params *p);
computeUnit = cu;
}
+ void validateRequestCounters();
void start(uint64_t _wfDynId, uint64_t _base_ptr);
void exec();
- void updateResources();
- int ready(itype_e type);
- bool instructionBufferHasBranch();
+ // called by SCH stage to reserve execution resources for the next inst
+ std::vector<int> reserveResources();
+ bool stopFetch();
void regStats();
- VectorMask getPred() { return execMask() & initMask; }
bool waitingAtBarrier(int lane);
- void pushToReconvergenceStack(uint32_t pc, uint32_t rpc,
- const VectorMask& exec_mask);
-
- void popFromReconvergenceStack();
-
- uint32_t pc() const;
-
- uint32_t rpc() const;
-
- VectorMask execMask() const;
+ Addr pc() const;
+ void pc(Addr new_pc);
+ VectorMask& execMask();
bool execMask(int lane) const;
- void pc(uint32_t new_pc);
void discardFetch();
- /**
- * Returns the size of the static hardware context of a particular wavefront
- * This should be updated everytime the context is changed
- */
- uint32_t getStaticContextSize() const;
+ bool waitCntsSatisfied();
+ void setWaitCnts(int vm_wait_cnt, int exp_wait_cnt, int lgkm_wait_cnt);
+ void clearWaitCnts();
- /**
- * Returns the hardware context as a stream of bytes
- * This method is designed for HSAIL execution
- */
- void getContext(const void *out);
-
- /**
- * Sets the hardware context fromt a stream of bytes
- * This method is designed for HSAIL execution
- */
- void setContext(const void *in);
+ /** Freeing VRF space */
+ void freeRegisterFile();
TheGpuISA::GPUISA&
gpuISA()
private:
TheGpuISA::GPUISA _gpuISA;
+
+ void reserveGmResource(GPUDynInstPtr ii);
+ void reserveLmResource(GPUDynInstPtr ii);
+
/**
- * Stack containing Control Flow Graph nodes (i.e., kernel instructions)
- * to be visited by the wavefront, and the associated execution masks. The
- * reconvergence stack grows every time the wavefront reaches a divergence
- * point (branch instruction), and shrinks every time the wavefront
- * reaches a reconvergence point (immediate post-dominator instruction).
+ * the following are used for waitcnt instructions
+ * vmWaitCnt: once set, we wait for the outstanding
+ * number of vector mem instructions to be
+ * at, or below, vmWaitCnt.
+ *
+ * expWaitCnt: once set, we wait for the outstanding
+ * number of VM writes or EXP
+ * insts to be at, or below, expWaitCnt.
+ *
+ * lgkmWaitCnt: once set, we wait for the outstanding
+ * number of LDS, GDS, scalar memory,
+ * and message instructions to be at, or
+ * below, lgkmWaitCnt. we currently do not
+ * support GDS/message ops.
*/
- std::deque<std::unique_ptr<ReconvergenceStackEntry>> reconvergenceStack;
+ int vmWaitCnt;
+ int expWaitCnt;
+ int lgkmWaitCnt;
+ status_e status;
+ Addr _pc;
+ VectorMask _execMask;
};
-#endif // __WAVEFRONT_HH__
+#endif // __GPU_COMPUTE_WAVEFRONT_HH__
WriteResp, "WriteReq" },
/* WriteResp */
{ SET2(IsWrite, IsResponse), InvalidCmd, "WriteResp" },
+ /* WriteCompleteResp - The WriteCompleteResp command is needed
+ * because in the GPU memory model we use a WriteResp to indicate
+ * that a write has reached the cache controller so we can free
+ * resources at the coalescer. Later, when the write successfully
+ * completes, we send a WriteCompleteResp to the CU so its wait
+ * counters can be updated. Wait counters in the CU are how memory
+ * dependences are handled in the GPU ISA. */
+ { SET2(IsWrite, IsResponse), InvalidCmd, "WriteCompleteResp" },
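A hedged sketch of how a receiver can tell the two write responses apart (hypothetical handler; the actual plumbing is in the coalescer and compute-unit code elsewhere in this change): WriteResp only frees coalescer resources, while WriteCompleteResp is what allows the wave's outstanding-write counters, and hence its waitcnts, to be decremented.

    // hypothetical response dispatch (sketch only)
    void handleWriteResponse(PacketPtr pkt)
    {
        if (pkt->cmd == MemCmd::WriteCompleteResp) {
            // write is globally complete: safe to update the CU's
            // outstanding-write counters used by s_waitcnt
        } else if (pkt->cmd == MemCmd::WriteResp) {
            // write reached the cache controller: free coalescer resources
        }
    }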
/* WritebackDirty */
{ SET5(IsWrite, IsRequest, IsEviction, HasData, FromCache),
InvalidCmd, "WritebackDirty" },
ReadRespWithInvalidate,
WriteReq,
WriteResp,
+ WriteCompleteResp,
WritebackDirty,
WritebackClean,
WriteClean, // writes dirty data below without evicting
trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe);
} else {
if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.LineAddress)) {
- if (in_msg.segment == HSASegment:SPILL) {
- trigger(Event:StoreLocal, in_msg.LineAddress, cache_entry, tbe);
- } else if (WB) {
+ if (WB) {
trigger(Event:Store, in_msg.LineAddress, cache_entry, tbe);
} else {
trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe);
--- /dev/null
+/*
+ * Copyright (c) 2020 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+structure (GPUCoalescer, external = "yes") {
+ void readCallback(Addr, DataBlock);
+ void readCallback(Addr, MachineType, DataBlock);
+ void readCallback(Addr, MachineType, DataBlock,
+ Cycles, Cycles, Cycles);
+ void readCallback(Addr, MachineType, DataBlock,
+ Cycles, Cycles, Cycles, bool);
+ void writeCallback(Addr, DataBlock);
+ void writeCallback(Addr, MachineType, DataBlock);
+ void writeCallback(Addr, MachineType, DataBlock,
+ Cycles, Cycles, Cycles);
+ void writeCallback(Addr, MachineType, DataBlock,
+ Cycles, Cycles, Cycles, bool);
+ void evictionCallback(Addr);
+ void recordCPReadCallBack(MachineID, MachineID);
+ void recordCPWriteCallBack(MachineID, MachineID);
+}
+
+structure (VIPERCoalescer, external = "yes") {
+ void readCallback(Addr, DataBlock);
+ void readCallback(Addr, MachineType, DataBlock);
+ void readCallback(Addr, MachineType, DataBlock,
+ Cycles, Cycles, Cycles);
+ void readCallback(Addr, MachineType, DataBlock,
+ Cycles, Cycles, Cycles, bool);
+ void writeCallback(Addr, DataBlock);
+ void writeCallback(Addr, MachineType, DataBlock);
+ void writeCallback(Addr, MachineType, DataBlock,
+ Cycles, Cycles, Cycles);
+ void writeCallback(Addr, MachineType, DataBlock,
+ Cycles, Cycles, Cycles, bool);
+ void invCallback(Addr);
+ void wbCallback(Addr);
+ void evictionCallback(Addr);
+}
include "MOESI_AMD_Base-msg.sm";
include "MOESI_AMD_Base-dir.sm";
include "MOESI_AMD_Base-CorePair.sm";
+include "GPU_VIPER-msg.sm";
include "GPU_VIPER-TCP.sm";
include "GPU_VIPER-SQC.sm";
include "GPU_VIPER-TCC.sm";
CoherenceRequestType OriginalType, default="CoherenceRequestType_NA", desc="Type of request from core fwded through region buffer";
WriteMask writeMask, desc="Write Through Data";
MachineID WTRequestor, desc="Node who initiated the write through";
- HSAScope scope, default="HSAScope_SYSTEM", desc="Request Scope";
int wfid, default="0", desc="wavefront id";
bool NoWriteConflict, default="true", desc="write collided with CAB entry";
int ProgramCounter, desc="PC that accesses to this block";
NotPresent, desc="block is NotPresent";
Busy, desc="block is in a transient state, currently invalid";
}
-//HSA scopes
-enumeration(HSAScope, desc="...", default="HSAScope_UNSPECIFIED") {
- UNSPECIFIED, desc="Unspecified scope";
- NOSCOPE, desc="Explictly unscoped";
- WAVEFRONT, desc="Wavefront scope";
- WORKGROUP, desc="Workgroup scope";
- DEVICE, desc="Device scope";
- SYSTEM, desc="System scope";
-}
-
-// HSA segment types
-enumeration(HSASegment, desc="...", default="HSASegment_GLOBAL") {
- GLOBAL, desc="Global segment";
- GROUP, desc="Group segment";
- PRIVATE, desc="Private segment";
- KERNARG, desc="Kernarg segment";
- READONLY, desc="Readonly segment";
- SPILL, desc="Spill segment";
- ARG, desc="Arg segment";
-}
// TesterStatus
enumeration(TesterStatus, desc="...") {
bool checkResourceAvailable(CacheResourceType, Addr);
}
-structure (GPUCoalescer, external = "yes") {
- void readCallback(Addr, DataBlock);
- void readCallback(Addr, MachineType, DataBlock);
- void readCallback(Addr, MachineType, DataBlock,
- Cycles, Cycles, Cycles);
- void readCallback(Addr, MachineType, DataBlock,
- Cycles, Cycles, Cycles, bool);
- void writeCallback(Addr, DataBlock);
- void writeCallback(Addr, MachineType, DataBlock);
- void writeCallback(Addr, MachineType, DataBlock,
- Cycles, Cycles, Cycles);
- void writeCallback(Addr, MachineType, DataBlock,
- Cycles, Cycles, Cycles, bool);
- void evictionCallback(Addr);
- void recordCPReadCallBack(MachineID, MachineID);
- void recordCPWriteCallBack(MachineID, MachineID);
-}
-
-structure (VIPERCoalescer, external = "yes") {
- void readCallback(Addr, DataBlock);
- void readCallback(Addr, MachineType, DataBlock);
- void readCallback(Addr, MachineType, DataBlock,
- Cycles, Cycles, Cycles);
- void readCallback(Addr, MachineType, DataBlock,
- Cycles, Cycles, Cycles, bool);
- void writeCallback(Addr, DataBlock);
- void writeCallback(Addr, MachineType, DataBlock);
- void writeCallback(Addr, MachineType, DataBlock,
- Cycles, Cycles, Cycles);
- void writeCallback(Addr, MachineType, DataBlock,
- Cycles, Cycles, Cycles, bool);
- void invCallback(Addr);
- void wbCallback(Addr);
- void evictionCallback(Addr);
-}
-
structure(RubyRequest, desc="...", interface="Message", external="yes") {
Addr LineAddress, desc="Line address for this request";
Addr PhysicalAddress, desc="Physical address for this request";
WriteMask writeMask, desc="Writethrough mask";
DataBlock WTData, desc="Writethrough data block";
int wfid, desc="Writethrough wavefront";
- HSAScope scope, desc="HSA scope";
- HSASegment segment, desc="HSA segment";
PacketPtr pkt, desc="Packet associated with this request";
}
#include "debug/RubyQueue.hh"
#include "mem/ruby/network/Network.hh"
#include "mem/ruby/protocol/MemoryMsg.hh"
-#include "mem/ruby/system/GPUCoalescer.hh"
#include "mem/ruby/system/RubySystem.hh"
#include "mem/ruby/system/Sequencer.hh"
#include "sim/system.hh"
#include "mem/ruby/common/Address.hh"
#include "mem/ruby/common/DataBlock.hh"
#include "mem/ruby/common/WriteMask.hh"
-#include "mem/ruby/protocol/HSAScope.hh"
-#include "mem/ruby/protocol/HSASegment.hh"
#include "mem/ruby/protocol/Message.hh"
#include "mem/ruby/protocol/PrefetchBit.hh"
#include "mem/ruby/protocol/RubyAccessMode.hh"
using namespace std;
-GPUCoalescer *
-RubyGPUCoalescerParams::create()
-{
- return new GPUCoalescer(this);
-}
-
-HSAScope
-reqScopeToHSAScope(const RequestPtr &req)
-{
- HSAScope accessScope = HSAScope_UNSPECIFIED;
- if (req->isScoped()) {
- if (req->isWavefrontScope()) {
- accessScope = HSAScope_WAVEFRONT;
- } else if (req->isWorkgroupScope()) {
- accessScope = HSAScope_WORKGROUP;
- } else if (req->isDeviceScope()) {
- accessScope = HSAScope_DEVICE;
- } else if (req->isSystemScope()) {
- accessScope = HSAScope_SYSTEM;
- } else {
- fatal("Bad scope type");
- }
- }
- return accessScope;
-}
-
-HSASegment
-reqSegmentToHSASegment(const RequestPtr &req)
-{
- HSASegment accessSegment = HSASegment_GLOBAL;
-
- if (req->isGlobalSegment()) {
- accessSegment = HSASegment_GLOBAL;
- } else if (req->isGroupSegment()) {
- accessSegment = HSASegment_GROUP;
- } else if (req->isPrivateSegment()) {
- accessSegment = HSASegment_PRIVATE;
- } else if (req->isKernargSegment()) {
- accessSegment = HSASegment_KERNARG;
- } else if (req->isReadonlySegment()) {
- accessSegment = HSASegment_READONLY;
- } else if (req->isSpillSegment()) {
- accessSegment = HSASegment_SPILL;
- } else if (req->isArgSegment()) {
- accessSegment = HSASegment_ARG;
- } else {
- fatal("Bad segment type");
- }
-
- return accessSegment;
-}
-
UncoalescedTable::UncoalescedTable(GPUCoalescer *gc)
: coalescer(gc)
{
{
for (auto iter = instMap.begin(); iter != instMap.end(); ) {
if (iter->second.empty()) {
+ DPRINTF(GPUCoalescer, "Returning token seqNum %d\n", iter->first);
instMap.erase(iter++);
coalescer->getGMTokenPort().sendTokens(1);
} else {
}
}
+bool
+UncoalescedTable::areRequestsDone(const uint64_t instSeqNum) {
+ // iterate the instructions held in UncoalescedTable to see whether there
+ // are more requests to issue; if yes, not yet done; otherwise, done
+ for (auto& inst : instMap) {
+ DPRINTF(GPUCoalescer, "instSeqNum = %d, pending packets = %d\n",
+ inst.first, inst.second.size());
+ if (inst.first == instSeqNum) { return false; }
+ }
+
+ return true;
+}
+
void
UncoalescedTable::printRequestTable(std::stringstream& ss)
{
- ss << "UncoalescedTable contains " << instMap.size()
- << " address entries." << std::endl;
+ ss << "Listing pending packets from " << instMap.size() << " instructions";
+
for (auto& inst : instMap) {
- ss << "Addr 0x" << std::hex << inst.first << std::dec
- << " with " << inst.second.size() << " packets"
- << std::endl;
+ ss << "\tAddr: " << printAddress(inst.first) << " with "
+ << inst.second.size() << " pending packets" << std::endl;
}
}
assert(m_dataCache_ptr);
m_runningGarnetStandalone = p->garnet_standalone;
- assumingRfOCoherence = p->assume_rfo;
}
GPUCoalescer::~GPUCoalescer()
if (current_time - req->getIssueTime() > m_deadlock_threshold) {
std::stringstream ss;
printRequestTable(ss);
- ss << "Outstanding requests: " << m_outstanding_count
- << std::endl;
-
- panic("Possible Deadlock detected. Aborting!\n"
- "version: %d request.paddr: 0x%x coalescedTable: %d "
- "current time: %u issue_time: %d difference: %d\n"
- "Request Tables:\n %s", m_version,
- req->getFirstPkt()->getAddr(),
- coalescedTable.size(), cyclesToTicks(current_time),
- cyclesToTicks(req->getIssueTime()),
- cyclesToTicks(current_time - req->getIssueTime()),
- ss.str());
+ warn("GPUCoalescer %d Possible deadlock detected!\n%s\n",
+ m_version, ss.str());
+ panic("Aborting due to deadlock!\n");
}
}
}
void
GPUCoalescer::printRequestTable(std::stringstream& ss)
{
- uncoalescedTable.printRequestTable(ss);
+ ss << "Printing out " << coalescedTable.size()
+ << " outstanding requests in the coalesced table\n";
- ss << "CoalescedTable contains " << coalescedTable.size()
- << " address entries." << std::endl;
for (auto& requestList : coalescedTable) {
- ss << "Addr 0x" << std::hex << requestList.first << std::dec
- << ": type-";
for (auto& request : requestList.second) {
- ss << RubyRequestType_to_string(request->getRubyType())
- << " pkts-" << request->getPackets().size()
- << " issued-" << request->getIssueTime() << " seqNum-"
- << request->getSeqNum() << "; ";
+ ss << "\tAddr: " << printAddress(requestList.first) << "\n"
+ << "\tInstruction sequence number: "
+ << request->getSeqNum() << "\n"
+ << "\t\tType: "
+ << RubyRequestType_to_string(request->getRubyType()) << "\n"
+ << "\t\tNumber of associated packets: "
+ << request->getPackets().size() << "\n"
+ << "\t\tIssue time: "
+ << request->getIssueTime() * clockPeriod() << "\n"
+ << "\t\tDifference from current tick: "
+ << (curCycle() - request->getIssueTime()) * clockPeriod();
}
- ss << std::endl;
}
+
+ // print out packets waiting to be issued in uncoalesced table
+ uncoalescedTable.printRequestTable(ss);
}
void
hitCallback(crequest, mach, data, true, crequest->getIssueTime(),
forwardRequestTime, firstResponseTime, isRegion);
+ // remove this crequest in coalescedTable
delete crequest;
coalescedTable.at(address).pop_front();
}
}
+void
+GPUCoalescer::writeCompleteCallback(Addr address,
+ uint64_t instSeqNum,
+ MachineType mach)
+{
+ DPRINTF(GPUCoalescer, "writeCompleteCallback for address 0x%x"
+ " instSeqNum = %d\n", address, instSeqNum);
+
+ assert(pendingWriteInsts.count(instSeqNum) == 1);
+ PendingWriteInst& inst = pendingWriteInsts[instSeqNum];
+
+ // check the uncoalescedTable to see whether all requests for the inst
+ // have been issued or not
+ bool reqsAllIssued = uncoalescedTable.areRequestsDone(instSeqNum);
+ DPRINTF(GPUCoalescer, "instSeqNum = %d, pendingStores = %d, "
+ "reqsAllIssued = %d\n", instSeqNum,
+ inst.getNumPendingStores() - 1, reqsAllIssued);
+
+ if (inst.receiveWriteCompleteAck() && reqsAllIssued) {
+ // if the pending write instruction has received all write completion
+ // callbacks for its issued Ruby requests, we can now respond to
+ // the requesting CU with a single response packet.
+ inst.ackWriteCompletion(m_usingRubyTester);
+
+ DPRINTF(GPUCoalescer, "write inst %d completed at coalescer\n",
+ instSeqNum);
+ pendingWriteInsts.erase(instSeqNum);
+ }
+}
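For reference, a standalone stand-in for the per-instruction bookkeeping the callback above relies on; the shape of the real PendingWriteInst (declared in the coalescer header, not shown here) is an assumption:

    // minimal sketch of pending-write bookkeeping per instruction
    class PendingWriteInstSketch
    {
      public:
        void addPendingStore() { numPendingStores++; }
        // returns true once every issued store has acknowledged completion
        bool receiveWriteCompleteAck() { return --numPendingStores == 0; }
        int getNumPendingStores() const { return numPendingStores; }
      private:
        int numPendingStores = 0;
    };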
+
void
GPUCoalescer::readCallback(Addr address, DataBlock& data)
{
{
PacketPtr pkt = crequest->getFirstPkt();
Addr request_address = pkt->getAddr();
- Addr request_line_address = makeLineAddress(request_address);
+ Addr request_line_address M5_VAR_USED = makeLineAddress(request_address);
RubyRequestType type = crequest->getRubyType();
"%s\n",
RubyRequestType_to_string(type));
}
-
- // If using the RubyTester, update the RubyTester sender state's
- // subBlock with the recieved data. The tester will later access
- // this state.
- // Note: RubyPort will access it's sender state before the
- // RubyTester.
- if (m_usingRubyTester) {
- RubyPort::SenderState *requestSenderState =
- safe_cast<RubyPort::SenderState*>(pkt->senderState);
- RubyTester::SenderState* testerSenderState =
- safe_cast<RubyTester::SenderState*>
- (requestSenderState->predecessor);
- testerSenderState->subBlock.mergeFrom(data);
- }
}
} else if (pkt->isWrite()) {
req_type = RubyRequestType_ST;
} else {
- // Acquire and release packets will have been issued by
- // makeRequest, so we do not need to check for it here.
panic("Unsupported ruby packet type\n");
}
RequestStatus
GPUCoalescer::makeRequest(PacketPtr pkt)
{
- // Check for GPU Barrier Kernel End or Kernel Begin
- // Leave these to be handled by the child class
- // Kernel End/Barrier = isFlush + isRelease
- // Kernel Begin = isFlush + isAcquire
- if (pkt->req->isKernel()) {
- if (pkt->req->isAcquire()){
- // This is a Kernel Begin leave handling to
- // virtual xCoalescer::makeRequest
- return RequestStatus_Issued;
- }else if (pkt->req->isRelease()) {
- // This is a Kernel End leave handling to
- // virtual xCoalescer::makeRequest
- // If we are here then we didn't call
- // a virtual version of this function
- // so we will also schedule the callback
- int wf_id = 0;
- if (pkt->req->hasContextId()) {
- wf_id = pkt->req->contextId();
- }
- insertKernel(wf_id, pkt);
- newKernelEnds.push_back(wf_id);
- if (!issueEvent.scheduled()) {
- schedule(issueEvent, curTick());
- }
- return RequestStatus_Issued;
- }
- }
+ // all packets must have valid instruction sequence numbers
+ assert(pkt->req->hasInstSeqNum());
- if (!pkt->isLLSC() && !pkt->req->isLockedRMW() && !pkt->isAtomicOp() &&
- !pkt->isRead() && !pkt->isWrite() && !pkt->isFlush() &&
- (pkt->req->isRelease() || pkt->req->isAcquire())) {
- if (assumingRfOCoherence) {
- // If we reached here, this request must be a memFence
- // and the protocol implements RfO, the coalescer can
- // assume sequentially consistency and schedule the callback
- // immediately.
- // Currently the code implements fence callbacks
- // by reusing the mechanism for kernel completions.
- // This should be fixed.
- int wf_id = 0;
- if (pkt->req->hasContextId()) {
- wf_id = pkt->req->contextId();
- }
- insertKernel(wf_id, pkt);
- newKernelEnds.push_back(wf_id);
- if (!issueEvent.scheduled()) {
- schedule(issueEvent, curTick());
- }
- return RequestStatus_Issued;
- } else {
- // If not RfO, return issued here and let the child coalescer
- // take care of it.
- return RequestStatus_Issued;
+ if (pkt->cmd == MemCmd::MemSyncReq) {
+        // issue mem_sync requests immediately to the cache system
+        // without going through the uncoalescedTable like normal
+        // LD/ST/Atomic requests
+ issueMemSyncRequest(pkt);
+ } else {
+        // otherwise, this must be either a read or a write command
+ assert(pkt->isRead() || pkt->isWrite());
+
+ // the pkt is temporarily stored in the uncoalesced table until
+        // it is picked up by the coalescing process later in this cycle
+        // or in a future cycle
+ uncoalescedTable.insertPacket(pkt);
+ DPRINTF(GPUCoalescer, "Put pkt with addr 0x%X to uncoalescedTable\n",
+ pkt->getAddr());
+
+ // we schedule an issue event here to process the uncoalesced table
+        // and try to issue Ruby requests to the cache system
+ if (!issueEvent.scheduled()) {
+ schedule(issueEvent, curTick());
}
}
- uncoalescedTable.insertPacket(pkt);
- DPRINTF(GPUCoalescer, "UC insertPacket 0x%X\n", pkt->getAddr());
-
- if (!issueEvent.scheduled())
- schedule(issueEvent, curTick());
- // TODO: issue hardware prefetches here
+ // we always return RequestStatus_Issued in this coalescer
+    // because the coalescer's resources were checked earlier and the
+    // coalescer queues up aliased requests in its coalesced table
return RequestStatus_Issued;
}
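
An editor's standalone sketch (not part of this patch) of the grouping step that makeRequest defers to: packets parked in the uncoalescedTable are later folded together when they touch the same cache line, which mirrors how coalescedTable is keyed by line address. The 64-byte line size, the sample addresses, and all names below are assumptions for illustration only.

// toy_coalesce.cc -- illustrative only, not gem5 code
#include <cstdint>
#include <iostream>
#include <map>
#include <vector>

int main()
{
    const uint64_t lineBytes = 64;   // assumed cache-line size
    // per-lane byte addresses from one (hypothetical) memory instruction
    std::vector<uint64_t> laneAddrs =
        {0x1000, 0x1008, 0x1010, 0x1040, 0x1048, 0x2000};

    // line address -> lane accesses coalesced onto that line
    std::map<uint64_t, std::vector<uint64_t>> coalesced;
    for (uint64_t addr : laneAddrs)
        coalesced[addr & ~(lineBytes - 1)].push_back(addr);

    // each map entry stands in for one line-level Ruby request
    for (const auto &entry : coalesced)
        std::cout << std::hex << "line 0x" << entry.first << ": "
                  << std::dec << entry.second.size() << " accesses\n";
    return 0;
}
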
+/**
+ * TODO: Figure out what to do with this code. This code may go away
+ * and/or be merged into the VIPER coalescer once the VIPER
+ * protocol is re-integrated with the GCN3 code.
+ */
+/*
void
GPUCoalescer::issueRequest(CoalescedRequest* crequest)
{
}
assert(m_mandatory_q_ptr);
- m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency);
-}
+ m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency);
+}*/
template <class KEY, class VALUE>
std::ostream &
}
-void
-GPUCoalescer::recordRequestType(SequencerRequestType requestType) {
- DPRINTF(RubyStats, "Recorded statistic: %s\n",
- SequencerRequestType_to_string(requestType));
-}
-
bool
GPUCoalescer::coalescePacket(PacketPtr pkt)
{
// be counted as outstanding requests.
m_outstanding_count++;
+ // We track all issued or to-be-issued Ruby requests associated with
+ // write instructions. An instruction may have multiple Ruby
+ // requests.
+ if (pkt->cmd == MemCmd::WriteReq) {
+ DPRINTF(GPUCoalescer, "adding write inst %d at line 0x%x to"
+ " the pending write instruction list\n", seqNum,
+ line_addr);
+
+ RubyPort::SenderState* ss =
+ safe_cast<RubyPort::SenderState*>(pkt->senderState);
+
+ // we need to save this port because it will be used to call
+ // back the requesting CU when we receive write
+ // complete callbacks for all issued Ruby requests of this
+ // instruction.
+ RubyPort::MemSlavePort* mem_slave_port = ss->port;
+
+ GPUDynInstPtr gpuDynInst = nullptr;
+
+ if (!m_usingRubyTester) {
+ // If this coalescer is connected to a real CU, we need
+            // to save the corresponding GPU dynamic instruction. The CU
+            // will use that instruction to decrement wait counters in
+            // the issuing wavefront.
+            // For the Ruby tester, gpuDynInst == nullptr
+ ComputeUnit::DataPort::SenderState* cu_state =
+ safe_cast<ComputeUnit::DataPort::SenderState*>
+ (ss->predecessor);
+ gpuDynInst = cu_state->_gpuDynInst;
+ }
+
+ PendingWriteInst& inst = pendingWriteInsts[seqNum];
+ inst.addPendingReq(mem_slave_port, gpuDynInst, m_usingRubyTester);
+ }
+
return true;
}
}
}
-void
-GPUCoalescer::recordCPReadCallBack(MachineID myMachID, MachineID senderMachID)
-{
- if (myMachID == senderMachID) {
- CP_TCPLdHits++;
- } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
- CP_TCPLdTransfers++;
- } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
- CP_TCCLdHits++;
- } else {
- CP_LdMiss++;
- }
-}
-
-void
-GPUCoalescer::recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID)
-{
- if (myMachID == senderMachID) {
- CP_TCPStHits++;
- } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
- CP_TCPStTransfers++;
- } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
- CP_TCCStHits++;
- } else {
- CP_StMiss++;
- }
-}
-
void
GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist)
{
Cycles firstResponseTime,
bool success, bool isRegion)
{
- RubyRequestType type = crequest->getRubyType();
- Cycles issued_time = crequest->getIssueTime();
- Cycles completion_time = curCycle();
- assert(completion_time >= issued_time);
- Cycles total_lat = completion_time - issued_time;
-
- // cache stats (valid for RfO protocol only)
- if (mach == MachineType_TCP) {
- if (type == RubyRequestType_LD) {
- GPU_TCPLdHits++;
- } else {
- GPU_TCPStHits++;
- }
- } else if (mach == MachineType_L1Cache_wCC) {
- if (type == RubyRequestType_LD) {
- GPU_TCPLdTransfers++;
- } else {
- GPU_TCPStTransfers++;
- }
- } else if (mach == MachineType_TCC) {
- if (type == RubyRequestType_LD) {
- GPU_TCCLdHits++;
- } else {
- GPU_TCCStHits++;
- }
- } else {
- if (type == RubyRequestType_LD) {
- GPU_LdMiss++;
- } else {
- GPU_StMiss++;
- }
- }
-
- // Profile all access latency, even zero latency accesses
- m_latencyHist.sample(total_lat);
- m_typeLatencyHist[type]->sample(total_lat);
-
- // Profile the miss latency for all non-zero demand misses
- if (total_lat != Cycles(0)) {
- m_missLatencyHist.sample(total_lat);
- m_missTypeLatencyHist[type]->sample(total_lat);
-
- if (mach != MachineType_NUM) {
- m_missMachLatencyHist[mach]->sample(total_lat);
- m_missTypeMachLatencyHist[type][mach]->sample(total_lat);
-
- if ((issued_time <= initialRequestTime) &&
- (initialRequestTime <= forwardRequestTime) &&
- (forwardRequestTime <= firstResponseTime) &&
- (firstResponseTime <= completion_time)) {
-
- m_IssueToInitialDelayHist[mach]->sample(
- initialRequestTime - issued_time);
- m_InitialToForwardDelayHist[mach]->sample(
- forwardRequestTime - initialRequestTime);
- m_ForwardToFirstResponseDelayHist[mach]->sample(
- firstResponseTime - forwardRequestTime);
- m_FirstResponseToCompletionDelayHist[mach]->sample(
- completion_time - firstResponseTime);
- }
- }
-
- }
-
- DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %d cycles\n",
- curTick(), m_version, "Coal",
- success ? "Done" : "SC_Failed", "", "",
- printAddress(crequest->getFirstPkt()->getAddr()), total_lat);
}
void
m_missTypeMachLatencyHist[i][j]->init(10);
}
}
-
- // GPU cache stats
- GPU_TCPLdHits
- .name(name() + ".gpu_tcp_ld_hits")
- .desc("loads that hit in the TCP")
- ;
- GPU_TCPLdTransfers
- .name(name() + ".gpu_tcp_ld_transfers")
- .desc("TCP to TCP load transfers")
- ;
- GPU_TCCLdHits
- .name(name() + ".gpu_tcc_ld_hits")
- .desc("loads that hit in the TCC")
- ;
- GPU_LdMiss
- .name(name() + ".gpu_ld_misses")
- .desc("loads that miss in the GPU")
- ;
-
- GPU_TCPStHits
- .name(name() + ".gpu_tcp_st_hits")
- .desc("stores that hit in the TCP")
- ;
- GPU_TCPStTransfers
- .name(name() + ".gpu_tcp_st_transfers")
- .desc("TCP to TCP store transfers")
- ;
- GPU_TCCStHits
- .name(name() + ".gpu_tcc_st_hits")
- .desc("stores that hit in the TCC")
- ;
- GPU_StMiss
- .name(name() + ".gpu_st_misses")
- .desc("stores that miss in the GPU")
- ;
-
- // CP cache stats
- CP_TCPLdHits
- .name(name() + ".cp_tcp_ld_hits")
- .desc("loads that hit in the TCP")
- ;
- CP_TCPLdTransfers
- .name(name() + ".cp_tcp_ld_transfers")
- .desc("TCP to TCP load transfers")
- ;
- CP_TCCLdHits
- .name(name() + ".cp_tcc_ld_hits")
- .desc("loads that hit in the TCC")
- ;
- CP_LdMiss
- .name(name() + ".cp_ld_misses")
- .desc("loads that miss in the GPU")
- ;
-
- CP_TCPStHits
- .name(name() + ".cp_tcp_st_hits")
- .desc("stores that hit in the TCP")
- ;
- CP_TCPStTransfers
- .name(name() + ".cp_tcp_st_transfers")
- .desc("TCP to TCP store transfers")
- ;
- CP_TCCStHits
- .name(name() + ".cp_tcc_st_hits")
- .desc("stores that hit in the TCC")
- ;
- CP_StMiss
- .name(name() + ".cp_st_misses")
- .desc("stores that miss in the GPU")
- ;
}
#include <unordered_map>
#include "base/statistics.hh"
+#include "gpu-compute/gpu_dyn_inst.hh"
+#include "gpu-compute/misc.hh"
#include "mem/request.hh"
#include "mem/ruby/common/Address.hh"
#include "mem/ruby/common/Consumer.hh"
-#include "mem/ruby/protocol/HSAScope.hh"
-#include "mem/ruby/protocol/HSASegment.hh"
#include "mem/ruby/protocol/PrefetchBit.hh"
#include "mem/ruby/protocol/RubyAccessMode.hh"
#include "mem/ruby/protocol/RubyRequestType.hh"
class RubyGPUCoalescerParams;
-HSAScope reqScopeToHSAScope(const RequestPtr &req);
-HSASegment reqSegmentToHSASegment(const RequestPtr &req);
-
// List of packets that belongs to a specific instruction.
typedef std::list<PacketPtr> PerInstPackets;
// instructions at the offset.
PerInstPackets* getInstPackets(int offset);
void updateResources();
+ bool areRequestsDone(const uint64_t instSeqNum);
// Check if a packet hasn't been removed from instMap in too long.
// Panics if a deadlock is detected and returns nothing otherwise.
std::vector<PacketPtr> pkts;
};
+// PendingWriteInst tracks the number of outstanding Ruby requests
+// per write instruction. Once all requests associated with one instruction
+// are completely done in Ruby, we call back the requester to mark
+// that this instruction is complete.
+class PendingWriteInst
+{
+ public:
+ PendingWriteInst()
+ : numPendingStores(0),
+ originalPort(nullptr),
+ gpuDynInstPtr(nullptr)
+ {}
+
+ ~PendingWriteInst()
+ {}
+
+ void
+ addPendingReq(RubyPort::MemSlavePort* port, GPUDynInstPtr inst,
+ bool usingRubyTester)
+ {
+ assert(port);
+ originalPort = port;
+
+ if (!usingRubyTester) {
+ gpuDynInstPtr = inst;
+ }
+
+ numPendingStores++;
+ }
+
+    // return true if no more acks are expected
+ bool
+ receiveWriteCompleteAck()
+ {
+ assert(numPendingStores > 0);
+ numPendingStores--;
+        return numPendingStores == 0;
+ }
+
+ // ack the original requester that this write instruction is complete
+ void
+ ackWriteCompletion(bool usingRubyTester)
+ {
+ assert(numPendingStores == 0);
+
+ // make a response packet
+ PacketPtr pkt = new Packet(std::make_shared<Request>(),
+ MemCmd::WriteCompleteResp);
+
+ if (!usingRubyTester) {
+ assert(gpuDynInstPtr);
+ ComputeUnit::DataPort::SenderState* ss =
+ new ComputeUnit::DataPort::SenderState
+ (gpuDynInstPtr, 0, nullptr);
+ pkt->senderState = ss;
+ }
+
+ // send the ack response to the requester
+ originalPort->sendTimingResp(pkt);
+ }
+
+    int
+    getNumPendingStores()
+    {
+        return numPendingStores;
+    }
+
+ private:
+ // the number of stores waiting for writeCompleteCallback
+ int numPendingStores;
+    // The original port that sent one of the packets associated with this
+    // write instruction. We may have more than one packet per instruction,
+    // which implies multiple ports per instruction. However, only one of
+    // the ports is needed to call back the CU, so we keep track of the
+    // port that sent the first packet of this instruction.
+ RubyPort::MemSlavePort* originalPort;
+ // similar to the originalPort, this gpuDynInstPtr is set only for
+ // the first packet of this instruction.
+ GPUDynInstPtr gpuDynInstPtr;
+};
+
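
An editor's standalone sketch (not part of this patch) that mirrors the PendingWriteInst bookkeeping above: one write instruction fans out into several Ruby requests, and the requesting CU is acked only when the last writeCompleteCallback for that instruction arrives. The ToyPendingWrite name and the request count of three are illustrative assumptions.

// toy_pending_write.cc -- illustrative only, not gem5 code
#include <cassert>

struct ToyPendingWrite
{
    int numPendingStores = 0;

    // called once per Ruby request issued for this write instruction
    void addPendingReq() { ++numPendingStores; }

    // called once per writeCompleteCallback; true when no more acks
    // are expected and the CU can be responded to
    bool receiveWriteCompleteAck()
    {
        assert(numPendingStores > 0);
        return --numPendingStores == 0;
    }
};

int main()
{
    ToyPendingWrite inst;
    for (int i = 0; i < 3; ++i)       // assume 3 Ruby requests issued
        inst.addPendingReq();

    assert(!inst.receiveWriteCompleteAck());   // ack 1: still waiting
    assert(!inst.receiveWriteCompleteAck());   // ack 2: still waiting
    assert(inst.receiveWriteCompleteAck());    // ack 3: ack the CU now
    return 0;
}
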
class GPUCoalescer : public RubyPort
{
public:
void collateStats();
void regStats() override;
+ // each store request needs two callbacks:
+ // (1) writeCallback is called when the store is received and processed
+ // by TCP. This writeCallback does not guarantee the store is actually
+ // completed at its destination cache or memory. writeCallback helps
+ // release hardware resources (e.g., its entry in coalescedTable)
+ // allocated for the store so that subsequent requests will not be
+ // blocked unnecessarily due to hardware resource constraints.
+    // (2) writeCompleteCallback is called when the store is fully completed
+    //     at its destination cache or memory, so it does guarantee
+    //     completion. This callback decrements hardware counters in
+    //     the CU.
void writeCallback(Addr address, DataBlock& data);
void writeCallback(Addr address,
Cycles forwardRequestTime,
Cycles firstResponseTime);
+ void writeCompleteCallback(Addr address,
+ uint64_t instSeqNum,
+ MachineType mach);
+
void readCallback(Addr address, DataBlock& data);
void readCallback(Addr address,
Cycles forwardRequestTime,
Cycles firstResponseTime,
bool isRegion);
- /* atomics need their own callback because the data
- might be const coming from SLICC */
+
void atomicCallback(Addr address,
MachineType mach,
const DataBlock& data);
- void recordCPReadCallBack(MachineID myMachID, MachineID senderMachID);
- void recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID);
-
- // Alternate implementations in VIPER Coalescer
- virtual RequestStatus makeRequest(PacketPtr pkt) override;
-
+ RequestStatus makeRequest(PacketPtr pkt) override;
int outstandingCount() const override { return m_outstanding_count; }
bool
GMTokenPort& getGMTokenPort() { return gmTokenPort; }
- void recordRequestType(SequencerRequestType requestType);
Stats::Histogram& getOutstandReqHist() { return m_outstandReqHist; }
Stats::Histogram& getLatencyHist() { return m_latencyHist; }
getFirstResponseToCompletionDelayHist(const MachineType t) const
{ return *m_FirstResponseToCompletionDelayHist[t]; }
- // Changed to protected to enable inheritance by VIPER Coalescer
protected:
bool tryCacheAccess(Addr addr, RubyRequestType type,
Addr pc, RubyAccessMode access_mode,
int size, DataBlock*& data_ptr);
- // Alternate implementations in VIPER Coalescer
- virtual void issueRequest(CoalescedRequest* crequest);
- void kernelCallback(int wavfront_id);
+    // since the following two issue functions are protocol-specific,
+ // they must be implemented in a derived coalescer
+ virtual void issueRequest(CoalescedRequest* crequest) = 0;
+ virtual void issueMemSyncRequest(PacketPtr pkt) = 0;
+
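
An editor's standalone sketch (not part of this patch) of the split this pure-virtual interface enforces: protocol-agnostic coalescing stays in the base class, while a derived, protocol-specific coalescer supplies both issue paths, as VIPERCoalescer does later in this patch. All class and type names below are stand-ins, not gem5 types.

// toy_coalescer_hooks.cc -- illustrative only, not gem5 code
#include <cstdio>

struct PacketLike {};            // stand-in for a memory packet
struct CoalescedRequestLike {};  // stand-in for CoalescedRequest

struct GPUCoalescerLike
{
    virtual ~GPUCoalescerLike() = default;
    // protocol-specific issue paths are left to the derived class
    virtual void issueRequest(CoalescedRequestLike *crequest) = 0;
    virtual void issueMemSyncRequest(PacketLike *pkt) = 0;
};

struct ViperLikeCoalescer : GPUCoalescerLike
{
    void issueRequest(CoalescedRequestLike *) override
    { std::puts("issue coalesced LD/ST/Atomic the VIPER way"); }
    void issueMemSyncRequest(PacketLike *) override
    { std::puts("issue memory fence the VIPER way"); }
};

int main()
{
    ViperLikeCoalescer c;
    c.issueRequest(nullptr);
    c.issueMemSyncRequest(nullptr);
    return 0;
}
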
+ void kernelCallback(int wavefront_id);
void hitCallback(CoalescedRequest* crequest,
MachineType mach,
bool success, bool isRegion);
void completeHitCallback(std::vector<PacketPtr> & mylist);
-
virtual RubyRequestType getRequestType(PacketPtr pkt);
// Attempt to remove a packet from the uncoalescedTable and coalesce
EventFunctionWrapper issueEvent;
-
- // Changed to protected to enable inheritance by VIPER Coalescer
protected:
int m_max_outstanding_requests;
Cycles m_deadlock_threshold;
// an address, the are serviced in age order.
std::map<Addr, std::deque<CoalescedRequest*>> coalescedTable;
+    // a map between an instruction sequence number and PendingWriteInst;
+    // this is used to issue a final callback for each write when it is
+ // completely done in the memory system
+ std::unordered_map<uint64_t, PendingWriteInst> pendingWriteInsts;
+
// Global outstanding request count, across all request tables
int m_outstanding_count;
bool m_deadlock_check_scheduled;
EventFunctionWrapper deadlockCheckEvent;
bool assumingRfOCoherence;
- // m5 style stats for TCP hit/miss counts
- Stats::Scalar GPU_TCPLdHits;
- Stats::Scalar GPU_TCPLdTransfers;
- Stats::Scalar GPU_TCCLdHits;
- Stats::Scalar GPU_LdMiss;
-
- Stats::Scalar GPU_TCPStHits;
- Stats::Scalar GPU_TCPStTransfers;
- Stats::Scalar GPU_TCCStHits;
- Stats::Scalar GPU_StMiss;
-
- Stats::Scalar CP_TCPLdHits;
- Stats::Scalar CP_TCPLdTransfers;
- Stats::Scalar CP_TCCLdHits;
- Stats::Scalar CP_LdMiss;
-
- Stats::Scalar CP_TCPStHits;
- Stats::Scalar CP_TCPStTransfers;
- Stats::Scalar CP_TCCStHits;
- Stats::Scalar CP_StMiss;
+// TODO - Need to update the following stats once the VIPER protocol
+// is re-integrated.
+// // m5 style stats for TCP hit/miss counts
+// Stats::Scalar GPU_TCPLdHits;
+// Stats::Scalar GPU_TCPLdTransfers;
+// Stats::Scalar GPU_TCCLdHits;
+// Stats::Scalar GPU_LdMiss;
+//
+// Stats::Scalar GPU_TCPStHits;
+// Stats::Scalar GPU_TCPStTransfers;
+// Stats::Scalar GPU_TCCStHits;
+// Stats::Scalar GPU_StMiss;
+//
+// Stats::Scalar CP_TCPLdHits;
+// Stats::Scalar CP_TCPLdTransfers;
+// Stats::Scalar CP_TCCLdHits;
+// Stats::Scalar CP_LdMiss;
+//
+// Stats::Scalar CP_TCPStHits;
+// Stats::Scalar CP_TCPStTransfers;
+// Stats::Scalar CP_TCCStHits;
+// Stats::Scalar CP_StMiss;
//! Histogram for number of outstanding requests per cycle.
Stats::Histogram m_outstandReqHist;
std::vector<Stats::Histogram *> m_ForwardToFirstResponseDelayHist;
std::vector<Stats::Histogram *> m_FirstResponseToCompletionDelayHist;
+// TODO - Need to update the following stats once the VIPER protocol
+// is re-integrated.
+// Stats::Distribution numHopDelays;
+// Stats::Distribution tcpToTccDelay;
+// Stats::Distribution tccToSdDelay;
+// Stats::Distribution sdToSdDelay;
+// Stats::Distribution sdToTccDelay;
+// Stats::Distribution tccToTcpDelay;
+//
+// Stats::Average avgTcpToTcc;
+// Stats::Average avgTccToSd;
+// Stats::Average avgSdToSd;
+// Stats::Average avgSdToTcc;
+// Stats::Average avgTccToTcp;
+
private:
// Token port is used to send/receive tokens to/from GPU's global memory
// pipeline across the port boundary. There is one per <wave size> data
class RubyGPUCoalescer(RubyPort):
type = 'RubyGPUCoalescer'
+ abstract = True
cxx_class = 'GPUCoalescer'
cxx_header = "mem/ruby/system/GPUCoalescer.hh"
"max requests (incl. prefetches) outstanding")
max_coalesces_per_cycle = Param.Int(1, "max instructions that can be " \
"coalesced in a single cycle")
- assume_rfo = Param.Bool(True, "assume protocol implementes Read for "
- "Ownership coherence");
icache = Param.RubyCache("")
dcache = Param.RubyCache("")
VIPERCoalescer(const Params *);
~VIPERCoalescer();
- void issueMemSyncRequest(PacketPtr pkt);
+ void issueMemSyncRequest(PacketPtr pkt) override;
void issueRequest(CoalescedRequest* crequest) override;
void wbCallback(Addr address);
void invCallback(Addr address);
cxx_header = "mem/ruby/system/VIPERCoalescer.hh"
max_inv_per_cycle = Param.Int(32, "max invalidations per cycle")
max_wb_per_cycle = Param.Int(32, "max writebacks per cycle")
- assume_rfo = False