From: Tony Gutierrez Date: Tue, 1 May 2018 20:59:35 +0000 (-0400) Subject: gpu-compute, mem-ruby, configs: Add GCN3 ISA support to GPU model X-Git-Tag: v20.1.0.0~589 X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=b8da9abba7b7ec710a749a893ed698fc41f2edcf;p=gem5.git gpu-compute, mem-ruby, configs: Add GCN3 ISA support to GPU model Change-Id: Ibe46970f3ba25d62ca2ade5cbc2054ad746b2254 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29912 Reviewed-by: Anthony Gutierrez Reviewed-by: Jason Lowe-Power Maintainer: Anthony Gutierrez Tested-by: kokoro --- diff --git a/build_opts/GCN3_X86 b/build_opts/GCN3_X86 new file mode 100644 index 000000000..21e3cf0e4 --- /dev/null +++ b/build_opts/GCN3_X86 @@ -0,0 +1,5 @@ +PROTOCOL = 'GPU_VIPER' +TARGET_ISA = 'x86' +TARGET_GPU_ISA = 'gcn3' +BUILD_GPU = True +CPU_MODELS = 'AtomicSimpleCPU,O3CPU,TimingSimpleCPU' diff --git a/configs/common/GPUTLBConfig.py b/configs/common/GPUTLBConfig.py index ced8aa198..8e2b1e46e 100644 --- a/configs/common/GPUTLBConfig.py +++ b/configs/common/GPUTLBConfig.py @@ -48,7 +48,7 @@ def TLB_constructor(level): maxOutstandingReqs = options.L%(level)dMaxOutstandingReqs,\ accessDistance = options.L%(level)dAccessDistanceStat,\ clk_domain = SrcClockDomain(\ - clock = options.GPUClock,\ + clock = options.gpu_clock,\ voltage_domain = VoltageDomain(\ voltage = options.gpu_voltage)))" % locals() return constructor_call @@ -60,23 +60,22 @@ def Coalescer_constructor(level): coalescingWindow = options.L%(level)dCoalescingWindow,\ disableCoalescing = options.L%(level)dDisableCoalescing,\ clk_domain = SrcClockDomain(\ - clock = options.GPUClock,\ + clock = options.gpu_clock,\ voltage_domain = VoltageDomain(\ voltage = options.gpu_voltage)))" % locals() return constructor_call -def create_TLB_Coalescer(options, my_level, my_index, TLB_name, Coalescer_name): - # arguments: options, TLB level, number of private structures for this Level, - # TLB name and Coalescer name +def create_TLB_Coalescer(options, my_level, my_index, tlb_name, + coalescer_name): + # arguments: options, TLB level, number of private structures for this + # Level, TLB name and Coalescer name for i in range(my_index): - TLB_name.append(eval(TLB_constructor(my_level))) - Coalescer_name.append(eval(Coalescer_constructor(my_level))) + tlb_name.append(eval(TLB_constructor(my_level))) + coalescer_name.append(eval(Coalescer_constructor(my_level))) def config_tlb_hierarchy(options, system, shader_idx): - n_cu = options.num_compute_units - # Make this configurable now, instead of the hard coded val. The dispatcher - # is always the last item in the system.cpu list. 
- dispatcher_idx = len(system.cpu) - 1 + n_cu = options.cu_per_sa * options.sa_per_complex * \ + options.num_gpu_complexes if options.TLB_config == "perLane": num_TLBs = 64 * n_cu @@ -90,21 +89,26 @@ def config_tlb_hierarchy(options, system, shader_idx): print("Bad option for TLB Configuration.") sys.exit(1) - #---------------------------------------------------------------------------------------- + #------------------------------------------------------------------------- # A visual representation of the TLB hierarchy # for ease of configuration - # < Modify here the width and the number of levels if you want a different configuration > - # width is the number of TLBs of the given type (i.e., D-TLB, I-TLB etc) for this level - L1 = [{'name': 'sqc', 'width': options.num_sqc, 'TLBarray': [], 'CoalescerArray': []}, - {'name': 'dispatcher', 'width': 1, 'TLBarray': [], 'CoalescerArray': []}, - {'name': 'l1', 'width': num_TLBs, 'TLBarray': [], 'CoalescerArray': []}] + # < Modify here the width and the number of levels if you want a different + # configuration > + # width is the number of TLBs of the given type (i.e., D-TLB, I-TLB etc) + # for this level + L1 = [{'name': 'sqc', 'width': options.num_sqc, 'TLBarray': [], + 'CoalescerArray': []}, + {'name': 'scalar', 'width' : options.num_scalar_cache, + 'TLBarray': [], 'CoalescerArray': []}, + {'name': 'l1', 'width': num_TLBs, 'TLBarray': [], + 'CoalescerArray': []}] L2 = [{'name': 'l2', 'width': 1, 'TLBarray': [], 'CoalescerArray': []}] L3 = [{'name': 'l3', 'width': 1, 'TLBarray': [], 'CoalescerArray': []}] TLB_hierarchy = [L1, L2, L3] - #---------------------------------------------------------------------------------------- + #------------------------------------------------------------------------- # Create the hiearchy # Call the appropriate constructors and add objects to the system @@ -164,17 +168,14 @@ def config_tlb_hierarchy(options, system, shader_idx): for tlb in range(tlb_per_cu): exec('system.cpu[%d].CUs[%d].translation_port[%d] = \ system.l1_coalescer[%d].slave[%d]' % \ - (shader_idx, cu_idx, tlb, cu_idx*tlb_per_cu+tlb, 0)) + (shader_idx, cu_idx, tlb, + cu_idx*tlb_per_cu+tlb, 0)) else: exec('system.cpu[%d].CUs[%d].translation_port[%d] = \ system.l1_coalescer[%d].slave[%d]' % \ - (shader_idx, cu_idx, tlb_per_cu, cu_idx / (n_cu / num_TLBs), cu_idx % (n_cu / num_TLBs))) - - elif name == 'dispatcher': # Dispatcher TLB - for index in range(TLB_type['width']): - exec('system.cpu[%d].translation_port = \ - system.dispatcher_coalescer[%d].slave[0]' % \ - (dispatcher_idx, index)) + (shader_idx, cu_idx, tlb_per_cu, + cu_idx / (n_cu / num_TLBs), + cu_idx % (n_cu / num_TLBs))) elif name == 'sqc': # I-TLB for index in range(n_cu): sqc_tlb_index = index / options.cu_per_sqc @@ -182,7 +183,14 @@ def config_tlb_hierarchy(options, system, shader_idx): exec('system.cpu[%d].CUs[%d].sqc_tlb_port = \ system.sqc_coalescer[%d].slave[%d]' % \ (shader_idx, index, sqc_tlb_index, sqc_tlb_port_id)) - + elif name == 'scalar': # Scalar D-TLB + for index in range(n_cu): + scalar_tlb_index = index / options.cu_per_scalar_cache + scalar_tlb_port_id = index % options.cu_per_scalar_cache + exec('system.cpu[%d].CUs[%d].scalar_tlb_port = \ + system.scalar_coalescer[%d].slave[%d]' % \ + (shader_idx, index, scalar_tlb_index, + scalar_tlb_port_id)) # Connect the memSidePorts (masters) of all the TLBs with the # cpuSidePorts (slaves) of the Coalescers of the next level diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc index 
88095e7ad..7578694b6 100644 --- a/src/arch/gcn3/insts/instructions.cc +++ b/src/arch/gcn3/insts/instructions.cc @@ -3728,7 +3728,7 @@ namespace Gcn3ISA DPRINTF(GPUExec, "CU%d: decrease ref ctr WG[%d] to [%d]\n", wf->computeUnit->cu_id, wf->wgId, refCount); - wf->computeUnit->registerManager.freeRegisters(wf); + wf->computeUnit->registerManager->freeRegisters(wf); wf->computeUnit->completedWfs++; wf->computeUnit->activeWaves--; diff --git a/src/arch/gcn3/insts/op_encodings.hh b/src/arch/gcn3/insts/op_encodings.hh index 8bb49c0b7..3197dc078 100644 --- a/src/arch/gcn3/insts/op_encodings.hh +++ b/src/arch/gcn3/insts/op_encodings.hh @@ -192,7 +192,7 @@ namespace Gcn3ISA */ bool misaligned_acc = split_addr > vaddr; - RequestPtr req = new Request(0, vaddr, req_size, 0, + RequestPtr req = std::make_shared(vaddr, req_size, 0, gpuDynInst->computeUnit()->masterId(), 0, gpuDynInst->wfDynId); @@ -208,7 +208,6 @@ namespace Gcn3ISA pkt2->dataStatic(gpuDynInst->scalar_data + req1->getSize()); gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt1); gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt2); - delete req; } else { gpuDynInst->numScalarReqs = 1; gpuDynInst->setRequestFlags(req); @@ -243,7 +242,7 @@ namespace Gcn3ISA */ bool misaligned_acc = split_addr > vaddr; - RequestPtr req = new Request(0, vaddr, req_size, 0, + RequestPtr req = std::make_shared(vaddr, req_size, 0, gpuDynInst->computeUnit()->masterId(), 0, gpuDynInst->wfDynId); @@ -259,7 +258,6 @@ namespace Gcn3ISA pkt2->dataStatic(gpuDynInst->scalar_data + req1->getSize()); gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt1); gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt2); - delete req; } else { gpuDynInst->numScalarReqs = 1; gpuDynInst->setRequestFlags(req); @@ -574,7 +572,8 @@ namespace Gcn3ISA if (gpuDynInst->exec_mask[lane]) { Addr vaddr = gpuDynInst->addr[lane]; - RequestPtr req = new Request(0, vaddr, sizeof(T), 0, + RequestPtr req = std::make_shared(vaddr, + sizeof(T), 0, gpuDynInst->computeUnit()->masterId(), 0, gpuDynInst->wfDynId); @@ -600,7 +599,8 @@ namespace Gcn3ISA if (gpuDynInst->exec_mask[lane]) { Addr vaddr = gpuDynInst->addr[lane]; - RequestPtr req = new Request(0, vaddr, sizeof(T), 0, + RequestPtr req = std::make_shared(vaddr, + sizeof(T), 0, gpuDynInst->computeUnit()->masterId(), 0, gpuDynInst->wfDynId); @@ -619,7 +619,7 @@ namespace Gcn3ISA { // create request and set flags gpuDynInst->statusBitVector = VectorMask(1); - Request *req = new Request(0, 0, 0, 0, + RequestPtr req = std::make_shared(0, 0, 0, gpuDynInst->computeUnit()-> masterId(), 0, gpuDynInst->wfDynId); @@ -777,7 +777,8 @@ namespace Gcn3ISA if (gpuDynInst->exec_mask[lane]) { Addr vaddr = gpuDynInst->addr[lane]; - RequestPtr req = new Request(0, vaddr, sizeof(T), 0, + RequestPtr req = std::make_shared(vaddr, + sizeof(T), 0, gpuDynInst->computeUnit()->masterId(), 0, gpuDynInst->wfDynId); @@ -802,7 +803,8 @@ namespace Gcn3ISA if (gpuDynInst->exec_mask[lane]) { Addr vaddr = gpuDynInst->addr[lane]; - RequestPtr req = new Request(0, vaddr, req_size, 0, + RequestPtr req = std::make_shared(vaddr, req_size, + 0, gpuDynInst->computeUnit()->masterId(), 0, gpuDynInst->wfDynId); @@ -826,7 +828,8 @@ namespace Gcn3ISA if (gpuDynInst->exec_mask[lane]) { Addr vaddr = gpuDynInst->addr[lane]; - RequestPtr req = new Request(0, vaddr, sizeof(T), 0, + RequestPtr req = std::make_shared(vaddr, + sizeof(T), 0, gpuDynInst->computeUnit()->masterId(), 0, gpuDynInst->wfDynId); @@ -851,7 +854,8 @@ namespace Gcn3ISA if 
(gpuDynInst->exec_mask[lane]) { Addr vaddr = gpuDynInst->addr[lane]; - RequestPtr req = new Request(0, vaddr, req_size, 0, + RequestPtr req = std::make_shared(vaddr, req_size, + 0, gpuDynInst->computeUnit()->masterId(), 0, gpuDynInst->wfDynId); @@ -875,7 +879,8 @@ namespace Gcn3ISA if (gpuDynInst->exec_mask[lane]) { Addr vaddr = gpuDynInst->addr[lane]; - RequestPtr req = new Request(0, vaddr, sizeof(T), 0, + RequestPtr req = std::make_shared(vaddr, + sizeof(T), 0, gpuDynInst->computeUnit()->masterId(), 0, gpuDynInst->wfDynId, gpuDynInst->makeAtomicOpFunctor( diff --git a/src/arch/gcn3/operand.hh b/src/arch/gcn3/operand.hh index ac340f19b..218faf8cc 100644 --- a/src/arch/gcn3/operand.hh +++ b/src/arch/gcn3/operand.hh @@ -153,7 +153,7 @@ namespace Gcn3ISA ComputeUnit *cu = _gpuDynInst->computeUnit(); for (auto i = 0; i < NumDwords; ++i) { - int vgprIdx = cu->registerManager.mapVgpr(wf, _opIdx + i); + int vgprIdx = cu->registerManager->mapVgpr(wf, _opIdx + i); vrfData[i] = &cu->vrf[wf->simdId]->readWriteable(vgprIdx); DPRINTF(GPUVRF, "Read v[%d]\n", vgprIdx); @@ -207,7 +207,7 @@ namespace Gcn3ISA ? _gpuDynInst->exec_mask : wf->execMask(); if (NumDwords == 1) { - int vgprIdx = cu->registerManager.mapVgpr(wf, _opIdx); + int vgprIdx = cu->registerManager->mapVgpr(wf, _opIdx); vrfData[0] = &cu->vrf[wf->simdId]->readWriteable(vgprIdx); assert(vrfData[0]); auto reg_file_vgpr = vrfData[0]->template as(); @@ -223,8 +223,8 @@ namespace Gcn3ISA DPRINTF(GPUVRF, "Write v[%d]\n", vgprIdx); cu->vrf[wf->simdId]->printReg(wf, vgprIdx); } else if (NumDwords == 2) { - int vgprIdx0 = cu->registerManager.mapVgpr(wf, _opIdx); - int vgprIdx1 = cu->registerManager.mapVgpr(wf, _opIdx + 1); + int vgprIdx0 = cu->registerManager->mapVgpr(wf, _opIdx); + int vgprIdx1 = cu->registerManager->mapVgpr(wf, _opIdx + 1); vrfData[0] = &cu->vrf[wf->simdId]->readWriteable(vgprIdx0); vrfData[1] = &cu->vrf[wf->simdId]->readWriteable(vgprIdx1); assert(vrfData[0]); @@ -605,16 +605,16 @@ namespace Gcn3ISA if (_opIdx == REG_VCC_LO) { sgprIdx = cu->registerManager - .mapSgpr(wf, wf->reservedScalarRegs - 2 + dword); + ->mapSgpr(wf, wf->reservedScalarRegs - 2 + dword); } else if (_opIdx == REG_FLAT_SCRATCH_HI) { sgprIdx = cu->registerManager - .mapSgpr(wf, wf->reservedScalarRegs - 3 + dword); + ->mapSgpr(wf, wf->reservedScalarRegs - 3 + dword); } else if (_opIdx == REG_FLAT_SCRATCH_LO) { assert(NumDwords == 1); sgprIdx = cu->registerManager - .mapSgpr(wf, wf->reservedScalarRegs - 4 + dword); + ->mapSgpr(wf, wf->reservedScalarRegs - 4 + dword); } else { - sgprIdx = cu->registerManager.mapSgpr(wf, _opIdx + dword); + sgprIdx = cu->registerManager->mapSgpr(wf, _opIdx + dword); } assert(sgprIdx > -1); diff --git a/src/dev/hsa/hsa_device.cc b/src/dev/hsa/hsa_device.cc index 78ec8e8b4..094623dd8 100644 --- a/src/dev/hsa/hsa_device.cc +++ b/src/dev/hsa/hsa_device.cc @@ -101,7 +101,7 @@ HSADevice::translateOrDie(Addr vaddr, Addr &paddr) * with new extensions, it will likely be wrong to just arbitrarily * grab context zero. 
*/ - auto process = sys->getThreadContext(0)->getProcessPtr(); + auto process = sys->threads[0]->getProcessPtr(); if (!process->pTable->translate(vaddr, paddr)) { fatal("failed translation: vaddr 0x%x\n", vaddr); diff --git a/src/dev/hsa/hsa_driver.cc b/src/dev/hsa/hsa_driver.cc index 3f5c8eb0a..459043d93 100644 --- a/src/dev/hsa/hsa_driver.cc +++ b/src/dev/hsa/hsa_driver.cc @@ -92,3 +92,28 @@ HSADriver::mmap(ThreadContext *tc, Addr start, uint64_t length, int prot, DPRINTF(HSADriver, "amdkfd doorbell mapped to %xp\n", start); return start; } + +/** + * Forward relevant parameters to packet processor; queueID + * is used to link doorbell. The queueIDs are not re-used + * in current implementation, and we allocate only one page + * (4096 bytes) for doorbells, so check if this queue ID can + * be mapped into that page. + */ +void +HSADriver::allocateQueue(PortProxy &mem_proxy, Addr ioc_buf) +{ + TypedBufferArg args(ioc_buf); + args.copyIn(mem_proxy); + + if (queueId >= 0x1000) { + fatal("%s: Exceeded maximum number of HSA queues allowed\n", name()); + } + + args->queue_id = queueId++; + auto &hsa_pp = device->hsaPacketProc(); + hsa_pp.setDeviceQueueDesc(args->read_pointer_address, + args->ring_base_address, args->queue_id, + args->ring_size); + args.copyOut(mem_proxy); +} diff --git a/src/dev/hsa/hsa_driver.hh b/src/dev/hsa/hsa_driver.hh index b3c7ee2af..abf79abfc 100644 --- a/src/dev/hsa/hsa_driver.hh +++ b/src/dev/hsa/hsa_driver.hh @@ -56,7 +56,7 @@ struct HSADriverParams; class HSADevice; -class SETranslatingPortProxy; +class PortProxy; class ThreadContext; class HSADriver : public EmulatedDriver @@ -74,8 +74,7 @@ class HSADriver : public EmulatedDriver HSADevice *device; uint32_t queueId; - void allocateQueue(const SETranslatingPortProxy &mem_proxy, - Addr ioc_buf_addr); + void allocateQueue(PortProxy &mem_proxy, Addr ioc_buf); }; #endif // __DEV_HSA_HSA_DRIVER_HH__ diff --git a/src/dev/hsa/hsa_packet_processor.cc b/src/dev/hsa/hsa_packet_processor.cc index bd050163b..f9880e40e 100644 --- a/src/dev/hsa/hsa_packet_processor.cc +++ b/src/dev/hsa/hsa_packet_processor.cc @@ -151,7 +151,7 @@ HSAPacketProcessor::translateOrDie(Addr vaddr, Addr &paddr) // Grab the process and try to translate the virtual address with it; with // new extensions, it will likely be wrong to just arbitrarily grab context // zero. - auto process = sys->getThreadContext(0)->getProcessPtr(); + auto process = sys->threads[0]->getProcessPtr(); if (!process->pTable->translate(vaddr, paddr)) fatal("failed translation: vaddr 0x%x\n", vaddr); @@ -393,7 +393,7 @@ HSAPacketProcessor::processPkt(void* pkt, uint32_t rl_idx, Addr host_pkt_addr) * The reason for this is that the DMASequencer does * not support atomic operations. 
*/ - auto tc = sys->getThreadContext(0); + auto tc = sys->threads[0]; auto &virt_proxy = tc->getVirtProxy(); TypedBufferArg prev_signal(signal_addr); prev_signal.copyIn(virt_proxy); diff --git a/src/dev/hsa/hw_scheduler.cc b/src/dev/hsa/hw_scheduler.cc index 57cf6d1b1..8523be9cc 100644 --- a/src/dev/hsa/hw_scheduler.cc +++ b/src/dev/hsa/hw_scheduler.cc @@ -92,7 +92,7 @@ HWScheduler::registerNewQueue(uint64_t hostReadIndexPointer, // We use the same mapping function used by hsa runtime to do this mapping // // Originally - // #define VOID_PTR_ADD32(ptr,n) \ + // #define VOID_PTR_ADD32(ptr,n) // (void*)((uint32_t*)(ptr) + n)/*ptr + offset*/ // (Addr)VOID_PTR_ADD32(0, queue_id) Addr db_offset = queue_id; @@ -343,7 +343,7 @@ HWScheduler::unregisterQueue(uint64_t queue_id) // `(Addr)(VOID_PRT_ADD32(0, queue_id))` // // Originally - // #define VOID_PTR_ADD32(ptr,n) \ + // #define VOID_PTR_ADD32(ptr,n) // (void*)((uint32_t*)(ptr) + n)/*ptr + offset*/ // (Addr)VOID_PTR_ADD32(0, queue_id) Addr db_offset = queue_id; diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py index 7eaf65fec..6b033f403 100644 --- a/src/gpu-compute/GPU.py +++ b/src/gpu-compute/GPU.py @@ -1,48 +1,48 @@ +# Copyright (c) 2015-2018 Advanced Micro Devices, Inc. +# All rights reserved. # -# Copyright (c) 2015 Advanced Micro Devices, Inc. -# All rights reserved. +# For use for simulation and test purposes only # -# For use for simulation and test purposes only +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: # -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. # -# 1. Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. # -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from this +# software without specific prior written permission. # -# 3. Neither the name of the copyright holder nor the names of its contributors -# may be used to endorse or promote products derived from this software -# without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. -# -# Author: Steve Reinhardt +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. # +# Authors: Steve Reinhardt from m5.defines import buildEnv from m5.params import * from m5.proxy import * from m5.SimObject import SimObject +from m5.objects.Bridge import Bridge from m5.objects.ClockedObject import ClockedObject from m5.objects.Device import DmaDevice -from m5.objects.Process import EmulatedDriver -from m5.objects.Bridge import Bridge +from m5.objects.HSADevice import HSADevice +from m5.objects.HSADriver import HSADriver from m5.objects.LdsState import LdsState +from m5.objects.Process import EmulatedDriver class PrefetchType(Enum): vals = [ 'PF_CU', @@ -52,15 +52,48 @@ class PrefetchType(Enum): vals = [ 'PF_END', ] -class VectorRegisterFile(SimObject): +class PoolManager(SimObject): + type = 'PoolManager' + abstract = True + cxx_header = "gpu-compute/pool_manager.hh" + + min_alloc = Param.Int(4, 'min number of VGPRs allocated per WF') + pool_size = Param.Int(2048, 'number of vector registers per SIMD') + +# The simple pool manage only allows one workgroup to +# be executing on a CU at any given time. 
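For illustration only, and not part of this patch: a configuration script built against the new GCN3_X86 target could wire the pool managers, register files, and RegisterManager declared just below in this file into a ComputeUnit roughly as sketched here. The class and parameter names are the SimObjects defined in this GPU.py; the wiring, the variable names, and the reliance on default sizes are assumptions, and the shipped apu configuration scripts may differ.

# Illustrative sketch, assuming a gem5 GCN3_X86 build that provides the
# SimObjects declared in this file; not part of the patch itself.
from m5.objects import (RegisterManager, ScalarRegisterFile,
                        SimplePoolManager, VectorRegisterFile)

n_simds = 4            # ComputeUnit.num_SIMDs default
vgprs_per_simd = 2048  # PoolManager.pool_size / RegisterFile.num_regs default
sgprs_per_simd = 2048

# one vector and one scalar register file per SIMD unit
vrfs = [VectorRegisterFile(simd_id=i, num_regs=vgprs_per_simd)
        for i in range(n_simds)]
srfs = [ScalarRegisterFile(simd_id=i, num_regs=sgprs_per_simd)
        for i in range(n_simds)]

# the register manager owns one pool manager per register file and hands
# out VGPR/SGPR regions to wavefronts when a workgroup is dispatched
reg_mgr = RegisterManager(
    policy='static',
    vrf_pool_managers=[SimplePoolManager(pool_size=vgprs_per_simd)
                       for _ in range(n_simds)],
    srf_pool_managers=[SimplePoolManager(pool_size=sgprs_per_simd)
                       for _ in range(n_simds)])

# these objects would then be assigned to ComputeUnit.vector_register_file,
# ComputeUnit.scalar_register_file, and ComputeUnit.register_manager.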
+class SimplePoolManager(PoolManager): + type = 'SimplePoolManager' + cxx_class = 'SimplePoolManager' + cxx_header = "gpu-compute/simple_pool_manager.hh" + +class RegisterFile(SimObject): + type = 'RegisterFile' + cxx_class = 'RegisterFile' + cxx_header = 'gpu-compute/register_file.hh' + + simd_id = Param.Int(-1, 'SIMD ID associated with this Register File') + num_regs = Param.Int(2048, 'number of registers in this RF') + wf_size = Param.Int(64, 'Wavefront size (in work items)') + +class ScalarRegisterFile(RegisterFile): + type = 'ScalarRegisterFile' + cxx_class = 'ScalarRegisterFile' + cxx_header = 'gpu-compute/scalar_register_file.hh' + +class VectorRegisterFile(RegisterFile): type = 'VectorRegisterFile' cxx_class = 'VectorRegisterFile' cxx_header = 'gpu-compute/vector_register_file.hh' - simd_id = Param.Int(0, 'SIMD ID associated with this VRF') - num_regs_per_simd = Param.Int(2048, 'number of vector registers per SIMD') - wfSize = Param.Int(64, 'Wavefront size (in work items)') - min_alloc = Param.Int(4, 'min number of VGPRs allocated per WF') +class RegisterManager(SimObject): + type = 'RegisterManager' + cxx_class = 'RegisterManager' + cxx_header = 'gpu-compute/register_manager.hh' + + policy = Param.String("static", "Register Manager Policy") + vrf_pool_managers = VectorParam.PoolManager('VRF Pool Managers') + srf_pool_managers = VectorParam.PoolManager('SRF Pool Managers') class Wavefront(SimObject): type = 'Wavefront' @@ -69,45 +102,68 @@ class Wavefront(SimObject): simdId = Param.Int('SIMD id (0-ComputeUnit.num_SIMDs)') wf_slot_id = Param.Int('wavefront id (0-ComputeUnit.max_wfs)') - wfSize = Param.Int(64, 'Wavefront size (in work items)') + wf_size = Param.Int(64, 'Wavefront size (in work items)') + max_ib_size = Param.Int(13, 'Maximum size (in number of insts) of the ' + 'instruction buffer (IB).') +# Most of the default values here are obtained from the +# AMD Graphics Core Next (GCN) Architecture whitepaper. class ComputeUnit(ClockedObject): type = 'ComputeUnit' cxx_class = 'ComputeUnit' cxx_header = 'gpu-compute/compute_unit.hh' wavefronts = VectorParam.Wavefront('Number of wavefronts') - wfSize = Param.Int(64, 'Wavefront size (in work items)') + # Wavefront size is 64. This is configurable, however changing + # this value to anything other than 64 will likely cause errors. 
+ wf_size = Param.Int(64, 'Wavefront size (in work items)') num_SIMDs = Param.Int(4, 'number of SIMD units per CU') + num_scalar_cores = Param.Int(1, 'number of Scalar cores per CU') + num_scalar_mem_pipes = Param.Int(1, 'number of Scalar memory pipelines '\ + 'per CU') + simd_width = Param.Int(16, 'width (number of lanes) per SIMD unit') + + operand_network_length = Param.Int(1, 'number of pipe stages of operand '\ + 'network') spbypass_pipe_length = Param.Int(4, 'vector ALU Single Precision bypass '\ 'latency') - dpbypass_pipe_length = Param.Int(8, 'vector ALU Double Precision bypass '\ + dpbypass_pipe_length = Param.Int(4, 'vector ALU Double Precision bypass '\ 'latency') - + scalar_pipe_length = Param.Int(1, 'number of pipe stages per scalar ALU') issue_period = Param.Int(4, 'number of cycles per issue period') + + vrf_gm_bus_latency = Param.Int(1, 'number of cycles per use of VRF to '\ + 'GM bus') + srf_scm_bus_latency = Param.Int(1, 'number of cycles per use of SRF '\ + 'to Scalar Mem bus') + vrf_lm_bus_latency = Param.Int(1, 'number of cycles per use of VRF to '\ + 'LM bus') + num_global_mem_pipes = Param.Int(1,'number of global memory pipes per CU') num_shared_mem_pipes = Param.Int(1,'number of shared memory pipes per CU') - n_wf = Param.Int(1, 'Number of wavefront slots per SIMD') - mem_req_latency = Param.Int(9, "Latency for request from the cu to ruby. "\ - "Represents the pipeline to reach the TCP and "\ - "specified in GPU clock cycles") - mem_resp_latency = Param.Int(9, "Latency for responses from ruby to the "\ - "cu. Represents the pipeline between the TCP "\ - "and cu as well as TCP data array access. "\ - "Specified in GPU clock cycles") + n_wf = Param.Int(10, 'Number of wavefront slots per SIMD') + mem_req_latency = Param.Int(50, "Latency for request from the cu to ruby. "\ + "Represents the pipeline to reach the TCP "\ + "and specified in GPU clock cycles") + mem_resp_latency = Param.Int(50, "Latency for responses from ruby to the "\ + "cu. Represents the pipeline between the "\ + "TCP and cu as well as TCP data array "\ + "access. 
Specified in GPU clock cycles") system = Param.System(Parent.any, "system object") cu_id = Param.Int('CU id') - vrf_to_coalescer_bus_width = Param.Int(32, "VRF->Coalescer data bus width "\ - "in bytes") - coalescer_to_vrf_bus_width = Param.Int(32, "Coalescer->VRF data bus width "\ - "in bytes") + vrf_to_coalescer_bus_width = Param.Int(64, "VRF->Coalescer data bus "\ + "width in bytes") + coalescer_to_vrf_bus_width = Param.Int(64, "Coalescer->VRF data bus "\ + "width in bytes") memory_port = VectorMasterPort("Port to the memory system") translation_port = VectorMasterPort('Port to the TLB hierarchy') sqc_port = MasterPort("Port to the SQC (I-cache") sqc_tlb_port = MasterPort("Port to the TLB for the SQC (I-cache)") + scalar_port = MasterPort("Port to the scalar data cache") + scalar_tlb_port = MasterPort("Port to the TLB for the scalar data cache") perLaneTLB = Param.Bool(False, "enable per-lane TLB") prefetch_depth = Param.Int(0, "Number of prefetches triggered at a time"\ "(0 turns off prefetching)") @@ -116,19 +172,22 @@ class ComputeUnit(ClockedObject): "from last mem req in lane of "\ "CU|Phase|Wavefront") execPolicy = Param.String("OLDEST-FIRST", "WF execution selection policy"); - xactCasMode = Param.Bool(False, "Behavior of xact_cas_load magic instr."); debugSegFault = Param.Bool(False, "enable debugging GPU seg faults") functionalTLB = Param.Bool(False, "Assume TLB causes no delay") localMemBarrier = Param.Bool(False, "Assume Barriers do not wait on "\ "kernel end") - countPages = Param.Bool(False, "Generate per-CU file of all pages touched "\ - "and how many times") + countPages = Param.Bool(False, "Generate per-CU file of all pages "\ + "touched and how many times") + scalar_mem_queue_size = Param.Int(32, "Number of entries in scalar "\ + "memory pipeline's queues") global_mem_queue_size = Param.Int(256, "Number of entries in the global " "memory pipeline's queues") local_mem_queue_size = Param.Int(256, "Number of entries in the local " "memory pipeline's queues") + max_wave_requests = Param.Int(64, "number of pending vector memory "\ + "requests per wavefront") max_cu_tokens = Param.Int(4, "Maximum number of tokens, i.e., the number"\ " of instructions that can be sent to coalescer") ldsBus = Bridge() # the bridge between the CU and its LDS @@ -137,72 +196,54 @@ class ComputeUnit(ClockedObject): vector_register_file = VectorParam.VectorRegisterFile("Vector register "\ "file") + + scalar_register_file = VectorParam.ScalarRegisterFile("Scalar register "\ + "file") out_of_order_data_delivery = Param.Bool(False, "enable OoO data delivery" " in the GM pipeline") + register_manager = Param.RegisterManager("Register Manager") + fetch_depth = Param.Int(2, 'number of i-cache lines that may be ' + 'buffered in the fetch unit.') class Shader(ClockedObject): type = 'Shader' cxx_class = 'Shader' cxx_header = 'gpu-compute/shader.hh' - CUs = VectorParam.ComputeUnit('Number of compute units') - n_wf = Param.Int(1, 'Number of wavefront slots per SIMD') + gpu_cmd_proc = Param.GPUCommandProcessor('Command processor for GPU') + dispatcher = Param.GPUDispatcher('GPU workgroup dispatcher') + n_wf = Param.Int(10, 'Number of wavefront slots per SIMD') impl_kern_boundary_sync = Param.Bool(True, """Insert acq/rel packets into - ruby at kernel boundaries""") - separate_acquire_release = Param.Bool(False, - """Do ld_acquire/st_release generate separate requests for the - acquire and release?""") + ruby at kernel boundaries""") globalmem = Param.MemorySize('64kB', 'Memory size') timing = Param.Bool(False, 
'timing memory accesses') cpu_pointer = Param.BaseCPU(NULL, "pointer to base CPU") translation = Param.Bool(False, "address translation"); + timer_period = Param.Clock('10us', "system timer period") + idlecu_timeout = Param.Tick(0, "Idle CU watchdog timeout threshold") + max_valu_insts = Param.Int(0, "Maximum vALU insts before exiting") -class ClDriver(EmulatedDriver): - type = 'ClDriver' - cxx_header = 'gpu-compute/cl_driver.hh' - codefile = VectorParam.String('code file name(s)') +class GPUComputeDriver(HSADriver): + type = 'GPUComputeDriver' + cxx_header = 'gpu-compute/gpu_compute_driver.hh' -class GpuDispatcher(DmaDevice): - type = 'GpuDispatcher' +class GPUDispatcher(SimObject): + type = 'GPUDispatcher' cxx_header = 'gpu-compute/dispatcher.hh' - # put at 8GB line for now - pio_addr = Param.Addr(0x200000000, "Device Address") - pio_latency = Param.Latency('1ns', "Programmed IO latency") - shader_pointer = Param.Shader('pointer to shader') - translation_port = MasterPort('Port to the dispatcher TLB') - cpu = Param.BaseCPU("CPU to wake up on kernel completion") - - cl_driver = Param.ClDriver('pointer to driver') - -class MemType(Enum): vals = [ - 'M_U8', - 'M_U16', - 'M_U32', - 'M_U64', - 'M_S8', - 'M_S16', - 'M_S32', - 'M_S64', - 'M_F16', - 'M_F32', - 'M_F64', - ] + +class GPUCommandProcessor(HSADevice): + type = 'GPUCommandProcessor' + cxx_header = 'gpu-compute/gpu_command_processor.hh' + dispatcher = Param.GPUDispatcher('workgroup dispatcher for the GPU') class StorageClassType(Enum): vals = [ 'SC_SPILL', 'SC_GLOBAL', - 'SC_SHARED', + 'SC_GROUP', 'SC_PRIVATE', 'SC_READONLY', 'SC_KERNARG', + 'SC_ARG', 'SC_NONE', ] - -class RegisterType(Enum): vals = [ - 'RT_VECTOR', - 'RT_SCALAR', - 'RT_CONDITION', - 'RT_HARDWARE', - 'RT_NONE', - ] diff --git a/src/gpu-compute/GPUStaticInstFlags.py b/src/gpu-compute/GPUStaticInstFlags.py index e12db7107..ad4c6c3f7 100644 --- a/src/gpu-compute/GPUStaticInstFlags.py +++ b/src/gpu-compute/GPUStaticInstFlags.py @@ -13,9 +13,9 @@ # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # -# 3. Neither the name of the copyright holder nor the names of its contributors -# may be used to endorse or promote products derived from this software -# without specific prior written permission. +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from this +# software without specific prior written permission. 
# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -40,15 +40,18 @@ class GPUStaticInstFlags(Enum): # Op types 'ALU', # ALU op 'Branch', # Branch instruction + 'CondBranch', # Conditinal Branch instruction 'Nop', # No-op (no effect at all) - 'Return', # Return instruction + 'Return', # Subroutine return instruction + 'EndOfKernel', # Kernel termination instruction + 'KernelLaunch', # Kernel launch inst 'UnconditionalJump', # 'SpecialOp', # Special op 'Waitcnt', # Is a waitcnt instruction # Memory ops 'MemBarrier', # Barrier instruction - 'MemFence', # Memory fence instruction + 'MemSync', # Synchronizing instruction 'MemoryRef', # References memory (load, store, or atomic) 'Flat', # Flat memory op 'Load', # Reads from memory @@ -64,6 +67,13 @@ class GPUStaticInstFlags(Enum): 'WritesSCC', # The instruction writes SCC 'ReadsVCC', # The instruction reads VCC 'WritesVCC', # The instruction writes VCC + 'ReadsEXEC', # The instruction reads Exec Mask + 'WritesEXEC', # The instruction writes Exec Mask + 'ReadsMode', # The instruction reads Mode register + 'WritesMode', # The instruction writes Mode register + 'IgnoreExec', # The instruction ignores the Exec Mask + 'IsSDWA', # The instruction is a SDWA instruction + 'IsDPP', # The instruction is a DPP instruction # Atomic OP types 'AtomicAnd', @@ -78,13 +88,6 @@ class GPUStaticInstFlags(Enum): 'AtomicMax', 'AtomicMin', - # Memory order flags - 'RelaxedOrder', - 'Acquire', # Has acquire semantics - 'Release', # Has release semantics - 'AcquireRelease', # Has acquire and release semantics - 'NoOrder', # Has no ordering restrictions - # Segment access flags 'ArgSegment', # Accesses the arg segment 'GlobalSegment', # Accesses global memory @@ -95,15 +98,17 @@ class GPUStaticInstFlags(Enum): 'SpillSegment', # Accesses the spill segment 'NoSegment', # Does not have an associated segment - # Scope flags - 'WorkitemScope', - 'WavefrontScope', - 'WorkgroupScope', - 'DeviceScope', - 'SystemScope', - 'NoScope', # Does not have an associated scope - # Coherence flags - 'GloballyCoherent', # Coherent with other workitems on same device - 'SystemCoherent' # Coherent with a different device, or the host + 'GloballyCoherent', # Coherent with other work-items on same device + 'SystemCoherent', # Coherent with a different device, or the host + + # Floating-point flags + 'F16', # F16 operation + 'F32', # F32 operation + 'F64', # F64 operation + + # MAC, MAD, FMA + 'FMA', # FMA + 'MAC', # MAC + 'MAD' # MAD ] diff --git a/src/gpu-compute/SConscript b/src/gpu-compute/SConscript index b0ffebf0b..244791b9b 100644 --- a/src/gpu-compute/SConscript +++ b/src/gpu-compute/SConscript @@ -41,56 +41,62 @@ SimObject('GPUStaticInstFlags.py') SimObject('LdsState.py') SimObject('X86GPUTLB.py') -if env['TARGET_GPU_ISA'] == 'hsail': - Source('brig_object.cc') - Source('hsail_code.cc') - -Source('cl_driver.cc') Source('compute_unit.cc') -Source('condition_register_state.cc') Source('dispatcher.cc') Source('exec_stage.cc') Source('fetch_stage.cc') Source('fetch_unit.cc') Source('global_memory_pipeline.cc') +Source('gpu_command_processor.cc') +Source('gpu_compute_driver.cc') Source('gpu_dyn_inst.cc') Source('gpu_exec_context.cc') Source('gpu_static_inst.cc') Source('gpu_tlb.cc') -Source('hsa_object.cc') -Source('kernel_cfg.cc') Source('lds_state.cc') Source('local_memory_pipeline.cc') Source('pool_manager.cc') +Source('register_file.cc') +Source('register_manager.cc') 
+Source('scalar_memory_pipeline.cc') +Source('scalar_register_file.cc') Source('schedule_stage.cc') Source('scheduler.cc') Source('scoreboard_check_stage.cc') Source('shader.cc') Source('simple_pool_manager.cc') +Source('static_register_manager_policy.cc') Source('tlb_coalescer.cc') Source('vector_register_file.cc') -Source('vector_register_state.cc') Source('wavefront.cc') -DebugFlag('BRIG') DebugFlag('GPUCoalescer') +DebugFlag('GPUCommandProc') +DebugFlag('GPUDriver') +DebugFlag('GPUInitAbi') DebugFlag('GPUDisp') DebugFlag('GPUExec') DebugFlag('GPUFetch') -DebugFlag('GPUHsailCFInfo') +DebugFlag('GPUKernelInfo') DebugFlag('GPUMem') DebugFlag('GPUPort') DebugFlag('GPUPrefetch') DebugFlag('GPUReg') +DebugFlag('GPURename') +DebugFlag('GPURF') +DebugFlag('GPURfState') +DebugFlag('GPUSched') +DebugFlag('GPUShader') +DebugFlag('GPUSRF') DebugFlag('GPUSync') DebugFlag('GPUTLB') DebugFlag('GPUVRF') -DebugFlag('HSALoader') -DebugFlag('HSAIL') -DebugFlag('HSAILObject') +DebugFlag('GPUVRFSched') +DebugFlag('GPUWgLatency') DebugFlag('Predictor') DebugFlag('WavefrontStack') CompoundFlag('GPUALL', ['GPUCoalescer', 'GPUDisp', 'GPUExec', 'GPUFetch', - 'GPUMem', 'GPUPort', 'GPUSync', 'GPUTLB', 'HSAIL', - 'GPUVRF']) + 'GPUMem', 'GPUPort', 'GPUSched', 'GPUSRF', 'GPUSync', + 'GPUTLB', 'GPUVRF', 'GPUWgLatency', 'GPUKernelInfo', + 'GPUInitAbi']) diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc index cd880d6cc..feeb803e1 100644 --- a/src/gpu-compute/compute_unit.cc +++ b/src/gpu-compute/compute_unit.cc @@ -42,53 +42,69 @@ #include "debug/GPUMem.hh" #include "debug/GPUPort.hh" #include "debug/GPUPrefetch.hh" +#include "debug/GPUReg.hh" +#include "debug/GPURename.hh" #include "debug/GPUSync.hh" #include "debug/GPUTLB.hh" #include "gpu-compute/dispatcher.hh" #include "gpu-compute/gpu_dyn_inst.hh" #include "gpu-compute/gpu_static_inst.hh" -#include "gpu-compute/ndrange.hh" +#include "gpu-compute/scalar_register_file.hh" #include "gpu-compute/shader.hh" #include "gpu-compute/simple_pool_manager.hh" #include "gpu-compute/vector_register_file.hh" #include "gpu-compute/wavefront.hh" #include "mem/page_table.hh" #include "sim/process.hh" - -ComputeUnit::ComputeUnit(const Params *p) : ClockedObject(p), fetchStage(p), - scoreboardCheckStage(p), scheduleStage(p), execStage(p), - globalMemoryPipe(p), localMemoryPipe(p), rrNextMemID(0), rrNextALUWp(0), - cu_id(p->cu_id), vrf(p->vector_register_file), numSIMDs(p->num_SIMDs), +#include "sim/sim_exit.hh" + +ComputeUnit::ComputeUnit(const Params *p) : ClockedObject(p), + numVectorGlobalMemUnits(p->num_global_mem_pipes), + numVectorSharedMemUnits(p->num_shared_mem_pipes), + numScalarMemUnits(p->num_scalar_mem_pipes), + numVectorALUs(p->num_SIMDs), + numScalarALUs(p->num_scalar_cores), + vrfToCoalescerBusWidth(p->vrf_to_coalescer_bus_width), + coalescerToVrfBusWidth(p->coalescer_to_vrf_bus_width), + registerManager(p->register_manager), fetchStage(p), + scoreboardCheckStage(p), scheduleStage(p, this), execStage(p), + globalMemoryPipe(p), localMemoryPipe(p), scalarMemoryPipe(p), + tickEvent([this]{ exec(); }, "Compute unit tick event", + false, Event::CPU_Tick_Pri), + cu_id(p->cu_id), + vrf(p->vector_register_file), srf(p->scalar_register_file), + simdWidth(p->simd_width), spBypassPipeLength(p->spbypass_pipe_length), dpBypassPipeLength(p->dpbypass_pipe_length), + scalarPipeStages(p->scalar_pipe_length), + operandNetworkLength(p->operand_network_length), issuePeriod(p->issue_period), - numGlbMemUnits(p->num_global_mem_pipes), - 
numLocMemUnits(p->num_shared_mem_pipes), + vrf_gm_bus_latency(p->vrf_gm_bus_latency), + srf_scm_bus_latency(p->srf_scm_bus_latency), + vrf_lm_bus_latency(p->vrf_lm_bus_latency), perLaneTLB(p->perLaneTLB), prefetchDepth(p->prefetch_depth), prefetchStride(p->prefetch_stride), prefetchType(p->prefetch_prev_type), - xact_cas_mode(p->xactCasMode), debugSegFault(p->debugSegFault), + debugSegFault(p->debugSegFault), functionalTLB(p->functionalTLB), localMemBarrier(p->localMemBarrier), countPages(p->countPages), barrier_id(0), - vrfToCoalescerBusWidth(p->vrf_to_coalescer_bus_width), - coalescerToVrfBusWidth(p->coalescer_to_vrf_bus_width), req_tick_latency(p->mem_req_latency * p->clk_domain->clockPeriod()), resp_tick_latency(p->mem_resp_latency * p->clk_domain->clockPeriod()), _masterId(p->system->getMasterId(this, "ComputeUnit")), lds(*p->localDataStore), gmTokenPort(name() + ".gmTokenPort", this), _cacheLineSize(p->system->cacheLineSize()), globalSeqNum(0), - wavefrontSize(p->wfSize), kernelLaunchInst(new KernelLaunchStaticInst()) + wavefrontSize(p->wf_size) { /** * This check is necessary because std::bitset only provides conversion * to unsigned long or unsigned long long via to_ulong() or to_ullong(). - * there are * a few places in the code where to_ullong() is used, however - * if VSZ is larger than a value the host can support then bitset will - * throw a runtime exception. we should remove all use of to_long() or - * to_ullong() so we can have VSZ greater than 64b, however until that is - * done this assert is required. + * there are a few places in the code where to_ullong() is used, however + * if wavefrontSize is larger than a value the host can support then + * bitset will throw a runtime exception. We should remove all use of + * to_long() or to_ullong() so we can have wavefrontSize greater than 64b, + * however until that is done this assert is required. 
*/ - fatal_if(p->wfSize > std::numeric_limits::digits || - p->wfSize <= 0, + fatal_if(p->wf_size > std::numeric_limits::digits || + p->wf_size <= 0, "WF size is larger than the host can support"); fatal_if(!isPowerOf2(wavefrontSize), "Wavefront size should be a power of 2"); @@ -101,10 +117,12 @@ ComputeUnit::ComputeUnit(const Params *p) : ClockedObject(p), fetchStage(p), numCyclesPerLoadTransfer = (wfSize() * sizeof(uint32_t)) / coalescerToVrfBusWidth; - lastVaddrWF.resize(numSIMDs); - wfList.resize(numSIMDs); + // Initialization: all WF slots are assumed STOPPED + idleWfs = p->n_wf * numVectorALUs; + lastVaddrWF.resize(numVectorALUs); + wfList.resize(numVectorALUs); - for (int j = 0; j < numSIMDs; ++j) { + for (int j = 0; j < numVectorALUs; ++j) { lastVaddrWF[j].resize(p->n_wf); for (int i = 0; i < p->n_wf; ++i) { @@ -119,9 +137,9 @@ ComputeUnit::ComputeUnit(const Params *p) : ClockedObject(p), fetchStage(p), } } - lastVaddrSimd.resize(numSIMDs); + lastVaddrSimd.resize(numVectorALUs); - for (int i = 0; i < numSIMDs; ++i) { + for (int i = 0; i < numVectorALUs; ++i) { lastVaddrSimd[i].resize(wfSize(), 0); } @@ -150,20 +168,33 @@ ComputeUnit::ComputeUnit(const Params *p) : ClockedObject(p), fetchStage(p), cuExitCallback = new CUExitCallback(this); registerExitCallback(cuExitCallback); - xactCasLoadMap.clear(); - lastExecCycle.resize(numSIMDs, 0); + lastExecCycle.resize(numVectorALUs, 0); for (int i = 0; i < vrf.size(); ++i) { vrf[i]->setParent(this); } - + for (int i = 0; i < srf.size(); ++i) { + srf[i]->setParent(this); + } numVecRegsPerSimd = vrf[0]->numRegs(); + numScalarRegsPerSimd = srf[0]->numRegs(); + + registerManager->setParent(this); + + activeWaves = 0; + + instExecPerSimd.resize(numVectorALUs, 0); + + // Calculate the number of bits to address a cache line + panic_if(!isPowerOf2(_cacheLineSize), + "Cache line size should be a power of two."); + cacheLineBits = floorLog2(_cacheLineSize); } ComputeUnit::~ComputeUnit() { // Delete wavefront slots - for (int j = 0; j < numSIMDs; ++j) { + for (int j = 0; j < numVectorALUs; ++j) { for (int i = 0; i < shader->n_wf; ++i) { delete wfList[j][i]; } @@ -171,63 +202,110 @@ ComputeUnit::~ComputeUnit() } lastVaddrCU.clear(); readyList.clear(); - waveStatusList.clear(); dispatchList.clear(); - vectorAluInstAvail.clear(); delete cuExitCallback; delete ldsPort; } -void -ComputeUnit::fillKernelState(Wavefront *w, NDRange *ndr) +int +ComputeUnit::numExeUnits() const +{ + return numVectorALUs + numScalarALUs + numVectorGlobalMemUnits + + numVectorSharedMemUnits + numScalarMemUnits; +} + +// index into readyList of the first memory unit +int +ComputeUnit::firstMemUnit() const +{ + return numVectorALUs + numScalarALUs; +} + +// index into readyList of the last memory unit +int +ComputeUnit::lastMemUnit() const { - w->resizeRegFiles(ndr->q.cRegCount, ndr->q.sRegCount, ndr->q.dRegCount); + return numExeUnits() - 1; +} - w->workGroupSz[0] = ndr->q.wgSize[0]; - w->workGroupSz[1] = ndr->q.wgSize[1]; - w->workGroupSz[2] = ndr->q.wgSize[2]; +// index into scalarALUs vector of SALU used by the wavefront +int +ComputeUnit::mapWaveToScalarAlu(Wavefront *w) const +{ + if (numScalarALUs == 1) { + return 0; + } else { + return w->simdId % numScalarALUs; + } +} + +// index into readyList of Scalar ALU unit used by wavefront +int +ComputeUnit::mapWaveToScalarAluGlobalIdx(Wavefront *w) const +{ + return numVectorALUs + mapWaveToScalarAlu(w); +} + +// index into readyList of Global Memory unit used by wavefront +int +ComputeUnit::mapWaveToGlobalMem(Wavefront *w) const 
+{ + // TODO: FIXME if more than 1 GM pipe supported + return numVectorALUs + numScalarALUs; +} + +// index into readyList of Local Memory unit used by wavefront +int +ComputeUnit::mapWaveToLocalMem(Wavefront *w) const +{ + // TODO: FIXME if more than 1 LM pipe supported + return numVectorALUs + numScalarALUs + numVectorGlobalMemUnits; +} + +// index into readyList of Scalar Memory unit used by wavefront +int +ComputeUnit::mapWaveToScalarMem(Wavefront *w) const +{ + // TODO: FIXME if more than 1 ScM pipe supported + return numVectorALUs + numScalarALUs + numVectorGlobalMemUnits + + numVectorSharedMemUnits; +} + +void +ComputeUnit::fillKernelState(Wavefront *w, HSAQueueEntry *task) +{ + w->resizeRegFiles(task->numVectorRegs(), task->numScalarRegs()); + w->workGroupSz[0] = task->wgSize(0); + w->workGroupSz[1] = task->wgSize(1); + w->workGroupSz[2] = task->wgSize(2); w->wgSz = w->workGroupSz[0] * w->workGroupSz[1] * w->workGroupSz[2]; - w->gridSz[0] = ndr->q.gdSize[0]; - w->gridSz[1] = ndr->q.gdSize[1]; - w->gridSz[2] = ndr->q.gdSize[2]; - w->kernelArgs = ndr->q.args; - w->privSizePerItem = ndr->q.privMemPerItem; - w->spillSizePerItem = ndr->q.spillMemPerItem; - w->roBase = ndr->q.roMemStart; - w->roSize = ndr->q.roMemTotal; - w->computeActualWgSz(ndr); + w->gridSz[0] = task->gridSize(0); + w->gridSz[1] = task->gridSize(1); + w->gridSz[2] = task->gridSize(2); + w->computeActualWgSz(task); } +// delete all wavefronts that have been marked as ready at SCB stage +// but are found to have empty instruction buffers at SCH stage void -ComputeUnit::updateEvents() { - - if (!timestampVec.empty()) { - uint32_t vecSize = timestampVec.size(); - uint32_t i = 0; - while (i < vecSize) { - if (timestampVec[i] <= shader->tick_cnt) { - std::pair regInfo = regIdxVec[i]; - vrf[regInfo.first]->markReg(regInfo.second, sizeof(uint32_t), - statusVec[i]); - timestampVec.erase(timestampVec.begin() + i); - regIdxVec.erase(regIdxVec.begin() + i); - statusVec.erase(statusVec.begin() + i); - --vecSize; - --i; +ComputeUnit::updateReadyList(int unitId) +{ + if (!readyList[unitId].empty()) { + for (std::vector::iterator it = readyList[unitId].begin(); + it != readyList[unitId].end();) { + if ((*it)->instructionBuffer.empty()) { + it = readyList[unitId].erase(it); + } + else { + ++it; } - ++i; } } - - for (int i = 0; i< numSIMDs; ++i) { - vrf[i]->updateEvents(); - } } - void ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk, - NDRange *ndr) + HSAQueueEntry *task, bool fetchContext) { static int _n_wave = 0; @@ -239,7 +317,9 @@ ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk, init_mask[k] = 1; } - w->kernId = ndr->dispatchId; + w->execMask() = init_mask; + + w->kernId = task->dispatchId(); w->wfId = waveId; w->initMask = init_mask.to_ullong(); @@ -263,29 +343,21 @@ ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk, w->oldBarrierCnt = 0; w->barrierCnt = 0; - w->privBase = ndr->q.privMemStart; - ndr->q.privMemStart += ndr->q.privMemPerItem * wfSize(); - - w->spillBase = ndr->q.spillMemStart; - ndr->q.spillMemStart += ndr->q.spillMemPerItem * wfSize(); - - w->pushToReconvergenceStack(0, UINT32_MAX, init_mask.to_ulong()); - // WG state - w->wgId = ndr->globalWgId; - w->dispatchId = ndr->dispatchId; - w->workGroupId[0] = w->wgId % ndr->numWg[0]; - w->workGroupId[1] = (w->wgId / ndr->numWg[0]) % ndr->numWg[1]; - w->workGroupId[2] = w->wgId / (ndr->numWg[0] * ndr->numWg[1]); + w->wgId = task->globalWgId(); + w->dispatchId = task->dispatchId(); + w->workGroupId[0] 
= w->wgId % task->numWg(0); + w->workGroupId[1] = (w->wgId / task->numWg(0)) % task->numWg(1); + w->workGroupId[2] = w->wgId / (task->numWg(0) * task->numWg(1)); w->barrierId = barrier_id; - w->stalledAtBarrier = false; + w->stalledAtBarrier = (w->oldBarrierCnt == w->barrierCnt) ? false : true; // set the wavefront context to have a pointer to this section of the LDS w->ldsChunk = ldsChunk; int32_t refCount M5_VAR_USED = - lds.increaseRefCounter(w->dispatchId, w->wgId); + lds.increaseRefCounter(w->dispatchId, w->wgId); DPRINTF(GPUDisp, "CU%d: increase ref ctr wg[%d] to [%d]\n", cu_id, w->wgId, refCount); @@ -294,85 +366,134 @@ ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk, if (w->pendingFetch) w->dropFetch = true; - // is this the last wavefront in the workgroup - // if set the spillWidth to be the remaining work-items - // so that the vector access is correct - if ((waveId + 1) * wfSize() >= w->actualWgSzTotal) { - w->spillWidth = w->actualWgSzTotal - (waveId * wfSize()); - } else { - w->spillWidth = wfSize(); - } - DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: " - "WF[%d][%d]\n", _n_wave, barrier_id, cu_id, w->simdId, w->wfSlotId); + "WF[%d][%d]\n", _n_wave, w->barrierId, cu_id, w->simdId, + w->wfSlotId); + + w->initRegState(task, w->actualWgSzTotal); + w->start(_n_wave++, task->codeAddr()); - w->start(++_n_wave, ndr->q.code_ptr); + waveLevelParallelism.sample(activeWaves); + activeWaves++; +} + +/** + * trigger invalidate operation in the cu + * + * req: request initialized in shader, carrying the invlidate flags + */ +void +ComputeUnit::doInvalidate(RequestPtr req, int kernId){ + GPUDynInstPtr gpuDynInst + = std::make_shared(this, nullptr, + new KernelLaunchStaticInst(), getAndIncSeqNum()); + + // kern_id will be used in inv responses + gpuDynInst->kern_id = kernId; + // update contextId field + req->setContext(gpuDynInst->wfDynId); + + injectGlobalMemFence(gpuDynInst, true, req); +} + +/** + * trigger flush operation in the cu + * + * gpuDynInst: inst passed to the request + */ +void +ComputeUnit::doFlush(GPUDynInstPtr gpuDynInst) { + injectGlobalMemFence(gpuDynInst, true); } void -ComputeUnit::StartWorkgroup(NDRange *ndr) +ComputeUnit::dispWorkgroup(HSAQueueEntry *task, bool startFromScheduler) { - // reserve the LDS capacity allocated to the work group - // disambiguated by the dispatch ID and workgroup ID, which should be - // globally unique - LdsChunk *ldsChunk = lds.reserveSpace(ndr->dispatchId, ndr->globalWgId, - ndr->q.ldsSize); - - // Send L1 cache acquire - // isKernel + isAcquire = Kernel Begin - if (shader->impl_kern_boundary_sync) { - GPUDynInstPtr gpuDynInst = - std::make_shared(this, nullptr, kernelLaunchInst, - getAndIncSeqNum()); - - gpuDynInst->useContinuation = false; - injectGlobalMemFence(gpuDynInst, true); + // If we aren't ticking, start it up! 
+ if (!tickEvent.scheduled()) { + DPRINTF(GPUDisp, "CU%d: Scheduling wakeup next cycle\n", cu_id); + schedule(tickEvent, nextCycle()); } - // calculate the number of 32-bit vector registers required by wavefront - int vregDemand = ndr->q.sRegCount + (2 * ndr->q.dRegCount); - int wave_id = 0; - - // Assign WFs by spreading them across SIMDs, 1 WF per SIMD at a time - for (int m = 0; m < shader->n_wf * numSIMDs; ++m) { - Wavefront *w = wfList[m % numSIMDs][m / numSIMDs]; - // Check if this wavefront slot is available: - // It must be stopped and not waiting - // for a release to complete S_RETURNING - if (w->status == Wavefront::S_STOPPED) { - fillKernelState(w, ndr); - // if we have scheduled all work items then stop - // scheduling wavefronts - if (wave_id * wfSize() >= w->actualWgSzTotal) - break; + // the kernel's invalidate must have finished before any wg dispatch + assert(task->isInvDone()); - // reserve vector registers for the scheduled wavefront - assert(vectorRegsReserved[m % numSIMDs] <= numVecRegsPerSimd); - uint32_t normSize = 0; + // reserve the LDS capacity allocated to the work group + // disambiguated by the dispatch ID and workgroup ID, which should be + // globally unique + LdsChunk *ldsChunk = lds.reserveSpace(task->dispatchId(), + task->globalWgId(), + task->ldsSize()); - w->startVgprIndex = vrf[m % numSIMDs]->manager-> - allocateRegion(vregDemand, &normSize); + panic_if(!ldsChunk, "was not able to reserve space for this WG"); - w->reservedVectorRegs = normSize; - vectorRegsReserved[m % numSIMDs] += w->reservedVectorRegs; + // calculate the number of 32-bit vector registers required + // by each work item + int vregDemand = task->numVectorRegs(); + int sregDemand = task->numScalarRegs(); + int wave_id = 0; - startWavefront(w, wave_id, ldsChunk, ndr); - ++wave_id; + // Assign WFs according to numWfsToSched vector, which is computed by + // hasDispResources() + for (int j = 0; j < shader->n_wf; ++j) { + for (int i = 0; i < numVectorALUs; ++i) { + Wavefront *w = wfList[i][j]; + // Check if this wavefront slot is available and there are WFs + // remaining to be dispatched to current SIMD: + // WF slot must be stopped and not waiting + // for a release to complete S_RETURNING + if (w->getStatus() == Wavefront::S_STOPPED && + numWfsToSched[i] > 0) { + // decrement number of WFs awaiting dispatch to current SIMD + numWfsToSched[i] -= 1; + + fillKernelState(w, task); + + DPRINTF(GPURename, "SIMD[%d] wfSlotId[%d] WF[%d] " + "vregDemand[%d] sregDemand[%d]\n", i, j, w->wfDynId, + vregDemand, sregDemand); + + registerManager->allocateRegisters(w, vregDemand, sregDemand); + + startWavefront(w, wave_id, ldsChunk, task); + ++wave_id; + } } } ++barrier_id; } -int -ComputeUnit::ReadyWorkgroup(NDRange *ndr) +void +ComputeUnit::insertInPipeMap(Wavefront *w) +{ + panic_if(w->instructionBuffer.empty(), + "Instruction Buffer of WF%d can't be empty", w->wgId); + GPUDynInstPtr ii = w->instructionBuffer.front(); + pipeMap.emplace(ii->seqNum()); +} + +void +ComputeUnit::deleteFromPipeMap(Wavefront *w) +{ + panic_if(w->instructionBuffer.empty(), + "Instruction Buffer of WF%d can't be empty", w->wgId); + GPUDynInstPtr ii = w->instructionBuffer.front(); + // delete the dynamic instruction from the pipeline map + auto it = pipeMap.find(ii->seqNum()); + panic_if(it == pipeMap.end(), "Pipeline Map is empty\n"); + pipeMap.erase(it); +} + +bool +ComputeUnit::hasDispResources(HSAQueueEntry *task) { - // Get true size of workgroup (after clamping to grid size) - int trueWgSize[3]; + // compute true size 
of workgroup (after clamping to grid size) + int trueWgSize[HSAQueueEntry::MAX_DIM]; int trueWgSizeTotal = 1; - for (int d = 0; d < 3; ++d) { - trueWgSize[d] = std::min(ndr->q.wgSize[d], ndr->q.gdSize[d] - - ndr->wgId[d] * ndr->q.wgSize[d]); + for (int d = 0; d < HSAQueueEntry::MAX_DIM; ++d) { + trueWgSize[d] = std::min(task->wgSize(d), task->gridSize(d) - + task->wgId(d) * task->wgSize(d)); trueWgSizeTotal *= trueWgSize[d]; DPRINTF(GPUDisp, "trueWgSize[%d] = %d\n", d, trueWgSize[d]); @@ -380,69 +501,104 @@ ComputeUnit::ReadyWorkgroup(NDRange *ndr) DPRINTF(GPUDisp, "trueWgSizeTotal = %d\n", trueWgSizeTotal); + // calculate the number of WFs in this WG + int numWfs = (trueWgSizeTotal + wfSize() - 1) / wfSize(); + // calculate the number of 32-bit vector registers required by each // work item of the work group - int vregDemandPerWI = ndr->q.sRegCount + (2 * ndr->q.dRegCount); - bool vregAvail = true; - int numWfs = (trueWgSizeTotal + wfSize() - 1) / wfSize(); + int vregDemandPerWI = task->numVectorRegs(); + // calculate the number of 32-bit scalar registers required by each + // work item of the work group + int sregDemandPerWI = task->numScalarRegs(); + + // check if the total number of VGPRs snd SGPRs required by all WFs + // of the WG fit in the VRFs of all SIMD units and the CU's SRF + panic_if((numWfs * vregDemandPerWI) > (numVectorALUs * numVecRegsPerSimd), + "WG with %d WFs and %d VGPRs per WI can not be allocated to CU " + "that has %d VGPRs\n", + numWfs, vregDemandPerWI, numVectorALUs * numVecRegsPerSimd); + panic_if((numWfs * sregDemandPerWI) > numScalarRegsPerSimd, + "WG with %d WFs and %d SGPRs per WI can not be scheduled to CU " + "with %d SGPRs\n", + numWfs, sregDemandPerWI, numScalarRegsPerSimd); + + // number of WF slots that are not occupied int freeWfSlots = 0; - // check if the total number of VGPRs required by all WFs of the WG - // fit in the VRFs of all SIMD units - assert((numWfs * vregDemandPerWI) <= (numSIMDs * numVecRegsPerSimd)); + // number of Wfs from WG that were successfully mapped to a SIMD int numMappedWfs = 0; - std::vector numWfsPerSimd; - numWfsPerSimd.resize(numSIMDs, 0); - // find how many free WF slots we have across all SIMDs + numWfsToSched.clear(); + numWfsToSched.resize(numVectorALUs, 0); + + // attempt to map WFs to the SIMDs, based on WF slot availability + // and register file availability for (int j = 0; j < shader->n_wf; ++j) { - for (int i = 0; i < numSIMDs; ++i) { - if (wfList[i][j]->status == Wavefront::S_STOPPED) { - // count the number of free WF slots + for (int i = 0; i < numVectorALUs; ++i) { + if (wfList[i][j]->getStatus() == Wavefront::S_STOPPED) { ++freeWfSlots; - if (numMappedWfs < numWfs) { - // count the WFs to be assigned per SIMD - numWfsPerSimd[i]++; + // check if current WF will fit onto current SIMD/VRF + // if all WFs have not yet been mapped to the SIMDs + if (numMappedWfs < numWfs && + registerManager->canAllocateSgprs(i, numWfsToSched[i] + 1, + sregDemandPerWI) && + registerManager->canAllocateVgprs(i, numWfsToSched[i] + 1, + vregDemandPerWI)) { + numWfsToSched[i]++; + numMappedWfs++; } - numMappedWfs++; } } } - // if there are enough free WF slots then find if there are enough - // free VGPRs per SIMD based on the WF->SIMD mapping - if (freeWfSlots >= numWfs) { - for (int j = 0; j < numSIMDs; ++j) { - // find if there are enough free VGPR regions in the SIMD's VRF - // to accommodate the WFs of the new WG that would be mapped to - // this SIMD unit - vregAvail = vrf[j]->manager->canAllocate(numWfsPerSimd[j], - 
vregDemandPerWI); - - // stop searching if there is at least one SIMD - // whose VRF does not have enough free VGPR pools. - // This is because a WG is scheduled only if ALL - // of its WFs can be scheduled - if (!vregAvail) - break; + // check that the number of mapped WFs is not greater + // than the actual number of WFs + assert(numMappedWfs <= numWfs); + + bool vregAvail = true; + bool sregAvail = true; + // if a WF to SIMD mapping was not found, find the limiting resource + if (numMappedWfs < numWfs) { + + for (int j = 0; j < numVectorALUs; ++j) { + // find if there are enough free VGPRs in the SIMD's VRF + // to accomodate the WFs of the new WG that would be mapped + // to this SIMD unit + vregAvail &= registerManager-> + canAllocateVgprs(j, numWfsToSched[j], vregDemandPerWI); + // find if there are enough free SGPRs in the SIMD's SRF + // to accomodate the WFs of the new WG that would be mapped + // to this SIMD unit + sregAvail &= registerManager-> + canAllocateSgprs(j, numWfsToSched[j], sregDemandPerWI); } } - DPRINTF(GPUDisp, "Free WF slots = %d, VGPR Availability = %d\n", - freeWfSlots, vregAvail); + DPRINTF(GPUDisp, "Free WF slots = %d, Mapped WFs = %d, \ + VGPR Availability = %d, SGPR Availability = %d\n", + freeWfSlots, numMappedWfs, vregAvail, sregAvail); if (!vregAvail) { ++numTimesWgBlockedDueVgprAlloc; } + if (!sregAvail) { + ++numTimesWgBlockedDueSgprAlloc; + } + // Return true if enough WF slots to submit workgroup and if there are // enough VGPRs to schedule all WFs to their SIMD units - if (!lds.canReserve(ndr->q.ldsSize)) { + bool ldsAvail = lds.canReserve(task->ldsSize()); + if (!ldsAvail) { wgBlockedDueLdsAllocation++; } - // Return true if (a) there are enough free WF slots to submit - // workgrounp and (b) if there are enough VGPRs to schedule all WFs to their - // SIMD units and (c) if there is enough space in LDS - return freeWfSlots >= numWfs && vregAvail && lds.canReserve(ndr->q.ldsSize); + // Return true if the following are all true: + // (a) all WFs of the WG were mapped to free WF slots + // (b) there are enough VGPRs to schedule all WFs to their SIMD units + // (c) there are enough SGPRs on the CU to schedule all WFs + // (d) there is enough space in LDS to allocate for all WFs + bool can_dispatch = numMappedWfs == numWfs && vregAvail && sregAvail + && ldsAvail; + return can_dispatch; } int @@ -451,21 +607,24 @@ ComputeUnit::AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots) DPRINTF(GPUSync, "CU%d: Checking for All At Barrier\n", cu_id); int ccnt = 0; - for (int i_simd = 0; i_simd < numSIMDs; ++i_simd) { + for (int i_simd = 0; i_simd < numVectorALUs; ++i_simd) { for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf) { Wavefront *w = wfList[i_simd][i_wf]; - if (w->status == Wavefront::S_RUNNING) { + if (w->getStatus() == Wavefront::S_RUNNING) { DPRINTF(GPUSync, "Checking WF[%d][%d]\n", i_simd, i_wf); DPRINTF(GPUSync, "wf->barrier_id = %d, _barrier_id = %d\n", w->barrierId, _barrier_id); - DPRINTF(GPUSync, "wf->barrier_cnt %d, bcnt = %d\n", + DPRINTF(GPUSync, "wf->barrierCnt %d, bcnt = %d\n", w->barrierCnt, bcnt); + + DPRINTF(GPUSync, "outstanding Reqs = %d\n", + w->outstandingReqs); } - if (w->status == Wavefront::S_RUNNING && + if (w->getStatus() == Wavefront::S_RUNNING && w->barrierId == _barrier_id && w->barrierCnt == bcnt && !w->outstandingReqs) { ++ccnt; @@ -482,61 +641,22 @@ ComputeUnit::AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots) return ccnt == bslots; } -// Check if the current wavefront is blocked on additional 
resources. -bool -ComputeUnit::cedeSIMD(int simdId, int wfSlotId) -{ - bool cede = false; - - // If --xact-cas-mode option is enabled in run.py, then xact_cas_ld - // magic instructions will impact the scheduling of wavefronts - if (xact_cas_mode) { - /* - * When a wavefront calls xact_cas_ld, it adds itself to a per address - * queue. All per address queues are managed by the xactCasLoadMap. - * - * A wavefront is not blocked if: it is not in ANY per address queue or - * if it is at the head of a per address queue. - */ - for (auto itMap : xactCasLoadMap) { - std::list curWaveIDQueue = itMap.second.waveIDQueue; - - if (!curWaveIDQueue.empty()) { - for (auto it : curWaveIDQueue) { - waveIdentifier cur_wave = it; - - if (cur_wave.simdId == simdId && - cur_wave.wfSlotId == wfSlotId) { - // 2 possibilities - // 1: this WF has a green light - // 2: another WF has a green light - waveIdentifier owner_wave = curWaveIDQueue.front(); - - if (owner_wave.simdId != cur_wave.simdId || - owner_wave.wfSlotId != cur_wave.wfSlotId) { - // possibility 2 - cede = true; - break; - } else { - // possibility 1 - break; - } - } - } - } - } - } - - return cede; -} - // Execute one clock worth of work on the ComputeUnit. void ComputeUnit::exec() { - updateEvents(); + // process reads and writes in the RFs + for (auto &vecRegFile : vrf) { + vecRegFile->exec(); + } + + for (auto &scRegFile : srf) { + scRegFile->exec(); + } + // Execute pipeline stages in reverse order to simulate // the pipeline latency + scalarMemoryPipe.exec(); globalMemoryPipe.exec(); localMemoryPipe.exec(); execStage.exec(); @@ -545,65 +665,62 @@ ComputeUnit::exec() fetchStage.exec(); totalCycles++; + + // Put this CU to sleep if there is no more work to be done. + if (!isDone()) { + schedule(tickEvent, nextCycle()); + } else { + shader->notifyCuSleep(); + DPRINTF(GPUDisp, "CU%d: Going to sleep\n", cu_id); + } } void ComputeUnit::init() { - // Initialize CU Bus models - glbMemToVrfBus.init(&shader->tick_cnt, shader->ticks(1)); - locMemToVrfBus.init(&shader->tick_cnt, shader->ticks(1)); - nextGlbMemBus = 0; - nextLocMemBus = 0; - fatal_if(numGlbMemUnits > 1, - "No support for multiple Global Memory Pipelines exists!!!"); - vrfToGlobalMemPipeBus.resize(numGlbMemUnits); - for (int j = 0; j < numGlbMemUnits; ++j) { - vrfToGlobalMemPipeBus[j] = WaitClass(); - vrfToGlobalMemPipeBus[j].init(&shader->tick_cnt, shader->ticks(1)); - } + // Initialize CU Bus models and execution resources - fatal_if(numLocMemUnits > 1, - "No support for multiple Local Memory Pipelines exists!!!"); - vrfToLocalMemPipeBus.resize(numLocMemUnits); - for (int j = 0; j < numLocMemUnits; ++j) { - vrfToLocalMemPipeBus[j] = WaitClass(); - vrfToLocalMemPipeBus[j].init(&shader->tick_cnt, shader->ticks(1)); + // Vector ALUs + vectorALUs.clear(); + for (int i = 0; i < numVectorALUs; i++) { + vectorALUs.emplace_back(this, clockPeriod()); } - vectorRegsReserved.resize(numSIMDs, 0); - aluPipe.resize(numSIMDs); - wfWait.resize(numSIMDs + numLocMemUnits + numGlbMemUnits); - for (int i = 0; i < numSIMDs + numLocMemUnits + numGlbMemUnits; ++i) { - wfWait[i] = WaitClass(); - wfWait[i].init(&shader->tick_cnt, shader->ticks(1)); + // Scalar ALUs + scalarALUs.clear(); + for (int i = 0; i < numScalarALUs; i++) { + scalarALUs.emplace_back(this, clockPeriod()); } - for (int i = 0; i < numSIMDs; ++i) { - aluPipe[i] = WaitClass(); - aluPipe[i].init(&shader->tick_cnt, shader->ticks(1)); - } + // Vector Global Memory + fatal_if(numVectorGlobalMemUnits > 1, + "No support for multiple Global Memory 
Pipelines exists!!!"); + vectorGlobalMemUnit.init(this, clockPeriod()); + vrfToGlobalMemPipeBus.init(this, clockPeriod()); + glbMemToVrfBus.init(this, clockPeriod()); - // Setup space for call args - for (int j = 0; j < numSIMDs; ++j) { - for (int i = 0; i < shader->n_wf; ++i) { - wfList[j][i]->initCallArgMem(shader->funcargs_size, wavefrontSize); - } - } + // Vector Local/Shared Memory + fatal_if(numVectorSharedMemUnits > 1, + "No support for multiple Local Memory Pipelines exists!!!"); + vectorSharedMemUnit.init(this, clockPeriod()); + vrfToLocalMemPipeBus.init(this, clockPeriod()); + locMemToVrfBus.init(this, clockPeriod()); - // Initializing pipeline resources - readyList.resize(numSIMDs + numGlbMemUnits + numLocMemUnits); - waveStatusList.resize(numSIMDs); + // Scalar Memory + fatal_if(numScalarMemUnits > 1, + "No support for multiple Scalar Memory Pipelines exists!!!"); + scalarMemUnit.init(this, clockPeriod()); + srfToScalarMemPipeBus.init(this, clockPeriod()); + scalarMemToSrfBus.init(this, clockPeriod()); - for (int j = 0; j < numSIMDs; ++j) { - for (int i = 0; i < shader->n_wf; ++i) { - waveStatusList[j].push_back( - std::make_pair(wfList[j][i], BLOCKED)); - } - } + vectorRegsReserved.resize(numVectorALUs, 0); + scalarRegsReserved.resize(numVectorALUs, 0); + + // Initializing pipeline resources + readyList.resize(numExeUnits()); - for (int j = 0; j < (numSIMDs + numGlbMemUnits + numLocMemUnits); ++j) { - dispatchList.push_back(std::make_pair((Wavefront*)nullptr, EMPTY)); + for (int j = 0; j < numExeUnits(); ++j) { + dispatchList.push_back(std::make_pair(nullptr, EMPTY)); } fetchStage.init(this); @@ -612,10 +729,7 @@ ComputeUnit::init() execStage.init(this); globalMemoryPipe.init(this); localMemoryPipe.init(this); - // initialize state for statistics calculation - vectorAluInstAvail.resize(numSIMDs, false); - shrMemInstAvail = 0; - glbMemInstAvail = 0; + scalarMemoryPipe.init(this); gmTokenPort.setTokenManager(memPortTokens); } @@ -629,61 +743,176 @@ ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt) SenderState *sender_state = safe_cast(pkt->senderState); int index = sender_state->port_index; GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst; + GPUDispatcher &dispatcher = computeUnit->shader->dispatcher(); + + // MemSyncResp + WriteAckResp are handled completely here and we don't + // schedule a MemRespEvent to process the responses further + if (pkt->cmd == MemCmd::MemSyncResp) { + // This response is for 1 of the following request types: + // - kernel launch + // - kernel end + // - non-kernel mem sync + + // Kernel Launch + // wavefront was nullptr when launching kernel, so it is meaningless + // here (simdId=-1, wfSlotId=-1) + if (gpuDynInst->isKernelLaunch()) { + // for kernel launch, the original request must be both kernel-type + // and acquire + assert(pkt->req->isKernel()); + assert(pkt->req->isAcquire()); + + // one D-Cache inv is done, decrement counter + dispatcher.updateInvCounter(gpuDynInst->kern_id); + + delete pkt->senderState; + delete pkt; + return true; + } - // Is the packet returned a Kernel End or Barrier - if (pkt->req->isKernel() && pkt->req->isRelease()) { - Wavefront *w = - computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId]; + // retrieve wavefront from inst + Wavefront *w = gpuDynInst->wavefront(); // Check if we are waiting on Kernel End Release - if (w->status == Wavefront::S_RETURNING) { - DPRINTF(GPUDisp, "CU%d: WF[%d][%d][wv=%d]: WG id completed %d\n", + if (w->getStatus() == Wavefront::S_RETURNING + && gpuDynInst->isEndOfKernel()) 
{ + // for kernel end, the original request must be both kernel-type + // and release + assert(pkt->req->isKernel()); + assert(pkt->req->isRelease()); + + // one wb done, decrement counter, and return whether all wbs are + // done for the kernel + bool isWbDone = dispatcher.updateWbCounter(gpuDynInst->kern_id); + + // not all wbs are done for the kernel, just release pkt + // resources + if (!isWbDone) { + delete pkt->senderState; + delete pkt; + return true; + } + + // all wbs are completed for the kernel, do retirement work + // for the workgroup + DPRINTF(GPUDisp, "CU%d: WF[%d][%d][wv=%d]: WG %d completed\n", computeUnit->cu_id, w->simdId, w->wfSlotId, - w->wfDynId, w->kernId); + w->wfDynId, w->wgId); - computeUnit->shader->dispatcher->notifyWgCompl(w); - w->status = Wavefront::S_STOPPED; - } else { - w->outstandingReqs--; + dispatcher.notifyWgCompl(w); + w->setStatus(Wavefront::S_STOPPED); } - DPRINTF(GPUSync, "CU%d: WF[%d][%d]: barrier_cnt = %d\n", + if (!pkt->req->isKernel()) { + w = computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId]; + DPRINTF(GPUExec, "MemSyncResp: WF[%d][%d] WV%d %s decrementing " + "outstanding reqs %d => %d\n", gpuDynInst->simdId, + gpuDynInst->wfSlotId, gpuDynInst->wfDynId, + gpuDynInst->disassemble(), w->outstandingReqs, + w->outstandingReqs - 1); + computeUnit->globalMemoryPipe.handleResponse(gpuDynInst); + } + + DPRINTF(GPUSync, "CU%d: WF[%d][%d]: barrierCnt = %d\n", computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, w->barrierCnt); - if (gpuDynInst->useContinuation) { - assert(!gpuDynInst->isNoScope()); - gpuDynInst->execContinuation(gpuDynInst->staticInstruction(), - gpuDynInst); - } - delete pkt->senderState; delete pkt; return true; - } else if (pkt->req->isKernel() && pkt->req->isAcquire()) { - if (gpuDynInst->useContinuation) { - assert(!gpuDynInst->isNoScope()); - gpuDynInst->execContinuation(gpuDynInst->staticInstruction(), - gpuDynInst); + } else if (pkt->cmd == MemCmd::WriteCompleteResp) { + // this is for writeComplete callback + // we simply get decrement write-related wait counters + assert(gpuDynInst); + Wavefront *w M5_VAR_USED = + computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId]; + assert(w); + DPRINTF(GPUExec, "WriteCompleteResp: WF[%d][%d] WV%d %s decrementing " + "outstanding reqs %d => %d\n", gpuDynInst->simdId, + gpuDynInst->wfSlotId, gpuDynInst->wfDynId, + gpuDynInst->disassemble(), w->outstandingReqs, + w->outstandingReqs - 1); + if (gpuDynInst->statusBitVector.none()) { + // ask gm pipe to decrement request counters, instead of directly + // performing here, to avoid asynchronous counter update and + // instruction retirement (which may hurt waincnt effects) + computeUnit->globalMemoryPipe.handleResponse(gpuDynInst); + + DPRINTF(GPUMem, "CU%d: WF[%d][%d]: write totally complete\n", + computeUnit->cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId); } delete pkt->senderState; delete pkt; + return true; } EventFunctionWrapper *mem_resp_event = computeUnit->memPort[index]->createMemRespEvent(pkt); - DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x received!\n", + DPRINTF(GPUPort, + "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x received!\n", computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, - index, pkt->req->getPaddr()); + gpuDynInst->seqNum(), index, pkt->req->getPaddr()); computeUnit->schedule(mem_resp_event, curTick() + computeUnit->resp_tick_latency); + + return true; +} + +bool +ComputeUnit::ScalarDataPort::recvTimingResp(PacketPtr pkt) +{ + assert(!pkt->req->isKernel()); + 
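// The kernel-launch and kernel-end paths above both lean on a per-kernel
// ack counter in the dispatcher: updateInvCounter() ticks down outstanding
// D-Cache invalidations for a launch, and updateWbCounter() ticks down
// outstanding write-backs for a kernel end, reporting when the final ack
// has arrived so that workgroup retirement runs exactly once. The sketch
// below shows that counting pattern in isolation; KernelAckTracker and its
// members are hypothetical names for illustration, not the dispatcher's
// actual interface.
//
//     #include <cassert>
//     #include <unordered_map>
//
//     class KernelAckTracker
//     {
//       public:
//         // arm the counter when the acquire/release requests are issued
//         void arm(int kern_id, int num_acks) { pending[kern_id] = num_acks; }
//
//         // called once per ack; returns true only for the final ack,
//         // at which point the caller may do its retirement work
//         bool ack(int kern_id)
//         {
//             auto it = pending.find(kern_id);
//             assert(it != pending.end() && it->second > 0);
//             if (--it->second == 0) {
//                 pending.erase(it);
//                 return true;
//             }
//             return false;
//         }
//
//       private:
//         std::unordered_map<int, int> pending;
//     };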
+ // retrieve sender state + SenderState *sender_state = safe_cast(pkt->senderState); + GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst; + + assert(pkt->isRead() || pkt->isWrite()); + assert(gpuDynInst->numScalarReqs > 0); + + gpuDynInst->numScalarReqs--; + + /** + * for each returned scalar request we decrement the + * numScalarReqs counter that is associated with this + * gpuDynInst, which should have been set to correspond + * to the number of packets sent for the memory op. + * once all packets return, the memory op is finished + * and we can push it into the response queue. + */ + if (!gpuDynInst->numScalarReqs) { + if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) { + computeUnit->scalarMemoryPipe.getGMLdRespFIFO().push( + gpuDynInst); + } else { + computeUnit->scalarMemoryPipe.getGMStRespFIFO().push( + gpuDynInst); + } + } + + delete pkt->senderState; + delete pkt; + return true; } +void +ComputeUnit::ScalarDataPort::recvReqRetry() +{ + for (const auto &pkt : retries) { + if (!sendTimingReq(pkt)) { + break; + } else { + retries.pop_front(); + } + } +} + void ComputeUnit::DataPort::recvReqRetry() { @@ -715,7 +944,6 @@ bool ComputeUnit::SQCPort::recvTimingResp(PacketPtr pkt) { computeUnit->fetchStage.processFetchReturn(pkt); - return true; } @@ -759,9 +987,12 @@ ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt) BaseTLB::Mode TLB_mode; assert(pkt->isRead() || pkt->isWrite()); + // only do some things if actually accessing data + bool isDataAccess = pkt->isWrite() || pkt->isRead(); + // Check write before read for atomic operations // since atomic operations should use BaseTLB::Write - if (pkt->isWrite()){ + if (pkt->isWrite()) { TLB_mode = BaseTLB::Write; } else if (pkt->isRead()) { TLB_mode = BaseTLB::Read; @@ -825,8 +1056,6 @@ ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt) assert(pkt->req->hasPaddr()); assert(pkt->req->hasSize()); - uint8_t *tmpData = pkt->getPtr(); - // this is necessary because the GPU TLB receives packets instead // of requests. when the translation is complete, all relevent // fields in the request will be populated, but not in the packet. @@ -834,13 +1063,17 @@ ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt) // and proper flags. 
PacketPtr oldPkt = pkt; pkt = new Packet(oldPkt->req, oldPkt->cmd); + if (isDataAccess) { + uint8_t *tmpData = oldPkt->getPtr(); + pkt->dataStatic(tmpData); + } delete oldPkt; - pkt->dataStatic(tmpData); // New SenderState for the memory access - pkt->senderState = new ComputeUnit::DataPort::SenderState(gpuDynInst, - index, nullptr); + pkt->senderState = + new ComputeUnit::DataPort::SenderState(gpuDynInst, index, + nullptr); gpuDynInst->memStatusVector[pkt->getAddr()].push_back(index); gpuDynInst->tlbHitLevel[index] = hit_level; @@ -860,8 +1093,8 @@ ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt) assert(tlbPort[tlbPort_index]->retries.size() > 0); DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x " - "failed!\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, - tmp_vaddr); + "failed!\n", cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId, tmp_vaddr); tlbPort[tlbPort_index]->retries.push_back(pkt); } else if (!tlbPort[tlbPort_index]->sendTimingReq(pkt)) { @@ -872,8 +1105,8 @@ ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt) tlbPort[tlbPort_index]->stallPort(); DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x " - "failed!\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, - tmp_vaddr); + "failed!\n", cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId, tmp_vaddr); tlbPort[tlbPort_index]->retries.push_back(pkt); } else { @@ -882,7 +1115,7 @@ ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt) cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, tmp_vaddr); } } else { - if (pkt->cmd == MemCmd::MemFenceReq) { + if (pkt->cmd == MemCmd::MemSyncReq) { gpuDynInst->statusBitVector = VectorMask(0); } else { gpuDynInst->statusBitVector &= (~(1ll << index)); @@ -907,6 +1140,7 @@ ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt) // Translation is done. It is safe to send the packet to memory. memPort[0]->sendFunctional(new_pkt); + DPRINTF(GPUMem, "Functional sendRequest\n"); DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index %d: addr %#x\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, index, new_pkt->req->getPaddr()); @@ -923,56 +1157,105 @@ ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt) } void -ComputeUnit::sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt) +ComputeUnit::sendScalarRequest(GPUDynInstPtr gpuDynInst, PacketPtr pkt) { - EventFunctionWrapper *mem_req_event = - memPort[index]->createMemReqEvent(pkt); - + assert(pkt->isWrite() || pkt->isRead()); - // New SenderState for the memory access - pkt->senderState = new ComputeUnit::DataPort::SenderState(gpuDynInst, index, - nullptr); + BaseTLB::Mode tlb_mode = pkt->isRead() ? BaseTLB::Read : BaseTLB::Write; - DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x sync scheduled\n", - cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, index, - pkt->req->getPaddr()); + pkt->senderState = + new ComputeUnit::ScalarDTLBPort::SenderState(gpuDynInst); - schedule(mem_req_event, curTick() + req_tick_latency); + pkt->senderState = + new TheISA::GpuTLB::TranslationState(tlb_mode, shader->gpuTc, false, + pkt->senderState); + + if (scalarDTLBPort->isStalled()) { + assert(scalarDTLBPort->retries.size()); + scalarDTLBPort->retries.push_back(pkt); + } else if (!scalarDTLBPort->sendTimingReq(pkt)) { + scalarDTLBPort->stallPort(); + scalarDTLBPort->retries.push_back(pkt); + } else { + DPRINTF(GPUTLB, "sent scalar %s translation request for addr %#x\n", + tlb_mode == BaseTLB::Read ? 
"read" : "write", + pkt->req->getVaddr()); + } } void -ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst, bool kernelLaunch, +ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst, + bool kernelMemSync, RequestPtr req) { - assert(gpuDynInst->isGlobalSeg()); + assert(gpuDynInst->isGlobalSeg() || + gpuDynInst->executedAs() == Enums::SC_GLOBAL); if (!req) { req = std::make_shared( 0, 0, 0, masterId(), 0, gpuDynInst->wfDynId); } + + // all mem sync requests have Paddr == 0 req->setPaddr(0); - if (kernelLaunch) { - req->setFlags(Request::KERNEL); - } - // for non-kernel MemFence operations, memorder flags are set depending - // on which type of request is currently being sent, so this - // should be set by the caller (e.g. if an inst has acq-rel - // semantics, it will send one acquire req an one release req) - gpuDynInst->setRequestFlags(req, kernelLaunch); + PacketPtr pkt = nullptr; - // a mem fence must correspond to an acquire/release request - assert(req->isAcquire() || req->isRelease()); + if (kernelMemSync) { + if (gpuDynInst->isKernelLaunch()) { + req->setCacheCoherenceFlags(Request::ACQUIRE); + req->setReqInstSeqNum(gpuDynInst->seqNum()); + req->setFlags(Request::KERNEL); + pkt = new Packet(req, MemCmd::MemSyncReq); + pkt->pushSenderState( + new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr)); - // create packet - PacketPtr pkt = new Packet(req, MemCmd::MemFenceReq); + EventFunctionWrapper *mem_req_event = + memPort[0]->createMemReqEvent(pkt); - // set packet's sender state - pkt->senderState = - new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr); + DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling " + "an acquire\n", cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId, 0, pkt->req->getPaddr()); + + schedule(mem_req_event, curTick() + req_tick_latency); + } else { + assert(gpuDynInst->isEndOfKernel()); + + req->setCacheCoherenceFlags(Request::RELEASE); + req->setReqInstSeqNum(gpuDynInst->seqNum()); + req->setFlags(Request::KERNEL); + pkt = new Packet(req, MemCmd::MemSyncReq); + pkt->pushSenderState( + new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr)); + + EventFunctionWrapper *mem_req_event = + memPort[0]->createMemReqEvent(pkt); + + DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling " + "a release\n", cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId, 0, pkt->req->getPaddr()); + + schedule(mem_req_event, curTick() + req_tick_latency); + } + } else { + gpuDynInst->setRequestFlags(req); + + req->setReqInstSeqNum(gpuDynInst->seqNum()); + + pkt = new Packet(req, MemCmd::MemSyncReq); + pkt->pushSenderState( + new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr)); - // send the packet - sendSyncRequest(gpuDynInst, 0, pkt); + EventFunctionWrapper *mem_req_event = + memPort[0]->createMemReqEvent(pkt); + + DPRINTF(GPUPort, + "CU%d: WF[%d][%d]: index %d, addr %#x sync scheduled\n", + cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, 0, + pkt->req->getPaddr()); + + schedule(mem_req_event, curTick() + req_tick_latency); + } } void @@ -992,69 +1275,60 @@ ComputeUnit::DataPort::processMemRespEvent(PacketPtr pkt) Addr paddr = pkt->req->getPaddr(); - if (pkt->cmd != MemCmd::MemFenceResp) { - int index = gpuDynInst->memStatusVector[paddr].back(); - - DPRINTF(GPUMem, "Response for addr %#x, index %d\n", - pkt->req->getPaddr(), index); + // mem sync resp and write-complete callback must be handled already in + // DataPort::recvTimingResp + assert(pkt->cmd != MemCmd::MemSyncResp); + assert(pkt->cmd != 
MemCmd::WriteCompleteResp); - gpuDynInst->memStatusVector[paddr].pop_back(); - gpuDynInst->pAddr = pkt->req->getPaddr(); + // this is for read, write and atomic + int index = gpuDynInst->memStatusVector[paddr].back(); - if (pkt->isRead() || pkt->isWrite()) { - - if (gpuDynInst->n_reg <= MAX_REGS_FOR_NON_VEC_MEM_INST) { - gpuDynInst->statusBitVector &= (~(1ULL << index)); - } else { - assert(gpuDynInst->statusVector[index] > 0); - gpuDynInst->statusVector[index]--; + DPRINTF(GPUMem, "Response for addr %#x, index %d\n", + pkt->req->getPaddr(), index); - if (!gpuDynInst->statusVector[index]) - gpuDynInst->statusBitVector &= (~(1ULL << index)); - } + gpuDynInst->memStatusVector[paddr].pop_back(); + gpuDynInst->pAddr = pkt->req->getPaddr(); - DPRINTF(GPUMem, "bitvector is now %#x\n", - gpuDynInst->statusBitVector); + gpuDynInst->statusBitVector &= (~(1ULL << index)); - if (gpuDynInst->statusBitVector == VectorMask(0)) { - auto iter = gpuDynInst->memStatusVector.begin(); - auto end = gpuDynInst->memStatusVector.end(); + DPRINTF(GPUMem, "bitvector is now %#x\n", + gpuDynInst->statusBitVector); - while (iter != end) { - assert(iter->second.empty()); - ++iter; - } + if (gpuDynInst->statusBitVector == VectorMask(0)) { + auto iter = gpuDynInst->memStatusVector.begin(); + auto end = gpuDynInst->memStatusVector.end(); - gpuDynInst->memStatusVector.clear(); + while (iter != end) { + assert(iter->second.empty()); + ++iter; + } - if (gpuDynInst->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST) - gpuDynInst->statusVector.clear(); + // Calculate the difference between the arrival of the first cache + // block and the last cache block to arrive if we have the time + // for the first cache block. + if (compute_unit->headTailMap.count(gpuDynInst)) { + Tick headTick = compute_unit->headTailMap.at(gpuDynInst); + compute_unit->headTailLatency.sample(curTick() - headTick); + compute_unit->headTailMap.erase(gpuDynInst); + } - compute_unit->globalMemoryPipe.handleResponse(gpuDynInst); + gpuDynInst->memStatusVector.clear(); - DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n", - compute_unit->cu_id, gpuDynInst->simdId, - gpuDynInst->wfSlotId); + // note: only handle read response here; for write, the response + // is separately handled when writeComplete callback is received + if (pkt->isRead()) { + gpuDynInst-> + profileRoundTripTime(curTick(), InstMemoryHop::GMEnqueue); + compute_unit->globalMemoryPipe.handleResponse(gpuDynInst); - // after clearing the status vectors, - // see if there is a continuation to perform - // the continuation may generate more work for - // this memory request - if (gpuDynInst->useContinuation) { - assert(!gpuDynInst->isNoScope()); - gpuDynInst->execContinuation( - gpuDynInst->staticInstruction(), - gpuDynInst); - } - } + DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n", + compute_unit->cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId); } } else { - gpuDynInst->statusBitVector = VectorMask(0); - - if (gpuDynInst->useContinuation) { - assert(!gpuDynInst->isNoScope()); - gpuDynInst->execContinuation(gpuDynInst->staticInstruction(), - gpuDynInst); + if (!compute_unit->headTailMap.count(gpuDynInst)) { + compute_unit->headTailMap.insert( + std::make_pair(gpuDynInst, curTick())); } } @@ -1192,8 +1466,7 @@ ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt) // Because it's atomic operation, only need TLB translation state prefetch_pkt->senderState = new TheISA::GpuTLB::TranslationState(TLB_mode, - computeUnit->shader->gpuTc, - true); + computeUnit->shader->gpuTc, true); // 
Currently prefetches are zero-latency, hence the sendFunctional sendFunctional(prefetch_pkt); @@ -1270,10 +1543,40 @@ ComputeUnit::DataPort::processMemReqEvent(PacketPtr pkt) pkt->req->getPaddr()); } else { DPRINTF(GPUPort, - "CU%d: WF[%d][%d]: index %d, addr %#x data req sent!\n", + "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x data " + "req sent!\n", compute_unit->cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId, gpuDynInst->seqNum(), index, + pkt->req->getPaddr()); + } +} + +const char* +ComputeUnit::ScalarDataPort::MemReqEvent::description() const +{ + return "ComputeUnit scalar memory request event"; +} + +void +ComputeUnit::ScalarDataPort::MemReqEvent::process() +{ + SenderState *sender_state = safe_cast(pkt->senderState); + GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst; + ComputeUnit *compute_unit M5_VAR_USED = scalarDataPort->computeUnit; + + if (!(scalarDataPort->sendTimingReq(pkt))) { + scalarDataPort->retries.push_back(pkt); + + DPRINTF(GPUPort, + "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n", compute_unit->cu_id, gpuDynInst->simdId, - gpuDynInst->wfSlotId, index, + gpuDynInst->wfSlotId, scalarDataPort->index, pkt->req->getPaddr()); + } else { + DPRINTF(GPUPort, + "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x data " + "req sent!\n", compute_unit->cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId, gpuDynInst->seqNum(), + scalarDataPort->index, pkt->req->getPaddr()); } } @@ -1314,6 +1617,66 @@ ComputeUnit::DTLBPort::recvReqRetry() } } +bool +ComputeUnit::ScalarDTLBPort::recvTimingResp(PacketPtr pkt) +{ + assert(pkt->senderState); + + TheISA::GpuTLB::TranslationState *translation_state = + safe_cast(pkt->senderState); + + // Page faults are not allowed + fatal_if(!translation_state->tlbEntry, + "Translation of vaddr %#x failed\n", pkt->req->getVaddr()); + + delete translation_state->tlbEntry; + assert(!translation_state->ports.size()); + + pkt->senderState = translation_state->saved; + delete translation_state; + + ScalarDTLBPort::SenderState *sender_state = + safe_cast(pkt->senderState); + + GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst; + delete pkt->senderState; + + Wavefront *w M5_VAR_USED = gpuDynInst->wavefront(); + + DPRINTF(GPUTLB, "CU%d: WF[%d][%d][wv=%d]: scalar DTLB port received " + "translation: PA %#x -> %#x\n", computeUnit->cu_id, w->simdId, + w->wfSlotId, w->kernId, pkt->req->getVaddr(), pkt->req->getPaddr()); + + MemCmd mem_cmd; + + if (pkt->cmd == MemCmd::ReadResp) { + mem_cmd = MemCmd::ReadReq; + } else if (pkt->cmd == MemCmd::WriteResp) { + mem_cmd = MemCmd::WriteReq; + } else { + fatal("Scalar DTLB receieved unexpected MemCmd response %s\n", + pkt->cmd.toString()); + } + + PacketPtr req_pkt = new Packet(pkt->req, mem_cmd); + req_pkt->dataStatic(pkt->getPtr()); + delete pkt; + + req_pkt->senderState = + new ComputeUnit::ScalarDataPort::SenderState(gpuDynInst); + + if (!computeUnit->scalarDataPort->sendTimingReq(req_pkt)) { + computeUnit->scalarDataPort->retries.push_back(req_pkt); + DPRINTF(GPUMem, "send scalar req failed for: %s\n", + gpuDynInst->disassemble()); + } else { + DPRINTF(GPUMem, "send scalar req for: %s\n", + gpuDynInst->disassemble()); + } + + return true; +} + bool ComputeUnit::ITLBPort::recvTimingResp(PacketPtr pkt) { @@ -1324,8 +1687,8 @@ ComputeUnit::ITLBPort::recvTimingResp(PacketPtr pkt) assert(pkt->senderState); // pop off the TLB translation state - TheISA::GpuTLB::TranslationState *translation_state = - safe_cast(pkt->senderState); + TheISA::GpuTLB::TranslationState *translation_state + = 
safe_cast(pkt->senderState); bool success = translation_state->tlbEntry != nullptr; delete translation_state->tlbEntry; @@ -1510,6 +1873,152 @@ ComputeUnit::regStats() scalarMemWritesPerWF = scalarMemWrites / completedWfs; scalarMemReadsPerWF = scalarMemReads / completedWfs; + vectorMemReadsPerKiloInst + .name(name() + ".vector_mem_reads_per_kilo_inst") + .desc("Number of vector mem reads per kilo-instruction") + ; + vectorMemReadsPerKiloInst = (vectorMemReads / numInstrExecuted) * 1000; + vectorMemWritesPerKiloInst + .name(name() + ".vector_mem_writes_per_kilo_inst") + .desc("Number of vector mem writes per kilo-instruction") + ; + vectorMemWritesPerKiloInst = (vectorMemWrites / numInstrExecuted) * 1000; + vectorMemInstsPerKiloInst + .name(name() + ".vector_mem_insts_per_kilo_inst") + .desc("Number of vector mem insts per kilo-instruction") + ; + vectorMemInstsPerKiloInst = + ((vectorMemReads + vectorMemWrites) / numInstrExecuted) * 1000; + scalarMemReadsPerKiloInst + .name(name() + ".scalar_mem_reads_per_kilo_inst") + .desc("Number of scalar mem reads per kilo-instruction") + ; + scalarMemReadsPerKiloInst = (scalarMemReads / numInstrExecuted) * 1000; + scalarMemWritesPerKiloInst + .name(name() + ".scalar_mem_writes_per_kilo_inst") + .desc("Number of scalar mem writes per kilo-instruction") + ; + scalarMemWritesPerKiloInst = (scalarMemWrites / numInstrExecuted) * 1000; + scalarMemInstsPerKiloInst + .name(name() + ".scalar_mem_insts_per_kilo_inst") + .desc("Number of scalar mem insts per kilo-instruction") + ; + scalarMemInstsPerKiloInst = + ((scalarMemReads + scalarMemWrites) / numInstrExecuted) * 1000; + + instCyclesVMemPerSimd + .init(numVectorALUs) + .name(name() + ".inst_cycles_vector_memory") + .desc("Number of cycles to send address, command, data from VRF to " + "vector memory unit, per SIMD") + ; + + instCyclesScMemPerSimd + .init(numVectorALUs) + .name(name() + ".inst_cycles_scalar_memory") + .desc("Number of cycles to send address, command, data from SRF to " + "scalar memory unit, per SIMD") + ; + + instCyclesLdsPerSimd + .init(numVectorALUs) + .name(name() + ".inst_cycles_lds") + .desc("Number of cycles to send address, command, data from VRF to " + "LDS unit, per SIMD") + ; + + globalReads + .name(name() + ".global_mem_reads") + .desc("Number of reads to the global segment") + ; + globalWrites + .name(name() + ".global_mem_writes") + .desc("Number of writes to the global segment") + ; + globalMemInsts + .name(name() + ".global_mem_insts") + .desc("Number of memory instructions sent to the global segment") + ; + globalMemInsts = globalReads + globalWrites; + argReads + .name(name() + ".arg_reads") + .desc("Number of reads to the arg segment") + ; + argWrites + .name(name() + ".arg_writes") + .desc("NUmber of writes to the arg segment") + ; + argMemInsts + .name(name() + ".arg_mem_insts") + .desc("Number of memory instructions sent to the arg segment") + ; + argMemInsts = argReads + argWrites; + spillReads + .name(name() + ".spill_reads") + .desc("Number of reads to the spill segment") + ; + spillWrites + .name(name() + ".spill_writes") + .desc("Number of writes to the spill segment") + ; + spillMemInsts + .name(name() + ".spill_mem_insts") + .desc("Number of memory instructions sent to the spill segment") + ; + spillMemInsts = spillReads + spillWrites; + groupReads + .name(name() + ".group_reads") + .desc("Number of reads to the group segment") + ; + groupWrites + .name(name() + ".group_writes") + .desc("Number of writes to the group segment") + ; + groupMemInsts + 
.name(name() + ".group_mem_insts") + .desc("Number of memory instructions sent to the group segment") + ; + groupMemInsts = groupReads + groupWrites; + privReads + .name(name() + ".private_reads") + .desc("Number of reads to the private segment") + ; + privWrites + .name(name() + ".private_writes") + .desc("Number of writes to the private segment") + ; + privMemInsts + .name(name() + ".private_mem_insts") + .desc("Number of memory instructions sent to the private segment") + ; + privMemInsts = privReads + privWrites; + readonlyReads + .name(name() + ".readonly_reads") + .desc("Number of reads to the readonly segment") + ; + readonlyWrites + .name(name() + ".readonly_writes") + .desc("Number of memory instructions sent to the readonly segment") + ; + readonlyMemInsts + .name(name() + ".readonly_mem_insts") + .desc("Number of memory instructions sent to the readonly segment") + ; + readonlyMemInsts = readonlyReads + readonlyWrites; + kernargReads + .name(name() + ".kernarg_reads") + .desc("Number of reads sent to the kernarg segment") + ; + kernargWrites + .name(name() + ".kernarg_writes") + .desc("Number of memory instructions sent to the kernarg segment") + ; + kernargMemInsts + .name(name() + ".kernarg_mem_insts") + .desc("Number of memory instructions sent to the kernarg segment") + ; + kernargMemInsts = kernargReads + kernargWrites; + tlbCycles .name(name() + ".tlb_cycles") .desc("total number of cycles for all uncoalesced requests") @@ -1596,6 +2105,71 @@ ComputeUnit::regStats() .desc("number of vec ops executed (e.g. WF size/inst)") ; + numVecOpsExecutedF16 + .name(name() + ".num_vec_ops_f16_executed") + .desc("number of f16 vec ops executed (e.g. WF size/inst)") + ; + + numVecOpsExecutedF32 + .name(name() + ".num_vec_ops_f32_executed") + .desc("number of f32 vec ops executed (e.g. WF size/inst)") + ; + + numVecOpsExecutedF64 + .name(name() + ".num_vec_ops_f64_executed") + .desc("number of f64 vec ops executed (e.g. WF size/inst)") + ; + + numVecOpsExecutedFMA16 + .name(name() + ".num_vec_ops_fma16_executed") + .desc("number of fma16 vec ops executed (e.g. WF size/inst)") + ; + + numVecOpsExecutedFMA32 + .name(name() + ".num_vec_ops_fma32_executed") + .desc("number of fma32 vec ops executed (e.g. WF size/inst)") + ; + + numVecOpsExecutedFMA64 + .name(name() + ".num_vec_ops_fma64_executed") + .desc("number of fma64 vec ops executed (e.g. WF size/inst)") + ; + + numVecOpsExecutedMAD16 + .name(name() + ".num_vec_ops_mad16_executed") + .desc("number of mad16 vec ops executed (e.g. WF size/inst)") + ; + + numVecOpsExecutedMAD32 + .name(name() + ".num_vec_ops_mad32_executed") + .desc("number of mad32 vec ops executed (e.g. WF size/inst)") + ; + + numVecOpsExecutedMAD64 + .name(name() + ".num_vec_ops_mad64_executed") + .desc("number of mad64 vec ops executed (e.g. WF size/inst)") + ; + + numVecOpsExecutedMAC16 + .name(name() + ".num_vec_ops_mac16_executed") + .desc("number of mac16 vec ops executed (e.g. WF size/inst)") + ; + + numVecOpsExecutedMAC32 + .name(name() + ".num_vec_ops_mac32_executed") + .desc("number of mac32 vec ops executed (e.g. WF size/inst)") + ; + + numVecOpsExecutedMAC64 + .name(name() + ".num_vec_ops_mac64_executed") + .desc("number of mac64 vec ops executed (e.g. WF size/inst)") + ; + + numVecOpsExecutedTwoOpFP + .name(name() + ".num_vec_ops_two_op_fp_executed") + .desc("number of two op FP vec ops executed (e.g. 
WF size/inst)") + ; + totalCycles .name(name() + ".num_total_cycles") .desc("number of cycles the CU ran for") @@ -1611,6 +2185,21 @@ ComputeUnit::regStats() .desc("Vector Operations per cycle (this CU only)") ; + vpc_f16 + .name(name() + ".vpc_f16") + .desc("F16 Vector Operations per cycle (this CU only)") + ; + + vpc_f32 + .name(name() + ".vpc_f32") + .desc("F32 Vector Operations per cycle (this CU only)") + ; + + vpc_f64 + .name(name() + ".vpc_f64") + .desc("F64 Vector Operations per cycle (this CU only)") + ; + numALUInstsExecuted .name(name() + ".num_alu_insts_executed") .desc("Number of dynamic non-GM memory insts executed") @@ -1623,15 +2212,30 @@ ComputeUnit::regStats() ipc = numInstrExecuted / totalCycles; vpc = numVecOpsExecuted / totalCycles; + vpc_f16 = numVecOpsExecutedF16 / totalCycles; + vpc_f32 = numVecOpsExecutedF32 / totalCycles; + vpc_f64 = numVecOpsExecutedF64 / totalCycles; numTimesWgBlockedDueVgprAlloc .name(name() + ".times_wg_blocked_due_vgpr_alloc") - .desc("Number of times WGs are blocked due to VGPR allocation per SIMD") + .desc("Number of times WGs are blocked due to VGPR allocation per " + "SIMD") + ; + + numTimesWgBlockedDueSgprAlloc + .name(name() + ".times_wg_blocked_due_sgpr_alloc") + .desc("Number of times WGs are blocked due to SGPR allocation per " + "SIMD") ; dynamicGMemInstrCnt .name(name() + ".global_mem_instr_cnt") - .desc("dynamic global memory instructions count") + .desc("dynamic non-flat global memory instruction count") + ; + + dynamicFlatMemInstrCnt + .name(name() + ".flat_global_mem_instr_cnt") + .desc("dynamic flat global memory instruction count") ; dynamicLMemInstrCnt @@ -1647,6 +2251,11 @@ ComputeUnit::regStats() .desc("number of completed wavefronts") ; + completedWGs + .name(name() + ".num_completed_wgs") + .desc("number of completed workgroups") + ; + numCASOps .name(name() + ".num_CAS_ops") .desc("number of compare and swap operations") @@ -1657,15 +2266,37 @@ ComputeUnit::regStats() .desc("number of compare and swap operations that failed") ; + headTailLatency + .init(0, 1000000, 10000) + .name(name() + ".head_tail_latency") + .desc("ticks between first and last cache block arrival at coalescer") + .flags(Stats::pdf | Stats::oneline) + ; + + waveLevelParallelism + .init(0, shader->n_wf * numVectorALUs, 1) + .name(name() + ".wlp") + .desc("wave level parallelism: count of active waves at wave launch") + ; + + instInterleave + .init(numVectorALUs, 0, 20, 1) + .name(name() + ".interleaving") + .desc("Measure of instruction interleaving per SIMD") + ; + // register stats of pipeline stages fetchStage.regStats(); scoreboardCheckStage.regStats(); scheduleStage.regStats(); execStage.regStats(); - // register stats of memory pipeline + // register stats of memory pipelines globalMemoryPipe.regStats(); localMemoryPipe.regStats(); + scalarMemoryPipe.regStats(); + + registerManager->regStats(); } void @@ -1682,6 +2313,10 @@ ComputeUnit::updateInstStats(GPUDynInstPtr gpuDynInst) } } else { if (gpuDynInst->isALU()) { + shader->total_valu_insts++; + if (shader->total_valu_insts == shader->max_valu_insts) { + exitSimLoop("max vALU insts"); + } vALUInsts++; instCyclesVALU++; threadCyclesVALU += gpuDynInst->wavefront()->execMask().count(); @@ -1698,6 +2333,74 @@ ComputeUnit::updateInstStats(GPUDynInstPtr gpuDynInst) } else if (gpuDynInst->isStore()) { vectorMemWrites++; } + + if (gpuDynInst->isLoad()) { + switch (gpuDynInst->executedAs()) { + case Enums::SC_SPILL: + spillReads++; + break; + case Enums::SC_GLOBAL: + globalReads++; + break; + case 
Enums::SC_GROUP: + groupReads++; + break; + case Enums::SC_PRIVATE: + privReads++; + break; + case Enums::SC_READONLY: + readonlyReads++; + break; + case Enums::SC_KERNARG: + kernargReads++; + break; + case Enums::SC_ARG: + argReads++; + break; + case Enums::SC_NONE: + /** + * this case can occur for flat mem insts + * who execute with EXEC = 0 + */ + break; + default: + fatal("%s has no valid segment\n", gpuDynInst->disassemble()); + break; + } + } else if (gpuDynInst->isStore()) { + switch (gpuDynInst->executedAs()) { + case Enums::SC_SPILL: + spillWrites++; + break; + case Enums::SC_GLOBAL: + globalWrites++; + break; + case Enums::SC_GROUP: + groupWrites++; + break; + case Enums::SC_PRIVATE: + privWrites++; + break; + case Enums::SC_READONLY: + readonlyWrites++; + break; + case Enums::SC_KERNARG: + kernargWrites++; + break; + case Enums::SC_ARG: + argWrites++; + break; + case Enums::SC_NONE: + /** + * this case can occur for flat mem insts + * who execute with EXEC = 0 + */ + break; + default: + fatal("%s has no valid segment\n", gpuDynInst->disassemble()); + break; + } + } } } @@ -1728,31 +2431,32 @@ ComputeUnit::CUExitCallback::process() *page_stat_file << std::dec << iter.second.second << std::endl; } } - } +} bool ComputeUnit::isDone() const { - for (int i = 0; i < numSIMDs; ++i) { - if (!isSimdDone(i)) { + for (int i = 0; i < numVectorALUs; ++i) { + if (!isVectorAluIdle(i)) { return false; } } - bool glbMemBusRdy = true; - for (int j = 0; j < numGlbMemUnits; ++j) { - glbMemBusRdy &= vrfToGlobalMemPipeBus[j].rdy(); + // TODO: FIXME if more than 1 of any memory pipe supported + if (!srfToScalarMemPipeBus.rdy()) { + return false; + } + if (!vrfToGlobalMemPipeBus.rdy()) { + return false; } - bool locMemBusRdy = true; - for (int j = 0; j < numLocMemUnits; ++j) { - locMemBusRdy &= vrfToLocalMemPipeBus[j].rdy(); + if (!vrfToLocalMemPipeBus.rdy()) { + return false; } - if (!globalMemoryPipe.isGMLdRespFIFOWrRdy() || - !globalMemoryPipe.isGMStRespFIFOWrRdy() || - !globalMemoryPipe.isGMReqFIFOWrRdy() || !localMemoryPipe.isLMReqFIFOWrRdy() + if (!globalMemoryPipe.isGMReqFIFOWrRdy() + || !localMemoryPipe.isLMReqFIFOWrRdy() || !localMemoryPipe.isLMRespFIFOWrRdy() || !locMemToVrfBus.rdy() || - !glbMemToVrfBus.rdy() || !locMemBusRdy || !glbMemBusRdy) { + !glbMemToVrfBus.rdy() || !scalarMemToSrfBus.rdy()) { return false; } @@ -1760,30 +2464,19 @@ ComputeUnit::isDone() const } int32_t -ComputeUnit::getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const +ComputeUnit::getRefCounter(const uint32_t dispatchId, + const uint32_t wgId) const { return lds.getRefCounter(dispatchId, wgId); } bool -ComputeUnit::isSimdDone(uint32_t simdId) const +ComputeUnit::isVectorAluIdle(uint32_t simdId) const { - assert(simdId < numSIMDs); - - for (int i=0; i < numGlbMemUnits; ++i) { - if (!vrfToGlobalMemPipeBus[i].rdy()) - return false; - } - for (int i=0; i < numLocMemUnits; ++i) { - if (!vrfToLocalMemPipeBus[i].rdy()) - return false; - } - if (!aluPipe[simdId].rdy()) { - return false; - } + assert(simdId < numVectorALUs); for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf){ - if (wfList[simdId][i_wf]->status != Wavefront::S_STOPPED) { + if (wfList[simdId][i_wf]->getStatus() != Wavefront::S_STOPPED) { return false; } } diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh index 49713e936..187cbc9d5 100644 --- a/src/gpu-compute/compute_unit.hh +++ b/src/gpu-compute/compute_unit.hh @@ -36,28 +36,30 @@ #include #include -#include +#include #include #include "base/callback.hh" #include 
"base/statistics.hh" #include "base/types.hh" +#include "config/the_gpu_isa.hh" #include "enums/PrefetchType.hh" #include "gpu-compute/exec_stage.hh" #include "gpu-compute/fetch_stage.hh" #include "gpu-compute/global_memory_pipeline.hh" +#include "gpu-compute/hsa_queue_entry.hh" #include "gpu-compute/local_memory_pipeline.hh" -#include "gpu-compute/qstruct.hh" +#include "gpu-compute/register_manager.hh" +#include "gpu-compute/scalar_memory_pipeline.hh" #include "gpu-compute/schedule_stage.hh" #include "gpu-compute/scoreboard_check_stage.hh" #include "mem/port.hh" #include "mem/token_port.hh" #include "sim/clocked_object.hh" -static const int MAX_REGS_FOR_NON_VEC_MEM_INST = 1; -static const int MAX_WIDTH_FOR_MEM_INST = 32; - -class NDRange; +class HSAQueueEntry; +class LdsChunk; +class ScalarRegisterFile; class Shader; class VectorRegisterFile; @@ -69,18 +71,6 @@ enum EXEC_POLICY RR }; -// List of execution units -enum EXEC_UNIT -{ - SIMD0 = 0, - SIMD1, - SIMD2, - SIMD3, - GLBMEM_PIPE, - LDSMEM_PIPE, - NUM_UNITS -}; - enum TLB_CACHE { TLB_MISS_CACHE_MISS = 0, @@ -92,32 +82,100 @@ enum TLB_CACHE class ComputeUnit : public ClockedObject { public: - FetchStage fetchStage; - ScoreboardCheckStage scoreboardCheckStage; - ScheduleStage scheduleStage; - ExecStage execStage; - GlobalMemPipeline globalMemoryPipe; - LocalMemPipeline localMemoryPipe; + + + // Execution resources + // + // The ordering of units is: + // Vector ALUs + // Scalar ALUs + // GM Pipe + // LM Pipe + // Scalar Mem Pipe + // + // Note: the ordering of units is important and the code assumes the + // above ordering. However, there may be more than one resource of + // each type (e.g., 4 VALUs or 2 SALUs) + + int numVectorGlobalMemUnits; + // Resource control for global memory to VRF data/address bus + WaitClass glbMemToVrfBus; + // Resource control for Vector Register File->Global Memory pipe buses + WaitClass vrfToGlobalMemPipeBus; + // Resource control for Vector Global Memory execution unit + WaitClass vectorGlobalMemUnit; + + int numVectorSharedMemUnits; + // Resource control for local memory to VRF data/address bus + WaitClass locMemToVrfBus; + // Resource control for Vector Register File->Local Memory pipe buses + WaitClass vrfToLocalMemPipeBus; + // Resource control for Vector Shared/Local Memory execution unit + WaitClass vectorSharedMemUnit; + + int numScalarMemUnits; + // Resource control for scalar memory to SRF data/address bus + WaitClass scalarMemToSrfBus; + // Resource control for Scalar Register File->Scalar Memory pipe buses + WaitClass srfToScalarMemPipeBus; + // Resource control for Scalar Memory execution unit + WaitClass scalarMemUnit; + + // vector ALU execution resources + int numVectorALUs; + std::vector vectorALUs; + + // scalar ALU execution resources + int numScalarALUs; + std::vector scalarALUs; + + // Return total number of execution units on this CU + int numExeUnits() const; + // index into readyList of the first memory unit + int firstMemUnit() const; + // index into readyList of the last memory unit + int lastMemUnit() const; + // index into scalarALUs vector of SALU used by the wavefront + int mapWaveToScalarAlu(Wavefront *w) const; + // index into readyList of SALU used by wavefront + int mapWaveToScalarAluGlobalIdx(Wavefront *w) const; + // index into readyList of Global Memory unit used by wavefront + int mapWaveToGlobalMem(Wavefront *w) const; + // index into readyList of Local Memory unit used by wavefront + int mapWaveToLocalMem(Wavefront *w) const; + // index into readyList of Scalar 
Memory unit used by wavefront + int mapWaveToScalarMem(Wavefront *w) const; + + int vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes + int coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes + int numCyclesPerStoreTransfer; // number of cycles per vector store + int numCyclesPerLoadTransfer; // number of cycles per vector load // Buffers used to communicate between various pipeline stages + // At a high level, the following intra-/inter-stage communication occurs: + // SCB to SCH: readyList provides per exec resource list of waves that + // passed dependency and readiness checks. If selected by + // scheduler, attempt to add wave to schList conditional on + // RF support. + // SCH: schList holds waves that are gathering operands or waiting + // for execution resource availability. Once ready, waves are + // placed on the dispatchList as candidates for execution. A wave + // may spend multiple cycles in SCH stage, on the schList due to + // RF access conflicts or execution resource contention. + // SCH to EX: dispatchList holds waves that are ready to be executed. + // LM/FLAT arbitration may remove an LM wave and place it + // back on the schList. RF model may also force a wave back + // to the schList if using the detailed model. + // List of waves which are ready to be scheduled. // Each execution resource has a ready list. readyList is // used to communicate between scoreboardCheck stage and // schedule stage - // TODO: make enum to index readyList std::vector> readyList; - // Stores the status of waves. A READY implies the - // wave is ready to be scheduled this cycle and - // is already present in the readyList. waveStatusList is - // used to communicate between scoreboardCheck stage and - // schedule stage - // TODO: convert std::pair to a class to increase readability - std::vector>> waveStatusList; - // List of waves which will be dispatched to - // each execution resource. A FILLED implies + // each execution resource. An EXREADY implies // dispatch list is non-empty and // execution unit has something to execute // this cycle. Currently, the dispatch list of @@ -127,32 +185,67 @@ class ComputeUnit : public ClockedObject // and exec stage // TODO: convert std::pair to a class to increase readability std::vector> dispatchList; + // track presence of dynamic instructions in the Schedule pipeline + // stage. This is used to check the readiness of the oldest, + // non-dispatched instruction of every WF in the Scoreboard stage. 
+ std::unordered_set pipeMap; + + RegisterManager* registerManager; + + FetchStage fetchStage; + ScoreboardCheckStage scoreboardCheckStage; + ScheduleStage scheduleStage; + ExecStage execStage; + GlobalMemPipeline globalMemoryPipe; + LocalMemPipeline localMemoryPipe; + ScalarMemPipeline scalarMemoryPipe; + + EventFunctionWrapper tickEvent; - int rrNextMemID; // used by RR WF exec policy to cycle through WF's - int rrNextALUWp; typedef ComputeUnitParams Params; std::vector> wfList; int cu_id; // array of vector register files, one per SIMD std::vector vrf; - // Number of vector ALU units (SIMDs) in CU - int numSIMDs; + // array of scalar register files, one per SIMD + std::vector srf; + + // Width per VALU/SIMD unit: number of work items that can be executed + // on the vector ALU simultaneously in a SIMD unit + int simdWidth; // number of pipe stages for bypassing data to next dependent single // precision vector instruction inside the vector ALU pipeline int spBypassPipeLength; // number of pipe stages for bypassing data to next dependent double // precision vector instruction inside the vector ALU pipeline int dpBypassPipeLength; - // number of cycles per issue period - int issuePeriod; + // number of pipe stages for scalar ALU + int scalarPipeStages; + // number of pipe stages for operand collection & distribution network + int operandNetworkLength; + // number of cycles per instruction issue period + Cycles issuePeriod; + + // VRF to GM Bus latency + Cycles vrf_gm_bus_latency; + // SRF to Scalar Mem Bus latency + Cycles srf_scm_bus_latency; + // VRF to LM Bus latency + Cycles vrf_lm_bus_latency; - // Number of global and local memory execution resources in CU - int numGlbMemUnits; - int numLocMemUnits; // tracks the last cycle a vector instruction was executed on a SIMD std::vector lastExecCycle; + // Track the amount of interleaving between wavefronts on each SIMD. + // This stat is sampled using instExecPerSimd to compute the number of + // instructions that have been executed on a SIMD between a WF executing + // two successive instructions. + Stats::VectorDistribution instInterleave; + + // tracks the number of dyn inst executed per SIMD + std::vector instExecPerSimd; + // true if we allow a separate TLB per lane bool perLaneTLB; // if 0, TLB prefetching is off. 
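
The ordering comment above fixes how the CU's execution resources map onto a
single flat index space (vector ALUs first, then scalar ALUs, then the global,
local and scalar memory pipes), which is what numExeUnits(), firstMemUnit(),
lastMemUnit() and the mapWaveTo*() helpers index into, and what readyList is
sized against in init(). The sketch below lays that space out explicitly; the
arithmetic is an assumption made for illustration and may not match the exact
helper implementations.

    struct ExeUnitLayout
    {
        // counts mirror the members declared in this header; the current
        // model checks (via fatal_if) that each memory pipe count is 1
        int numVectorALUs;
        int numScalarALUs;
        int numVectorGlobalMemUnits;
        int numVectorSharedMemUnits;
        int numScalarMemUnits;

        // total number of execution units, i.e. readyList entries
        int numExeUnits() const
        {
            return numVectorALUs + numScalarALUs + numVectorGlobalMemUnits
                + numVectorSharedMemUnits + numScalarMemUnits;
        }

        // first flat index belonging to a memory pipe
        int firstMemUnit() const { return numVectorALUs + numScalarALUs; }

        // last flat index (the scalar memory pipe)
        int lastMemUnit() const { return numExeUnits() - 1; }
    };
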
@@ -166,8 +259,10 @@ class ComputeUnit : public ClockedObject Enums::PrefetchType prefetchType; EXEC_POLICY exec_policy; - bool xact_cas_mode; bool debugSegFault; + // Idle CU timeout in ticks + Tick idleCUTimeout; + int idleWfs; bool functionalTLB; bool localMemBarrier; @@ -183,91 +278,67 @@ class ComputeUnit : public ClockedObject Shader *shader; uint32_t barrier_id; - // vector of Vector ALU (MACC) pipelines - std::vector aluPipe; - // minimum issue period per SIMD unit (in cycles) - std::vector wfWait; - - // Resource control for Vector Register File->Global Memory pipe buses - std::vector vrfToGlobalMemPipeBus; - // Resource control for Vector Register File->Local Memory pipe buses - std::vector vrfToLocalMemPipeBus; - int nextGlbMemBus; - int nextLocMemBus; - // Resource control for global memory to VRF data/address bus - WaitClass glbMemToVrfBus; - // Resource control for local memory to VRF data/address bus - WaitClass locMemToVrfBus; - - uint32_t vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes - uint32_t coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes - uint32_t numCyclesPerStoreTransfer; // number of cycles per vector store - uint32_t numCyclesPerLoadTransfer; // number of cycles per vector load Tick req_tick_latency; Tick resp_tick_latency; - // number of vector registers being reserved for each SIMD unit + /** + * Number of WFs to schedule to each SIMD. This vector is populated + * by hasDispResources(), and consumed by the subsequent call to + * dispWorkgroup(), to schedule the specified number of WFs to the + * SIMD units. Entry I provides the number of WFs to schedule to SIMD I. + */ + std::vector numWfsToSched; + + // number of currently reserved vector registers per SIMD unit std::vector vectorRegsReserved; + // number of currently reserved scalar registers per SIMD unit + std::vector scalarRegsReserved; // number of vector registers per SIMD unit - uint32_t numVecRegsPerSimd; - // Support for scheduling VGPR status update events - std::vector > regIdxVec; - std::vector timestampVec; - std::vector statusVec; + int numVecRegsPerSimd; + // number of available scalar registers per SIMD unit + int numScalarRegsPerSimd; - void - registerEvent(uint32_t simdId, - uint32_t regIdx, - uint32_t operandSize, - uint64_t when, - uint8_t newStatus) { - regIdxVec.push_back(std::make_pair(simdId, regIdx)); - timestampVec.push_back(when); - statusVec.push_back(newStatus); - if (operandSize > 4) { - regIdxVec.push_back(std::make_pair(simdId, - ((regIdx + 1) % - numVecRegsPerSimd))); - timestampVec.push_back(when); - statusVec.push_back(newStatus); - } - } - - void updateEvents(); + void updateReadyList(int unitId); // this hash map will keep track of page divergence // per memory instruction per wavefront. The hash map // is cleared in GPUDynInst::updateStats() in gpu_dyn_inst.cc. 
std::map pagesTouched; + void insertInPipeMap(Wavefront *w); + void deleteFromPipeMap(Wavefront *w); + ComputeUnit(const Params *p); ~ComputeUnit(); - int spBypassLength() { return spBypassPipeLength; }; - int dpBypassLength() { return dpBypassPipeLength; }; - int storeBusLength() { return numCyclesPerStoreTransfer; }; - int loadBusLength() { return numCyclesPerLoadTransfer; }; - int wfSize() const { return wavefrontSize; }; - void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs); + // Timing Functions + int oprNetPipeLength() const { return operandNetworkLength; } + int simdUnitWidth() const { return simdWidth; } + int spBypassLength() const { return spBypassPipeLength; } + int dpBypassLength() const { return dpBypassPipeLength; } + int scalarPipeLength() const { return scalarPipeStages; } + int storeBusLength() const { return numCyclesPerStoreTransfer; } + int loadBusLength() const { return numCyclesPerLoadTransfer; } + int wfSize() const { return wavefrontSize; } + void exec(); void initiateFetch(Wavefront *wavefront); void fetch(PacketPtr pkt, Wavefront *wavefront); - void fillKernelState(Wavefront *w, NDRange *ndr); + void fillKernelState(Wavefront *w, HSAQueueEntry *task); void startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk, - NDRange *ndr); - - void StartWorkgroup(NDRange *ndr); - int ReadyWorkgroup(NDRange *ndr); - - bool isVecAlu(int unitId) { return unitId >= SIMD0 && unitId <= SIMD3; } - bool isGlbMem(int unitId) { return unitId == GLBMEM_PIPE; } - bool isShrMem(int unitId) { return unitId == LDSMEM_PIPE; } - int GlbMemUnitId() { return GLBMEM_PIPE; } - int ShrMemUnitId() { return LDSMEM_PIPE; } - int nextGlbRdBus() { return (++nextGlbMemBus) % numGlbMemUnits; } - int nextLocRdBus() { return (++nextLocMemBus) % numLocMemUnits; } + HSAQueueEntry *task, bool fetchContext=false); + + void doInvalidate(RequestPtr req, int kernId); + void doFlush(GPUDynInstPtr gpuDynInst); + + void dispWorkgroup(HSAQueueEntry *task, bool startFromScheduler=false); + bool hasDispResources(HSAQueueEntry *task); + + int cacheLineSize() const { return _cacheLineSize; } + int getCacheLineBits() const { return cacheLineBits; } + /* This function cycles through all the wavefronts in all the phases to see * if all of the wavefronts which should be associated with one barrier * (denoted with _barrier_id), are all at the same barrier in the program @@ -275,14 +346,15 @@ class ComputeUnit : public ClockedObject * return true. 
*/ int AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots); - bool cedeSIMD(int simdId, int wfSlotId); - template void doSmReturn(GPUDynInstPtr gpuDynInst); + template + void doSmReturn(GPUDynInstPtr gpuDynInst); + virtual void init() override; void sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt); - void sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt); + void sendScalarRequest(GPUDynInstPtr gpuDynInst, PacketPtr pkt); void injectGlobalMemFence(GPUDynInstPtr gpuDynInst, - bool kernelLaunch=true, + bool kernelMemSync, RequestPtr req=nullptr); void handleMemPacket(PacketPtr pkt, int memport_index); bool processTimingPacket(PacketPtr pkt); @@ -292,7 +364,7 @@ class ComputeUnit : public ClockedObject MasterID masterId() { return _masterId; } bool isDone() const; - bool isSimdDone(uint32_t) const; + bool isVectorAluIdle(uint32_t simdId) const; protected: MasterID _masterId; @@ -323,6 +395,44 @@ class ComputeUnit : public ClockedObject Stats::Scalar scalarMemReads; Stats::Formula scalarMemReadsPerWF; + Stats::Formula vectorMemReadsPerKiloInst; + Stats::Formula vectorMemWritesPerKiloInst; + Stats::Formula vectorMemInstsPerKiloInst; + Stats::Formula scalarMemReadsPerKiloInst; + Stats::Formula scalarMemWritesPerKiloInst; + Stats::Formula scalarMemInstsPerKiloInst; + + // Cycles required to send register source (addr and data) from + // register files to memory pipeline, per SIMD. + Stats::Vector instCyclesVMemPerSimd; + Stats::Vector instCyclesScMemPerSimd; + Stats::Vector instCyclesLdsPerSimd; + + Stats::Scalar globalReads; + Stats::Scalar globalWrites; + Stats::Formula globalMemInsts; + Stats::Scalar argReads; + Stats::Scalar argWrites; + Stats::Formula argMemInsts; + Stats::Scalar spillReads; + Stats::Scalar spillWrites; + Stats::Formula spillMemInsts; + Stats::Scalar groupReads; + Stats::Scalar groupWrites; + Stats::Formula groupMemInsts; + Stats::Scalar privReads; + Stats::Scalar privWrites; + Stats::Formula privMemInsts; + Stats::Scalar readonlyReads; + Stats::Scalar readonlyWrites; + Stats::Formula readonlyMemInsts; + Stats::Scalar kernargReads; + Stats::Scalar kernargWrites; + Stats::Formula kernargMemInsts; + + int activeWaves; + Stats::Distribution waveLevelParallelism; + void updateInstStats(GPUDynInstPtr gpuDynInst); // the following stats compute the avg. TLB accesslatency per @@ -339,21 +449,48 @@ class ComputeUnit : public ClockedObject // over all memory instructions executed over all wavefronts // how many touched 0-4 pages, 4-8, ..., 60-64 pages Stats::Distribution pageDivergenceDist; + // count of non-flat global memory vector instructions executed Stats::Scalar dynamicGMemInstrCnt; + // count of flat global memory vector instructions executed + Stats::Scalar dynamicFlatMemInstrCnt; Stats::Scalar dynamicLMemInstrCnt; Stats::Scalar wgBlockedDueLdsAllocation; - // Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are active - // when the instruction is committed, this number is still incremented by 1 + // Number of instructions executed, i.e. 
if 64 (or 32 or 7) lanes are + // active when the instruction is committed, this number is still + // incremented by 1 Stats::Scalar numInstrExecuted; // Number of cycles among successive instruction executions across all // wavefronts of the same CU Stats::Distribution execRateDist; // number of individual vector operations executed Stats::Scalar numVecOpsExecuted; + // number of individual f16 vector operations executed + Stats::Scalar numVecOpsExecutedF16; + // number of individual f32 vector operations executed + Stats::Scalar numVecOpsExecutedF32; + // number of individual f64 vector operations executed + Stats::Scalar numVecOpsExecutedF64; + // number of individual FMA 16,32,64 vector operations executed + Stats::Scalar numVecOpsExecutedFMA16; + Stats::Scalar numVecOpsExecutedFMA32; + Stats::Scalar numVecOpsExecutedFMA64; + // number of individual MAC 16,32,64 vector operations executed + Stats::Scalar numVecOpsExecutedMAC16; + Stats::Scalar numVecOpsExecutedMAC32; + Stats::Scalar numVecOpsExecutedMAC64; + // number of individual MAD 16,32,64 vector operations executed + Stats::Scalar numVecOpsExecutedMAD16; + Stats::Scalar numVecOpsExecutedMAD32; + Stats::Scalar numVecOpsExecutedMAD64; + // total number of two op FP vector operations executed + Stats::Scalar numVecOpsExecutedTwoOpFP; // Total cycles that something is running on the GPU Stats::Scalar totalCycles; Stats::Formula vpc; // vector ops per cycle + Stats::Formula vpc_f16; // vector ops per cycle + Stats::Formula vpc_f32; // vector ops per cycle + Stats::Formula vpc_f64; // vector ops per cycle Stats::Formula ipc; // vector instructions per cycle Stats::Distribution controlFlowDivergenceDist; Stats::Distribution activeLanesPerGMemInstrDist; @@ -362,20 +499,16 @@ class ComputeUnit : public ClockedObject Stats::Formula numALUInstsExecuted; // number of times a WG can not start due to lack of free VGPRs in SIMDs Stats::Scalar numTimesWgBlockedDueVgprAlloc; + // number of times a WG can not start due to lack of free SGPRs in SIMDs + Stats::Scalar numTimesWgBlockedDueSgprAlloc; Stats::Scalar numCASOps; Stats::Scalar numFailedCASOps; Stats::Scalar completedWfs; - // flag per vector SIMD unit that is set when there is at least one - // WV that has a vector ALU instruction as the oldest in its - // Instruction Buffer: Defined in the Scoreboard stage, consumed - // by the Execute stage. 
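For reference, the vpc*/ipc formulas declared above are simple ratios of the counters to totalCycles ("vector ops per cycle" and "vector instructions per cycle" per the comments). A standalone computation with invented counter values shows how the per-type variants relate to the overall rate:

// Standalone illustration of the vpc/ipc ratios implied by the counters
// above. The numbers are invented purely for the example.
#include <cstdint>
#include <cstdio>

int main()
{
    std::uint64_t numVecOpsExecuted    = 640000;  // individual lane ops
    std::uint64_t numVecOpsExecutedF32 = 480000;  // f32 subset of the above
    std::uint64_t numInstrExecuted     = 10000;   // wave-level instructions
    std::uint64_t totalCycles          = 50000;   // cycles the CU was active

    double vpc     = double(numVecOpsExecuted)    / totalCycles;
    double vpc_f32 = double(numVecOpsExecutedF32) / totalCycles;
    double ipc     = double(numInstrExecuted)     / totalCycles;

    std::printf("vpc=%.2f vpc_f32=%.2f ipc=%.2f\n", vpc, vpc_f32, ipc);
    return 0;
}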
- std::vector vectorAluInstAvail; - // number of available (oldest) LDS instructions that could have - // been issued to the LDS at a specific issue slot - int shrMemInstAvail; - // number of available Global memory instructions that could have - // been issued to TCP at a specific issue slot - int glbMemInstAvail; + Stats::Scalar completedWGs; + + // distrubtion in latency difference between first and last cache block + // arrival ticks + Stats::Distribution headTailLatency; void regStats() override; @@ -389,8 +522,6 @@ class ComputeUnit : public ClockedObject int32_t getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const; - int cacheLineSize() const { return _cacheLineSize; } - bool sendToLds(GPUDynInstPtr gpuDynInst) __attribute__((warn_unused_result)); @@ -486,6 +617,56 @@ class ComputeUnit : public ClockedObject }; + // Scalar data cache access port + class ScalarDataPort : public MasterPort + { + public: + ScalarDataPort(const std::string &_name, ComputeUnit *_cu, + PortID _index) + : MasterPort(_name, _cu, _index), computeUnit(_cu), index(_index) + { + (void)index; + } + + bool recvTimingResp(PacketPtr pkt) override; + void recvReqRetry() override; + + struct SenderState : public Packet::SenderState + { + SenderState(GPUDynInstPtr gpuDynInst, + Packet::SenderState *sender_state=nullptr) + : _gpuDynInst(gpuDynInst), saved(sender_state) + { + } + + GPUDynInstPtr _gpuDynInst; + Packet::SenderState *saved; + }; + + class MemReqEvent : public Event + { + private: + ScalarDataPort *scalarDataPort; + PacketPtr pkt; + + public: + MemReqEvent(ScalarDataPort *_scalar_data_port, PacketPtr _pkt) + : Event(), scalarDataPort(_scalar_data_port), pkt(_pkt) + { + setFlags(Event::AutoDelete); + } + + void process(); + const char *description() const; + }; + + std::deque retries; + + private: + ComputeUnit *computeUnit; + PortID index; + }; + // Instruction cache access port class SQCPort : public MasterPort { @@ -500,10 +681,13 @@ class ComputeUnit : public ClockedObject { Wavefront *wavefront; Packet::SenderState *saved; + // kernel id to be used in handling I-Cache invalidate response + int kernId; SenderState(Wavefront *_wavefront, Packet::SenderState - *sender_state=nullptr) - : wavefront(_wavefront), saved(sender_state) { } + *sender_state=nullptr, int _kernId=-1) + : wavefront(_wavefront), saved(sender_state), + kernId(_kernId){ } }; std::deque> retries; @@ -575,6 +759,34 @@ class ComputeUnit : public ClockedObject virtual void recvReqRetry(); }; + class ScalarDTLBPort : public MasterPort + { + public: + ScalarDTLBPort(const std::string &_name, ComputeUnit *_cu) + : MasterPort(_name, _cu), computeUnit(_cu), stalled(false) + { + } + + struct SenderState : public Packet::SenderState + { + SenderState(GPUDynInstPtr gpuDynInst) : _gpuDynInst(gpuDynInst) { } + GPUDynInstPtr _gpuDynInst; + }; + + bool recvTimingResp(PacketPtr pkt) override; + void recvReqRetry() override { assert(false); } + + bool isStalled() const { return stalled; } + void stallPort() { stalled = true; } + void unstallPort() { stalled = false; } + + std::deque retries; + + private: + ComputeUnit *computeUnit; + bool stalled; + }; + class ITLBPort : public MasterPort { public: @@ -710,6 +922,10 @@ class ComputeUnit : public ClockedObject std::vector memPort; // port to the TLB hierarchy (i.e., the L1 TLB) std::vector tlbPort; + // port to the scalar data cache + ScalarDataPort *scalarDataPort; + // port to the scalar data TLB + ScalarDTLBPort *scalarDTLBPort; // port to the SQC (i.e. 
the I-cache) SQCPort *sqcPort; // port to the SQC TLB (there's a separate TLB for each I-cache) @@ -726,6 +942,14 @@ class ComputeUnit : public ClockedObject tlbPort[idx] = new DTLBPort(csprintf("%s-port%d", name(), idx), this, idx); return *tlbPort[idx]; + } else if (if_name == "scalar_port") { + scalarDataPort = new ScalarDataPort(csprintf("%s-port%d", name(), + idx), this, idx); + return *scalarDataPort; + } else if (if_name == "scalar_tlb_port") { + scalarDTLBPort = new ScalarDTLBPort(csprintf("%s-port", name()), + this); + return *scalarDTLBPort; } else if (if_name == "sqc_port") { sqcPort = new SQCPort(csprintf("%s-port%d", name(), idx), this, idx); @@ -746,32 +970,18 @@ class ComputeUnit : public ClockedObject } } - // xact_cas_load() - class waveIdentifier - { - public: - waveIdentifier() { } - waveIdentifier(int _simdId, int _wfSlotId) - : simdId(_simdId), wfSlotId(_wfSlotId) { } - - int simdId; - int wfSlotId; - }; - - class waveQueue - { - public: - std::list waveIDQueue; - }; - std::map xactCasLoadMap; - - uint64_t getAndIncSeqNum() { return globalSeqNum++; } + InstSeqNum getAndIncSeqNum() { return globalSeqNum++; } private: const int _cacheLineSize; - uint64_t globalSeqNum; + int cacheLineBits; + InstSeqNum globalSeqNum; int wavefrontSize; - GPUStaticInst *kernelLaunchInst; + + // hold the time of the arrival of the first cache block related to + // a particular GPUDynInst. This is used to calculate the difference + // between the first and last chace block arrival times. + std::map headTailMap; }; #endif // __COMPUTE_UNIT_HH__ diff --git a/src/gpu-compute/dispatcher.cc b/src/gpu-compute/dispatcher.cc index 99bffbd40..51f5e97fe 100644 --- a/src/gpu-compute/dispatcher.cc +++ b/src/gpu-compute/dispatcher.cc @@ -34,66 +34,76 @@ #include "gpu-compute/dispatcher.hh" -#include "cpu/base.hh" #include "debug/GPUDisp.hh" -#include "gpu-compute/cl_driver.hh" -#include "gpu-compute/cl_event.hh" +#include "debug/GPUKernelInfo.hh" +#include "debug/GPUWgLatency.hh" +#include "gpu-compute/gpu_command_processor.hh" +#include "gpu-compute/hsa_queue_entry.hh" #include "gpu-compute/shader.hh" #include "gpu-compute/wavefront.hh" -#include "mem/packet_access.hh" - -GpuDispatcher *GpuDispatcher::instance = nullptr; - -GpuDispatcher::GpuDispatcher(const Params *p) - : DmaDevice(p), _masterId(p->system->getMasterId(this, "disp")), - pioAddr(p->pio_addr), pioSize(4096), pioDelay(p->pio_latency), - dispatchCount(0), dispatchActive(false), cpu(p->cpu), - shader(p->shader_pointer), driver(p->cl_driver), - tickEvent([this]{ exec(); }, "GPU Dispatcher tick", - false, Event::CPU_Tick_Pri) +#include "sim/syscall_emul_buf.hh" +#include "sim/system.hh" + +GPUDispatcher::GPUDispatcher(const Params *p) + : SimObject(p), shader(nullptr), gpuCmdProc(nullptr), + tickEvent([this]{ exec(); }, + "GPU Dispatcher tick", false, Event::CPU_Tick_Pri), + dispatchActive(false) { - shader->handshake(this); - driver->handshake(this); - - ndRange.wg_disp_rem = false; - ndRange.globalWgId = 0; - schedule(&tickEvent, 0); +} - // translation port for the dispatcher - tlbPort = new TLBPort(csprintf("%s-port%d", name()), this); +GPUDispatcher::~GPUDispatcher() +{ +} - num_kernelLaunched +void +GPUDispatcher::regStats() +{ + numKernelLaunched .name(name() + ".num_kernel_launched") .desc("number of kernel launched") ; + + cyclesWaitingForDispatch + .name(name() + ".cycles_wait_dispatch") + .desc("number of cycles with outstanding wavefronts " + "that are waiting to be dispatched") + ; +} + +HSAQueueEntry* +GPUDispatcher::hsaTask(int 
disp_id) +{ + assert(hsaQueueEntries.find(disp_id) != hsaQueueEntries.end()); + return hsaQueueEntries[disp_id]; } -GpuDispatcher *GpuDispatcherParams::create() +void +GPUDispatcher::setCommandProcessor(GPUCommandProcessor *gpu_cmd_proc) { - GpuDispatcher *dispatcher = new GpuDispatcher(this); - GpuDispatcher::setInstance(dispatcher); + gpuCmdProc = gpu_cmd_proc; +} - return GpuDispatcher::getInstance(); +void +GPUDispatcher::setShader(Shader *new_shader) +{ + shader = new_shader; } void -GpuDispatcher::serialize(CheckpointOut &cp) const +GPUDispatcher::serialize(CheckpointOut &cp) const { Tick event_tick = 0; - if (ndRange.wg_disp_rem) - fatal("Checkpointing not supported during active workgroup execution"); - if (tickEvent.scheduled()) event_tick = tickEvent.when(); SERIALIZE_SCALAR(event_tick); - } void -GpuDispatcher::unserialize(CheckpointIn &cp) +GPUDispatcher::unserialize(CheckpointIn &cp) { Tick event_tick; @@ -102,288 +112,256 @@ GpuDispatcher::unserialize(CheckpointIn &cp) UNSERIALIZE_SCALAR(event_tick); - if (event_tick) + if (event_tick) { schedule(&tickEvent, event_tick); + } } -AddrRangeList -GpuDispatcher::getAddrRanges() const -{ - AddrRangeList ranges; - - DPRINTF(GPUDisp, "dispatcher registering addr range at %#x size %#x\n", - pioAddr, pioSize); - - ranges.push_back(RangeSize(pioAddr, pioSize)); - - return ranges; -} - -Tick -GpuDispatcher::read(PacketPtr pkt) +/** + * After all relevant HSA data structures have been traversed/extracted + * from memory by the CP, dispatch() is called on the dispatcher. This will + * schedule a dispatch event that, when triggered, will attempt to dispatch + * the WGs associated with the given task to the CUs. + */ +void +GPUDispatcher::dispatch(HSAQueueEntry *task) { - assert(pkt->getAddr() >= pioAddr); - assert(pkt->getAddr() < pioAddr + pioSize); - - int offset = pkt->getAddr() - pioAddr; - pkt->allocate(); + ++numKernelLaunched; - DPRINTF(GPUDisp, " read register %#x size=%d\n", offset, pkt->getSize()); + DPRINTF(GPUDisp, "launching kernel: %s, dispatch ID: %d\n", + task->kernelName(), task->dispatchId()); - if (offset < 8) { - assert(!offset); - assert(pkt->getSize() == 8); + execIds.push(task->dispatchId()); + dispatchActive = true; + hsaQueueEntries.emplace(task->dispatchId(), task); - uint64_t retval = dispatchActive; - pkt->setLE(retval); - } else { - offset -= 8; - assert(offset + pkt->getSize() < sizeof(HsaQueueEntry)); - char *curTaskPtr = (char*)&curTask; - - memcpy(pkt->getPtr(), curTaskPtr + offset, pkt->getSize()); + if (!tickEvent.scheduled()) { + schedule(&tickEvent, curTick() + shader->clockPeriod()); } - - pkt->makeAtomicResponse(); - - return pioDelay; } -Tick -GpuDispatcher::write(PacketPtr pkt) +void +GPUDispatcher::exec() { - assert(pkt->getAddr() >= pioAddr); - assert(pkt->getAddr() < pioAddr + pioSize); - - int offset = pkt->getAddr() - pioAddr; - -#if TRACING_ON - uint64_t data_val = 0; - - switch (pkt->getSize()) { - case 1: - data_val = pkt->getLE(); - break; - case 2: - data_val = pkt->getLE(); - break; - case 4: - data_val = pkt->getLE(); - break; - case 8: - data_val = pkt->getLE(); - break; - default: - DPRINTF(GPUDisp, "bad size %d\n", pkt->getSize()); - } + int fail_count(0); - DPRINTF(GPUDisp, "write register %#x value %#x size=%d\n", offset, data_val, - pkt->getSize()); -#endif - if (!offset) { - static int nextId = 0; - - // The depends field of the qstruct, which was previously unused, is - // used to communicate with simulated application. 
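The comment above summarizes the hand-off: the command processor extracts the kernel's HSA structures, then dispatch() records the task and schedules the dispatch event if one is not already pending. Below is a compressed standalone model of that queueing step; it is not gem5 code, MiniTask and MiniDispatcher are invented names, and the event is reduced to a boolean flag.

// Standalone model of the dispatch() hand-off described above.
// Not gem5 code; the "event" is just a scheduled/not-scheduled flag.
#include <cstdio>
#include <queue>
#include <unordered_map>

struct MiniTask { int dispatchId; };

class MiniDispatcher
{
  public:
    void
    dispatch(MiniTask *task)
    {
        execIds.push(task->dispatchId);
        tasks.emplace(task->dispatchId, task);
        dispatchActive = true;
        if (!tickScheduled) {
            tickScheduled = true;     // real code schedules tickEvent here
            std::printf("dispatch event scheduled for kernel %d\n",
                        task->dispatchId);
        }
    }

  private:
    std::queue<int> execIds;                    // kernels waiting to launch
    std::unordered_map<int, MiniTask*> tasks;   // dispatch id -> task
    bool dispatchActive = false;
    bool tickScheduled = false;
};

int main()
{
    MiniDispatcher disp;
    MiniTask t0{0}, t1{1};
    disp.dispatch(&t0);
    disp.dispatch(&t1);   // second call finds the event already "scheduled"
    return 0;
}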
- if (curTask.depends) { - HostState hs; - shader->ReadMem((uint64_t)(curTask.depends), &hs, - sizeof(HostState), 0); + /** + * There are potentially multiple outstanding kernel launches. + * It is possible that the workgroups in a different kernel + * can fit on the GPU even if another kernel's workgroups cannot + */ + DPRINTF(GPUDisp, "Launching %d Kernels\n", execIds.size()); - // update event start time (in nano-seconds) - uint64_t start = curTick() / 1000; + if (execIds.size() > 0) { + ++cyclesWaitingForDispatch; + } - shader->WriteMem((uint64_t)(&((_cl_event*)hs.event)->start), - &start, sizeof(uint64_t), 0); + /** + * dispatch work cannot start until the kernel's invalidate is + * completely finished; hence, kernel will always initiates + * invalidate first and keeps waiting until inv done + */ + while (execIds.size() > fail_count) { + int exec_id = execIds.front(); + auto task = hsaQueueEntries[exec_id]; + bool launched(false); + + // invalidate is needed before starting dispatch + if (shader->impl_kern_boundary_sync) { + // try to invalidate cache + shader->prepareInvalidate(task); + } else { + // kern boundary sync is not set, skip invalidate + task->markInvDone(); } - // launch kernel - ++num_kernelLaunched; - - NDRange *ndr = &(ndRangeMap[nextId]); - // copy dispatch info - ndr->q = curTask; - - // update the numDispTask polled by the runtime - accessUserVar(cpu, (uint64_t)(curTask.numDispLeft), 0, 1); + /** + * invalidate is still ongoing, put the kernel on the queue to + * retry later + */ + if (!task->isInvDone()){ + execIds.push(exec_id); + ++fail_count; - ndr->numWgTotal = 1; + DPRINTF(GPUDisp, "kernel %d failed to launch, due to [%d] pending" + " invalidate requests\n", exec_id, task->outstandingInvs()); - for (int i = 0; i < 3; ++i) { - ndr->wgId[i] = 0; - ndr->numWg[i] = divCeil(curTask.gdSize[i], curTask.wgSize[i]); - ndr->numWgTotal *= ndr->numWg[i]; + // try the next kernel_id + execIds.pop(); + continue; } - ndr->numWgCompleted = 0; - ndr->globalWgId = 0; - ndr->wg_disp_rem = true; - ndr->execDone = false; - ndr->addrToNotify = (volatile bool*)curTask.addrToNotify; - ndr->numDispLeft = (volatile uint32_t*)curTask.numDispLeft; - ndr->dispatchId = nextId; - ndr->curCid = pkt->req->contextId(); - DPRINTF(GPUDisp, "launching kernel %d\n",nextId); - execIds.push(nextId); - ++nextId; - - dispatchActive = true; - - if (!tickEvent.scheduled()) { - schedule(&tickEvent, curTick() + shader->ticks(1)); - } - } else { - // populate current task struct - // first 64 bits are launch reg - offset -= 8; - assert(offset < sizeof(HsaQueueEntry)); - char *curTaskPtr = (char*)&curTask; - memcpy(curTaskPtr + offset, pkt->getPtr(), pkt->getSize()); - } - - pkt->makeAtomicResponse(); - - return pioDelay; -} - - -Port & -GpuDispatcher::getPort(const std::string &if_name, PortID idx) -{ - if (if_name == "translation_port") { - return *tlbPort; - } - - return DmaDevice::getPort(if_name, idx); -} - -void -GpuDispatcher::exec() -{ - int fail_count = 0; - - // There are potentially multiple outstanding kernel launches. 
- // It is possible that the workgroups in a different kernel - // can fit on the GPU even if another kernel's workgroups cannot - DPRINTF(GPUDisp, "Launching %d Kernels\n", execIds.size()); - - while (execIds.size() > fail_count) { - int execId = execIds.front(); - - while (ndRangeMap[execId].wg_disp_rem) { - //update the thread context - shader->updateContext(ndRangeMap[execId].curCid); - - // attempt to dispatch_workgroup - if (!shader->dispatch_workgroups(&ndRangeMap[execId])) { - // if we failed try the next kernel, - // it may have smaller workgroups. - // put it on the queue to rety latter - DPRINTF(GPUDisp, "kernel %d failed to launch\n", execId); - execIds.push(execId); + // kernel invalidate is done, start workgroup dispatch + while (!task->dispComplete()) { + // update the thread context + shader->updateContext(task->contextId()); + + // attempt to dispatch workgroup + DPRINTF(GPUWgLatency, "Attempt Kernel Launch cycle:%d kernel:%d\n", + curTick(), exec_id); + + if (!shader->dispatchWorkgroups(task)) { + /** + * if we failed try the next kernel, + * it may have smaller workgroups. + * put it on the queue to rety latter + */ + DPRINTF(GPUDisp, "kernel %d failed to launch\n", exec_id); + execIds.push(exec_id); ++fail_count; break; + } else if (!launched) { + launched = true; + DPRINTF(GPUKernelInfo, "Launched kernel %d\n", exec_id); } } - // let's try the next kernel_id + + // try the next kernel_id execIds.pop(); } DPRINTF(GPUDisp, "Returning %d Kernels\n", doneIds.size()); - if (doneIds.size() && cpu) { - shader->hostWakeUp(cpu); - } - while (doneIds.size()) { - // wakeup the CPU if any Kernels completed this cycle - DPRINTF(GPUDisp, "WorkGroup %d completed\n", doneIds.front()); + DPRINTF(GPUDisp, "Kernel %d completed\n", doneIds.front()); doneIds.pop(); } } -void -GpuDispatcher::notifyWgCompl(Wavefront *w) +bool +GPUDispatcher::isReachingKernelEnd(Wavefront *wf) { - int kern_id = w->kernId; - DPRINTF(GPUDisp, "notify WgCompl %d\n",kern_id); - assert(ndRangeMap[kern_id].dispatchId == kern_id); - ndRangeMap[kern_id].numWgCompleted++; - - if (ndRangeMap[kern_id].numWgCompleted == ndRangeMap[kern_id].numWgTotal) { - ndRangeMap[kern_id].execDone = true; - doneIds.push(kern_id); - - if (ndRangeMap[kern_id].addrToNotify) { - accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].addrToNotify), 1, - 0); - } - - accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].numDispLeft), 0, -1); + int kern_id = wf->kernId; + assert(hsaQueueEntries.find(kern_id) != hsaQueueEntries.end()); + auto task = hsaQueueEntries[kern_id]; + assert(task->dispatchId() == kern_id); + + /** + * whether the next workgroup is the final one in the kernel, + * +1 as we check first before taking action + */ + return (task->numWgCompleted() + 1 == task->numWgTotal()); +} - // update event end time (in nano-seconds) - if (ndRangeMap[kern_id].q.depends) { - HostState *host_state = (HostState*)ndRangeMap[kern_id].q.depends; - uint64_t event; - shader->ReadMem((uint64_t)(&host_state->event), &event, - sizeof(uint64_t), 0); +/** + * update the counter of oustanding inv requests for the kernel + * kern_id: kernel id + * val: +1/-1, increment or decrement the counter (default: -1) + */ +void +GPUDispatcher::updateInvCounter(int kern_id, int val) { + assert(val == -1 || val == 1); - uint64_t end = curTick() / 1000; + auto task = hsaQueueEntries[kern_id]; + task->updateOutstandingInvs(val); - shader->WriteMem((uint64_t)(&((_cl_event*)event)->end), &end, - sizeof(uint64_t), 0); - } + // kernel invalidate is done, schedule dispatch work 
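updateInvCounter() above tracks outstanding cache invalidates for a kernel: the comment specifies val as +1 or -1 (default -1), presumably +1 as invalidate requests are issued and -1 as their responses return, and the code just below re-arms the dispatch event once the count reaches zero. A standalone sketch of that bookkeeping, with invented call sites:

// Standalone sketch of outstanding-invalidate bookkeeping: +1 per request
// sent, -1 per response, dispatch re-armed when the count reaches zero.
// Hypothetical names; only the +1/-1 contract comes from the code above.
#include <cassert>
#include <cstdio>

class MiniInvTracker
{
  public:
    void
    updateInvCounter(int val)
    {
        assert(val == -1 || val == 1);
        outstandingInvs += val;
        if (outstandingInvs == 0) {
            std::printf("invalidate done, scheduling dispatch\n");
        }
    }

  private:
    int outstandingInvs = 0;
};

int main()
{
    MiniInvTracker t;
    for (int i = 0; i < 4; ++i) t.updateInvCounter(1);   // requests sent
    for (int i = 0; i < 4; ++i) t.updateInvCounter(-1);  // responses arrive
    return 0;
}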
+ if (task->isInvDone() && !tickEvent.scheduled()) { + schedule(&tickEvent, curTick() + shader->clockPeriod()); } +} - if (!tickEvent.scheduled()) { - schedule(&tickEvent, curTick() + shader->ticks(1)); - } +/** + * update the counter of oustanding wb requests for the kernel + * kern_id: kernel id + * val: +1/-1, increment or decrement the counter (default: -1) + * + * return true if all wbs are done for the kernel + */ +bool +GPUDispatcher::updateWbCounter(int kern_id, int val) { + assert(val == -1 || val == 1); + + auto task = hsaQueueEntries[kern_id]; + task->updateOutstandingWbs(val); + + // true: WB is done, false: WB is still ongoing + return (task->outstandingWbs() == 0); } -void -GpuDispatcher::scheduleDispatch() -{ - if (!tickEvent.scheduled()) - schedule(&tickEvent, curTick() + shader->ticks(1)); +/** + * get kernel's outstanding cache writeback requests + */ +int +GPUDispatcher::getOutstandingWbs(int kernId) { + auto task = hsaQueueEntries[kernId]; + + return task->outstandingWbs(); } +/** + * When an end program instruction detects that the last WF in + * a WG has completed it will call this method on the dispatcher. + * If we detect that this is the last WG for the given task, then + * we ring the completion signal, which is used by the CPU to + * synchronize with the GPU. The HSAPP is also notified that the + * task has completed so it can be removed from its task queues. + */ void -GpuDispatcher::accessUserVar(BaseCPU *cpu, uint64_t addr, int val, int off) +GPUDispatcher::notifyWgCompl(Wavefront *wf) { - if (cpu) { - if (off) { - shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::ReadReq, - true); - val += off; + int kern_id = wf->kernId; + DPRINTF(GPUDisp, "notify WgCompl %d\n", wf->wgId); + auto task = hsaQueueEntries[kern_id]; + assert(task->dispatchId() == kern_id); + task->notifyWgCompleted(); + + DPRINTF(GPUWgLatency, "WG Complete cycle:%d wg:%d kernel:%d cu:%d\n", + curTick(), wf->wgId, kern_id, wf->computeUnit->cu_id); + + if (task->numWgCompleted() == task->numWgTotal()) { + // Notify the HSA PP that this kernel is complete + gpuCmdProc->hsaPacketProc() + .finishPkt(task->dispPktPtr(), task->queueId()); + if (task->completionSignal()) { + // The signal value is aligned 8 bytes from + // the actual handle in the runtime + Addr signal_addr = task->completionSignal() + sizeof(Addr); + DPRINTF(GPUDisp, "HSA AQL Kernel Complete! Triggering " + "completion signal: %x!\n", signal_addr); + + /** + * HACK: The semantics of the HSA signal is to decrement + * the current signal value. We cheat here and read out + * he value from main memory using functional access and + * then just DMA the decremented value. This is because + * the DMA controller does not currently support GPU + * atomics. + */ + auto *tc = gpuCmdProc->system()->threads[0]; + auto &virt_proxy = tc->getVirtProxy(); + TypedBufferArg prev_signal(signal_addr); + prev_signal.copyIn(virt_proxy); + + Addr *new_signal = new Addr; + *new_signal = (Addr)*prev_signal - 1; + + gpuCmdProc->dmaWriteVirt(signal_addr, sizeof(Addr), nullptr, + new_signal, 0); + } else { + DPRINTF(GPUDisp, "HSA AQL Kernel Complete! 
No completion " + "signal\n"); } - shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::WriteReq, true); - } else { - panic("Cannot find host"); + DPRINTF(GPUWgLatency, "Kernel Complete ticks:%d kernel:%d\n", + curTick(), kern_id); + DPRINTF(GPUKernelInfo, "Completed kernel %d\n", kern_id); } -} - -// helper functions for driver to retrieve GPU attributes -int -GpuDispatcher::getNumCUs() -{ - return shader->cuList.size(); -} -int -GpuDispatcher::wfSize() const -{ - return shader->cuList[0]->wfSize(); + if (!tickEvent.scheduled()) { + schedule(&tickEvent, curTick() + shader->clockPeriod()); + } } void -GpuDispatcher::setFuncargsSize(int funcargs_size) +GPUDispatcher::scheduleDispatch() { - shader->funcargs_size = funcargs_size; + if (!tickEvent.scheduled()) { + schedule(&tickEvent, curTick() + shader->clockPeriod()); + } } -uint32_t -GpuDispatcher::getStaticContextSize() const +GPUDispatcher *GPUDispatcherParams::create() { - return shader->cuList[0]->wfList[0][0]->getStaticContextSize(); + return new GPUDispatcher(this); } diff --git a/src/gpu-compute/dispatcher.hh b/src/gpu-compute/dispatcher.hh index 1ffe81c10..cd282b9cb 100644 --- a/src/gpu-compute/dispatcher.hh +++ b/src/gpu-compute/dispatcher.hh @@ -31,125 +31,69 @@ * POSSIBILITY OF SUCH DAMAGE. */ -#ifndef __GPU_DISPATCHER_HH__ -#define __GPU_DISPATCHER_HH__ +/** + * @file + * The GPUDispatcher is the component of the shader that is responsible + * for creating and dispatching WGs to the compute units. If all WGs in + * a kernel cannot be dispatched simultaneously, then the dispatcher will + * keep track of all pending WGs and dispatch them as resources become + * available. + */ + +#ifndef __GPU_COMPUTE_DISPATCHER_HH__ +#define __GPU_COMPUTE_DISPATCHER_HH__ #include +#include #include #include "base/statistics.hh" -#include "dev/dma_device.hh" -#include "gpu-compute/compute_unit.hh" -#include "gpu-compute/ndrange.hh" -#include "gpu-compute/qstruct.hh" -#include "mem/port.hh" -#include "params/GpuDispatcher.hh" +#include "dev/hsa/hsa_packet.hh" +#include "params/GPUDispatcher.hh" +#include "sim/sim_object.hh" -class BaseCPU; +class GPUCommandProcessor; +class HSAQueueEntry; class Shader; +class Wavefront; -class GpuDispatcher : public DmaDevice +class GPUDispatcher : public SimObject { - public: - typedef GpuDispatcherParams Params; - - MasterID masterId() { return _masterId; } - - protected: - MasterID _masterId; - - // Base and length of PIO register space - Addr pioAddr; - Addr pioSize; - Tick pioDelay; - - HsaQueueEntry curTask; - - std::unordered_map ndRangeMap; - NDRange ndRange; - - // list of kernel_ids to launch - std::queue execIds; - // list of kernel_ids that have finished - std::queue doneIds; - - uint64_t dispatchCount; - // is there a kernel in execution? - bool dispatchActive; - - BaseCPU *cpu; - Shader *shader; - ClDriver *driver; - EventFunctionWrapper tickEvent; - - - static GpuDispatcher *instance; - - // sycall emulation mode can have only 1 application running(?) 
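Tying together the completion path above: isReachingKernelEnd() compares numWgCompleted() + 1 against numWgTotal() because it is consulted before the current workgroup is retired, while notifyWgCompl() does the actual retiring and, on the last workgroup, rings the completion signal. A standalone sketch of that ordering (class and method names are invented stand-ins):

// Standalone sketch of the workgroup completion bookkeeping described
// above: the "reaching end" test runs before the WG is counted, hence +1.
#include <cassert>
#include <cstdio>

class MiniKernelTask
{
  public:
    explicit MiniKernelTask(int num_wg) : wgTotal(num_wg) { }

    // Called while the last WF of a WG finishes, before notifyWgCompl().
    bool isReachingKernelEnd() const { return wgCompleted + 1 == wgTotal; }

    void
    notifyWgCompl()
    {
        ++wgCompleted;
        if (wgCompleted == wgTotal) {
            std::printf("kernel complete, ring completion signal\n");
        }
    }

  private:
    int wgCompleted = 0;
    int wgTotal;
};

int main()
{
    MiniKernelTask task(2);
    assert(!task.isReachingKernelEnd());
    task.notifyWgCompl();
    assert(task.isReachingKernelEnd());  // the next WG is the last one
    task.notifyWgCompl();
    return 0;
}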
- // else we have to do some pid based tagging - // unused - typedef std::unordered_map TranslationBuffer; - TranslationBuffer tlb; - - public: - /*statistics*/ - Stats::Scalar num_kernelLaunched; - GpuDispatcher(const Params *p); - - ~GpuDispatcher() { } - - void exec(); - virtual void serialize(CheckpointOut &cp) const override; - virtual void unserialize(CheckpointIn &cp) override; - void notifyWgCompl(Wavefront *w); - void scheduleDispatch(); - void accessUserVar(BaseCPU *cpu, uint64_t addr, int val, int off); - - // using singleton so that glue code can pass pointer locations - // to the dispatcher. when there are multiple dispatchers, we can - // call something like getInstance(index) - static void - setInstance(GpuDispatcher *_instance) - { - instance = _instance; - } - - static GpuDispatcher* getInstance() { return instance; } - - class TLBPort : public MasterPort - { - public: - - TLBPort(const std::string &_name, GpuDispatcher *_dispatcher) - : MasterPort(_name, _dispatcher), dispatcher(_dispatcher) { } - - protected: - GpuDispatcher *dispatcher; - - virtual bool recvTimingResp(PacketPtr pkt) { return true; } - virtual Tick recvAtomic(PacketPtr pkt) { return 0; } - virtual void recvFunctional(PacketPtr pkt) { } - virtual void recvRangeChange() { } - virtual void recvReqRetry() { } - - }; - - TLBPort *tlbPort; - - Port &getPort(const std::string &if_name, - PortID idx=InvalidPortID) override; - - AddrRangeList getAddrRanges() const override; - Tick read(PacketPtr pkt) override; - Tick write(PacketPtr pkt) override; - - // helper functions to retrieve/set GPU attributes - int getNumCUs(); - int wfSize() const; - void setFuncargsSize(int funcargs_size); - - /** Returns the size of the static hardware context of a wavefront */ - uint32_t getStaticContextSize() const; + public: + typedef GPUDispatcherParams Params; + + GPUDispatcher(const Params *p); + ~GPUDispatcher(); + + void serialize(CheckpointOut &cp) const override; + void unserialize(CheckpointIn &cp) override; + void regStats() override; + void setCommandProcessor(GPUCommandProcessor *gpu_cmd_proc); + void setShader(Shader *new_shader); + void exec(); + bool isReachingKernelEnd(Wavefront *wf); + void updateInvCounter(int kern_id, int val=-1); + bool updateWbCounter(int kern_id, int val=-1); + int getOutstandingWbs(int kern_id); + void notifyWgCompl(Wavefront *wf); + void scheduleDispatch(); + void dispatch(HSAQueueEntry *task); + HSAQueueEntry* hsaTask(int disp_id); + + private: + Shader *shader; + GPUCommandProcessor *gpuCmdProc; + EventFunctionWrapper tickEvent; + std::unordered_map hsaQueueEntries; + // list of kernel_ids to launch + std::queue execIds; + // list of kernel_ids that have finished + std::queue doneIds; + // is there a kernel in execution? 
+ bool dispatchActive; + /*statistics*/ + Stats::Scalar numKernelLaunched; + Stats::Scalar cyclesWaitingForDispatch; }; -#endif // __GPU_DISPATCHER_HH__ +#endif // __GPU_COMPUTE_DISPATCHER_HH__ diff --git a/src/gpu-compute/exec_stage.cc b/src/gpu-compute/exec_stage.cc index 0640083f8..2dece180b 100644 --- a/src/gpu-compute/exec_stage.cc +++ b/src/gpu-compute/exec_stage.cc @@ -33,13 +33,15 @@ #include "gpu-compute/exec_stage.hh" +#include + +#include "base/trace.hh" +#include "debug/GPUSched.hh" #include "gpu-compute/compute_unit.hh" +#include "gpu-compute/vector_register_file.hh" #include "gpu-compute/wavefront.hh" -ExecStage::ExecStage(const ComputeUnitParams *p) : numSIMDs(p->num_SIMDs), - numMemUnits(p->num_global_mem_pipes + p->num_shared_mem_pipes), - vectorAluInstAvail(nullptr), glbMemInstAvail(nullptr), - shrMemInstAvail(nullptr), lastTimeInstExecuted(false), +ExecStage::ExecStage(const ComputeUnitParams *p) : lastTimeInstExecuted(false), thisTimeInstExecuted(false), instrExecuted (false), executionResourcesUsed(0) { @@ -53,37 +55,18 @@ ExecStage::init(ComputeUnit *cu) computeUnit = cu; _name = computeUnit->name() + ".ExecStage"; dispatchList = &computeUnit->dispatchList; - vectorAluInstAvail = &(computeUnit->vectorAluInstAvail); - glbMemInstAvail= &(computeUnit->glbMemInstAvail); - shrMemInstAvail= &(computeUnit->shrMemInstAvail); idle_dur = 0; } void ExecStage::collectStatistics(enum STAT_STATUS stage, int unitId) { if (stage == IdleExec) { - // count cycles of no vector ALU instruction executed - // even if one was the oldest in a WV of that vector SIMD unit - if (computeUnit->isVecAlu(unitId) && vectorAluInstAvail->at(unitId)) { - numCyclesWithNoInstrTypeIssued[unitId]++; - } - - // count cycles of no global memory (vector) instruction executed - // even if one was the oldest in a WV of that vector SIMD unit - if (computeUnit->isGlbMem(unitId) && *glbMemInstAvail > 0) { - numCyclesWithNoInstrTypeIssued[unitId]++; - (*glbMemInstAvail)--; - } - - // count cycles of no shared memory (vector) instruction executed - // even if one was the oldest in a WV of that vector SIMD unit - if (computeUnit->isShrMem(unitId) && *shrMemInstAvail > 0) { - numCyclesWithNoInstrTypeIssued[unitId]++; - (*shrMemInstAvail)--; - } + // count cycles when no instruction to a specific execution resource + // is executed + numCyclesWithNoInstrTypeIssued[unitId]++; } else if (stage == BusyExec) { - // count the number of cycles an instruction to a specific unit - // was issued + // count the number of cycles an instruction to a specific execution + // resource type was issued numCyclesWithInstrTypeIssued[unitId]++; thisTimeInstExecuted = true; instrExecuted = true; @@ -102,14 +85,13 @@ ExecStage::collectStatistics(enum STAT_STATUS stage, int unitId) { } lastTimeInstExecuted = thisTimeInstExecuted; - // track the number of cycles we either issued one vector instruction - // or issued no instructions at all + // track the number of cycles we either issued at least + // instruction or issued no instructions at all if (instrExecuted) { numCyclesWithInstrIssued++; } else { numCyclesWithNoIssue++; } - spc.sample(executionResourcesUsed); } } @@ -122,25 +104,86 @@ ExecStage::initStatistics() thisTimeInstExecuted = false; } +std::string +ExecStage::dispStatusToStr(int i) +{ + std::string s("INVALID"); + switch (i) { + case EMPTY: + s = "EMPTY"; + break; + case SKIP: + s = "SKIP"; + break; + case EXREADY: + s = "EXREADY"; + break; + } + return s; +} + +void +ExecStage::dumpDispList() +{ + std::stringstream ss; + bool 
empty = true; + for (int i = 0; i < computeUnit->numExeUnits(); i++) { + DISPATCH_STATUS s = dispatchList->at(i).second; + ss << i << ": " << dispStatusToStr(s); + if (s != EMPTY) { + empty = false; + Wavefront *w = dispatchList->at(i).first; + ss << " SIMD[" << w->simdId << "] WV[" << w->wfDynId << "]: "; + ss << (w->instructionBuffer.front())->seqNum() << ": "; + ss << (w->instructionBuffer.front())->disassemble(); + } + ss << "\n"; + } + if (!empty) { + DPRINTF(GPUSched, "Dispatch List:\n%s", ss.str()); + } +} + void ExecStage::exec() { initStatistics(); - - for (int unitId = 0; unitId < (numSIMDs + numMemUnits); ++unitId) { - // if dispatch list for this execution resource is empty, - // skip this execution resource this cycle - if (dispatchList->at(unitId).second == EMPTY) { - collectStatistics(IdleExec, unitId); - continue; - } - - collectStatistics(BusyExec, unitId); - // execute an instruction for the WF - dispatchList->at(unitId).first->exec(); - // clear the dispatch list entry - dispatchList->at(unitId).second = EMPTY; - dispatchList->at(unitId).first = (Wavefront*)nullptr; + if (Debug::GPUSched) { + dumpDispList(); + } + for (int unitId = 0; unitId < computeUnit->numExeUnits(); ++unitId) { + DISPATCH_STATUS s = dispatchList->at(unitId).second; + switch (s) { + case EMPTY: + // Do not execute if empty, waiting for VRF reads, + // or LM tied to GM waiting for VRF reads + collectStatistics(IdleExec, unitId); + break; + case EXREADY: + { + collectStatistics(BusyExec, unitId); + Wavefront *w = dispatchList->at(unitId).first; + DPRINTF(GPUSched, "Exec[%d]: SIMD[%d] WV[%d]: %s\n", + unitId, w->simdId, w->wfDynId, + (w->instructionBuffer.front())->disassemble()); + DPRINTF(GPUSched, "dispatchList[%d] EXREADY->EMPTY\n", unitId); + dispatchList->at(unitId).first->exec(); + (computeUnit->scheduleStage).deleteFromSch(w); + dispatchList->at(unitId).second = EMPTY; + dispatchList->at(unitId).first->freeResources(); + dispatchList->at(unitId).first = nullptr; + break; + } + case SKIP: + collectStatistics(BusyExec, unitId); + DPRINTF(GPUSched, "dispatchList[%d] SKIP->EMPTY\n", unitId); + dispatchList->at(unitId).second = EMPTY; + dispatchList->at(unitId).first->freeResources(); + dispatchList->at(unitId).first = nullptr; + break; + default: + panic("Unknown dispatch status in exec()\n"); + } } collectStatistics(PostExec, 0); @@ -165,7 +208,7 @@ ExecStage::regStats() ; spc - .init(0, numSIMDs + numMemUnits, 1) + .init(0, computeUnit->numExeUnits(), 1) .name(name() + ".spc") .desc("Execution units active per cycle (Exec unit=SIMD,MemPipe)") ; @@ -177,25 +220,36 @@ ExecStage::regStats() ; numCyclesWithInstrTypeIssued - .init(numSIMDs + numMemUnits) - .name(name() + ".num_cycles_with_instrtype_issue") - .desc("Number of cycles at least one instruction of specific type " - "issued") + .init(computeUnit->numExeUnits()) + .name(name() + ".num_cycles_issue_exec_rsrc") + .desc("Number of cycles at least one instruction issued to " + "execution resource type") ; numCyclesWithNoInstrTypeIssued - .init(numSIMDs + numMemUnits) - .name(name() + ".num_cycles_with_instr_type_no_issue") - .desc("Number of cycles no instruction of specific type issued") + .init(computeUnit->numExeUnits()) + .name(name() + ".num_cycles_no_issue_exec_rsrc") + .desc("Number of clks no instructions issued to execution " + "resource type") ; - for (int i = 0; i < numSIMDs; ++i) { - numCyclesWithInstrTypeIssued.subname(i, csprintf("ALU%d",i)); - numCyclesWithNoInstrTypeIssued.subname(i, csprintf("ALU%d",i)); + int c = 0; + for (int i 
= 0; i < computeUnit->numVectorALUs; i++,c++) { + std::string s = "VectorALU" + std::to_string(i); + numCyclesWithNoInstrTypeIssued.subname(c, s); + numCyclesWithInstrTypeIssued.subname(c, s); + } + for (int i = 0; i < computeUnit->numScalarALUs; i++,c++) { + std::string s = "ScalarALU" + std::to_string(i); + numCyclesWithNoInstrTypeIssued.subname(c, s); + numCyclesWithInstrTypeIssued.subname(c, s); } + numCyclesWithNoInstrTypeIssued.subname(c, "VectorMemPipe"); + numCyclesWithInstrTypeIssued.subname(c++, "VectorMemPipe"); + + numCyclesWithNoInstrTypeIssued.subname(c, "SharedMemPipe"); + numCyclesWithInstrTypeIssued.subname(c++, "SharedMemPipe"); - numCyclesWithInstrTypeIssued.subname(numSIMDs, csprintf("GM")); - numCyclesWithNoInstrTypeIssued.subname(numSIMDs, csprintf("GM")); - numCyclesWithInstrTypeIssued.subname(numSIMDs + 1, csprintf("LM")); - numCyclesWithNoInstrTypeIssued.subname(numSIMDs + 1, csprintf("LM")); + numCyclesWithNoInstrTypeIssued.subname(c, "ScalarMemPipe"); + numCyclesWithInstrTypeIssued.subname(c++, "ScalarMemPipe"); } diff --git a/src/gpu-compute/exec_stage.hh b/src/gpu-compute/exec_stage.hh index 2e14a542e..670252cde 100644 --- a/src/gpu-compute/exec_stage.hh +++ b/src/gpu-compute/exec_stage.hh @@ -35,6 +35,7 @@ #define __EXEC_STAGE_HH__ #include +#include #include #include @@ -53,8 +54,9 @@ enum STAT_STATUS enum DISPATCH_STATUS { - EMPTY = 0, - FILLED + EMPTY = 0, // no wave present in dispatchList slot + EXREADY, // wave ready for execution + SKIP, // extra memory resource needed, Shared Mem. only }; // Execution stage. @@ -72,18 +74,21 @@ class ExecStage void init(ComputeUnit *cu); void exec(); + std::string dispStatusToStr(int j); + void dumpDispList(); + std::string name() { return _name; } void regStats(); // number of idle cycles Stats::Scalar numCyclesWithNoIssue; // number of busy cycles Stats::Scalar numCyclesWithInstrIssued; - // number of cycles (per execution unit) during which at least one - // instruction was issued to that unit + // number of cycles during which at least one + // instruction was issued to an execution resource type Stats::Vector numCyclesWithInstrTypeIssued; - // number of idle cycles (per execution unit) during which the unit issued - // no instruction targeting that unit, even though there is at least one - // Wavefront with such an instruction as the oldest + // number of idle cycles during which the scheduler + // issued no instructions targeting a specific + // execution resource type Stats::Vector numCyclesWithNoInstrTypeIssued; // SIMDs active per cycle Stats::Distribution spc; @@ -92,11 +97,6 @@ class ExecStage void collectStatistics(enum STAT_STATUS stage, int unitId); void initStatistics(); ComputeUnit *computeUnit; - uint32_t numSIMDs; - - // Number of memory execution resources; - // both global and local memory execution resources in CU - uint32_t numMemUnits; // List of waves which will be dispatched to // each execution resource. 
A FILLED implies @@ -108,18 +108,12 @@ class ExecStage // dispatchList is used to communicate between schedule // and exec stage std::vector> *dispatchList; - // flag per vector SIMD unit that is set when there is at least one - // WV that has a vector ALU instruction as the oldest in its - // Instruction Buffer - std::vector *vectorAluInstAvail; - int *glbMemInstAvail; - int *shrMemInstAvail; bool lastTimeInstExecuted; bool thisTimeInstExecuted; bool instrExecuted; Stats::Scalar numTransActiveIdle; Stats::Distribution idleDur; - uint32_t executionResourcesUsed; + int executionResourcesUsed; uint64_t idle_dur; std::string _name; }; diff --git a/src/gpu-compute/fetch_stage.cc b/src/gpu-compute/fetch_stage.cc index 4a2d4233f..cf0b39e70 100644 --- a/src/gpu-compute/fetch_stage.cc +++ b/src/gpu-compute/fetch_stage.cc @@ -36,18 +36,18 @@ #include "gpu-compute/compute_unit.hh" #include "gpu-compute/wavefront.hh" -FetchStage::FetchStage(const ComputeUnitParams* p) : numSIMDs(p->num_SIMDs), - computeUnit(nullptr) +FetchStage::FetchStage(const ComputeUnitParams* p) : + numVectorALUs(p->num_SIMDs), computeUnit(nullptr) { - for (int j = 0; j < numSIMDs; ++j) { + for (int j = 0; j < numVectorALUs; ++j) { FetchUnit newFetchUnit(p); - fetchUnit.push_back(newFetchUnit); + _fetchUnit.push_back(newFetchUnit); } } FetchStage::~FetchStage() { - fetchUnit.clear(); + _fetchUnit.clear(); } void @@ -56,17 +56,17 @@ FetchStage::init(ComputeUnit *cu) computeUnit = cu; _name = computeUnit->name() + ".FetchStage"; - for (int j = 0; j < numSIMDs; ++j) { - fetchUnit[j].bindWaveList(&computeUnit->wfList[j]); - fetchUnit[j].init(computeUnit); + for (int j = 0; j < numVectorALUs; ++j) { + _fetchUnit[j].bindWaveList(&computeUnit->wfList[j]); + _fetchUnit[j].init(computeUnit); } } void FetchStage::exec() { - for (int j = 0; j < numSIMDs; ++j) { - fetchUnit[j].exec(); + for (int j = 0; j < numVectorALUs; ++j) { + _fetchUnit[j].exec(); } } @@ -83,13 +83,13 @@ FetchStage::processFetchReturn(PacketPtr pkt) instFetchInstReturned.sample(num_instructions); uint32_t simdId = wavefront->simdId; - fetchUnit[simdId].processFetchReturn(pkt); + _fetchUnit[simdId].processFetchReturn(pkt); } void FetchStage::fetch(PacketPtr pkt, Wavefront *wavefront) { - fetchUnit[wavefront->simdId].fetch(pkt, wavefront); + _fetchUnit[wavefront->simdId].fetch(pkt, wavefront); } void diff --git a/src/gpu-compute/fetch_stage.hh b/src/gpu-compute/fetch_stage.hh index 310ce6f60..afaf81b5a 100644 --- a/src/gpu-compute/fetch_stage.hh +++ b/src/gpu-compute/fetch_stage.hh @@ -62,14 +62,15 @@ class FetchStage std::string name() { return _name; } void regStats(); Stats::Distribution instFetchInstReturned; + FetchUnit &fetchUnit(int simdId) { return _fetchUnit.at(simdId); } private: - uint32_t numSIMDs; + int numVectorALUs; ComputeUnit *computeUnit; // List of fetch units. 
A fetch unit is - // instantiated per SIMD - std::vector fetchUnit; + // instantiated per VALU/SIMD + std::vector _fetchUnit; std::string _name; }; diff --git a/src/gpu-compute/fetch_unit.cc b/src/gpu-compute/fetch_unit.cc index c567b71db..fb04cd27e 100644 --- a/src/gpu-compute/fetch_unit.cc +++ b/src/gpu-compute/fetch_unit.cc @@ -45,11 +45,9 @@ uint32_t FetchUnit::globalFetchUnitID; -FetchUnit::FetchUnit(const ComputeUnitParams* params) : - timingSim(true), - computeUnit(nullptr), - fetchScheduler(params), - waveList(nullptr) +FetchUnit::FetchUnit(const ComputeUnitParams* params) + : timingSim(true), computeUnit(nullptr), fetchScheduler(params), + waveList(nullptr), fetchDepth(params->fetch_depth) { } @@ -66,9 +64,14 @@ FetchUnit::init(ComputeUnit *cu) timingSim = computeUnit->shader->timingSim; fetchQueue.clear(); fetchStatusQueue.resize(computeUnit->shader->n_wf); - - for (int j = 0; j < computeUnit->shader->n_wf; ++j) { - fetchStatusQueue[j] = std::make_pair(waveList->at(j), false); + fetchBuf.resize(computeUnit->shader->n_wf, FetchBufDesc()); + + for (int i = 0; i < computeUnit->shader->n_wf; ++i) { + Wavefront *wf = waveList->at(i); + assert(wf->wfSlotId == i); + fetchStatusQueue[i] = std::make_pair(wf, false); + fetchBuf[i].allocateBuf(fetchDepth, computeUnit->cacheLineSize(), wf); + fetchBuf[i].decoder(&decoder); } fetchScheduler.bindList(&fetchQueue); @@ -77,6 +80,23 @@ FetchUnit::init(ComputeUnit *cu) void FetchUnit::exec() { + /** + * now we check if any of the fetch buffers have + * buffered instruction data that can be decoded + * and sent to its wavefront's instruction buffer. + * then we check if any of the fetch buffer entries + * can be released. we only check if we can + * release a buffer + */ + for (auto &fetch_buf : fetchBuf) { + if (!fetch_buf.hasFreeSpace()) { + fetch_buf.checkWaveReleaseBuf(); + } + if (fetch_buf.hasFetchDataToProcess()) { + fetch_buf.decodeInsts(); + } + } + // re-evaluate waves which are marked as not ready for fetch for (int j = 0; j < computeUnit->shader->n_wf; ++j) { // Following code assumes 64-bit opertaion and all insts are @@ -88,9 +108,10 @@ FetchUnit::exec() // 4 or less instructions and it can not have any branches to // prevent speculative instruction fetches if (!fetchStatusQueue[j].second) { - if (curWave->status == Wavefront::S_RUNNING && - curWave->instructionBuffer.size() <= 4 && - !curWave->instructionBufferHasBranch() && + if ((curWave->getStatus() == Wavefront::S_RUNNING || + curWave->getStatus() == Wavefront::S_WAITCNT) && + fetchBuf[j].hasFreeSpace() && + !curWave->stopFetch() && !curWave->pendingFetch) { fetchQueue.push_back(curWave); fetchStatusQueue[j].second = true; @@ -111,45 +132,38 @@ FetchUnit::exec() void FetchUnit::initiateFetch(Wavefront *wavefront) { - // calculate the virtual address to fetch from the SQC - Addr vaddr = wavefront->pc(); + assert(fetchBuf.at(wavefront->wfSlotId).hasFreeSpace()); /** - * the instruction buffer holds one instruction per entry, regardless - * of the underlying instruction's size. the PC, however, addresses - * instrutions on a 32b granularity so we must account for that here. - */ - for (int i = 0; i < wavefront->instructionBuffer.size(); ++i) { - vaddr += - wavefront->instructionBuffer.at(i)->staticInstruction()->instSize(); - } - vaddr = wavefront->basePtr + vaddr; + * calculate the virtual address to fetch from the SQC. the fetch + * buffer holds a configurable number of cache lines. 
we start + * fetching at the address of the cache line immediately following + * the buffered line(s). + */ + Addr vaddr = fetchBuf.at(wavefront->wfSlotId).nextFetchAddr(); - DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Initiating fetch translation: %#x\n", - computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, vaddr); + // this should already be aligned to a cache line + assert(vaddr == makeLineAddress(vaddr, + computeUnit->getCacheLineBits())); - // Since this is an instruction prefetch, if you're split then just finish - // out the current line. - int block_size = computeUnit->cacheLineSize(); - // check for split accesses - Addr split_addr = roundDown(vaddr + block_size - 1, block_size); - int size = block_size; + // shouldn't be fetching a line that is already buffered + assert(!fetchBuf.at(wavefront->wfSlotId).pcBuffered(vaddr)); - if (split_addr > vaddr) { - // misaligned access, just grab the rest of the line - size = split_addr - vaddr; - } + fetchBuf.at(wavefront->wfSlotId).reserveBuf(vaddr); + + DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Id%d: Initiate fetch " + "from pc: %d %#x\n", computeUnit->cu_id, wavefront->simdId, + wavefront->wfSlotId, wavefront->wfDynId, wavefront->pc(), vaddr); + + DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Initiating fetch translation: %#x\n", + computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, vaddr); // set up virtual request RequestPtr req = std::make_shared( - vaddr, size, Request::INST_FETCH, + vaddr, computeUnit->cacheLineSize(), Request::INST_FETCH, computeUnit->masterId(), 0, 0, nullptr); PacketPtr pkt = new Packet(req, MemCmd::ReadReq); - // This fetchBlock is kind of faux right now - because the translations so - // far don't actually return Data - uint64_t fetchBlock; - pkt->dataStatic(&fetchBlock); if (timingSim) { // SenderState needed on Return @@ -210,19 +224,23 @@ FetchUnit::fetch(PacketPtr pkt, Wavefront *wavefront) computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, pkt->req->getPaddr()); - // this is necessary because the GPU TLB receives packets instead of - // requests. when the translation is complete, all relevent fields in the - // request will be populated, but not in the packet. here we create the - // new packet so we can set the size, addr, and proper flags. + /** + * this is necessary because the GPU TLB receives packets instead of + * requests. when the translation is complete, all relevent fields in + * the request will be populated, but not in the packet. here we create + * the new packet so we can set the size, addr, and proper flags. + */ PacketPtr oldPkt = pkt; pkt = new Packet(oldPkt->req, oldPkt->cmd); delete oldPkt; - TheGpuISA::RawMachInst *data = - new TheGpuISA::RawMachInst[pkt->req->getSize() / - sizeof(TheGpuISA::RawMachInst)]; - - pkt->dataDynamic(data); + /** + * we should have reserved an entry in the fetch buffer + * for this cache line. here we get the pointer to the + * entry used to buffer this request's line data. 
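Because every fetch above is for a whole, line-aligned block, the address arithmetic reduces to masking off the low cache-line-offset bits; allocateBuf() further down requires the line size to be a power of two and derives cacheLineBits with floorLog2 for exactly this reason. A standalone illustration of the alignment and of the "next line after the last one fetched" rule (the 64-byte line size is an assumption for the example):

// Standalone illustration of the line alignment used by the fetch buffer:
// with a power-of-two line size, aligning an address is a mask of the low
// floorLog2(lineSize) bits, and the next line to fetch is one line later.
#include <cassert>
#include <cstdint>

using Addr = std::uint64_t;

static Addr
lineAddress(Addr addr, unsigned line_bits)
{
    return addr & ~((Addr(1) << line_bits) - 1);
}

int main()
{
    const unsigned lineBits = 6;                  // 64-byte lines (assumed)
    const Addr lineSize = Addr(1) << lineBits;

    Addr pc = 0x1fac;                             // arbitrary wave PC
    Addr firstLine = lineAddress(pc, lineBits);   // line holding the PC
    Addr nextLine = firstLine + lineSize;         // line fetched after it

    assert(firstLine == 0x1f80);
    assert(nextLine == 0x1fc0);
    assert(lineAddress(nextLine, lineBits) == nextLine);  // already aligned
    return 0;
}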
+ */ + pkt->dataStatic(fetchBuf.at(wavefront->wfSlotId) + .reservedBuf(pkt->req->getVaddr())); // New SenderState for the memory access pkt->senderState = new ComputeUnit::SQCPort::SenderState(wavefront); @@ -257,47 +275,15 @@ FetchUnit::processFetchReturn(PacketPtr pkt) Wavefront *wavefront = sender_state->wavefront; DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch addr %#x returned " - "%d bytes, %d instructions!\n", computeUnit->cu_id, - wavefront->simdId, wavefront->wfSlotId, pkt->req->getPaddr(), - pkt->req->getSize(), pkt->req->getSize() / - sizeof(TheGpuISA::RawMachInst)); + "%d bytes!\n", computeUnit->cu_id, wavefront->simdId, + wavefront->wfSlotId, pkt->req->getPaddr(), pkt->req->getSize()); if (wavefront->dropFetch) { assert(wavefront->instructionBuffer.empty()); + assert(!fetchBuf.at(wavefront->wfSlotId).hasFetchDataToProcess()); wavefront->dropFetch = false; } else { - TheGpuISA::RawMachInst *inst_index_ptr = - (TheGpuISA::RawMachInst*)pkt->getPtr(); - - assert(wavefront->instructionBuffer.size() <= 4); - - for (int i = 0; i < pkt->req->getSize() / - sizeof(TheGpuISA::RawMachInst); ++i) { - GPUStaticInst *inst_ptr = decoder.decode(inst_index_ptr[i]); - - assert(inst_ptr); - - if (inst_ptr->instSize() == 8) { - /** - * this instruction occupies 2 consecutive - * entries in the instruction array, the - * second of which contains a nullptr. so if - * this inst is 8 bytes we advance two entries - * instead of 1 - */ - ++i; - } - - DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: added %s\n", - computeUnit->cu_id, wavefront->simdId, - wavefront->wfSlotId, inst_ptr->disassemble()); - - GPUDynInstPtr gpuDynInst = - std::make_shared(computeUnit, wavefront, inst_ptr, - computeUnit->getAndIncSeqNum()); - - wavefront->instructionBuffer.push_back(gpuDynInst); - } + fetchBuf.at(wavefront->wfSlotId).fetchDone(pkt->req->getVaddr()); } wavefront->pendingFetch = false; @@ -306,8 +292,337 @@ FetchUnit::processFetchReturn(PacketPtr pkt) delete pkt; } +void +FetchUnit::flushBuf(int wfSlotId) +{ + fetchBuf.at(wfSlotId).flushBuf(); +} + void FetchUnit::bindWaveList(std::vector *wave_list) { waveList = wave_list; } + +/** FetchBufDesc */ +void +FetchUnit::FetchBufDesc::allocateBuf(int fetch_depth, int cache_line_size, + Wavefront *wf) +{ + wavefront = wf; + fetchDepth = fetch_depth; + maxIbSize = wavefront->maxIbSize; + cacheLineSize = cache_line_size; + maxFbSize = cacheLineSize * fetchDepth; + + // Calculate the number of bits to address a cache line + panic_if(!isPowerOf2(cacheLineSize), + "Cache line size should be a power of two."); + cacheLineBits = floorLog2(cacheLineSize); + + bufStart = new uint8_t[maxFbSize]; + readPtr = bufStart; + bufEnd = bufStart + maxFbSize; + + for (int i = 0; i < fetchDepth; ++i) { + freeList.emplace_back(readPtr + i * cacheLineSize); + } +} + +void +FetchUnit::FetchBufDesc::flushBuf() +{ + restartFromBranch = true; + /** + * free list may have some entries + * so we clear it here to avoid duplicates + */ + freeList.clear(); + bufferedPCs.clear(); + reservedPCs.clear(); + readPtr = bufStart; + + for (int i = 0; i < fetchDepth; ++i) { + freeList.push_back(bufStart + i * cacheLineSize); + } + + DPRINTF(GPUFetch, "WF[%d][%d]: Id%d Fetch dropped, flushing fetch " + "buffer\n", wavefront->simdId, wavefront->wfSlotId, + wavefront->wfDynId); +} + +Addr +FetchUnit::FetchBufDesc::nextFetchAddr() +{ + Addr next_line = 0; + + if (bufferedAndReservedLines()) { + Addr last_line_fetched = 0; + if (!reservedLines()) { + /** + * get the PC of the most recently fetched cache line, + * then return the 
address of the next line. + */ + last_line_fetched = bufferedPCs.rbegin()->first; + } else { + last_line_fetched = reservedPCs.rbegin()->first; + } + + next_line = last_line_fetched + cacheLineSize; + + /** + * should not be trying to fetch a line that has already + * been fetched. + */ + assert(bufferedPCs.find(next_line) == bufferedPCs.end()); + assert(reservedPCs.find(next_line) == reservedPCs.end()); + } else { + /** + * we do not have any buffered cache lines yet, so we + * assume this is the initial fetch, or the first fetch + * after a branch, and get the PC directly from the WF. + * in the case of a branch, we may not start at the + * beginning of a cache line, so we adjust the readPtr by + * the current PC's offset from the start of the line. + */ + next_line = makeLineAddress(wavefront->pc(), cacheLineBits); + readPtr = bufStart; + + /** + * if we are here we have no buffered lines. in the case we flushed + * the buffer due to a branch, we may need to start fetching from + * some offset from the start of the fetch buffer, so we adjust for + * that here. + */ + if (restartFromBranch) { + restartFromBranch = false; + int byte_offset + = wavefront->pc() - makeLineAddress(wavefront->pc(), + cacheLineBits); + readPtr += byte_offset; + } + } + + return next_line; +} + +void +FetchUnit::FetchBufDesc::reserveBuf(Addr vaddr) +{ + // we should have free buffer space, and the line + // at vaddr should not already be cached. + assert(hasFreeSpace()); + assert(bufferedPCs.find(vaddr) == bufferedPCs.end()); + assert(reservedPCs.find(vaddr) == reservedPCs.end()); + assert(bufferedAndReservedLines() < fetchDepth); + + DPRINTF(GPUFetch, "WF[%d][%d]: Id%d reserved fetch buffer entry " + "for PC = %#x\n", wavefront->simdId, wavefront->wfSlotId, + wavefront->wfDynId, vaddr); + + /** + * we reserve buffer space, by moving it out of the + * free list, however we do not mark the buffered + * line as valid until the fetch unit for this buffer + * has receieved the response from the memory system. + */ + uint8_t *inst_buf = freeList.front(); + reservedPCs.emplace(vaddr, inst_buf); + freeList.pop_front(); +} + +void +FetchUnit::FetchBufDesc::fetchDone(Addr vaddr) +{ + assert(bufferedPCs.find(vaddr) == bufferedPCs.end()); + DPRINTF(GPUFetch, "WF[%d][%d]: Id%d done fetching for addr %#x\n", + wavefront->simdId, wavefront->wfSlotId, + wavefront->wfDynId, vaddr); + + /** + * this address should have an entry reserved in the + * fetch buffer already, however it should be invalid + * until the fetch completes. 
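reserveBuf() above and the fetchDone() routine that follows move a fetch buffer entry through three states: on the free list, reserved while the fetch is outstanding, and buffered once the line data is valid. The standalone model below mirrors the container choices (a free list plus two address-keyed maps) but everything else, including the fixed 64-byte entries, is simplified for illustration:

// Standalone model of the fetch buffer entry lifecycle sketched above:
// free -> reserved (request outstanding) -> buffered (line data valid).
#include <cassert>
#include <cstdint>
#include <list>
#include <map>

using Addr = std::uint64_t;

class MiniFetchBuf
{
  public:
    explicit MiniFetchBuf(int depth)
    {
        for (int i = 0; i < depth; ++i)
            freeList.push_back(new std::uint8_t[64]());
    }

    ~MiniFetchBuf()
    {
        for (auto *buf : freeList) delete[] buf;
        for (auto &p : reservedPCs) delete[] p.second;
        for (auto &p : bufferedPCs) delete[] p.second;
    }

    bool hasFreeSpace() const { return !freeList.empty(); }

    // Claim an entry for the line at vaddr before sending the fetch.
    void
    reserveBuf(Addr vaddr)
    {
        assert(hasFreeSpace());
        reservedPCs.emplace(vaddr, freeList.front());
        freeList.pop_front();
    }

    // The response for vaddr arrived; its entry becomes valid.
    void
    fetchDone(Addr vaddr)
    {
        auto it = reservedPCs.find(vaddr);
        assert(it != reservedPCs.end());
        bufferedPCs.emplace(vaddr, it->second);
        reservedPCs.erase(it);
    }

  private:
    std::list<std::uint8_t*> freeList;
    std::map<Addr, std::uint8_t*> reservedPCs;   // lines still in flight
    std::map<Addr, std::uint8_t*> bufferedPCs;   // valid, decodable lines
};

int main()
{
    MiniFetchBuf buf(2);
    buf.reserveBuf(0x1000);
    buf.fetchDone(0x1000);
    assert(buf.hasFreeSpace());
    return 0;
}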
+ */ + auto reserved_pc = reservedPCs.find(vaddr); + assert(reserved_pc != reservedPCs.end()); + bufferedPCs.emplace(vaddr, reserved_pc->second); + + if (readPtr == bufEnd) { + readPtr = bufStart; + } + + reserved_pc->second = nullptr; + reservedPCs.erase(reserved_pc); +} + +bool +FetchUnit::FetchBufDesc::hasFetchDataToProcess() const +{ + return fetchBytesRemaining() >= sizeof(TheGpuISA::RawMachInst); +} + +void +FetchUnit::FetchBufDesc::checkWaveReleaseBuf() +{ + Addr cur_wave_pc = roundDown(wavefront->pc(), + wavefront->computeUnit->cacheLineSize()); + if (reservedPCs.find(cur_wave_pc) != reservedPCs.end()) { + DPRINTF(GPUFetch, "WF[%d][%d]: Id%d current wave PC(%#x) still " + "being fetched.\n", wavefront->simdId, wavefront->wfSlotId, + wavefront->wfDynId, cur_wave_pc); + + // should be reserved, but not buffered yet + assert(bufferedPCs.find(cur_wave_pc) == bufferedPCs.end()); + + return; + } + + auto current_buffered_pc = bufferedPCs.find(cur_wave_pc); + auto oldest_buffered_pc = bufferedPCs.begin(); + + DPRINTF(GPUFetch, "WF[%d][%d]: Id%d checking if PC block addr = %#x" + "(PC = %#x) can be released.\n", wavefront->simdId, + wavefront->wfSlotId, wavefront->wfDynId, cur_wave_pc, + wavefront->pc()); + +#ifdef DEBUG + int idx = 0; + for (const auto &buf_pc : bufferedPCs) { + DPRINTF(GPUFetch, "PC[%d] = %#x\n", idx, buf_pc.first); + ++idx; + } +#endif + + // if we haven't buffered data for this PC, we shouldn't + // be fetching from it. + assert(current_buffered_pc != bufferedPCs.end()); + + /** + * we're using a std::map so the addresses are sorted. if this + * PC is not the oldest one in the map, we must be fetching from + * a newer block, and we can release the oldest PC's fetch buffer + * entry back to the free list. + */ + if (current_buffered_pc != oldest_buffered_pc) { + DPRINTF(GPUFetch, "WF[%d][%d]: Id%d done fetching for PC = %#x, " + "removing it from the fetch buffer.\n", wavefront->simdId, + wavefront->wfSlotId, wavefront->wfDynId, + oldest_buffered_pc->first); + + freeList.emplace_back(oldest_buffered_pc->second); + oldest_buffered_pc->second = nullptr; + bufferedPCs.erase(oldest_buffered_pc); + DPRINTF(GPUFetch, "WF[%d][%d]: Id%d has %d lines buffered.\n", + wavefront->simdId, wavefront->wfSlotId, wavefront->wfDynId, + bufferedLines()); + } +} + +void +FetchUnit::FetchBufDesc::decodeInsts() +{ + assert(readPtr); + + if (splitDecode()) { + decodeSplitInst(); + } + + while (wavefront->instructionBuffer.size() < maxIbSize + && hasFetchDataToProcess()) { + if (splitDecode()) { + decodeSplitInst(); + } else { + TheGpuISA::MachInst mach_inst + = reinterpret_cast(readPtr); + GPUStaticInst *gpu_static_inst = _decoder->decode(mach_inst); + readPtr += gpu_static_inst->instSize(); + + assert(readPtr <= bufEnd); + + GPUDynInstPtr gpu_dyn_inst + = std::make_shared(wavefront->computeUnit, + wavefront, gpu_static_inst, + wavefront->computeUnit-> + getAndIncSeqNum()); + wavefront->instructionBuffer.push_back(gpu_dyn_inst); + + DPRINTF(GPUFetch, "WF[%d][%d]: Id%ld decoded %s (%d bytes). 
" + "%d bytes remain.\n", wavefront->simdId, + wavefront->wfSlotId, wavefront->wfDynId, + gpu_static_inst->disassemble(), + gpu_static_inst->instSize(), + fetchBytesRemaining()); + } + } +} + +void +FetchUnit::FetchBufDesc::decodeSplitInst() +{ + TheGpuISA::RawMachInst split_inst = 0; + int dword_size = sizeof(uint32_t); + int num_dwords = sizeof(TheGpuISA::RawMachInst) / dword_size; + + for (int i = 0; i < num_dwords; ++i) { + ((uint32_t*)(&split_inst))[i] = *reinterpret_cast(readPtr); + if (readPtr + dword_size >= bufEnd) { + readPtr = bufStart; + } + } + + assert(readPtr == bufStart); + + TheGpuISA::MachInst mach_inst + = reinterpret_cast(&split_inst); + GPUStaticInst *gpu_static_inst = _decoder->decode(mach_inst); + readPtr += (gpu_static_inst->instSize() - dword_size); + assert(readPtr < bufEnd); + + GPUDynInstPtr gpu_dyn_inst + = std::make_shared(wavefront->computeUnit, + wavefront, gpu_static_inst, + wavefront->computeUnit-> + getAndIncSeqNum()); + wavefront->instructionBuffer.push_back(gpu_dyn_inst); + + DPRINTF(GPUFetch, "WF[%d][%d]: Id%d decoded split inst %s (%#x) " + "(%d bytes). %d bytes remain in %d buffered lines.\n", + wavefront->simdId, wavefront->wfSlotId, wavefront->wfDynId, + gpu_static_inst->disassemble(), split_inst, + gpu_static_inst->instSize(), fetchBytesRemaining(), + bufferedLines()); +} + +bool +FetchUnit::FetchBufDesc::splitDecode() const +{ + /** + * if a read of a raw instruction would go beyond the end + * of the fetch buffer, then we must perform a split decode. + */ + bool is_split = (readPtr + sizeof(TheGpuISA::RawMachInst)) > bufEnd; + + return is_split; +} + +int +FetchUnit::FetchBufDesc::fetchBytesRemaining() const +{ + int bytes_remaining = 0; + + if (bufferedLines() && readPtr != bufEnd) { + auto last_buf_pc = bufferedPCs.rbegin(); + uint8_t *end_ptr = last_buf_pc->second + cacheLineSize; + int byte_diff = end_ptr - readPtr; + + if (end_ptr > readPtr) { + bytes_remaining = byte_diff; + } else if (end_ptr < readPtr) { + bytes_remaining = bufferedBytes() + byte_diff; + } + } + + assert(bytes_remaining <= bufferedBytes()); + return bytes_remaining; +} diff --git a/src/gpu-compute/fetch_unit.hh b/src/gpu-compute/fetch_unit.hh index 48ffdc110..2cfe3f0fe 100644 --- a/src/gpu-compute/fetch_unit.hh +++ b/src/gpu-compute/fetch_unit.hh @@ -36,7 +36,6 @@ #include #include -#include #include "arch/gpu_decoder.hh" #include "base/statistics.hh" @@ -58,9 +57,170 @@ class FetchUnit void initiateFetch(Wavefront *wavefront); void fetch(PacketPtr pkt, Wavefront *wavefront); void processFetchReturn(PacketPtr pkt); + void flushBuf(int wfSlotId); static uint32_t globalFetchUnitID; private: + /** + * fetch buffer descriptor. holds buffered + * instruction data in the fetch unit. + */ + class FetchBufDesc + { + public: + FetchBufDesc() : bufStart(nullptr), bufEnd(nullptr), + readPtr(nullptr), fetchDepth(0), maxIbSize(0), maxFbSize(0), + cacheLineSize(0), restartFromBranch(false), wavefront(nullptr), + _decoder(nullptr) + { + } + + ~FetchBufDesc() + { + delete[] bufStart; + } + + /** + * allocate the fetch buffer space, and set the fetch depth + * (number of lines that may be buffered), fetch size + * (cache line size), and parent WF for this fetch buffer. 
+ */ + void allocateBuf(int fetch_depth, int cache_line_size, Wavefront *wf); + + int + bufferedAndReservedLines() const + { + return bufferedLines() + reservedLines(); + } + + int bufferedLines() const { return bufferedPCs.size(); } + int bufferedBytes() const { return bufferedLines() * cacheLineSize; } + int reservedLines() const { return reservedPCs.size(); } + bool hasFreeSpace() const { return !freeList.empty(); } + void flushBuf(); + Addr nextFetchAddr(); + + /** + * reserve an entry in the fetch buffer for PC = vaddr, + */ + void reserveBuf(Addr vaddr); + + /** + * return a pointer to the raw fetch buffer data. + * this allows the fetch pkt to use this data directly + * to avoid unnecessary memcpy and malloc/new. + */ + uint8_t* + reservedBuf(Addr vaddr) const + { + auto reserved_pc = reservedPCs.find(vaddr); + assert(reserved_pc != reservedPCs.end()); + assert(reserved_pc == reservedPCs.begin()); + + return reserved_pc->second; + } + + void fetchDone(Addr vaddr); + + /** + * checks if the buffer contains valid data. this essentially + * tells fetch when there is data remaining that needs to be + * decoded into the WF's IB. + */ + bool hasFetchDataToProcess() const; + + /** + * each time the fetch stage is ticked, we check if there + * are any data in the fetch buffer that may be decoded and + * sent to the IB. because we are modeling the fetch buffer + * as a circular buffer, it is possible that an instruction + * can straddle the end/beginning of the fetch buffer, so + * decodeSplitInsts() handles that case. + */ + void decodeInsts(); + + /** + * checks if the wavefront can release any of its fetch + * buffer entries. this will occur when the WF's PC goes + * beyond any of the currently buffered cache lines. + */ + void checkWaveReleaseBuf(); + + void + decoder(TheGpuISA::Decoder *dec) + { + _decoder = dec; + } + + bool + pcBuffered(Addr pc) const + { + bool buffered = bufferedPCs.find(pc) != bufferedPCs.end() + && reservedPCs.find(pc) != reservedPCs.end(); + + return buffered; + } + + /** + * calculates the number of fetched bytes that have yet + * to be decoded. + */ + int fetchBytesRemaining() const; + + private: + void decodeSplitInst(); + + /** + * check if the next instruction to be processed out of + * the fetch buffer is split across the end/beginning of + * the fetch buffer. + */ + bool splitDecode() const; + + /** + * the set of PCs (fetch addresses) that are currently + * buffered. bufferedPCs are valid, reservedPCs are + * waiting for their buffers to be filled with valid + * fetch data. + */ + std::map bufferedPCs; + std::map reservedPCs; + + /** + * represents the fetch buffer free list. holds buffer space + * that is currently free. each pointer in this array must + * have enough space to hold a cache line. in reality we + * have one actual fetch buffer: 'bufStart', these pointers + * point to addresses within bufStart that are aligned to the + * cache line size. + */ + std::deque freeList; + + /** + * raw instruction buffer. holds cache line data associated with + * the set of PCs (fetch addresses) that are buffered here. + */ + uint8_t *bufStart; + uint8_t *bufEnd; + /** + * pointer that points to the next chunk of inst data to be + * decoded. 
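// A much-simplified standalone analogue of the reserve/fill/release flow
// managed by this class (all names below are illustrative, not part of
// gem5): line-sized slots come off a free list when a fetch is issued,
// become valid when the response returns, and are recycled once the
// wave's PC has moved past them.

#include <cassert>
#include <cstdint>
#include <deque>
#include <map>

struct ToyFetchBuf
{
    std::deque<uint8_t*> freeList;             // unused line-sized slots
    std::map<uint64_t, uint8_t*> reservedPCs;  // fetches in flight
    std::map<uint64_t, uint8_t*> bufferedPCs;  // valid lines, sorted by PC

    void reserve(uint64_t line_addr)
    {
        assert(!freeList.empty());
        reservedPCs.emplace(line_addr, freeList.front());
        freeList.pop_front();
    }

    void fetchDone(uint64_t line_addr)
    {
        auto it = reservedPCs.find(line_addr);
        assert(it != reservedPCs.end());
        bufferedPCs.emplace(line_addr, it->second);
        reservedPCs.erase(it);
    }

    void releaseOldest()
    {
        assert(!bufferedPCs.empty());
        freeList.push_back(bufferedPCs.begin()->second);
        bufferedPCs.erase(bufferedPCs.begin());
    }
};

// Usage idea: reserve(addr) when a fetch is sent, fetchDone(addr) when it
// returns, releaseOldest() once the wave's PC has passed the oldest line.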
+ */ + uint8_t *readPtr; + // how many lines the fetch unit may buffer + int fetchDepth; + // maximum size (in number of insts) of the WF's IB + int maxIbSize; + // maximum size (in bytes) of this fetch buffer + int maxFbSize; + int cacheLineSize; + int cacheLineBits; + bool restartFromBranch; + // wavefront whose IB is serviced by this fetch buffer + Wavefront *wavefront; + TheGpuISA::Decoder *_decoder; + }; + bool timingSim; ComputeUnit *computeUnit; TheGpuISA::Decoder decoder; @@ -82,6 +242,15 @@ class FetchUnit // Pointer to list of waves dispatched on to this SIMD unit std::vector *waveList; + // holds the fetch buffers. each wave has 1 entry. + std::vector fetchBuf; + /** + * number of cache lines we can fetch and buffer. + * this includes the currently fetched line (i.e., the + * line that corresponds to the WF's current PC), as + * well as any lines that may be prefetched. + */ + int fetchDepth; }; #endif // __FETCH_UNIT_HH__ diff --git a/src/gpu-compute/global_memory_pipeline.cc b/src/gpu-compute/global_memory_pipeline.cc index 64778f011..0bbacc44c 100644 --- a/src/gpu-compute/global_memory_pipeline.cc +++ b/src/gpu-compute/global_memory_pipeline.cc @@ -31,12 +31,13 @@ * POSSIBILITY OF SUCH DAMAGE. */ -#include "gpu-compute/global_memory_pipeline.hh" - +#define __STDC_FORMAT_MACROS +#include #include "debug/GPUCoalescer.hh" #include "debug/GPUMem.hh" #include "debug/GPUReg.hh" #include "gpu-compute/compute_unit.hh" +#include "gpu-compute/global_memory_pipeline.hh" #include "gpu-compute/gpu_dyn_inst.hh" #include "gpu-compute/shader.hh" #include "gpu-compute/vector_register_file.hh" @@ -44,7 +45,7 @@ GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams* p) : computeUnit(nullptr), gmQueueSize(p->global_mem_queue_size), - outOfOrderDataDelivery(p->out_of_order_data_delivery), inflightStores(0), + maxWaveRequests(p->max_wave_requests), inflightStores(0), inflightLoads(0) { } @@ -76,6 +77,31 @@ GlobalMemPipeline::coalescerReady(GPUDynInstPtr mp) const return true; } +void +GlobalMemPipeline::acqCoalescerToken(GPUDynInstPtr mp) +{ + // We require one token from the coalescer's uncoalesced table to + // proceed + int token_count = 1; + + DPRINTF(GPUCoalescer, "Acquiring %d token(s)\n", token_count); + assert(mp->computeUnit()->getTokenManager()->haveTokens(token_count)); + mp->computeUnit()->getTokenManager()->acquireTokens(token_count); +} + +bool +GlobalMemPipeline::outstandingReqsCheck(GPUDynInstPtr mp) const +{ + // Ensure we haven't exceeded the maximum number of vmem requests + // for this wavefront + if ((mp->wavefront()->outstandingReqsRdGm + + mp->wavefront()->outstandingReqsWrGm) >= maxWaveRequests) { + return false; + } + + return true; +} + void GlobalMemPipeline::exec() { @@ -87,42 +113,60 @@ GlobalMemPipeline::exec() // check the VRF to see if the operands of a load (or load component // of an atomic) are accessible - if ((m) && (m->isLoad() || m->isAtomicRet())) { + if (m && (m->isLoad() || m->isAtomicRet())) { w = m->wavefront(); - accessVrf = - w->computeUnit->vrf[w->simdId]-> - vrfOperandAccessReady(m->seqNum(), w, m, VrfAccessType::WRITE); + accessVrf = w->computeUnit->vrf[w->simdId]-> + canScheduleWriteOperandsFromLoad(w, m); + } if (m && m->latency.rdy() && computeUnit->glbMemToVrfBus.rdy() && - accessVrf && m->statusBitVector == VectorMask(0) && - (computeUnit->shader->coissue_return || - computeUnit->wfWait.at(m->pipeId).rdy())) { + accessVrf && (computeUnit->shader->coissue_return || + computeUnit->vectorGlobalMemUnit.rdy())) { w = m->wavefront(); + 
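        // The block below retires the oldest ready response in program
        // order: it completes the access, schedules the VRF writes for
        // loads and returning atomics, pops the entry from the ordered
        // response buffer, decrements the wave's outstanding request
        // counters, and samples the round-trip latency stats.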
DPRINTF(GPUMem, "CU%d: WF[%d][%d]: Completing global mem instr %s\n", + m->cu_id, m->simdId, m->wfSlotId, m->disassemble()); m->completeAcc(m); + if (m->isLoad() || m->isAtomicRet()) { + w->computeUnit->vrf[w->simdId]-> + scheduleWriteOperandsFromLoad(w, m); + } + completeRequest(m); - // Decrement outstanding register count - computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1); + Tick accessTime = curTick() - m->getAccessTime(); - if (m->isStore() || m->isAtomic()) { + // Decrement outstanding requests count + computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1); + if (m->isStore() || m->isAtomic() || m->isMemSync()) { + computeUnit->shader->sampleStore(accessTime); computeUnit->shader->ScheduleAdd(&w->outstandingReqsWrGm, m->time, -1); } - if (m->isLoad() || m->isAtomic()) { + if (m->isLoad() || m->isAtomic() || m->isMemSync()) { + computeUnit->shader->sampleLoad(accessTime); computeUnit->shader->ScheduleAdd(&w->outstandingReqsRdGm, m->time, -1); } + w->validateRequestCounters(); + + // Generate stats for round-trip time for vectory memory insts + // going all the way to memory and stats for individual cache + // blocks generated by the instruction. + m->profileRoundTripTime(curTick(), InstMemoryHop::Complete); + computeUnit->shader->sampleInstRoundTrip(m->getRoundTripTime()); + computeUnit->shader->sampleLineRoundTrip(m->getLineAddressTime()); + // Mark write bus busy for appropriate amount of time computeUnit->glbMemToVrfBus.set(m->time); if (!computeUnit->shader->coissue_return) - w->computeUnit->wfWait.at(m->pipeId).set(m->time); + w->computeUnit->vectorGlobalMemUnit.set(m->time); } // If pipeline has executed a global memory instruction @@ -148,13 +192,13 @@ GlobalMemPipeline::exec() mp->disassemble(), mp->seqNum()); // Memfences will not return tokens and must be issued so we should // not request one as this will deplete the token count until deadlock - if (!mp->isMemFence()) { + if (!mp->isMemSync()) { assert(mp->computeUnit()->getTokenManager()->haveTokens(1)); mp->computeUnit()->getTokenManager()->acquireTokens(1); } mp->initiateAcc(mp); - if (!outOfOrderDataDelivery && !mp->isMemFence()) { + if (((mp->isMemSync() && !mp->isEndOfKernel()) || !mp->isMemSync())) { /** * if we are not in out-of-order data delivery mode * then we keep the responses sorted in program order. 
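// Simplified sketch of the issue-side gating shown in this hunk and the one
// above (illustrative names and values; the real checks use the wavefront's
// outstanding-request counters and the CU's token manager): a wave may not
// exceed max_wave_requests in-flight vector memory requests, and each
// non-sync request consumes one coalescer token.

#include <cassert>

struct ToyIssueGate
{
    int maxWaveRequests;   // per-wave cap on outstanding vmem requests
    int tokens;            // free slots in the coalescer's uncoalesced table

    bool canIssue(int outstanding_rd, int outstanding_wr, bool is_mem_sync) const
    {
        if (outstanding_rd + outstanding_wr >= maxWaveRequests)
            return false;
        return is_mem_sync || tokens > 0;   // syncs do not take a token
    }

    void issue(bool is_mem_sync)
    {
        if (!is_mem_sync) {
            assert(tokens > 0);
            --tokens;
        }
    }
};

// e.g. ToyIssueGate gate{10, 4}; gate.canIssue(3, 2, false) yields true.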
@@ -178,19 +222,11 @@ GlobalMemPipeline::exec() GPUDynInstPtr GlobalMemPipeline::getNextReadyResp() { - if (outOfOrderDataDelivery) { - if (!gmReturnedLoads.empty()) { - return gmReturnedLoads.front(); - } else if (!gmReturnedStores.empty()) { - return gmReturnedStores.front(); - } - } else { - if (!gmOrderedRespBuffer.empty()) { - auto mem_req = gmOrderedRespBuffer.begin(); + if (!gmOrderedRespBuffer.empty()) { + auto mem_req = gmOrderedRespBuffer.begin(); - if (mem_req->second.second) { - return mem_req->second.first; - } + if (mem_req->second.second) { + return mem_req->second.first; } } @@ -208,51 +244,33 @@ GlobalMemPipeline::completeRequest(GPUDynInstPtr gpuDynInst) --inflightStores; } - if (outOfOrderDataDelivery) { - if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) { - assert(!gmReturnedLoads.empty()); - gmReturnedLoads.pop(); - } else if (gpuDynInst->isStore()) { - assert(!gmReturnedStores.empty()); - gmReturnedStores.pop(); - } - } else { - // we should only pop the oldest requst, and it - // should be marked as done if we are here - assert(gmOrderedRespBuffer.begin()->first == gpuDynInst->seqNum()); - assert(gmOrderedRespBuffer.begin()->second.first == gpuDynInst); - assert(gmOrderedRespBuffer.begin()->second.second); - // remove this instruction from the buffer by its - // unique seq ID - gmOrderedRespBuffer.erase(gpuDynInst->seqNum()); - } + // we should only pop the oldest requst, and it + // should be marked as done if we are here + assert(gmOrderedRespBuffer.begin()->first == gpuDynInst->seqNum()); + assert(gmOrderedRespBuffer.begin()->second.first == gpuDynInst); + assert(gmOrderedRespBuffer.begin()->second.second); + // remove this instruction from the buffer by its + // unique seq ID + gmOrderedRespBuffer.erase(gpuDynInst->seqNum()); } void GlobalMemPipeline::issueRequest(GPUDynInstPtr gpuDynInst) { + gpuDynInst->setAccessTime(curTick()); + gpuDynInst->profileRoundTripTime(curTick(), InstMemoryHop::Initiate); gmIssuedRequests.push(gpuDynInst); } void GlobalMemPipeline::handleResponse(GPUDynInstPtr gpuDynInst) { - if (outOfOrderDataDelivery) { - if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) { - assert(isGMLdRespFIFOWrRdy()); - gmReturnedLoads.push(gpuDynInst); - } else { - assert(isGMStRespFIFOWrRdy()); - gmReturnedStores.push(gpuDynInst); - } - } else { - auto mem_req = gmOrderedRespBuffer.find(gpuDynInst->seqNum()); - // if we are getting a response for this mem request, - // then it ought to already be in the ordered response - // buffer - assert(mem_req != gmOrderedRespBuffer.end()); - mem_req->second.second = true; - } + auto mem_req = gmOrderedRespBuffer.find(gpuDynInst->seqNum()); + // if we are getting a response for this mem request, + // then it ought to already be in the ordered response + // buffer + assert(mem_req != gmOrderedRespBuffer.end()); + mem_req->second.second = true; } void diff --git a/src/gpu-compute/global_memory_pipeline.hh b/src/gpu-compute/global_memory_pipeline.hh index 2f83185a9..6fb1db7b4 100644 --- a/src/gpu-compute/global_memory_pipeline.hh +++ b/src/gpu-compute/global_memory_pipeline.hh @@ -60,52 +60,34 @@ class GlobalMemPipeline void init(ComputeUnit *cu); void exec(); - std::queue &getGMStRespFIFO() { return gmReturnedStores; } - std::queue &getGMLdRespFIFO() { return gmReturnedLoads; } - /** - * find the next ready response to service. for OoO mode we - * simply pop the oldest (based on when the response was - * received) response in the response FIFOs. 
for in-order mode - * we pop the oldest (in program order) response, and only if - * it is marked as done. + * Find the next ready response to service. In order to ensure + * that no waitcnts are violated, we pop the oldest (in program order) + * response, and only if it is marked as done. This is because waitcnt + * values expect memory operations to complete and decrement their + * counter values in program order. */ GPUDynInstPtr getNextReadyResp(); /** * once a memory request is finished we remove it from the - * buffer. this method determines which response buffer - * we're using based on the mode (in-order vs. OoO). + * buffer. */ void completeRequest(GPUDynInstPtr gpuDynInst); /** - * issues a request to the pipeline - i.e., enqueue it - * in the request buffer. + * Issues a request to the pipeline (i.e., enqueue it + * in the request buffer). */ void issueRequest(GPUDynInstPtr gpuDynInst); /** - * this method handles responses sent to this GM pipeline by the - * CU. in the case of in-order delivery it simply marks the reqeust - * as done in the ordered buffer to indicate that the requst is - * finished. for out-of-order data delivery, the requests are enqueued - * (in the order in which they are received) in the response FIFOs. + * This method handles responses sent to this GM pipeline by the + * CU. Simply marks the reqeust as done in the ordered buffer to + * indicate that the requst is finished. */ void handleResponse(GPUDynInstPtr gpuDynInst); - bool - isGMLdRespFIFOWrRdy() const - { - return gmReturnedLoads.size() < gmQueueSize; - } - - bool - isGMStRespFIFOWrRdy() const - { - return gmReturnedStores.size() < gmQueueSize; - } - bool isGMReqFIFOWrRdy(uint32_t pendReqs=0) const { @@ -114,7 +96,6 @@ class GlobalMemPipeline const std::string &name() const { return _name; } void regStats(); - void incLoadVRFBankConflictCycles(int num_cycles) { @@ -122,12 +103,15 @@ class GlobalMemPipeline } bool coalescerReady(GPUDynInstPtr mp) const; + bool outstandingReqsCheck(GPUDynInstPtr mp) const; + + void acqCoalescerToken(GPUDynInstPtr mp); private: ComputeUnit *computeUnit; std::string _name; int gmQueueSize; - bool outOfOrderDataDelivery; + int maxWaveRequests; // number of cycles of delaying the update of a VGPR that is the // target of a load instruction (or the load component of an atomic) @@ -143,12 +127,11 @@ class GlobalMemPipeline int globalMemSize; /* - * this buffer holds the memory responses when in-order data - * deilvery is used - the responses are ordered by their unique - * sequence number, which is monotonically increasing. when a - * memory request returns its "done" flag is set to true. during - * each tick the the GM pipeline will check if the oldest request - * is finished, and if so it will be removed from the queue. + * This buffer holds the memory responses in order data - the responses + * are ordered by their unique sequence number, which is monotonically + * increasing. When a memory request returns its "done" flag is set to + * true. During each tick the the GM pipeline will check if the oldest + * request is finished, and if so it will be removed from the queue. 
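// A minimal standalone illustration of this in-order bookkeeping (the
// container below mirrors the idea, not the actual gem5 types): entries are
// keyed by a monotonically increasing sequence number, responses may be
// marked done out of order, but only the oldest entry is ever popped.

#include <cassert>
#include <cstdint>
#include <map>
#include <utility>

int main()
{
    // seqNum -> (instruction id, done flag)
    std::map<uint64_t, std::pair<int, bool>> ordered_resp;

    ordered_resp.emplace(1, std::make_pair(101, false));
    ordered_resp.emplace(2, std::make_pair(102, false));

    ordered_resp[2].second = true;                 // younger request returns first
    assert(!ordered_resp.begin()->second.second);  // oldest not done, nothing retires

    ordered_resp[1].second = true;                 // oldest request returns
    assert(ordered_resp.begin()->second.second);
    ordered_resp.erase(ordered_resp.begin());      // retire in program order
    return 0;
}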
* * key: memory instruction's sequence ID * @@ -161,14 +144,6 @@ class GlobalMemPipeline // Global Memory Request FIFO: all global memory requests // are issued to this FIFO from the memory pipelines std::queue gmIssuedRequests; - - // Globa Store Response FIFO: all responses of global memory - // stores are sent to this FIFO from TCP - std::queue gmReturnedStores; - - // Global Load Response FIFO: all responses of global memory - // loads are sent to this FIFO from TCP - std::queue gmReturnedLoads; }; #endif // __GLOBAL_MEMORY_PIPELINE_HH__ diff --git a/src/gpu-compute/gpu_command_processor.cc b/src/gpu-compute/gpu_command_processor.cc new file mode 100644 index 000000000..b6205ac13 --- /dev/null +++ b/src/gpu-compute/gpu_command_processor.cc @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2018 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Authors: Anthony Gutierrez + */ + +#include "gpu-compute/gpu_command_processor.hh" + +#include "debug/GPUCommandProc.hh" +#include "debug/GPUKernelInfo.hh" +#include "gpu-compute/dispatcher.hh" +#include "params/GPUCommandProcessor.hh" + +GPUCommandProcessor::GPUCommandProcessor(const Params *p) + : HSADevice(p), dispatcher(*p->dispatcher) +{ + dispatcher.setCommandProcessor(this); +} + +/** + * submitDispatchPkt() is the entry point into the CP from the HSAPP + * and is only meant to be used with AQL kernel dispatch packets. + * After the HSAPP receives and extracts an AQL packet, it sends + * it to the CP, which is responsible for gathering all relevant + * information about a task, initializing CU state, and sending + * it to the dispatcher for WG creation and dispatch. + * + * First we need capture all information from the the AQL pkt and + * the code object, then store it in an HSAQueueEntry. Once the + * packet and code are extracted, we extract information from the + * queue descriptor that the CP needs to perform state initialization + * on the CU. 
Finally we call dispatch() to send the task to the + * dispatcher. When the task completely finishes, we call finishPkt() + * on the HSA packet processor in order to remove the packet from the + * queue, and notify the runtime that the task has completed. + */ +void +GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id, + Addr host_pkt_addr) +{ + static int dynamic_task_id = 0; + _hsa_dispatch_packet_t *disp_pkt = (_hsa_dispatch_packet_t*)raw_pkt; + + /** + * we need to read a pointer in the application's address + * space to pull out the kernel code descriptor. + */ + auto *tc = sys->threads[0]; + auto &virt_proxy = tc->getVirtProxy(); + + /** + * The kernel_object is a pointer to the machine code, whose entry + * point is an 'amd_kernel_code_t' type, which is included in the + * kernel binary, and describes various aspects of the kernel. The + * desired entry is the 'kernel_code_entry_byte_offset' field, + * which provides the byte offset (positive or negative) from the + * address of the amd_kernel_code_t to the start of the machine + * instructions. + */ + AMDKernelCode akc; + virt_proxy.readBlob(disp_pkt->kernel_object, (uint8_t*)&akc, + sizeof(AMDKernelCode)); + + DPRINTF(GPUCommandProc, "GPU machine code is %lli bytes from start of the " + "kernel object\n", akc.kernel_code_entry_byte_offset); + + Addr machine_code_addr = (Addr)disp_pkt->kernel_object + + akc.kernel_code_entry_byte_offset; + + DPRINTF(GPUCommandProc, "Machine code starts at addr: %#x\n", + machine_code_addr); + + Addr kern_name_addr(0); + virt_proxy.readBlob(akc.runtime_loader_kernel_symbol + 0x10, + (uint8_t*)&kern_name_addr, 0x8); + + std::string kernel_name; + virt_proxy.readString(kernel_name, kern_name_addr); + + DPRINTF(GPUKernelInfo, "Kernel name: %s\n", kernel_name.c_str()); + + HSAQueueEntry *task = new HSAQueueEntry(kernel_name, queue_id, + dynamic_task_id, raw_pkt, &akc, host_pkt_addr, machine_code_addr); + + DPRINTF(GPUCommandProc, "Task ID: %i Got AQL: wg size (%dx%dx%d), " + "grid size (%dx%dx%d) kernarg addr: %#x, completion " + "signal addr:%#x\n", dynamic_task_id, disp_pkt->workgroup_size_x, + disp_pkt->workgroup_size_y, disp_pkt->workgroup_size_z, + disp_pkt->grid_size_x, disp_pkt->grid_size_y, + disp_pkt->grid_size_z, disp_pkt->kernarg_address, + disp_pkt->completion_signal); + + DPRINTF(GPUCommandProc, "Extracted code object: %s (num vector regs: %d, " + "num scalar regs: %d, code addr: %#x, kernarg size: %d, " + "LDS size: %d)\n", kernel_name, task->numVectorRegs(), + task->numScalarRegs(), task->codeAddr(), 0, 0); + + initABI(task); + ++dynamic_task_id; +} + +/** + * submitVendorPkt() is for accepting vendor-specific packets from + * the HSAPP. Vendor-specific packets may be used by the runtime to + * send commands to the HSA device that are specific to a particular + * vendor. The vendor-specific packets should be defined by the vendor + * in the runtime. + */ + +/** + * TODO: For now we simply tell the HSAPP to finish the packet, + * however a future patch will update this method to provide + * the proper handling of any required vendor-specific packets. + * In the version of ROCm that is currently supported (1.6) + * the runtime will send packets that direct the CP to + * invalidate the GPUs caches. We do this automatically on + * each kernel launch in the CU, so this is safe for now. 
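// Worked example of the entry-point calculation performed in
// submitDispatchPkt() above (the numbers are hypothetical): the machine
// code address is the AQL packet's kernel_object pointer plus the signed
// kernel_code_entry_byte_offset read from the AMD kernel code object.

#include <cassert>
#include <cstdint>

int main()
{
    uint64_t kernel_object = 0x7f0000001000ULL;  // example disp_pkt->kernel_object
    int64_t entry_byte_offset = 0x100;           // example kernel_code_entry_byte_offset

    uint64_t machine_code_addr = kernel_object + entry_byte_offset;
    assert(machine_code_addr == 0x7f0000001100ULL);
    return 0;
}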
+ */ +void +GPUCommandProcessor::submitVendorPkt(void *raw_pkt, uint32_t queue_id, + Addr host_pkt_addr) +{ + hsaPP->finishPkt(raw_pkt, queue_id); +} + +/** + * Once the CP has finished extracting all relevant information about + * a task and has initialized the ABI state, we send a description of + * the task to the dispatcher. The dispatcher will create and dispatch + * WGs to the CUs. + */ +void +GPUCommandProcessor::dispatchPkt(HSAQueueEntry *task) +{ + dispatcher.dispatch(task); +} + +/** + * The CP is responsible for traversing all HSA-ABI-related data + * structures from memory and initializing the ABI state. + * Information provided by the MQD, AQL packet, and code object + * metadata will be used to initialze register file state. + */ +void +GPUCommandProcessor::initABI(HSAQueueEntry *task) +{ + auto *readDispIdOffEvent = new ReadDispIdOffsetDmaEvent(*this, task); + + Addr hostReadIdxPtr + = hsaPP->getQueueDesc(task->queueId())->hostReadIndexPtr; + + dmaReadVirt(hostReadIdxPtr + sizeof(hostReadIdxPtr), + sizeof(readDispIdOffEvent->readDispIdOffset), readDispIdOffEvent, + &readDispIdOffEvent->readDispIdOffset); +} + +System* +GPUCommandProcessor::system() +{ + return sys; +} + +AddrRangeList +GPUCommandProcessor::getAddrRanges() const +{ + AddrRangeList ranges; + return ranges; +} + +void +GPUCommandProcessor::setShader(Shader *shader) +{ + _shader = shader; +} + +Shader* +GPUCommandProcessor::shader() +{ + return _shader; +} + +GPUCommandProcessor* +GPUCommandProcessorParams::create() +{ + return new GPUCommandProcessor(this); +} diff --git a/src/gpu-compute/gpu_command_processor.hh b/src/gpu-compute/gpu_command_processor.hh new file mode 100644 index 000000000..7253dd421 --- /dev/null +++ b/src/gpu-compute/gpu_command_processor.hh @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2018 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Authors: Anthony Gutierrez + */ + +/** + * @file + * The GPUCommandProcessor (CP) is responsible for accepting commands, in + * the form of HSA AQL packets, from the HSA packet processor (HSAPP). The CP + * works with several components, including the HSAPP and the dispatcher. + * When the HSAPP sends a ready task to the CP, it will perform the necessary + * operations to extract relevant data structures from memory, such as the + * AQL queue descriptor and AQL packet, and initializes register state for the + * task's wavefronts. + */ + +#ifndef __DEV_HSA_GPU_COMMAND_PROCESSOR_HH__ +#define __DEV_HSA_GPU_COMMAND_PROCESSOR_HH__ + +#include "dev/hsa/hsa_device.hh" +#include "gpu-compute/hsa_queue_entry.hh" + +struct GPUCommandProcessorParams; +class GPUDispatcher; +class Shader; + +class GPUCommandProcessor : public HSADevice +{ + public: + typedef GPUCommandProcessorParams Params; + + GPUCommandProcessor() = delete; + GPUCommandProcessor(const Params *p); + + void setShader(Shader *shader); + Shader* shader(); + + void submitDispatchPkt(void *raw_pkt, uint32_t queue_id, + Addr host_pkt_addr) override; + void submitVendorPkt(void *raw_pkt, uint32_t queue_id, + Addr host_pkt_addr) override; + void dispatchPkt(HSAQueueEntry *task); + + Tick write(PacketPtr pkt) override { return 0; } + Tick read(PacketPtr pkt) override { return 0; } + AddrRangeList getAddrRanges() const override; + System *system(); + + private: + Shader *_shader; + GPUDispatcher &dispatcher; + + void initABI(HSAQueueEntry *task); + + /** + * Perform a DMA read of the read_dispatch_id_field_base_byte_offset + * field, which follows directly after the read_dispatch_id (the read + * pointer) in the amd_hsa_queue_t struct (aka memory queue descriptor + * (MQD)), to find the base address of the MQD. The MQD is the runtime's + * soft representation of a HW queue descriptor (HQD). + * + * Any fields below the read dispatch ID in the amd_hsa_queue_t should + * not change according to the HSA standard, therefore we should be able + * to get them based on their known relative position to the read dispatch + * ID. + */ + class ReadDispIdOffsetDmaEvent : public DmaCallback + { + public: + ReadDispIdOffsetDmaEvent(GPUCommandProcessor &gpu_cmd_proc, + HSAQueueEntry *task) + : DmaCallback(), readDispIdOffset(0), gpuCmdProc(gpu_cmd_proc), + _task(task) + { + } + + void + process() override + { + /** + * Now that the read pointer's offset from the base of + * the MQD is known, we can use that to calculate the + * the address of the MQD itself, the dispatcher will + * DMA that into the HSAQueueEntry when a kernel is + * launched. + */ + _task->hostAMDQueueAddr + = gpuCmdProc.hsaPP->getQueueDesc(_task->queueId()) + ->hostReadIndexPtr - readDispIdOffset; + + /** + * DMA a copy of the MQD into the task. Some fields of + * the MQD will be used to initialize register state. + */ + auto *mqdDmaEvent = new MQDDmaEvent(gpuCmdProc, _task); + gpuCmdProc.dmaReadVirt(_task->hostAMDQueueAddr, + sizeof(_amd_queue_t), mqdDmaEvent, + &_task->amdQueue); + } + + uint32_t readDispIdOffset; + + private: + GPUCommandProcessor &gpuCmdProc; + HSAQueueEntry *_task; + }; + + /** + * Perform a DMA read of the MQD that corresponds to a hardware + * queue descriptor (HQD). We store a copy of the MQD in the + * HSAQueueEntry object so we can send a copy of it along with + * a dispatch packet, which is needed to initialize register + * state. 
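// Illustrative sketch (hypothetical values) of the address recovery done in
// ReadDispIdOffsetDmaEvent::process() above: the DMA returns the byte
// offset of the read-dispatch-id field within the MQD, so subtracting that
// offset from the queue's hostReadIndexPtr yields the MQD base address.

#include <cassert>
#include <cstdint>

int main()
{
    uint64_t host_read_index_ptr = 0x20001040ULL;  // example address of the read index
    uint32_t read_disp_id_offset = 0x40;           // example offset of that field in the MQD

    uint64_t mqd_base = host_read_index_ptr - read_disp_id_offset;
    assert(mqd_base == 0x20001000ULL);
    return 0;
}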
+ */ + class MQDDmaEvent : public DmaCallback + { + public: + MQDDmaEvent(GPUCommandProcessor &gpu_cmd_proc, HSAQueueEntry *task) + : DmaCallback(), gpuCmdProc(gpu_cmd_proc), _task(task) + { + } + + void + process() override + { + gpuCmdProc.dispatchPkt(_task); + } + + private: + GPUCommandProcessor &gpuCmdProc; + HSAQueueEntry *_task; + }; +}; + +#endif // __DEV_HSA_GPU_COMMAND_PROCESSOR_HH__ diff --git a/src/gpu-compute/gpu_compute_driver.cc b/src/gpu-compute/gpu_compute_driver.cc new file mode 100644 index 000000000..287c2a19a --- /dev/null +++ b/src/gpu-compute/gpu_compute_driver.cc @@ -0,0 +1,417 @@ +/* + * Copyright (c) 2015-2018 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Authors: Sooraj Puthoor + * Anthony Gutierrez + */ + +#include "gpu-compute/gpu_compute_driver.hh" + +#include "cpu/thread_context.hh" +#include "debug/GPUDriver.hh" +#include "dev/hsa/hsa_device.hh" +#include "dev/hsa/hsa_packet_processor.hh" +#include "dev/hsa/kfd_ioctl.h" +#include "params/GPUComputeDriver.hh" +#include "sim/syscall_emul_buf.hh" + +GPUComputeDriver::GPUComputeDriver(Params *p) + : HSADriver(p) +{ + DPRINTF(GPUDriver, "Constructing KFD: device\n"); +} + +int +GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf) +{ + auto &virt_proxy = tc->getVirtProxy(); + + switch (req) { + case AMDKFD_IOC_GET_VERSION: + { + DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_GET_VERSION\n"); + + TypedBufferArg args(ioc_buf); + args->major_version = 1; + args->minor_version = 0; + + args.copyOut(virt_proxy); + } + break; + case AMDKFD_IOC_CREATE_QUEUE: + { + DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_CREATE_QUEUE\n"); + + allocateQueue(virt_proxy, ioc_buf); + + DPRINTF(GPUDriver, "Creating queue %d\n", queueId); + } + break; + case AMDKFD_IOC_DESTROY_QUEUE: + { + TypedBufferArg args(ioc_buf); + args.copyIn(virt_proxy); + DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_DESTROY_QUEUE;" \ + "queue offset %d\n", args->queue_id); + device->hsaPacketProc().unsetDeviceQueueDesc(args->queue_id); + } + break; + case AMDKFD_IOC_SET_MEMORY_POLICY: + { + warn("unimplemented ioctl: AMDKFD_IOC_SET_MEMORY_POLICY\n"); + } + break; + case AMDKFD_IOC_GET_CLOCK_COUNTERS: + { + DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_GET_CLOCK_COUNTERS\n"); + + TypedBufferArg args(ioc_buf); + args.copyIn(virt_proxy); + + // Set nanosecond resolution + args->system_clock_freq = 1000000000; + + /** + * Derive all clock counters based on the tick. All + * device clocks are identical and perfectly in sync. + */ + uint64_t elapsed_nsec = curTick() / SimClock::Int::ns; + args->gpu_clock_counter = elapsed_nsec; + args->cpu_clock_counter = elapsed_nsec; + args->system_clock_counter = elapsed_nsec; + + args.copyOut(virt_proxy); + } + break; + case AMDKFD_IOC_GET_PROCESS_APERTURES: + { + DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_GET_PROCESS_APERTURES\n"); + + TypedBufferArg args(ioc_buf); + args->num_of_nodes = 1; + + /** + * Set the GPUVM/LDS/Scratch APEs exactly as they + * are in the real driver, see the KFD driver + * in the ROCm Linux kernel source: + * drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c + */ + for (int i = 0; i < args->num_of_nodes; ++i) { + /** + * While the GPU node numbers start at 0, we add 1 + * to force the count to start at 1. This is to + * ensure that the base/limit addresses are + * calculated correctly. 
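// Worked example of the aperture arithmetic implemented by the
// gpuVmApeBase()/gpuVmApeLimit() helpers defined later in this file,
// evaluated for GPU node 1 (the +1 noted above):

#include <cassert>
#include <cstdint>

int main()
{
    int gpu_num = 1;   // node numbering starts at 1, as described above

    uint64_t vm_base = ((uint64_t)gpu_num << 61) + 0x1000000000000ULL;
    uint64_t vm_limit = (vm_base & 0xFFFFFF0000000000ULL) | 0xFFFFFFFFFFULL;

    assert(vm_base == 0x2001000000000000ULL);
    assert(vm_limit == 0x200100FFFFFFFFFFULL);

    // bits [63:47] of the base are 0x4002, i.e. neither 0 nor 0x1ffff, so
    // the aperture stays out of the CPU's usable address ranges, which is
    // exactly what the asserts in this ioctl handler check.
    assert((vm_base >> 47) == 0x4002ULL);
    return 0;
}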
+ */ + args->process_apertures[i].scratch_base + = scratchApeBase(i + 1); + args->process_apertures[i].scratch_limit = + scratchApeLimit(args->process_apertures[i].scratch_base); + + args->process_apertures[i].lds_base = ldsApeBase(i + 1); + args->process_apertures[i].lds_limit = + ldsApeLimit(args->process_apertures[i].lds_base); + + args->process_apertures[i].gpuvm_base = gpuVmApeBase(i + 1); + args->process_apertures[i].gpuvm_limit = + gpuVmApeLimit(args->process_apertures[i].gpuvm_base); + + // NOTE: Must match ID populated by hsaTopology.py + args->process_apertures[i].gpu_id = 2765; + + DPRINTF(GPUDriver, "GPUVM base for node[%i] = %#x\n", i, + args->process_apertures[i].gpuvm_base); + DPRINTF(GPUDriver, "GPUVM limit for node[%i] = %#x\n", i, + args->process_apertures[i].gpuvm_limit); + + DPRINTF(GPUDriver, "LDS base for node[%i] = %#x\n", i, + args->process_apertures[i].lds_base); + DPRINTF(GPUDriver, "LDS limit for node[%i] = %#x\n", i, + args->process_apertures[i].lds_limit); + + DPRINTF(GPUDriver, "Scratch base for node[%i] = %#x\n", i, + args->process_apertures[i].scratch_base); + DPRINTF(GPUDriver, "Scratch limit for node[%i] = %#x\n", i, + args->process_apertures[i].scratch_limit); + + /** + * The CPU's 64b address space can only use the + * areas with VA[63:47] == 0x1ffff or VA[63:47] == 0, + * therefore we must ensure that the apertures do not + * fall in the CPU's address space. + */ + assert(bits(args->process_apertures[i].scratch_base, 63, + 47) != 0x1ffff); + assert(bits(args->process_apertures[i].scratch_base, 63, + 47) != 0); + assert(bits(args->process_apertures[i].scratch_limit, 63, + 47) != 0x1ffff); + assert(bits(args->process_apertures[i].scratch_limit, 63, + 47) != 0); + assert(bits(args->process_apertures[i].lds_base, 63, + 47) != 0x1ffff); + assert(bits(args->process_apertures[i].lds_base, 63, + 47) != 0); + assert(bits(args->process_apertures[i].lds_limit, 63, + 47) != 0x1ffff); + assert(bits(args->process_apertures[i].lds_limit, 63, + 47) != 0); + assert(bits(args->process_apertures[i].gpuvm_base, 63, + 47) != 0x1ffff); + assert(bits(args->process_apertures[i].gpuvm_base, 63, + 47) != 0); + assert(bits(args->process_apertures[i].gpuvm_limit, 63, + 47) != 0x1ffff); + assert(bits(args->process_apertures[i].gpuvm_limit, 63, + 47) != 0); + } + + args.copyOut(virt_proxy); + } + break; + case AMDKFD_IOC_UPDATE_QUEUE: + { + warn("unimplemented ioctl: AMDKFD_IOC_UPDATE_QUEUE\n"); + } + break; + case AMDKFD_IOC_CREATE_EVENT: + { + warn("unimplemented ioctl: AMDKFD_IOC_CREATE_EVENT\n"); + } + break; + case AMDKFD_IOC_DESTROY_EVENT: + { + warn("unimplemented ioctl: AMDKFD_IOC_DESTROY_EVENT\n"); + } + break; + case AMDKFD_IOC_SET_EVENT: + { + warn("unimplemented ioctl: AMDKFD_IOC_SET_EVENT\n"); + } + break; + case AMDKFD_IOC_RESET_EVENT: + { + warn("unimplemented ioctl: AMDKFD_IOC_RESET_EVENT\n"); + } + break; + case AMDKFD_IOC_WAIT_EVENTS: + { + warn("unimplemented ioctl: AMDKFD_IOC_WAIT_EVENTS\n"); + } + break; + case AMDKFD_IOC_DBG_REGISTER: + { + warn("unimplemented ioctl: AMDKFD_IOC_DBG_REGISTER\n"); + } + break; + case AMDKFD_IOC_DBG_UNREGISTER: + { + warn("unimplemented ioctl: AMDKFD_IOC_DBG_UNREGISTER\n"); + } + break; + case AMDKFD_IOC_DBG_ADDRESS_WATCH: + { + warn("unimplemented ioctl: AMDKFD_IOC_DBG_ADDRESS_WATCH\n"); + } + break; + case AMDKFD_IOC_DBG_WAVE_CONTROL: + { + warn("unimplemented ioctl: AMDKFD_IOC_DBG_WAVE_CONTROL\n"); + } + break; + case AMDKFD_IOC_ALLOC_MEMORY_OF_GPU: + { + warn("unimplemented ioctl: AMDKFD_IOC_ALLOC_MEMORY_OF_GPU\n"); + } + 
break; + case AMDKFD_IOC_FREE_MEMORY_OF_GPU: + { + warn("unimplemented ioctl: AMDKFD_IOC_FREE_MEMORY_OF_GPU\n"); + } + break; + case AMDKFD_IOC_MAP_MEMORY_TO_GPU: + { + warn("unimplemented ioctl: AMDKFD_IOC_MAP_MEMORY_TO_GPU\n"); + } + break; + case AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU: + { + warn("unimplemented ioctl: AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU\n"); + } + case AMDKFD_IOC_ALLOC_MEMORY_OF_SCRATCH: + { + warn("unimplemented ioctl: AMDKFD_IOC_ALLOC_MEMORY_OF_SCRATCH\n"); + } + break; + case AMDKFD_IOC_SET_CU_MASK: + { + warn("unimplemented ioctl: AMDKFD_IOC_SET_CU_MASK\n"); + } + break; + case AMDKFD_IOC_SET_PROCESS_DGPU_APERTURE: + { + warn("unimplemented ioctl: AMDKFD_IOC_SET_PROCESS_DGPU_APERTURE" + "\n"); + } + break; + case AMDKFD_IOC_SET_TRAP_HANDLER: + { + warn("unimplemented ioctl: AMDKFD_IOC_SET_TRAP_HANDLER\n"); + } + break; + case AMDKFD_IOC_GET_PROCESS_APERTURES_NEW: + { + DPRINTF(GPUDriver, + "ioctl: AMDKFD_IOC_GET_PROCESS_APERTURES_NEW\n"); + + TypedBufferArg + ioc_args(ioc_buf); + + ioc_args.copyIn(virt_proxy); + ioc_args->num_of_nodes = 1; + + for (int i = 0; i < ioc_args->num_of_nodes; ++i) { + TypedBufferArg ape_args + (ioc_args->kfd_process_device_apertures_ptr); + + ape_args->scratch_base = scratchApeBase(i + 1); + ape_args->scratch_limit = + scratchApeLimit(ape_args->scratch_base); + ape_args->lds_base = ldsApeBase(i + 1); + ape_args->lds_limit = ldsApeLimit(ape_args->lds_base); + ape_args->gpuvm_base = gpuVmApeBase(i + 1); + ape_args->gpuvm_limit = gpuVmApeLimit(ape_args->gpuvm_base); + + ape_args->gpu_id = 2765; + + assert(bits(ape_args->scratch_base, 63, 47) != 0x1ffff); + assert(bits(ape_args->scratch_base, 63, 47) != 0); + assert(bits(ape_args->scratch_limit, 63, 47) != 0x1ffff); + assert(bits(ape_args->scratch_limit, 63, 47) != 0); + assert(bits(ape_args->lds_base, 63, 47) != 0x1ffff); + assert(bits(ape_args->lds_base, 63, 47) != 0); + assert(bits(ape_args->lds_limit, 63, 47) != 0x1ffff); + assert(bits(ape_args->lds_limit, 63, 47) != 0); + assert(bits(ape_args->gpuvm_base, 63, 47) != 0x1ffff); + assert(bits(ape_args->gpuvm_base, 63, 47) != 0); + assert(bits(ape_args->gpuvm_limit, 63, 47) != 0x1ffff); + assert(bits(ape_args->gpuvm_limit, 63, 47) != 0); + + ape_args.copyOut(virt_proxy); + } + + ioc_args.copyOut(virt_proxy); + } + break; + case AMDKFD_IOC_GET_DMABUF_INFO: + { + warn("unimplemented ioctl: AMDKFD_IOC_GET_DMABUF_INFO\n"); + } + break; + case AMDKFD_IOC_IMPORT_DMABUF: + { + warn("unimplemented ioctl: AMDKFD_IOC_IMPORT_DMABUF\n"); + } + break; + case AMDKFD_IOC_GET_TILE_CONFIG: + { + warn("unimplemented ioctl: AMDKFD_IOC_GET_TILE_CONFIG\n"); + } + break; + case AMDKFD_IOC_IPC_IMPORT_HANDLE: + { + warn("unimplemented ioctl: AMDKFD_IOC_IPC_IMPORT_HANDLE\n"); + } + break; + case AMDKFD_IOC_IPC_EXPORT_HANDLE: + { + warn("unimplemented ioctl: AMDKFD_IOC_IPC_EXPORT_HANDLE\n"); + } + break; + case AMDKFD_IOC_CROSS_MEMORY_COPY: + { + warn("unimplemented ioctl: AMDKFD_IOC_CROSS_MEMORY_COPY\n"); + } + break; + case AMDKFD_IOC_OPEN_GRAPHIC_HANDLE: + { + warn("unimplemented ioctl: AMDKFD_IOC_OPEN_GRAPHIC_HANDLE\n"); + } + break; + default: + fatal("%s: bad ioctl %d\n", req); + break; + } + return 0; +} + +Addr +GPUComputeDriver::gpuVmApeBase(int gpuNum) const +{ + return ((Addr)gpuNum << 61) + 0x1000000000000L; +} + +Addr +GPUComputeDriver::gpuVmApeLimit(Addr apeBase) const +{ + return (apeBase & 0xFFFFFF0000000000UL) | 0xFFFFFFFFFFL; +} + +Addr +GPUComputeDriver::scratchApeBase(int gpuNum) const +{ + return ((Addr)gpuNum << 61) + 0x100000000L; +} + +Addr 
+GPUComputeDriver::scratchApeLimit(Addr apeBase) const +{ + return (apeBase & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF; +} + +Addr +GPUComputeDriver::ldsApeBase(int gpuNum) const +{ + return ((Addr)gpuNum << 61) + 0x0; +} + +Addr +GPUComputeDriver::ldsApeLimit(Addr apeBase) const +{ + return (apeBase & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF; +} + +GPUComputeDriver* +GPUComputeDriverParams::create() +{ + return new GPUComputeDriver(this); +} diff --git a/src/gpu-compute/gpu_compute_driver.hh b/src/gpu-compute/gpu_compute_driver.hh new file mode 100644 index 000000000..b13531de4 --- /dev/null +++ b/src/gpu-compute/gpu_compute_driver.hh @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2015-2018 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Authors: Sooraj Puthoor + * Anthony Gutierrez + */ + +/** + * @file + * The GPUComputeDriver implements an HSADriver for an HSA AMD GPU + * agent. Other GPU devices, or other HSA agents, should not derive + * from this class. Instead device-specific implementations of an + * HSADriver should be provided for each unique device. + */ + +#ifndef __GPU_COMPUTE_GPU_COMPUTE_DRIVER_HH__ +#define __GPU_COMPUTE_GPU_COMPUTE_DRIVER_HH__ + +#include "dev/hsa/hsa_driver.hh" + +struct GPUComputeDriverParams; + +class GPUComputeDriver final : public HSADriver +{ + public: + typedef GPUComputeDriverParams Params; + GPUComputeDriver(Params *p); + int ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf) override; + + private: + /** + * The aperture (APE) base/limit pairs are set + * statically at startup by the real KFD. AMD + * x86_64 CPUs only use the areas in the 64b + * address space where VA[63:47] == 0x1ffff or + * VA[63:47] = 0. 
These methods generate the APE + * base/limit pairs in exactly the same way as + * the real KFD does, which ensures these APEs do + * not fall into the CPU's address space + * + * see the macros in the KFD driver in the ROCm + * Linux kernel source: + * + * drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c + */ + Addr gpuVmApeBase(int gpuNum) const; + Addr gpuVmApeLimit(Addr apeBase) const; + Addr scratchApeBase(int gpuNum) const; + Addr scratchApeLimit(Addr apeBase) const; + Addr ldsApeBase(int gpuNum) const; + Addr ldsApeLimit(Addr apeBase) const; +}; + +#endif // __GPU_COMPUTE_GPU_COMPUTE_DRIVER_HH__ diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc index 0c729b784..74b963b73 100644 --- a/src/gpu-compute/gpu_dyn_inst.cc +++ b/src/gpu-compute/gpu_dyn_inst.cc @@ -35,26 +35,50 @@ #include "debug/GPUMem.hh" #include "gpu-compute/gpu_static_inst.hh" +#include "gpu-compute/scalar_register_file.hh" #include "gpu-compute/shader.hh" #include "gpu-compute/wavefront.hh" GPUDynInst::GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, - GPUStaticInst *static_inst, uint64_t instSeqNum) - : GPUExecContext(_cu, _wf), addr(computeUnit()->wfSize(), (Addr)0), - n_reg(0), useContinuation(false), - statusBitVector(0), _staticInst(static_inst), _seqNum(instSeqNum) + GPUStaticInst *static_inst, InstSeqNum instSeqNum) + : GPUExecContext(_cu, _wf), scalarAddr(0), addr(computeUnit()->wfSize(), + (Addr)0), statusBitVector(0), numScalarReqs(0), isSaveRestore(false), + _staticInst(static_inst), _seqNum(instSeqNum) { tlbHitLevel.assign(computeUnit()->wfSize(), -1); - d_data = new uint8_t[computeUnit()->wfSize() * 16]; + // vector instructions can have up to 4 source/destination operands + d_data = new uint8_t[computeUnit()->wfSize() * 4 * sizeof(double)]; a_data = new uint8_t[computeUnit()->wfSize() * 8]; x_data = new uint8_t[computeUnit()->wfSize() * 8]; + // scalar loads can read up to 16 Dwords of data (see publicly + // available GCN3 ISA manual) + scalar_data = new uint8_t[16 * sizeof(uint32_t)]; + for (int i = 0; i < (16 * sizeof(uint32_t)); ++i) { + scalar_data[i] = 0; + } for (int i = 0; i < (computeUnit()->wfSize() * 8); ++i) { a_data[i] = 0; x_data[i] = 0; } - for (int i = 0; i < (computeUnit()->wfSize() * 16); ++i) { + for (int i = 0; i < (computeUnit()->wfSize() * 4 * sizeof(double)); ++i) { d_data[i] = 0; } + time = 0; + + cu_id = _cu->cu_id; + if (_wf) { + simdId = _wf->simdId; + wfDynId = _wf->wfDynId; + kern_id = _wf->kernId; + wg_id = _wf->wgId; + wfSlotId = _wf->wfSlotId; + } else { + simdId = -1; + wfDynId = -1; + kern_id = -1; + wg_id = -1; + wfSlotId = -1; + } } GPUDynInst::~GPUDynInst() @@ -62,6 +86,8 @@ GPUDynInst::~GPUDynInst() delete[] d_data; delete[] a_data; delete[] x_data; + delete[] scalar_data; + delete _staticInst; } void @@ -82,6 +108,36 @@ GPUDynInst::numDstRegOperands() return _staticInst->numDstRegOperands(); } +int +GPUDynInst::numSrcVecOperands() +{ + return _staticInst->numSrcVecOperands(); +} + +int +GPUDynInst::numDstVecOperands() +{ + return _staticInst->numDstVecOperands(); +} + +int +GPUDynInst::numSrcVecDWORDs() +{ + return _staticInst->numSrcVecDWORDs(); +} + +int +GPUDynInst::numDstVecDWORDs() +{ + return _staticInst->numDstVecDWORDs(); +} + +int +GPUDynInst::numOpdDWORDs(int operandIdx) +{ + return _staticInst->numOpdDWORDs(operandIdx); +} + int GPUDynInst::getNumOperands() { @@ -100,12 +156,6 @@ GPUDynInst::isScalarRegister(int operandIdx) return _staticInst->isScalarRegister(operandIdx); } -bool -GPUDynInst::isCondRegister(int operandIdx) -{ - return 
_staticInst->isCondRegister(operandIdx); -} - int GPUDynInst::getRegisterIndex(int operandIdx, GPUDynInstPtr gpuDynInst) { @@ -130,13 +180,82 @@ GPUDynInst::isSrcOperand(int operandIdx) return _staticInst->isSrcOperand(operandIdx); } +bool +GPUDynInst::hasSourceSgpr() const +{ + for (int i = 0; i < _staticInst->getNumOperands(); ++i) { + if (_staticInst->isScalarRegister(i) && _staticInst->isSrcOperand(i)) { + return true; + } + } + return false; +} + +bool +GPUDynInst::hasSourceVgpr() const +{ + for (int i = 0; i < _staticInst->getNumOperands(); ++i) { + if (_staticInst->isVectorRegister(i) && _staticInst->isSrcOperand(i)) { + return true; + } + } + return false; +} + +bool +GPUDynInst::hasDestinationSgpr() const +{ + for (int i = 0; i < _staticInst->getNumOperands(); ++i) { + if (_staticInst->isScalarRegister(i) && _staticInst->isDstOperand(i)) { + return true; + } + } + return false; +} + +bool +GPUDynInst::srcIsVgpr(int index) const +{ + assert(index >= 0 && index < _staticInst->getNumOperands()); + if (_staticInst->isVectorRegister(index) && + _staticInst->isSrcOperand(index)) { + return true; + } + return false; +} + +bool +GPUDynInst::hasDestinationVgpr() const +{ + for (int i = 0; i < _staticInst->getNumOperands(); ++i) { + if (_staticInst->isVectorRegister(i) && _staticInst->isDstOperand(i)) { + return true; + } + } + return false; +} + +bool +GPUDynInst::isOpcode(const std::string& opcodeStr, + const std::string& extStr) const +{ + return _staticInst->opcode().find(opcodeStr) != std::string::npos && + _staticInst->opcode().find(extStr) != std::string::npos; +} + +bool +GPUDynInst::isOpcode(const std::string& opcodeStr) const +{ + return _staticInst->opcode().find(opcodeStr) != std::string::npos; +} + const std::string& GPUDynInst::disassemble() const { return _staticInst->disassemble(); } -uint64_t +InstSeqNum GPUDynInst::seqNum() const { return _seqNum; @@ -148,6 +267,40 @@ GPUDynInst::executedAs() return _staticInst->executed_as; } +bool +GPUDynInst::hasVgprRawDependence(GPUDynInstPtr s) +{ + assert(s); + for (int i = 0; i < getNumOperands(); ++i) { + if (isVectorRegister(i) && isSrcOperand(i)) { + for (int j = 0; j < s->getNumOperands(); ++j) { + if (s->isVectorRegister(j) && s->isDstOperand(j)) { + if (i == j) + return true; + } + } + } + } + return false; +} + +bool +GPUDynInst::hasSgprRawDependence(GPUDynInstPtr s) +{ + assert(s); + for (int i = 0; i < getNumOperands(); ++i) { + if (isScalarRegister(i) && isSrcOperand(i)) { + for (int j = 0; j < s->getNumOperands(); ++j) { + if (s->isScalarRegister(j) && s->isDstOperand(j)) { + if (i == j) + return true; + } + } + } + } + return false; +} + // Process a memory instruction and (if necessary) submit timing request void GPUDynInst::initiateAcc(GPUDynInstPtr gpuDynInst) @@ -156,12 +309,15 @@ GPUDynInst::initiateAcc(GPUDynInstPtr gpuDynInst) cu->cu_id, simdId, wfSlotId, exec_mask); _staticInst->initiateAcc(gpuDynInst); - time = 0; } void GPUDynInst::completeAcc(GPUDynInstPtr gpuDynInst) { + DPRINTF(GPUMem, "CU%d: WF[%d][%d]: mempacket status bitvector=" + "%#x\n complete", + cu->cu_id, simdId, wfSlotId, exec_mask); + _staticInst->completeAcc(gpuDynInst); } @@ -181,12 +337,42 @@ GPUDynInst::isBranch() const return _staticInst->isBranch(); } +bool +GPUDynInst::isCondBranch() const +{ + return _staticInst->isCondBranch(); +} + bool GPUDynInst::isNop() const { return _staticInst->isNop(); } +bool +GPUDynInst::isEndOfKernel() const +{ + return _staticInst->isEndOfKernel(); +} + +bool +GPUDynInst::isKernelLaunch() const +{ + return 
_staticInst->isKernelLaunch(); +} + +bool +GPUDynInst::isSDWAInst() const +{ + return _staticInst->isSDWAInst(); +} + +bool +GPUDynInst::isDPPInst() const +{ + return _staticInst->isDPPInst(); +} + bool GPUDynInst::isReturn() const { @@ -218,9 +404,9 @@ GPUDynInst::isBarrier() const } bool -GPUDynInst::isMemFence() const +GPUDynInst::isMemSync() const { - return _staticInst->isMemFence(); + return _staticInst->isMemSync(); } bool @@ -265,6 +451,12 @@ GPUDynInst::isAtomicRet() const return _staticInst->isAtomicRet(); } +bool +GPUDynInst::isVector() const +{ + return !_staticInst->isScalar(); +} + bool GPUDynInst::isScalar() const { @@ -295,6 +487,78 @@ GPUDynInst::writesVCC() const return _staticInst->writesVCC(); } +bool +GPUDynInst::readsMode() const +{ + return _staticInst->readsMode(); +} + +bool +GPUDynInst::writesMode() const +{ + return _staticInst->writesMode(); +} + +bool +GPUDynInst::readsEXEC() const +{ + return _staticInst->readsEXEC(); +} + +bool +GPUDynInst::writesEXEC() const +{ + return _staticInst->writesEXEC(); +} + +bool +GPUDynInst::ignoreExec() const +{ + return _staticInst->ignoreExec(); +} + +bool +GPUDynInst::writesExecMask() const +{ + for (int i = 0; i < _staticInst->getNumOperands(); ++i) { + return _staticInst->isDstOperand(i) && + _staticInst->isExecMaskRegister(i); + } + return false; +} + +bool +GPUDynInst::readsExecMask() const +{ + for (int i = 0; i < _staticInst->getNumOperands(); ++i) { + return _staticInst->isSrcOperand(i) && + _staticInst->isExecMaskRegister(i); + } + return false; +} + +bool +GPUDynInst::writesFlatScratch() const +{ + for (int i = 0; i < _staticInst->getNumOperands(); ++i) { + if (_staticInst->isScalarRegister(i) && _staticInst->isDstOperand(i)) { + return _staticInst->isFlatScratchRegister(i); + } + } + return false; +} + +bool +GPUDynInst::readsFlatScratch() const +{ + for (int i = 0; i < _staticInst->getNumOperands(); ++i) { + if (_staticInst->isScalarRegister(i) && _staticInst->isSrcOperand(i)) { + return _staticInst->isFlatScratchRegister(i); + } + } + return false; +} + bool GPUDynInst::isAtomicAnd() const { @@ -421,81 +685,241 @@ GPUDynInst::isSpillSeg() const } bool -GPUDynInst::isWorkitemScope() const -{ - return _staticInst->isWorkitemScope(); -} - -bool -GPUDynInst::isWavefrontScope() const +GPUDynInst::isGloballyCoherent() const { - return _staticInst->isWavefrontScope(); + return _staticInst->isGloballyCoherent(); } bool -GPUDynInst::isWorkgroupScope() const +GPUDynInst::isSystemCoherent() const { - return _staticInst->isWorkgroupScope(); + return _staticInst->isSystemCoherent(); } bool -GPUDynInst::isDeviceScope() const +GPUDynInst::isF16() const { - return _staticInst->isDeviceScope(); + return _staticInst->isF16(); } bool -GPUDynInst::isSystemScope() const +GPUDynInst::isF32() const { - return _staticInst->isSystemScope(); + return _staticInst->isF32(); } bool -GPUDynInst::isNoScope() const +GPUDynInst::isF64() const { - return _staticInst->isNoScope(); + return _staticInst->isF64(); } bool -GPUDynInst::isRelaxedOrder() const +GPUDynInst::isFMA() const { - return _staticInst->isRelaxedOrder(); + return _staticInst->isFMA(); } bool -GPUDynInst::isAcquire() const +GPUDynInst::isMAC() const { - return _staticInst->isAcquire(); + return _staticInst->isMAC(); } bool -GPUDynInst::isRelease() const +GPUDynInst::isMAD() const { - return _staticInst->isRelease(); + return _staticInst->isMAD(); } -bool -GPUDynInst::isAcquireRelease() const -{ - return _staticInst->isAcquireRelease(); -} +void +GPUDynInst::doApertureCheck(const 
VectorMask &mask) +{ + assert(mask.any()); + // find the segment of the first active address, after + // that we check that all other active addresses also + // fall within the same APE + for (int lane = 0; lane < computeUnit()->wfSize(); ++lane) { + if (mask[lane]) { + if (computeUnit()->shader->isLdsApe(addr[lane])) { + // group segment + staticInstruction()->executed_as = Enums::SC_GROUP; + break; + } else if (computeUnit()->shader->isScratchApe(addr[lane])) { + // private segment + staticInstruction()->executed_as = Enums::SC_PRIVATE; + break; + } else if (computeUnit()->shader->isGpuVmApe(addr[lane])) { + // we won't support GPUVM + fatal("flat access is in GPUVM APE\n"); + } else if (bits(addr[lane], 63, 47) != 0x1FFFF && + bits(addr[lane], 63, 47)) { + // we are in the "hole", this is a memory violation + fatal("flat access at addr %#x has a memory violation\n", + addr[lane]); + } else { + // global memory segment + staticInstruction()->executed_as = Enums::SC_GLOBAL; + break; + } + } + } -bool -GPUDynInst::isNoOrder() const -{ - return _staticInst->isNoOrder(); + // we should have found the segment + assert(executedAs() != Enums::SC_NONE); + + // flat accesses should not straddle multiple APEs so we + // must check that all addresses fall within the same APE + if (executedAs() == Enums::SC_GROUP) { + for (int lane = 0; lane < computeUnit()->wfSize(); ++lane) { + if (mask[lane]) { + // if the first valid addr we found above was LDS, + // all the rest should be + assert(computeUnit()->shader->isLdsApe(addr[lane])); + } + } + } else if (executedAs() == Enums::SC_PRIVATE) { + for (int lane = 0; lane < computeUnit()->wfSize(); ++lane) { + if (mask[lane]) { + // if the first valid addr we found above was private, + // all the rest should be + assert(computeUnit()->shader->isScratchApe(addr[lane])); + } + } + } else { + for (int lane = 0; lane < computeUnit()->wfSize(); ++lane) { + if (mask[lane]) { + // if the first valid addr we found above was global, + // all the rest should be. because we don't have an + // explicit range of the global segment, we just make + // sure that the address fall in no other APE and that + // it is not a memory violation + assert(!computeUnit()->shader->isLdsApe(addr[lane])); + assert(!computeUnit()->shader->isScratchApe(addr[lane])); + assert(!computeUnit()->shader->isGpuVmApe(addr[lane])); + assert(!(bits(addr[lane], 63, 47) != 0x1FFFF + && bits(addr[lane], 63, 47))); + } + } + } } -bool -GPUDynInst::isGloballyCoherent() const -{ - return _staticInst->isGloballyCoherent(); +void +GPUDynInst::resolveFlatSegment(const VectorMask &mask) +{ + doApertureCheck(mask); + + + // Now that we know the aperature, do the following: + // 1. Transform the flat address to its segmented equivalent. + // 2. Set the execUnitId based an the aperture check. + // 3. Decrement any extra resources that were reserved. Other + // resources are released as normal, below. + if (executedAs() == Enums::SC_GLOBAL) { + // no transormation for global segment + wavefront()->execUnitId = wavefront()->flatGmUnitId; + if (isLoad()) { + wavefront()->rdLmReqsInPipe--; + } else if (isStore()) { + wavefront()->wrLmReqsInPipe--; + } else if (isAtomic() || isMemSync()) { + wavefront()->wrLmReqsInPipe--; + wavefront()->rdLmReqsInPipe--; + } else { + panic("Invalid memory operation!\n"); + } + } else if (executedAs() == Enums::SC_GROUP) { + for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) { + if (mask[lane]) { + // flat address calculation goes here. 
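+                // (illustrative only: the segmented address would
+                // presumably be the flat address minus the LDS
+                // aperture base, i.e. an offset into this
+                // workgroup's LdsChunk, but no such conversion is
+                // implemented, hence the panic below)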
+ // addr[lane] = segmented address + panic("Flat group memory operation is unimplemented!\n"); + } + } + wavefront()->execUnitId = wavefront()->flatLmUnitId; + if (isLoad()) { + wavefront()->rdGmReqsInPipe--; + } else if (isStore()) { + wavefront()->wrGmReqsInPipe--; + } else if (isAtomic() || isMemSync()) { + wavefront()->rdGmReqsInPipe--; + wavefront()->wrGmReqsInPipe--; + } else { + panic("Invalid memory operation!\n"); + } + } else if (executedAs() == Enums::SC_PRIVATE) { + /** + * Flat instructions may resolve to the private segment (scratch), + * which is backed by main memory and provides per-lane scratch + * memory. Flat addressing uses apertures - registers that specify + * the address range in the VA space where LDS/private memory is + * mapped. The value of which is set by the kernel mode driver. + * These apertures use addresses that are not used by x86 CPUs. + * When the address of a Flat operation falls into one of the + * apertures, the Flat operation is redirected to either LDS or + * to the private memory segment. + * + * For private memory the SW runtime will allocate some space in + * the VA space for each AQL queue. The base address of which is + * stored in scalar registers per the AMD GPU ABI. The amd_queue_t + * scratch_backing_memory_location provides the base address in + * memory for the queue's private segment. Various other fields + * loaded into register state during kernel launch specify per-WF + * and per-work-item offsets so that individual lanes may access + * their private segment allocation. + * + * For more details about flat addressing see: + * http://rocm-documentation.readthedocs.io/en/latest/ + * ROCm_Compiler_SDK/ROCm-Native-ISA.html#flat-scratch + * + * https://github.com/ROCm-Developer-Tools/ + * ROCm-ComputeABI-Doc/blob/master/AMDGPU-ABI.md + * #flat-addressing + */ + + uint32_t numSgprs = wavefront()->maxSgprs; + uint32_t physSgprIdx = + wavefront()->computeUnit->registerManager->mapSgpr(wavefront(), + numSgprs - 3); + uint32_t offset = + wavefront()->computeUnit->srf[simdId]->read(physSgprIdx); + physSgprIdx = + wavefront()->computeUnit->registerManager->mapSgpr(wavefront(), + numSgprs - 4); + uint32_t size = + wavefront()->computeUnit->srf[simdId]->read(physSgprIdx); + for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) { + if (mask[lane]) { + addr[lane] = addr[lane] + lane * size + offset + + wavefront()->computeUnit->shader->getHiddenPrivateBase() - + wavefront()->computeUnit->shader->getScratchBase(); + } + } + wavefront()->execUnitId = wavefront()->flatLmUnitId; + if (isLoad()) { + wavefront()->rdGmReqsInPipe--; + } else if (isStore()) { + wavefront()->wrGmReqsInPipe--; + } else if (isAtomic() || isMemSync()) { + wavefront()->rdGmReqsInPipe--; + wavefront()->wrGmReqsInPipe--; + } else { + panic("Invalid memory operation!\n"); + } + } else { + for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) { + if (mask[lane]) { + panic("flat addr %#llx maps to bad segment %d\n", + addr[lane], executedAs()); + } + } + } } -bool -GPUDynInst::isSystemCoherent() const +TheGpuISA::ScalarRegU32 +GPUDynInst::srcLiteral() const { - return _staticInst->isSystemCoherent(); + return _staticInst->srcLiteral(); } void @@ -504,6 +928,8 @@ GPUDynInst::updateStats() if (_staticInst->isLocalMem()) { // access to LDS (shared) memory cu->dynamicLMemInstrCnt++; + } else if (_staticInst->isFlat()) { + cu->dynamicFlatMemInstrCnt++; } else { // access to global memory @@ -536,3 +962,28 @@ GPUDynInst::updateStats() 
cu->dynamicGMemInstrCnt++; } } + +void +GPUDynInst::profileRoundTripTime(Tick currentTime, int hopId) +{ + // Only take the first measurement in the case of coalescing + if (roundTripTime.size() > hopId) + return; + + roundTripTime.push_back(currentTime); +} + +void +GPUDynInst::profileLineAddressTime(Addr addr, Tick currentTime, int hopId) +{ + if (lineAddressTime.count(addr)) { + if (lineAddressTime[addr].size() > hopId) { + return; + } + + lineAddressTime[addr].push_back(currentTime); + } else if (hopId == 0) { + auto addressTimeVec = std::vector { currentTime }; + lineAddressTime.insert(std::make_pair(addr, addressTimeVec)); + } +} diff --git a/src/gpu-compute/gpu_dyn_inst.hh b/src/gpu-compute/gpu_dyn_inst.hh index bee08e3df..392b57d12 100644 --- a/src/gpu-compute/gpu_dyn_inst.hh +++ b/src/gpu-compute/gpu_dyn_inst.hh @@ -39,7 +39,6 @@ #include "base/amo.hh" #include "base/logging.hh" -#include "enums/MemType.hh" #include "enums/StorageClassType.hh" #include "gpu-compute/compute_unit.hh" #include "gpu-compute/gpu_exec_context.hh" @@ -68,20 +67,10 @@ class AtomicOpCAS : public TypedAtomicOpFunctor } else { computeUnit->numFailedCASOps++; } - - if (computeUnit->xact_cas_mode) { - computeUnit->xactCasLoadMap.clear(); - } } AtomicOpFunctor* clone () { return new AtomicOpCAS(c, s, computeUnit); } }; -typedef enum -{ - VT_32, - VT_64, -} vgpr_type; - class GPUDynInst : public GPUExecContext { public: @@ -91,27 +80,51 @@ class GPUDynInst : public GPUExecContext void execute(GPUDynInstPtr gpuDynInst); int numSrcRegOperands(); int numDstRegOperands(); + int numDstVecOperands(); + int numSrcVecOperands(); + int numSrcVecDWORDs(); + int numDstVecDWORDs(); + int numOpdDWORDs(int operandIdx); int getNumOperands(); bool isVectorRegister(int operandIdx); bool isScalarRegister(int operandIdx); - bool isCondRegister(int operandIdx); int getRegisterIndex(int operandIdx, GPUDynInstPtr gpuDynInst); int getOperandSize(int operandIdx); bool isDstOperand(int operandIdx); bool isSrcOperand(int operandIdx); + bool hasDestinationSgpr() const; + bool hasSourceSgpr() const; + bool hasDestinationVgpr() const; + bool hasSourceVgpr() const; + + bool hasSgprRawDependence(GPUDynInstPtr s); + bool hasVgprRawDependence(GPUDynInstPtr s); + + // returns true if the string "opcodeStr" is found in the + // opcode of the instruction + bool isOpcode(const std::string& opcodeStr) const; + bool isOpcode(const std::string& opcodeStr, + const std::string& extStr) const; + // returns true if source operand at "index" is a vector register + bool srcIsVgpr(int index) const; + const std::string &disassemble() const; - uint64_t seqNum() const; + InstSeqNum seqNum() const; Enums::StorageClassType executedAs(); - // The address of the memory operation + // virtual address for scalar memory operations + Addr scalarAddr; + // virtual addressies for vector memory operations std::vector addr; Addr pAddr; - // The data to get written + // vector data to get written uint8_t *d_data; + // scalar data to be transferred + uint8_t *scalar_data; // Additional data (for atomics) uint8_t *a_data; // Additional data (for atomics) @@ -119,19 +132,6 @@ class GPUDynInst : public GPUExecContext // The execution mask VectorMask exec_mask; - // The memory type (M_U32, M_S32, ...) 
- Enums::MemType m_type; - - // The equivalency class - int equiv; - // The return VGPR type (VT_32 or VT_64) - vgpr_type v_type; - // Number of VGPR's accessed (1, 2, or 4) - int n_reg; - // The return VGPR index - int dst_reg; - // There can be max 4 dest regs> - int dst_reg_vec[4]; // SIMD where the WF of the memory instruction has been mapped to int simdId; // unique id of the WF where the memory instruction belongs to @@ -140,21 +140,16 @@ class GPUDynInst : public GPUExecContext int kern_id; // The CU id of the requesting wf int cu_id; + // The workgroup id of the requesting wf + int wg_id; // HW slot id where the WF is mapped to inside a SIMD unit int wfSlotId; // execution pipeline id where the memory instruction has been scheduled - int pipeId; + int execUnitId; // The execution time of this operation Tick time; // The latency of this operation WaitClass latency; - // A list of bank conflicts for the 4 cycles. - uint32_t bc[4]; - - // A pointer to ROM - uint8_t *rom; - // The size of the READONLY segment - int sz_rom; // Initiate the specified memory operation, by creating a // memory request and sending it off to the memory system. @@ -168,16 +163,23 @@ class GPUDynInst : public GPUExecContext GPUStaticInst* staticInstruction() { return _staticInst; } + TheGpuISA::ScalarRegU32 srcLiteral() const; + bool isALU() const; bool isBranch() const; + bool isCondBranch() const; bool isNop() const; bool isReturn() const; + bool isEndOfKernel() const; + bool isKernelLaunch() const; + bool isSDWAInst() const; + bool isDPPInst() const; bool isUnconditionalJump() const; bool isSpecialOp() const; bool isWaitcnt() const; bool isBarrier() const; - bool isMemFence() const; + bool isMemSync() const; bool isMemRef() const; bool isFlat() const; bool isLoad() const; @@ -188,10 +190,20 @@ class GPUDynInst : public GPUExecContext bool isAtomicRet() const; bool isScalar() const; + bool isVector() const; bool readsSCC() const; bool writesSCC() const; bool readsVCC() const; bool writesVCC() const; + bool readsEXEC() const; + bool writesEXEC() const; + bool readsMode() const; + bool writesMode() const; + bool ignoreExec() const; + bool readsFlatScratch() const; + bool writesFlatScratch() const; + bool readsExecMask() const; + bool writesExecMask() const; bool isAtomicAnd() const; bool isAtomicOr() const; @@ -217,39 +229,25 @@ class GPUDynInst : public GPUExecContext bool isReadOnlySeg() const; bool isSpillSeg() const; - bool isWorkitemScope() const; - bool isWavefrontScope() const; - bool isWorkgroupScope() const; - bool isDeviceScope() const; - bool isSystemScope() const; - bool isNoScope() const; - - bool isRelaxedOrder() const; - bool isAcquire() const; - bool isRelease() const; - bool isAcquireRelease() const; - bool isNoOrder() const; - bool isGloballyCoherent() const; bool isSystemCoherent() const; - /* - * Loads/stores/atomics may have acquire/release semantics associated - * withthem. Some protocols want to see the acquire/release as separate - * requests from the load/store/atomic. We implement that separation - * using continuations (i.e., a function pointer with an object associated - * with it). When, for example, the front-end generates a store with - * release semantics, we will first issue a normal store and set the - * continuation in the GPUDynInst to a function that generate a - * release request. That continuation will be called when the normal - * store completes (in ComputeUnit::DataPort::recvTimingResponse). 
The - * continuation will be called in the context of the same GPUDynInst - * that generated the initial store. - */ - std::function execContinuation; - - // when true, call execContinuation when response arrives - bool useContinuation; + bool isF16() const; + bool isF32() const; + bool isF64() const; + + bool isFMA() const; + bool isMAC() const; + bool isMAD() const; + + // for FLAT memory ops. check the segment address + // against the APE registers to see if it falls + // within one of the APE ranges for LDS/SCRATCH/GPUVM. + // if it does not fall into one of the three APEs, it + // will be a regular global access. + void doApertureCheck(const VectorMask &mask); + // Function to resolve a flat accesses during execution stage. + void resolveFlatSegment(const VectorMask &mask); template AtomicOpFunctorPtr makeAtomicOpFunctor(c0 *reg0, c0 *reg1) @@ -282,62 +280,31 @@ class GPUDynInst : public GPUExecContext } void - setRequestFlags(RequestPtr req, bool setMemOrder=true) + setRequestFlags(RequestPtr req) const { - // currently these are the easy scopes to deduce - if (isPrivateSeg()) { - req->setMemSpaceConfigFlags(Request::PRIVATE_SEGMENT); - } else if (isSpillSeg()) { - req->setMemSpaceConfigFlags(Request::SPILL_SEGMENT); - } else if (isGlobalSeg()) { - req->setMemSpaceConfigFlags(Request::GLOBAL_SEGMENT); - } else if (isReadOnlySeg()) { - req->setMemSpaceConfigFlags(Request::READONLY_SEGMENT); - } else if (isGroupSeg()) { - req->setMemSpaceConfigFlags(Request::GROUP_SEGMENT); - } else if (isFlat()) { - panic("TODO: translate to correct scope"); - } else { - fatal("%s has bad segment type\n", disassemble()); + if (isGloballyCoherent()) { + req->setCacheCoherenceFlags(Request::GLC_BIT); } - if (isWavefrontScope()) { - req->setMemSpaceConfigFlags(Request::SCOPE_VALID | - Request::WAVEFRONT_SCOPE); - } else if (isWorkgroupScope()) { - req->setMemSpaceConfigFlags(Request::SCOPE_VALID | - Request::WORKGROUP_SCOPE); - } else if (isDeviceScope()) { - req->setMemSpaceConfigFlags(Request::SCOPE_VALID | - Request::DEVICE_SCOPE); - } else if (isSystemScope()) { - req->setMemSpaceConfigFlags(Request::SCOPE_VALID | - Request::SYSTEM_SCOPE); - } else if (!isNoScope() && !isWorkitemScope()) { - fatal("%s has bad scope type\n", disassemble()); + if (isSystemCoherent()) { + req->setCacheCoherenceFlags(Request::SLC_BIT); } - if (setMemOrder) { - // set acquire and release flags - if (isAcquire()) { - req->setFlags(Request::ACQUIRE); - } else if (isRelease()) { - req->setFlags(Request::RELEASE); - } else if (isAcquireRelease()) { - req->setFlags(Request::ACQUIRE | Request::RELEASE); - } else if (!isNoOrder()) { - fatal("%s has bad memory order\n", disassemble()); - } - } - - // set atomic type - // currently, the instruction genenerator only produces atomic return - // but a magic instruction can produce atomic no return if (isAtomicRet()) { req->setFlags(Request::ATOMIC_RETURN_OP); } else if (isAtomicNoRet()) { req->setFlags(Request::ATOMIC_NO_RETURN_OP); } + + if (isMemSync()) { + // the path for kernel launch and kernel end is different + // from non-kernel mem sync. 
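+            // i.e., only wavefront-issued syncs should reach this
+            // point; they request a cache invalidate via the ACQUIRE
+            // flag set below, whereas launch/end flushes are issued
+            // along their own path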
+ assert(!isKernelLaunch()); + assert(!isEndOfKernel()); + + // must be wbinv inst if not kernel launch/end + req->setCacheCoherenceFlags(Request::ACQUIRE); + } } // Map returned packets and the addresses they satisfy with which lane they @@ -348,12 +315,39 @@ class GPUDynInst : public GPUExecContext // Track the status of memory requests per lane, a bit per lane VectorMask statusBitVector; // for ld_v# or st_v# - std::vector statusVector; std::vector tlbHitLevel; + // for misaligned scalar ops we track the number + // of outstanding reqs here + int numScalarReqs; + + Tick getAccessTime() const { return accessTime; } + + void setAccessTime(Tick currentTime) { accessTime = currentTime; } + + void profileRoundTripTime(Tick currentTime, int hopId); + std::vector getRoundTripTime() const { return roundTripTime; } + + void profileLineAddressTime(Addr addr, Tick currentTime, int hopId); + const std::map>& getLineAddressTime() const + { return lineAddressTime; } + + // inst used to save/restore a wavefront context + bool isSaveRestore; private: GPUStaticInst *_staticInst; - uint64_t _seqNum; + const InstSeqNum _seqNum; + + // the time the request was started + Tick accessTime = -1; + + // hold the tick when the instruction arrives at certain hop points + // on it's way to main memory + std::vector roundTripTime; + + // hold each cache block address for the instruction and a vector + // to hold the tick when the block arrives at certain hop points + std::map> lineAddressTime; }; #endif // __GPU_DYN_INST_HH__ diff --git a/src/gpu-compute/gpu_exec_context.cc b/src/gpu-compute/gpu_exec_context.cc index 154d2b8ed..2411e9e84 100644 --- a/src/gpu-compute/gpu_exec_context.cc +++ b/src/gpu-compute/gpu_exec_context.cc @@ -59,8 +59,8 @@ GPUExecContext::readMiscReg(int opIdx) const } void -GPUExecContext::writeMiscReg(int opIdx, RegVal operandVal) +GPUExecContext::writeMiscReg(int opIdx, RegVal val) { assert(gpuISA); - gpuISA->writeMiscReg(opIdx, operandVal); + gpuISA->writeMiscReg(opIdx, val); } diff --git a/src/gpu-compute/gpu_static_inst.cc b/src/gpu-compute/gpu_static_inst.cc index 49c0315ba..9ab1580df 100644 --- a/src/gpu-compute/gpu_static_inst.cc +++ b/src/gpu-compute/gpu_static_inst.cc @@ -34,10 +34,10 @@ #include "gpu-compute/gpu_static_inst.hh" GPUStaticInst::GPUStaticInst(const std::string &opcode) - : executed_as(Enums::SC_NONE), opcode(opcode), - _instNum(0), _instAddr(0) + : executed_as(Enums::SC_NONE), _opcode(opcode), + _instNum(0), _instAddr(0), srcVecOperands(-1), dstVecOperands(-1), + srcVecDWORDs(-1), dstVecDWORDs(-1) { - setFlag(NoOrder); } const std::string& @@ -50,3 +50,80 @@ GPUStaticInst::disassemble() return disassembly; } + +int +GPUStaticInst::numSrcVecOperands() +{ + if (srcVecOperands > -1) + return srcVecOperands; + + srcVecOperands = 0; + if (!isScalar()) { + for (int k = 0; k < getNumOperands(); ++k) { + if (isVectorRegister(k) && isSrcOperand(k)) + srcVecOperands++; + } + } + return srcVecOperands; +} + +int +GPUStaticInst::numDstVecOperands() +{ + if (dstVecOperands > -1) + return dstVecOperands; + + dstVecOperands = 0; + if (!isScalar()) { + for (int k = 0; k < getNumOperands(); ++k) { + if (isVectorRegister(k) && isDstOperand(k)) + dstVecOperands++; + } + } + return dstVecOperands; +} + +int +GPUStaticInst::numSrcVecDWORDs() +{ + if (srcVecDWORDs > -1) { + return srcVecDWORDs; + } + + srcVecDWORDs = 0; + if (!isScalar()) { + for (int i = 0; i < getNumOperands(); i++) { + if (isVectorRegister(i) && isSrcOperand(i)) { + int dwords = numOpdDWORDs(i); + srcVecDWORDs += 
dwords; + } + } + } + return srcVecDWORDs; +} + +int +GPUStaticInst::numDstVecDWORDs() +{ + if (dstVecDWORDs > -1) { + return dstVecDWORDs; + } + + dstVecDWORDs = 0; + if (!isScalar()) { + for (int i = 0; i < getNumOperands(); i++) { + if (isVectorRegister(i) && isDstOperand(i)) { + int dwords = numOpdDWORDs(i); + dstVecDWORDs += dwords; + } + } + } + return dstVecDWORDs; +} + +int +GPUStaticInst::numOpdDWORDs(int operandIdx) +{ + return getOperandSize(operandIdx) <= 4 ? 1 + : getOperandSize(operandIdx) / 4; +} diff --git a/src/gpu-compute/gpu_static_inst.hh b/src/gpu-compute/gpu_static_inst.hh index ee5a98e77..88fd9f991 100644 --- a/src/gpu-compute/gpu_static_inst.hh +++ b/src/gpu-compute/gpu_static_inst.hh @@ -59,6 +59,7 @@ class GPUStaticInst : public GPUStaticInstFlags { public: GPUStaticInst(const std::string &opcode); + virtual ~GPUStaticInst() { } void instAddr(int inst_addr) { _instAddr = inst_addr; } int instAddr() const { return _instAddr; } int nextInstAddr() const { return _instAddr + instSize(); } @@ -71,15 +72,18 @@ class GPUStaticInst : public GPUStaticInstFlags int ipdInstNum() const { return _ipdInstNum; } + virtual TheGpuISA::ScalarRegU32 srcLiteral() const { return 0; } + virtual void execute(GPUDynInstPtr gpuDynInst) = 0; virtual void generateDisassembly() = 0; const std::string& disassemble(); virtual int getNumOperands() = 0; - virtual bool isCondRegister(int operandIndex) = 0; virtual bool isScalarRegister(int operandIndex) = 0; virtual bool isVectorRegister(int operandIndex) = 0; virtual bool isSrcOperand(int operandIndex) = 0; virtual bool isDstOperand(int operandIndex) = 0; + virtual bool isFlatScratchRegister(int opIdx) = 0; + virtual bool isExecMaskRegister(int opIdx) = 0; virtual int getOperandSize(int operandIndex) = 0; virtual int getRegisterIndex(int operandIndex, @@ -88,12 +92,24 @@ class GPUStaticInst : public GPUStaticInstFlags virtual int numDstRegOperands() = 0; virtual int numSrcRegOperands() = 0; - virtual bool isValid() const = 0; + virtual int coalescerTokenCount() const { return 0; } + + int numDstVecOperands(); + int numSrcVecOperands(); + int numDstVecDWORDs(); + int numSrcVecDWORDs(); + + int numOpdDWORDs(int operandIdx); bool isALU() const { return _flags[ALU]; } bool isBranch() const { return _flags[Branch]; } + bool isCondBranch() const { return _flags[CondBranch]; } bool isNop() const { return _flags[Nop]; } bool isReturn() const { return _flags[Return]; } + bool isEndOfKernel() const { return _flags[EndOfKernel]; } + bool isKernelLaunch() const { return _flags[KernelLaunch]; } + bool isSDWAInst() const { return _flags[IsSDWA]; } + bool isDPPInst() const { return _flags[IsDPP]; } bool isUnconditionalJump() const @@ -105,7 +121,7 @@ class GPUStaticInst : public GPUStaticInstFlags bool isWaitcnt() const { return _flags[Waitcnt]; } bool isBarrier() const { return _flags[MemBarrier]; } - bool isMemFence() const { return _flags[MemFence]; } + bool isMemSync() const { return _flags[MemSync]; } bool isMemRef() const { return _flags[MemoryRef]; } bool isFlat() const { return _flags[Flat]; } bool isLoad() const { return _flags[Load]; } @@ -125,6 +141,13 @@ class GPUStaticInst : public GPUStaticInstFlags bool writesSCC() const { return _flags[WritesSCC]; } bool readsVCC() const { return _flags[ReadsVCC]; } bool writesVCC() const { return _flags[WritesVCC]; } + // Identify instructions that implicitly read the Execute mask + // as a source operand but not to dictate which threads execute. 
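+    // (for example, the s_*_saveexec_b64 family reads EXEC as part
+    // of its semantics, as opposed to ordinary VALU ops that are
+    // merely predicated by it)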
+ bool readsEXEC() const { return _flags[ReadsEXEC]; } + bool writesEXEC() const { return _flags[WritesEXEC]; } + bool readsMode() const { return _flags[ReadsMode]; } + bool writesMode() const { return _flags[WritesMode]; } + bool ignoreExec() const { return _flags[IgnoreExec]; } bool isAtomicAnd() const { return _flags[AtomicAnd]; } bool isAtomicOr() const { return _flags[AtomicOr]; } @@ -166,34 +189,29 @@ class GPUStaticInst : public GPUStaticInstFlags bool isReadOnlySeg() const { return _flags[ReadOnlySegment]; } bool isSpillSeg() const { return _flags[SpillSegment]; } - bool isWorkitemScope() const { return _flags[WorkitemScope]; } - bool isWavefrontScope() const { return _flags[WavefrontScope]; } - bool isWorkgroupScope() const { return _flags[WorkgroupScope]; } - bool isDeviceScope() const { return _flags[DeviceScope]; } - bool isSystemScope() const { return _flags[SystemScope]; } - bool isNoScope() const { return _flags[NoScope]; } - - bool isRelaxedOrder() const { return _flags[RelaxedOrder]; } - bool isAcquire() const { return _flags[Acquire]; } - bool isRelease() const { return _flags[Release]; } - bool isAcquireRelease() const { return _flags[AcquireRelease]; } - bool isNoOrder() const { return _flags[NoOrder]; } - /** - * Coherence domain of a memory instruction. Only valid for - * machine ISA. The coherence domain specifies where it is - * possible to perform memory synchronization, e.g., acquire - * or release, from the shader kernel. + * Coherence domain of a memory instruction. The coherence domain + * specifies where it is possible to perform memory synchronization + * (e.g., acquire or release) from the shader kernel. * - * isGloballyCoherent(): returns true if kernel is sharing memory - * with other work-items on the same device (GPU) + * isGloballyCoherent(): returns true if WIs share same device + * isSystemCoherent(): returns true if WIs or threads in different + * devices share memory * - * isSystemCoherent(): returns true if kernel is sharing memory - * with other work-items on a different device (GPU) or the host (CPU) */ bool isGloballyCoherent() const { return _flags[GloballyCoherent]; } bool isSystemCoherent() const { return _flags[SystemCoherent]; } + // Floating-point instructions + bool isF16() const { return _flags[F16]; } + bool isF32() const { return _flags[F32]; } + bool isF64() const { return _flags[F64]; } + + // FMA, MAC, MAD instructions + bool isFMA() const { return _flags[FMA]; } + bool isMAC() const { return _flags[MAC]; } + bool isMAD() const { return _flags[MAD]; } + virtual int instSize() const = 0; // only used for memory instructions @@ -217,37 +235,36 @@ class GPUStaticInst : public GPUStaticInstFlags // For flat memory accesses Enums::StorageClassType executed_as; - void setFlag(Flags flag) { _flags[flag] = true; } - - virtual void - execLdAcq(GPUDynInstPtr gpuDynInst) - { - fatal("calling execLdAcq() on a non-load instruction.\n"); - } - - virtual void - execSt(GPUDynInstPtr gpuDynInst) - { - fatal("calling execLdAcq() on a non-load instruction.\n"); - } - - virtual void - execAtomic(GPUDynInstPtr gpuDynInst) - { - fatal("calling execAtomic() on a non-atomic instruction.\n"); - } - - virtual void - execAtomicAcq(GPUDynInstPtr gpuDynInst) - { - fatal("calling execAtomicAcq() on a non-atomic instruction.\n"); + void setFlag(Flags flag) { + _flags[flag] = true; + + if (isGroupSeg()) { + executed_as = Enums::SC_GROUP; + } else if (isGlobalSeg()) { + executed_as = Enums::SC_GLOBAL; + } else if (isPrivateSeg()) { + executed_as = 
Enums::SC_PRIVATE; + } else if (isSpillSeg()) { + executed_as = Enums::SC_SPILL; + } else if (isReadOnlySeg()) { + executed_as = Enums::SC_READONLY; + } else if (isKernArgSeg()) { + executed_as = Enums::SC_KERNARG; + } else if (isArgSeg()) { + executed_as = Enums::SC_ARG; + } } + const std::string& opcode() const { return _opcode; } protected: - const std::string opcode; + const std::string _opcode; std::string disassembly; int _instNum; int _instAddr; + int srcVecOperands; + int dstVecOperands; + int srcVecDWORDs; + int dstVecDWORDs; /** * Identifier of the immediate post-dominator instruction. */ @@ -262,9 +279,9 @@ class KernelLaunchStaticInst : public GPUStaticInst KernelLaunchStaticInst() : GPUStaticInst("kernel_launch") { setFlag(Nop); + setFlag(KernelLaunch); + setFlag(MemSync); setFlag(Scalar); - setFlag(Acquire); - setFlag(SystemScope); setFlag(GlobalSegment); } @@ -277,11 +294,14 @@ class KernelLaunchStaticInst : public GPUStaticInst void generateDisassembly() override { - disassembly = opcode; + disassembly = _opcode; } int getNumOperands() override { return 0; } - bool isCondRegister(int operandIndex) override { return false; } + bool isFlatScratchRegister(int opIdx) override { return false; } + // return true if the Execute mask is explicitly used as a source + // register operand + bool isExecMaskRegister(int opIdx) override { return false; } bool isScalarRegister(int operandIndex) override { return false; } bool isVectorRegister(int operandIndex) override { return false; } bool isSrcOperand(int operandIndex) override { return false; } @@ -296,7 +316,6 @@ class KernelLaunchStaticInst : public GPUStaticInst int numDstRegOperands() override { return 0; } int numSrcRegOperands() override { return 0; } - bool isValid() const override { return true; } int instSize() const override { return 0; } }; diff --git a/src/gpu-compute/gpu_tlb.cc b/src/gpu-compute/gpu_tlb.cc index 37a8b03a2..a37618d32 100644 --- a/src/gpu-compute/gpu_tlb.cc +++ b/src/gpu-compute/gpu_tlb.cc @@ -74,7 +74,6 @@ namespace X86ISA allocationPolicy = p->allocationPolicy; hasMemSidePort = false; accessDistance = p->accessDistance; - clock = p->clk_domain->clockPeriod(); tlb.assign(size, TlbEntry()); @@ -624,8 +623,8 @@ namespace X86ISA { bool delayedResponse; - return GpuTLB::translate(req, tc, nullptr, mode, delayedResponse, false, - latency); + return GpuTLB::translate(req, tc, nullptr, mode, delayedResponse, + false, latency); } void @@ -803,13 +802,13 @@ namespace X86ISA } /* - * We now know the TLB lookup outcome (if it's a hit or a miss), as well - * as the TLB access latency. + * We now know the TLB lookup outcome (if it's a hit or a miss), as + * well as the TLB access latency. 
* * We create and schedule a new TLBEvent which will help us take the - * appropriate actions (e.g., update TLB on a hit, send request to lower - * level TLB on a miss, or start a page walk if this was the last-level - * TLB) + * appropriate actions (e.g., update TLB on a hit, send request to + * lower level TLB on a miss, or start a page walk if this was the + * last-level TLB) */ TLBEvent *tlb_event = new TLBEvent(this, virt_page_addr, lookup_outcome, pkt); @@ -823,15 +822,15 @@ namespace X86ISA assert(tlb_event); DPRINTF(GPUTLB, "schedule translationReturnEvent @ curTick %d\n", - curTick() + this->ticks(hitLatency)); + curTick() + cyclesToTicks(Cycles(hitLatency))); - schedule(tlb_event, curTick() + this->ticks(hitLatency)); + schedule(tlb_event, curTick() + cyclesToTicks(Cycles(hitLatency))); } - GpuTLB::TLBEvent::TLBEvent(GpuTLB* _tlb, Addr _addr, tlbOutcome tlb_outcome, - PacketPtr _pkt) - : Event(CPU_Tick_Pri), tlb(_tlb), virtPageAddr(_addr), - outcome(tlb_outcome), pkt(_pkt) + GpuTLB::TLBEvent::TLBEvent(GpuTLB* _tlb, Addr _addr, + tlbOutcome tlb_outcome, PacketPtr _pkt) + : Event(CPU_Tick_Pri), tlb(_tlb), virtPageAddr(_addr), + outcome(tlb_outcome), pkt(_pkt) { } @@ -848,7 +847,8 @@ namespace X86ISA bool storeCheck = flags & (StoreCheck << FlagShift); // Do paging protection checks. - bool inUser = (m5Reg.cpl == 3 && !(flags & (CPL0FlagBit << FlagShift))); + bool inUser + = (m5Reg.cpl == 3 && !(flags & (CPL0FlagBit << FlagShift))); CR0 cr0 = tc->readMiscRegNoEffect(MISCREG_CR0); bool badWrite = (!tlb_entry->writable && (inUser || cr0.wp)); @@ -874,10 +874,9 @@ namespace X86ISA * The latter calls handelHit with TLB miss as tlbOutcome. */ void - GpuTLB::handleTranslationReturn(Addr virt_page_addr, tlbOutcome tlb_outcome, - PacketPtr pkt) + GpuTLB::handleTranslationReturn(Addr virt_page_addr, + tlbOutcome tlb_outcome, PacketPtr pkt) { - assert(pkt); Addr vaddr = pkt->req->getVaddr(); @@ -890,15 +889,18 @@ namespace X86ISA TlbEntry *local_entry, *new_entry; if (tlb_outcome == TLB_HIT) { - DPRINTF(GPUTLB, "Translation Done - TLB Hit for addr %#x\n", vaddr); + DPRINTF(GPUTLB, "Translation Done - TLB Hit for addr %#x\n", + vaddr); local_entry = sender_state->tlbEntry; } else { DPRINTF(GPUTLB, "Translation Done - TLB Miss for addr %#x\n", vaddr); - // We are returning either from a page walk or from a hit at a lower - // TLB level. The senderState should be "carrying" a pointer to the - // correct TLBEntry. + /** + * We are returning either from a page walk or from a hit at a + * lower TLB level. The senderState should be "carrying" a pointer + * to the correct TLBEntry. + */ new_entry = sender_state->tlbEntry; assert(new_entry); local_entry = new_entry; @@ -1024,7 +1026,8 @@ namespace X86ISA TLBEvent *tlb_event = translationReturnEvent[virtPageAddr]; assert(tlb_event); tlb_event->updateOutcome(PAGE_WALK); - schedule(tlb_event, curTick() + ticks(missLatency2)); + schedule(tlb_event, + curTick() + cyclesToTicks(Cycles(missLatency2))); } } else if (outcome == PAGE_WALK) { if (update_stats) @@ -1095,7 +1098,7 @@ namespace X86ISA return virtPageAddr; } - /* + /** * recvTiming receives a coalesced timing request from a TLBCoalescer * and it calls issueTLBLookup() * It only rejects the packet if we have exceeded the max @@ -1145,9 +1148,11 @@ namespace X86ISA DPRINTF(GPUTLB, "Functional Translation Done - TLB miss for addr " "%#x\n", vaddr); - // We are returning either from a page walk or from a hit at a lower - // TLB level. 
The senderState should be "carrying" a pointer to the - // correct TLBEntry. + /** + * We are returning either from a page walk or from a hit at a + * lower TLB level. The senderState should be "carrying" a pointer + * to the correct TLBEntry. + */ new_entry = sender_state->tlbEntry; assert(new_entry); local_entry = new_entry; @@ -1267,8 +1272,8 @@ namespace X86ISA } else { // If this was a prefetch, then do the normal thing if it // was a successful translation. Otherwise, send an empty - // TLB entry back so that it can be figured out as empty and - // handled accordingly. + // TLB entry back so that it can be figured out as empty + // and handled accordingly. if (pte) { DPRINTF(GPUTLB, "Mapping %#x to %#x\n", alignedVaddr, pte->paddr); @@ -1343,7 +1348,7 @@ namespace X86ISA assert(virt_page_addr == tlb_event->getTLBEventVaddr()); tlb_event->updateOutcome(MISS_RETURN); - tlb->schedule(tlb_event, curTick()+tlb->ticks(1)); + tlb->schedule(tlb_event, curTick()+tlb->clockPeriod()); return true; } @@ -1393,8 +1398,8 @@ namespace X86ISA tmp_access_info.sumDistance = 0; tmp_access_info.meanDistance = 0; - ret = TLBFootprint.insert(AccessPatternTable::value_type(virt_page_addr, - tmp_access_info)); + ret = TLBFootprint.insert( + AccessPatternTable::value_type(virt_page_addr, tmp_access_info)); bool first_page_access = ret.second; @@ -1428,74 +1433,74 @@ namespace X86ISA page_stat_file = simout.create(name().c_str())->stream(); // print header - *page_stat_file << "page,max_access_distance,mean_access_distance, " - << "stddev_distance" << std::endl; + *page_stat_file + << "page,max_access_distance,mean_access_distance, " + << "stddev_distance" << std::endl; } // update avg. reuse distance footprint - AccessPatternTable::iterator iter, iter_begin, iter_end; unsigned int sum_avg_reuse_distance_per_page = 0; // iterate through all pages seen by this TLB - for (iter = TLBFootprint.begin(); iter != TLBFootprint.end(); iter++) { - sum_avg_reuse_distance_per_page += iter->second.totalReuseDistance / - iter->second.accessesPerPage; + for (auto &iter : TLBFootprint) { + sum_avg_reuse_distance_per_page += iter.second.totalReuseDistance / + iter.second.accessesPerPage; if (accessDistance) { - unsigned int tmp = iter->second.localTLBAccesses[0]; + unsigned int tmp = iter.second.localTLBAccesses[0]; unsigned int prev = tmp; - for (int i = 0; i < iter->second.localTLBAccesses.size(); ++i) { + for (int i = 0; i < iter.second.localTLBAccesses.size(); ++i) { if (i) { tmp = prev + 1; } - prev = iter->second.localTLBAccesses[i]; + prev = iter.second.localTLBAccesses[i]; // update the localTLBAccesses value // with the actual differece - iter->second.localTLBAccesses[i] -= tmp; + iter.second.localTLBAccesses[i] -= tmp; // compute the sum of AccessDistance per page // used later for mean - iter->second.sumDistance += - iter->second.localTLBAccesses[i]; + iter.second.sumDistance += + iter.second.localTLBAccesses[i]; } - iter->second.meanDistance = - iter->second.sumDistance / iter->second.accessesPerPage; + iter.second.meanDistance = + iter.second.sumDistance / iter.second.accessesPerPage; // compute std_dev and max (we need a second round because we // need to know the mean value unsigned int max_distance = 0; unsigned int stddev_distance = 0; - for (int i = 0; i < iter->second.localTLBAccesses.size(); ++i) { + for (int i = 0; i < iter.second.localTLBAccesses.size(); ++i) { unsigned int tmp_access_distance = - iter->second.localTLBAccesses[i]; + iter.second.localTLBAccesses[i]; if (tmp_access_distance > max_distance) 
{ max_distance = tmp_access_distance; } unsigned int diff = - tmp_access_distance - iter->second.meanDistance; + tmp_access_distance - iter.second.meanDistance; stddev_distance += pow(diff, 2); } stddev_distance = - sqrt(stddev_distance/iter->second.accessesPerPage); + sqrt(stddev_distance/iter.second.accessesPerPage); if (page_stat_file) { - *page_stat_file << std::hex << iter->first << ","; + *page_stat_file << std::hex << iter.first << ","; *page_stat_file << std::dec << max_distance << ","; - *page_stat_file << std::dec << iter->second.meanDistance + *page_stat_file << std::dec << iter.second.meanDistance << ","; *page_stat_file << std::dec << stddev_distance; *page_stat_file << std::endl; } // erase the localTLBAccesses array - iter->second.localTLBAccesses.clear(); + iter.second.localTLBAccesses.clear(); } } diff --git a/src/gpu-compute/gpu_tlb.hh b/src/gpu-compute/gpu_tlb.hh index dbd3a16f3..9186b33fe 100644 --- a/src/gpu-compute/gpu_tlb.hh +++ b/src/gpu-compute/gpu_tlb.hh @@ -69,26 +69,7 @@ namespace X86ISA uint32_t configAddress; - // TLB clock: will inherit clock from shader's clock period in terms - // of nuber of ticks of curTime (aka global simulation clock) - // The assignment of TLB clock from shader clock is done in the python - // config files. - int clock; - public: - // clock related functions ; maps to-and-from Simulation ticks and - // object clocks. - Tick frequency() const { return SimClock::Frequency / clock; } - - Tick - ticks(int numCycles) const - { - return (Tick)clock * numCycles; - } - - Tick curCycle() const { return curTick() / clock; } - Tick tickToCycles(Tick val) const { return val / clock;} - typedef X86GPUTLBParams Params; GpuTLB(const Params *p); ~GpuTLB(); diff --git a/src/gpu-compute/hsa_queue_entry.hh b/src/gpu-compute/hsa_queue_entry.hh new file mode 100644 index 000000000..a6917db3e --- /dev/null +++ b/src/gpu-compute/hsa_queue_entry.hh @@ -0,0 +1,467 @@ +/* + * Copyright (c) 2017-2018 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Authors: Anthony Gutierrez + */ + +/** + * @file + * HSAQueuEntry is the simulator's internal representation of an + * AQL queue entry (task). It encasulates all of the relevant info + * about a task, which is gathered from various runtime data + * structures including: the AQL MQD, the AQL packet, and the code + * object. + */ + +#ifndef __GPU_COMPUTE_HSA_QUEUE_ENTRY__ +#define __GPU_COMPUTE_HSA_QUEUE_ENTRY__ + +#include +#include +#include +#include +#include + +#include "base/intmath.hh" +#include "base/types.hh" +#include "dev/hsa/hsa_packet.hh" +#include "dev/hsa/hsa_queue.hh" +#include "gpu-compute/kernel_code.hh" + +class HSAQueueEntry +{ + public: + HSAQueueEntry(std::string kernel_name, uint32_t queue_id, + int dispatch_id, void *disp_pkt, AMDKernelCode *akc, + Addr host_pkt_addr, Addr code_addr) + : kernName(kernel_name), + _wgSize{{(int)((_hsa_dispatch_packet_t*)disp_pkt)->workgroup_size_x, + (int)((_hsa_dispatch_packet_t*)disp_pkt)->workgroup_size_y, + (int)((_hsa_dispatch_packet_t*)disp_pkt)->workgroup_size_z}}, + _gridSize{{(int)((_hsa_dispatch_packet_t*)disp_pkt)->grid_size_x, + (int)((_hsa_dispatch_packet_t*)disp_pkt)->grid_size_y, + (int)((_hsa_dispatch_packet_t*)disp_pkt)->grid_size_z}}, + numVgprs(akc->workitem_vgpr_count), + numSgprs(akc->wavefront_sgpr_count), + _queueId(queue_id), _dispatchId(dispatch_id), dispPkt(disp_pkt), + _hostDispPktAddr(host_pkt_addr), + _completionSignal(((_hsa_dispatch_packet_t*)disp_pkt) + ->completion_signal), + codeAddress(code_addr), + kernargAddress(((_hsa_dispatch_packet_t*)disp_pkt)->kernarg_address), + _outstandingInvs(-1), _outstandingWbs(0), + _ldsSize((int)((_hsa_dispatch_packet_t*)disp_pkt)-> + group_segment_size), + _privMemPerItem((int)((_hsa_dispatch_packet_t*)disp_pkt)-> + private_segment_size), + _contextId(0), _wgId{{ 0, 0, 0 }}, + _numWgTotal(1), numWgArrivedAtBarrier(0), _numWgCompleted(0), + _globalWgId(0), dispatchComplete(false) + + { + initialVgprState.reset(); + initialSgprState.reset(); + + for (int i = 0; i < MAX_DIM; ++i) { + _numWg[i] = divCeil(_gridSize[i], _wgSize[i]); + _numWgTotal *= _numWg[i]; + } + + parseKernelCode(akc); + } + + const std::string& + kernelName() const + { + return kernName; + } + + int + wgSize(int dim) const + { + assert(dim < MAX_DIM); + return _wgSize[dim]; + } + + int + gridSize(int dim) const + { + assert(dim < MAX_DIM); + return _gridSize[dim]; + } + + int + numVectorRegs() const + { + return numVgprs; + } + + int + numScalarRegs() const + { + return numSgprs; + } + + uint32_t + queueId() const + { + return _queueId; + } + + int + dispatchId() const + { + return _dispatchId; + } + + void* + dispPktPtr() + { + return dispPkt; + } + + Addr + hostDispPktAddr() const + { + return _hostDispPktAddr; + } + + Addr + completionSignal() const + { + return _completionSignal; + } + + Addr + codeAddr() const + { + return codeAddress; + } + + Addr + kernargAddr() const + { + return kernargAddress; + } + + int + ldsSize() const + { + return _ldsSize; + } + + 
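+    // per-work-item private (scratch) segment size in bytes, taken
+    // from the AQL dispatch packet's private_segment_size field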
int privMemPerItem() const { return _privMemPerItem; } + + int + contextId() const + { + return _contextId; + } + + bool + dispComplete() const + { + return dispatchComplete; + } + + int + wgId(int dim) const + { + assert(dim < MAX_DIM); + return _wgId[dim]; + } + + void + wgId(int dim, int val) + { + assert(dim < MAX_DIM); + _wgId[dim] = val; + } + + int + globalWgId() const + { + return _globalWgId; + } + + void + globalWgId(int val) + { + _globalWgId = val; + } + + int + numWg(int dim) const + { + assert(dim < MAX_DIM); + return _numWg[dim]; + } + + void + notifyWgCompleted() + { + ++_numWgCompleted; + } + + int + numWgCompleted() const + { + return _numWgCompleted; + } + + int + numWgTotal() const + { + return _numWgTotal; + } + + void + markWgDispatch() + { + ++_wgId[0]; + ++_globalWgId; + + if (wgId(0) * wgSize(0) >= gridSize(0)) { + _wgId[0] = 0; + ++_wgId[1]; + + if (wgId(1) * wgSize(1) >= gridSize(1)) { + _wgId[1] = 0; + ++_wgId[2]; + + if (wgId(2) * wgSize(2) >= gridSize(2)) { + dispatchComplete = true; + } + } + } + } + + int + numWgAtBarrier() const + { + return numWgArrivedAtBarrier; + } + + bool vgprBitEnabled(int bit) const + { + return initialVgprState.test(bit); + } + + bool sgprBitEnabled(int bit) const + { + return initialSgprState.test(bit); + } + + /** + * Host-side addr of the amd_queue_t on which + * this task was queued. + */ + Addr hostAMDQueueAddr; + + /** + * Keep a copy of the AMD HSA queue because we + * need info from some of its fields to initialize + * register state. + */ + _amd_queue_t amdQueue; + + // the maximum number of dimensions for a grid or workgroup + const static int MAX_DIM = 3; + + /* getter */ + int + outstandingInvs() { + return _outstandingInvs; + } + + /** + * Whether invalidate has started or finished -1 is the + * initial value indicating inv has not started for the + * kernel. + */ + bool + isInvStarted() + { + return (_outstandingInvs != -1); + } + + /** + * update the number of pending invalidate requests + * + * val: negative to decrement, positive to increment + */ + void + updateOutstandingInvs(int val) + { + _outstandingInvs += val; + assert(_outstandingInvs >= 0); + } + + /** + * Forcefully change the state to be inv done. + */ + void + markInvDone() + { + _outstandingInvs = 0; + } + + /** + * Is invalidate done? + */ + bool + isInvDone() const + { + assert(_outstandingInvs >= 0); + return (_outstandingInvs == 0); + } + + int + outstandingWbs() const + { + return _outstandingWbs; + } + + /** + * Update the number of pending writeback requests. 
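+     * For example, the code that issues an end-of-kernel cache
+     * flush would be expected to increment this once per writeback
+     * request and decrement it as each one completes.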
+ * + * val: negative to decrement, positive to increment + */ + void + updateOutstandingWbs(int val) + { + _outstandingWbs += val; + assert(_outstandingWbs >= 0); + } + + private: + void + parseKernelCode(AMDKernelCode *akc) + { + /** set the enable bits for the initial SGPR state */ + initialSgprState.set(PrivateSegBuf, + akc->enable_sgpr_private_segment_buffer); + initialSgprState.set(DispatchPtr, + akc->enable_sgpr_dispatch_ptr); + initialSgprState.set(QueuePtr, + akc->enable_sgpr_queue_ptr); + initialSgprState.set(KernargSegPtr, + akc->enable_sgpr_kernarg_segment_ptr); + initialSgprState.set(DispatchId, + akc->enable_sgpr_dispatch_id); + initialSgprState.set(FlatScratchInit, + akc->enable_sgpr_flat_scratch_init); + initialSgprState.set(PrivateSegSize, + akc->enable_sgpr_private_segment_size); + initialSgprState.set(GridWorkgroupCountX, + akc->enable_sgpr_grid_workgroup_count_x); + initialSgprState.set(GridWorkgroupCountY, + akc->enable_sgpr_grid_workgroup_count_y); + initialSgprState.set(GridWorkgroupCountZ, + akc->enable_sgpr_grid_workgroup_count_z); + initialSgprState.set(WorkgroupIdX, + akc->enable_sgpr_workgroup_id_x); + initialSgprState.set(WorkgroupIdY, + akc->enable_sgpr_workgroup_id_y); + initialSgprState.set(WorkgroupIdZ, + akc->enable_sgpr_workgroup_id_z); + initialSgprState.set(WorkgroupInfo, + akc->enable_sgpr_workgroup_info); + initialSgprState.set(PrivSegWaveByteOffset, + akc->enable_sgpr_private_segment_wave_byte_offset); + + /** + * set the enable bits for the initial VGPR state. the + * workitem Id in the X dimension is always initialized. + */ + initialVgprState.set(WorkitemIdX, true); + initialVgprState.set(WorkitemIdY, akc->enable_vgpr_workitem_id_y); + initialVgprState.set(WorkitemIdZ, akc->enable_vgpr_workitem_id_z); + } + + // name of the kernel associated with the AQL entry + std::string kernName; + // workgroup Size (3 dimensions) + std::array _wgSize; + // grid Size (3 dimensions) + std::array _gridSize; + // total number of VGPRs per work-item + int numVgprs; + // total number of SGPRs per wavefront + int numSgprs; + // id of AQL queue in which this entry is placed + uint32_t _queueId; + int _dispatchId; + // raw AQL packet pointer + void *dispPkt; + // host-side addr of the dispatch packet + Addr _hostDispPktAddr; + // pointer to bool + Addr _completionSignal; + // base address of the raw machine code + Addr codeAddress; + // base address of the kernel args + Addr kernargAddress; + /** + * Number of outstanding invs for the kernel. 
+ * values: + * -1: initial value, invalidate has not started for the kernel + * 0: 1)-1->0, about to start (a transient state, added in the same cycle) + * 2)+1->0, all inv requests are finished, i.e., invalidate done + * ?: positive value, indicating the number of pending inv requests + */ + int _outstandingInvs; + /** + * Number of outstanding wbs for the kernel + * values: + * 0: 1)initial value, flush has not started for the kernel + * 2)+1->0: all wb requests are finished, i.e., flush done + * ?: positive value, indicating the number of pending wb requests + */ + int _outstandingWbs; + int _ldsSize; + int _privMemPerItem; + int _contextId; + std::array _wgId; + std::array _numWg; + int _numWgTotal; + int numWgArrivedAtBarrier; + // The number of completed work groups + int _numWgCompleted; + int _globalWgId; + bool dispatchComplete; + + std::bitset initialVgprState; + std::bitset initialSgprState; +}; + +#endif // __GPU_COMPUTE_HSA_QUEUE_ENTRY__ diff --git a/src/gpu-compute/kernel_code.hh b/src/gpu-compute/kernel_code.hh new file mode 100644 index 000000000..b3560c7e5 --- /dev/null +++ b/src/gpu-compute/kernel_code.hh @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2015-2017 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Authors: Anthony Gutierrez + */ + +#ifndef __GPU_COMPUTE_KERNEL_CODE_HH__ +#define __GPU_COMPUTE_KERNEL_CODE_HH__ + +#include +#include + +/** + * these enums represent the indices into the + * initialRegState bitfields in HsaKernelInfo. + * each bit specifies whether or not the + * particular piece of state that the bit + * corresponds to should be initialized into + * the VGPRs/SGPRs. the order in which the + * fields are placed matters, as all enabled + * pieces of state will be initialized into + * contiguous registers in the same order + * as their position in the bitfield - which + * is specified in the HSA ABI. 
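+ * For example, if a kernel enables only DispatchPtr and
+ * KernargSegPtr below, the 64b dispatch packet pointer would be
+ * preloaded into the first user SGPR pair and the 64b kernarg
+ * pointer into the next pair (an illustrative reading of the
+ * packing rule described above).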
+ */ +enum ScalarRegInitFields : int +{ + PrivateSegBuf = 0, + DispatchPtr = 1, + QueuePtr = 2, + KernargSegPtr = 3, + DispatchId = 4, + FlatScratchInit = 5, + PrivateSegSize = 6, + GridWorkgroupCountX = 7, + GridWorkgroupCountY = 8, + GridWorkgroupCountZ = 9, + WorkgroupIdX = 10, + WorkgroupIdY = 11, + WorkgroupIdZ = 12, + WorkgroupInfo = 13, + PrivSegWaveByteOffset = 14, + NumScalarInitFields = 15 +}; + +enum VectorRegInitFields : int +{ + WorkitemIdX = 0, + WorkitemIdY = 1, + WorkitemIdZ = 2, + NumVectorInitFields = 3 +}; + +struct AMDKernelCode +{ + uint32_t amd_kernel_code_version_major; + uint32_t amd_kernel_code_version_minor; + uint16_t amd_machine_kind; + uint16_t amd_machine_version_major; + uint16_t amd_machine_version_minor; + uint16_t amd_machine_version_stepping; + int64_t kernel_code_entry_byte_offset; + int64_t kernel_code_prefetch_byte_offset; + uint64_t kernel_code_prefetch_byte_size; + uint64_t max_scratch_backing_memory_byte_size; + + /** + * The fields below are used to set program settings for + * compute shaders. Here they are primarily used to setup + * initial register state. See the following for full details + * about kernel launch, state initialization, and the AMD kernel + * code object: https://github.com/RadeonOpenCompute/ROCm_Documentation/ + * blob/master/ROCm_Compiler_SDK/ROCm-Codeobj-format.rst + * #initial-kernel-register-state + */ + + // the 32b below here represent the fields of + // the COMPUTE_PGM_RSRC1 register + uint32_t granulated_workitem_vgpr_count : 6; + uint32_t granulated_wavefront_sgpr_count : 4; + uint32_t priority : 2; + uint32_t float_mode_round_32 : 2; + uint32_t float_mode_round_16_64 : 2; + uint32_t float_mode_denorm_32 : 2; + uint32_t float_mode_denorm_16_64 : 2; + uint32_t priv : 1; + uint32_t enable_dx10_clamp : 1; + uint32_t debug_mode : 1; + uint32_t enable_ieee_mode : 1; + uint32_t bulky : 1; + uint32_t cdbg_user : 1; + uint32_t compute_pgm_rsrc1_reserved : 6; + // end COMPUTE_PGM_RSRC1 register + + // the 32b below here represent the fields of + // the COMPUTE_PGM_RSRC2 register + uint32_t enable_sgpr_private_segment_wave_byte_offset : 1; + uint32_t user_sgpr_count : 5; + uint32_t enable_trap_handler : 1; + uint32_t enable_sgpr_workgroup_id_x : 1; + uint32_t enable_sgpr_workgroup_id_y : 1; + uint32_t enable_sgpr_workgroup_id_z : 1; + uint32_t enable_sgpr_workgroup_info : 1; + uint32_t enable_vgpr_workitem_id_y : 1; + uint32_t enable_vgpr_workitem_id_z : 1; + uint32_t enable_exception_address_watch : 1; + uint32_t enable_exception_memory_violation : 1; + uint32_t granulated_lds_size : 9; + uint32_t enable_exception_ieee_754_fp_invalid_operation : 1; + uint32_t enable_exception_fp_denormal_source : 1; + uint32_t enable_exception_ieee_754_fp_division_by_zero : 1; + uint32_t enable_exception_ieee_754_fp_overflow : 1; + uint32_t enable_exception_ieee_754_fp_underflow : 1; + uint32_t enable_exception_ieee_754_fp_inexact : 1; + uint32_t enable_exception_int_divide_by_zero : 1; + uint32_t compute_pgm_rsrc2_reserved : 1; + // end COMPUTE_PGM_RSRC2 + + // the 32b below here represent the fields of + // KERNEL_CODE_PROPERTIES + uint32_t enable_sgpr_private_segment_buffer : 1; + uint32_t enable_sgpr_dispatch_ptr : 1; + uint32_t enable_sgpr_queue_ptr : 1; + uint32_t enable_sgpr_kernarg_segment_ptr : 1; + uint32_t enable_sgpr_dispatch_id : 1; + uint32_t enable_sgpr_flat_scratch_init : 1; + uint32_t enable_sgpr_private_segment_size : 1; + uint32_t enable_sgpr_grid_workgroup_count_x : 1; + uint32_t enable_sgpr_grid_workgroup_count_y : 1; + 
uint32_t enable_sgpr_grid_workgroup_count_z : 1; + uint32_t kernel_code_properties_reserved1 : 6; + uint32_t enable_ordered_append_gds : 1; + uint32_t private_element_size : 2; + uint32_t is_ptr64 : 1; + uint32_t is_dynamic_callstack : 1; + uint32_t is_debug_enabled : 1; + uint32_t is_xnack_enabled : 1; + uint32_t kernel_code_properties_reserved2 : 9; + // end KERNEL_CODE_PROPERTIES + + uint32_t workitem_private_segment_byte_size; + uint32_t workgroup_group_segment_byte_size; + uint32_t gds_segment_byte_size; + uint64_t kernarg_segment_byte_size; + uint32_t workgroup_fbarrier_count; + uint16_t wavefront_sgpr_count; + uint16_t workitem_vgpr_count; + uint16_t reserved_vgpr_first; + uint16_t reserved_vgpr_count; + uint16_t reserved_sgpr_first; + uint16_t reserved_sgpr_count; + uint16_t debug_wavefront_private_segment_offset_sgpr; + uint16_t debug_private_segment_buffer_sgpr; + uint8_t kernarg_segment_alignment; + uint8_t group_segment_alignment; + uint8_t private_segment_alignment; + uint8_t wavefront_size; + int32_t call_convention; + uint8_t reserved[12]; + uint64_t runtime_loader_kernel_symbol; + uint64_t control_directives[16]; +}; + +#endif // __GPU_COMPUTE_KERNEL_CODE_HH__ diff --git a/src/gpu-compute/lds_state.cc b/src/gpu-compute/lds_state.cc index d56562b79..58c5d986e 100644 --- a/src/gpu-compute/lds_state.cc +++ b/src/gpu-compute/lds_state.cc @@ -210,8 +210,8 @@ LdsState::processPacket(PacketPtr packet) parent->loadBusLength(); // delay for accessing the LDS Tick processingTime = - parent->shader->ticks(bankConflicts * bankConflictPenalty) + - parent->shader->ticks(busLength); + parent->cyclesToTicks(Cycles(bankConflicts * bankConflictPenalty)) + + parent->cyclesToTicks(Cycles(busLength)); // choose (delay + last packet in queue) or (now + delay) as the time to // return this Tick doneAt = earliestReturnTime() + processingTime; diff --git a/src/gpu-compute/lds_state.hh b/src/gpu-compute/lds_state.hh index c4934a657..58171e30c 100644 --- a/src/gpu-compute/lds_state.hh +++ b/src/gpu-compute/lds_state.hh @@ -41,7 +41,6 @@ #include #include -#include "enums/MemType.hh" #include "gpu-compute/misc.hh" #include "mem/port.hh" #include "params/LdsState.hh" @@ -50,8 +49,8 @@ class ComputeUnit; /** - * this represents a slice of the overall LDS, intended to be associated with an - * individual workgroup + * this represents a slice of the overall LDS, intended to be associated with + * an individual workgroup */ class LdsChunk { @@ -71,7 +70,8 @@ class LdsChunk read(const uint32_t index) { fatal_if(!chunk.size(), "cannot read from an LDS chunk of size 0"); - fatal_if(index >= chunk.size(), "out-of-bounds access to an LDS chunk"); + fatal_if(index >= chunk.size(), "out-of-bounds access to an LDS " + "chunk"); T *p0 = (T *) (&(chunk.at(index))); return *p0; } @@ -84,7 +84,8 @@ class LdsChunk write(const uint32_t index, const T value) { fatal_if(!chunk.size(), "cannot write to an LDS chunk of size 0"); - fatal_if(index >= chunk.size(), "out-of-bounds access to an LDS chunk"); + fatal_if(index >= chunk.size(), "out-of-bounds access to an LDS " + "chunk"); T *p0 = (T *) (&(chunk.at(index))); *p0 = value; } @@ -203,14 +204,16 @@ class LdsState: public ClockedObject protected: - // the lds reference counter - // The key is the workgroup ID and dispatch ID - // The value is the number of wavefronts that reference this LDS, as - // wavefronts are launched, the counter goes up for that workgroup and when - // they return it decreases, once it reaches 0 then this chunk of the LDS is - // returned to the 
available pool. However,it is deallocated on the 1->0 - // transition, not whenever the counter is 0 as it always starts with 0 when - // the workgroup asks for space + /** + * the lds reference counter + * The key is the workgroup ID and dispatch ID + * The value is the number of wavefronts that reference this LDS, as + * wavefronts are launched, the counter goes up for that workgroup and when + * they return it decreases, once it reaches 0 then this chunk of the LDS + * is returned to the available pool. However,it is deallocated on the 1->0 + * transition, not whenever the counter is 0 as it always starts with 0 + * when the workgroup asks for space + */ std::unordered_map> refCounter; @@ -356,22 +359,41 @@ class LdsState: public ClockedObject const uint32_t size) { if (chunkMap.find(dispatchId) != chunkMap.end()) { - fatal_if( + panic_if( chunkMap[dispatchId].find(wgId) != chunkMap[dispatchId].end(), "duplicate workgroup ID asking for space in the LDS " "did[%d] wgid[%d]", dispatchId, wgId); } - fatal_if(bytesAllocated + size > maximumSize, - "request would ask for more space than is available"); + if (bytesAllocated + size > maximumSize) { + return nullptr; + } else { + bytesAllocated += size; + + auto value = chunkMap[dispatchId].emplace(wgId, LdsChunk(size)); + panic_if(!value.second, "was unable to allocate a new chunkMap"); + + // make an entry for this workgroup + refCounter[dispatchId][wgId] = 0; - bytesAllocated += size; + return &chunkMap[dispatchId][wgId]; + } + } + + /* + * return pointer to lds chunk for wgid + */ + LdsChunk * + getLdsChunk(const uint32_t dispatchId, const uint32_t wgId) + { + fatal_if(chunkMap.find(dispatchId) == chunkMap.end(), + "fetch for unknown dispatch ID did[%d]", dispatchId); - chunkMap[dispatchId].emplace(wgId, LdsChunk(size)); - // make an entry for this workgroup - refCounter[dispatchId][wgId] = 0; + fatal_if(chunkMap[dispatchId].find(wgId) == chunkMap[dispatchId].end(), + "fetch for unknown workgroup ID wgid[%d] in dispatch ID did[%d]", + wgId, dispatchId); - return &chunkMap[dispatchId][wgId]; + return &chunkMap[dispatchId][wgId]; } bool diff --git a/src/gpu-compute/local_memory_pipeline.cc b/src/gpu-compute/local_memory_pipeline.cc index 68c5afa4a..b31ed6f4a 100644 --- a/src/gpu-compute/local_memory_pipeline.cc +++ b/src/gpu-compute/local_memory_pipeline.cc @@ -33,6 +33,7 @@ #include "gpu-compute/local_memory_pipeline.hh" +#include "debug/GPUMem.hh" #include "debug/GPUPort.hh" #include "gpu-compute/compute_unit.hh" #include "gpu-compute/gpu_dyn_inst.hh" @@ -62,24 +63,31 @@ LocalMemPipeline::exec() bool accessVrf = true; Wavefront *w = nullptr; - if ((m) && (m->isLoad() || m->isAtomicRet())) { + if ((m) && m->latency.rdy() && (m->isLoad() || m->isAtomicRet())) { w = m->wavefront(); - accessVrf = - w->computeUnit->vrf[w->simdId]-> - vrfOperandAccessReady(m->seqNum(), w, m, - VrfAccessType::WRITE); + accessVrf = w->computeUnit->vrf[w->simdId]-> + canScheduleWriteOperandsFromLoad(w, m); + } if (!lmReturnedRequests.empty() && m->latency.rdy() && accessVrf && - computeUnit->locMemToVrfBus.rdy() && (computeUnit->shader->coissue_return - || computeUnit->wfWait.at(m->pipeId).rdy())) { + computeUnit->locMemToVrfBus.rdy() + && (computeUnit->shader->coissue_return + || computeUnit->vectorSharedMemUnit.rdy())) { lmReturnedRequests.pop(); w = m->wavefront(); + DPRINTF(GPUMem, "CU%d: WF[%d][%d]: Completing local mem instr %s\n", + m->cu_id, m->simdId, m->wfSlotId, m->disassemble()); m->completeAcc(m); + if (m->isLoad() || m->isAtomicRet()) { + 
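            // Loads and atomics-with-return carry data destined for the VRF;
            // canScheduleWriteOperandsFromLoad() was consulted at the top of
            // exec(), so the register file can now queue the writeback of the
            // returned values.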
w->computeUnit->vrf[w->simdId]-> + scheduleWriteOperandsFromLoad(w, m); + } + // Decrement outstanding request count computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1); @@ -96,7 +104,7 @@ LocalMemPipeline::exec() // Mark write bus busy for appropriate amount of time computeUnit->locMemToVrfBus.set(m->time); if (computeUnit->shader->coissue_return == 0) - w->computeUnit->wfWait.at(m->pipeId).set(m->time); + w->computeUnit->vectorSharedMemUnit.set(m->time); } // If pipeline has executed a local memory instruction @@ -114,6 +122,13 @@ LocalMemPipeline::exec() } } +void +LocalMemPipeline::issueRequest(GPUDynInstPtr gpuDynInst) +{ + gpuDynInst->setAccessTime(curTick()); + lmIssuedRequests.push(gpuDynInst); +} + void LocalMemPipeline::regStats() { diff --git a/src/gpu-compute/local_memory_pipeline.hh b/src/gpu-compute/local_memory_pipeline.hh index dba938d6a..d9ab485b2 100644 --- a/src/gpu-compute/local_memory_pipeline.hh +++ b/src/gpu-compute/local_memory_pipeline.hh @@ -58,10 +58,11 @@ class LocalMemPipeline LocalMemPipeline(const ComputeUnitParams *params); void init(ComputeUnit *cu); void exec(); - - std::queue &getLMReqFIFO() { return lmIssuedRequests; } std::queue &getLMRespFIFO() { return lmReturnedRequests; } + void issueRequest(GPUDynInstPtr gpuDynInst); + + bool isLMRespFIFOWrRdy() const { diff --git a/src/gpu-compute/misc.hh b/src/gpu-compute/misc.hh index 731a9977a..0b573e8fe 100644 --- a/src/gpu-compute/misc.hh +++ b/src/gpu-compute/misc.hh @@ -39,34 +39,62 @@ #include #include "base/logging.hh" +#include "sim/clocked_object.hh" class GPUDynInst; -typedef std::bitset::digits> VectorMask; +typedef std::bitset::digits> + VectorMask; typedef std::shared_ptr GPUDynInstPtr; +enum InstMemoryHop : int { + Initiate = 0, + CoalsrSend = 1, + CoalsrRecv = 2, + GMEnqueue = 3, + Complete = 4, + InstMemoryHopMax = 5 +}; + +enum BlockMemoryHop : int { + BlockSend = 0, + BlockRecv = 1 +}; + class WaitClass { public: - WaitClass() : nxtAvail(0), lookAheadAvail(0), tcnt(0) { } - void init(uint64_t *_tcnt, uint32_t _numStages=0) + WaitClass() : nxtAvail(0), lookAheadAvail(0), clockedObject(nullptr) { } + + WaitClass(ClockedObject *_clockedObject, uint64_t _numStages=0) + : nxtAvail(0), lookAheadAvail(0), clockedObject(_clockedObject), + numStages(_numStages) { } + + void init(ClockedObject *_clockedObject, uint64_t _numStages=0) { - tcnt = _tcnt; + clockedObject = _clockedObject; numStages = _numStages; } - void set(uint32_t i) + void set(uint64_t i) { - fatal_if(nxtAvail > *tcnt, + fatal_if(nxtAvail > clockedObject->clockEdge(), "Can't allocate resource because it is busy!!!"); - nxtAvail = *tcnt + i; + nxtAvail = clockedObject->clockEdge() + i; + } + void preset(uint64_t delay) + { + lookAheadAvail = std::max(lookAheadAvail, delay + + (clockedObject->clockEdge()) - numStages); + } + bool rdy(Cycles cycles = Cycles(0)) const + { + return clockedObject->clockEdge(cycles) >= nxtAvail; } - void preset(uint32_t delay) + bool prerdy() const { - lookAheadAvail = std::max(lookAheadAvail, delay + (*tcnt) - numStages); + return clockedObject->clockEdge() >= lookAheadAvail; } - bool rdy() const { return *tcnt >= nxtAvail; } - bool prerdy() const { return *tcnt >= lookAheadAvail; } private: // timestamp indicating when resource will be available @@ -75,11 +103,11 @@ class WaitClass // pending uses of the resource (when there is a cycle gap between // rdy() and set() uint64_t lookAheadAvail; - // current timestamp - uint64_t *tcnt; + // clockedObject for current timestamp + ClockedObject 
*clockedObject; // number of stages between checking if a resource is ready and // setting the resource's utilization - uint32_t numStages; + uint64_t numStages; }; class Float16 @@ -93,7 +121,7 @@ class Float16 Float16(float x) { - uint32_t ai = *(uint32_t *)&x; + uint32_t ai = *(reinterpret_cast(&x)); uint32_t s = (ai >> 31) & 0x1; uint32_t exp = (ai >> 23) & 0xff; @@ -139,7 +167,7 @@ class Float16 val1 |= (exp << 23); val1 |= (mant << 13); - return *(float*)&val1; + return *(reinterpret_cast(&val1)); } }; diff --git a/src/gpu-compute/pool_manager.cc b/src/gpu-compute/pool_manager.cc index 890e0d112..6c95ca25a 100644 --- a/src/gpu-compute/pool_manager.cc +++ b/src/gpu-compute/pool_manager.cc @@ -33,8 +33,8 @@ #include "gpu-compute/pool_manager.hh" -PoolManager::PoolManager(uint32_t minAlloc, uint32_t poolSize) - : _minAllocation(minAlloc), _poolSize(poolSize) +PoolManager::PoolManager(const PoolManagerParams *p) + : SimObject(p), _minAllocation(p->min_alloc), _poolSize(p->pool_size) { - assert(poolSize > 0); + assert(_poolSize > 0); } diff --git a/src/gpu-compute/pool_manager.hh b/src/gpu-compute/pool_manager.hh index bab8b6ddf..9bbaa6459 100644 --- a/src/gpu-compute/pool_manager.hh +++ b/src/gpu-compute/pool_manager.hh @@ -38,11 +38,15 @@ #include #include +#include "params/PoolManager.hh" +#include "sim/sim_object.hh" + // Pool Manager Logic -class PoolManager +class PoolManager : public SimObject { public: - PoolManager(uint32_t minAlloc, uint32_t poolSize); + PoolManager(const PoolManagerParams *p); + virtual ~PoolManager() { _poolSize = 0; } uint32_t minAllocation() { return _minAllocation; } virtual std::string printRegion() = 0; virtual uint32_t regionSize(std::pair ®ion) = 0; diff --git a/src/gpu-compute/register_file.cc b/src/gpu-compute/register_file.cc new file mode 100644 index 000000000..eb6474cd2 --- /dev/null +++ b/src/gpu-compute/register_file.cc @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2015-2017 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Authors: John Kalamatianos, + * Mark Wyse + */ + +#include "gpu-compute/register_file.hh" + +#include +#include + +#include "base/intmath.hh" +#include "base/logging.hh" +#include "debug/GPURF.hh" +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/shader.hh" +#include "gpu-compute/wavefront.hh" +#include "params/RegisterFile.hh" + +RegisterFile::RegisterFile(const RegisterFileParams *p) + : SimObject(p), simdId(p->simd_id), _numRegs(p->num_regs) +{ + fatal_if((_numRegs % 2) != 0, "VRF size is illegal\n"); + fatal_if(simdId < 0, "Illegal SIMD id for VRF"); + + busy.clear(); + busy.resize(_numRegs, 0); +} + +RegisterFile::~RegisterFile() +{ +} + +void +RegisterFile::setParent(ComputeUnit *_computeUnit) +{ + computeUnit = _computeUnit; +} + +std::string +RegisterFile::dump() const +{ + std::stringstream ss; + ss << "Busy: "; + for (int i = 0; i < busy.size(); i++) { + ss << (int)busy[i]; + } + ss << "\n"; + return ss.str(); +} + +// Scoreboard functions + +bool +RegisterFile::operandsReady(Wavefront *w, GPUDynInstPtr ii) const +{ + return true; +} + +bool +RegisterFile::regBusy(int idx) const +{ + return busy.at(idx); +} + +void +RegisterFile::markReg(int regIdx, bool value) +{ + DPRINTF(GPURF, "SIMD[%d] markReg(): physReg[%d] = %d\n", + simdId, regIdx, (int)value); + busy.at(regIdx) = value; +} + +void +RegisterFile::enqRegFreeEvent(uint32_t regIdx, uint64_t delay) +{ + DPRINTF(GPURF, "SIMD[%d] enqRegFreeEvent physReg[%d] at %llu\n", + simdId, regIdx, curTick() + delay); + schedule(new MarkRegFreeScbEvent(this, regIdx), + curTick() + delay); +} + +void +RegisterFile::enqRegBusyEvent(uint32_t regIdx, uint64_t delay) +{ + DPRINTF(GPURF, "SIMD[%d] enqRegBusyEvent physReg[%d] at %llu\n", + simdId, regIdx, curTick() + delay); + schedule(new MarkRegBusyScbEvent(this, regIdx), + curTick() + delay); +} + +// Schedule functions +bool +RegisterFile::canScheduleReadOperands(Wavefront *w, GPUDynInstPtr ii) +{ + return true; +} + +void +RegisterFile::scheduleReadOperands(Wavefront *w, GPUDynInstPtr ii) +{ +} + +bool +RegisterFile::canScheduleWriteOperands(Wavefront *w, GPUDynInstPtr ii) +{ + return true; +} + +void +RegisterFile::scheduleWriteOperands(Wavefront *w, GPUDynInstPtr ii) +{ +} + +bool +RegisterFile::canScheduleWriteOperandsFromLoad(Wavefront *w, GPUDynInstPtr ii) +{ + return true; +} + +void +RegisterFile::scheduleWriteOperandsFromLoad(Wavefront *w, GPUDynInstPtr ii) +{ +} + +bool +RegisterFile::operandReadComplete(Wavefront *w, GPUDynInstPtr ii) +{ + return true; +} + +// Exec functions +void +RegisterFile::exec() +{ +} + +void +RegisterFile::waveExecuteInst(Wavefront *w, GPUDynInstPtr ii) +{ +} + +RegisterFile* +RegisterFileParams::create() +{ + return new RegisterFile(this); +} + +// Events + +// Mark a register as free in the scoreboard/busy vector +void +RegisterFile::MarkRegFreeScbEvent::process() +{ + rf->markReg(regIdx, false); +} + +// Mark a register as busy in the scoreboard/busy vector 
+void +RegisterFile::MarkRegBusyScbEvent::process() +{ + rf->markReg(regIdx, true); +} + +void +RegisterFile::dispatchInstruction(GPUDynInstPtr ii) +{ +} + +void +RegisterFile::regStats() +{ + registerReads + .name(name() + ".register_reads") + .desc("Total number of DWORDs read from register file") + ; + + registerWrites + .name(name() + ".register_writes") + .desc("Total number of DWORDS written to register file") + ; + + sramReads + .name(name() + ".sram_reads") + .desc("Total number of register file bank SRAM activations for reads") + ; + + sramWrites + .name(name() + ".sram_writes") + .desc("Total number of register file bank SRAM activations for writes") + ; +} diff --git a/src/gpu-compute/register_file.hh b/src/gpu-compute/register_file.hh new file mode 100644 index 000000000..4bd705a5e --- /dev/null +++ b/src/gpu-compute/register_file.hh @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2015-2017 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Authors: John Kalamatianos, + * Mark Wyse + */ + +#ifndef __REGISTER_FILE_HH__ +#define __REGISTER_FILE_HH__ + +#include +#include + +#include "base/statistics.hh" +#include "base/types.hh" +#include "gpu-compute/misc.hh" +#include "sim/sim_object.hh" + +class ComputeUnit; +class Shader; +class PoolManager; +class Wavefront; + +struct RegisterFileParams; + +// Abstract Register File +// This register file class can be inherited from to create both +// scalar and vector register files. 
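// The base class keeps a simple scoreboard: a per-physical-register busy
// vector plus events that mark a register busy or free after a delay.
// A rough sketch of how the hooks declared below fit together (illustrative
// only; the actual call sites are the schedule stage and the memory
// pipelines):
//
//   if (rf->canScheduleReadOperands(w, ii)) {
//       rf->scheduleReadOperands(w, ii);   // begin reading source operands
//   }
//   ...
//   rf->enqRegBusyEvent(physReg, delay);   // reserve a destination register
//   rf->enqRegFreeEvent(physReg, delay);   // free it once the write lands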
+class RegisterFile : public SimObject +{ + public: + RegisterFile(const RegisterFileParams *p); + virtual ~RegisterFile(); + virtual void setParent(ComputeUnit *_computeUnit); + int numRegs() const { return _numRegs; } + virtual void regStats() override; + + // State functions + + // Scoreboard functions + virtual bool operandsReady(Wavefront *w, GPUDynInstPtr ii) const; + virtual bool regBusy(int idx) const; + virtual void markReg(int regIdx, bool value); + + // Abstract Register Event + class RegisterEvent : public Event + { + protected: + RegisterFile *rf; + int regIdx; + + public: + RegisterEvent(RegisterFile *_rf, int _regIdx) + : rf(_rf), regIdx(_regIdx) { setFlags(AutoDelete); } + }; + + // Register Event to mark a register as free in the scoreboard/busy vector + class MarkRegFreeScbEvent : public RegisterEvent + { + public: + MarkRegFreeScbEvent(RegisterFile *_rf, int _regIdx) + : RegisterEvent(_rf, _regIdx) { } + void process(); + }; + + // Register Event to mark a register as busy in the scoreboard/busy vector + class MarkRegBusyScbEvent : public RegisterEvent + { + public: + MarkRegBusyScbEvent(RegisterFile *_rf, int _regIdx) + : RegisterEvent(_rf, _regIdx) { } + void process(); + }; + + // Schedule an event to mark a register as free/busy in + // the scoreboard/busy vector. Delay is already in Ticks + virtual void enqRegFreeEvent(uint32_t regIdx, uint64_t delay); + virtual void enqRegBusyEvent(uint32_t regIdx, uint64_t delay); + + // Schedule functions + + // The following functions are called by the SCH stage when attempting + // to move a wave from the readyList to the schList. + // canSchedule* checks if the RF is ready to provide operands for + // the instruction, while schedule* requests the RF to begin reading + // and writing of operands. Calling schedule* may only occur + // immediately after canSchedule* was called and returned True + virtual bool canScheduleReadOperands(Wavefront *w, GPUDynInstPtr ii); + virtual bool canScheduleWriteOperands(Wavefront *w, GPUDynInstPtr ii); + virtual void scheduleReadOperands(Wavefront *w, GPUDynInstPtr ii); + virtual void scheduleWriteOperands(Wavefront *w, GPUDynInstPtr ii); + + // The following function is called to check if all operands + // have been read for the given instruction + virtual bool operandReadComplete(Wavefront *w, GPUDynInstPtr ii); + + // The following two functions are only called by returning loads to + // check if the register file can support the incoming writes + virtual bool canScheduleWriteOperandsFromLoad(Wavefront *w, + GPUDynInstPtr ii); + // Queue the register writes. Assumes canScheduleWriteOperandsFromLoad + // was called immediately prior and returned True + virtual void scheduleWriteOperandsFromLoad(Wavefront *w, + GPUDynInstPtr ii); + + // ExecRF is invoked every cycle by the compute unit and may be + // used to model detailed timing of the register file. 
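    // (The base implementation is a no-op; a derived register file could,
    // for instance, use exec() to drain a queue of pending bank reads and
    // writes each cycle.)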
+ virtual void exec(); + + // Called to inform RF that an instruction is executing + // to schedule events for writeback, etc., as needed + virtual void waveExecuteInst(Wavefront *w, GPUDynInstPtr ii); + + // Debug functions + virtual std::string dump() const; + + virtual void dispatchInstruction(GPUDynInstPtr ii); + + protected: + ComputeUnit* computeUnit; + int simdId; + + // flag indicating if a register is busy + std::vector busy; + + // numer of registers in this register file + int _numRegs; + // Stats + // Total number of register reads, incremented once per DWORD per thread + Stats::Scalar registerReads; + // Total number of register writes, incremented once per DWORD per thread + Stats::Scalar registerWrites; + + // Number of register file SRAM activations for reads. + // The register file may be implemented with multiple SRAMs. This stat + // tracks how many times the SRAMs are accessed for reads. + Stats::Scalar sramReads; + // Number of register file SRAM activations for writes + Stats::Scalar sramWrites; +}; + +#endif // __REGISTER_FILE_HH__ diff --git a/src/gpu-compute/register_manager.cc b/src/gpu-compute/register_manager.cc new file mode 100644 index 000000000..65c126066 --- /dev/null +++ b/src/gpu-compute/register_manager.cc @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2016, 2017 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Mark Wyse + */ + +#include "gpu-compute/register_manager.hh" + +#include "config/the_gpu_isa.hh" +#include "debug/GPURename.hh" +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/scalar_register_file.hh" +#include "gpu-compute/static_register_manager_policy.hh" +#include "gpu-compute/vector_register_file.hh" +#include "gpu-compute/wavefront.hh" +#include "params/RegisterManager.hh" + +RegisterManager::RegisterManager(const RegisterManagerParams *p) + : SimObject(p), srfPoolMgrs(p->srf_pool_managers), + vrfPoolMgrs(p->vrf_pool_managers) +{ + if (p->policy == "static") { + policy = new StaticRegisterManagerPolicy(); + } else { + fatal("Unimplemented Register Manager Policy"); + } + +} + +RegisterManager::~RegisterManager() +{ + for (auto mgr : srfPoolMgrs) { + delete mgr; + } + for (auto mgr : vrfPoolMgrs) { + delete mgr; + } +} + +void +RegisterManager::exec() +{ + policy->exec(); +} + +void +RegisterManager::setParent(ComputeUnit *cu) +{ + computeUnit = cu; + policy->setParent(computeUnit); + for (int i = 0; i < srfPoolMgrs.size(); i++) { + fatal_if(computeUnit->srf[i]->numRegs() % + srfPoolMgrs[i]->minAllocation(), + "Min SGPR allocation is not multiple of VRF size\n"); + } + for (int i = 0; i < vrfPoolMgrs.size(); i++) { + fatal_if(computeUnit->vrf[i]->numRegs() % + vrfPoolMgrs[i]->minAllocation(), + "Min VGPG allocation is not multiple of VRF size\n"); + } +} + +// compute mapping for vector register +int +RegisterManager::mapVgpr(Wavefront* w, int vgprIndex) +{ + return policy->mapVgpr(w, vgprIndex); +} + +// compute mapping for scalar register +int +RegisterManager::mapSgpr(Wavefront* w, int sgprIndex) +{ + return policy->mapSgpr(w, sgprIndex); +} + +// check if we can allocate registers +bool +RegisterManager::canAllocateVgprs(int simdId, int nWfs, int demandPerWf) +{ + return policy->canAllocateVgprs(simdId, nWfs, demandPerWf); +} + +bool +RegisterManager::canAllocateSgprs(int simdId, int nWfs, int demandPerWf) +{ + return policy->canAllocateSgprs(simdId, nWfs, demandPerWf); +} + +// allocate registers +void +RegisterManager::allocateRegisters(Wavefront *w, int vectorDemand, + int scalarDemand) +{ + policy->allocateRegisters(w, vectorDemand, scalarDemand); +} + +void +RegisterManager::freeRegisters(Wavefront* w) +{ + policy->freeRegisters(w); +} + +void +RegisterManager::regStats() +{ + policy->regStats(); +} + +RegisterManager* +RegisterManagerParams::create() +{ + return new RegisterManager(this); +} diff --git a/src/gpu-compute/register_manager.hh b/src/gpu-compute/register_manager.hh new file mode 100644 index 000000000..60acf9533 --- /dev/null +++ b/src/gpu-compute/register_manager.hh @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2016, 2017 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Authors: Mark Wyse + */ + +#ifndef __REGISTER_MANAGER_HH__ +#define __REGISTER_MANAGER_HH__ + +#include +#include +#include +#include +#include + +#include "gpu-compute/pool_manager.hh" +#include "gpu-compute/register_manager_policy.hh" +#include "sim/sim_object.hh" +#include "sim/stats.hh" + +class ComputeUnit; +class Wavefront; + +struct RegisterManagerParams; + +/* + * Rename stage. + */ +class RegisterManager : public SimObject +{ + public: + RegisterManager(const RegisterManagerParams* params); + ~RegisterManager(); + void setParent(ComputeUnit *cu); + void exec(); + + // Stats related variables and methods + void regStats(); + + // lookup virtual to physical register translation + int mapVgpr(Wavefront* w, int vgprIndex); + int mapSgpr(Wavefront* w, int sgprIndex); + + // check if we can allocate registers + bool canAllocateVgprs(int simdId, int nWfs, int demandPerWf); + bool canAllocateSgprs(int simdId, int nWfs, int demandPerWf); + + // allocate registers + void allocateRegisters(Wavefront *w, int vectorDemand, int scalarDemand); + + // free all registers used by the WF + void freeRegisters(Wavefront *w); + + std::vector srfPoolMgrs; + std::vector vrfPoolMgrs; + + private: + RegisterManagerPolicy *policy; + + ComputeUnit *computeUnit; + + std::string _name; +}; + +#endif // __REGISTER_MANAGER_HH__ diff --git a/src/gpu-compute/register_manager_policy.hh b/src/gpu-compute/register_manager_policy.hh new file mode 100644 index 000000000..2a5a2eb1e --- /dev/null +++ b/src/gpu-compute/register_manager_policy.hh @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2016 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Authors: Mark Wyse + */ + +#ifndef __REGISTER_MANAGER_POLICY_HH__ +#define __REGISTER_MANAGER_POLICY_HH__ + +#include + +class ComputeUnit; +class HSAQueueEntry; +class Wavefront; + +/** + * Register Manager Policy abstract class + * + * A Register Manager Policy implements all of the functionality + * of the Register Manager, including register mapping, allocation, + * and freeing. Different policies may be implemented that support + * different architectures or different methods of mapping and + * allocation. + */ +class RegisterManagerPolicy +{ + public: + virtual void setParent(ComputeUnit *_cu) { cu = _cu; } + + // Execute: called by RenameStage::execute() + virtual void exec() = 0; + + // provide virtual to physical register mapping + virtual int mapVgpr(Wavefront* w, int vgprIndex) = 0; + virtual int mapSgpr(Wavefront* w, int sgprIndex) = 0; + + // check if requested number of vector registers can be allocated + virtual bool canAllocateVgprs(int simdId, int nWfs, int demandPerWf) = 0; + // check if requested number of scalar registers can be allocated + // machine ISA only + virtual bool canAllocateSgprs(int simdId, int nWfs, int demandPerWf) = 0; + + // allocate vector registers and reserve from register pool + virtual void allocateRegisters(Wavefront *w, int vectorDemand, + int scalarDemand) = 0; + + // free all remaining registers held by specified WF + virtual void freeRegisters(Wavefront *w) = 0; + + // stats + virtual void regStats() = 0; + + protected: + ComputeUnit *cu; +}; + +#endif // __REGISTER_MANAGER_POLICY_HH__ diff --git a/src/gpu-compute/rr_scheduling_policy.hh b/src/gpu-compute/rr_scheduling_policy.hh index aaba1d340..75a098151 100644 --- a/src/gpu-compute/rr_scheduling_policy.hh +++ b/src/gpu-compute/rr_scheduling_policy.hh @@ -36,6 +36,7 @@ #include +#include "base/logging.hh" #include "gpu-compute/scheduling_policy.hh" #include "gpu-compute/wavefront.hh" diff --git a/src/gpu-compute/scalar_memory_pipeline.cc b/src/gpu-compute/scalar_memory_pipeline.cc new file mode 100644 index 000000000..c8823b8a6 --- /dev/null +++ b/src/gpu-compute/scalar_memory_pipeline.cc @@ -0,0 +1,153 @@ +/* + * Copyright (c) 2016-2017 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: John Kalamatianos
+ */
+
+#include "gpu-compute/scalar_memory_pipeline.hh"
+
+#include "debug/GPUMem.hh"
+#include "debug/GPUReg.hh"
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/gpu_dyn_inst.hh"
+#include "gpu-compute/scalar_register_file.hh"
+#include "gpu-compute/shader.hh"
+#include "gpu-compute/wavefront.hh"
+
+ScalarMemPipeline::ScalarMemPipeline(const ComputeUnitParams* p) :
+    computeUnit(nullptr), queueSize(p->scalar_mem_queue_size),
+    inflightStores(0), inflightLoads(0)
+{
+}
+
+void
+ScalarMemPipeline::init(ComputeUnit *cu)
+{
+    computeUnit = cu;
+    _name = computeUnit->name() + ".ScalarMemPipeline";
+}
+
+void
+ScalarMemPipeline::exec()
+{
+    // find the oldest scalar request whose data has arrived
+    GPUDynInstPtr m = !returnedLoads.empty() ? returnedLoads.front() :
+        !returnedStores.empty() ?
returnedStores.front() : nullptr; + + Wavefront *w = nullptr; + + bool accessSrf = true; + // check the SRF to see if the operands of a load (or load component + // of an atomic) are accessible + if ((m) && (m->isLoad() || m->isAtomicRet())) { + w = m->wavefront(); + + accessSrf = + w->computeUnit->srf[w->simdId]-> + canScheduleWriteOperandsFromLoad(w, m); + } + + if ((!returnedStores.empty() || !returnedLoads.empty()) && + m->latency.rdy() && computeUnit->scalarMemToSrfBus.rdy() && + accessSrf && + (computeUnit->shader->coissue_return || + computeUnit->scalarMemUnit.rdy())) { + + w = m->wavefront(); + + if (m->isLoad() || m->isAtomicRet()) { + w->computeUnit->srf[w->simdId]-> + scheduleWriteOperandsFromLoad(w, m); + } + + m->completeAcc(m); + + if (m->isLoad() || m->isAtomic()) { + returnedLoads.pop(); + assert(inflightLoads > 0); + --inflightLoads; + } else { + returnedStores.pop(); + assert(inflightStores > 0); + --inflightStores; + } + + // Decrement outstanding register count + computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1); + + if (m->isStore() || m->isAtomic()) { + computeUnit->shader->ScheduleAdd(&w->scalarOutstandingReqsWrGm, + m->time, -1); + } + + if (m->isLoad() || m->isAtomic()) { + computeUnit->shader->ScheduleAdd(&w->scalarOutstandingReqsRdGm, + m->time, -1); + } + + // Mark write bus busy for appropriate amount of time + computeUnit->scalarMemToSrfBus.set(m->time); + if (!computeUnit->shader->coissue_return) + w->computeUnit->scalarMemUnit.set(m->time); + } + + // If pipeline has executed a global memory instruction + // execute global memory packets and issue global + // memory packets to DTLB + if (!issuedRequests.empty()) { + GPUDynInstPtr mp = issuedRequests.front(); + if (mp->isLoad() || mp->isAtomic()) { + + if (inflightLoads >= queueSize) { + return; + } else { + ++inflightLoads; + } + } else { + if (inflightStores >= queueSize) { + return; + } else { + ++inflightStores; + } + } + mp->initiateAcc(mp); + issuedRequests.pop(); + + DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping scalar mem_op\n", + computeUnit->cu_id, mp->simdId, mp->wfSlotId); + } +} + +void +ScalarMemPipeline::regStats() +{ +} diff --git a/src/gpu-compute/scalar_memory_pipeline.hh b/src/gpu-compute/scalar_memory_pipeline.hh new file mode 100644 index 000000000..1944477cf --- /dev/null +++ b/src/gpu-compute/scalar_memory_pipeline.hh @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2016-2017 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Authors: John Kalamatianos + */ + +#ifndef __GPU_COMPUTE_SCALAR_MEMORY_PIPELINE_HH__ +#define __GPU_COMPUTE_SCALAR_MEMORY_PIPELINE_HH__ + +#include +#include + +#include "gpu-compute/misc.hh" +#include "params/ComputeUnit.hh" +#include "sim/stats.hh" + +/* + * @file scalar_memory_pipeline.hh + * + * The scalar memory pipeline issues global memory packets + * from the scalar ALU to the DTLB and L1 Scalar Data Cache. + * The exec() method of the memory packet issues + * the packet to the DTLB if there is space available in the return fifo. + * This exec() method also retires previously issued loads and stores that have + * returned from the memory sub-system. + */ + +class ComputeUnit; + +class ScalarMemPipeline +{ + public: + ScalarMemPipeline(const ComputeUnitParams *params); + void init(ComputeUnit *cu); + void exec(); + + std::queue &getGMReqFIFO() { return issuedRequests; } + std::queue &getGMStRespFIFO() { return returnedStores; } + std::queue &getGMLdRespFIFO() { return returnedLoads; } + + bool + isGMLdRespFIFOWrRdy() const + { + return returnedLoads.size() < queueSize; + } + + bool + isGMStRespFIFOWrRdy() const + { + return returnedStores.size() < queueSize; + } + + bool + isGMReqFIFOWrRdy(uint32_t pendReqs=0) const + { + return (issuedRequests.size() + pendReqs) < queueSize; + } + + const std::string &name() const { return _name; } + void regStats(); + + private: + ComputeUnit *computeUnit; + std::string _name; + int queueSize; + + // Counters to track and limit the inflight scalar loads and stores + // generated by this memory pipeline. + int inflightStores; + int inflightLoads; + + // Scalar Memory Request FIFO: all global memory scalar requests + // are issued to this FIFO from the scalar memory pipelines + std::queue issuedRequests; + + // Scalar Store Response FIFO: all responses of global memory + // scalar stores are sent to this FIFO from L1 Scalar Data Cache + std::queue returnedStores; + + // Scalar Load Response FIFO: all responses of global memory + // scalar loads are sent to this FIFO from L1 Scalar Data Cache + std::queue returnedLoads; +}; + +#endif // __GPU_COMPUTE_SCALAR_MEMORY_PIPELINE_HH__ diff --git a/src/gpu-compute/scalar_register_file.cc b/src/gpu-compute/scalar_register_file.cc new file mode 100644 index 000000000..150587676 --- /dev/null +++ b/src/gpu-compute/scalar_register_file.cc @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2015-2017 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Authors: John Kalamatianos, + * Mark Wyse + */ + +#include "gpu-compute/scalar_register_file.hh" + +#include "base/logging.hh" +#include "debug/GPUSRF.hh" +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/shader.hh" +#include "gpu-compute/wavefront.hh" +#include "params/ScalarRegisterFile.hh" + +ScalarRegisterFile::ScalarRegisterFile(const ScalarRegisterFileParams *p) + : RegisterFile(p) +{ + regFile.resize(numRegs(), 0); +} + +bool +ScalarRegisterFile::operandsReady(Wavefront *w, GPUDynInstPtr ii) const +{ + for (int i = 0; i < ii->getNumOperands(); ++i) { + if (ii->isScalarRegister(i) && ii->isSrcOperand(i)) { + + int sgprIdx = ii->getRegisterIndex(i, ii); + int nRegs = ii->getOperandSize(i) <= 4 ? 1 : + ii->getOperandSize(i) / 4; + + for (int j = 0; j < nRegs; ++j) { + int pSgpr = + computeUnit->registerManager->mapSgpr(w, sgprIdx + j); + + if (regBusy(pSgpr)) { + if (ii->isDstOperand(i)) { + w->numTimesBlockedDueWAXDependencies++; + } else if (ii->isSrcOperand(i)) { + DPRINTF(GPUSRF, "RAW stall: WV[%d]: %s: physReg[%d]\n", + w->wfDynId, ii->disassemble(), pSgpr); + w->numTimesBlockedDueRAWDependencies++; + } + return false; + } + } // nRegs + } // isScalar + } // operand + return true; +} + +void +ScalarRegisterFile::scheduleWriteOperands(Wavefront *w, GPUDynInstPtr ii) +{ + // iterate over all register destination operands + for (int i = 0; i < ii->getNumOperands(); ++i) { + if (ii->isScalarRegister(i) && ii->isDstOperand(i)) { + + int sgprIdx = ii->getRegisterIndex(i, ii); + int nRegs = ii->getOperandSize(i) <= 4 ? 1 : + ii->getOperandSize(i) / 4; + + for (int j = 0; j < nRegs; ++j) { + int physReg = + computeUnit->registerManager->mapSgpr(w, sgprIdx + j); + + // mark the destination scalar register as busy + markReg(physReg, true); + } + } + } +} + +void +ScalarRegisterFile::waveExecuteInst(Wavefront *w, GPUDynInstPtr ii) +{ + for (int i = 0; i < ii->getNumOperands(); i++) { + if (ii->isScalarRegister(i) && ii->isSrcOperand(i)) { + int DWORDs = ii->getOperandSize(i) <= 4 ? 
1 + : ii->getOperandSize(i) / 4; + registerReads += DWORDs; + } + } + + if (!ii->isLoad() && !(ii->isAtomic() || ii->isMemSync())) { + Cycles delay(computeUnit->scalarPipeLength()); + Tick tickDelay = computeUnit->cyclesToTicks(delay); + + for (int i = 0; i < ii->getNumOperands(); i++) { + if (ii->isScalarRegister(i) && ii->isDstOperand(i)) { + int sgprIdx = ii->getRegisterIndex(i, ii); + int nRegs = ii->getOperandSize(i) <= 4 ? 1 + : ii->getOperandSize(i) / 4; + for (int j = 0; j < nRegs; j++) { + int physReg = computeUnit->registerManager-> + mapSgpr(w, sgprIdx + j); + enqRegFreeEvent(physReg, tickDelay); + } + + registerWrites += nRegs; + } + } + } +} + +void +ScalarRegisterFile::scheduleWriteOperandsFromLoad(Wavefront *w, + GPUDynInstPtr ii) +{ + assert(ii->isLoad() || ii->isAtomicRet()); + for (int i = 0; i < ii->getNumOperands(); ++i) { + if (ii->isScalarRegister(i) && ii->isDstOperand(i)) { + + int sgprIdx = ii->getRegisterIndex(i, ii); + int nRegs = ii->getOperandSize(i) <= 4 ? 1 : + ii->getOperandSize(i) / 4; + + for (int j = 0; j < nRegs; ++j) { + int physReg = computeUnit->registerManager-> + mapSgpr(w, sgprIdx + j); + enqRegFreeEvent(physReg, computeUnit->clockPeriod()); + } + + registerWrites += nRegs; + } + } +} + +ScalarRegisterFile* +ScalarRegisterFileParams::create() +{ + return new ScalarRegisterFile(this); +} diff --git a/src/gpu-compute/scalar_register_file.hh b/src/gpu-compute/scalar_register_file.hh new file mode 100644 index 000000000..8002334b3 --- /dev/null +++ b/src/gpu-compute/scalar_register_file.hh @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2015-2017 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Authors: John Kalamatianos, + * Mark Wyse + */ + +#ifndef __GPU_COMPUTE_SCALAR_REGISTER_FILE_HH__ +#define __GPU_COMPUTE_SCALAR_REGISTER_FILE_HH__ + +#include "arch/gpu_isa.hh" +#include "base/statistics.hh" +#include "base/trace.hh" +#include "base/types.hh" +#include "debug/GPUSRF.hh" +#include "gpu-compute/register_file.hh" +#include "gpu-compute/wavefront.hh" + +struct ScalarRegisterFileParams; + +// Scalar Register File +class ScalarRegisterFile : public RegisterFile +{ + public: + using ScalarRegU32 = TheGpuISA::ScalarRegU32; + + ScalarRegisterFile(const ScalarRegisterFileParams *p); + ~ScalarRegisterFile() { } + + virtual bool operandsReady(Wavefront *w, GPUDynInstPtr ii) const override; + virtual void scheduleWriteOperands(Wavefront *w, + GPUDynInstPtr ii) override; + virtual void scheduleWriteOperandsFromLoad(Wavefront *w, + GPUDynInstPtr ii) override; + virtual void waveExecuteInst(Wavefront *w, GPUDynInstPtr ii) override; + + void + setParent(ComputeUnit *_computeUnit) override + { + RegisterFile::setParent(_computeUnit); + } + + // Read a register that is writeable (e.g., a DST operand) + ScalarRegU32& + readWriteable(int regIdx) + { + return regFile[regIdx]; + } + + // Read a register that is not writeable (e.g., src operand) + ScalarRegU32 + read(int regIdx) const + { + return regFile[regIdx]; + } + + // Write a register + void + write(int regIdx, ScalarRegU32 value) + { + regFile[regIdx] = value; + } + + void + printReg(Wavefront *wf, int regIdx) const + { + DPRINTF(GPUSRF, "WF[%d][%d]: Id%d s[%d] = %#x\n", wf->simdId, + wf->wfSlotId, wf->wfDynId, regIdx, regFile[regIdx]); + } + + private: + std::vector regFile; +}; + +#endif // __GPU_COMPUTE_SCALAR_REGISTER_FILE_HH__ diff --git a/src/gpu-compute/schedule_stage.cc b/src/gpu-compute/schedule_stage.cc index 63ab8db7b..949eed155 100644 --- a/src/gpu-compute/schedule_stage.cc +++ b/src/gpu-compute/schedule_stage.cc @@ -33,24 +33,36 @@ #include "gpu-compute/schedule_stage.hh" +#include + +#include "debug/GPUSched.hh" +#include "debug/GPUVRF.hh" #include "gpu-compute/compute_unit.hh" #include "gpu-compute/gpu_static_inst.hh" +#include "gpu-compute/scalar_register_file.hh" #include "gpu-compute/vector_register_file.hh" #include "gpu-compute/wavefront.hh" -ScheduleStage::ScheduleStage(const ComputeUnitParams *p) - : numSIMDs(p->num_SIMDs), - numMemUnits(p->num_global_mem_pipes + p->num_shared_mem_pipes) +ScheduleStage::ScheduleStage(const ComputeUnitParams *p, ComputeUnit *cu) + : vectorAluRdy(false), scalarAluRdy(false), scalarMemBusRdy(false), + scalarMemIssueRdy(false), glbMemBusRdy(false), glbMemIssueRdy(false), + locMemBusRdy(false), locMemIssueRdy(false) { - for (int j = 0; j < numSIMDs + numMemUnits; ++j) { + for (int j = 0; j < cu->numExeUnits(); ++j) { scheduler.emplace_back(p); } + wavesInSch.clear(); + schList.resize(cu->numExeUnits()); + for (auto &dq : schList) { + dq.clear(); + } } ScheduleStage::~ScheduleStage() { scheduler.clear(); - waveStatusList.clear(); + wavesInSch.clear(); + schList.clear(); } void @@ -59,90 +71,775 @@ ScheduleStage::init(ComputeUnit *cu) computeUnit = cu; _name = computeUnit->name() + ".ScheduleStage"; - for (int j = 0; j < numSIMDs + numMemUnits; ++j) { + fatal_if(scheduler.size() != computeUnit->readyList.size(), + "Scheduler should have same number of entries as CU's readyList"); + for (int j = 0; j < computeUnit->numExeUnits(); ++j) { scheduler[j].bindList(&computeUnit->readyList[j]); } - for (int j = 0; j < numSIMDs; ++j) { - 
waveStatusList.push_back(&computeUnit->waveStatusList[j]); + dispatchList = &computeUnit->dispatchList; + + assert(computeUnit->numVectorGlobalMemUnits == 1); + assert(computeUnit->numVectorSharedMemUnits == 1); +} + +void +ScheduleStage::exec() +{ + // Update readyList + for (int j = 0; j < computeUnit->numExeUnits(); ++j) { + // delete all ready wavefronts whose instruction buffers are now + // empty because the last instruction was executed + computeUnit->updateReadyList(j); + /** + * Remove any wave that already has an instruction present in SCH + * waiting for RF reads to complete. This prevents out of order + * execution within a wave. + */ + for (auto wIt = computeUnit->readyList.at(j).begin(); + wIt != computeUnit->readyList.at(j).end();) { + if (wavesInSch.find((*wIt)->wfDynId) != wavesInSch.end()) { + *wIt = nullptr; + wIt = computeUnit->readyList.at(j).erase(wIt); + } else { + wIt++; + } + } + } + + // Attempt to add another wave for each EXE type to schList queues + // VMEM resources are iterated first, effectively giving priority + // to VMEM over VALU for scheduling read of operands to the RFs. + // Scalar Memory are iterated after VMEM + + // Iterate VMEM and SMEM + int firstMemUnit = computeUnit->firstMemUnit(); + int lastMemUnit = computeUnit->lastMemUnit(); + for (int j = firstMemUnit; j <= lastMemUnit; j++) { + int readyListSize = computeUnit->readyList[j].size(); + // If no wave is ready to be scheduled on the execution resource + // then skip scheduling for this execution resource + if (!readyListSize) { + rdyListEmpty[j]++; + continue; + } + rdyListNotEmpty[j]++; + + // Pick a wave and attempt to add it to schList + Wavefront *w = scheduler[j].chooseWave(); + if (!addToSchList(j, w)) { + // For waves not added to schList, increment count of cycles + // this wave spends in SCH stage. + w->schCycles++; + addToSchListStalls[j]++; + } } - dispatchList = &computeUnit->dispatchList; + // Iterate everything else + for (int j = 0; j < computeUnit->numExeUnits(); ++j) { + // skip the VMEM resources + if (j >= firstMemUnit && j <= lastMemUnit) { + continue; + } + int readyListSize = computeUnit->readyList[j].size(); + // If no wave is ready to be scheduled on the execution resource + // then skip scheduling for this execution resource + if (!readyListSize) { + rdyListEmpty[j]++; + continue; + } + rdyListNotEmpty[j]++; + + // Pick a wave and attempt to add it to schList + Wavefront *w = scheduler[j].chooseWave(); + if (!addToSchList(j, w)) { + // For waves not added to schList, increment count of cycles + // this wave spends in SCH stage. + w->schCycles++; + addToSchListStalls[j]++; + } + } + + // At this point, the schList queue per EXE type may contain + // multiple waves, in order of age (oldest to youngest). 
+ // Wave may be in RFBUSY, indicating they are waiting for registers + // to be read, or in RFREADY, indicating they are candidates for + // the dispatchList and execution + + // Iterate schList queues and check if any of the waves have finished + // reading their operands, moving those waves to RFREADY status + checkRfOperandReadComplete(); + + // Fill the dispatch list with the oldest wave of each EXE type that + // is ready to execute + // Wave is picked if status in schList is RFREADY and it passes resource + // ready checks similar to those currently in SCB + fillDispatchList(); + + // Resource arbitration on waves in dispatchList + // Losing waves are re-inserted to the schList at a location determined + // by wave age + + // Arbitrate access to the VRF->LDS bus + arbitrateVrfToLdsBus(); + + // Schedule write operations to the register files + scheduleRfDestOperands(); + + // Lastly, reserve resources for waves that are ready to execute. + reserveResources(); +} + +void +ScheduleStage::doDispatchListTransition(int unitId, DISPATCH_STATUS s, + Wavefront *w) +{ + dispatchList->at(unitId).first = w; + dispatchList->at(unitId).second = s; +} + +bool +ScheduleStage::schedRfWrites(int exeType, Wavefront *w) +{ + GPUDynInstPtr ii = w->instructionBuffer.front(); + assert(ii); + bool accessVrfWr = true; + if (!ii->isScalar()) { + accessVrfWr = + computeUnit->vrf[w->simdId]->canScheduleWriteOperands(w, ii); + } + bool accessSrfWr = + computeUnit->srf[w->simdId]->canScheduleWriteOperands(w, ii); + bool accessRf = accessVrfWr && accessSrfWr; + if (accessRf) { + if (!ii->isScalar()) { + computeUnit->vrf[w->simdId]->scheduleWriteOperands(w, ii); + } + computeUnit->srf[w->simdId]->scheduleWriteOperands(w, ii); + return true; + } else { + rfAccessStalls[SCH_RF_ACCESS_NRDY]++; + if (!accessSrfWr) { + rfAccessStalls[SCH_SRF_WR_ACCESS_NRDY]++; + } + if (!accessVrfWr) { + rfAccessStalls[SCH_VRF_WR_ACCESS_NRDY]++; + } + + // Increment stall counts for WF + w->schStalls++; + w->schRfAccessStalls++; + } + return false; +} + +void +ScheduleStage::scheduleRfDestOperands() +{ + for (int j = 0; j < computeUnit->numExeUnits(); ++j) { + if (!dispatchList->at(j).first) { + continue; + } + // get the wave on dispatch list and attempt to allocate write + // resources in the RFs + Wavefront *w = dispatchList->at(j).first; + if (!schedRfWrites(j, w)) { + reinsertToSchList(j, w); + doDispatchListTransition(j, EMPTY); + // if this is a flat inst, also transition the LM pipe to empty + // Note: since FLAT/LM arbitration occurs before scheduling + // destination operands to the RFs, it is possible that a LM + // instruction lost arbitration, but would have been able to + // pass the RF destination operand check here, and execute + // instead of the FLAT. 
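// A standalone sketch of the check-then-commit pattern schedRfWrites() uses
// above: destination-register writes are reserved only if *both* the VRF and
// the SRF can accept them, so a wave never holds a partial reservation.
// RegFileStub and its free-slot counter are simplified stand-ins, not the
// gem5 RegisterFile API.
struct RegFileStub
{
    int freeWriteSlots = 0;
    bool canReserveWrites(int n) const { return freeWriteSlots >= n; }
    void reserveWrites(int n) { freeWriteSlots -= n; }
};

// Returns true only when the reservation was committed in both files.
inline bool
tryScheduleWrites(RegFileStub &vrf, RegFileStub &srf, bool isScalarInst,
                  int vecDsts, int sclDsts)
{
    bool vrfOk = isScalarInst || vrf.canReserveWrites(vecDsts);
    bool srfOk = srf.canReserveWrites(sclDsts);
    if (!(vrfOk && srfOk)) {
        return false;  // caller re-inserts the wave into schList and stalls
    }
    if (!isScalarInst) {
        vrf.reserveWrites(vecDsts);
    }
    srf.reserveWrites(sclDsts);
    return true;
}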
+ if (w->instructionBuffer.front()->isFlat()) { + assert(dispatchList->at(w->localMem).second == SKIP); + doDispatchListTransition(w->localMem, EMPTY); + } + } + } +} + +bool +ScheduleStage::addToSchList(int exeType, Wavefront *w) +{ + // Attempt to add the wave to the schList if the VRF can support the + // wave's next instruction + GPUDynInstPtr ii = w->instructionBuffer.front(); + assert(ii); + bool accessVrf = true; + if (!ii->isScalar()) { + accessVrf = + computeUnit->vrf[w->simdId]->canScheduleReadOperands(w, ii); + } + bool accessSrf = + computeUnit->srf[w->simdId]->canScheduleReadOperands(w, ii); + // If RFs can support instruction, add to schList in RFBUSY state, + // place wave in wavesInSch and pipeMap, and schedule Rd/Wr operands + // to the VRF + bool accessRf = accessVrf && accessSrf; + if (accessRf) { + DPRINTF(GPUSched, "schList[%d]: Adding: SIMD[%d] WV[%d]: %d: %s\n", + exeType, w->simdId, w->wfDynId, + ii->seqNum(), ii->disassemble()); + + computeUnit->insertInPipeMap(w); + wavesInSch.emplace(w->wfDynId); + schList.at(exeType).push_back(std::make_pair(w, RFBUSY)); + if (w->isOldestInstWaitcnt()) { + w->setStatus(Wavefront::S_WAITCNT); + } + if (!ii->isScalar()) { + computeUnit->vrf[w->simdId]->scheduleReadOperands(w, ii); + } + computeUnit->srf[w->simdId]->scheduleReadOperands(w, ii); + + DPRINTF(GPUSched, "schList[%d]: Added: SIMD[%d] WV[%d]: %d: %s\n", + exeType, w->simdId, w->wfDynId, + ii->seqNum(), ii->disassemble()); + return true; + } else { + // Number of stall cycles due to RF access denied + rfAccessStalls[SCH_RF_ACCESS_NRDY]++; + // Count number of denials due to each reason + // Multiple items may contribute to the denied request + if (!accessVrf) { + rfAccessStalls[SCH_VRF_RD_ACCESS_NRDY]++; + } + if (!accessSrf) { + rfAccessStalls[SCH_SRF_RD_ACCESS_NRDY]++; + } + + // Increment stall counts for WF + w->schStalls++; + w->schRfAccessStalls++; + DPRINTF(GPUSched, "schList[%d]: Could not add: " + "SIMD[%d] WV[%d]: %d: %s\n", + exeType, w->simdId, w->wfDynId, + ii->seqNum(), ii->disassemble()); + } + return false; +} + +void +ScheduleStage::reinsertToSchList(int exeType, Wavefront *w) +{ + // Insert wave w into schList for specified exeType. 
+ // Wave is inserted in age order, with oldest wave being at the + // front of the schList + auto schIter = schList.at(exeType).begin(); + while (schIter != schList.at(exeType).end() + && schIter->first->wfDynId < w->wfDynId) { + schIter++; + } + schList.at(exeType).insert(schIter, std::make_pair(w, RFREADY)); +} + +void +ScheduleStage::checkMemResources() +{ + // Check for resource availability in the next cycle + scalarMemBusRdy = false; + scalarMemIssueRdy = false; + // check if there is a SRF->Global Memory bus available and + if (computeUnit->srfToScalarMemPipeBus.rdy(Cycles(1))) { + scalarMemBusRdy = true; + } + // check if we can issue a scalar memory instruction + if (computeUnit->scalarMemUnit.rdy(Cycles(1))) { + scalarMemIssueRdy = true; + } + + glbMemBusRdy = false; + glbMemIssueRdy = false; + // check if there is a VRF->Global Memory bus available + if (computeUnit->vrfToGlobalMemPipeBus.rdy(Cycles(1))) { + glbMemBusRdy = true; + } + // check if we can issue a Global memory instruction + if (computeUnit->vectorGlobalMemUnit.rdy(Cycles(1))) { + glbMemIssueRdy = true; + } + + locMemBusRdy = false; + locMemIssueRdy = false; + // check if there is a VRF->LDS bus available + if (computeUnit->vrfToLocalMemPipeBus.rdy(Cycles(1))) { + locMemBusRdy = true; + } + // check if we can issue a LDS instruction + if (computeUnit->vectorSharedMemUnit.rdy(Cycles(1))) { + locMemIssueRdy = true; + } +} + +bool +ScheduleStage::dispatchReady(Wavefront *w) +{ + vectorAluRdy = false; + scalarAluRdy = false; + // check for available vector/scalar ALUs in the next cycle + if (computeUnit->vectorALUs[w->simdId].rdy(Cycles(1))) { + vectorAluRdy = true; + } + if (computeUnit->scalarALUs[w->scalarAlu].rdy(Cycles(1))) { + scalarAluRdy = true; + } + GPUDynInstPtr ii = w->instructionBuffer.front(); + + if (ii->isNop()) { + // S_NOP requires SALU. V_NOP requires VALU. + // TODO: Scalar NOP does not require SALU in hardware, + // and is executed out of IB directly. 
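// A standalone sketch of the age-ordered insertion reinsertToSchList() does
// above: a wave that lost arbitration re-enters its per-execution-unit queue
// in front of every younger wave (larger wfDynId), so the oldest wave keeps
// its scheduling priority. AgedWaveStub and SchStatusStub are simplified
// stand-ins for the gem5 Wavefront and SCH_STATUS types.
#include <cstdint>
#include <deque>
#include <utility>

struct AgedWaveStub { uint64_t wfDynId; };
enum SchStatusStub { STUB_RFBUSY, STUB_RFREADY };

using SchQueueStub = std::deque<std::pair<AgedWaveStub*, SchStatusStub>>;

inline void
reinsertByAge(SchQueueStub &queue, AgedWaveStub *w)
{
    auto it = queue.begin();
    while (it != queue.end() && it->first->wfDynId < w->wfDynId) {
        ++it;
    }
    queue.insert(it, std::make_pair(w, STUB_RFREADY));
}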
+ if (ii->isScalar() && !scalarAluRdy) { + dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++; + return false; + } else if (!ii->isScalar() && !vectorAluRdy) { + dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++; + return false; + } + } else if (ii->isEndOfKernel()) { + // EndPgm instruction + if (ii->isScalar() && !scalarAluRdy) { + dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++; + return false; + } + } else if (ii->isBarrier() || ii->isBranch() || ii->isALU()) { + // Barrier, Branch, or ALU instruction + if (ii->isScalar() && !scalarAluRdy) { + dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++; + return false; + } else if (!ii->isScalar() && !vectorAluRdy) { + dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++; + return false; + } + } else if (!ii->isScalar() && ii->isGlobalMem()) { + // Vector Global Memory instruction + bool rdy = true; + if (!glbMemIssueRdy) { + rdy = false; + dispNrdyStalls[SCH_VECTOR_MEM_ISSUE_NRDY]++; + } + if (!glbMemBusRdy) { + rdy = false; + dispNrdyStalls[SCH_VECTOR_MEM_BUS_BUSY_NRDY]++; + } + if (!computeUnit->globalMemoryPipe.coalescerReady(ii)) { + rdy = false; + dispNrdyStalls[SCH_VECTOR_MEM_COALESCER_NRDY]++; + } + if (!computeUnit->globalMemoryPipe.outstandingReqsCheck(ii)) { + rdy = false; + dispNrdyStalls[SCH_VECTOR_MEM_REQS_NRDY]++; + } + if (!rdy) { + return false; + } + } else if (ii->isScalar() && ii->isGlobalMem()) { + // Scalar Global Memory instruction + bool rdy = true; + if (!scalarMemIssueRdy) { + rdy = false; + dispNrdyStalls[SCH_SCALAR_MEM_ISSUE_NRDY]++; + } + if (!scalarMemBusRdy) { + rdy = false; + dispNrdyStalls[SCH_SCALAR_MEM_BUS_BUSY_NRDY]++; + } + if (!computeUnit->scalarMemoryPipe. + isGMReqFIFOWrRdy(w->scalarRdGmReqsInPipe + + w->scalarWrGmReqsInPipe)) { + rdy = false; + dispNrdyStalls[SCH_SCALAR_MEM_FIFO_NRDY]++; + } + if (!rdy) { + return false; + } + } else if (!ii->isScalar() && ii->isLocalMem()) { + // Vector Local Memory instruction + bool rdy = true; + if (!locMemIssueRdy) { + rdy = false; + dispNrdyStalls[SCH_LOCAL_MEM_ISSUE_NRDY]++; + } + if (!locMemBusRdy) { + rdy = false; + dispNrdyStalls[SCH_LOCAL_MEM_BUS_BUSY_NRDY]++; + } + if (!computeUnit->localMemoryPipe. + isLMReqFIFOWrRdy(w->rdLmReqsInPipe + w->wrLmReqsInPipe)) { + rdy = false; + dispNrdyStalls[SCH_LOCAL_MEM_FIFO_NRDY]++; + } + if (!rdy) { + return false; + } + } else if (!ii->isScalar() && ii->isFlat()) { + // Vector Flat memory instruction + bool rdy = true; + if (!glbMemIssueRdy || !locMemIssueRdy) { + rdy = false; + dispNrdyStalls[SCH_FLAT_MEM_ISSUE_NRDY]++; + } + if (!glbMemBusRdy || !locMemBusRdy) { + rdy = false; + dispNrdyStalls[SCH_FLAT_MEM_BUS_BUSY_NRDY]++; + } + if (!computeUnit->globalMemoryPipe.coalescerReady(ii)) { + rdy = false; + dispNrdyStalls[SCH_FLAT_MEM_COALESCER_NRDY]++; + } + if (!computeUnit->globalMemoryPipe.outstandingReqsCheck(ii)) { + rdy = false; + dispNrdyStalls[SCH_FLAT_MEM_REQS_NRDY]++; + } + if (!computeUnit->localMemoryPipe. 
+ isLMReqFIFOWrRdy(w->rdLmReqsInPipe + w->wrLmReqsInPipe)) { + rdy = false; + dispNrdyStalls[SCH_FLAT_MEM_FIFO_NRDY]++; + } + if (!rdy) { + return false; + } + } else { + panic("%s: unknown instr checked for readiness", ii->disassemble()); + return false; + } + dispNrdyStalls[SCH_RDY]++; + return true; } void -ScheduleStage::arbitrate() -{ - // iterate over all Memory pipelines - for (int j = numSIMDs; j < numSIMDs + numMemUnits; ++j) { - if (dispatchList->at(j).first) { - Wavefront *waveToMemPipe = dispatchList->at(j).first; - // iterate over all execution pipelines - for (int i = 0; i < numSIMDs + numMemUnits; ++i) { - if ((i != j) && (dispatchList->at(i).first)) { - Wavefront *waveToExePipe = dispatchList->at(i).first; - // if the two selected wavefronts are mapped to the same - // SIMD unit then they share the VRF - if (waveToMemPipe->simdId == waveToExePipe->simdId) { - int simdId = waveToMemPipe->simdId; - // Read VRF port arbitration: - // If there are read VRF port conflicts between the - // a memory and another instruction we drop the other - // instruction. We don't need to check for write VRF - // port conflicts because the memory instruction either - // does not need to write to the VRF (store) or will - // write to the VRF when the data comes back (load) in - // which case the arbiter of the memory pipes will - // resolve any conflicts - if (computeUnit->vrf[simdId]-> - isReadConflict(waveToMemPipe->wfSlotId, - waveToExePipe->wfSlotId)) { - // FIXME: The "second" member variable is never - // used in the model. I am setting it to READY - // simply to follow the protocol of setting it - // when the WF has an instruction ready to issue - waveStatusList[simdId]->at(waveToExePipe->wfSlotId) - .second = READY; - - dispatchList->at(i).first = nullptr; - dispatchList->at(i).second = EMPTY; - break; - } +ScheduleStage::fillDispatchList() +{ + // update execution resource status + checkMemResources(); + // iterate execution resources + for (int j = 0; j < computeUnit->numExeUnits(); j++) { + assert(dispatchList->at(j).second == EMPTY); + + // iterate waves in schList to pick one for dispatch + auto schIter = schList.at(j).begin(); + bool dispatched = false; + while (schIter != schList.at(j).end()) { + // only attempt to dispatch if status is RFREADY + if (schIter->second == RFREADY) { + // Check if this wave is ready for dispatch + bool dispRdy = dispatchReady(schIter->first); + if (!dispatched && dispRdy) { + // No other wave has been dispatched for this exe + // resource, and this wave is ready. Place this wave + // on dispatchList and make it ready for execution + // next cycle. + + // Acquire a coalescer token if it is a global mem + // operation. 
+ GPUDynInstPtr mp = schIter->first-> + instructionBuffer.front(); + if (!mp->isMemSync() && !mp->isScalar() && + (mp->isGlobalMem() || mp->isFlat())) { + computeUnit->globalMemoryPipe.acqCoalescerToken(mp); + } + + doDispatchListTransition(j, EXREADY, schIter->first); + DPRINTF(GPUSched, "dispatchList[%d]: fillDispatchList: " + "EMPTY->EXREADY\n", j); + schIter->first = nullptr; + schIter = schList.at(j).erase(schIter); + dispatched = true; + } else { + // Either another wave has been dispatched, or this wave + // was not ready, so it is stalled this cycle + schIter->first->schStalls++; + if (!dispRdy) { + // not ready for dispatch, increment stall stat + schIter->first->schResourceStalls++; } + // Examine next wave for this resource + schIter++; } + } else { + // Wave not in RFREADY, try next wave + schIter++; } } + + // Increment stall count if no wave sent to dispatchList for + // current execution resource + if (!dispatched) { + schListToDispListStalls[j]++; + } else { + schListToDispList[j]++; + } } } void -ScheduleStage::exec() +ScheduleStage::arbitrateVrfToLdsBus() { - for (int j = 0; j < numSIMDs + numMemUnits; ++j) { - uint32_t readyListSize = computeUnit->readyList[j].size(); + // Arbitrate the VRF->GM and VRF->LDS buses for Flat memory ops + // Note: a Flat instruction in GFx8 reserves both VRF->Glb memory bus + // and a VRF->LDS bus. In GFx9, this is not the case. - // If no wave is ready to be scheduled on the execution resource - // then skip scheduling for this execution resource - if (!readyListSize) { - continue; - } + // iterate the GM pipelines + for (int i = 0; i < computeUnit->numVectorGlobalMemUnits; i++) { + // get the GM pipe index in the dispatchList + int gm_exe_unit = computeUnit->firstMemUnit() + i; + // get the wave in the dispatchList + Wavefront *w = dispatchList->at(gm_exe_unit).first; + // If the WF is valid, ready to execute, and the instruction + // is a flat access, arbitrate with the WF's assigned LM pipe + if (w && dispatchList->at(gm_exe_unit).second == EXREADY && + w->instructionBuffer.front()->isFlat()) { + // If the associated LM pipe also has a wave selected, block + // that wave and let the Flat instruction issue. The WF in the + // LM pipe is added back to the schList for consideration next + // cycle. + if (dispatchList->at(w->localMem).second == EXREADY) { + reinsertToSchList(w->localMem, + dispatchList->at(w->localMem).first); + // Increment stall stats for LDS-VRF arbitration + ldsBusArbStalls++; + dispatchList->at(w->localMem).first->schLdsArbStalls++; + } + // With arbitration of LM pipe complete, transition the + // LM pipe to SKIP state in the dispatchList to inform EX stage + // that a Flat instruction is executing next cycle + doDispatchListTransition(w->localMem, SKIP, w); + DPRINTF(GPUSched, "dispatchList[%d]: arbVrfLds: " + "EXREADY->SKIP\n", w->localMem); + } + } +} + +void +ScheduleStage::checkRfOperandReadComplete() +{ + // Iterate the schList queues and check if operand reads + // have completed in the RFs. If so, mark the wave as ready for + // selection for dispatchList + for (int j = 0; j < computeUnit->numExeUnits(); ++j) { + for (auto &p : schList.at(j)) { + Wavefront *w = p.first; + assert(w); - Wavefront *waveToBeDispatched = scheduler[j].chooseWave(); - dispatchList->at(j).first = waveToBeDispatched; - waveToBeDispatched->updateResources(); - dispatchList->at(j).second = FILLED; + // Increment the number of cycles the wave spends in the + // SCH stage, since this loop visits every wave in SCH. 
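// A standalone sketch of the selection rule fillDispatchList() applies
// above: walk the age-ordered schList of one execution unit and promote the
// first (oldest) wave that is RFREADY and passes the resource check; every
// other ready-but-blocked wave just accumulates a stall cycle. The types are
// simplified stand-ins for the gem5 classes.
#include <cstdint>
#include <deque>
#include <functional>
#include <utility>

struct DispWaveStub { uint64_t wfDynId; uint64_t schStalls = 0; };
enum DispStatusStub { DISP_RFBUSY, DISP_RFREADY };

inline DispWaveStub *
pickOldestReady(std::deque<std::pair<DispWaveStub*, DispStatusStub>> &schQ,
                const std::function<bool(DispWaveStub*)> &dispatchReady)
{
    DispWaveStub *picked = nullptr;
    for (auto it = schQ.begin(); it != schQ.end();) {
        if (it->second != DISP_RFREADY) {
            ++it;                      // still waiting on RF operand reads
        } else if (!picked && dispatchReady(it->first)) {
            picked = it->first;        // oldest eligible wave wins
            it = schQ.erase(it);       // it moves on toward the dispatchList
        } else {
            it->first->schStalls++;    // ready but blocked this cycle
            ++it;
        }
    }
    return picked;
}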
+ w->schCycles++; - waveStatusList[waveToBeDispatched->simdId]->at( - waveToBeDispatched->wfSlotId).second = BLOCKED; + GPUDynInstPtr ii = w->instructionBuffer.front(); + bool vrfRdy = true; + if (!ii->isScalar()) { + vrfRdy = + computeUnit->vrf[w->simdId]->operandReadComplete(w, ii); + } + bool srfRdy = + computeUnit->srf[w->simdId]->operandReadComplete(w, ii); + bool operandsReady = vrfRdy && srfRdy; + if (operandsReady) { + DPRINTF(GPUSched, + "schList[%d]: WV[%d] operands ready for: %d: %s\n", + j, w->wfDynId, ii->seqNum(), ii->disassemble()); + DPRINTF(GPUSched, "schList[%d]: WV[%d] RFBUSY->RFREADY\n", + j, w->wfDynId); + p.second = RFREADY; + } else { + DPRINTF(GPUSched, + "schList[%d]: WV[%d] operands not ready for: %d: %s\n", + j, w->wfDynId, ii->seqNum(), ii->disassemble()); + + // operands not ready yet, increment SCH stage stats + // aggregate to all wavefronts on the CU + p.second = RFBUSY; + + // Increment stall stats + w->schStalls++; + w->schOpdNrdyStalls++; - assert(computeUnit->readyList[j].size() == readyListSize - 1); + opdNrdyStalls[SCH_RF_OPD_NRDY]++; + if (!vrfRdy) { + opdNrdyStalls[SCH_VRF_OPD_NRDY]++; + } + if (!srfRdy) { + opdNrdyStalls[SCH_SRF_OPD_NRDY]++; + } + } + } } - // arbitrate over all shared resources among instructions being issued - // simultaneously - arbitrate(); +} + +void +ScheduleStage::reserveResources() +{ + std::vector exeUnitReservations; + exeUnitReservations.resize(computeUnit->numExeUnits(), false); + + for (int j = 0; j < computeUnit->numExeUnits(); ++j) { + Wavefront *dispatchedWave = dispatchList->at(j).first; + if (dispatchedWave) { + DISPATCH_STATUS s = dispatchList->at(j).second; + if (s == EMPTY) { + continue; + } else if (s == EXREADY) { + // Wave is ready for execution + std::vector execUnitIds = + dispatchedWave->reserveResources(); + GPUDynInstPtr ii = dispatchedWave->instructionBuffer.front(); + + if (!ii->isScalar()) { + computeUnit->vrf[dispatchedWave->simdId]-> + dispatchInstruction(ii); + } + computeUnit->srf[dispatchedWave->simdId]-> + dispatchInstruction(ii); + + std::stringstream ss; + for (auto id : execUnitIds) { + ss << id << " "; + } + DPRINTF(GPUSched, "dispatchList[%d]: SIMD[%d] WV[%d]: %d: %s" + " Reserving ExeRes[ %s]\n", + j, dispatchedWave->simdId, dispatchedWave->wfDynId, + ii->seqNum(), ii->disassemble(), ss.str()); + // mark the resources as reserved for this cycle + for (auto execUnitId : execUnitIds) { + panic_if(exeUnitReservations.at(execUnitId), + "Execution unit %d is reserved!!!\n" + "SIMD[%d] WV[%d]: %d: %s", + execUnitId, dispatchedWave->simdId, + dispatchedWave->wfDynId, + ii->seqNum(), ii->disassemble()); + exeUnitReservations.at(execUnitId) = true; + } + + // If wavefront::reserveResources reserved multiple resources, + // then we're executing a flat memory instruction. This means + // that we've reserved a global and local memory unit. Thus, + // we need to mark the latter execution unit as not available. + if (execUnitIds.size() > 1) { + int lm_exec_unit M5_VAR_USED = dispatchedWave->localMem; + assert(dispatchList->at(lm_exec_unit).second == SKIP); + } + } else if (s == SKIP) { + // Shared Memory pipe reserved for FLAT instruction. 
+ // Verify the GM pipe for this wave is ready to execute + // and the wave in the GM pipe is the same as the wave + // in the LM pipe + int gm_exec_unit M5_VAR_USED = dispatchedWave->globalMem; + assert(dispatchList->at(gm_exec_unit).first->wfDynId == + dispatchedWave->wfDynId); + assert(dispatchList->at(gm_exec_unit).second == EXREADY); + } + } + } +} + +void +ScheduleStage::deleteFromSch(Wavefront *w) +{ + wavesInSch.erase(w->wfDynId); } void ScheduleStage::regStats() { + rdyListNotEmpty + .init(computeUnit->numExeUnits()) + .name(name() + ".rdy_list_not_empty") + .desc("number of cycles one or more wave on ready list per " + "execution resource") + ; + + rdyListEmpty + .init(computeUnit->numExeUnits()) + .name(name() + ".rdy_list_empty") + .desc("number of cycles no wave on ready list per " + "execution resource") + ; + + addToSchListStalls + .init(computeUnit->numExeUnits()) + .name(name() + ".sch_list_add_stalls") + .desc("number of cycles a wave is not added to schList per " + "execution resource when ready list is not empty") + ; + + schListToDispList + .init(computeUnit->numExeUnits()) + .name(name() + ".sch_list_to_disp_list") + .desc("number of cycles a wave is added to dispatchList per " + "execution resource") + ; + + schListToDispListStalls + .init(computeUnit->numExeUnits()) + .name(name() + ".sch_list_to_disp_list_stalls") + .desc("number of cycles no wave is added to dispatchList per " + "execution resource") + ; + + // Operand Readiness Stall Cycles + opdNrdyStalls + .init(SCH_RF_OPD_NRDY_CONDITIONS) + .name(name() + ".opd_nrdy_stalls") + .desc("number of stalls in SCH due to operands not ready") + ; + opdNrdyStalls.subname(SCH_VRF_OPD_NRDY, csprintf("VRF")); + opdNrdyStalls.subname(SCH_SRF_OPD_NRDY, csprintf("SRF")); + opdNrdyStalls.subname(SCH_RF_OPD_NRDY, csprintf("RF")); + + // dispatchReady Stall Cycles + dispNrdyStalls + .init(SCH_NRDY_CONDITIONS) + .name(name() + ".disp_nrdy_stalls") + .desc("number of stalls in SCH due to resource not ready") + ; + dispNrdyStalls.subname(SCH_SCALAR_ALU_NRDY, csprintf("ScalarAlu")); + dispNrdyStalls.subname(SCH_VECTOR_ALU_NRDY, csprintf("VectorAlu")); + dispNrdyStalls.subname(SCH_VECTOR_MEM_ISSUE_NRDY, + csprintf("VectorMemIssue")); + dispNrdyStalls.subname(SCH_VECTOR_MEM_BUS_BUSY_NRDY, + csprintf("VectorMemBusBusy")); + dispNrdyStalls.subname(SCH_VECTOR_MEM_COALESCER_NRDY, + csprintf("VectorMemCoalescer")); + dispNrdyStalls.subname(SCH_CEDE_SIMD_NRDY, csprintf("CedeSimd")); + dispNrdyStalls.subname(SCH_SCALAR_MEM_ISSUE_NRDY, + csprintf("ScalarMemIssue")); + dispNrdyStalls.subname(SCH_SCALAR_MEM_BUS_BUSY_NRDY, + csprintf("ScalarMemBusBusy")); + dispNrdyStalls.subname(SCH_SCALAR_MEM_FIFO_NRDY, + csprintf("ScalarMemFIFO")); + dispNrdyStalls.subname(SCH_LOCAL_MEM_ISSUE_NRDY, + csprintf("LocalMemIssue")); + dispNrdyStalls.subname(SCH_LOCAL_MEM_BUS_BUSY_NRDY, + csprintf("LocalMemBusBusy")); + dispNrdyStalls.subname(SCH_LOCAL_MEM_FIFO_NRDY, + csprintf("LocalMemFIFO")); + dispNrdyStalls.subname(SCH_FLAT_MEM_ISSUE_NRDY, + csprintf("FlatMemIssue")); + dispNrdyStalls.subname(SCH_FLAT_MEM_BUS_BUSY_NRDY, + csprintf("FlatMemBusBusy")); + dispNrdyStalls.subname(SCH_FLAT_MEM_COALESCER_NRDY, + csprintf("FlatMemCoalescer")); + dispNrdyStalls.subname(SCH_FLAT_MEM_FIFO_NRDY, + csprintf("FlatMemFIFO")); + dispNrdyStalls.subname(SCH_RDY, csprintf("Ready")); + + // RF Access Stall Cycles + rfAccessStalls + .init(SCH_RF_ACCESS_NRDY_CONDITIONS) + .name(name() + ".rf_access_stalls") + .desc("number of stalls due to RF access denied") + ; + 
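// The stall counters above all follow one pattern: a vector stat sized by an
// enum of reasons, with subname() attaching a label to each bin. A plain C++
// analogue of that bookkeeping (the real code uses gem5's Stats::Vector, not
// this stand-in):
#include <array>
#include <cstdint>
#include <string>

enum RfStallReasonStub { VRF_RD, VRF_WR, SRF_RD, SRF_WR, ANY, NUM_REASONS };

struct LabeledCountersStub
{
    std::array<uint64_t, NUM_REASONS> count{};
    std::array<std::string, NUM_REASONS> label{
        {"VrfRd", "VrfWr", "SrfRd", "SrfWr", "Any"}};

    void record(RfStallReasonStub reason) { ++count[reason]; }
};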
rfAccessStalls.subname(SCH_VRF_RD_ACCESS_NRDY, csprintf("VrfRd")); + rfAccessStalls.subname(SCH_VRF_WR_ACCESS_NRDY, csprintf("VrfWr")); + rfAccessStalls.subname(SCH_SRF_RD_ACCESS_NRDY, csprintf("SrfRd")); + rfAccessStalls.subname(SCH_SRF_WR_ACCESS_NRDY, csprintf("SrfWr")); + rfAccessStalls.subname(SCH_RF_ACCESS_NRDY, csprintf("Any")); + + // Stall cycles due to wave losing LDS bus arbitration + ldsBusArbStalls + .name(name() + ".lds_bus_arb_stalls") + .desc("number of stalls due to VRF->LDS bus conflicts") + ; } diff --git a/src/gpu-compute/schedule_stage.hh b/src/gpu-compute/schedule_stage.hh index ee2dd14f7..98519701a 100644 --- a/src/gpu-compute/schedule_stage.hh +++ b/src/gpu-compute/schedule_stage.hh @@ -34,6 +34,9 @@ #ifndef __SCHEDULE_STAGE_HH__ #define __SCHEDULE_STAGE_HH__ +#include +#include +#include #include #include @@ -54,40 +57,169 @@ struct ComputeUnitParams; class ScheduleStage { public: - ScheduleStage(const ComputeUnitParams *params); + ScheduleStage(const ComputeUnitParams *params, ComputeUnit *cu); ~ScheduleStage(); void init(ComputeUnit *cu); void exec(); - void arbitrate(); + // Stats related variables and methods std::string name() { return _name; } + enum SchNonRdyType { + SCH_SCALAR_ALU_NRDY, + SCH_VECTOR_ALU_NRDY, + SCH_VECTOR_MEM_ISSUE_NRDY, + SCH_VECTOR_MEM_BUS_BUSY_NRDY, + SCH_VECTOR_MEM_COALESCER_NRDY, + SCH_VECTOR_MEM_REQS_NRDY, + SCH_CEDE_SIMD_NRDY, + SCH_SCALAR_MEM_ISSUE_NRDY, + SCH_SCALAR_MEM_BUS_BUSY_NRDY, + SCH_SCALAR_MEM_FIFO_NRDY, + SCH_LOCAL_MEM_ISSUE_NRDY, + SCH_LOCAL_MEM_BUS_BUSY_NRDY, + SCH_LOCAL_MEM_FIFO_NRDY, + SCH_FLAT_MEM_ISSUE_NRDY, + SCH_FLAT_MEM_BUS_BUSY_NRDY, + SCH_FLAT_MEM_COALESCER_NRDY, + SCH_FLAT_MEM_REQS_NRDY, + SCH_FLAT_MEM_FIFO_NRDY, + SCH_RDY, + SCH_NRDY_CONDITIONS + }; + enum schopdnonrdytype_e { + SCH_VRF_OPD_NRDY, + SCH_SRF_OPD_NRDY, + SCH_RF_OPD_NRDY, + SCH_RF_OPD_NRDY_CONDITIONS + }; + enum schrfaccessnonrdytype_e { + SCH_VRF_RD_ACCESS_NRDY, + SCH_VRF_WR_ACCESS_NRDY, + SCH_SRF_RD_ACCESS_NRDY, + SCH_SRF_WR_ACCESS_NRDY, + SCH_RF_ACCESS_NRDY, + SCH_RF_ACCESS_NRDY_CONDITIONS + }; + void regStats(); + // Called by ExecStage to inform SCH of instruction execution + void deleteFromSch(Wavefront *w); + + // Schedule List status + enum SCH_STATUS + { + RFBUSY = 0, // RF busy reading operands + RFREADY, // ready for exec + }; + private: ComputeUnit *computeUnit; - uint32_t numSIMDs; - uint32_t numMemUnits; - // Each execution resource will have its own // scheduler and a dispatch list std::vector scheduler; - // Stores the status of waves. A READY implies the - // wave is ready to be scheduled this cycle and - // is already present in the readyList - std::vector>*> - waveStatusList; - // List of waves which will be dispatched to - // each execution resource. A FILLED implies - // dispatch list is non-empty and - // execution unit has something to execute - // this cycle. Currently, the dispatch list of + // each execution resource. + // Currently, the dispatch list of // an execution resource can hold only one wave because // an execution resource can execute only one wave in a cycle. 
std::vector> *dispatchList; + // Stats + + // Number of cycles with empty (or not empty) readyList, per execution + // resource, when the CU is active (not sleeping) + Stats::Vector rdyListEmpty; + Stats::Vector rdyListNotEmpty; + + // Number of cycles, per execution resource, when at least one wave + // was on the readyList and picked by scheduler, but was unable to be + // added to the schList, when the CU is active (not sleeping) + Stats::Vector addToSchListStalls; + + // Number of cycles, per execution resource, when a wave is selected + // as candidate for dispatchList from schList + // Note: may be arbitrated off dispatchList (e.g., LDS arbitration) + Stats::Vector schListToDispList; + + // Per execution resource stat, incremented once per cycle if no wave + // was selected as candidate for dispatch and moved to dispatchList + Stats::Vector schListToDispListStalls; + + // Number of times a wave is selected by the scheduler but cannot + // be added to the schList due to register files not being able to + // support reads or writes of operands. RF_ACCESS_NRDY condition is always + // incremented if at least one read/write not supported, other + // conditions are incremented independently from each other. + Stats::Vector rfAccessStalls; + + // Number of times a wave is executing FLAT instruction and + // forces another wave occupying its required local memory resource + // to be deselected for execution, and placed back on schList + Stats::Scalar ldsBusArbStalls; + + // Count of times VRF and/or SRF blocks waves on schList from + // performing RFBUSY->RFREADY transition + Stats::Vector opdNrdyStalls; + + // Count of times resource required for dispatch is not ready and + // blocks wave in RFREADY state on schList from potentially moving + // to dispatchList + Stats::Vector dispNrdyStalls; + std::string _name; + + // called by exec() to add a wave to schList if the RFs can support it + bool addToSchList(int exeType, Wavefront *w); + // re-insert a wave to schList if wave lost arbitration + // wave is inserted such that age order (oldest to youngest) is preserved + void reinsertToSchList(int exeType, Wavefront *w); + // check waves in schList to see if RF reads complete + void checkRfOperandReadComplete(); + // check execution resources for readiness + bool vectorAluRdy; + bool scalarAluRdy; + bool scalarMemBusRdy; + bool scalarMemIssueRdy; + bool glbMemBusRdy; + bool glbMemIssueRdy; + bool locMemBusRdy; + bool locMemIssueRdy; + // check status of memory pipes and RF to Mem buses + void checkMemResources(); + // resource ready check called by fillDispatchList + bool dispatchReady(Wavefront *w); + // pick waves from schList and populate dispatchList with one wave + // per EXE resource type + void fillDispatchList(); + // arbitrate Shared Mem Pipe VRF/LDS bus for waves in dispatchList + void arbitrateVrfToLdsBus(); + // schedule destination operand writes to register files for waves in + // dispatchList + void scheduleRfDestOperands(); + // invoked by scheduleRfDestOperands to schedule RF writes for a wave + bool schedRfWrites(int exeType, Wavefront *w); + // reserve resources for waves surviving arbitration in dispatchList + void reserveResources(); + + void doDispatchListTransition(int unitId, DISPATCH_STATUS s, + Wavefront *w = nullptr); + + // Set tracking wfDynId for each wave present in schedule stage + // Used to allow only one instruction per wave in schedule + std::unordered_set wavesInSch; + + // List of waves (one list per exe resource) that are in schedule + // stage. 
Waves are added to this list after being selected by the scheduler
+    // from readyList. Waves are removed from this list and placed on
+    // dispatchList when their status reaches RFREADY.
+    // Waves are kept ordered by age for each resource, always favoring
+    // forward progress for the oldest wave.
+    // The maximum number of waves per resource can be determined by either
+    // the VRF/SRF availability or limits imposed by parameters (to be added)
+    // of the SCH stage or CU.
+    std::vector<std::deque<std::pair<Wavefront*, SCH_STATUS>>> schList;
 };
 
 #endif // __SCHEDULE_STAGE_HH__
diff --git a/src/gpu-compute/scoreboard_check_stage.cc b/src/gpu-compute/scoreboard_check_stage.cc
index 262378e2c..c4b9b9fb6 100644
--- a/src/gpu-compute/scoreboard_check_stage.cc
+++ b/src/gpu-compute/scoreboard_check_stage.cc
@@ -33,29 +33,23 @@
 
 #include "gpu-compute/scoreboard_check_stage.hh"
 
+#include "debug/GPUExec.hh"
+#include "debug/GPUSched.hh"
 #include "gpu-compute/compute_unit.hh"
 #include "gpu-compute/gpu_static_inst.hh"
+#include "gpu-compute/scalar_register_file.hh"
 #include "gpu-compute/shader.hh"
+#include "gpu-compute/vector_register_file.hh"
 #include "gpu-compute/wavefront.hh"
 #include "params/ComputeUnit.hh"
 
 ScoreboardCheckStage::ScoreboardCheckStage(const ComputeUnitParams *p)
-    : numSIMDs(p->num_SIMDs),
-      numMemUnits(p->num_global_mem_pipes + p->num_shared_mem_pipes),
-      numShrMemPipes(p->num_shared_mem_pipes),
-      vectorAluInstAvail(nullptr),
-      lastGlbMemSimd(-1),
-      lastShrMemSimd(-1), glbMemInstAvail(nullptr),
-      shrMemInstAvail(nullptr)
 {
 }
 
 ScoreboardCheckStage::~ScoreboardCheckStage()
 {
     readyList.clear();
-    waveStatusList.clear();
-    shrMemInstAvail = nullptr;
-    glbMemInstAvail = nullptr;
 }
 
 void
@@ -64,102 +58,212 @@ ScoreboardCheckStage::init(ComputeUnit *cu)
     computeUnit = cu;
     _name = computeUnit->name() + ".ScoreboardCheckStage";
 
-    for (int unitId = 0; unitId < numSIMDs + numMemUnits; ++unitId) {
+    for (int unitId = 0; unitId < computeUnit->numExeUnits(); ++unitId) {
         readyList.push_back(&computeUnit->readyList[unitId]);
     }
-
-    for (int unitId = 0; unitId < numSIMDs; ++unitId) {
-        waveStatusList.push_back(&computeUnit->waveStatusList[unitId]);
-    }
-
-    vectorAluInstAvail = &computeUnit->vectorAluInstAvail;
-    glbMemInstAvail= &computeUnit->glbMemInstAvail;
-    shrMemInstAvail= &computeUnit->shrMemInstAvail;
 }
 
 void
-ScoreboardCheckStage::initStatistics()
+ScoreboardCheckStage::collectStatistics(nonrdytype_e rdyStatus)
 {
-    lastGlbMemSimd = -1;
-    lastShrMemSimd = -1;
-    *glbMemInstAvail = 0;
-    *shrMemInstAvail = 0;
-
-    for (int unitId = 0; unitId < numSIMDs; ++unitId)
-        vectorAluInstAvail->at(unitId) = false;
+    panic_if(rdyStatus == NRDY_ILLEGAL || rdyStatus >= NRDY_CONDITIONS,
+             "Instruction ready status %d is illegal!!!", rdyStatus);
+    stallCycles[rdyStatus]++;
 }
 
-void
-ScoreboardCheckStage::collectStatistics(Wavefront *curWave, int unitId)
+// Return true if this wavefront is ready
+// to execute an instruction of the specified type.
+// It also returns the reason (in rdyStatus) if the instruction is not
+// ready. Finally, it sets the execution resource type (in exeResType)
+// of the instruction, but only if the instruction is ready.
+bool
+ScoreboardCheckStage::ready(Wavefront *w, nonrdytype_e *rdyStatus,
+                            int *exeResType, int wfSlot)
 {
-    if (curWave->instructionBuffer.empty())
-        return;
-
-    // track which vector SIMD unit has at least one WV with a vector
-    // ALU as the oldest instruction in its Instruction buffer
-    vectorAluInstAvail->at(unitId) = vectorAluInstAvail->at(unitId) ||
-                                     curWave->isOldestInstALU();
-
-    // track how many vector SIMD units have at least one WV with a
-    // vector Global memory instruction as the oldest instruction
-    // in its Instruction buffer
-    if ((curWave->isOldestInstGMem() || curWave->isOldestInstPrivMem() ||
-         curWave->isOldestInstFlatMem()) && lastGlbMemSimd != unitId &&
-        *glbMemInstAvail <= 1) {
-        (*glbMemInstAvail)++;
-        lastGlbMemSimd = unitId;
+    /**
+     * The waitCnt checks have to be done BEFORE checking for the Instruction
+     * buffer empty condition. Otherwise, it will result in a deadlock if
+     * the last instruction in the Instruction buffer is a waitCnt: after
+     * executing the waitCnt, the Instruction buffer would be empty and the
+     * ready check logic would exit BEFORE checking for wait counters being
+     * satisfied.
+     */
+
+    // A waitCnt instruction has been dispatched or executed: the next
+    // instruction should be blocked until the waitCnts are satisfied.
+    if (w->getStatus() == Wavefront::S_WAITCNT) {
+        if (!w->waitCntsSatisfied()) {
+            *rdyStatus = NRDY_WAIT_CNT;
+            return false;
+        }
+    }
+
+    // Is the wave waiting at a barrier? Check this condition BEFORE checking
+    // for instruction buffer occupancy to avoid a deadlock when the barrier
+    // is the last instruction in the instruction buffer.
+    if (w->stalledAtBarrier) {
+        if (!computeUnit->AllAtBarrier(w->barrierId, w->barrierCnt,
+                computeUnit->getRefCounter(w->dispatchId, w->wgId))) {
+            // Are all threads at the barrier?
+            *rdyStatus = NRDY_BARRIER_WAIT;
+            return false;
+        }
+        w->oldBarrierCnt = w->barrierCnt;
+        w->stalledAtBarrier = false;
+    }
+
+    // Check WF status: it has to be running
+    if (w->getStatus() == Wavefront::S_STOPPED ||
+        w->getStatus() == Wavefront::S_RETURNING ||
+        w->getStatus() == Wavefront::S_STALLED) {
+        *rdyStatus = NRDY_WF_STOP;
+        return false;
+    }
+
+    // Is the instruction buffer empty?
+    if (w->instructionBuffer.empty()) {
+        *rdyStatus = NRDY_IB_EMPTY;
+        return false;
+    }
+
+    // Check the next instruction from the instruction buffer
+    GPUDynInstPtr ii = w->nextInstr();
+    // The only instruction in the instruction buffer has already been
+    // dispatched; no need to check it again for readiness
+    if (!ii) {
+        *rdyStatus = NRDY_IB_EMPTY;
+        return false;
+    }
+
+    // The following code is very error prone and the entire process for
+    // checking readiness will be fixed eventually. In the meantime, let's
+    // make sure that we do not silently let an instruction type slip
+    // through this logic and always return not ready.
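// A standalone sketch of why the ordering of the checks above matters: the
// waitcnt and barrier tests must run before the "instruction buffer empty"
// test, otherwise a wave whose last buffered instruction was the waitcnt or
// barrier would be reported as empty and never re-examined. The stub below
// stands in for the relevant gem5 Wavefront state.
#include <deque>

struct ReadyCheckStub
{
    bool waitCntPending = false;     // S_WAITCNT with counters unsatisfied
    bool stalledAtBarrier = false;
    std::deque<int> instructionBuffer;
};

enum ReadyResultStub { STUB_RDY, STUB_NRDY_WAIT_CNT, STUB_NRDY_BARRIER,
                       STUB_NRDY_IB_EMPTY };

inline ReadyResultStub
readyCheckOrder(const ReadyCheckStub &w)
{
    if (w.waitCntPending) return STUB_NRDY_WAIT_CNT;    // checked first
    if (w.stalledAtBarrier) return STUB_NRDY_BARRIER;   // then the barrier
    if (w.instructionBuffer.empty()) return STUB_NRDY_IB_EMPTY;  // only then
    return STUB_RDY;
}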
+ if (!(ii->isBarrier() || ii->isNop() || ii->isReturn() || ii->isBranch() || + ii->isALU() || ii->isLoad() || ii->isStore() || ii->isAtomic() || + ii->isEndOfKernel() || ii->isMemSync() || ii->isFlat())) { + panic("next instruction: %s is of unknown type\n", ii->disassemble()); } - // track how many vector SIMD units have at least one WV with a - // vector shared memory (LDS) instruction as the oldest instruction - // in its Instruction buffer - // TODO: parametrize the limit of the LDS units - if (curWave->isOldestInstLMem() && (*shrMemInstAvail <= numShrMemPipes) && - lastShrMemSimd != unitId) { - (*shrMemInstAvail)++; - lastShrMemSimd = unitId; + DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Checking Ready for Inst : %s\n", + computeUnit->cu_id, w->simdId, w->wfSlotId, ii->disassemble()); + + // Non-scalar (i.e., vector) instructions may use VGPRs + if (!ii->isScalar()) { + if (!computeUnit->vrf[w->simdId]->operandsReady(w, ii)) { + *rdyStatus = NRDY_VGPR_NRDY; + return false; + } } + // Scalar and non-scalar instructions may use SGPR + if (!computeUnit->srf[w->simdId]->operandsReady(w, ii)) { + *rdyStatus = NRDY_SGPR_NRDY; + return false; + } + + // The hardware implicitly executes S_WAITCNT 0 before executing + // the S_ENDPGM instruction. Implementing this implicit S_WAITCNT. + // isEndOfKernel() is used to identify the S_ENDPGM instruction + // On identifying it, we do the following: + // 1. Wait for all older instruction to execute + // 2. Once all the older instruction are executed, we add a wait + // count for the executed instruction(s) to complete. + if (ii->isEndOfKernel()) { + // Waiting for older instruction to execute + if (w->instructionBuffer.front()->seqNum() != ii->seqNum()) { + *rdyStatus = NRDY_WAIT_CNT; + return false; + } + // Older instructions have executed, adding implicit wait count + w->setStatus(Wavefront::S_WAITCNT); + w->setWaitCnts(0, 0, 0); + if (!w->waitCntsSatisfied()) { + *rdyStatus = NRDY_WAIT_CNT; + return false; + } + } + DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Ready Inst : %s\n", computeUnit->cu_id, + w->simdId, w->wfSlotId, ii->disassemble()); + *exeResType = mapWaveToExeUnit(w); + *rdyStatus = INST_RDY; + return true; +} + +int +ScoreboardCheckStage::mapWaveToExeUnit(Wavefront *w) +{ + GPUDynInstPtr ii = w->nextInstr(); + assert(ii); + if (ii->isFlat()) { + /** + * NOTE: Flat memory ops requires both GM and LM resources. + * The simulator models consumption of both GM and LM + * resources in the schedule stage. At instruction execution time, + * after the aperture check is performed, only the GM or LM pipe + * is actually reserved by the timing model. The GM unit is returned + * here since Flat ops occupy the GM slot in the ready and dispatch + * lists. They also consume the LM slot in the dispatch list. 
+ */ + return w->globalMem; + } else if (ii->isLocalMem()) { + return w->localMem; + } else if (ii->isGlobalMem()) { + if (!ii->isScalar()) { + return w->globalMem; + } else { + return w->scalarMem; + } + } else if (ii->isBranch() || + ii->isALU() || + (ii->isKernArgSeg() && ii->isLoad()) || + ii->isArgSeg() || + ii->isReturn() || + ii->isEndOfKernel() || + ii->isNop() || + ii->isBarrier()) { + if (!ii->isScalar()) { + return w->simdId; + } else { + return w->scalarAluGlobalIdx; + } + } + panic("%s: unmapped to an execution resource", ii->disassemble()); + return computeUnit->numExeUnits(); } void ScoreboardCheckStage::exec() { - initStatistics(); - // reset the ready list for all execution units; it will be // constructed every cycle since resource availability may change - for (int unitId = 0; unitId < numSIMDs + numMemUnits; ++unitId) { + for (int unitId = 0; unitId < computeUnit->numExeUnits(); ++unitId) { + // Reset wavefront pointers to nullptr so clear() on the vector + // does not accidentally destruct the wavefront object + for (int i = 0; i < readyList[unitId]->size(); i++) { + readyList[unitId]->at(i) = nullptr; + } readyList[unitId]->clear(); } - - // iterate over the Wavefronts of all SIMD units - for (int unitId = 0; unitId < numSIMDs; ++unitId) { - for (int wvId = 0; wvId < computeUnit->shader->n_wf; ++wvId) { + // iterate over all WF slots across all vector ALUs + for (int simdId = 0; simdId < computeUnit->numVectorALUs; ++simdId) { + for (int wfSlot = 0; wfSlot < computeUnit->shader->n_wf; ++wfSlot) { // reset the ready status of each wavefront - waveStatusList[unitId]->at(wvId).second = BLOCKED; - Wavefront *curWave = waveStatusList[unitId]->at(wvId).first; - collectStatistics(curWave, unitId); - - if (curWave->ready(Wavefront::I_ALU)) { - readyList[unitId]->push_back(curWave); - waveStatusList[unitId]->at(wvId).second = READY; - } else if (curWave->ready(Wavefront::I_GLOBAL)) { - if (computeUnit->cedeSIMD(unitId, wvId)) { - continue; - } - - readyList[computeUnit->GlbMemUnitId()]->push_back(curWave); - waveStatusList[unitId]->at(wvId).second = READY; - } else if (curWave->ready(Wavefront::I_SHARED)) { - readyList[computeUnit->ShrMemUnitId()]->push_back(curWave); - waveStatusList[unitId]->at(wvId).second = READY; - } else if (curWave->ready(Wavefront::I_FLAT)) { - readyList[computeUnit->GlbMemUnitId()]->push_back(curWave); - waveStatusList[unitId]->at(wvId).second = READY; - } else if (curWave->ready(Wavefront::I_PRIVATE)) { - readyList[computeUnit->GlbMemUnitId()]->push_back(curWave); - waveStatusList[unitId]->at(wvId).second = READY; + Wavefront *curWave = computeUnit->wfList[simdId][wfSlot]; + nonrdytype_e rdyStatus = NRDY_ILLEGAL; + int exeResType = -1; + // check WF readiness: If the WF's oldest + // instruction is ready to issue then add the WF to the ready list + if (ready(curWave, &rdyStatus, &exeResType, wfSlot)) { + assert(curWave->simdId == simdId); + DPRINTF(GPUSched, + "Adding to readyList[%d]: SIMD[%d] WV[%d]: %d: %s\n", + exeResType, + curWave->simdId, curWave->wfDynId, + curWave->nextInstr()->seqNum(), + curWave->nextInstr()->disassemble()); + readyList.at(exeResType)->push_back(curWave); } + collectStatistics(rdyStatus); } } } @@ -167,4 +271,16 @@ ScoreboardCheckStage::exec() void ScoreboardCheckStage::regStats() { + stallCycles + .init(NRDY_CONDITIONS) + .name(name() + ".stall_cycles") + .desc("number of cycles wave stalled in SCB") + ; + stallCycles.subname(NRDY_WF_STOP, csprintf("WFStop")); + stallCycles.subname(NRDY_IB_EMPTY, csprintf("IBEmpty")); + 
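// A standalone sketch of the mapping rule in mapWaveToExeUnit() above: the
// wave's oldest instruction selects exactly one execution-resource index,
// with Flat ops returning the wave's global-memory slot (the LM slot is
// reserved separately in SCH). InstKindStub and ExeUnitsStub are simplified
// stand-ins for the gem5 GPUDynInst and Wavefront fields.
enum class InstKindStub
{
    Flat, LocalMem, GlobalMem, ScalarMem, VectorAlu, ScalarAlu
};

struct ExeUnitsStub
{
    int simdId;         // vector ALU owned by this wave
    int scalarAluIdx;   // global index of its scalar ALU
    int globalMem;      // vector global-memory pipe slot
    int localMem;       // LDS pipe slot
    int scalarMem;      // scalar-memory pipe slot
};

inline int
mapToExeUnit(InstKindStub kind, const ExeUnitsStub &w)
{
    switch (kind) {
      case InstKindStub::Flat:      return w.globalMem;
      case InstKindStub::LocalMem:  return w.localMem;
      case InstKindStub::GlobalMem: return w.globalMem;
      case InstKindStub::ScalarMem: return w.scalarMem;
      case InstKindStub::VectorAlu: return w.simdId;
      case InstKindStub::ScalarAlu: return w.scalarAluIdx;
    }
    return -1;  // unreachable for valid kinds
}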
stallCycles.subname(NRDY_WAIT_CNT, csprintf("WaitCnt")); + stallCycles.subname(NRDY_BARRIER_WAIT, csprintf("BarrierWait")); + stallCycles.subname(NRDY_VGPR_NRDY, csprintf("VgprBusy")); + stallCycles.subname(NRDY_SGPR_NRDY, csprintf("SgprBusy")); + stallCycles.subname(INST_RDY, csprintf("InstrReady")); } diff --git a/src/gpu-compute/scoreboard_check_stage.hh b/src/gpu-compute/scoreboard_check_stage.hh index 9f690d7b6..1e5695139 100644 --- a/src/gpu-compute/scoreboard_check_stage.hh +++ b/src/gpu-compute/scoreboard_check_stage.hh @@ -36,20 +36,17 @@ #include #include +#include #include #include +#include "sim/stats.hh" + class ComputeUnit; class Wavefront; struct ComputeUnitParams; -enum WAVE_STATUS -{ - BLOCKED = 0, - READY -}; - /* * Scoreboard check stage. * All wavefronts are analyzed to see if they are ready @@ -61,6 +58,18 @@ enum WAVE_STATUS class ScoreboardCheckStage { public: + enum nonrdytype_e { + NRDY_ILLEGAL, + NRDY_WF_STOP, + NRDY_IB_EMPTY, + NRDY_WAIT_CNT, + NRDY_BARRIER_WAIT, + NRDY_VGPR_NRDY, + NRDY_SGPR_NRDY, + INST_RDY, + NRDY_CONDITIONS + }; + ScoreboardCheckStage(const ComputeUnitParams* params); ~ScoreboardCheckStage(); void init(ComputeUnit *cu); @@ -71,31 +80,18 @@ class ScoreboardCheckStage void regStats(); private: - void collectStatistics(Wavefront *curWave, int unitId); - void initStatistics(); + void collectStatistics(nonrdytype_e rdyStatus); + int mapWaveToExeUnit(Wavefront *w); + bool ready(Wavefront *w, nonrdytype_e *rdyStatus, + int *exeResType, int wfSlot); ComputeUnit *computeUnit; - uint32_t numSIMDs; - uint32_t numMemUnits; - uint32_t numShrMemPipes; - - // flag per vector SIMD unit that is set when there is at least one - // WF that has a vector ALU instruction as the oldest in its - // Instruction Buffer - std::vector *vectorAluInstAvail; - int lastGlbMemSimd; - int lastShrMemSimd; - int *glbMemInstAvail; - int *shrMemInstAvail; // List of waves which are ready to be scheduled. // Each execution resource has a ready list std::vector*> readyList; - // Stores the status of waves. 
A READY implies the - // wave is ready to be scheduled this cycle and - // is already present in the readyList - std::vector>*> - waveStatusList; + // Stats + Stats::Vector stallCycles; std::string _name; }; diff --git a/src/gpu-compute/shader.cc b/src/gpu-compute/shader.cc index 91f78a50a..4be2fbfbd 100644 --- a/src/gpu-compute/shader.cc +++ b/src/gpu-compute/shader.cc @@ -39,37 +39,63 @@ #include "base/chunk_generator.hh" #include "debug/GPUDisp.hh" #include "debug/GPUMem.hh" -#include "debug/HSAIL.hh" +#include "debug/GPUShader.hh" +#include "debug/GPUWgLatency.hh" #include "gpu-compute/dispatcher.hh" +#include "gpu-compute/gpu_command_processor.hh" #include "gpu-compute/gpu_static_inst.hh" -#include "gpu-compute/qstruct.hh" +#include "gpu-compute/hsa_queue_entry.hh" #include "gpu-compute/wavefront.hh" #include "mem/packet.hh" #include "mem/ruby/system/RubySystem.hh" #include "sim/sim_exit.hh" -Shader::Shader(const Params *p) - : ClockedObject(p), clock(p->clk_domain->clockPeriod()), - cpuThread(nullptr), gpuTc(nullptr), cpuPointer(p->cpu_pointer), - tickEvent([this]{ processTick(); }, "Shader tick", - false, Event::CPU_Tick_Pri), - timingSim(p->timing), hsail_mode(SIMT), - impl_kern_boundary_sync(p->impl_kern_boundary_sync), - separate_acquire_release(p->separate_acquire_release), coissue_return(1), - trace_vgpr_all(1), n_cu((p->CUs).size()), n_wf(p->n_wf), - globalMemSize(p->globalmem), nextSchedCu(0), sa_n(0), tick_cnt(0), - box_tick_cnt(0), start_tick_cnt(0) +Shader::Shader(const Params *p) : ClockedObject(p), + _activeCus(0), _lastInactiveTick(0), cpuThread(nullptr), + gpuTc(nullptr), cpuPointer(p->cpu_pointer), + tickEvent([this]{ execScheduledAdds(); }, "Shader scheduled adds event", + false, Event::CPU_Tick_Pri), + timingSim(p->timing), hsail_mode(SIMT), + impl_kern_boundary_sync(p->impl_kern_boundary_sync), + coissue_return(1), + trace_vgpr_all(1), n_cu((p->CUs).size()), n_wf(p->n_wf), + globalMemSize(p->globalmem), + nextSchedCu(0), sa_n(0), gpuCmdProc(*p->gpu_cmd_proc), + _dispatcher(*p->dispatcher), + max_valu_insts(p->max_valu_insts), total_valu_insts(0) { + gpuCmdProc.setShader(this); + _dispatcher.setShader(this); + + _gpuVmApe.base = ((Addr)1 << 61) + 0x1000000000000L; + _gpuVmApe.limit = (_gpuVmApe.base & 0xFFFFFF0000000000UL) | 0xFFFFFFFFFFL; + + _ldsApe.base = ((Addr)1 << 61) + 0x0; + _ldsApe.limit = (_ldsApe.base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF; + + _scratchApe.base = ((Addr)1 << 61) + 0x100000000L; + _scratchApe.limit = (_scratchApe.base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF; + + shHiddenPrivateBaseVmid = 0; cuList.resize(n_cu); + panic_if(n_wf <= 0, "Must have at least 1 WF Slot per SIMD"); + for (int i = 0; i < n_cu; ++i) { cuList[i] = p->CUs[i]; assert(i == cuList[i]->cu_id); cuList[i]->shader = this; + cuList[i]->idleCUTimeout = p->idlecu_timeout; } } +GPUDispatcher& +Shader::dispatcher() +{ + return _dispatcher; +} + Addr Shader::mmap(int length) { @@ -83,11 +109,11 @@ Shader::mmap(int length) auto mem_state = proc->memState; if (proc->mmapGrowsDown()) { - DPRINTF(HSAIL, "GROWS DOWN"); + DPRINTF(GPUShader, "GROWS DOWN"); start = mem_state->getMmapEnd() - length; mem_state->setMmapEnd(start); } else { - DPRINTF(HSAIL, "GROWS UP"); + DPRINTF(GPUShader, "GROWS UP"); start = mem_state->getMmapEnd(); mem_state->setMmapEnd(start + length); @@ -96,7 +122,7 @@ Shader::mmap(int length) mem_state->getMmapEnd()); } - DPRINTF(HSAIL,"Shader::mmap start= %#x, %#x\n", start, length); + DPRINTF(GPUShader, "Shader::mmap start= %#x, %#x\n", start, length); 
proc->allocateMem(start, length); @@ -146,15 +172,15 @@ ShaderParams::create() } void -Shader::exec() +Shader::execScheduledAdds() { - tick_cnt = curTick(); - box_tick_cnt = curTick() - start_tick_cnt; + assert(!sa_when.empty()); // apply any scheduled adds for (int i = 0; i < sa_n; ++i) { - if (sa_when[i] <= tick_cnt) { + if (sa_when[i] <= curTick()) { *sa_val[i] += sa_x[i]; + panic_if(*sa_val[i] < 0, "Negative counter value\n"); sa_val.erase(sa_val.begin() + i); sa_x.erase(sa_x.begin() + i); sa_when.erase(sa_when.begin() + i); @@ -162,14 +188,62 @@ Shader::exec() --i; } } + if (!sa_when.empty()) { + Tick shader_wakeup = *std::max_element(sa_when.begin(), + sa_when.end()); + DPRINTF(GPUDisp, "Scheduling shader wakeup at %lu\n", shader_wakeup); + schedule(tickEvent, shader_wakeup); + } else { + DPRINTF(GPUDisp, "sa_when empty, shader going to sleep!\n"); + } +} + +/* + * dispatcher/shader arranges invalidate requests to the CUs + */ +void +Shader::prepareInvalidate(HSAQueueEntry *task) { + // if invalidate has already started/finished, then do nothing + if (task->isInvStarted()) return; + + // invalidate has never started; it can only perform once at kernel launch + assert(task->outstandingInvs() == -1); + int kernId = task->dispatchId(); + // counter value is 0 now, indicating the inv is about to start + _dispatcher.updateInvCounter(kernId, +1); + + // iterate all cus managed by the shader, to perform invalidate. + for (int i_cu = 0; i_cu < n_cu; ++i_cu) { + // create a request to hold INV info; the request's fields will + // be updated in cu before use + auto req = std::make_shared(0, 0, 0, + cuList[i_cu]->masterId(), + 0, -1); + + _dispatcher.updateInvCounter(kernId, +1); + // all necessary INV flags are all set now, call cu to execute + cuList[i_cu]->doInvalidate(req, task->dispatchId()); + } +} - // clock all of the cu's - for (int i = 0; i < n_cu; ++i) - cuList[i]->exec(); +/** + * dispatcher/shader arranges flush requests to the CUs + */ +void +Shader::prepareFlush(GPUDynInstPtr gpuDynInst){ + int kernId = gpuDynInst->kern_id; + // flush has never been started, performed only once at kernel end + assert(_dispatcher.getOutstandingWbs(kernId) == 0); + + // iterate all cus, managed by the shader, to perform flush. + for (int i_cu = 0; i_cu < n_cu; ++i_cu) { + _dispatcher.updateWbCounter(kernId, +1); + cuList[i_cu]->doFlush(gpuDynInst); + } } bool -Shader::dispatch_workgroups(NDRange *ndr) +Shader::dispatchWorkgroups(HSAQueueEntry *task) { bool scheduledSomething = false; int cuCount = 0; @@ -182,32 +256,24 @@ Shader::dispatch_workgroups(NDRange *ndr) // dispatch workgroup iff the following two conditions are met: // (a) wg_rem is true - there are unassigned workgroups in the grid // (b) there are enough free slots in cu cuList[i] for this wg - if (ndr->wg_disp_rem && cuList[curCu]->ReadyWorkgroup(ndr)) { + if (!task->dispComplete() && cuList[curCu]->hasDispResources(task)) { scheduledSomething = true; - DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d\n", curCu); - - // ticks() member function translates cycles to simulation ticks. 
- if (!tickEvent.scheduled()) { - schedule(tickEvent, curTick() + this->ticks(1)); + DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d: WG %d\n", + curCu, task->globalWgId()); + DPRINTF(GPUWgLatency, "WG Begin cycle:%d wg:%d cu:%d\n", + curTick(), task->globalWgId(), curCu); + + if (!cuList[curCu]->tickEvent.scheduled()) { + if (!_activeCus) + _lastInactiveTick = curTick(); + _activeCus++; } - cuList[curCu]->StartWorkgroup(ndr); - ndr->wgId[0]++; - ndr->globalWgId++; - if (ndr->wgId[0] * ndr->q.wgSize[0] >= ndr->q.gdSize[0]) { - ndr->wgId[0] = 0; - ndr->wgId[1]++; - - if (ndr->wgId[1] * ndr->q.wgSize[1] >= ndr->q.gdSize[1]) { - ndr->wgId[1] = 0; - ndr->wgId[2]++; - - if (ndr->wgId[2] * ndr->q.wgSize[2] >= ndr->q.gdSize[2]) { - ndr->wg_disp_rem = false; - break; - } - } - } + panic_if(_activeCus <= 0 || _activeCus > cuList.size(), + "Invalid activeCu size\n"); + cuList[curCu]->dispWorkgroup(task); + + task->markWgDispatch(); } ++cuCount; @@ -218,9 +284,83 @@ Shader::dispatch_workgroups(NDRange *ndr) } void -Shader::handshake(GpuDispatcher *_dispatcher) +Shader::regStats() { - dispatcher = _dispatcher; + ClockedObject::regStats(); + + shaderActiveTicks + .name(name() + ".shader_active_ticks") + .desc("Total ticks that any CU attached to this shader is active") + ; + allLatencyDist + .init(0, 1600000, 10000) + .name(name() + ".allLatencyDist") + .desc("delay distribution for all") + .flags(Stats::pdf | Stats::oneline); + + loadLatencyDist + .init(0, 1600000, 10000) + .name(name() + ".loadLatencyDist") + .desc("delay distribution for loads") + .flags(Stats::pdf | Stats::oneline); + + storeLatencyDist + .init(0, 1600000, 10000) + .name(name() + ".storeLatencyDist") + .desc("delay distribution for stores") + .flags(Stats::pdf | Stats::oneline); + + vectorInstSrcOperand + .init(4) + .name(name() + ".vec_inst_src_operand") + .desc("vector instruction source operand distribution"); + + vectorInstDstOperand + .init(4) + .name(name() + ".vec_inst_dst_operand") + .desc("vector instruction destination operand distribution"); + + initToCoalesceLatency + .init(0, 1600000, 10000) + .name(name() + ".initToCoalesceLatency") + .desc("Ticks from vmem inst initiateAcc to coalescer issue") + .flags(Stats::pdf | Stats::oneline); + + rubyNetworkLatency + .init(0, 1600000, 10000) + .name(name() + ".rubyNetworkLatency") + .desc("Ticks from coalescer issue to coalescer hit callback") + .flags(Stats::pdf | Stats::oneline); + + gmEnqueueLatency + .init(0, 1600000, 10000) + .name(name() + ".gmEnqueueLatency") + .desc("Ticks from coalescer hit callback to GM pipe enqueue") + .flags(Stats::pdf | Stats::oneline); + + gmToCompleteLatency + .init(0, 1600000, 10000) + .name(name() + ".gmToCompleteLatency") + .desc("Ticks queued in GM pipes ordered response buffer") + .flags(Stats::pdf | Stats::oneline); + + coalsrLineAddresses + .init(0, 20, 1) + .name(name() + ".coalsrLineAddresses") + .desc("Number of cache lines for coalesced request") + .flags(Stats::pdf | Stats::oneline); + + int wfSize = cuList[0]->wfSize(); + cacheBlockRoundTrip = new Stats::Distribution[wfSize]; + for (int idx = 0; idx < wfSize; ++idx) { + std::stringstream namestr; + ccprintf(namestr, "%s.cacheBlockRoundTrip%d", name(), idx); + cacheBlockRoundTrip[idx] + .init(0, 1600000, 10000) + .name(namestr.str()) + .desc("Coalsr-to-coalsr time for the Nth cache block in an inst") + .flags(Stats::pdf | Stats::oneline); + } } void @@ -251,7 +391,6 @@ Shader::doFunctionalAccess(const RequestPtr &req, MemCmd cmd, void *data, RequestPtr req1, req2; 
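// A standalone sketch of the round-robin policy dispatchWorkgroups() uses
// above: start at nextSchedCu, visit each CU at most once, and give a
// workgroup to any CU that still has dispatch resources. CuStub/TaskStub and
// the way the starting point advances are simplified assumptions, not the
// gem5 ComputeUnit or HSAQueueEntry interfaces.
#include <vector>

struct TaskStub { int wgRemaining; };
struct CuStub { int freeWgSlots; };

inline bool
dispatchRoundRobin(std::vector<CuStub> &cus, TaskStub &task, int &nextSchedCu)
{
    const int nCu = static_cast<int>(cus.size());
    if (nCu == 0) {
        return false;
    }
    bool scheduled = false;
    for (int visited = 0; visited < nCu && task.wgRemaining > 0; ++visited) {
        int cu = (nextSchedCu + visited) % nCu;
        if (cus[cu].freeWgSlots > 0) {
            --cus[cu].freeWgSlots;
            --task.wgRemaining;
            scheduled = true;
        }
    }
    nextSchedCu = (nextSchedCu + 1) % nCu;  // rotate the starting CU
    return scheduled;
}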
req->splitOnVaddr(split_addr, req1, req2); - PacketPtr pkt1 = new Packet(req2, cmd); PacketPtr pkt2 = new Packet(req1, cmd); @@ -297,34 +436,22 @@ Shader::doFunctionalAccess(const RequestPtr &req, MemCmd cmd, void *data, } } -bool -Shader::busy() -{ - for (int i_cu = 0; i_cu < n_cu; ++i_cu) { - if (!cuList[i_cu]->isDone()) { - return true; - } - } - - return false; -} - void -Shader::ScheduleAdd(uint32_t *val,Tick when,int x) +Shader::ScheduleAdd(int *val,Tick when,int x) { sa_val.push_back(val); - sa_when.push_back(tick_cnt + when); + when += curTick(); + sa_when.push_back(when); sa_x.push_back(x); ++sa_n; -} - - -void -Shader::processTick() -{ - if (busy()) { - exec(); - schedule(tickEvent, curTick() + ticks(1)); + if (!tickEvent.scheduled() || (when < tickEvent.when())) { + DPRINTF(GPUDisp, "New scheduled add; scheduling shader wakeup at " + "%lu\n", when); + reschedule(tickEvent, when, true); + } else { + assert(tickEvent.scheduled()); + DPRINTF(GPUDisp, "New scheduled add; wakeup already scheduled at " + "%lu\n", when); } } @@ -356,7 +483,8 @@ void Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id, bool suppress_func_errors) { - AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, suppress_func_errors); + AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, + suppress_func_errors); } void @@ -385,15 +513,11 @@ Shader::functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode) pkt->senderState = new TheISA::GpuTLB::TranslationState(mode, gpuTc, false); - if (cu_id == n_cu) { - dispatcher->tlbPort->sendFunctional(pkt); - } else { - // even when the perLaneTLB flag is turned on - // it's ok tp send all accesses through lane 0 - // since the lane # is not known here, - // This isn't important since these are functional accesses. - cuList[cu_id]->tlbPort[0]->sendFunctional(pkt); - } + // even when the perLaneTLB flag is turned on + // it's ok tp send all accesses through lane 0 + // since the lane # is not known here, + // This isn't important since these are functional accesses. + cuList[cu_id]->tlbPort[0]->sendFunctional(pkt); /* safe_cast the senderState */ TheISA::GpuTLB::TranslationState *sender_state = @@ -402,3 +526,82 @@ Shader::functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode) delete sender_state->tlbEntry; delete pkt->senderState; } + +/* + * allow the shader to sample stats from constituent devices + */ +void +Shader::sampleStore(const Tick accessTime) +{ + storeLatencyDist.sample(accessTime); + allLatencyDist.sample(accessTime); +} + +/* + * allow the shader to sample stats from constituent devices + */ +void +Shader::sampleLoad(const Tick accessTime) +{ + loadLatencyDist.sample(accessTime); + allLatencyDist.sample(accessTime); +} + +void +Shader::sampleInstRoundTrip(std::vector roundTripTime) +{ + // Only sample instructions that go all the way to main memory + if (roundTripTime.size() != InstMemoryHop::InstMemoryHopMax) { + return; + } + + Tick t1 = roundTripTime[0]; + Tick t2 = roundTripTime[1]; + Tick t3 = roundTripTime[2]; + Tick t4 = roundTripTime[3]; + Tick t5 = roundTripTime[4]; + + initToCoalesceLatency.sample(t2-t1); + rubyNetworkLatency.sample(t3-t2); + gmEnqueueLatency.sample(t4-t3); + gmToCompleteLatency.sample(t5-t4); +} + +void +Shader::sampleLineRoundTrip(const std::map>& lineMap) +{ + coalsrLineAddresses.sample(lineMap.size()); + std::vector netTimes; + + // For each cache block address generated by a vmem inst, calculate + // the round-trip time for that cache block. 
+ for (auto& it : lineMap) { + const std::vector& timeVec = it.second; + if (timeVec.size() == 2) { + netTimes.push_back(timeVec[1] - timeVec[0]); + } + } + + // Sort the cache block round trip times so that the first + // distribution is always measuring the fastest and the last + // distribution is always measuring the slowest cache block. + std::sort(netTimes.begin(), netTimes.end()); + + // Sample the round trip time for each N cache blocks into the + // Nth distribution. + int idx = 0; + for (auto& time : netTimes) { + cacheBlockRoundTrip[idx].sample(time); + ++idx; + } +} + +void +Shader::notifyCuSleep() { + // If all CUs attached to this shader are asleep, update shaderActiveTicks + panic_if(_activeCus <= 0 || _activeCus > cuList.size(), + "Invalid activeCu size\n"); + _activeCus--; + if (!_activeCus) + shaderActiveTicks += curTick() - _lastInactiveTick; +} diff --git a/src/gpu-compute/shader.hh b/src/gpu-compute/shader.hh index 5c14d9898..72063a4a5 100644 --- a/src/gpu-compute/shader.hh +++ b/src/gpu-compute/shader.hh @@ -14,9 +14,9 @@ * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -30,7 +30,7 @@ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * - * Author: Steve Reinhardt + * Authors: Steve Reinhardt */ #ifndef __SHADER_HH__ @@ -47,11 +47,11 @@ #include "cpu/simple_thread.hh" #include "cpu/thread_context.hh" #include "cpu/thread_state.hh" -#include "enums/MemType.hh" #include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_dyn_inst.hh" #include "gpu-compute/gpu_tlb.hh" +#include "gpu-compute/hsa_queue_entry.hh" #include "gpu-compute/lds_state.hh" -#include "gpu-compute/qstruct.hh" #include "mem/page_table.hh" #include "mem/port.hh" #include "mem/request.hh" @@ -61,7 +61,8 @@ #include "sim/sim_object.hh" class BaseTLB; -class GpuDispatcher; +class GPUCommandProcessor; +class GPUDispatcher; namespace TheISA { @@ -70,36 +71,144 @@ namespace TheISA static const int LDS_SIZE = 65536; +// aperture (APE) registers define the base/limit +// pair for the ATC mapped memory space. currently +// the only APEs we consider are for GPUVM/LDS/scratch. +// the APEs are registered with unique values based +// on a per-device basis +struct ApertureRegister +{ + Addr base; + Addr limit; +}; + // Class Shader: This describes a single shader instance. Most // configurations will only have a single shader.
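// Illustrative sketch (not the gem5 classes; names invented): the
// shaderActiveTicks accounting used by dispatchWorkgroups() and
// notifyCuSleep() above only touches the accumulator on the 0 -> 1 and
// 1 -> 0 transitions of the active-CU count, so overlapping CU activity is
// never double counted.

#include <cassert>
#include <cstdint>

using Tick = uint64_t;

class ActiveTickCounter
{
    int activeCus = 0;      // CUs currently executing work
    Tick lastInactive = 0;  // tick at which the count last left zero
    Tick activeTicks = 0;   // accumulated time with at least one busy CU

  public:
    void
    cuWake(Tick now, int numCus)
    {
        if (!activeCus)
            lastInactive = now;                 // a busy interval starts here
        ++activeCus;
        assert(activeCus > 0 && activeCus <= numCus);
    }

    void
    cuSleep(Tick now)
    {
        assert(activeCus > 0);
        if (!--activeCus)
            activeTicks += now - lastInactive;  // the busy interval ends
    }

    Tick total() const { return activeTicks; }
};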
class Shader : public ClockedObject { - protected: - // Shader's clock period in terms of number of ticks of curTime, - // aka global simulation clock - Tick clock; + private: + ApertureRegister _gpuVmApe; + ApertureRegister _ldsApe; + ApertureRegister _scratchApe; + Addr shHiddenPrivateBaseVmid; + + // Number of active Cus attached to this shader + int _activeCus; + + // Last tick that all CUs attached to this shader were inactive + Tick _lastInactiveTick; + + // some stats for measuring latency + Stats::Distribution allLatencyDist; + Stats::Distribution loadLatencyDist; + Stats::Distribution storeLatencyDist; + + // average ticks from vmem inst initiateAcc to coalescer issue, + // average ticks from coalescer issue to coalescer hit callback, + // average ticks from coalescer hit callback to GM pipe enqueue, + // and average ticks spent in GM pipe's ordered resp buffer. + Stats::Distribution initToCoalesceLatency; + Stats::Distribution rubyNetworkLatency; + Stats::Distribution gmEnqueueLatency; + Stats::Distribution gmToCompleteLatency; + + // average number of cache blocks requested by vmem inst, and + // average ticks for cache blocks to main memory for the Nth + // cache block generated by a vmem inst. + Stats::Distribution coalsrLineAddresses; + Stats::Distribution *cacheBlockRoundTrip; public: typedef ShaderParams Params; enum hsail_mode_e {SIMT,VECTOR_SCALAR}; - // clock related functions ; maps to-and-from - // Simulation ticks and shader clocks. - Tick frequency() const { return SimClock::Frequency / clock; } - - Tick ticks(int numCycles) const { return (Tick)clock * numCycles; } - - Tick getClock() const { return clock; } - Tick curCycle() const { return curTick() / clock; } - Tick tickToCycles(Tick val) const { return val / clock;} - + GPUDispatcher &dispatcher(); + void sampleLoad(const Tick accessTime); + void sampleStore(const Tick accessTime); + void sampleInstRoundTrip(std::vector roundTripTime); + void sampleLineRoundTrip(const std::map> &roundTripTime); SimpleThread *cpuThread; ThreadContext *gpuTc; BaseCPU *cpuPointer; - void processTick(); + const ApertureRegister& + gpuVmApe() const + { + return _gpuVmApe; + } + + const ApertureRegister& + ldsApe() const + { + return _ldsApe; + } + + const ApertureRegister& + scratchApe() const + { + return _scratchApe; + } + + bool + isGpuVmApe(Addr addr) const + { + bool is_gpu_vm = addr >= _gpuVmApe.base && addr <= _gpuVmApe.limit; + + return is_gpu_vm; + } + + bool + isLdsApe(Addr addr) const + { + bool is_lds = addr >= _ldsApe.base && addr <= _ldsApe.limit; + + return is_lds; + } + + bool + isScratchApe(Addr addr) const + { + bool is_scratch + = addr >= _scratchApe.base && addr <= _scratchApe.limit; + + return is_scratch; + } + + Addr + getScratchBase() + { + return _scratchApe.base; + } + + Addr + getHiddenPrivateBase() + { + return shHiddenPrivateBaseVmid; + } + + void + initShHiddenPrivateBase(Addr queueBase, uint32_t offset) + { + Addr sh_hidden_base_new = queueBase - offset; + + // We are initializing sh_hidden_private_base_vmid from the + // amd queue descriptor from the first queue. + // The sh_hidden_private_base_vmid is supposed to be same for + // all the queues from the same process + if (shHiddenPrivateBaseVmid != sh_hidden_base_new) { + // Do not panic if shHiddenPrivateBaseVmid == 0, + // that is if it is uninitialized. Panic only + // if the value is initilized and we get + // a differnt base later. 
+ panic_if(shHiddenPrivateBaseVmid != 0, + "Currently we support only single process\n"); + } + shHiddenPrivateBaseVmid = sh_hidden_base_new; + } + EventFunctionWrapper tickEvent; // is this simulation going to be timing mode in the memory? @@ -108,30 +217,18 @@ class Shader : public ClockedObject // If set, issue acq packet @ kernel launch int impl_kern_boundary_sync; - // If set, generate a separate packet for acquire/release on - // ld_acquire/st_release/atomic operations - int separate_acquire_release; // If set, fetch returns may be coissued with instructions int coissue_return; // If set, always dump all 64 gprs to trace int trace_vgpr_all; // Number of cu units in the shader int n_cu; - // Number of wavefront slots per cu + // Number of wavefront slots per SIMD per CU int n_wf; + // The size of global memory int globalMemSize; - /* - * Bytes/work-item for call instruction - * The number of arguments for an hsail function will - * vary. We simply determine the maximum # of arguments - * required by any hsail function up front before the - * simulation (during parsing of the Brig) and record - * that number here. - */ - int funcargs_size; - // Tracks CU that rr dispatcher should attempt scheduling int nextSchedCu; @@ -139,7 +236,7 @@ class Shader : public ClockedObject uint32_t sa_n; // Pointer to value to be increments - std::vector sa_val; + std::vector sa_val; // When to do the increment std::vector sa_when; // Amount to increment by @@ -148,24 +245,29 @@ class Shader : public ClockedObject // List of Compute Units (CU's) std::vector cuList; - uint64_t tick_cnt; - uint64_t box_tick_cnt; - uint64_t start_tick_cnt; + GPUCommandProcessor &gpuCmdProc; + GPUDispatcher &_dispatcher; + + /** + * Statistics + */ + Stats::Scalar shaderActiveTicks; + Stats::Vector vectorInstSrcOperand; + Stats::Vector vectorInstDstOperand; + void regStats(); - GpuDispatcher *dispatcher; + int max_valu_insts; + int total_valu_insts; Shader(const Params *p); ~Shader(); virtual void init(); - // Run shader - void exec(); - - // Check to see if shader is busy - bool busy(); + // Run shader scheduled adds + void execScheduledAdds(); // Schedule a 32-bit value to be incremented some time in the future - void ScheduleAdd(uint32_t *val, Tick when, int x); + void ScheduleAdd(int *val, Tick when, int x); bool processTimingPacket(PacketPtr pkt); void AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id, @@ -190,12 +292,15 @@ class Shader : public ClockedObject cuList[cu_id] = compute_unit; } - void handshake(GpuDispatcher *dispatcher); - bool dispatch_workgroups(NDRange *ndr); + void prepareInvalidate(HSAQueueEntry *task); + void prepareFlush(GPUDynInstPtr gpuDynInst); + + bool dispatchWorkgroups(HSAQueueEntry *task); Addr mmap(int length); void functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode); void updateContext(int cid); void hostWakeUp(BaseCPU *cpu); + void notifyCuSleep(); }; #endif // __SHADER_HH__ diff --git a/src/gpu-compute/simple_pool_manager.cc b/src/gpu-compute/simple_pool_manager.cc index 1e4f0c6fc..1d0f1b8d7 100644 --- a/src/gpu-compute/simple_pool_manager.cc +++ b/src/gpu-compute/simple_pool_manager.cc @@ -35,6 +35,12 @@ #include "base/logging.hh" +SimplePoolManager * +SimplePoolManagerParams::create() +{ + return new SimplePoolManager(this); +} + // return the min number of elements that the manager can reserve given // a request for "size" elements uint32_t @@ -64,8 +70,6 @@ SimplePoolManager::printRegion() bool SimplePoolManager::canAllocate(uint32_t numRegions, uint32_t 
size) { - assert(numRegions * minAllocatedElements(size) <= poolSize()); - return _reservedGroups == 0; } diff --git a/src/gpu-compute/simple_pool_manager.hh b/src/gpu-compute/simple_pool_manager.hh index 3b7ea9eb3..9fd90a505 100644 --- a/src/gpu-compute/simple_pool_manager.hh +++ b/src/gpu-compute/simple_pool_manager.hh @@ -38,14 +38,15 @@ #include #include "gpu-compute/pool_manager.hh" +#include "params/SimplePoolManager.hh" // Simple Pool Manager: allows one region per pool. No region merging is // supported. class SimplePoolManager : public PoolManager { public: - SimplePoolManager(uint32_t minAlloc, uint32_t poolSize) - : PoolManager(minAlloc, poolSize), _regionSize(0), _nxtFreeIdx(0), + SimplePoolManager(const PoolManagerParams *p) + : PoolManager(p), _regionSize(0), _nxtFreeIdx(0), _reservedGroups(0) { } @@ -62,7 +63,7 @@ class SimplePoolManager : public PoolManager // be reserved) uint32_t _regionSize; // next index to allocate a region - uint8_t _nxtFreeIdx; + int _nxtFreeIdx; // number of groups that reserve a region uint32_t _reservedGroups; }; diff --git a/src/gpu-compute/static_register_manager_policy.cc b/src/gpu-compute/static_register_manager_policy.cc new file mode 100644 index 000000000..85f530bfc --- /dev/null +++ b/src/gpu-compute/static_register_manager_policy.cc @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2016 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Authors: Mark Wyse + */ + +#include "gpu-compute/static_register_manager_policy.hh" + +#include "config/the_gpu_isa.hh" +#include "debug/GPURename.hh" +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/pool_manager.hh" +#include "gpu-compute/scalar_register_file.hh" +#include "gpu-compute/shader.hh" +#include "gpu-compute/vector_register_file.hh" +#include "gpu-compute/wavefront.hh" + +StaticRegisterManagerPolicy::StaticRegisterManagerPolicy() +{ +} + +void +StaticRegisterManagerPolicy::exec() +{ +} + +int +StaticRegisterManagerPolicy::mapVgpr(Wavefront* w, int vgprIndex) +{ + panic_if((vgprIndex >= w->reservedVectorRegs) + || (w->reservedVectorRegs < 0), + "VGPR index %d is out of range: VGPR range=[0,%d]", + vgprIndex, w->reservedVectorRegs); + + // add the offset from where the VGPRs of the wavefront have been assigned + int physicalVgprIndex = w->startVgprIndex + vgprIndex; + + panic_if(!((w->startVgprIndex <= physicalVgprIndex) && + (w->startVgprIndex + w->reservedVectorRegs - 1) + >= physicalVgprIndex), + "Invalid VGPR index %d\n", physicalVgprIndex); + + // calculate physical VGPR index + return physicalVgprIndex % w->computeUnit->vrf[w->simdId]->numRegs(); +} + +int +StaticRegisterManagerPolicy::mapSgpr(Wavefront* w, int sgprIndex) +{ + panic_if(!((sgprIndex < w->reservedScalarRegs) + && (w->reservedScalarRegs > 0)), + "SGPR index %d is out of range: SGPR range=[0,%d]\n", + sgprIndex, w->reservedScalarRegs); + + // add the offset from where the SGPRs of the wavefront have been assigned + int physicalSgprIndex = w->startSgprIndex + sgprIndex; + + panic_if(!((w->startSgprIndex <= physicalSgprIndex) && + (w->startSgprIndex + w->reservedScalarRegs - 1) + >= physicalSgprIndex), + "Invalid SGPR index %d\n", physicalSgprIndex); + + // calculate physical SGPR index + return physicalSgprIndex % w->computeUnit->srf[w->simdId]->numRegs(); +} + +bool +StaticRegisterManagerPolicy::canAllocateVgprs(int simdId, int nWfs, + int demandPerWf) +{ + return cu->registerManager->vrfPoolMgrs[simdId]-> + canAllocate(nWfs, demandPerWf); +} + +bool +StaticRegisterManagerPolicy::canAllocateSgprs(int simdId, int nWfs, + int demandPerWf) +{ + return cu->registerManager->srfPoolMgrs[simdId]-> + canAllocate(nWfs, demandPerWf); +} + +void +StaticRegisterManagerPolicy::allocateRegisters(Wavefront *w, int vectorDemand, + int scalarDemand) +{ + uint32_t allocatedSize = 0; + w->startVgprIndex = cu->registerManager->vrfPoolMgrs[w->simdId]-> + allocateRegion(vectorDemand, &allocatedSize); + w->reservedVectorRegs = allocatedSize; + cu->vectorRegsReserved[w->simdId] += w->reservedVectorRegs; + panic_if(cu->vectorRegsReserved[w->simdId] > cu->numVecRegsPerSimd, + "VRF[%d] has been overallocated %d > %d\n", + w->simdId, cu->vectorRegsReserved[w->simdId], + cu->numVecRegsPerSimd); + + if (scalarDemand) { + w->startSgprIndex = cu->registerManager->srfPoolMgrs[w->simdId]-> + allocateRegion(scalarDemand, &allocatedSize); + w->reservedScalarRegs = allocatedSize; + cu->scalarRegsReserved[w->simdId] += w->reservedScalarRegs; + panic_if(cu->scalarRegsReserved[w->simdId] > cu->numScalarRegsPerSimd, + "SRF[%d] has been overallocated %d > %d\n", + w->simdId, cu->scalarRegsReserved[w->simdId], + cu->numScalarRegsPerSimd); + } +} + +void +StaticRegisterManagerPolicy::freeRegisters(Wavefront *w) +{ + // free the vector registers of the completed wavefront + w->computeUnit->vectorRegsReserved[w->simdId] -= w->reservedVectorRegs; + // free the scalar registers of the completed wavefront + 
w->computeUnit->scalarRegsReserved[w->simdId] -= w->reservedScalarRegs; + + panic_if(w->computeUnit->vectorRegsReserved[w->simdId] < 0, + "Freeing VRF[%d] registers left %d registers reserved\n", + w->simdId, + w->computeUnit->vectorRegsReserved[w->simdId]); + panic_if(w->computeUnit->scalarRegsReserved[w->simdId] < 0, + "Freeing SRF[%d] registers left %d registers reserved\n", + w->simdId, + w->computeUnit->scalarRegsReserved[w->simdId]); + + int endIndex = (w->startVgprIndex + w->reservedVectorRegs - 1) % + w->computeUnit->vrf[w->simdId]->numRegs(); + + w->computeUnit->registerManager->vrfPoolMgrs[w->simdId]-> + freeRegion(w->startVgprIndex, endIndex); + + // mark/pre-mark all registers as not busy + for (int i = 0; i < w->reservedVectorRegs; i++) { + uint32_t physVgprIdx = mapVgpr(w, i); + w->computeUnit->vrf[w->simdId]->markReg(physVgprIdx, false); + } + + w->reservedVectorRegs = 0; + w->startVgprIndex = 0; + + endIndex = (w->startSgprIndex + w->reservedScalarRegs - 1) % + w->computeUnit->srf[w->simdId]->numRegs(); + w->computeUnit->registerManager->srfPoolMgrs[w->simdId]-> + freeRegion(w->startSgprIndex, endIndex); + + // mark/pre-mark all registers as not busy + for (int i = 0; i < w->reservedScalarRegs; i++) { + uint32_t physSgprIdx = mapSgpr(w, i); + w->computeUnit->srf[w->simdId]->markReg(physSgprIdx, false); + } + + w->reservedScalarRegs = 0; + w->startSgprIndex = 0; +} + +void +StaticRegisterManagerPolicy::regStats() +{ +} diff --git a/src/gpu-compute/static_register_manager_policy.hh b/src/gpu-compute/static_register_manager_policy.hh new file mode 100644 index 000000000..6abeb1d1a --- /dev/null +++ b/src/gpu-compute/static_register_manager_policy.hh @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2016 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Authors: Mark Wyse + */ + +#ifndef __STATIC_REGISTER_MANAGER_POLICY_HH__ +#define __STATIC_REGISTER_MANAGER_POLICY_HH__ + +#include "gpu-compute/register_manager_policy.hh" + +class HSAQueueEntry; + +class StaticRegisterManagerPolicy : public RegisterManagerPolicy +{ + public: + + StaticRegisterManagerPolicy(); + + void exec() override; + + int mapVgpr(Wavefront* w, int vgprIndex) override; + int mapSgpr(Wavefront* w, int sgprIndex) override; + + bool canAllocateVgprs(int simdId, int nWfs, int demandPerWf) override; + bool canAllocateSgprs(int simdId, int nWfs, int demandPerWf) override; + + void allocateRegisters(Wavefront *w, int vectorDemand, + int scalarDemand) override; + + void freeRegisters(Wavefront *w) override; + + void regStats() override; +}; + +#endif // __STATIC_REGISTER_MANAGER_POLICY_HH__ diff --git a/src/gpu-compute/tlb_coalescer.cc b/src/gpu-compute/tlb_coalescer.cc index 90eadb026..51d2e761a 100644 --- a/src/gpu-compute/tlb_coalescer.cc +++ b/src/gpu-compute/tlb_coalescer.cc @@ -41,7 +41,6 @@ TLBCoalescer::TLBCoalescer(const Params *p) : ClockedObject(p), - clock(p->clk_domain->clockPeriod()), TLBProbesPerCycle(p->probesPerCycle), coalescingWindow(p->coalescingWindow), disableCoalescing(p->disableCoalescing), @@ -317,7 +316,7 @@ TLBCoalescer::CpuSidePort::recvTimingReq(PacketPtr pkt) //coalesced requests to the TLB if (!coalescer->probeTLBEvent.scheduled()) { coalescer->schedule(coalescer->probeTLBEvent, - curTick() + coalescer->ticks(1)); + curTick() + coalescer->clockPeriod()); } return true; @@ -380,7 +379,7 @@ TLBCoalescer::MemSidePort::recvReqRetry() //we've receeived a retry. Schedule a probeTLBEvent if (!coalescer->probeTLBEvent.scheduled()) coalescer->schedule(coalescer->probeTLBEvent, - curTick() + coalescer->ticks(1)); + curTick() + coalescer->clockPeriod()); } void @@ -448,7 +447,7 @@ TLBCoalescer::processProbeTLBEvent() // send the coalesced request for virt_page_addr if (!memSidePort[0]->sendTimingReq(first_packet)) { - DPRINTF(GPUTLB, "Failed to send TLB request for page %#x", + DPRINTF(GPUTLB, "Failed to send TLB request for page %#x\n", virt_page_addr); // No need for a retries queue since we are already buffering diff --git a/src/gpu-compute/tlb_coalescer.hh b/src/gpu-compute/tlb_coalescer.hh index 72d06deff..842237e5c 100644 --- a/src/gpu-compute/tlb_coalescer.hh +++ b/src/gpu-compute/tlb_coalescer.hh @@ -65,13 +65,6 @@ class ThreadContext; */ class TLBCoalescer : public ClockedObject { - protected: - // TLB clock: will inherit clock from shader's clock period in terms - // of nuber of ticks of curTime (aka global simulation clock) - // The assignment of TLB clock from shader clock is done in the - // python config files. - int clock; - public: typedef TLBCoalescerParams Params; TLBCoalescer(const Params *p); @@ -105,7 +98,8 @@ class TLBCoalescer : public ClockedObject * option is to change it to curTick(), so we coalesce based * on the receive time. */ - typedef std::unordered_map> CoalescingFIFO; + typedef std::unordered_map> + CoalescingFIFO; CoalescingFIFO coalescerFIFO; @@ -143,13 +137,6 @@ class TLBCoalescer : public ClockedObject void updatePhysAddresses(PacketPtr pkt); void regStats() override; - // Clock related functions. Maps to-and-from - // Simulation ticks and object clocks. 
- Tick frequency() const { return SimClock::Frequency / clock; } - Tick ticks(int numCycles) const { return (Tick)clock * numCycles; } - Tick curCycle() const { return curTick() / clock; } - Tick tickToCycles(Tick val) const { return val / clock;} - class CpuSidePort : public SlavePort { public: @@ -171,7 +158,8 @@ class TLBCoalescer : public ClockedObject virtual void recvRespRetry() { - fatal("recvRespRetry() is not implemented in the TLB coalescer.\n"); + fatal("recvRespRetry() is not implemented in the TLB " + "coalescer.\n"); } virtual AddrRangeList getAddrRanges() const; diff --git a/src/gpu-compute/vector_register_file.cc b/src/gpu-compute/vector_register_file.cc index a57a80972..3bddfccc1 100644 --- a/src/gpu-compute/vector_register_file.cc +++ b/src/gpu-compute/vector_register_file.cc @@ -36,81 +36,21 @@ #include #include "base/logging.hh" +#include "base/trace.hh" +#include "debug/GPUVRF.hh" #include "gpu-compute/compute_unit.hh" #include "gpu-compute/gpu_dyn_inst.hh" -#include "gpu-compute/shader.hh" #include "gpu-compute/simple_pool_manager.hh" #include "gpu-compute/wavefront.hh" #include "params/VectorRegisterFile.hh" VectorRegisterFile::VectorRegisterFile(const VectorRegisterFileParams *p) - : SimObject(p), - manager(new SimplePoolManager(p->min_alloc, p->num_regs_per_simd)), - simdId(p->simd_id), numRegsPerSimd(p->num_regs_per_simd), - vgprState(new VecRegisterState()) + : RegisterFile(p) { - fatal_if(numRegsPerSimd % 2, "VRF size is illegal\n"); - fatal_if(simdId < 0, "Illegal SIMD id for VRF"); + regFile.resize(numRegs(), VecRegContainer()); - fatal_if(numRegsPerSimd % p->min_alloc, "Min VGPR region allocation is not " - "multiple of VRF size\n"); - - busy.clear(); - busy.resize(numRegsPerSimd, 0); - nxtBusy.clear(); - nxtBusy.resize(numRegsPerSimd, 0); - - vgprState->init(numRegsPerSimd, p->wfSize); -} - -void -VectorRegisterFile::setParent(ComputeUnit *_computeUnit) -{ - computeUnit = _computeUnit; - vgprState->setParent(computeUnit); -} - -uint8_t -VectorRegisterFile::regNxtBusy(int idx, uint32_t operandSize) const -{ - uint8_t status = nxtBusy.at(idx); - - if (operandSize > 4) { - status = status | (nxtBusy.at((idx + 1) % numRegs())); - } - - return status; -} - -uint8_t -VectorRegisterFile::regBusy(int idx, uint32_t operandSize) const -{ - uint8_t status = busy.at(idx); - - if (operandSize > 4) { - status = status | (busy.at((idx + 1) % numRegs())); - } - - return status; -} - -void -VectorRegisterFile::preMarkReg(int regIdx, uint32_t operandSize, uint8_t value) -{ - nxtBusy.at(regIdx) = value; - - if (operandSize > 4) { - nxtBusy.at((regIdx + 1) % numRegs()) = value; - } -} - -void -VectorRegisterFile::markReg(int regIdx, uint32_t operandSize, uint8_t value) -{ - busy.at(regIdx) = value; - - if (operandSize > 4) { - busy.at((regIdx + 1) % numRegs()) = value; + for (auto ® : regFile) { + reg.zero(); } } @@ -118,127 +58,154 @@ bool VectorRegisterFile::operandsReady(Wavefront *w, GPUDynInstPtr ii) const { for (int i = 0; i < ii->getNumOperands(); ++i) { - if (ii->isVectorRegister(i)) { - uint32_t vgprIdx = ii->getRegisterIndex(i, ii); - uint32_t pVgpr = w->remap(vgprIdx, ii->getOperandSize(i), 1); - - if (regBusy(pVgpr, ii->getOperandSize(i)) == 1) { - if (ii->isDstOperand(i)) { - w->numTimesBlockedDueWAXDependencies++; - } else if (ii->isSrcOperand(i)) { - w->numTimesBlockedDueRAWDependencies++; - } - - return false; - } - - if (regNxtBusy(pVgpr, ii->getOperandSize(i)) == 1) { - if (ii->isDstOperand(i)) { - w->numTimesBlockedDueWAXDependencies++; - } else if 
(ii->isSrcOperand(i)) { - w->numTimesBlockedDueRAWDependencies++; + if (ii->isVectorRegister(i) && ii->isSrcOperand(i)) { + int vgprIdx = ii->getRegisterIndex(i, ii); + + // determine number of registers + int nRegs = + ii->getOperandSize(i) <= 4 ? 1 : ii->getOperandSize(i) / 4; + for (int j = 0; j < nRegs; j++) { + int pVgpr = computeUnit->registerManager + ->mapVgpr(w, vgprIdx + j); + if (regBusy(pVgpr)) { + if (ii->isDstOperand(i)) { + w->numTimesBlockedDueWAXDependencies++; + } else if (ii->isSrcOperand(i)) { + DPRINTF(GPUVRF, "RAW stall: WV[%d]: %s: physReg[%d]\n", + w->wfDynId, ii->disassemble(), pVgpr); + w->numTimesBlockedDueRAWDependencies++; + } + return false; } - - return false; } } } - return true; } void -VectorRegisterFile::exec(GPUDynInstPtr ii, Wavefront *w) +VectorRegisterFile::scheduleWriteOperands(Wavefront *w, GPUDynInstPtr ii) { - bool loadInstr = ii->isLoad(); - bool atomicInstr = ii->isAtomic() || ii->isMemFence(); - - bool loadNoArgInstr = loadInstr && !ii->isArgLoad(); - // iterate over all register destination operands for (int i = 0; i < ii->getNumOperands(); ++i) { if (ii->isVectorRegister(i) && ii->isDstOperand(i)) { - uint32_t physReg = w->remap(ii->getRegisterIndex(i, ii), - ii->getOperandSize(i), 1); - - // mark the destination vector register as busy - markReg(physReg, ii->getOperandSize(i), 1); - // clear the in-flight status of the destination vector register - preMarkReg(physReg, ii->getOperandSize(i), 0); - - // FIXME: if we ever model correct timing behavior - // for load argument instructions then we should not - // set the destination register as busy now but when - // the data returns. Loads and Atomics should free - // their destination registers when the data returns, - // not now - if (!atomicInstr && !loadNoArgInstr) { - uint32_t pipeLen = ii->getOperandSize(i) <= 4 ? - computeUnit->spBypassLength() : - computeUnit->dpBypassLength(); - - // schedule an event for marking the register as ready - computeUnit->registerEvent(w->simdId, physReg, - ii->getOperandSize(i), - computeUnit->shader->tick_cnt + - computeUnit->shader->ticks(pipeLen), - 0); + int vgprIdx = ii->getRegisterIndex(i, ii); + int nRegs = ii->getOperandSize(i) <= 4 ? 1 : + ii->getOperandSize(i) / 4; + + for (int j = 0; j < nRegs; ++j) { + int physReg = computeUnit->registerManager + ->mapVgpr(w, vgprIdx + j); + + // If instruction is atomic instruction and + // the atomics do not return value, then + // do not mark this reg as busy. + if (!(ii->isAtomic() && !ii->isAtomicRet())) { + /** + * if the instruction is a load with EXEC = 0, then + * we do not mark the reg. we do this to avoid a + * deadlock that can occur because a load reserves + * its destination regs before checking its exec mask, + * and in the case it is 0, it will not send/recv any + * packets, and therefore it will never free its dest + * reg(s). 
+ */ + if (!ii->isLoad() || (ii->isLoad() + && ii->exec_mask.any())) { + markReg(physReg, true); + } + } } } } } -int -VectorRegisterFile::exec(uint64_t dynamic_id, Wavefront *w, - std::vector ®Vec, uint32_t operandSize, - uint64_t timestamp) +void +VectorRegisterFile::waveExecuteInst(Wavefront *w, GPUDynInstPtr ii) { - int delay = 0; + // increment count of number of DWORDs read from VRF + int DWORDs = ii->numSrcVecDWORDs(); + registerReads += (DWORDs * w->execMask().count()); + + uint64_t mask = w->execMask().to_ullong(); + int srams = w->execMask().size() / 4; + for (int i = 0; i < srams; i++) { + if (mask & 0xF) { + sramReads += DWORDs; + } + mask = mask >> 4; + } - panic_if(regVec.size() <= 0, "Illegal VGPR vector size=%d\n", - regVec.size()); + if (!ii->isLoad() + && !(ii->isAtomic() || ii->isMemSync())) { + int opSize = 4; + for (int i = 0; i < ii->getNumOperands(); i++) { + if (ii->getOperandSize(i) > opSize) { + opSize = ii->getOperandSize(i); + } + } + Cycles delay(opSize <= 4 ? computeUnit->spBypassLength() + : computeUnit->dpBypassLength()); + Tick tickDelay = computeUnit->cyclesToTicks(delay); + + for (int i = 0; i < ii->getNumOperands(); i++) { + if (ii->isVectorRegister(i) && ii->isDstOperand(i)) { + int vgprIdx = ii->getRegisterIndex(i, ii); + int nRegs = ii->getOperandSize(i) <= 4 ? 1 + : ii->getOperandSize(i) / 4; + for (int j = 0; j < nRegs; j++) { + int physReg = computeUnit->registerManager + ->mapVgpr(w, vgprIdx + j); + enqRegFreeEvent(physReg, tickDelay); + } + } + } - for (int i = 0; i < regVec.size(); ++i) { - // mark the destination VGPR as free when the timestamp expires - computeUnit->registerEvent(w->simdId, regVec[i], operandSize, - computeUnit->shader->tick_cnt + timestamp + - computeUnit->shader->ticks(delay), 0); - } + // increment count of number of DWORDs written to VRF + DWORDs = ii->numDstVecDWORDs(); + registerWrites += (DWORDs * w->execMask().count()); - return delay; + mask = w->execMask().to_ullong(); + srams = w->execMask().size() / 4; + for (int i = 0; i < srams; i++) { + if (mask & 0xF) { + sramWrites += DWORDs; + } + mask = mask >> 4; + } + } } void -VectorRegisterFile::updateResources(Wavefront *w, GPUDynInstPtr ii) +VectorRegisterFile::scheduleWriteOperandsFromLoad( + Wavefront *w, GPUDynInstPtr ii) { - // iterate over all register destination operands + assert(ii->isLoad() || ii->isAtomicRet()); for (int i = 0; i < ii->getNumOperands(); ++i) { if (ii->isVectorRegister(i) && ii->isDstOperand(i)) { - uint32_t physReg = w->remap(ii->getRegisterIndex(i, ii), - ii->getOperandSize(i), 1); - // set the in-flight status of the destination vector register - preMarkReg(physReg, ii->getOperandSize(i), 1); + int vgprIdx = ii->getRegisterIndex(i, ii); + int nRegs = ii->getOperandSize(i) <= 4 ? 
1 : + ii->getOperandSize(i) / 4; + + for (int j = 0; j < nRegs; ++j) { + int physReg = computeUnit->registerManager + ->mapVgpr(w, vgprIdx + j); + enqRegFreeEvent(physReg, computeUnit->clockPeriod()); + } } } -} - -bool -VectorRegisterFile::vrfOperandAccessReady(uint64_t dynamic_id, Wavefront *w, - GPUDynInstPtr ii, - VrfAccessType accessType) -{ - bool ready = true; - - return ready; -} - -bool -VectorRegisterFile::vrfOperandAccessReady(Wavefront *w, GPUDynInstPtr ii, - VrfAccessType accessType) -{ - bool ready = true; - - return ready; + // increment count of number of DWORDs written to VRF + int DWORDs = ii->numDstVecDWORDs(); + registerWrites += (DWORDs * ii->exec_mask.count()); + + uint64_t mask = ii->exec_mask.to_ullong(); + int srams = ii->exec_mask.size() / 4; + for (int i = 0; i < srams; i++) { + if (mask & 0xF) { + sramWrites += DWORDs; + } + mask = mask >> 4; + } } VectorRegisterFile* diff --git a/src/gpu-compute/vector_register_file.hh b/src/gpu-compute/vector_register_file.hh index 254197540..0ad086d68 100644 --- a/src/gpu-compute/vector_register_file.hh +++ b/src/gpu-compute/vector_register_file.hh @@ -34,111 +34,76 @@ #ifndef __VECTOR_REGISTER_FILE_HH__ #define __VECTOR_REGISTER_FILE_HH__ -#include - -#include "base/statistics.hh" -#include "base/trace.hh" -#include "base/types.hh" +#include "arch/gpu_isa.hh" +#include "config/the_gpu_isa.hh" #include "debug/GPUVRF.hh" -#include "gpu-compute/vector_register_state.hh" -#include "sim/sim_object.hh" - -class ComputeUnit; -class Shader; -class SimplePoolManager; -class Wavefront; +#include "gpu-compute/register_file.hh" +#include "gpu-compute/wavefront.hh" struct VectorRegisterFileParams; -enum class VrfAccessType : uint8_t -{ - READ = 0x01, - WRITE = 0x02, - RD_WR = READ | WRITE -}; - // Vector Register File -class VectorRegisterFile : public SimObject +class VectorRegisterFile : public RegisterFile { public: + using VecRegContainer = TheGpuISA::VecRegContainerU32; + VectorRegisterFile(const VectorRegisterFileParams *p); + ~VectorRegisterFile() { } - void setParent(ComputeUnit *_computeUnit); + virtual bool operandsReady(Wavefront *w, GPUDynInstPtr ii) const override; + virtual void scheduleWriteOperands(Wavefront *w, + GPUDynInstPtr ii) override; + virtual void scheduleWriteOperandsFromLoad(Wavefront *w, + GPUDynInstPtr ii) override; + virtual void waveExecuteInst(Wavefront *w, GPUDynInstPtr ii) override; - // Read a register - template - T - read(int regIdx, int threadId=0) + void + setParent(ComputeUnit *_computeUnit) override { - T p0 = vgprState->read(regIdx, threadId); - DPRINTF(GPUVRF, "reading vreg[%d][%d] = %u\n", regIdx, threadId, (uint64_t)p0); - - return p0; + RegisterFile::setParent(_computeUnit); } - // Write a register - template - void - write(int regIdx, T value, int threadId=0) + // Read a register that is writeable (e.g., a DST operand) + VecRegContainer& + readWriteable(int regIdx) { - DPRINTF(GPUVRF, "writing vreg[%d][%d] = %u\n", regIdx, threadId, (uint64_t)value); - vgprState->write(regIdx, value, threadId); + return regFile[regIdx]; } - uint8_t regBusy(int idx, uint32_t operandSize) const; - uint8_t regNxtBusy(int idx, uint32_t operandSize) const; - - int numRegs() const { return numRegsPerSimd; } - - void markReg(int regIdx, uint32_t operandSize, uint8_t value); - void preMarkReg(int regIdx, uint32_t operandSize, uint8_t value); - - virtual void exec(GPUDynInstPtr ii, Wavefront *w); - - virtual int exec(uint64_t dynamic_id, Wavefront *w, - std::vector ®Vec, uint32_t operandSize, - uint64_t timestamp); - 
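// Illustrative sketch (standalone, not the gem5 RegisterFile API): the
// sramReads/sramWrites accounting in waveExecuteInst() and
// scheduleWriteOperandsFromLoad() above walks the 64-lane execution mask
// four lanes at a time; any group of four lanes with at least one active
// lane adds the instruction's DWORD count to the SRAM access total.

#include <bitset>
#include <cstdint>

int
sramAccesses(const std::bitset<64> &execMask, int dwords)
{
    uint64_t mask = execMask.to_ullong();
    const int laneGroups = execMask.size() / 4;  // 16 groups of 4 lanes
    int accesses = 0;

    for (int i = 0; i < laneGroups; ++i) {
        if (mask & 0xF)
            accesses += dwords;
        mask >>= 4;
    }
    return accesses;
}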
- bool operandsReady(Wavefront *w, GPUDynInstPtr ii) const; - virtual void updateEvents() { } - virtual void updateResources(Wavefront *w, GPUDynInstPtr ii); - - virtual bool - isReadConflict(int memWfId, int exeWfId) const + // Read a register that is not writeable (e.g., src operand) + const VecRegContainer& + read(int regIdx) const { - return false; + return regFile[regIdx]; } - virtual bool - isWriteConflict(int memWfId, int exeWfId) const + // Write a register + void + write(int regIdx, const VecRegContainer &value) { - return false; + regFile[regIdx] = value; } - virtual bool vrfOperandAccessReady(uint64_t dynamic_id, Wavefront *w, - GPUDynInstPtr ii, - VrfAccessType accessType); - - virtual bool vrfOperandAccessReady(Wavefront *w, GPUDynInstPtr ii, - VrfAccessType accessType); - - SimplePoolManager *manager; - - protected: - ComputeUnit* computeUnit; - int simdId; - - // flag indicating if a register is busy - std::vector busy; - // flag indicating if a register will be busy (by instructions - // in the SIMD pipeline) - std::vector nxtBusy; - - // numer of registers (bank size) per simd unit (bank) - int numRegsPerSimd; + void + printReg(Wavefront *wf, int regIdx) const + { +#ifndef NDEBUG + const auto &vec_reg_cont = regFile[regIdx]; + auto vgpr = vec_reg_cont.as(); + + for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + DPRINTF(GPUVRF, "WF[%d][%d]: WV[%d] v[%d][%d] = %#x\n", + wf->simdId, wf->wfSlotId, wf->wfDynId, regIdx, lane, + vgpr[lane]); + } + } +#endif + } - // vector register state - VecRegisterState *vgprState; + private: + std::vector regFile; }; #endif // __VECTOR_REGISTER_FILE_HH__ diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc index 46cce9ce8..c2c98ba0c 100644 --- a/src/gpu-compute/wavefront.cc +++ b/src/gpu-compute/wavefront.cc @@ -34,10 +34,13 @@ #include "gpu-compute/wavefront.hh" #include "debug/GPUExec.hh" +#include "debug/GPUInitAbi.hh" #include "debug/WavefrontStack.hh" #include "gpu-compute/compute_unit.hh" #include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/scalar_register_file.hh" #include "gpu-compute/shader.hh" +#include "gpu-compute/simple_pool_manager.hh" #include "gpu-compute/vector_register_file.hh" Wavefront* @@ -47,16 +50,18 @@ WavefrontParams::create() } Wavefront::Wavefront(const Params *p) - : SimObject(p), callArgMem(nullptr), _gpuISA() + : SimObject(p), wfSlotId(p->wf_slot_id), simdId(p->simdId), + maxIbSize(p->max_ib_size), _gpuISA(*this), + vmWaitCnt(-1), expWaitCnt(-1), lgkmWaitCnt(-1) { lastTrace = 0; - simdId = p->simdId; - wfSlotId = p->wf_slot_id; + execUnitId = -1; status = S_STOPPED; reservedVectorRegs = 0; + reservedScalarRegs = 0; startVgprIndex = 0; + startSgprIndex = 0; outstandingReqs = 0; - memReqsInPipe = 0; outstandingReqsWrGm = 0; outstandingReqsWrLm = 0; outstandingReqsRdGm = 0; @@ -65,28 +70,38 @@ Wavefront::Wavefront(const Params *p) rdGmReqsInPipe = 0; wrLmReqsInPipe = 0; wrGmReqsInPipe = 0; - + scalarRdGmReqsInPipe = 0; + scalarWrGmReqsInPipe = 0; + scalarOutstandingReqsRdGm = 0; + scalarOutstandingReqsWrGm = 0; + lastNonIdleTick = 0; barrierCnt = 0; oldBarrierCnt = 0; stalledAtBarrier = false; + ldsChunk = nullptr; memTraceBusy = 0; oldVgprTcnt = 0xffffffffffffffffll; oldDgprTcnt = 0xffffffffffffffffll; - oldVgpr.resize(p->wfSize); + oldVgpr.resize(p->wf_size); pendingFetch = false; dropFetch = false; - condRegState = new ConditionRegisterState(); - maxSpVgprs = 0; - maxDpVgprs = 0; - lastAddr.resize(p->wfSize); - 
workItemFlatId.resize(p->wfSize); - oldDgpr.resize(p->wfSize); - barCnt.resize(p->wfSize); + maxVgprs = 0; + maxSgprs = 0; + + lastAddr.resize(p->wf_size); + workItemFlatId.resize(p->wf_size); + oldDgpr.resize(p->wf_size); + barCnt.resize(p->wf_size); for (int i = 0; i < 3; ++i) { - workItemId[i].resize(p->wfSize); + workItemId[i].resize(p->wf_size); } + + _execMask.set(); + rawDist.clear(); + lastInstExec = 0; + vecReads.clear(); } void @@ -94,19 +109,6 @@ Wavefront::regStats() { SimObject::regStats(); - srcRegOpDist - .init(0, 4, 2) - .name(name() + ".src_reg_operand_dist") - .desc("number of executed instructions with N source register operands") - ; - - dstRegOpDist - .init(0, 3, 2) - .name(name() + ".dst_reg_operand_dist") - .desc("number of executed instructions with N destination register " - "operands") - ; - // FIXME: the name of the WF needs to be unique numTimesBlockedDueWAXDependencies .name(name() + ".timesBlockedDueWAXDependencies") @@ -121,11 +123,53 @@ Wavefront::regStats() "dependencies") ; - // FIXME: the name of the WF needs to be unique - numTimesBlockedDueVrfPortAvail - .name(name() + ".timesBlockedDueVrfPortAvail") - .desc("number of times instructions are blocked due to VRF port " - "availability") + numInstrExecuted + .name(name() + ".num_instr_executed") + .desc("number of instructions executed by this WF slot") + ; + + schCycles + .name(name() + ".sch_cycles") + .desc("number of cycles spent in schedule stage") + ; + + schStalls + .name(name() + ".sch_stalls") + .desc("number of cycles WF is stalled in SCH stage") + ; + + schRfAccessStalls + .name(name() + ".sch_rf_access_stalls") + .desc("number of cycles wave selected in SCH but RF denied adding " + "instruction") + ; + + schResourceStalls + .name(name() + ".sch_resource_stalls") + .desc("number of cycles stalled in sch by resource not available") + ; + + schOpdNrdyStalls + .name(name() + ".sch_opd_nrdy_stalls") + .desc("number of cycles stalled in sch waiting for RF reads to " + "complete") + ; + + schLdsArbStalls + .name(name() + ".sch_lds_arb_stalls") + .desc("number of cycles wave stalled due to LDS-VRF arbitration") + ; + + vecRawDistance + .init(0,20,1) + .name(name() + ".vec_raw_distance") + .desc("Count of RAW distance in dynamic instructions for this WF") + ; + + readsPerWrite + .init(0,4,1) + .name(name() + ".vec_reads_per_write") + .desc("Count of Vector reads per write for this WF") ; } @@ -133,37 +177,473 @@ void Wavefront::init() { reservedVectorRegs = 0; + reservedScalarRegs = 0; startVgprIndex = 0; + startSgprIndex = 0; + + scalarAlu = computeUnit->mapWaveToScalarAlu(this); + scalarAluGlobalIdx = computeUnit->mapWaveToScalarAluGlobalIdx(this); + globalMem = computeUnit->mapWaveToGlobalMem(this); + localMem = computeUnit->mapWaveToLocalMem(this); + scalarMem = computeUnit->mapWaveToScalarMem(this); +} + +void +Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems) +{ + int regInitIdx = 0; + + // iterate over all the init fields and check which + // bits are enabled + for (int en_bit = 0; en_bit < NumScalarInitFields; ++en_bit) { + + if (task->sgprBitEnabled(en_bit)) { + int physSgprIdx = 0; + uint32_t wiCount = 0; + uint32_t firstWave = 0; + int orderedAppendTerm = 0; + int numWfsInWg = 0; + uint32_t finalValue = 0; + Addr host_disp_pkt_addr = task->hostDispPktAddr(); + Addr kernarg_addr = task->kernargAddr(); + Addr hidden_priv_base(0); + + switch (en_bit) { + case PrivateSegBuf: + physSgprIdx = + computeUnit->registerManager->mapSgpr(this, regInitIdx); + 
computeUnit->srf[simdId]->write(physSgprIdx, + task->amdQueue.scratch_resource_descriptor[0]); + ++regInitIdx; + DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " + "Setting PrivateSegBuffer: s[%d] = %x\n", + computeUnit->cu_id, simdId, + wfSlotId, wfDynId, physSgprIdx, + task->amdQueue.scratch_resource_descriptor[0]); + + physSgprIdx = + computeUnit->registerManager->mapSgpr(this, regInitIdx); + computeUnit->srf[simdId]->write(physSgprIdx, + task->amdQueue.scratch_resource_descriptor[1]); + ++regInitIdx; + DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " + "Setting PrivateSegBuffer: s[%d] = %x\n", + computeUnit->cu_id, simdId, + wfSlotId, wfDynId, physSgprIdx, + task->amdQueue.scratch_resource_descriptor[1]); + + physSgprIdx = + computeUnit->registerManager->mapSgpr(this, regInitIdx); + computeUnit->srf[simdId]->write(physSgprIdx, + task->amdQueue.scratch_resource_descriptor[2]); + ++regInitIdx; + DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " + "Setting PrivateSegBuffer: s[%d] = %x\n", + computeUnit->cu_id, simdId, + wfSlotId, wfDynId, physSgprIdx, + task->amdQueue.scratch_resource_descriptor[2]); + + physSgprIdx = + computeUnit->registerManager->mapSgpr(this, regInitIdx); + computeUnit->srf[simdId]->write(physSgprIdx, + task->amdQueue.scratch_resource_descriptor[3]); + + ++regInitIdx; + DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " + "Setting PrivateSegBuffer: s[%d] = %x\n", + computeUnit->cu_id, simdId, + wfSlotId, wfDynId, physSgprIdx, + task->amdQueue.scratch_resource_descriptor[3]); + break; + case DispatchPtr: + physSgprIdx = + computeUnit->registerManager->mapSgpr(this, regInitIdx); + computeUnit->srf[simdId]->write(physSgprIdx, + ((uint32_t*)&host_disp_pkt_addr)[0]); + ++regInitIdx; + DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " + "Setting DispatchPtr: s[%d] = %x\n", + computeUnit->cu_id, simdId, + wfSlotId, wfDynId, physSgprIdx, + ((uint32_t*)&host_disp_pkt_addr)[0]); + + physSgprIdx = + computeUnit->registerManager->mapSgpr(this, regInitIdx); + computeUnit->srf[simdId]->write(physSgprIdx, + ((uint32_t*)&host_disp_pkt_addr)[1]); + DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " + "Setting DispatchPtr: s[%d] = %x\n", + computeUnit->cu_id, simdId, + wfSlotId, wfDynId, physSgprIdx, + ((uint32_t*)&host_disp_pkt_addr)[1]); + + ++regInitIdx; + break; + case QueuePtr: + physSgprIdx = + computeUnit->registerManager->mapSgpr(this, regInitIdx); + computeUnit->srf[simdId]->write(physSgprIdx, + ((uint32_t*)&task->hostAMDQueueAddr)[0]); + ++regInitIdx; + DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " + "Setting QueuePtr: s[%d] = %x\n", + computeUnit->cu_id, simdId, + wfSlotId, wfDynId, physSgprIdx, + ((uint32_t*)&task->hostAMDQueueAddr)[0]); + + physSgprIdx = + computeUnit->registerManager->mapSgpr(this, regInitIdx); + computeUnit->srf[simdId]->write(physSgprIdx, + ((uint32_t*)&task->hostAMDQueueAddr)[1]); + DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " + "Setting QueuePtr: s[%d] = %x\n", + computeUnit->cu_id, simdId, + wfSlotId, wfDynId, physSgprIdx, + ((uint32_t*)&task->hostAMDQueueAddr)[1]); + + ++regInitIdx; + break; + case KernargSegPtr: + physSgprIdx = + computeUnit->registerManager->mapSgpr(this, regInitIdx); + computeUnit->srf[simdId]->write(physSgprIdx, + ((uint32_t*)&kernarg_addr)[0]); + ++regInitIdx; + DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " + "Setting KernargSegPtr: s[%d] = %x\n", + computeUnit->cu_id, simdId, + wfSlotId, wfDynId, physSgprIdx, + ((uint32_t*)kernarg_addr)[0]); + + physSgprIdx = + computeUnit->registerManager->mapSgpr(this, 
regInitIdx); + computeUnit->srf[simdId]->write(physSgprIdx, + ((uint32_t*)&kernarg_addr)[1]); + DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " + "Setting KernargSegPtr: s[%d] = %x\n", + computeUnit->cu_id, simdId, + wfSlotId, wfDynId, physSgprIdx, + ((uint32_t*)kernarg_addr)[1]); + + ++regInitIdx; + break; + case FlatScratchInit: + physSgprIdx + = computeUnit->registerManager->mapSgpr(this, regInitIdx); + computeUnit->srf[simdId]->write(physSgprIdx, + (TheGpuISA::ScalarRegU32)(task->amdQueue + .scratch_backing_memory_location & 0xffffffff)); + ++regInitIdx; + DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " + "Setting FlatScratch Addr: s[%d] = %x\n", + computeUnit->cu_id, simdId, + wfSlotId, wfDynId, physSgprIdx, + (TheGpuISA::ScalarRegU32)(task->amdQueue + .scratch_backing_memory_location & 0xffffffff)); + + physSgprIdx = + computeUnit->registerManager->mapSgpr(this, regInitIdx); + // This value should be sizeof(DWORD) aligned, that is + // 4 byte aligned + computeUnit->srf[simdId]->write(physSgprIdx, + task->amdQueue.scratch_workitem_byte_size); + ++regInitIdx; + DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " + "Setting FlatScratch size: s[%d] = %x\n", + computeUnit->cu_id, simdId, + wfSlotId, wfDynId, physSgprIdx, + task->amdQueue.scratch_workitem_byte_size); + /** + * Since flat scratch init is needed for this kernel, this + * kernel is going to have flat memory instructions and we + * need to initialize the hidden private base for this queue. + * scratch_resource_descriptor[0] has this queue's scratch + * base address. scratch_backing_memory_location has the + * offset to this queue's scratch base address from the + * SH_HIDDEN_PRIVATE_BASE_VMID. Ideally, we only require this + * queue's scratch base address for address calculation + * (stored in scratch_resource_descriptor[0]). But that + * address calculation should be done by first finding the + * queue's scratch base address using the calculation + * "SH_HIDDEN_PRIVATE_BASE_VMID + offset". So, we initialize + * SH_HIDDEN_PRIVATE_BASE_VMID.
+ * + * For more details see: + * http://rocm-documentation.readthedocs.io/en/latest/ + * ROCm_Compiler_SDK/ROCm-Native-ISA.html#flat-scratch + * + * https://github.com/ROCm-Developer-Tools/ + * ROCm-ComputeABI-Doc/blob/master/AMDGPU-ABI.md + * #flat-addressing + */ + hidden_priv_base = + (uint64_t)task->amdQueue.scratch_resource_descriptor[0] | + (((uint64_t)task->amdQueue.scratch_resource_descriptor[1] + & 0x000000000000ffff) << 32); + computeUnit->shader->initShHiddenPrivateBase( + hidden_priv_base, + task->amdQueue.scratch_backing_memory_location); + break; + case GridWorkgroupCountX: + physSgprIdx = + computeUnit->registerManager->mapSgpr(this, regInitIdx); + wiCount = ((task->gridSize(0) + + task->wgSize(0) - 1) / + task->wgSize(0)); + computeUnit->srf[simdId]->write(physSgprIdx, wiCount); + + ++regInitIdx; + DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " + "Setting num WG X: s[%d] = %x\n", + computeUnit->cu_id, simdId, + wfSlotId, wfDynId, physSgprIdx, wiCount); + break; + case GridWorkgroupCountY: + physSgprIdx = + computeUnit->registerManager->mapSgpr(this, regInitIdx); + wiCount = ((task->gridSize(1) + + task->wgSize(1) - 1) / + task->wgSize(1)); + computeUnit->srf[simdId]->write(physSgprIdx, wiCount); + + ++regInitIdx; + DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " + "Setting num WG Y: s[%d] = %x\n", + computeUnit->cu_id, simdId, + wfSlotId, wfDynId, physSgprIdx, wiCount); + break; + case GridWorkgroupCountZ: + physSgprIdx = + computeUnit->registerManager->mapSgpr(this, regInitIdx); + wiCount = ((task->gridSize(2) + + task->wgSize(2) - 1) / + task->wgSize(2)); + computeUnit->srf[simdId]->write(physSgprIdx, wiCount); + + ++regInitIdx; + DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " + "Setting num WG Z: s[%d] = %x\n", + computeUnit->cu_id, simdId, + wfSlotId, wfDynId, physSgprIdx, wiCount); + break; + case WorkgroupIdX: + physSgprIdx = + computeUnit->registerManager->mapSgpr(this, regInitIdx); + computeUnit->srf[simdId]->write(physSgprIdx, + workGroupId[0]); + + ++regInitIdx; + DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " + "Setting WG ID X: s[%d] = %x\n", + computeUnit->cu_id, simdId, + wfSlotId, wfDynId, physSgprIdx, workGroupId[0]); + break; + case WorkgroupIdY: + physSgprIdx = + computeUnit->registerManager->mapSgpr(this, regInitIdx); + computeUnit->srf[simdId]->write(physSgprIdx, + workGroupId[1]); + + ++regInitIdx; + DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " + "Setting WG ID Y: s[%d] = %x\n", + computeUnit->cu_id, simdId, + wfSlotId, wfDynId, physSgprIdx, workGroupId[1]); + break; + case WorkgroupIdZ: + physSgprIdx = + computeUnit->registerManager->mapSgpr(this, regInitIdx); + computeUnit->srf[simdId]->write(physSgprIdx, + workGroupId[2]); + + ++regInitIdx; + DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " + "Setting WG ID Z: s[%d] = %x\n", + computeUnit->cu_id, simdId, + wfSlotId, wfDynId, physSgprIdx, workGroupId[2]); + break; + case PrivSegWaveByteOffset: + physSgprIdx = + computeUnit->registerManager->mapSgpr(this, regInitIdx); + /** + * the compute_tmpring_size_wavesize specifies the number of + * kB allocated per wavefront, hence the multiplication by + * 1024. + * + * to get the per wavefront offset into the scratch + * memory, we also multiply this by the wfId. the wfId stored + * in the Wavefront class, however, is the wave ID within the + * WG, whereas here we need the global WFID because the + * scratch space will be divided amongst all waves in the + * kernel. 
to get the global ID we multiply the WGID by + * the WG size, then add the WFID of the wave within its WG. + */ + computeUnit->srf[simdId]->write(physSgprIdx, 1024 * + (wgId * (wgSz / 64) + wfId) * + task->amdQueue.compute_tmpring_size_wavesize); + + ++regInitIdx; + DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " + "Setting Private Seg Offset: s[%d] = %x\n", + computeUnit->cu_id, simdId, + wfSlotId, wfDynId, physSgprIdx, + 1024 * (wgId * (wgSz / 64) + wfId) * + task->amdQueue.compute_tmpring_size_wavesize); + break; + case WorkgroupInfo: + firstWave = (wfId == 0) ? 1 : 0; + numWfsInWg = divCeil(wgSizeInWorkItems, + computeUnit->wfSize()); + finalValue = firstWave << ((sizeof(uint32_t) * 8) - 1); + finalValue |= (orderedAppendTerm << 6); + finalValue |= numWfsInWg; + physSgprIdx = + computeUnit->registerManager->mapSgpr(this, regInitIdx); + computeUnit->srf[simdId]-> + write(physSgprIdx, finalValue); + + ++regInitIdx; + DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] " + "Setting WG Info: s[%d] = %x\n", + computeUnit->cu_id, simdId, + wfSlotId, wfDynId, physSgprIdx, finalValue); + break; + default: + fatal("SGPR enable bit %i not supported\n", en_bit); + break; + } + } + } + + regInitIdx = 0; + + // iterate over all the init fields and check which + // bits are enabled + for (int en_bit = 0; en_bit < NumVectorInitFields; ++en_bit) { + if (task->vgprBitEnabled(en_bit)) { + uint32_t physVgprIdx = 0; + TheGpuISA::VecRegContainerU32 raw_vgpr; + + switch (en_bit) { + case WorkitemIdX: + { + physVgprIdx = computeUnit->registerManager + ->mapVgpr(this, regInitIdx); + TheGpuISA::VecRegU32 vgpr_x + = raw_vgpr.as(); + + for (int lane = 0; lane < workItemId[0].size(); ++lane) { + vgpr_x[lane] = workItemId[0][lane]; + } + + computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr); + rawDist[regInitIdx] = 0; + ++regInitIdx; + } + break; + case WorkitemIdY: + { + physVgprIdx = computeUnit->registerManager + ->mapVgpr(this, regInitIdx); + TheGpuISA::VecRegU32 vgpr_y + = raw_vgpr.as(); + + for (int lane = 0; lane < workItemId[1].size(); ++lane) { + vgpr_y[lane] = workItemId[1][lane]; + } + + computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr); + rawDist[regInitIdx] = 0; + ++regInitIdx; + } + break; + case WorkitemIdZ: + { + physVgprIdx = computeUnit->registerManager-> + mapVgpr(this, regInitIdx); + TheGpuISA::VecRegU32 vgpr_z + = raw_vgpr.as(); + + for (int lane = 0; lane < workItemId[2].size(); ++lane) { + vgpr_z[lane] = workItemId[2][lane]; + } + + computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr); + rawDist[regInitIdx] = 0; + ++regInitIdx; + } + break; + } + } + } } void -Wavefront::resizeRegFiles(int num_cregs, int num_sregs, int num_dregs) +Wavefront::resizeRegFiles(int num_vregs, int num_sregs) { - condRegState->init(num_cregs); - maxSpVgprs = num_sregs; - maxDpVgprs = num_dregs; + maxVgprs = num_vregs; + maxSgprs = num_sregs; } Wavefront::~Wavefront() { - if (callArgMem) - delete callArgMem; - delete condRegState; } void -Wavefront::start(uint64_t _wf_dyn_id,uint64_t _base_ptr) +Wavefront::setStatus(status_e newStatus) +{ + if (computeUnit->idleCUTimeout > 0) { + // Wavefront's status transitions to stalled or stopped + if ((newStatus == S_STOPPED || newStatus == S_STALLED || + newStatus == S_WAITCNT) && + (status != newStatus)) { + computeUnit->idleWfs++; + assert(computeUnit->idleWfs <= + (computeUnit->shader->n_wf * computeUnit->numVectorALUs)); + if (computeUnit->idleWfs == + (computeUnit->shader->n_wf * computeUnit->numVectorALUs)) { + lastNonIdleTick = curTick(); + } + // Wavefront's 
status transitions to an active state (from + // a stopped or stalled state) + } else if ((status == S_STOPPED || status == S_STALLED || + status == S_WAITCNT) && + (status != newStatus)) { + // if all WFs in the CU were idle then check if the idleness + // period exceeded the timeout threshold + if (computeUnit->idleWfs == + (computeUnit->shader->n_wf * computeUnit->numVectorALUs)) { + panic_if((curTick() - lastNonIdleTick) >= + computeUnit->idleCUTimeout, + "CU%d has been idle for %d ticks at tick %d", + computeUnit->cu_id, computeUnit->idleCUTimeout, + curTick()); + } + computeUnit->idleWfs--; + assert(computeUnit->idleWfs >= 0); + } + } + status = newStatus; +} + +void +Wavefront::start(uint64_t _wf_dyn_id, Addr init_pc) { wfDynId = _wf_dyn_id; - basePtr = _base_ptr; + _pc = init_pc; + status = S_RUNNING; + + vecReads.resize(maxVgprs, 0); } bool Wavefront::isGmInstruction(GPUDynInstPtr ii) { - if (ii->isGlobalMem() || ii->isFlat()) + if (ii->isGlobalMem() || + (ii->isFlat() && ii->executedAs() == Enums::SC_GLOBAL)) { return true; + } return false; } @@ -171,7 +651,40 @@ Wavefront::isGmInstruction(GPUDynInstPtr ii) bool Wavefront::isLmInstruction(GPUDynInstPtr ii) { - if (ii->isLocalMem()) { + if (ii->isLocalMem() || + (ii->isFlat() && ii->executedAs() == Enums::SC_GROUP)) { + return true; + } + + return false; +} + +bool +Wavefront::isOldestInstWaitcnt() +{ + if (instructionBuffer.empty()) + return false; + + GPUDynInstPtr ii = instructionBuffer.front(); + + if (ii->isWaitcnt()) { + // waitcnt is a scalar + assert(ii->isScalar()); + return true; + } + + return false; +} + +bool +Wavefront::isOldestInstScalarALU() +{ + assert(!instructionBuffer.empty()); + GPUDynInstPtr ii = instructionBuffer.front(); + + if (status != S_STOPPED && ii->isScalar() && (ii->isNop() || ii->isReturn() + || ii->isEndOfKernel() || ii->isBranch() || ii->isALU() || + (ii->isKernArgSeg() && ii->isLoad()))) { return true; } @@ -179,14 +692,14 @@ Wavefront::isLmInstruction(GPUDynInstPtr ii) } bool -Wavefront::isOldestInstALU() +Wavefront::isOldestInstVectorALU() { assert(!instructionBuffer.empty()); GPUDynInstPtr ii = instructionBuffer.front(); - if (status != S_STOPPED && (ii->isNop() || - ii->isReturn() || ii->isBranch() || - ii->isALU() || (ii->isKernArgSeg() && ii->isLoad()))) { + if (status != S_STOPPED && !ii->isScalar() && (ii->isNop() || + ii->isReturn() || ii->isBranch() || ii->isALU() || ii->isEndOfKernel() + || (ii->isKernArgSeg() && ii->isLoad()))) { return true; } @@ -212,7 +725,20 @@ Wavefront::isOldestInstGMem() assert(!instructionBuffer.empty()); GPUDynInstPtr ii = instructionBuffer.front(); - if (status != S_STOPPED && ii->isGlobalMem()) { + if (status != S_STOPPED && !ii->isScalar() && ii->isGlobalMem()) { + return true; + } + + return false; +} + +bool +Wavefront::isOldestInstScalarMem() +{ + assert(!instructionBuffer.empty()); + GPUDynInstPtr ii = instructionBuffer.front(); + + if (status != S_STOPPED && ii->isScalar() && ii->isGlobalMem()) { return true; } @@ -258,15 +784,13 @@ Wavefront::isOldestInstFlatMem() return false; } -// Return true if the Wavefront's instruction -// buffer has branch instruction. bool -Wavefront::instructionBufferHasBranch() +Wavefront::stopFetch() { for (auto it : instructionBuffer) { GPUDynInstPtr ii = it; - - if (ii->isReturn() || ii->isBranch()) { + if (ii->isReturn() || ii->isBranch() || + ii->isEndOfKernel()) { return true; } } @@ -274,377 +798,125 @@ Wavefront::instructionBufferHasBranch() return false; } -// Remap HSAIL register to physical VGPR. 
-// HSAIL register = virtual register assigned to an operand by HLC compiler -uint32_t -Wavefront::remap(uint32_t vgprIndex, uint32_t size, uint8_t mode) +void +Wavefront::freeResources() { - assert((vgprIndex < reservedVectorRegs) && (reservedVectorRegs > 0)); - // add the offset from where the VGPRs of the wavefront have been assigned - uint32_t physicalVgprIndex = startVgprIndex + vgprIndex; - // HSAIL double precision (DP) register: calculate the physical VGPR index - // assuming that DP registers are placed after SP ones in the VRF. The DP - // and SP VGPR name spaces in HSAIL mode are separate so we need to adjust - // the DP VGPR index before mapping it to the physical VRF address space - if (mode == 1 && size > 4) { - physicalVgprIndex = startVgprIndex + maxSpVgprs + (2 * vgprIndex); - } - - assert((startVgprIndex <= physicalVgprIndex) && - (startVgprIndex + reservedVectorRegs - 1) >= physicalVgprIndex); - - // calculate absolute physical VGPR index - return physicalVgprIndex % computeUnit->vrf[simdId]->numRegs(); + execUnitId = -1; } -// Return true if this wavefront is ready -// to execute an instruction of the specified type. -int -Wavefront::ready(itype_e type) +void Wavefront::validateRequestCounters() { - // Check to make sure wave is running - if (status == S_STOPPED || status == S_RETURNING || - instructionBuffer.empty()) { - return 0; - } - - // Is the wave waiting at a barrier - if (stalledAtBarrier) { - if (!computeUnit->AllAtBarrier(barrierId,barrierCnt, - computeUnit->getRefCounter(dispatchId, wgId))) { - // Are all threads at barrier? - return 0; - } - oldBarrierCnt = barrierCnt; - stalledAtBarrier = false; - } - - // Read instruction - GPUDynInstPtr ii = instructionBuffer.front(); + panic_if(wrGmReqsInPipe < 0 || rdGmReqsInPipe < 0 || + wrLmReqsInPipe < 0 || rdLmReqsInPipe < 0 || + outstandingReqs < 0, + "Negative requests in pipe for WF%d for slot%d" + " and SIMD%d: Rd GlobalMem Reqs=%d, Wr GlobalMem Reqs=%d," + " Rd LocalMem Reqs=%d, Wr LocalMem Reqs=%d," + " Outstanding Reqs=%d\n", + wfDynId, wfSlotId, simdId, rdGmReqsInPipe, wrGmReqsInPipe, + rdLmReqsInPipe, wrLmReqsInPipe, outstandingReqs); +} - bool ready_inst M5_VAR_USED = false; - bool glbMemBusRdy = false; - bool glbMemIssueRdy = false; - if (type == I_GLOBAL || type == I_FLAT || type == I_PRIVATE) { - for (int j=0; j < computeUnit->numGlbMemUnits; ++j) { - if (computeUnit->vrfToGlobalMemPipeBus[j].prerdy()) - glbMemBusRdy = true; - if (computeUnit->wfWait[j].prerdy()) - glbMemIssueRdy = true; +void +Wavefront::reserveGmResource(GPUDynInstPtr ii) +{ + if (!ii->isScalar()) { + if (ii->isLoad()) { + rdGmReqsInPipe++; + } else if (ii->isStore()) { + wrGmReqsInPipe++; + } else if (ii->isAtomic() || ii->isMemSync()) { + rdGmReqsInPipe++; + wrGmReqsInPipe++; + } else { + panic("Invalid memory operation!\n"); } - } - bool locMemBusRdy = false; - bool locMemIssueRdy = false; - if (type == I_SHARED || type == I_FLAT) { - for (int j=0; j < computeUnit->numLocMemUnits; ++j) { - if (computeUnit->vrfToLocalMemPipeBus[j].prerdy()) - locMemBusRdy = true; - if (computeUnit->wfWait[j].prerdy()) - locMemIssueRdy = true; + execUnitId = globalMem; + } else { + if (ii->isLoad()) { + scalarRdGmReqsInPipe++; + } else if (ii->isStore()) { + scalarWrGmReqsInPipe++; + } else if (ii->isAtomic() || ii->isMemSync()) { + scalarWrGmReqsInPipe++; + scalarRdGmReqsInPipe++; + } else { + panic("Invalid memory operation!\n"); } + execUnitId = scalarMem; } +} - // The following code is very error prone and the entire process for - // checking 
readiness will be fixed eventually. In the meantime, let's - // make sure that we do not silently let an instruction type slip - // through this logic and always return not ready. - if (!(ii->isBarrier() || ii->isNop() || ii->isReturn() || ii->isBranch() || - ii->isALU() || ii->isLoad() || ii->isStore() || ii->isAtomic() || - ii->isMemFence() || ii->isFlat())) { - panic("next instruction: %s is of unknown type\n", ii->disassemble()); - } - - DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Checking Read for Inst : %s\n", - computeUnit->cu_id, simdId, wfSlotId, ii->disassemble()); - - if (type == I_ALU && ii->isBarrier()) { - // Here for ALU instruction (barrier) - if (!computeUnit->wfWait[simdId].prerdy()) { - // Is wave slot free? - return 0; - } - - // Are there in pipe or outstanding memory requests? - if ((outstandingReqs + memReqsInPipe) > 0) { - return 0; - } - - ready_inst = true; - } else if (type == I_ALU && ii->isNop()) { - // Here for ALU instruction (nop) - if (!computeUnit->wfWait[simdId].prerdy()) { - // Is wave slot free? - return 0; - } - - ready_inst = true; - } else if (type == I_ALU && ii->isReturn()) { - // Here for ALU instruction (return) - if (!computeUnit->wfWait[simdId].prerdy()) { - // Is wave slot free? - return 0; - } - - // Are there in pipe or outstanding memory requests? - if ((outstandingReqs + memReqsInPipe) > 0) { - return 0; - } - - ready_inst = true; - } else if (type == I_ALU && (ii->isBranch() || - ii->isALU() || - (ii->isKernArgSeg() && ii->isLoad()) || - ii->isArgSeg())) { - // Here for ALU instruction (all others) - if (!computeUnit->wfWait[simdId].prerdy()) { - // Is alu slot free? - return 0; - } - if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii, - VrfAccessType::RD_WR)) { - return 0; - } - - if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) { - return 0; - } - ready_inst = true; - } else if (type == I_GLOBAL && ii->isGlobalMem()) { - // Here Global memory instruction - if (ii->isLoad() || ii->isAtomic() || ii->isMemFence()) { - // Are there in pipe or outstanding global memory write requests? - if ((outstandingReqsWrGm + wrGmReqsInPipe) > 0) { - return 0; - } - } - - if (ii->isStore() || ii->isAtomic() || ii->isMemFence()) { - // Are there in pipe or outstanding global memory read requests? - if ((outstandingReqsRdGm + rdGmReqsInPipe) > 0) - return 0; - } - - if (!glbMemIssueRdy) { - // Is WV issue slot free? - return 0; - } - - if (!glbMemBusRdy) { - // Is there an available VRF->Global memory read bus? - return 0; - } - - // Does the coalescer have space for our instruction? - if (!computeUnit->globalMemoryPipe.coalescerReady(ii)) { - return 0; - } - - if (!computeUnit->globalMemoryPipe. - isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) { - // Can we insert a new request to the Global Mem Request FIFO? - return 0; - } - // can we schedule source & destination operands on the VRF? - if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii, - VrfAccessType::RD_WR)) { - return 0; - } - if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) { - return 0; - } - ready_inst = true; - } else if (type == I_SHARED && ii->isLocalMem()) { - // Here for Shared memory instruction - if (ii->isLoad() || ii->isAtomic() || ii->isMemFence()) { - if ((outstandingReqsWrLm + wrLmReqsInPipe) > 0) { - return 0; - } - } - - if (ii->isStore() || ii->isAtomic() || ii->isMemFence()) { - if ((outstandingReqsRdLm + rdLmReqsInPipe) > 0) { - return 0; - } - } - - if (!locMemBusRdy) { - // Is there an available VRF->LDS read bus? 
- return 0; - } - if (!locMemIssueRdy) { - // Is wave slot free? - return 0; - } - - if (!computeUnit->localMemoryPipe. - isLMReqFIFOWrRdy(rdLmReqsInPipe + wrLmReqsInPipe)) { - // Can we insert a new request to the LDS Request FIFO? - return 0; - } - // can we schedule source & destination operands on the VRF? - if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii, - VrfAccessType::RD_WR)) { - return 0; - } - if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) { - return 0; - } - ready_inst = true; - } else if (type == I_FLAT && ii->isFlat()) { - if (!glbMemBusRdy) { - // Is there an available VRF->Global memory read bus? - return 0; - } - - if (!locMemBusRdy) { - // Is there an available VRF->LDS read bus? - return 0; - } - - if (!glbMemIssueRdy) { - // Is wave slot free? - return 0; - } - - if (!locMemIssueRdy) { - return 0; - } - - // Does the coalescer have space for our instruction? - if (!computeUnit->globalMemoryPipe.coalescerReady(ii)) { - return 0; - } - - if (!computeUnit->globalMemoryPipe. - isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) { - // Can we insert a new request to the Global Mem Request FIFO? - return 0; - } - - if (!computeUnit->localMemoryPipe. - isLMReqFIFOWrRdy(rdLmReqsInPipe + wrLmReqsInPipe)) { - // Can we insert a new request to the LDS Request FIFO? - return 0; - } - // can we schedule source & destination operands on the VRF? - if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii, - VrfAccessType::RD_WR)) { - return 0; - } - // are all the operands ready? (RAW, WAW and WAR depedencies met?) - if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) { - return 0; - } - ready_inst = true; +void +Wavefront::reserveLmResource(GPUDynInstPtr ii) +{ + fatal_if(ii->isScalar(), + "Scalar instructions can not access Shared memory!!!"); + if (ii->isLoad()) { + rdLmReqsInPipe++; + } else if (ii->isStore()) { + wrLmReqsInPipe++; + } else if (ii->isAtomic() || ii->isMemSync()) { + wrLmReqsInPipe++; + rdLmReqsInPipe++; } else { - return 0; + panic("Invalid memory operation!\n"); } - - assert(ready_inst); - - DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Ready Inst : %s\n", computeUnit->cu_id, - simdId, wfSlotId, ii->disassemble()); - return 1; + execUnitId = localMem; } -void -Wavefront::updateResources() +std::vector +Wavefront::reserveResources() { + // vector of execution unit IDs to return to schedule stage + // this return is only used for debugging and an assertion... + std::vector execUnitIds; + // Get current instruction GPUDynInstPtr ii = instructionBuffer.front(); assert(ii); - computeUnit->vrf[simdId]->updateResources(this, ii); + // Single precision ALU or Branch or Return or Special instruction if (ii->isALU() || ii->isSpecialOp() || - ii->isBranch() || - // FIXME: Kernel argument loads are currently treated as ALU operations - // since we don't send memory packets at execution. 
If we fix that then - // we should map them to one of the memory pipelines + ii->isBranch() || ii->isNop() || (ii->isKernArgSeg() && ii->isLoad()) || ii->isArgSeg() || - ii->isReturn()) { - computeUnit->aluPipe[simdId].preset(computeUnit->shader-> - ticks(computeUnit->spBypassLength())); - // this is to enforce a fixed number of cycles per issue slot per SIMD - computeUnit->wfWait[simdId].preset(computeUnit->shader-> - ticks(computeUnit->issuePeriod)); - } else if (ii->isBarrier()) { - computeUnit->wfWait[simdId].preset(computeUnit->shader-> - ticks(computeUnit->issuePeriod)); - } else if (ii->isLoad() && ii->isFlat()) { - assert(Enums::SC_NONE != ii->executedAs()); - memReqsInPipe++; - rdGmReqsInPipe++; - if ( Enums::SC_SHARED == ii->executedAs() ) { - computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. - preset(computeUnit->shader->ticks(4)); - computeUnit->wfWait[computeUnit->ShrMemUnitId()]. - preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else { - computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. - preset(computeUnit->shader->ticks(4)); - computeUnit->wfWait[computeUnit->GlbMemUnitId()]. - preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } - } else if (ii->isStore() && ii->isFlat()) { - assert(Enums::SC_NONE != ii->executedAs()); - memReqsInPipe++; - wrGmReqsInPipe++; - if (Enums::SC_SHARED == ii->executedAs()) { - computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. - preset(computeUnit->shader->ticks(8)); - computeUnit->wfWait[computeUnit->ShrMemUnitId()]. - preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + ii->isReturn() || ii->isEndOfKernel()) { + if (!ii->isScalar()) { + execUnitId = simdId; } else { - computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. - preset(computeUnit->shader->ticks(8)); - computeUnit->wfWait[computeUnit->GlbMemUnitId()]. - preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + execUnitId = scalarAluGlobalIdx; } - } else if (ii->isLoad() && ii->isGlobalMem()) { - memReqsInPipe++; - rdGmReqsInPipe++; - computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. - preset(computeUnit->shader->ticks(4)); - computeUnit->wfWait[computeUnit->GlbMemUnitId()]. - preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if (ii->isStore() && ii->isGlobalMem()) { - memReqsInPipe++; - wrGmReqsInPipe++; - computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. - preset(computeUnit->shader->ticks(8)); - computeUnit->wfWait[computeUnit->GlbMemUnitId()]. - preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isGlobalMem()) { - memReqsInPipe++; - wrGmReqsInPipe++; - rdGmReqsInPipe++; - computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. - preset(computeUnit->shader->ticks(8)); - computeUnit->wfWait[computeUnit->GlbMemUnitId()]. - preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if (ii->isLoad() && ii->isLocalMem()) { - memReqsInPipe++; - rdLmReqsInPipe++; - computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. - preset(computeUnit->shader->ticks(4)); - computeUnit->wfWait[computeUnit->ShrMemUnitId()]. - preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if (ii->isStore() && ii->isLocalMem()) { - memReqsInPipe++; - wrLmReqsInPipe++; - computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. - preset(computeUnit->shader->ticks(8)); - computeUnit->wfWait[computeUnit->ShrMemUnitId()]. 
- preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isLocalMem()) { - memReqsInPipe++; - wrLmReqsInPipe++; - rdLmReqsInPipe++; - computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. - preset(computeUnit->shader->ticks(8)); - computeUnit->wfWait[computeUnit->ShrMemUnitId()]. - preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); + // this is to enforce a fixed number of cycles per issue slot per SIMD + } else if (ii->isBarrier()) { + execUnitId = ii->isScalar() ? scalarAluGlobalIdx : simdId; + } else if (ii->isFlat()) { + assert(!ii->isScalar()); + reserveLmResource(ii); + // add execUnitId, reserved by reserveLmResource, list before it is + // overwriten by reserveGmResource + execUnitIds.push_back(execUnitId); + flatLmUnitId = execUnitId; + reserveGmResource(ii); + flatGmUnitId = execUnitId; + execUnitIds.push_back(flatGmUnitId); + execUnitId = -1; + } else if (ii->isGlobalMem()) { + reserveGmResource(ii); + } else if (ii->isLocalMem()) { + reserveLmResource(ii); + } else if (ii->isPrivateSeg()) { + fatal_if(ii->isScalar(), + "Scalar instructions can not access Private memory!!!"); + reserveGmResource(ii); + } else { + panic("reserveResources -> Couldn't process op!\n"); + } + + if (execUnitId != -1) { + execUnitIds.push_back(execUnitId); } + assert(execUnitIds.size()); + return execUnitIds; } void @@ -653,49 +925,171 @@ Wavefront::exec() // ---- Exit if wavefront is inactive ----------------------------- // if (status == S_STOPPED || status == S_RETURNING || - instructionBuffer.empty()) { + status==S_STALLED || instructionBuffer.empty()) { return; } + if (status == S_WAITCNT) { + /** + * if this wave is in S_WAITCNT state, then + * it should enter exec() precisely one time + * before the waitcnts are satisfied, in order + * to execute the waitcnt instruction itself + * thus we assert that the waitcnt is the + * oldest instruction. 
if we enter exec() with + * active waitcnts, and we're not executing + * the waitcnt instruction, something must be + * wrong + */ + assert(isOldestInstWaitcnt()); + } + // Get current instruction GPUDynInstPtr ii = instructionBuffer.front(); - const uint32_t old_pc = pc(); + const Addr old_pc = pc(); DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s " - "(pc: %i)\n", computeUnit->cu_id, simdId, wfSlotId, wfDynId, - ii->disassemble(), old_pc); - - // update the instruction stats in the CU + "(pc: %#x; seqNum: %d)\n", computeUnit->cu_id, simdId, wfSlotId, + wfDynId, ii->disassemble(), old_pc, ii->seqNum()); ii->execute(ii); + // delete the dynamic instruction from the pipeline map + computeUnit->deleteFromPipeMap(this); + // update the instruction stats in the CU computeUnit->updateInstStats(ii); - // access the VRF - computeUnit->vrf[simdId]->exec(ii, this); - srcRegOpDist.sample(ii->numSrcRegOperands()); - dstRegOpDist.sample(ii->numDstRegOperands()); + + // inform VRF of instruction execution to schedule write-back + // and scoreboard ready for registers + if (!ii->isScalar()) { + computeUnit->vrf[simdId]->waveExecuteInst(this, ii); + } + computeUnit->srf[simdId]->waveExecuteInst(this, ii); + + computeUnit->shader->vectorInstSrcOperand[ii->numSrcVecOperands()]++; + computeUnit->shader->vectorInstDstOperand[ii->numDstVecOperands()]++; computeUnit->numInstrExecuted++; + numInstrExecuted++; + computeUnit->instExecPerSimd[simdId]++; computeUnit->execRateDist.sample(computeUnit->totalCycles.value() - computeUnit->lastExecCycle[simdId]); computeUnit->lastExecCycle[simdId] = computeUnit->totalCycles.value(); - if (pc() == old_pc) { - uint32_t new_pc = _gpuISA.advancePC(old_pc, ii); - // PC not modified by instruction, proceed to next or pop frame - pc(new_pc); - if (new_pc == rpc()) { - popFromReconvergenceStack(); - discardFetch(); - } else { - instructionBuffer.pop_front(); + + if (lastInstExec) { + computeUnit->instInterleave[simdId]. + sample(computeUnit->instExecPerSimd[simdId] - lastInstExec); + } + lastInstExec = computeUnit->instExecPerSimd[simdId]; + + // want to track: + // number of reads that occur per value written + + // vector RAW dependency tracking + for (int i = 0; i < ii->getNumOperands(); i++) { + if (ii->isVectorRegister(i)) { + int vgpr = ii->getRegisterIndex(i, ii); + int nReg = ii->getOperandSize(i) <= 4 ? 1 : + ii->getOperandSize(i) / 4; + for (int n = 0; n < nReg; n++) { + if (ii->isSrcOperand(i)) { + // This check should never fail, but to be safe we check + if (rawDist.find(vgpr+n) != rawDist.end()) { + vecRawDistance. 
+ sample(numInstrExecuted.value() - rawDist[vgpr+n]); + } + // increment number of reads to this register + vecReads[vgpr+n]++; + } else if (ii->isDstOperand(i)) { + // rawDist is set on writes, but will not be set + // for the first write to each physical register + if (rawDist.find(vgpr+n) != rawDist.end()) { + // sample the number of reads that were performed + readsPerWrite.sample(vecReads[vgpr+n]); + } + // on a write, reset count of reads to 0 + vecReads[vgpr+n] = 0; + + rawDist[vgpr+n] = numInstrExecuted.value(); + } + } } + } + + if (pc() == old_pc) { + // PC not modified by instruction, proceed to next + _gpuISA.advancePC(ii); + instructionBuffer.pop_front(); } else { + DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave%d %s taken branch\n", + computeUnit->cu_id, simdId, wfSlotId, wfDynId, + ii->disassemble()); discardFetch(); } + DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] (pc: %#x)\n", + computeUnit->cu_id, simdId, wfSlotId, wfDynId, pc()); if (computeUnit->shader->hsail_mode==Shader::SIMT) { const int num_active_lanes = execMask().count(); computeUnit->controlFlowDivergenceDist.sample(num_active_lanes); computeUnit->numVecOpsExecuted += num_active_lanes; + + if (ii->isF16() && ii->isALU()) { + if (ii->isF32() || ii->isF64()) { + fatal("Instruction is tagged as both (1) F16, and (2)" + "either F32 or F64."); + } + computeUnit->numVecOpsExecutedF16 += num_active_lanes; + if (ii->isFMA()) { + computeUnit->numVecOpsExecutedFMA16 += num_active_lanes; + computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes; + } + else if (ii->isMAC()) { + computeUnit->numVecOpsExecutedMAC16 += num_active_lanes; + computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes; + } + else if (ii->isMAD()) { + computeUnit->numVecOpsExecutedMAD16 += num_active_lanes; + computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes; + } + } + if (ii->isF32() && ii->isALU()) { + if (ii->isF16() || ii->isF64()) { + fatal("Instruction is tagged as both (1) F32, and (2)" + "either F16 or F64."); + } + computeUnit->numVecOpsExecutedF32 += num_active_lanes; + if (ii->isFMA()) { + computeUnit->numVecOpsExecutedFMA32 += num_active_lanes; + computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes; + } + else if (ii->isMAC()) { + computeUnit->numVecOpsExecutedMAC32 += num_active_lanes; + computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes; + } + else if (ii->isMAD()) { + computeUnit->numVecOpsExecutedMAD32 += num_active_lanes; + computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes; + } + } + if (ii->isF64() && ii->isALU()) { + if (ii->isF16() || ii->isF32()) { + fatal("Instruction is tagged as both (1) F64, and (2)" + "either F16 or F32."); + } + computeUnit->numVecOpsExecutedF64 += num_active_lanes; + if (ii->isFMA()) { + computeUnit->numVecOpsExecutedFMA64 += num_active_lanes; + computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes; + } + else if (ii->isMAC()) { + computeUnit->numVecOpsExecutedMAC64 += num_active_lanes; + computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes; + } + else if (ii->isMAD()) { + computeUnit->numVecOpsExecutedMAD64 += num_active_lanes; + computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes; + } + } if (isGmInstruction(ii)) { computeUnit->activeLanesPerGMemInstrDist.sample(num_active_lanes); } else if (isLmInstruction(ii)) { @@ -703,82 +1097,120 @@ Wavefront::exec() } } - // ---- Update Vector ALU pipeline and other resources ------------------ // + /** + * we return here to avoid spurious errors related to flat insts + * and their address segment resolution. 
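+     * (such an instruction still holds a coalescer token, so we hand the
+     * token back via recvTokens(1) before bailing out)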
+ */ + if (execMask().none() && ii->isFlat()) { + computeUnit->getTokenManager()->recvTokens(1); + return; + } + + // Update Vector ALU pipeline and other resources + bool flat_as_gm = false; + bool flat_as_lm = false; + if (ii->isFlat()) { + flat_as_gm = (ii->executedAs() == Enums::SC_GLOBAL) || + (ii->executedAs() == Enums::SC_PRIVATE); + flat_as_lm = (ii->executedAs() == Enums::SC_GROUP); + } + // Single precision ALU or Branch or Return or Special instruction + // Note, we use the same timing regardless of SP or DP ALU operation. if (ii->isALU() || ii->isSpecialOp() || - ii->isBranch() || - // FIXME: Kernel argument loads are currently treated as ALU operations - // since we don't send memory packets at execution. If we fix that then - // we should map them to one of the memory pipelines + ii->isBranch() || ii->isNop() || (ii->isKernArgSeg() && ii->isLoad()) || - ii->isArgSeg() || - ii->isReturn()) { - computeUnit->aluPipe[simdId].set(computeUnit->shader-> - ticks(computeUnit->spBypassLength())); - + ii->isArgSeg() || ii->isEndOfKernel() || ii->isReturn()) { // this is to enforce a fixed number of cycles per issue slot per SIMD - computeUnit->wfWait[simdId].set(computeUnit->shader-> - ticks(computeUnit->issuePeriod)); + if (!ii->isScalar()) { + computeUnit->vectorALUs[simdId].set(computeUnit-> + cyclesToTicks(computeUnit->issuePeriod)); + } else { + computeUnit->scalarALUs[scalarAlu].set(computeUnit-> + cyclesToTicks(computeUnit->issuePeriod)); + } + // Barrier on Scalar ALU } else if (ii->isBarrier()) { - computeUnit->wfWait[simdId].set(computeUnit->shader-> - ticks(computeUnit->issuePeriod)); - } else if (ii->isLoad() && ii->isFlat()) { - assert(Enums::SC_NONE != ii->executedAs()); - - if (Enums::SC_SHARED == ii->executedAs()) { - computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. - set(computeUnit->shader->ticks(4)); - computeUnit->wfWait[computeUnit->ShrMemUnitId()]. - set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + computeUnit->scalarALUs[scalarAlu].set(computeUnit-> + cyclesToTicks(computeUnit->issuePeriod)); + // GM or Flat as GM Load + } else if (ii->isLoad() && (ii->isGlobalMem() || flat_as_gm)) { + if (!ii->isScalar()) { + computeUnit->vrfToGlobalMemPipeBus.set( + computeUnit->cyclesToTicks(computeUnit->vrf_gm_bus_latency)); + computeUnit->vectorGlobalMemUnit. + set(computeUnit->cyclesToTicks(computeUnit->issuePeriod)); + computeUnit->instCyclesVMemPerSimd[simdId] += + computeUnit->vrf_gm_bus_latency; } else { - computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. - set(computeUnit->shader->ticks(4)); - computeUnit->wfWait[computeUnit->GlbMemUnitId()]. - set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + computeUnit->srfToScalarMemPipeBus.set(computeUnit-> + cyclesToTicks(computeUnit->srf_scm_bus_latency)); + computeUnit->scalarMemUnit. + set(computeUnit->cyclesToTicks(computeUnit->issuePeriod)); + computeUnit->instCyclesScMemPerSimd[simdId] += + computeUnit->srf_scm_bus_latency; } - } else if (ii->isStore() && ii->isFlat()) { - assert(Enums::SC_NONE != ii->executedAs()); - if (Enums::SC_SHARED == ii->executedAs()) { - computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. - set(computeUnit->shader->ticks(8)); - computeUnit->wfWait[computeUnit->ShrMemUnitId()]. 
- set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + // GM or Flat as GM Store + } else if (ii->isStore() && (ii->isGlobalMem() || flat_as_gm)) { + if (!ii->isScalar()) { + computeUnit->vrfToGlobalMemPipeBus.set(computeUnit-> + cyclesToTicks(Cycles(2 * computeUnit->vrf_gm_bus_latency))); + computeUnit->vectorGlobalMemUnit. + set(computeUnit->cyclesToTicks(computeUnit->issuePeriod)); + computeUnit->instCyclesVMemPerSimd[simdId] += + (2 * computeUnit->vrf_gm_bus_latency); } else { - computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. - set(computeUnit->shader->ticks(8)); - computeUnit->wfWait[computeUnit->GlbMemUnitId()]. - set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + computeUnit->srfToScalarMemPipeBus.set(computeUnit-> + cyclesToTicks(Cycles(2 * computeUnit->srf_scm_bus_latency))); + computeUnit->scalarMemUnit. + set(computeUnit->cyclesToTicks(computeUnit->issuePeriod)); + computeUnit->instCyclesScMemPerSimd[simdId] += + (2 * computeUnit->srf_scm_bus_latency); } - } else if (ii->isLoad() && ii->isGlobalMem()) { - computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. - set(computeUnit->shader->ticks(4)); - computeUnit->wfWait[computeUnit->GlbMemUnitId()]. - set(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if (ii->isStore() && ii->isGlobalMem()) { - computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. - set(computeUnit->shader->ticks(8)); - computeUnit->wfWait[computeUnit->GlbMemUnitId()]. - set(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isGlobalMem()) { - computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. - set(computeUnit->shader->ticks(8)); - computeUnit->wfWait[computeUnit->GlbMemUnitId()]. - set(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if (ii->isLoad() && ii->isLocalMem()) { - computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. - set(computeUnit->shader->ticks(4)); - computeUnit->wfWait[computeUnit->ShrMemUnitId()]. - set(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if (ii->isStore() && ii->isLocalMem()) { - computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. - set(computeUnit->shader->ticks(8)); - computeUnit->wfWait[computeUnit->ShrMemUnitId()]. - set(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isLocalMem()) { - computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. - set(computeUnit->shader->ticks(8)); - computeUnit->wfWait[computeUnit->ShrMemUnitId()]. - set(computeUnit->shader->ticks(computeUnit->issuePeriod)); + } else if ((ii->isAtomic() || ii->isMemSync()) && + (ii->isGlobalMem() || flat_as_gm)) { + if (!ii->isScalar()) { + computeUnit->vrfToGlobalMemPipeBus.set(computeUnit-> + cyclesToTicks(Cycles(2 * computeUnit->vrf_gm_bus_latency))); + computeUnit->vectorGlobalMemUnit. + set(computeUnit->cyclesToTicks(computeUnit->issuePeriod)); + computeUnit->instCyclesVMemPerSimd[simdId] += + (2 * computeUnit->vrf_gm_bus_latency); + } else { + computeUnit->srfToScalarMemPipeBus.set(computeUnit-> + cyclesToTicks(Cycles(2 * computeUnit->srf_scm_bus_latency))); + computeUnit->scalarMemUnit. 
+ set(computeUnit->cyclesToTicks(computeUnit->issuePeriod)); + computeUnit->instCyclesScMemPerSimd[simdId] += + (2 * computeUnit->srf_scm_bus_latency); + } + // LM or Flat as LM Load + } else if (ii->isLoad() && (ii->isLocalMem() || flat_as_lm)) { + computeUnit->vrfToLocalMemPipeBus.set(computeUnit-> + cyclesToTicks(computeUnit->vrf_lm_bus_latency)); + computeUnit->vectorSharedMemUnit. + set(computeUnit->shader->cyclesToTicks(computeUnit->issuePeriod)); + computeUnit->instCyclesLdsPerSimd[simdId] += + computeUnit->vrf_lm_bus_latency; + // LM or Flat as LM Store + } else if (ii->isStore() && (ii->isLocalMem() || flat_as_lm)) { + computeUnit->vrfToLocalMemPipeBus.set(computeUnit-> + cyclesToTicks(Cycles(2 * computeUnit->vrf_lm_bus_latency))); + computeUnit->vectorSharedMemUnit. + set(computeUnit->cyclesToTicks(computeUnit->issuePeriod)); + computeUnit->instCyclesLdsPerSimd[simdId] += + (2 * computeUnit->vrf_lm_bus_latency); + // LM or Flat as LM, Atomic or MemFence + } else if ((ii->isAtomic() || ii->isMemSync()) && + (ii->isLocalMem() || flat_as_lm)) { + computeUnit->vrfToLocalMemPipeBus.set(computeUnit-> + cyclesToTicks(Cycles(2 * computeUnit->vrf_lm_bus_latency))); + computeUnit->vectorSharedMemUnit. + set(computeUnit->cyclesToTicks(computeUnit->issuePeriod)); + computeUnit->instCyclesLdsPerSimd[simdId] += + (2 * computeUnit->vrf_lm_bus_latency); + } else { + panic("Bad instruction type!\n"); } } @@ -788,212 +1220,197 @@ Wavefront::waitingAtBarrier(int lane) return barCnt[lane] < maxBarCnt; } -void -Wavefront::pushToReconvergenceStack(uint32_t pc, uint32_t rpc, - const VectorMask& mask) +GPUDynInstPtr +Wavefront::nextInstr() { - assert(mask.count()); - reconvergenceStack.emplace_back(new ReconvergenceStackEntry{pc, rpc, mask}); + // Read next instruction from instruction buffer + GPUDynInstPtr ii = instructionBuffer.front(); + // if the WF has been dispatched in the schedule stage then + // check the next oldest instruction for readiness + if (computeUnit->pipeMap.find(ii->seqNum()) != + computeUnit->pipeMap.end()) { + if (instructionBuffer.size() > 1) { + auto it = instructionBuffer.begin() + 1; + return *it; + } else { // No new instructions to check + return nullptr; + } + } + return ii; } void -Wavefront::popFromReconvergenceStack() +Wavefront::discardFetch() { - assert(!reconvergenceStack.empty()); + instructionBuffer.clear(); + dropFetch |= pendingFetch; - DPRINTF(WavefrontStack, "[%2d, %2d, %2d, %2d] %s %3i => ", - computeUnit->cu_id, simdId, wfSlotId, wfDynId, - execMask().to_string().c_str(), pc()); + /** + * clear the fetch buffer for this wave in order to + * remove any stale inst data + */ + computeUnit->fetchStage.fetchUnit(simdId).flushBuf(wfSlotId); +} - reconvergenceStack.pop_back(); +bool +Wavefront::waitCntsSatisfied() +{ + // Both vmWaitCnt && lgkmWaitCnt uninitialized means + // waitCnt instruction has been dispatched but not executed yet: next + // instruction should be blocked until waitCnt is executed. + if (vmWaitCnt == -1 && expWaitCnt == -1 && lgkmWaitCnt == -1) { + return false; + } - DPRINTF(WavefrontStack, "%3i %s\n", pc(), - execMask().to_string().c_str()); + // If we reach here, that means waitCnt instruction is executed and + // the waitcnts are set by the execute method. Check if waitcnts are + // satisfied. 
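+    // (e.g. an s_waitcnt vmcnt(0) sets vmWaitCnt = 0, so this wave may not
+    // resume until every outstanding vector memory read and write counted
+    // below has returned, i.e. until vm_cnt reaches 0)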
-} + // current number of vector memory ops in flight + int vm_cnt = outstandingReqsWrGm + outstandingReqsRdGm; -void -Wavefront::discardFetch() -{ - instructionBuffer.clear(); - dropFetch |=pendingFetch; -} + // current number of export insts or vector memory writes in flight + int exp_cnt = outstandingReqsWrGm; -uint32_t -Wavefront::pc() const -{ - return reconvergenceStack.back()->pc; + // current number of scalar/LDS memory ops in flight + // we do not consider GDS/message ops + int lgkm_cnt = outstandingReqsWrLm + outstandingReqsRdLm + + scalarOutstandingReqsRdGm + scalarOutstandingReqsWrGm; + + if (vmWaitCnt != -1) { + if (vm_cnt > vmWaitCnt) { + // vmWaitCnt not satisfied + return false; + } + } + + if (expWaitCnt != -1) { + if (exp_cnt > expWaitCnt) { + // expWaitCnt not satisfied + return false; + } + } + + if (lgkmWaitCnt != -1) { + if (lgkm_cnt > lgkmWaitCnt) { + // lgkmWaitCnt not satisfied + return false; + } + } + + // if we get here all outstanding waitcnts must + // be satisfied, so we resume normal operation + clearWaitCnts(); + + return true; } -uint32_t -Wavefront::rpc() const +void +Wavefront::setWaitCnts(int vm_wait_cnt, int exp_wait_cnt, int lgkm_wait_cnt) { - return reconvergenceStack.back()->rpc; + // the scoreboard should have set the status + // to S_WAITCNT once a waitcnt instruction + // was marked as ready + assert(status == S_WAITCNT); + + // waitcnt instruction shouldn't be sending + // negative counts + assert(vm_wait_cnt >= 0); + assert(exp_wait_cnt >= 0); + assert(lgkm_wait_cnt >= 0); + // waitcnts are a max of 15 because we have + // only 1 nibble (4 bits) to set the counts + assert(vm_wait_cnt <= 0xf); + assert(exp_wait_cnt <= 0x7); + assert(lgkm_wait_cnt <= 0x1f); + + /** + * prior waitcnts should be satisfied, + * at which time the WF resets them + * back to -1, indicating they are no + * longer active + */ + assert(vmWaitCnt == -1); + assert(expWaitCnt == -1); + assert(lgkmWaitCnt == -1); + + /** + * if the instruction encoding + * indicates a waitcnt of 0xf, + * that means the waitcnt is + * not being used + */ + if (vm_wait_cnt != 0xf) + vmWaitCnt = vm_wait_cnt; + + if (exp_wait_cnt != 0x7) + expWaitCnt = exp_wait_cnt; + + if (lgkm_wait_cnt != 0x1f) + lgkmWaitCnt = lgkm_wait_cnt; } -VectorMask -Wavefront::execMask() const +void +Wavefront::clearWaitCnts() { - return reconvergenceStack.back()->execMask; + // reset the waitcnts back to + // -1, indicating they are no + // longer valid + vmWaitCnt = -1; + expWaitCnt = -1; + lgkmWaitCnt = -1; + + // resume running normally + status = S_RUNNING; } -bool -Wavefront::execMask(int lane) const +Addr +Wavefront::pc() const { - return reconvergenceStack.back()->execMask[lane]; + return _pc; } - void -Wavefront::pc(uint32_t new_pc) +Wavefront::pc(Addr new_pc) { - reconvergenceStack.back()->pc = new_pc; + _pc = new_pc; } -uint32_t -Wavefront::getStaticContextSize() const +VectorMask& +Wavefront::execMask() { - return barCnt.size() * sizeof(int) + sizeof(wfId) + sizeof(maxBarCnt) + - sizeof(oldBarrierCnt) + sizeof(barrierCnt) + sizeof(wgId) + - sizeof(computeUnit->cu_id) + sizeof(barrierId) + sizeof(initMask) + - sizeof(privBase) + sizeof(spillBase) + sizeof(ldsChunk) + - computeUnit->wfSize() * sizeof(ReconvergenceStackEntry); + return _execMask; } -void -Wavefront::getContext(const void *out) -{ - uint8_t *iter = (uint8_t *)out; - for (int i = 0; i < barCnt.size(); i++) { - *(int *)iter = barCnt[i]; iter += sizeof(barCnt[i]); - } - *(int *)iter = wfId; iter += sizeof(wfId); - *(int *)iter = maxBarCnt; iter += 
sizeof(maxBarCnt); - *(int *)iter = oldBarrierCnt; iter += sizeof(oldBarrierCnt); - *(int *)iter = barrierCnt; iter += sizeof(barrierCnt); - *(int *)iter = computeUnit->cu_id; iter += sizeof(computeUnit->cu_id); - *(uint32_t *)iter = wgId; iter += sizeof(wgId); - *(uint32_t *)iter = barrierId; iter += sizeof(barrierId); - *(uint64_t *)iter = initMask.to_ullong(); iter += sizeof(initMask.to_ullong()); - *(Addr *)iter = privBase; iter += sizeof(privBase); - *(Addr *)iter = spillBase; iter += sizeof(spillBase); - - int stackSize = reconvergenceStack.size(); - ReconvergenceStackEntry empty = {std::numeric_limits::max(), - std::numeric_limits::max(), - std::numeric_limits::max()}; - for (int i = 0; i < workItemId[0].size(); i++) { - if (i < stackSize) { - *(ReconvergenceStackEntry *)iter = *reconvergenceStack.back(); - iter += sizeof(ReconvergenceStackEntry); - reconvergenceStack.pop_back(); - } else { - *(ReconvergenceStackEntry *)iter = empty; - iter += sizeof(ReconvergenceStackEntry); - } - } - - int wf_size = computeUnit->wfSize(); - for (int i = 0; i < maxSpVgprs; i++) { - uint32_t vgprIdx = remap(i, sizeof(uint32_t), 1); - for (int lane = 0; lane < wf_size; lane++) { - uint32_t regVal = computeUnit->vrf[simdId]-> - read(vgprIdx,lane); - *(uint32_t *)iter = regVal; iter += sizeof(regVal); - } - } - - for (int i = 0; i < maxDpVgprs; i++) { - uint32_t vgprIdx = remap(i, sizeof(uint64_t), 1); - for (int lane = 0; lane < wf_size; lane++) { - uint64_t regVal = computeUnit->vrf[simdId]-> - read(vgprIdx,lane); - *(uint64_t *)iter = regVal; iter += sizeof(regVal); - } - } - - for (int i = 0; i < condRegState->numRegs(); i++) { - for (int lane = 0; lane < wf_size; lane++) { - uint64_t regVal = condRegState->read(i, lane); - *(uint64_t *)iter = regVal; iter += sizeof(regVal); - } - } - - /* saving LDS content */ - if (ldsChunk) - for (int i = 0; i < ldsChunk->size(); i++) { - char val = ldsChunk->read(i); - *(char *) iter = val; iter += sizeof(val); - } +bool +Wavefront::execMask(int lane) const +{ + return _execMask[lane]; } void -Wavefront::setContext(const void *in) -{ - uint8_t *iter = (uint8_t *)in; - for (int i = 0; i < barCnt.size(); i++) { - barCnt[i] = *(int *)iter; iter += sizeof(barCnt[i]); - } - wfId = *(int *)iter; iter += sizeof(wfId); - maxBarCnt = *(int *)iter; iter += sizeof(maxBarCnt); - oldBarrierCnt = *(int *)iter; iter += sizeof(oldBarrierCnt); - barrierCnt = *(int *)iter; iter += sizeof(barrierCnt); - computeUnit->cu_id = *(int *)iter; iter += sizeof(computeUnit->cu_id); - wgId = *(uint32_t *)iter; iter += sizeof(wgId); - barrierId = *(uint32_t *)iter; iter += sizeof(barrierId); - initMask = VectorMask(*(uint64_t *)iter); iter += sizeof(initMask); - privBase = *(Addr *)iter; iter += sizeof(privBase); - spillBase = *(Addr *)iter; iter += sizeof(spillBase); - - for (int i = 0; i < workItemId[0].size(); i++) { - ReconvergenceStackEntry newEntry = *(ReconvergenceStackEntry *)iter; - iter += sizeof(ReconvergenceStackEntry); - if (newEntry.pc != std::numeric_limits::max()) { - pushToReconvergenceStack(newEntry.pc, newEntry.rpc, - newEntry.execMask); - } - } - int wf_size = computeUnit->wfSize(); - - for (int i = 0; i < maxSpVgprs; i++) { - uint32_t vgprIdx = remap(i, sizeof(uint32_t), 1); - for (int lane = 0; lane < wf_size; lane++) { - uint32_t regVal = *(uint32_t *)iter; iter += sizeof(regVal); - computeUnit->vrf[simdId]->write(vgprIdx, regVal, lane); - } - } - - for (int i = 0; i < maxDpVgprs; i++) { - uint32_t vgprIdx = remap(i, sizeof(uint64_t), 1); - for (int lane = 0; lane < 
wf_size; lane++) { - uint64_t regVal = *(uint64_t *)iter; iter += sizeof(regVal); - computeUnit->vrf[simdId]->write(vgprIdx, regVal, lane); - } +Wavefront::freeRegisterFile() +{ + /* clear busy registers */ + for (int i=0; i < maxVgprs; i++) { + int vgprIdx = computeUnit->registerManager->mapVgpr(this, i); + computeUnit->vrf[simdId]->markReg(vgprIdx, false); } - for (int i = 0; i < condRegState->numRegs(); i++) { - for (int lane = 0; lane < wf_size; lane++) { - uint64_t regVal = *(uint64_t *)iter; iter += sizeof(regVal); - condRegState->write(i, lane, regVal); - } - } - /** Restoring LDS contents */ - if (ldsChunk) - for (int i = 0; i < ldsChunk->size(); i++) { - char val = *(char *) iter; iter += sizeof(val); - ldsChunk->write(i, val); - } + /* Free registers used by this wavefront */ + uint32_t endIndex = (startVgprIndex + reservedVectorRegs - 1) % + computeUnit->vrf[simdId]->numRegs(); + computeUnit->registerManager->vrfPoolMgrs[simdId]-> + freeRegion(startVgprIndex, endIndex); } void -Wavefront::computeActualWgSz(NDRange *ndr) +Wavefront::computeActualWgSz(HSAQueueEntry *task) { actualWgSzTotal = 1; - for (int d = 0; d < 3; ++d) { - actualWgSz[d] = std::min(workGroupSz[d], - gridSz[d] - ndr->wgId[d] * workGroupSz[d]); + for (int d = 0; d < HSAQueueEntry::MAX_DIM; ++d) { + actualWgSz[d] = std::min(workGroupSz[d], gridSz[d] + - task->wgId(d) * workGroupSz[d]); actualWgSzTotal *= actualWgSz[d]; } } diff --git a/src/gpu-compute/wavefront.hh b/src/gpu-compute/wavefront.hh index 9e73f1060..451e5dfcb 100644 --- a/src/gpu-compute/wavefront.hh +++ b/src/gpu-compute/wavefront.hh @@ -31,161 +31,116 @@ * POSSIBILITY OF SUCH DAMAGE. */ -#ifndef __WAVEFRONT_HH__ -#define __WAVEFRONT_HH__ +#ifndef __GPU_COMPUTE_WAVEFRONT_HH__ +#define __GPU_COMPUTE_WAVEFRONT_HH__ #include #include +#include #include -#include +#include #include #include "arch/gpu_isa.hh" #include "base/logging.hh" #include "base/types.hh" #include "config/the_gpu_isa.hh" -#include "gpu-compute/condition_register_state.hh" +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/dispatcher.hh" +#include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/hsa_queue_entry.hh" #include "gpu-compute/lds_state.hh" #include "gpu-compute/misc.hh" -#include "gpu-compute/ndrange.hh" #include "params/Wavefront.hh" #include "sim/sim_object.hh" -static const int MAX_NUM_INSTS_PER_WF = 12; - -/** - * A reconvergence stack entry conveys the necessary state to implement - * control flow divergence. - */ -struct ReconvergenceStackEntry { - /** - * PC of current instruction. - */ - uint32_t pc; - /** - * PC of the immediate post-dominator instruction, i.e., the value of - * @a pc for the first instruction that will be executed by the wavefront - * when a reconvergence point is reached. - */ - uint32_t rpc; - /** - * Execution mask. - */ - VectorMask execMask; -}; - -/* - * Arguments for the hsail opcode call, are user defined and variable length. - * The hardware/finalizer can support arguments in hardware or use memory to - * pass arguments. For now, let's assume that an unlimited number of arguments - * are supported in hardware (the compiler inlines functions whenver it can - * anyways, so unless someone is interested in the implications of linking/ - * library functions, I think this is a reasonable assumption given the typical - * size of an OpenCL kernel). 
- * - * Note that call args are different than kernel arguments: - * * All work-items in a kernel refer the same set of kernel arguments - * * Each work-item has it's on set of call args. So a call argument at - * address 0x4 is different for work-item 0 and work-item 1. - * - * Ok, the table below shows an example of how we organize the call arguments in - * the CallArgMem class. - * - * int foo(int arg1, double arg2) - * ___________________________________________________ - * | 0: return.0 | 4: return.1 | ... | 252: return.63 | - * |---------------------------------------------------| - * | 256: arg1.0 | 260: arg1.1 | ... | 508: arg1.63 | - * |---------------------------------------------------| - * | 512: arg2.0 | 520: arg2.1 | ... | 1016: arg2.63 | - * ___________________________________________________ - */ -class CallArgMem -{ - public: - // pointer to buffer for storing function arguments - uint8_t *mem; - int wfSize; - // size of function args - int funcArgsSizePerItem; - - template - int - getLaneOffset(int lane, int addr) - { - return addr * wfSize + sizeof(CType) * lane; - } - - CallArgMem(int func_args_size_per_item, int wf_size) - : wfSize(wf_size), funcArgsSizePerItem(func_args_size_per_item) - { - mem = (uint8_t*)malloc(funcArgsSizePerItem * wfSize); - } - - ~CallArgMem() - { - free(mem); - } - - template - uint8_t* - getLaneAddr(int lane, int addr) - { - return mem + getLaneOffset(lane, addr); - } - - template - void - setLaneAddr(int lane, int addr, CType val) - { - *((CType*)(mem + getLaneOffset(lane, addr))) = val; - } -}; - class Wavefront : public SimObject { public: - enum itype_e {I_ALU,I_GLOBAL,I_SHARED,I_FLAT,I_PRIVATE}; - enum status_e {S_STOPPED,S_RETURNING,S_RUNNING}; - - // Base pointer for array of instruction pointers - uint64_t basePtr; + enum status_e { + // wavefront is stalled + S_STOPPED, + // wavefront is returning from a kernel + S_RETURNING, + // wavefront is running normally + S_RUNNING, + // wavefront is stalled + S_STALLED, + /** + * wavefront has unsatisfied wait counts + * + * while in this state the WF will only execute if + * the oldest instruction is the waitcnt. while in + * S_WAITCNT, the wavefront will not be ready until + * all of its waitcnts have been satisfied. the + * scoreboard ready() function will check the status + * of the waitcnts whenever the WF is in S_WAITCNT, + * and once they are satisfied, it will resume normal + * operation. 
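+         * (the counts themselves are held in vmWaitCnt, expWaitCnt, and
+         * lgkmWaitCnt, and are set by setWaitCnts() when the s_waitcnt
+         * instruction itself executes)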
+ */ + S_WAITCNT + }; uint32_t oldBarrierCnt; uint32_t barrierCnt; uint32_t barrierId; uint32_t barrierSlots; - status_e status; // HW slot id where the WF is mapped to inside a SIMD unit - int wfSlotId; + const int wfSlotId; int kernId; // SIMD unit where the WV has been scheduled - int simdId; + const int simdId; + // id of the execution unit (or pipeline) where the oldest instruction + // of the WF is scheduled + int execUnitId; + int flatLmUnitId; + int flatGmUnitId; // pointer to parent CU ComputeUnit *computeUnit; + int maxIbSize; std::deque instructionBuffer; bool pendingFetch; bool dropFetch; - - // Condition Register State (for HSAIL simulations only) - class ConditionRegisterState *condRegState; - // number of single precision VGPRs required by WF - uint32_t maxSpVgprs; - // number of double precision VGPRs required by WF - uint32_t maxDpVgprs; - // map virtual to physical vector register - uint32_t remap(uint32_t vgprIndex, uint32_t size, uint8_t mode=0); - void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs); + // last tick during which all WFs in the CU are not idle + Tick lastNonIdleTick; + + // Execution unit resource ID's associated with this WF + // These are static mappings set at WF slot construction and + // based off of the simdId and wfSlotId. + + // Index to scalarALUs resource vector in CU + int scalarAlu; + + // Indices into readyList/dispatchList of resources used by this + // wavefront + int scalarAluGlobalIdx; + int globalMem; + int localMem; + int scalarMem; + + // number of VGPRs required by WF + uint32_t maxVgprs; + // number of SGPRs required by WF + uint32_t maxSgprs; + void freeResources(); + GPUDynInstPtr nextInstr(); + void setStatus(status_e newStatus); + status_e getStatus() { return status; } + void resizeRegFiles(int num_vregs, int num_sregs); bool isGmInstruction(GPUDynInstPtr ii); bool isLmInstruction(GPUDynInstPtr ii); + bool isOldestInstWaitcnt(); bool isOldestInstGMem(); bool isOldestInstLMem(); bool isOldestInstPrivMem(); bool isOldestInstFlatMem(); - bool isOldestInstALU(); + bool isOldestInstVectorALU(); + bool isOldestInstScalarALU(); + bool isOldestInstScalarMem(); bool isOldestInstBarrier(); + // used for passing spill address to DDInstGPU std::vector lastAddr; std::vector workItemId[3]; @@ -199,36 +154,44 @@ class Wavefront : public SimObject /* the actual WG size can differ than the maximum size */ uint32_t actualWgSz[3]; uint32_t actualWgSzTotal; - void computeActualWgSz(NDRange *ndr); + void computeActualWgSz(HSAQueueEntry *task); // wavefront id within a workgroup uint32_t wfId; uint32_t maxDynWaveId; uint32_t dispatchId; - // outstanding global+local memory requests - uint32_t outstandingReqs; - // memory requests between scoreboard - // and execute stage not yet executed - uint32_t memReqsInPipe; + // vector and scalar memory requests pending in memory system + int outstandingReqs; // outstanding global memory write requests - uint32_t outstandingReqsWrGm; + int outstandingReqsWrGm; // outstanding local memory write requests - uint32_t outstandingReqsWrLm; + int outstandingReqsWrLm; // outstanding global memory read requests - uint32_t outstandingReqsRdGm; + int outstandingReqsRdGm; // outstanding local memory read requests - uint32_t outstandingReqsRdLm; - uint32_t rdLmReqsInPipe; - uint32_t rdGmReqsInPipe; - uint32_t wrLmReqsInPipe; - uint32_t wrGmReqsInPipe; + int outstandingReqsRdLm; + // outstanding scalar memory read requests + int scalarOutstandingReqsRdGm; + // outstanding scalar memory write requests + int 
scalarOutstandingReqsWrGm; + int rdLmReqsInPipe; + int rdGmReqsInPipe; + int wrLmReqsInPipe; + int wrGmReqsInPipe; + int scalarRdGmReqsInPipe; + int scalarWrGmReqsInPipe; int memTraceBusy; uint64_t lastTrace; - // number of vector registers reserved by WF + // number of virtual vector registers reserved by WF int reservedVectorRegs; + // number of virtual scalar registers reserved by WF + int reservedScalarRegs; // Index into the Vector Register File's namespace where the WF's registers // will live while the WF is executed uint32_t startVgprIndex; + // Index into the Scalar Register File's namespace where the WF's registers + // will live while the WF is executed + uint32_t startSgprIndex; // Old value of destination gpr (for trace) std::vector oldVgpr; @@ -257,64 +220,63 @@ class Wavefront : public SimObject // to this workgroup (thus this wavefront) LdsChunk *ldsChunk; - // A pointer to the spill area - Addr spillBase; - // The size of the spill area - uint32_t spillSizePerItem; - // The vector width of the spill area - uint32_t spillWidth; - - // A pointer to the private memory area - Addr privBase; - // The size of the private memory area - uint32_t privSizePerItem; - - // A pointer ot the read-only memory area - Addr roBase; - // size of the read-only memory area - uint32_t roSize; - - // pointer to buffer for storing kernel arguments - uint8_t *kernelArgs; // unique WF id over all WFs executed across all CUs uint64_t wfDynId; - // number of times instruction issue for this wavefront is blocked - // due to VRF port availability - Stats::Scalar numTimesBlockedDueVrfPortAvail; + // Wavefront slot stats + + // Number of instructions executed by this wavefront slot across all + // dynamic wavefronts + Stats::Scalar numInstrExecuted; + + // Number of cycles this WF spends in SCH stage + Stats::Scalar schCycles; + + // Number of stall cycles encounterd by this WF in SCH stage + Stats::Scalar schStalls; + + // The following stats sum to the value of schStalls, and record, per + // WF slot, what the cause of each stall was at a coarse granularity. + + // Cycles WF is selected by scheduler, but RFs cannot support instruction + Stats::Scalar schRfAccessStalls; + // Cycles spent waiting for execution resources + Stats::Scalar schResourceStalls; + // cycles spent waiting for RF reads to complete in SCH stage + Stats::Scalar schOpdNrdyStalls; + // LDS arbitration stall cycles. WF attempts to execute LM instruction, + // but another wave is executing FLAT, which requires LM and GM and forces + // this WF to stall. 
+ Stats::Scalar schLdsArbStalls; + // number of times an instruction of a WF is blocked from being issued // due to WAR and WAW dependencies Stats::Scalar numTimesBlockedDueWAXDependencies; // number of times an instruction of a WF is blocked from being issued // due to WAR and WAW dependencies Stats::Scalar numTimesBlockedDueRAWDependencies; - // distribution of executed instructions based on their register - // operands; this is used to highlight the load on the VRF - Stats::Distribution srcRegOpDist; - Stats::Distribution dstRegOpDist; - - // Functions to operate on call argument memory - // argument memory for hsail call instruction - CallArgMem *callArgMem; - void - initCallArgMem(int func_args_size_per_item, int wf_size) - { - callArgMem = new CallArgMem(func_args_size_per_item, wf_size); - } - template - CType - readCallArgMem(int lane, int addr) - { - return *((CType*)(callArgMem->getLaneAddr(lane, addr))); - } + // dyn inst id (per SIMD) of last instruction exec from this wave + uint64_t lastInstExec; - template - void - writeCallArgMem(int lane, int addr, CType val) - { - callArgMem->setLaneAddr(lane, addr, val); - } + // Distribution to track the distance between producer and consumer + // for vector register values + Stats::Distribution vecRawDistance; + // Map to track the dyn instruction id of each vector register value + // produced, indexed by physical vector register ID + std::unordered_map rawDist; + + // Distribution to track the number of times every vector register + // value produced is consumed. + Stats::Distribution readsPerWrite; + // Counts the number of reads performed to each physical register + // - counts are reset to 0 for each dynamic wavefront launched + std::vector vecReads; + + void initRegState(HSAQueueEntry *task, int wgSizeInWorkItems); + + // context for save/restore + uint8_t *context; typedef WavefrontParams Params; Wavefront(const Params *p); @@ -327,50 +289,31 @@ class Wavefront : public SimObject computeUnit = cu; } + void validateRequestCounters(); void start(uint64_t _wfDynId, uint64_t _base_ptr); void exec(); - void updateResources(); - int ready(itype_e type); - bool instructionBufferHasBranch(); + // called by SCH stage to reserve + std::vector reserveResources(); + bool stopFetch(); void regStats(); - VectorMask getPred() { return execMask() & initMask; } bool waitingAtBarrier(int lane); - void pushToReconvergenceStack(uint32_t pc, uint32_t rpc, - const VectorMask& exec_mask); - - void popFromReconvergenceStack(); - - uint32_t pc() const; - - uint32_t rpc() const; - - VectorMask execMask() const; + Addr pc() const; + void pc(Addr new_pc); + VectorMask& execMask(); bool execMask(int lane) const; - void pc(uint32_t new_pc); void discardFetch(); - /** - * Returns the size of the static hardware context of a particular wavefront - * This should be updated everytime the context is changed - */ - uint32_t getStaticContextSize() const; + bool waitCntsSatisfied(); + void setWaitCnts(int vm_wait_cnt, int exp_wait_cnt, int lgkm_wait_cnt); + void clearWaitCnts(); - /** - * Returns the hardware context as a stream of bytes - * This method is designed for HSAIL execution - */ - void getContext(const void *out); - - /** - * Sets the hardware context fromt a stream of bytes - * This method is designed for HSAIL execution - */ - void setContext(const void *in); + /** Freeing VRF space */ + void freeRegisterFile(); TheGpuISA::GPUISA& gpuISA() @@ -380,14 +323,32 @@ class Wavefront : public SimObject private: TheGpuISA::GPUISA _gpuISA; + + void 
reserveGmResource(GPUDynInstPtr ii); + void reserveLmResource(GPUDynInstPtr ii); + /** - * Stack containing Control Flow Graph nodes (i.e., kernel instructions) - * to be visited by the wavefront, and the associated execution masks. The - * reconvergence stack grows every time the wavefront reaches a divergence - * point (branch instruction), and shrinks every time the wavefront - * reaches a reconvergence point (immediate post-dominator instruction). + * the following are used for waitcnt instructions + * vmWaitCnt: once set, we wait for the oustanding + * number of vector mem instructions to be + * at, or below vmWaitCnt. + * + * expWaitCnt: once set, we wait for the outstanding + * number outstanding VM writes or EXP + * insts to be at, or below expWaitCnt. + * + * lgkmWaitCnt: once set, we wait for the oustanding + * number of LDS, GDS, scalar memory, + * and message instructions to be at, or + * below lgkmCount. we currently do not + * support GDS/message ops. */ - std::deque> reconvergenceStack; + int vmWaitCnt; + int expWaitCnt; + int lgkmWaitCnt; + status_e status; + Addr _pc; + VectorMask _execMask; }; -#endif // __WAVEFRONT_HH__ +#endif // __GPU_COMPUTE_WAVEFRONT_HH__ diff --git a/src/mem/packet.cc b/src/mem/packet.cc index 1c1da212d..b009cc5f6 100644 --- a/src/mem/packet.cc +++ b/src/mem/packet.cc @@ -86,6 +86,14 @@ MemCmd::commandInfo[] = WriteResp, "WriteReq" }, /* WriteResp */ { SET2(IsWrite, IsResponse), InvalidCmd, "WriteResp" }, + /* WriteCompleteResp - The WriteCompleteResp command is needed + * because in the GPU memory model we use a WriteResp to indicate + * that a write has reached the cache controller so we can free + * resources at the coalescer. Later, when the write succesfully + * completes we send a WriteCompleteResp to the CU so its wait + * counters can be updated. Wait counters in the CU is how memory + * dependences are handled in the GPU ISA. */ + { SET2(IsWrite, IsResponse), InvalidCmd, "WriteCompleteResp" }, /* WritebackDirty */ { SET5(IsWrite, IsRequest, IsEviction, HasData, FromCache), InvalidCmd, "WritebackDirty" }, diff --git a/src/mem/packet.hh b/src/mem/packet.hh index 42d286a5e..4af0d0b1c 100644 --- a/src/mem/packet.hh +++ b/src/mem/packet.hh @@ -83,6 +83,7 @@ class MemCmd ReadRespWithInvalidate, WriteReq, WriteResp, + WriteCompleteResp, WritebackDirty, WritebackClean, WriteClean, // writes dirty data below without evicting diff --git a/src/mem/ruby/protocol/GPU_VIPER-TCP.sm b/src/mem/ruby/protocol/GPU_VIPER-TCP.sm index 9dffe0f2c..4047dc689 100644 --- a/src/mem/ruby/protocol/GPU_VIPER-TCP.sm +++ b/src/mem/ruby/protocol/GPU_VIPER-TCP.sm @@ -298,9 +298,7 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe); } else { if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.LineAddress)) { - if (in_msg.segment == HSASegment:SPILL) { - trigger(Event:StoreLocal, in_msg.LineAddress, cache_entry, tbe); - } else if (WB) { + if (WB) { trigger(Event:Store, in_msg.LineAddress, cache_entry, tbe); } else { trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe); diff --git a/src/mem/ruby/protocol/GPU_VIPER-msg.sm b/src/mem/ruby/protocol/GPU_VIPER-msg.sm new file mode 100644 index 000000000..124ebbeda --- /dev/null +++ b/src/mem/ruby/protocol/GPU_VIPER-msg.sm @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2020 Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +structure (GPUCoalescer, external = "yes") { + void readCallback(Addr, DataBlock); + void readCallback(Addr, MachineType, DataBlock); + void readCallback(Addr, MachineType, DataBlock, + Cycles, Cycles, Cycles); + void readCallback(Addr, MachineType, DataBlock, + Cycles, Cycles, Cycles, bool); + void writeCallback(Addr, DataBlock); + void writeCallback(Addr, MachineType, DataBlock); + void writeCallback(Addr, MachineType, DataBlock, + Cycles, Cycles, Cycles); + void writeCallback(Addr, MachineType, DataBlock, + Cycles, Cycles, Cycles, bool); + void evictionCallback(Addr); + void recordCPReadCallBack(MachineID, MachineID); + void recordCPWriteCallBack(MachineID, MachineID); +} + +structure (VIPERCoalescer, external = "yes") { + void readCallback(Addr, DataBlock); + void readCallback(Addr, MachineType, DataBlock); + void readCallback(Addr, MachineType, DataBlock, + Cycles, Cycles, Cycles); + void readCallback(Addr, MachineType, DataBlock, + Cycles, Cycles, Cycles, bool); + void writeCallback(Addr, DataBlock); + void writeCallback(Addr, MachineType, DataBlock); + void writeCallback(Addr, MachineType, DataBlock, + Cycles, Cycles, Cycles); + void writeCallback(Addr, MachineType, DataBlock, + Cycles, Cycles, Cycles, bool); + void invCallback(Addr); + void wbCallback(Addr); + void evictionCallback(Addr); +} diff --git a/src/mem/ruby/protocol/GPU_VIPER.slicc b/src/mem/ruby/protocol/GPU_VIPER.slicc index 45f7f3477..55ed6710a 100644 --- a/src/mem/ruby/protocol/GPU_VIPER.slicc +++ b/src/mem/ruby/protocol/GPU_VIPER.slicc @@ -3,6 +3,7 @@ include "RubySlicc_interfaces.slicc"; include "MOESI_AMD_Base-msg.sm"; include "MOESI_AMD_Base-dir.sm"; include "MOESI_AMD_Base-CorePair.sm"; +include "GPU_VIPER-msg.sm"; include "GPU_VIPER-TCP.sm"; include "GPU_VIPER-SQC.sm"; include "GPU_VIPER-TCC.sm"; diff --git a/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm b/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm index a1e751180..f4f50cb32 100644 --- 
a/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm +++ b/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm @@ -135,7 +135,6 @@ structure(CPURequestMsg, desc="...", interface="Message") { CoherenceRequestType OriginalType, default="CoherenceRequestType_NA", desc="Type of request from core fwded through region buffer"; WriteMask writeMask, desc="Write Through Data"; MachineID WTRequestor, desc="Node who initiated the write through"; - HSAScope scope, default="HSAScope_SYSTEM", desc="Request Scope"; int wfid, default="0", desc="wavefront id"; bool NoWriteConflict, default="true", desc="write collided with CAB entry"; int ProgramCounter, desc="PC that accesses to this block"; diff --git a/src/mem/ruby/protocol/RubySlicc_Exports.sm b/src/mem/ruby/protocol/RubySlicc_Exports.sm index 08d30cfee..f1d17c85e 100644 --- a/src/mem/ruby/protocol/RubySlicc_Exports.sm +++ b/src/mem/ruby/protocol/RubySlicc_Exports.sm @@ -103,26 +103,6 @@ enumeration(AccessPermission, desc="...", default="AccessPermission_NotPresent") NotPresent, desc="block is NotPresent"; Busy, desc="block is in a transient state, currently invalid"; } -//HSA scopes -enumeration(HSAScope, desc="...", default="HSAScope_UNSPECIFIED") { - UNSPECIFIED, desc="Unspecified scope"; - NOSCOPE, desc="Explictly unscoped"; - WAVEFRONT, desc="Wavefront scope"; - WORKGROUP, desc="Workgroup scope"; - DEVICE, desc="Device scope"; - SYSTEM, desc="System scope"; -} - -// HSA segment types -enumeration(HSASegment, desc="...", default="HSASegment_GLOBAL") { - GLOBAL, desc="Global segment"; - GROUP, desc="Group segment"; - PRIVATE, desc="Private segment"; - KERNARG, desc="Kernarg segment"; - READONLY, desc="Readonly segment"; - SPILL, desc="Spill segment"; - ARG, desc="Arg segment"; -} // TesterStatus enumeration(TesterStatus, desc="...") { diff --git a/src/mem/ruby/protocol/RubySlicc_Types.sm b/src/mem/ruby/protocol/RubySlicc_Types.sm index b59cf9717..76c45b9b0 100644 --- a/src/mem/ruby/protocol/RubySlicc_Types.sm +++ b/src/mem/ruby/protocol/RubySlicc_Types.sm @@ -138,42 +138,6 @@ structure (Sequencer, external = "yes") { bool checkResourceAvailable(CacheResourceType, Addr); } -structure (GPUCoalescer, external = "yes") { - void readCallback(Addr, DataBlock); - void readCallback(Addr, MachineType, DataBlock); - void readCallback(Addr, MachineType, DataBlock, - Cycles, Cycles, Cycles); - void readCallback(Addr, MachineType, DataBlock, - Cycles, Cycles, Cycles, bool); - void writeCallback(Addr, DataBlock); - void writeCallback(Addr, MachineType, DataBlock); - void writeCallback(Addr, MachineType, DataBlock, - Cycles, Cycles, Cycles); - void writeCallback(Addr, MachineType, DataBlock, - Cycles, Cycles, Cycles, bool); - void evictionCallback(Addr); - void recordCPReadCallBack(MachineID, MachineID); - void recordCPWriteCallBack(MachineID, MachineID); -} - -structure (VIPERCoalescer, external = "yes") { - void readCallback(Addr, DataBlock); - void readCallback(Addr, MachineType, DataBlock); - void readCallback(Addr, MachineType, DataBlock, - Cycles, Cycles, Cycles); - void readCallback(Addr, MachineType, DataBlock, - Cycles, Cycles, Cycles, bool); - void writeCallback(Addr, DataBlock); - void writeCallback(Addr, MachineType, DataBlock); - void writeCallback(Addr, MachineType, DataBlock, - Cycles, Cycles, Cycles); - void writeCallback(Addr, MachineType, DataBlock, - Cycles, Cycles, Cycles, bool); - void invCallback(Addr); - void wbCallback(Addr); - void evictionCallback(Addr); -} - structure(RubyRequest, desc="...", interface="Message", external="yes") { Addr LineAddress, 
desc="Line address for this request"; Addr PhysicalAddress, desc="Physical address for this request"; @@ -186,8 +150,6 @@ structure(RubyRequest, desc="...", interface="Message", external="yes") { WriteMask writeMask, desc="Writethrough mask"; DataBlock WTData, desc="Writethrough data block"; int wfid, desc="Writethrough wavefront"; - HSAScope scope, desc="HSA scope"; - HSASegment segment, desc="HSA segment"; PacketPtr pkt, desc="Packet associated with this request"; } diff --git a/src/mem/ruby/slicc_interface/AbstractController.cc b/src/mem/ruby/slicc_interface/AbstractController.cc index b729d26dd..bdc88b9ef 100644 --- a/src/mem/ruby/slicc_interface/AbstractController.cc +++ b/src/mem/ruby/slicc_interface/AbstractController.cc @@ -43,7 +43,6 @@ #include "debug/RubyQueue.hh" #include "mem/ruby/network/Network.hh" #include "mem/ruby/protocol/MemoryMsg.hh" -#include "mem/ruby/system/GPUCoalescer.hh" #include "mem/ruby/system/RubySystem.hh" #include "mem/ruby/system/Sequencer.hh" #include "sim/system.hh" diff --git a/src/mem/ruby/slicc_interface/RubyRequest.hh b/src/mem/ruby/slicc_interface/RubyRequest.hh index 68b11f55d..29bedfa51 100644 --- a/src/mem/ruby/slicc_interface/RubyRequest.hh +++ b/src/mem/ruby/slicc_interface/RubyRequest.hh @@ -35,8 +35,6 @@ #include "mem/ruby/common/Address.hh" #include "mem/ruby/common/DataBlock.hh" #include "mem/ruby/common/WriteMask.hh" -#include "mem/ruby/protocol/HSAScope.hh" -#include "mem/ruby/protocol/HSASegment.hh" #include "mem/ruby/protocol/Message.hh" #include "mem/ruby/protocol/PrefetchBit.hh" #include "mem/ruby/protocol/RubyAccessMode.hh" diff --git a/src/mem/ruby/system/GPUCoalescer.cc b/src/mem/ruby/system/GPUCoalescer.cc index 0153b4c4b..1eecb82ad 100644 --- a/src/mem/ruby/system/GPUCoalescer.cc +++ b/src/mem/ruby/system/GPUCoalescer.cc @@ -61,58 +61,6 @@ using namespace std; -GPUCoalescer * -RubyGPUCoalescerParams::create() -{ - return new GPUCoalescer(this); -} - -HSAScope -reqScopeToHSAScope(const RequestPtr &req) -{ - HSAScope accessScope = HSAScope_UNSPECIFIED; - if (req->isScoped()) { - if (req->isWavefrontScope()) { - accessScope = HSAScope_WAVEFRONT; - } else if (req->isWorkgroupScope()) { - accessScope = HSAScope_WORKGROUP; - } else if (req->isDeviceScope()) { - accessScope = HSAScope_DEVICE; - } else if (req->isSystemScope()) { - accessScope = HSAScope_SYSTEM; - } else { - fatal("Bad scope type"); - } - } - return accessScope; -} - -HSASegment -reqSegmentToHSASegment(const RequestPtr &req) -{ - HSASegment accessSegment = HSASegment_GLOBAL; - - if (req->isGlobalSegment()) { - accessSegment = HSASegment_GLOBAL; - } else if (req->isGroupSegment()) { - accessSegment = HSASegment_GROUP; - } else if (req->isPrivateSegment()) { - accessSegment = HSASegment_PRIVATE; - } else if (req->isKernargSegment()) { - accessSegment = HSASegment_KERNARG; - } else if (req->isReadonlySegment()) { - accessSegment = HSASegment_READONLY; - } else if (req->isSpillSegment()) { - accessSegment = HSASegment_SPILL; - } else if (req->isArgSegment()) { - accessSegment = HSASegment_ARG; - } else { - fatal("Bad segment type"); - } - - return accessSegment; -} - UncoalescedTable::UncoalescedTable(GPUCoalescer *gc) : coalescer(gc) { @@ -152,6 +100,7 @@ UncoalescedTable::updateResources() { for (auto iter = instMap.begin(); iter != instMap.end(); ) { if (iter->second.empty()) { + DPRINTF(GPUCoalescer, "Returning token seqNum %d\n", iter->first); instMap.erase(iter++); coalescer->getGMTokenPort().sendTokens(1); } else { @@ -160,15 +109,27 @@ 
UncoalescedTable::updateResources() } } +bool +UncoalescedTable::areRequestsDone(const uint64_t instSeqNum) { + // iterate the instructions held in UncoalescedTable to see whether there + // are more requests to issue; if yes, not yet done; otherwise, done + for (auto& inst : instMap) { + DPRINTF(GPUCoalescer, "instSeqNum= %d, pending packets=%d\n" + ,inst.first, inst.second.size()); + if (inst.first == instSeqNum) { return false; } + } + + return true; +} + void UncoalescedTable::printRequestTable(std::stringstream& ss) { - ss << "UncoalescedTable contains " << instMap.size() - << " address entries." << std::endl; + ss << "Listing pending packets from " << instMap.size() << " instructions"; + for (auto& inst : instMap) { - ss << "Addr 0x" << std::hex << inst.first << std::dec - << " with " << inst.second.size() << " packets" - << std::endl; + ss << "\tAddr: " << printAddress(inst.first) << " with " + << inst.second.size() << " pending packets" << std::endl; } } @@ -227,7 +188,6 @@ GPUCoalescer::GPUCoalescer(const Params *p) assert(m_dataCache_ptr); m_runningGarnetStandalone = p->garnet_standalone; - assumingRfOCoherence = p->assume_rfo; } GPUCoalescer::~GPUCoalescer() @@ -254,18 +214,9 @@ GPUCoalescer::wakeup() if (current_time - req->getIssueTime() > m_deadlock_threshold) { std::stringstream ss; printRequestTable(ss); - ss << "Outstanding requests: " << m_outstanding_count - << std::endl; - - panic("Possible Deadlock detected. Aborting!\n" - "version: %d request.paddr: 0x%x coalescedTable: %d " - "current time: %u issue_time: %d difference: %d\n" - "Request Tables:\n %s", m_version, - req->getFirstPkt()->getAddr(), - coalescedTable.size(), cyclesToTicks(current_time), - cyclesToTicks(req->getIssueTime()), - cyclesToTicks(current_time - req->getIssueTime()), - ss.str()); + warn("GPUCoalescer %d Possible deadlock detected!\n%s\n", + m_version, ss.str()); + panic("Aborting due to deadlock!\n"); } } } @@ -283,21 +234,27 @@ GPUCoalescer::wakeup() void GPUCoalescer::printRequestTable(std::stringstream& ss) { - uncoalescedTable.printRequestTable(ss); + ss << "Printing out " << coalescedTable.size() + << " outstanding requests in the coalesced table\n"; - ss << "CoalescedTable contains " << coalescedTable.size() - << " address entries." 
<< std::endl; for (auto& requestList : coalescedTable) { - ss << "Addr 0x" << std::hex << requestList.first << std::dec - << ": type-"; for (auto& request : requestList.second) {
- ss << RubyRequestType_to_string(request->getRubyType()) - << " pkts-" << request->getPackets().size() - << " issued-" << request->getIssueTime() << " seqNum-" - << request->getSeqNum() << "; ";
+ ss << "\tAddr: " << printAddress(requestList.first) << "\n" + << "\tInstruction sequence number: " + << request->getSeqNum() << "\n" + << "\t\tType: " + << RubyRequestType_to_string(request->getRubyType()) << "\n" + << "\t\tNumber of associated packets: " + << request->getPackets().size() << "\n" + << "\t\tIssue time: " + << request->getIssueTime() * clockPeriod() << "\n" + << "\t\tDifference from current tick: " + << (curCycle() - request->getIssueTime()) * clockPeriod(); }
- ss << std::endl; } + + // print out packets waiting to be issued in uncoalesced table + uncoalescedTable.printRequestTable(ss); }
void @@ -387,6 +344,7 @@ GPUCoalescer::writeCallback(Addr address, hitCallback(crequest, mach, data, true, crequest->getIssueTime(), forwardRequestTime, firstResponseTime, isRegion);
+ // remove this crequest from the coalescedTable delete crequest; coalescedTable.at(address).pop_front(); @@ -398,6 +356,36 @@ } }
+void +GPUCoalescer::writeCompleteCallback(Addr address, + uint64_t instSeqNum, + MachineType mach) +{
+ DPRINTF(GPUCoalescer, "writeCompleteCallback for address 0x%x" + " instSeqNum = %d\n", address, instSeqNum); +
+ assert(pendingWriteInsts.count(instSeqNum) == 1); + PendingWriteInst& inst = pendingWriteInsts[instSeqNum]; +
+ // check the uncoalescedTable to see whether all requests for the inst + // have been issued or not + bool reqsAllIssued = uncoalescedTable.areRequestsDone(instSeqNum);
+ DPRINTF(GPUCoalescer, "instSeqNum = %d, pendingStores=%d, " + "reqsAllIssued=%d\n", instSeqNum, + inst.getNumPendingStores()-1, reqsAllIssued); +
+ if (inst.receiveWriteCompleteAck() && reqsAllIssued) { + // if the pending write instruction has received all write completion + // callbacks for its issued Ruby requests, we can now respond to + // the requesting CU with one response packet. + inst.ackWriteCompletion(m_usingRubyTester); +
+ DPRINTF(GPUCoalescer, "write inst %d completed at coalescer\n", + instSeqNum); + pendingWriteInsts.erase(instSeqNum); + } +} +
void GPUCoalescer::readCallback(Addr address, DataBlock& data) { @@ -477,7 +465,7 @@ GPUCoalescer::hitCallback(CoalescedRequest* crequest, { PacketPtr pkt = crequest->getFirstPkt(); Addr request_address = pkt->getAddr();
- Addr request_line_address = makeLineAddress(request_address); + Addr request_line_address M5_VAR_USED = makeLineAddress(request_address); RubyRequestType type = crequest->getRubyType();
@@ -516,20 +504,6 @@ GPUCoalescer::hitCallback(CoalescedRequest* crequest, "%s\n", RubyRequestType_to_string(type)); } - - // If using the RubyTester, update the RubyTester sender state's - // subBlock with the recieved data. The tester will later access - // this state. - // Note: RubyPort will access it's sender state before the - // RubyTester.
- if (m_usingRubyTester) { - RubyPort::SenderState *requestSenderState = - safe_cast(pkt->senderState); - RubyTester::SenderState* testerSenderState = - safe_cast - (requestSenderState->predecessor); - testerSenderState->subBlock.mergeFrom(data); - } } @@ -566,8 +540,6 @@ GPUCoalescer::getRequestType(PacketPtr pkt) } else if (pkt->isWrite()) { req_type = RubyRequestType_ST; } else { - // Acquire and release packets will have been issued by - // makeRequest, so we do not need to check for it here. panic("Unsupported ruby packet type\n"); } @@ -579,71 +551,43 @@ GPUCoalescer::getRequestType(PacketPtr pkt) RequestStatus GPUCoalescer::makeRequest(PacketPtr pkt) { - // Check for GPU Barrier Kernel End or Kernel Begin - // Leave these to be handled by the child class - // Kernel End/Barrier = isFlush + isRelease - // Kernel Begin = isFlush + isAcquire - if (pkt->req->isKernel()) { - if (pkt->req->isAcquire()){ - // This is a Kernel Begin leave handling to - // virtual xCoalescer::makeRequest - return RequestStatus_Issued; - }else if (pkt->req->isRelease()) { - // This is a Kernel End leave handling to - // virtual xCoalescer::makeRequest - // If we are here then we didn't call - // a virtual version of this function - // so we will also schedule the callback - int wf_id = 0; - if (pkt->req->hasContextId()) { - wf_id = pkt->req->contextId(); - } - insertKernel(wf_id, pkt); - newKernelEnds.push_back(wf_id); - if (!issueEvent.scheduled()) { - schedule(issueEvent, curTick()); - } - return RequestStatus_Issued; - } - } + // all packets must have valid instruction sequence numbers + assert(pkt->req->hasInstSeqNum()); - if (!pkt->isLLSC() && !pkt->req->isLockedRMW() && !pkt->isAtomicOp() && - !pkt->isRead() && !pkt->isWrite() && !pkt->isFlush() && - (pkt->req->isRelease() || pkt->req->isAcquire())) { - if (assumingRfOCoherence) { - // If we reached here, this request must be a memFence - // and the protocol implements RfO, the coalescer can - // assume sequentially consistency and schedule the callback - // immediately. - // Currently the code implements fence callbacks - // by reusing the mechanism for kernel completions. - // This should be fixed. - int wf_id = 0; - if (pkt->req->hasContextId()) { - wf_id = pkt->req->contextId(); - } - insertKernel(wf_id, pkt); - newKernelEnds.push_back(wf_id); - if (!issueEvent.scheduled()) { - schedule(issueEvent, curTick()); - } - return RequestStatus_Issued; - } else { - // If not RfO, return issued here and let the child coalescer - // take care of it. 
- return RequestStatus_Issued;
+ if (pkt->cmd == MemCmd::MemSyncReq) { + // issue mem_sync requests immediately to the cache system without + // going through the uncoalescedTable like normal LD/ST/Atomic requests + issueMemSyncRequest(pkt);
+ } else { + // otherwise, this must be either a read or a write command + assert(pkt->isRead() || pkt->isWrite()); +
+ // the pkt is temporarily stored in the uncoalesced table until + // it's picked for the coalescing process later in this cycle or in a + // future cycle + uncoalescedTable.insertPacket(pkt);
+ DPRINTF(GPUCoalescer, "Put pkt with addr 0x%X to uncoalescedTable\n", + pkt->getAddr()); +
+ // we schedule an issue event here to process the uncoalesced table + // and try to issue Ruby requests to the cache system + if (!issueEvent.scheduled()) { + schedule(issueEvent, curTick()); } }
- uncoalescedTable.insertPacket(pkt); - DPRINTF(GPUCoalescer, "UC insertPacket 0x%X\n", pkt->getAddr()); - - if (!issueEvent.scheduled()) - schedule(issueEvent, curTick()); - // TODO: issue hardware prefetches here
+ // we always return RequestStatus_Issued in this coalescer + // b/c the coalescer's resources were checked earlier and the coalescer is + // queueing up aliased requests in its coalesced table return RequestStatus_Issued; }
+/** + * TODO: Figure out what to do with this code. This code may go away + * and/or be merged into the VIPER coalescer once the VIPER + * protocol is re-integrated with the GCN3 code. + */
+/* void GPUCoalescer::issueRequest(CoalescedRequest* crequest) { @@ -736,8 +680,8 @@ GPUCoalescer::issueRequest(CoalescedRequest* crequest) } assert(m_mandatory_q_ptr);
- m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency); -} + m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency); +}*/
template std::ostream & @@ -760,12 +704,6 @@ GPUCoalescer::print(ostream& out) const }
-void -GPUCoalescer::recordRequestType(SequencerRequestType requestType) { - DPRINTF(RubyStats, "Recorded statistic: %s\n", - SequencerRequestType_to_string(requestType)); -} -
bool GPUCoalescer::coalescePacket(PacketPtr pkt) { @@ -819,6 +757,41 @@ GPUCoalescer::coalescePacket(PacketPtr pkt) // be counted as outstanding requests. m_outstanding_count++;
+ // We track all issued or to-be-issued Ruby requests associated with + // write instructions. An instruction may have multiple Ruby + // requests.
+ if (pkt->cmd == MemCmd::WriteReq) { + DPRINTF(GPUCoalescer, "adding write inst %d at line 0x%x to" + " the pending write instruction list\n", seqNum, + line_addr); +
+ RubyPort::SenderState* ss = + safe_cast<RubyPort::SenderState*>(pkt->senderState); +
+ // we need to save this port because it will be used to call + // back the requesting CU when we receive write + // complete callbacks for all issued Ruby requests of this + // instruction. + RubyPort::MemSlavePort* mem_slave_port = ss->port; +
+ GPUDynInstPtr gpuDynInst = nullptr; +
+ if (!m_usingRubyTester) { + // If this coalescer is connected to a real CU, we need + // to save the corresponding gpu dynamic instruction. + // CU will use that instruction to decrement wait counters + // in the issuing wavefront.
+ // For Ruby tester, gpuDynInst == nullptr + ComputeUnit::DataPort::SenderState* cu_state = + safe_cast + (ss->predecessor); + gpuDynInst = cu_state->_gpuDynInst; + } + + PendingWriteInst& inst = pendingWriteInsts[seqNum]; + inst.addPendingReq(mem_slave_port, gpuDynInst, m_usingRubyTester); + } + return true; } @@ -907,34 +880,6 @@ GPUCoalescer::atomicCallback(Addr address, } } -void -GPUCoalescer::recordCPReadCallBack(MachineID myMachID, MachineID senderMachID) -{ - if (myMachID == senderMachID) { - CP_TCPLdHits++; - } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) { - CP_TCPLdTransfers++; - } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) { - CP_TCCLdHits++; - } else { - CP_LdMiss++; - } -} - -void -GPUCoalescer::recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID) -{ - if (myMachID == senderMachID) { - CP_TCPStHits++; - } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) { - CP_TCPStTransfers++; - } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) { - CP_TCCStHits++; - } else { - CP_StMiss++; - } -} - void GPUCoalescer::completeHitCallback(std::vector & mylist) { @@ -970,74 +915,6 @@ GPUCoalescer::recordMissLatency(CoalescedRequest* crequest, Cycles firstResponseTime, bool success, bool isRegion) { - RubyRequestType type = crequest->getRubyType(); - Cycles issued_time = crequest->getIssueTime(); - Cycles completion_time = curCycle(); - assert(completion_time >= issued_time); - Cycles total_lat = completion_time - issued_time; - - // cache stats (valid for RfO protocol only) - if (mach == MachineType_TCP) { - if (type == RubyRequestType_LD) { - GPU_TCPLdHits++; - } else { - GPU_TCPStHits++; - } - } else if (mach == MachineType_L1Cache_wCC) { - if (type == RubyRequestType_LD) { - GPU_TCPLdTransfers++; - } else { - GPU_TCPStTransfers++; - } - } else if (mach == MachineType_TCC) { - if (type == RubyRequestType_LD) { - GPU_TCCLdHits++; - } else { - GPU_TCCStHits++; - } - } else { - if (type == RubyRequestType_LD) { - GPU_LdMiss++; - } else { - GPU_StMiss++; - } - } - - // Profile all access latency, even zero latency accesses - m_latencyHist.sample(total_lat); - m_typeLatencyHist[type]->sample(total_lat); - - // Profile the miss latency for all non-zero demand misses - if (total_lat != Cycles(0)) { - m_missLatencyHist.sample(total_lat); - m_missTypeLatencyHist[type]->sample(total_lat); - - if (mach != MachineType_NUM) { - m_missMachLatencyHist[mach]->sample(total_lat); - m_missTypeMachLatencyHist[type][mach]->sample(total_lat); - - if ((issued_time <= initialRequestTime) && - (initialRequestTime <= forwardRequestTime) && - (forwardRequestTime <= firstResponseTime) && - (firstResponseTime <= completion_time)) { - - m_IssueToInitialDelayHist[mach]->sample( - initialRequestTime - issued_time); - m_InitialToForwardDelayHist[mach]->sample( - forwardRequestTime - initialRequestTime); - m_ForwardToFirstResponseDelayHist[mach]->sample( - firstResponseTime - forwardRequestTime); - m_FirstResponseToCompletionDelayHist[mach]->sample( - completion_time - firstResponseTime); - } - } - - } - - DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %d cycles\n", - curTick(), m_version, "Coal", - success ? 
"Done" : "SC_Failed", "", "", - printAddress(crequest->getFirstPkt()->getAddr()), total_lat); } void @@ -1085,74 +962,4 @@ GPUCoalescer::regStats() m_missTypeMachLatencyHist[i][j]->init(10); } } - - // GPU cache stats - GPU_TCPLdHits - .name(name() + ".gpu_tcp_ld_hits") - .desc("loads that hit in the TCP") - ; - GPU_TCPLdTransfers - .name(name() + ".gpu_tcp_ld_transfers") - .desc("TCP to TCP load transfers") - ; - GPU_TCCLdHits - .name(name() + ".gpu_tcc_ld_hits") - .desc("loads that hit in the TCC") - ; - GPU_LdMiss - .name(name() + ".gpu_ld_misses") - .desc("loads that miss in the GPU") - ; - - GPU_TCPStHits - .name(name() + ".gpu_tcp_st_hits") - .desc("stores that hit in the TCP") - ; - GPU_TCPStTransfers - .name(name() + ".gpu_tcp_st_transfers") - .desc("TCP to TCP store transfers") - ; - GPU_TCCStHits - .name(name() + ".gpu_tcc_st_hits") - .desc("stores that hit in the TCC") - ; - GPU_StMiss - .name(name() + ".gpu_st_misses") - .desc("stores that miss in the GPU") - ; - - // CP cache stats - CP_TCPLdHits - .name(name() + ".cp_tcp_ld_hits") - .desc("loads that hit in the TCP") - ; - CP_TCPLdTransfers - .name(name() + ".cp_tcp_ld_transfers") - .desc("TCP to TCP load transfers") - ; - CP_TCCLdHits - .name(name() + ".cp_tcc_ld_hits") - .desc("loads that hit in the TCC") - ; - CP_LdMiss - .name(name() + ".cp_ld_misses") - .desc("loads that miss in the GPU") - ; - - CP_TCPStHits - .name(name() + ".cp_tcp_st_hits") - .desc("stores that hit in the TCP") - ; - CP_TCPStTransfers - .name(name() + ".cp_tcp_st_transfers") - .desc("TCP to TCP store transfers") - ; - CP_TCCStHits - .name(name() + ".cp_tcc_st_hits") - .desc("stores that hit in the TCC") - ; - CP_StMiss - .name(name() + ".cp_st_misses") - .desc("stores that miss in the GPU") - ; } diff --git a/src/mem/ruby/system/GPUCoalescer.hh b/src/mem/ruby/system/GPUCoalescer.hh index 56a207906..789ca308f 100644 --- a/src/mem/ruby/system/GPUCoalescer.hh +++ b/src/mem/ruby/system/GPUCoalescer.hh @@ -38,11 +38,11 @@ #include #include "base/statistics.hh" +#include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/misc.hh" #include "mem/request.hh" #include "mem/ruby/common/Address.hh" #include "mem/ruby/common/Consumer.hh" -#include "mem/ruby/protocol/HSAScope.hh" -#include "mem/ruby/protocol/HSASegment.hh" #include "mem/ruby/protocol/PrefetchBit.hh" #include "mem/ruby/protocol/RubyAccessMode.hh" #include "mem/ruby/protocol/RubyRequestType.hh" @@ -57,9 +57,6 @@ class CacheMemory; class RubyGPUCoalescerParams; -HSAScope reqScopeToHSAScope(const RequestPtr &req); -HSASegment reqSegmentToHSASegment(const RequestPtr &req); - // List of packets that belongs to a specific instruction. typedef std::list PerInstPackets; @@ -78,6 +75,7 @@ class UncoalescedTable // instructions at the offset. PerInstPackets* getInstPackets(int offset); void updateResources(); + bool areRequestsDone(const uint64_t instSeqNum); // Check if a packet hasn't been removed from instMap in too long. // Panics if a deadlock is detected and returns nothing otherwise. @@ -120,6 +118,86 @@ class CoalescedRequest std::vector pkts; }; +// PendingWriteInst tracks the number of outstanding Ruby requests +// per write instruction. Once all requests associated with one instruction +// are completely done in Ruby, we call back the requester to mark +// that this instruction is complete. 
+class PendingWriteInst +{ + public: + PendingWriteInst() + : numPendingStores(0), + originalPort(nullptr), + gpuDynInstPtr(nullptr) + {} + + ~PendingWriteInst() + {} + + void + addPendingReq(RubyPort::MemSlavePort* port, GPUDynInstPtr inst, + bool usingRubyTester) + { + assert(port); + originalPort = port; + + if (!usingRubyTester) { + gpuDynInstPtr = inst; + } + + numPendingStores++; + } + + // return true if no more ack is expected + bool + receiveWriteCompleteAck() + { + assert(numPendingStores > 0); + numPendingStores--; + return (numPendingStores == 0) ? true : false; + } + + // ack the original requester that this write instruction is complete + void + ackWriteCompletion(bool usingRubyTester) + { + assert(numPendingStores == 0); + + // make a response packet + PacketPtr pkt = new Packet(std::make_shared(), + MemCmd::WriteCompleteResp); + + if (!usingRubyTester) { + assert(gpuDynInstPtr); + ComputeUnit::DataPort::SenderState* ss = + new ComputeUnit::DataPort::SenderState + (gpuDynInstPtr, 0, nullptr); + pkt->senderState = ss; + } + + // send the ack response to the requester + originalPort->sendTimingResp(pkt); + } + + int + getNumPendingStores() { + return numPendingStores; + } + + private: + // the number of stores waiting for writeCompleteCallback + int numPendingStores; + // The original port that sent one of packets associated with this + // write instruction. We may have more than one packet per instruction, + // which implies multiple ports per instruction. However, we need + // only 1 of the ports to call back the CU. Therefore, here we keep + // track the port that sent the first packet of this instruction. + RubyPort::MemSlavePort* originalPort; + // similar to the originalPort, this gpuDynInstPtr is set only for + // the first packet of this instruction. + GPUDynInstPtr gpuDynInstPtr; +}; + class GPUCoalescer : public RubyPort { public: @@ -159,6 +237,17 @@ class GPUCoalescer : public RubyPort void collateStats(); void regStats() override; + // each store request needs two callbacks: + // (1) writeCallback is called when the store is received and processed + // by TCP. This writeCallback does not guarantee the store is actually + // completed at its destination cache or memory. writeCallback helps + // release hardware resources (e.g., its entry in coalescedTable) + // allocated for the store so that subsequent requests will not be + // blocked unnecessarily due to hardware resource constraints. + // (2) writeCompleteCallback is called when the store is fully completed + // at its destination cache or memory. writeCompleteCallback + // guarantees that the store is fully completed. 
This callback + // will decrement hardware counters in CU void writeCallback(Addr address, DataBlock& data); void writeCallback(Addr address, @@ -180,6 +269,10 @@ class GPUCoalescer : public RubyPort Cycles forwardRequestTime, Cycles firstResponseTime); + void writeCompleteCallback(Addr address, + uint64_t instSeqNum, + MachineType mach); + void readCallback(Addr address, DataBlock& data); void readCallback(Addr address, @@ -200,18 +293,12 @@ class GPUCoalescer : public RubyPort Cycles forwardRequestTime, Cycles firstResponseTime, bool isRegion); - /* atomics need their own callback because the data - might be const coming from SLICC */ + void atomicCallback(Addr address, MachineType mach, const DataBlock& data); - void recordCPReadCallBack(MachineID myMachID, MachineID senderMachID); - void recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID); - - // Alternate implementations in VIPER Coalescer - virtual RequestStatus makeRequest(PacketPtr pkt) override; - + RequestStatus makeRequest(PacketPtr pkt) override; int outstandingCount() const override { return m_outstanding_count; } bool @@ -237,7 +324,6 @@ class GPUCoalescer : public RubyPort GMTokenPort& getGMTokenPort() { return gmTokenPort; } - void recordRequestType(SequencerRequestType requestType); Stats::Histogram& getOutstandReqHist() { return m_outstandReqHist; } Stats::Histogram& getLatencyHist() { return m_latencyHist; } @@ -271,15 +357,17 @@ class GPUCoalescer : public RubyPort getFirstResponseToCompletionDelayHist(const MachineType t) const { return *m_FirstResponseToCompletionDelayHist[t]; } - // Changed to protected to enable inheritance by VIPER Coalescer protected: bool tryCacheAccess(Addr addr, RubyRequestType type, Addr pc, RubyAccessMode access_mode, int size, DataBlock*& data_ptr); - // Alternate implementations in VIPER Coalescer - virtual void issueRequest(CoalescedRequest* crequest); - void kernelCallback(int wavfront_id); + // since the two following issue functions are protocol-specific, + // they must be implemented in a derived coalescer + virtual void issueRequest(CoalescedRequest* crequest) = 0; + virtual void issueMemSyncRequest(PacketPtr pkt) = 0; + + void kernelCallback(int wavefront_id); void hitCallback(CoalescedRequest* crequest, MachineType mach, @@ -297,7 +385,6 @@ class GPUCoalescer : public RubyPort bool success, bool isRegion); void completeHitCallback(std::vector & mylist); - virtual RubyRequestType getRequestType(PacketPtr pkt); // Attempt to remove a packet from the uncoalescedTable and coalesce @@ -309,8 +396,6 @@ class GPUCoalescer : public RubyPort EventFunctionWrapper issueEvent; - - // Changed to protected to enable inheritance by VIPER Coalescer protected: int m_max_outstanding_requests; Cycles m_deadlock_threshold; @@ -334,6 +419,11 @@ class GPUCoalescer : public RubyPort // an address, the are serviced in age order. 
std::map<Addr, std::deque<CoalescedRequest*>> coalescedTable;
+ // a map between an instruction sequence number and its PendingWriteInst + // this is used to do a final call back for each write when it is + // completely done in the memory system + std::unordered_map<uint64_t, PendingWriteInst> pendingWriteInsts; +
// Global outstanding request count, across all request tables int m_outstanding_count; bool m_deadlock_check_scheduled;
@@ -350,26 +440,28 @@ EventFunctionWrapper deadlockCheckEvent; bool assumingRfOCoherence;
- // m5 style stats for TCP hit/miss counts - Stats::Scalar GPU_TCPLdHits; - Stats::Scalar GPU_TCPLdTransfers; - Stats::Scalar GPU_TCCLdHits; - Stats::Scalar GPU_LdMiss; -
- Stats::Scalar GPU_TCPStHits; - Stats::Scalar GPU_TCPStTransfers; - Stats::Scalar GPU_TCCStHits; - Stats::Scalar GPU_StMiss; -
- Stats::Scalar CP_TCPLdHits; - Stats::Scalar CP_TCPLdTransfers; - Stats::Scalar CP_TCCLdHits; - Stats::Scalar CP_LdMiss; -
- Stats::Scalar CP_TCPStHits; - Stats::Scalar CP_TCPStTransfers; - Stats::Scalar CP_TCCStHits; - Stats::Scalar CP_StMiss;
+// TODO - Need to update the following stats once the VIPER protocol +// is re-integrated. +// // m5 style stats for TCP hit/miss counts +// Stats::Scalar GPU_TCPLdHits; +// Stats::Scalar GPU_TCPLdTransfers; +// Stats::Scalar GPU_TCCLdHits; +// Stats::Scalar GPU_LdMiss; +//
+// Stats::Scalar GPU_TCPStHits; +// Stats::Scalar GPU_TCPStTransfers; +// Stats::Scalar GPU_TCCStHits; +// Stats::Scalar GPU_StMiss; +//
+// Stats::Scalar CP_TCPLdHits; +// Stats::Scalar CP_TCPLdTransfers; +// Stats::Scalar CP_TCCLdHits; +// Stats::Scalar CP_LdMiss; +//
+// Stats::Scalar CP_TCPStHits; +// Stats::Scalar CP_TCPStTransfers; +// Stats::Scalar CP_TCCStHits; +// Stats::Scalar CP_StMiss;
//! Histogram for number of outstanding requests per cycle. Stats::Histogram m_outstandReqHist;
@@ -394,6 +486,21 @@ class GPUCoalescer : public RubyPort std::vector<Stats::Histogram *> m_ForwardToFirstResponseDelayHist; std::vector<Stats::Histogram *> m_FirstResponseToCompletionDelayHist;
+// TODO - Need to update the following stats once the VIPER protocol +// is re-integrated. +// Stats::Distribution numHopDelays; +// Stats::Distribution tcpToTccDelay; +// Stats::Distribution tccToSdDelay; +// Stats::Distribution sdToSdDelay; +// Stats::Distribution sdToTccDelay; +// Stats::Distribution tccToTcpDelay; +//
+// Stats::Average avgTcpToTcc; +// Stats::Average avgTccToSd; +// Stats::Average avgSdToSd; +// Stats::Average avgSdToTcc; +// Stats::Average avgTccToTcp; +
private: // Token port is used to send/receive tokens to/from GPU's global memory // pipeline across the port boundary. There is one per data
diff --git a/src/mem/ruby/system/GPUCoalescer.py b/src/mem/ruby/system/GPUCoalescer.py index 0335981c0..3345f7f94 100644 --- a/src/mem/ruby/system/GPUCoalescer.py +++ b/src/mem/ruby/system/GPUCoalescer.py @@ -36,6 +36,7 @@ from m5.objects.Sequencer import * class RubyGPUCoalescer(RubyPort): type = 'RubyGPUCoalescer' + abstract = True cxx_class = 'GPUCoalescer' cxx_header = "mem/ruby/system/GPUCoalescer.hh" @@ -44,8 +45,6 @@ class RubyGPUCoalescer(RubyPort): "max requests (incl. 
prefetches) outstanding") max_coalesces_per_cycle = Param.Int(1, "max instructions that can be " \ "coalesced in a single cycle") - assume_rfo = Param.Bool(True, "assume protocol implementes Read for " - "Ownership coherence"); icache = Param.RubyCache("") dcache = Param.RubyCache("") diff --git a/src/mem/ruby/system/VIPERCoalescer.hh b/src/mem/ruby/system/VIPERCoalescer.hh index 78ad2912c..659c9fd34 100644 --- a/src/mem/ruby/system/VIPERCoalescer.hh +++ b/src/mem/ruby/system/VIPERCoalescer.hh @@ -58,7 +58,7 @@ class VIPERCoalescer : public GPUCoalescer VIPERCoalescer(const Params *); ~VIPERCoalescer(); - void issueMemSyncRequest(PacketPtr pkt); + void issueMemSyncRequest(PacketPtr pkt) override; void issueRequest(CoalescedRequest* crequest) override; void wbCallback(Addr address); void invCallback(Addr address); diff --git a/src/mem/ruby/system/VIPERCoalescer.py b/src/mem/ruby/system/VIPERCoalescer.py index d8adb07d0..d4af1be4f 100644 --- a/src/mem/ruby/system/VIPERCoalescer.py +++ b/src/mem/ruby/system/VIPERCoalescer.py @@ -39,4 +39,3 @@ class VIPERCoalescer(RubyGPUCoalescer): cxx_header = "mem/ruby/system/VIPERCoalescer.hh" max_inv_per_cycle = Param.Int(32, "max invalidations per cycle") max_wb_per_cycle = Param.Int(32, "max writebacks per cycle") - assume_rfo = False
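
The waitcnt fields added to src/gpu-compute/wavefront.hh above gate instruction issue on outstanding memory operations. The following is a minimal, illustrative sketch of that check; the outstanding-count members are hypothetical names, not the gem5 Wavefront interface, and a value of -1 models "no waitcnt pending" for that class of operations.

// Illustrative only: hypothetical names, not the gem5 Wavefront API.
struct WaitCntSketch
{
    int vmWaitCnt = -1;    // vector memory ops
    int expWaitCnt = -1;   // exports / vector memory writes
    int lgkmWaitCnt = -1;  // LDS, GDS, scalar memory, message ops

    int outstandingVmem = 0;  // counts maintained as requests issue/return
    int outstandingExp = 0;
    int outstandingLgkm = 0;

    // An instruction stalled on s_waitcnt may issue only once every pending
    // count has dropped to, or below, its programmed threshold.
    bool waitCntsSatisfied() const
    {
        if (vmWaitCnt != -1 && outstandingVmem > vmWaitCnt) return false;
        if (expWaitCnt != -1 && outstandingExp > expWaitCnt) return false;
        if (lgkmWaitCnt != -1 && outstandingLgkm > lgkmWaitCnt) return false;
        return true;
    }

    void clearWaitCnts() { vmWaitCnt = expWaitCnt = lgkmWaitCnt = -1; }
};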
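
The WriteCompleteResp command and the PendingWriteInst bookkeeping added above implement a two-phase store acknowledgment: a WriteResp frees coalescer resources, while a later WriteCompleteResp releases the wavefront's wait counters once every Ruby request of the instruction has completed. The sketch below models only that counting logic with stand-in names; it is not the patch's GPUCoalescer code.

// Simplified, hypothetical model of per-instruction write completion
// tracking: a write instruction may fan out into several Ruby requests,
// and the CU is acked only after the last one reports completion.
#include <cassert>
#include <cstdint>
#include <iostream>
#include <unordered_map>

class WritePhaseTracker
{
  public:
    // called once per Ruby request issued for this instruction
    void addPendingReq(uint64_t instSeqNum) { pending[instSeqNum]++; }

    // called on each write completion; returns true when the instruction
    // is fully complete and a single response can be sent to the CU
    bool receiveWriteCompleteAck(uint64_t instSeqNum)
    {
        auto it = pending.find(instSeqNum);
        assert(it != pending.end() && it->second > 0);
        if (--it->second == 0) {
            pending.erase(it);
            return true;
        }
        return false;
    }

  private:
    std::unordered_map<uint64_t, int> pending;
};

int main()
{
    WritePhaseTracker tracker;
    tracker.addPendingReq(42);
    tracker.addPendingReq(42);  // two cache-line requests for one store
    std::cout << tracker.receiveWriteCompleteAck(42) << "\n";  // prints 0
    std::cout << tracker.receiveWriteCompleteAck(42) << "\n";  // prints 1
    return 0;
}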
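
GPUCoalescer::makeRequest, as rewritten above, sends memory-sync packets straight to the cache system and buffers loads and stores per instruction sequence number until the issue event coalesces them. A simplified, hypothetical model of that routing decision is sketched below; the types and names are stand-ins with no gem5 dependencies.

#include <cstdint>
#include <deque>
#include <functional>
#include <map>
#include <utility>

// Stand-in types; not gem5 classes.
enum class PktKind { Read, Write, MemSync };
struct PktSketch { uint64_t instSeqNum; PktKind kind; };

class CoalescerSketch
{
  public:
    explicit CoalescerSketch(std::function<void(const PktSketch&)> syncIssuer)
        : issueMemSync(std::move(syncIssuer)) {}

    // Mirrors the routing policy: sync ops bypass coalescing, loads/stores
    // are buffered by instruction sequence number for a later issue pass.
    void makeRequest(const PktSketch &pkt)
    {
        if (pkt.kind == PktKind::MemSync) {
            issueMemSync(pkt);           // straight to the cache system
        } else {
            uncoalesced[pkt.instSeqNum].push_back(pkt);
            needIssuePass = true;        // models scheduling the issue event
        }
    }

    // Models the issue event: drain buffered packets instruction by
    // instruction and hand them to the protocol-specific issue function.
    template <typename IssueFn>
    void issuePass(IssueFn issue)
    {
        for (auto &entry : uncoalesced)
            for (auto &pkt : entry.second)
                issue(pkt);
        uncoalesced.clear();
        needIssuePass = false;
    }

  private:
    std::function<void(const PktSketch&)> issueMemSync;
    std::map<uint64_t, std::deque<PktSketch>> uncoalesced;
    bool needIssuePass = false;
};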