From 024f978cff6389dd68330b8fa3299767fecd6320 Mon Sep 17 00:00:00 2001 From: Xianwei Zhang Date: Thu, 28 Jun 2018 02:34:41 -0400 Subject: [PATCH] gpu-compute: enable kernel-end WB functionality Change-Id: Ib17e1d700586d1aa04d408e7b924270f0de82efe Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29938 Maintainer: Anthony Gutierrez Tested-by: kokoro Reviewed-by: Xianwei Zhang --- src/gpu-compute/compute_unit.cc | 28 +++++++++++++++------------- src/gpu-compute/shader.cc | 10 +++++----- src/mem/request.hh | 7 +++++++ 3 files changed, 27 insertions(+), 18 deletions(-) diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc index b0616d677..178fd6e96 100644 --- a/src/gpu-compute/compute_unit.cc +++ b/src/gpu-compute/compute_unit.cc @@ -1218,23 +1218,25 @@ ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst, schedule(mem_req_event, curTick() + req_tick_latency); } else { - assert(gpuDynInst->isEndOfKernel()); + // kernel end release must be enabled + assert(shader->impl_kern_end_rel); + assert(gpuDynInst->isEndOfKernel()); - req->setCacheCoherenceFlags(Request::RELEASE); - req->setReqInstSeqNum(gpuDynInst->seqNum()); - req->setFlags(Request::KERNEL); - pkt = new Packet(req, MemCmd::MemSyncReq); - pkt->pushSenderState( - new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr)); + req->setCacheCoherenceFlags(Request::WB_L2); + req->setReqInstSeqNum(gpuDynInst->seqNum()); + req->setFlags(Request::KERNEL); + pkt = new Packet(req, MemCmd::MemSyncReq); + pkt->pushSenderState( + new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr)); - EventFunctionWrapper *mem_req_event = - memPort[0]->createMemReqEvent(pkt); + EventFunctionWrapper *mem_req_event = + memPort[0]->createMemReqEvent(pkt); - DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling " - "a release\n", cu_id, gpuDynInst->simdId, - gpuDynInst->wfSlotId, 0, pkt->req->getPaddr()); + DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling " + "a release\n", cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId, 0, pkt->req->getPaddr()); - schedule(mem_req_event, curTick() + req_tick_latency); + schedule(mem_req_event, curTick() + req_tick_latency); } } else { gpuDynInst->setRequestFlags(req); diff --git a/src/gpu-compute/shader.cc b/src/gpu-compute/shader.cc index f5e944471..59ce23971 100644 --- a/src/gpu-compute/shader.cc +++ b/src/gpu-compute/shader.cc @@ -223,11 +223,11 @@ Shader::prepareFlush(GPUDynInstPtr gpuDynInst){ // flush has never been started, performed only once at kernel end assert(_dispatcher.getOutstandingWbs(kernId) == 0); - // iterate all cus, managed by the shader, to perform flush. - for (int i_cu = 0; i_cu < n_cu; ++i_cu) { - _dispatcher.updateWbCounter(kernId, +1); - cuList[i_cu]->doFlush(gpuDynInst); - } + // the first cu, managed by the shader, performs flush operation, + // assuming that L2 cache is shared by all cus in the shader + int i_cu = 0; + _dispatcher.updateWbCounter(kernId, +1); + cuList[i_cu]->doFlush(gpuDynInst); } bool diff --git a/src/mem/request.hh b/src/mem/request.hh index 4e0ba974c..718d5fa24 100644 --- a/src/mem/request.hh +++ b/src/mem/request.hh @@ -225,6 +225,9 @@ class Request * See the AMD GCN3 ISA Architecture Manual for more * details. * + * INV_L1: L1 cache invalidation + * WB_L2: L2 cache writeback + * * SLC: System Level Coherent. Accesses are forced to miss in * the L2 cache and are coherent with system memory. * @@ -237,6 +240,10 @@ class Request * between atomic return/no-return operations. */ enum : CacheCoherenceFlagsType { + /** mem_sync_op flags */ + INV_L1 = 0x00000001, + WB_L2 = 0x00000020, + /** user-policy flags */ /** user-policy flags */ SLC_BIT = 0x00000080, GLC_BIT = 0x00000100, -- 2.30.2