From: Tuan Ta
Date: Tue, 12 Jun 2018 20:36:27 +0000 (-0400)
Subject: gpu-compute,mem-ruby: Replace ACQUIRE and RELEASE request flags
X-Git-Tag: develop-gem5-snapshot~515
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=173c1c6eb0b76fcba1312b19067db29ece5357f3;p=gem5.git

gpu-compute,mem-ruby: Replace ACQUIRE and RELEASE request flags

This patch replaces the ACQUIRE and RELEASE request flags, which are
HSA-specific. The ACQUIRE flag becomes INV_L1 in the VIPER protocol, and
the RELEASE flag is removed. Future protocols may support extra cache
coherence flags such as INV_L2 and WB_L2.

Change-Id: I3d60c9d3625c898f4110a12d81742b6822728533
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/32859
Reviewed-by: Jason Lowe-Power
Reviewed-by: Matt Sinclair
Maintainer: Matt Sinclair
Tested-by: kokoro
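For illustration, here is a minimal, self-contained sketch of the new flag
API. MockRequest is a hypothetical stand-in for gem5's Request class; only
the bit values and accessor names mirror the src/mem/request.hh changes
below, everything else is assumed for the sake of a runnable example.

    // Standalone sketch of the new cache-coherence flags. MockRequest is
    // a hypothetical stand-in for gem5's Request class; bit values and
    // accessor names mirror the src/mem/request.hh diff in this patch.
    #include <cassert>
    #include <cstdint>

    using CacheCoherenceFlagsType = std::uint32_t;

    enum : CacheCoherenceFlagsType {
        INV_L1   = 0x00000001,  // L1 cache invalidation (replaces ACQUIRE)
        FLUSH_L2 = 0x00000020,  // L2 cache flush (formerly WB_L2)
        SLC_BIT  = 0x00000080,  // System Level Coherent
        GLC_BIT  = 0x00000100,  // Globally Coherent
    };

    struct MockRequest {
        CacheCoherenceFlagsType ccFlags = 0;

        void setCacheCoherenceFlags(CacheCoherenceFlagsType f) { ccFlags |= f; }
        bool isInvL1() const { return ccFlags & INV_L1; }
        bool isGL2CacheFlush() const { return ccFlags & FLUSH_L2; }
    };

    int main()
    {
        // Kernel launch: invalidate the L1 so the kernel starts clean.
        MockRequest launch;
        launch.setCacheCoherenceFlags(INV_L1);
        assert(launch.isInvL1() && !launch.isGL2CacheFlush());

        // Kernel end: flush dirty lines out of the GPU L2.
        MockRequest end;
        end.setCacheCoherenceFlags(FLUSH_L2);
        assert(end.isGL2CacheFlush());
        return 0;
    }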
---

diff --git a/src/cpu/testers/gpu_ruby_test/gpu_wavefront.cc b/src/cpu/testers/gpu_ruby_test/gpu_wavefront.cc
index dbdaba4ed..6b3c3a04d 100644
--- a/src/cpu/testers/gpu_ruby_test/gpu_wavefront.cc
+++ b/src/cpu/testers/gpu_ruby_test/gpu_wavefront.cc
@@ -232,7 +232,7 @@ GpuWavefront::issueAcquireOp()
                                          threadId, nullptr);
     acq_req->setPaddr(0);
     acq_req->setReqInstSeqNum(tester->getActionSeqNum());
-    acq_req->setFlags(Request::ACQUIRE);
+    acq_req->setCacheCoherenceFlags(Request::INV_L1);
 
     // set protocol-specific flags
     setExtraRequestFlags(acq_req);

diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc
index 2787e427a..1da5a45a0 100644
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -805,9 +805,9 @@ ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt)
     // here (simdId=-1, wfSlotId=-1)
     if (gpuDynInst->isKernelLaunch()) {
         // for kernel launch, the original request must be both kernel-type
-        // and acquire
+        // and INV_L1
         assert(pkt->req->isKernel());
-        assert(pkt->req->isAcquire());
+        assert(pkt->req->isInvL1());
 
         // one D-Cache inv is done, decrement counter
         dispatcher.updateInvCounter(gpuDynInst->kern_id);
@@ -820,16 +820,19 @@ ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt)
     // retrieve wavefront from inst
     Wavefront *w = gpuDynInst->wavefront();
 
-    // Check if we are waiting on Kernel End Release
+    // Check if we are waiting on Kernel End Flush
     if (w->getStatus() == Wavefront::S_RETURNING
         && gpuDynInst->isEndOfKernel()) {
         // for kernel end, the original request must be both kernel-type
-        // and release
+        // and the last-level GPU cache should be flushed if it contains
+        // dirty data. This request may have been quiesced and
+        // immediately responded to if the GL2 is a write-through /
+        // read-only cache.
         assert(pkt->req->isKernel());
-        assert(pkt->req->isRelease());
+        assert(pkt->req->isGL2CacheFlush());
 
-        // one wb done, decrement counter, and return whether all wbs are
-        // done for the kernel
+        // once the flush is done, decrement counter, and return whether
+        // all dirty writeback operations are done for the kernel
         bool isWbDone = dispatcher.updateWbCounter(gpuDynInst->kern_id);
 
         // not all wbs are done for the kernel, just release pkt
@@ -1218,7 +1221,7 @@ ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
 
     if (kernelMemSync) {
         if (gpuDynInst->isKernelLaunch()) {
-            req->setCacheCoherenceFlags(Request::ACQUIRE);
+            req->setCacheCoherenceFlags(Request::INV_L1);
             req->setReqInstSeqNum(gpuDynInst->seqNum());
             req->setFlags(Request::KERNEL);
             pkt = new Packet(req, MemCmd::MemSyncReq);
@@ -1234,11 +1237,12 @@ ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
 
             schedule(mem_req_event, curTick() + req_tick_latency);
         } else {
-            // kernel end release must be enabled
+            // kernel end flush of the GL2 cache may be quiesced by Ruby
+            // if the GL2 is a read-only cache
            assert(shader->impl_kern_end_rel);
            assert(gpuDynInst->isEndOfKernel());
 
-            req->setCacheCoherenceFlags(Request::WB_L2);
+            req->setCacheCoherenceFlags(Request::FLUSH_L2);
             req->setReqInstSeqNum(gpuDynInst->seqNum());
             req->setFlags(Request::KERNEL);
             pkt = new Packet(req, MemCmd::MemSyncReq);

diff --git a/src/gpu-compute/gpu_dyn_inst.hh b/src/gpu-compute/gpu_dyn_inst.hh
index f34eff6c1..cdb130e2f 100644
--- a/src/gpu-compute/gpu_dyn_inst.hh
+++ b/src/gpu-compute/gpu_dyn_inst.hh
@@ -306,7 +306,7 @@ class GPUDynInst : public GPUExecContext
             assert(!isEndOfKernel());
 
             // must be wbinv inst if not kernel launch/end
-            req->setCacheCoherenceFlags(Request::ACQUIRE);
+            req->setCacheCoherenceFlags(Request::INV_L1);
         }
     }

diff --git a/src/mem/request.hh b/src/mem/request.hh
index 73c823bf2..b9d7e1406 100644
--- a/src/mem/request.hh
+++ b/src/mem/request.hh
@@ -260,30 +260,36 @@ class Request
     typedef ::Flags<CacheCoherenceFlagsType> CacheCoherenceFlags;
 
     /**
-     * These bits are used to set the coherence policy
-     * for the GPU and are encoded in the GCN3 instructions.
-     * See the AMD GCN3 ISA Architecture Manual for more
-     * details.
+     * These bits are used to set the coherence policy for the GPU and are
+     * encoded in the GCN3 instructions. The GCN3 ISA defines two cache
+     * levels. See the AMD GCN3 ISA Architecture Manual for more details.
      *
      * INV_L1: L1 cache invalidation
-     * WB_L2: L2 cache writeback
+     * FLUSH_L2: L2 cache flush
      *
-     * SLC: System Level Coherent. Accesses are forced to miss in
-     *      the L2 cache and are coherent with system memory.
+     * Invalidation means to simply discard all cache contents. This can be
+     * done in the L1 since it is implemented as a write-through cache and
+     * there are other copies elsewhere in the hierarchy.
      *
-     * GLC: Globally Coherent. Controls how reads and writes are
-     *      handled by the L1 cache. Global here referes to the
-     *      data being visible globally on the GPU (i.e., visible
-     *      to all WGs).
+     * For a flush, the contents of the cache need to be written back to
+     * memory when dirty and can be discarded otherwise. This operation is
+     * more involved than invalidation and therefore we do not flush caches
+     * with redundant copies of data.
      *
-     * For atomics, the GLC bit is used to distinguish between
-     * between atomic return/no-return operations.
+     * SLC: System Level Coherent. Accesses are forced to miss in the L2
+     * cache and are coherent with system memory.
+     *
+     * GLC: Globally Coherent. Controls how reads and writes are handled by
+     * the L1 cache. Global here refers to the data being visible globally
+     * on the GPU (i.e., visible to all WGs).
+     *
+     * For atomics, the GLC bit is used to distinguish between atomic
+     * return/no-return operations. These flags are used by GPUDynInst.
      */
     enum : CacheCoherenceFlagsType {
         /** mem_sync_op flags */
         INV_L1 = 0x00000001,
-        WB_L2 = 0x00000020,
+        FLUSH_L2 = 0x00000020,
+
         /** user-policy flags */
         SLC_BIT = 0x00000080,
         GLC_BIT = 0x00000100,
@@ -938,11 +944,15 @@ class Request
     /**
      * Accessor functions for the memory space configuration flags and used by
      * GPU ISAs such as the Heterogeneous System Architecture (HSA). Note that
-     * these are for testing only; setting extraFlags should be done via
-     * setCacheCoherenceFlags().
+     * setting extraFlags should be done via setCacheCoherenceFlags().
      */
-    bool isSLC() const { return _cacheCoherenceFlags.isSet(SLC_BIT); }
-    bool isGLC() const { return _cacheCoherenceFlags.isSet(GLC_BIT); }
+    bool isInvL1() const { return _cacheCoherenceFlags.isSet(INV_L1); }
+
+    bool
+    isGL2CacheFlush() const
+    {
+        return _cacheCoherenceFlags.isSet(FLUSH_L2);
+    }
 
     /**
      * Accessor functions to determine whether this request is part of
diff --git a/src/mem/ruby/system/GPUCoalescer.cc b/src/mem/ruby/system/GPUCoalescer.cc
index 3f7356839..b51a9e734 100644
--- a/src/mem/ruby/system/GPUCoalescer.cc
+++ b/src/mem/ruby/system/GPUCoalescer.cc
@@ -587,7 +587,15 @@ GPUCoalescer::makeRequest(PacketPtr pkt)
     assert(pkt->isRead() || pkt->isWrite());
 
     InstSeqNum seq_num = pkt->req->getReqInstSeqNum();
-    int num_packets = getDynInst(pkt)->exec_mask.count();
+
+    // In the case of the protocol tester, there is one packet per sequence
+    // number. The number of packets during simulation depends on the
+    // number of lanes active for that vmem request (i.e., the popcount
+    // of the exec_mask).
+    int num_packets = 1;
+    if (!m_usingRubyTester) {
+        num_packets = getDynInst(pkt)->exec_mask.count();
+    }
 
     // the pkt is temporarily stored in the uncoalesced table until
     // it's picked for coalescing process later in this cycle or in a
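For illustration, a minimal sketch of the packet-count rule described in the
comment above, with std::bitset standing in for gem5's exec_mask type;
WAVE_SIZE and using_ruby_tester are assumed names, not gem5 code.

    // Packets per vmem request equal the number of active lanes, i.e. the
    // population count of the execution mask; the protocol tester instead
    // issues one packet per sequence number.
    #include <bitset>
    #include <cassert>
    #include <cstddef>

    constexpr std::size_t WAVE_SIZE = 64;   // assumed wavefront width

    int main()
    {
        std::bitset<WAVE_SIZE> exec_mask;
        exec_mask.set(0);
        exec_mask.set(5);
        exec_mask.set(63);

        bool using_ruby_tester = false;
        int num_packets = 1;                // tester: one packet per seq num
        if (!using_ruby_tester) {
            // simulation: one packet per active lane
            num_packets = static_cast<int>(exec_mask.count());
        }
        assert(num_packets == 3);
        return 0;
    }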
diff --git a/src/mem/ruby/system/VIPERCoalescer.cc b/src/mem/ruby/system/VIPERCoalescer.cc
index 69add1b98..111f9f22b 100644
--- a/src/mem/ruby/system/VIPERCoalescer.cc
+++ b/src/mem/ruby/system/VIPERCoalescer.cc
@@ -70,20 +70,19 @@ RequestStatus
 VIPERCoalescer::makeRequest(PacketPtr pkt)
 {
     // VIPER only supports following memory request types
-    //    MemSyncReq & Acquire: TCP cache invalidation
+    //    MemSyncReq & INV_L1 : TCP cache invalidation
     //    ReadReq             : cache read
     //    WriteReq            : cache write
     //    AtomicOp            : cache atomic
     //
     // VIPER does not expect MemSyncReq & Release since in GCN3, compute unit
     // does not specify an equivalent type of memory request.
-    // TODO: future patches should rename Acquire and Release
-    assert((pkt->cmd == MemCmd::MemSyncReq && pkt->req->isAcquire()) ||
+    assert((pkt->cmd == MemCmd::MemSyncReq && pkt->req->isInvL1()) ||
             pkt->cmd == MemCmd::ReadReq ||
             pkt->cmd == MemCmd::WriteReq ||
             pkt->isAtomicOp());
 
-    if (pkt->req->isAcquire() && m_cache_inv_pkt) {
+    if (pkt->req->isInvL1() && m_cache_inv_pkt) {
         // In VIPER protocol, the coalescer is not able to handle two or
         // more cache invalidation requests at a time. Cache invalidation
         // requests must be serialized to ensure that all stale data in
@@ -94,8 +93,8 @@ VIPERCoalescer::makeRequest(PacketPtr pkt)
 
     GPUCoalescer::makeRequest(pkt);
 
-    if (pkt->req->isAcquire()) {
-        // In VIPER protocol, a compute unit sends a MemSyncReq with Acquire
+    if (pkt->req->isInvL1()) {
+        // In VIPER protocol, a compute unit sends a MemSyncReq with INV_L1
         // flag to invalidate TCP. Upon receiving a request of this type,
         // VIPERCoalescer starts a cache walk to invalidate all valid entries
         // in TCP. The request is completed once all entries are invalidated.
@@ -276,7 +275,7 @@ VIPERCoalescer::invTCPCallback(Addr addr)
 }
 
 /**
- * Invalidate TCP (Acquire)
+ * Invalidate TCP
  */
 void
 VIPERCoalescer::invTCP()
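For illustration, a minimal, self-contained sketch of the serialization rule
in VIPERCoalescer::makeRequest above: at most one INV_L1 sync request may be
outstanding, and a second one is retried until the cache walk completes.
Coalescer and SyncReq are illustrative stand-ins, not gem5 classes.

    // Sketch of one-outstanding-invalidation serialization, modeled on
    // the m_cache_inv_pkt guard in VIPERCoalescer::makeRequest.
    #include <cassert>

    struct SyncReq {
        bool invL1 = false;                    // corresponds to Request::INV_L1
    };

    enum class RequestStatus { Issued, Retry };

    struct Coalescer {
        const SyncReq *cacheInvPkt = nullptr;  // like m_cache_inv_pkt

        RequestStatus makeRequest(const SyncReq *pkt)
        {
            if (pkt->invL1 && cacheInvPkt) {
                // an invalidating cache walk is already in flight; serialize
                return RequestStatus::Retry;
            }
            if (pkt->invL1) {
                cacheInvPkt = pkt;             // start the cache walk
            }
            return RequestStatus::Issued;
        }

        void invTCPCallback()
        {
            // all TCP (L1) entries invalidated; pending request completes
            cacheInvPkt = nullptr;
        }
    };

    int main()
    {
        Coalescer c;
        SyncReq a{true}, b{true};
        assert(c.makeRequest(&a) == RequestStatus::Issued);
        assert(c.makeRequest(&b) == RequestStatus::Retry);   // serialized
        c.invTCPCallback();
        assert(c.makeRequest(&b) == RequestStatus::Issued);
        return 0;
    }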