This patch replaces the ACQUIRE and RELEASE flags, which are
HSA-specific. The ACQUIRE flag becomes INV_L1 in the VIPER protocol,
and the RELEASE flag is removed. Future protocols may support
additional cache coherence flags such as INV_L2 and WB_L2.
Change-Id: I3d60c9d3625c898f4110a12d81742b6822728533
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/32859
Reviewed-by: Jason Lowe-Power <power.jg@gmail.com>
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
Tested-by: kokoro <noreply+kokoro@google.com>
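
For reviewers tracking the rename at call sites, the old-to-new flag and
accessor mapping applied throughout this patch is summarized below. This is
an illustrative C++ sketch, not a hunk of the patch; `req` stands for any
RequestPtr.

    // old (HSA-specific)                            new (VIPER)
    req->setCacheCoherenceFlags(Request::ACQUIRE);   // -> Request::INV_L1
    req->setCacheCoherenceFlags(Request::WB_L2);     // -> Request::FLUSH_L2
    bool acq = req->isAcquire();                     // -> req->isInvL1()
    bool rel = req->isRelease();                     // -> req->isGL2CacheFlush()
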
threadId, nullptr);
acq_req->setPaddr(0);
acq_req->setReqInstSeqNum(tester->getActionSeqNum());
- acq_req->setFlags(Request::ACQUIRE);
+ acq_req->setCacheCoherenceFlags(Request::INV_L1);
// set protocol-specific flags
setExtraRequestFlags(acq_req);
// here (simdId=-1, wfSlotId=-1)
if (gpuDynInst->isKernelLaunch()) {
// for kernel launch, the original request must be both kernel-type
- // and acquire
+ // and INV_L1
assert(pkt->req->isKernel());
- assert(pkt->req->isAcquire());
+ assert(pkt->req->isInvL1());
// one D-Cache inv is done, decrement counter
dispatcher.updateInvCounter(gpuDynInst->kern_id);
// retrieve wavefront from inst
Wavefront *w = gpuDynInst->wavefront();
- // Check if we are waiting on Kernel End Release
+ // Check if we are waiting on Kernel End Flush
if (w->getStatus() == Wavefront::S_RETURNING
&& gpuDynInst->isEndOfKernel()) {
// for kernel end, the original request must be both kernel-type
- // and release
+            // and a GL2 cache flush: the last-level GPU cache is flushed
+            // if it contains dirty data. This request may have been
+            // quiesced and immediately responded to if the GL2 is a
+            // write-through / read-only cache.
assert(pkt->req->isKernel());
- assert(pkt->req->isRelease());
+ assert(pkt->req->isGL2CacheFlush());
- // one wb done, decrement counter, and return whether all wbs are
- // done for the kernel
+            // once the flush is done, decrement the counter and return
+            // whether all dirty writebacks are done for the kernel
bool isWbDone = dispatcher.updateWbCounter(gpuDynInst->kern_id);
// not all wbs are done for the kernel, just release pkt
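
The response path above relies on the dispatcher's per-kernel counters: one
invalidation is retired per kernel-launch response, one writeback per
kernel-end response. The following is a minimal standalone sketch of that
bookkeeping; the struct name and members are illustrative, not gem5's
GPUDispatcher interface.

    #include <cassert>
    #include <iostream>
    #include <unordered_map>

    // Illustrative stand-in for the dispatcher's per-kernel bookkeeping.
    struct KernelSyncCounters
    {
        // outstanding L1 invalidations (kernel launch) and dirty writebacks
        // (kernel end) still in flight, keyed by kernel id; assumed to be
        // primed when the kernel's MemSyncReqs are issued
        std::unordered_map<int, int> outstandingInv;
        std::unordered_map<int, int> outstandingWb;

        // one D-cache invalidation completed; true once all are done
        bool
        updateInvCounter(int kernId)
        {
            assert(outstandingInv[kernId] > 0);
            return --outstandingInv[kernId] == 0;
        }

        // one flush/writeback completed; true once the kernel may retire
        bool
        updateWbCounter(int kernId)
        {
            assert(outstandingWb[kernId] > 0);
            return --outstandingWb[kernId] == 0;
        }
    };

    int
    main()
    {
        KernelSyncCounters ctr;
        ctr.outstandingInv[0] = 2;            // e.g. two L1s to invalidate
        ctr.updateInvCounter(0);              // first response arrives
        std::cout << ctr.updateInvCounter(0); // prints 1: all invs done
        return 0;
    }
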
if (kernelMemSync) {
if (gpuDynInst->isKernelLaunch()) {
- req->setCacheCoherenceFlags(Request::ACQUIRE);
+ req->setCacheCoherenceFlags(Request::INV_L1);
req->setReqInstSeqNum(gpuDynInst->seqNum());
req->setFlags(Request::KERNEL);
pkt = new Packet(req, MemCmd::MemSyncReq);
schedule(mem_req_event, curTick() + req_tick_latency);
} else {
- // kernel end release must be enabled
+            // the kernel-end flush path must be enabled; the GL2 flush may
+            // be quiesced by Ruby if the GL2 is a read-only cache
assert(shader->impl_kern_end_rel);
assert(gpuDynInst->isEndOfKernel());
- req->setCacheCoherenceFlags(Request::WB_L2);
+ req->setCacheCoherenceFlags(Request::FLUSH_L2);
req->setReqInstSeqNum(gpuDynInst->seqNum());
req->setFlags(Request::KERNEL);
pkt = new Packet(req, MemCmd::MemSyncReq);
assert(!isEndOfKernel());
// must be wbinv inst if not kernel launch/end
- req->setCacheCoherenceFlags(Request::ACQUIRE);
+ req->setCacheCoherenceFlags(Request::INV_L1);
}
}
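
Taken together, the fence-injection logic above picks the coherence flag from
the kind of sync operation being issued. A compact restatement, using a
hypothetical helper name (chooseKernelSyncFlag) purely for illustration and
not part of the patch:

    // Hypothetical summary of the cases above.
    Request::CacheCoherenceFlagsType
    chooseKernelSyncFlag(const GPUDynInstPtr &gpuDynInst)
    {
        if (gpuDynInst->isKernelLaunch()) {
            // invalidate the (write-through) L1 so the new kernel does
            // not read stale data
            return Request::INV_L1;
        } else if (gpuDynInst->isEndOfKernel()) {
            // flush dirty data out of the GL2 before the kernel completes;
            // a write-through / read-only GL2 may complete this immediately
            return Request::FLUSH_L2;
        } else {
            // non-kernel wbinv instruction: invalidate the L1
            return Request::INV_L1;
        }
    }
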
typedef ::Flags<CacheCoherenceFlagsType> CacheCoherenceFlags;
/**
- * These bits are used to set the coherence policy
- * for the GPU and are encoded in the GCN3 instructions.
- * See the AMD GCN3 ISA Architecture Manual for more
- * details.
+ * These bits are used to set the coherence policy for the GPU and are
+ * encoded in the GCN3 instructions. The GCN3 ISA defines two cache levels.
+ * See the AMD GCN3 ISA Architecture Manual for more details.
*
* INV_L1: L1 cache invalidation
- * WB_L2: L2 cache writeback
+ * FLUSH_L2: L2 cache flush
*
- * SLC: System Level Coherent. Accesses are forced to miss in
- * the L2 cache and are coherent with system memory.
+ * Invalidation means to simply discard all cache contents. This can be
+ * done in the L1 since it is implemented as a write-through cache and
+ * there are other copies elsewhere in the hierarchy.
*
- * GLC: Globally Coherent. Controls how reads and writes are
- * handled by the L1 cache. Global here referes to the
- * data being visible globally on the GPU (i.e., visible
- * to all WGs).
+ * For a flush, the contents of the cache need to be written back to
+ * memory when dirty and can be discarded otherwise. This operation is
+ * more involved than invalidation, and therefore we do not flush caches
+ * that hold only redundant copies of data.
*
- * For atomics, the GLC bit is used to distinguish between
- * between atomic return/no-return operations.
+ * SLC: System Level Coherent. Accesses are forced to miss in the L2 cache
+ * and are coherent with system memory.
+ *
+ * GLC: Globally Coherent. Controls how reads and writes are handled by
+ *      the L1 cache. Global here refers to the data being visible
+ * globally on the GPU (i.e., visible to all WGs).
+ *
+ * For atomics, the GLC bit is used to distinguish between atomic
+ * return/no-return operations. These flags are used by GPUDynInst.
*/
enum : CacheCoherenceFlagsType {
/** mem_sync_op flags */
INV_L1 = 0x00000001,
- WB_L2 = 0x00000020,
- /** user-policy flags */
+ FLUSH_L2 = 0x00000020,
/** user-policy flags */
SLC_BIT = 0x00000080,
GLC_BIT = 0x00000100,
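
The distinction drawn in the comment above (invalidate a write-through L1
versus flush a possibly-dirty L2) can be illustrated with a small standalone
model. This is a sketch assuming a toy cache with a per-line dirty bit, not
gem5 code; it only demonstrates why a flush must write dirty lines back to
memory while an invalidation may simply drop everything.

    #include <cstdint>
    #include <iostream>
    #include <map>

    struct ToyCache
    {
        struct Line { uint32_t data; bool dirty; };
        std::map<uint64_t, Line> lines;
        std::map<uint64_t, uint32_t> &memory;
        bool writeThrough;

        ToyCache(std::map<uint64_t, uint32_t> &mem, bool wt)
            : memory(mem), writeThrough(wt) {}

        void
        write(uint64_t addr, uint32_t val)
        {
            // a write-through cache never holds the only up-to-date copy
            lines[addr] = { val, !writeThrough };
            if (writeThrough)
                memory[addr] = val;
        }

        // INV_L1-style operation: discard everything; safe only because
        // the write-through policy keeps lower levels up to date
        void invalidate() { lines.clear(); }

        // FLUSH_L2-style operation: write dirty lines back, then discard
        void
        flush()
        {
            for (auto &kv : lines)
                if (kv.second.dirty)
                    memory[kv.first] = kv.second.data;
            lines.clear();
        }
    };

    int
    main()
    {
        std::map<uint64_t, uint32_t> mem;
        ToyCache l1(mem, true);   // write-through: invalidation suffices
        ToyCache l2(mem, false);  // write-back: must be flushed

        l1.write(0x100, 1);       // memory already updated
        l2.write(0x200, 2);       // dirty in L2 only

        l1.invalidate();
        l2.flush();
        std::cout << mem[0x100] << " " << mem[0x200] << std::endl; // 1 2
        return 0;
    }
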
/**
* Accessor functions for the memory space configuration flags and used by
* GPU ISAs such as the Heterogeneous System Architecture (HSA). Note that
- * these are for testing only; setting extraFlags should be done via
- * setCacheCoherenceFlags().
+ * setting extraFlags should be done via setCacheCoherenceFlags().
*/
- bool isSLC() const { return _cacheCoherenceFlags.isSet(SLC_BIT); }
- bool isGLC() const { return _cacheCoherenceFlags.isSet(GLC_BIT); }
+ bool isInvL1() const { return _cacheCoherenceFlags.isSet(INV_L1); }
+
+ bool
+ isGL2CacheFlush() const
+ {
+ return _cacheCoherenceFlags.isSet(FLUSH_L2);
+ }
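
On the consumer side, code that receives a MemSyncReq can branch on these
accessors. A hedged usage sketch follows; the handler itself is hypothetical,
and only setCacheCoherenceFlags(), isInvL1(), and isGL2CacheFlush() come from
this patch.

    // Illustrative only: dispatching on the new flags in a handler.
    void
    handleMemSync(PacketPtr pkt)   // hypothetical handler
    {
        if (pkt->req->isInvL1()) {
            // walk the L1 and invalidate every valid entry
        } else if (pkt->req->isGL2CacheFlush()) {
            // write back dirty GL2 lines (may be a no-op for a
            // write-through / read-only GL2)
        }
    }
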
/**
* Accessor functions to determine whether this request is part of
assert(pkt->isRead() || pkt->isWrite());
InstSeqNum seq_num = pkt->req->getReqInstSeqNum();
- int num_packets = getDynInst(pkt)->exec_mask.count();
+
+    // in the case of the protocol tester, there is one packet per
+    // sequence number. The number of packets during simulation depends
+    // on the number of lanes active for that vmem request (i.e., the
+    // popcount of the exec_mask).
+ int num_packets = 1;
+ if (!m_usingRubyTester) {
+ num_packets = getDynInst(pkt)->exec_mask.count();
+ }
// the pkt is temporarily stored in the uncoalesced table until
// it's picked for coalescing process later in this cycle or in a
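
The per-request packet count above is the population count of the wavefront's
execution mask. A small self-contained sketch of that computation, assuming a
64-lane wavefront represented as std::bitset; the function and parameter names
are illustrative, not the coalescer's actual members.

    #include <bitset>
    #include <cassert>

    // number of lanes in a wavefront (64 for GCN3)
    constexpr int NumVecElemPerVecReg = 64;

    int
    packetsForRequest(const std::bitset<NumVecElemPerVecReg> &execMask,
                      bool usingRubyTester)
    {
        // the protocol tester issues exactly one packet per sequence
        // number; otherwise one packet is expected per active lane
        if (usingRubyTester)
            return 1;
        int n = static_cast<int>(execMask.count());
        assert(n > 0);
        return n;
    }
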
VIPERCoalescer::makeRequest(PacketPtr pkt)
{
// VIPER only supports following memory request types
- // MemSyncReq & Acquire: TCP cache invalidation
+ // MemSyncReq & INV_L1 : TCP cache invalidation
// ReadReq : cache read
// WriteReq : cache write
// AtomicOp : cache atomic
//
// VIPER does not expect MemSyncReq & Release since in GCN3, compute unit
// does not specify an equivalent type of memory request.
- // TODO: future patches should rename Acquire and Release
- assert((pkt->cmd == MemCmd::MemSyncReq && pkt->req->isAcquire()) ||
+ assert((pkt->cmd == MemCmd::MemSyncReq && pkt->req->isInvL1()) ||
pkt->cmd == MemCmd::ReadReq ||
pkt->cmd == MemCmd::WriteReq ||
pkt->isAtomicOp());
- if (pkt->req->isAcquire() && m_cache_inv_pkt) {
+ if (pkt->req->isInvL1() && m_cache_inv_pkt) {
// In VIPER protocol, the coalescer is not able to handle two or
// more cache invalidation requests at a time. Cache invalidation
// requests must be serialized to ensure that all stale data in
GPUCoalescer::makeRequest(pkt);
- if (pkt->req->isAcquire()) {
- // In VIPER protocol, a compute unit sends a MemSyncReq with Acquire
+ if (pkt->req->isInvL1()) {
+ // In VIPER protocol, a compute unit sends a MemSyncReq with INV_L1
// flag to invalidate TCP. Upon receiving a request of this type,
// VIPERCoalescer starts a cache walk to invalidate all valid entries
// in TCP. The request is completed once all entries are invalidated.
}
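
The serialization requirement described above (at most one outstanding cache
invalidation, completed only after a full cache walk) can be sketched as a
standalone model. This is an illustrative simplification with hypothetical
names (PendingInv, tcpValid), not the VIPERCoalescer/Ruby implementation.

    #include <cassert>
    #include <vector>

    struct PendingInv
    {
        bool active = false;   // analogue of m_cache_inv_pkt: only one
                               // invalidation may be in flight at a time
        int linesLeft = 0;     // valid entries still to invalidate

        // start an invalidation walk; callers must serialize, mirroring
        // the assertion in VIPERCoalescer::makeRequest()
        bool
        start(const std::vector<bool> &tcpValid)
        {
            assert(!active);
            linesLeft = 0;
            for (bool v : tcpValid)
                linesLeft += v ? 1 : 0;
            active = (linesLeft > 0);
            return !active;    // an empty cache completes immediately
        }

        // one entry invalidated; true once the walk is finished and the
        // MemSyncReq can be responded to
        bool
        lineDone()
        {
            assert(active && linesLeft > 0);
            if (--linesLeft == 0) {
                active = false;
                return true;
            }
            return false;
        }
    };
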
/**
- * Invalidate TCP (Acquire)
+ * Invalidate TCP
*/
void
VIPERCoalescer::invTCP()