gpu-compute: enable kernel-end WB functionality

author Xianwei Zhang <xianwei.zhang@amd.com>

Thu, 28 Jun 2018 06:34:41 +0000 (02:34 -0400)

committer Anthony Gutierrez <anthony.gutierrez@amd.com>

Mon, 13 Jul 2020 23:32:37 +0000 (23:32 +0000)
author Xianwei Zhang <xianwei.zhang@amd.com>
Thu, 28 Jun 2018 06:34:41 +0000 (02:34 -0400)
committer Anthony Gutierrez <anthony.gutierrez@amd.com>
Mon, 13 Jul 2020 23:32:37 +0000 (23:32 +0000)
diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc

index b0616d677b3aa14e16cd9d213e2ebe5cd4ae1174..178fd6e9605bbcc16df86f7f4ffca6bf28ad2dcd 100644 (file)
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -1218,23 +1218,25 @@ ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
  
              schedule(mem_req_event, curTick() + req_tick_latency);
          } else {
-            assert(gpuDynInst->isEndOfKernel());
+          // kernel end release must be enabled
+          assert(shader->impl_kern_end_rel);
+          assert(gpuDynInst->isEndOfKernel());
  
-            req->setCacheCoherenceFlags(Request::RELEASE);
-            req->setReqInstSeqNum(gpuDynInst->seqNum());
-            req->setFlags(Request::KERNEL);
-            pkt = new Packet(req, MemCmd::MemSyncReq);
-            pkt->pushSenderState(
-               new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr));
+          req->setCacheCoherenceFlags(Request::WB_L2);
+          req->setReqInstSeqNum(gpuDynInst->seqNum());
+          req->setFlags(Request::KERNEL);
+          pkt = new Packet(req, MemCmd::MemSyncReq);
+          pkt->pushSenderState(
+             new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr));
  
-            EventFunctionWrapper *mem_req_event =
-              memPort[0]->createMemReqEvent(pkt);
+          EventFunctionWrapper *mem_req_event =
+            memPort[0]->createMemReqEvent(pkt);
  
-            DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling "
-                    "a release\n", cu_id, gpuDynInst->simdId,
-                    gpuDynInst->wfSlotId, 0, pkt->req->getPaddr());
+          DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling "
+                  "a release\n", cu_id, gpuDynInst->simdId,
+                  gpuDynInst->wfSlotId, 0, pkt->req->getPaddr());
  
-            schedule(mem_req_event, curTick() + req_tick_latency);
+          schedule(mem_req_event, curTick() + req_tick_latency);
          }
      } else {
          gpuDynInst->setRequestFlags(req);
diff --git a/src/gpu-compute/shader.cc b/src/gpu-compute/shader.cc

index f5e944471365abe4fa78da19045513baca822fda..59ce23971fe34eeb9fbe6ee59eeba501dd18d6ec 100644 (file)
--- a/src/gpu-compute/shader.cc
+++ b/src/gpu-compute/shader.cc
@@ -223,11 +223,11 @@ Shader::prepareFlush(GPUDynInstPtr gpuDynInst){
      // flush has never been started, performed only once at kernel end
      assert(_dispatcher.getOutstandingWbs(kernId) == 0);
  
-    // iterate all cus, managed by the shader, to perform flush.
-    for (int i_cu = 0; i_cu < n_cu; ++i_cu) {
-        _dispatcher.updateWbCounter(kernId, +1);
-        cuList[i_cu]->doFlush(gpuDynInst);
-    }
+    // only the first CU managed by the shader performs the flush operation,
+    // assuming that the L2 cache is shared by all CUs in the shader
+    int i_cu = 0;
+    _dispatcher.updateWbCounter(kernId, +1);
+    cuList[i_cu]->doFlush(gpuDynInst);
  }
  
  bool
diff --git a/src/mem/request.hh b/src/mem/request.hh

index 4e0ba974cb7d9aba9387157966c8282e063b8e61..718d5fa243e02a7076e834124e0132a36e01c986 100644 (file)
--- a/src/mem/request.hh
+++ b/src/mem/request.hh
@@ -225,6 +225,9 @@ class Request
       * See the AMD GCN3 ISA Architecture Manual for more
       * details.
       *
+     * INV_L1: L1 cache invalidation
+     * WB_L2: L2 cache writeback
+     *
       * SLC: System Level Coherent. Accesses are forced to miss in
       *      the L2 cache and are coherent with system memory.
       *
@@ -237,6 +240,10 @@ class Request
       * between atomic return/no-return operations.
       */
      enum : CacheCoherenceFlagsType {
+        /** mem_sync_op flags */
+        INV_L1                  = 0x00000001,
+        WB_L2                   = 0x00000020,
+        /** user-policy flags */
          /** user-policy flags */
          SLC_BIT                 = 0x00000080,
          GLC_BIT                 = 0x00000100,
author	Xianwei Zhang <xianwei.zhang@amd.com>
	Thu, 28 Jun 2018 06:34:41 +0000 (02:34 -0400)
committer	Anthony Gutierrez <anthony.gutierrez@amd.com>
	Mon, 13 Jul 2020 23:32:37 +0000 (23:32 +0000)
src/gpu-compute/compute_unit.cc		patch \| blob \| history
src/gpu-compute/shader.cc		patch \| blob \| history
src/mem/request.hh		patch \| blob \| history