// the last workgroup in the kernel).
bool kernelEnd =
wf->computeUnit->shader->dispatcher().isReachingKernelEnd(wf);
+ // further check whether 'release @ kernel end' is needed
+ bool relNeeded =
+ wf->computeUnit->shader->impl_kern_end_rel;
- // if it is not a kernel end, then retire the workgroup directly
- if (!kernelEnd) {
+ // if this is not a kernel end, or no release is needed, retire
+ // the workgroup directly
+ if (!kernelEnd || !relNeeded) {
wf->computeUnit->shader->dispatcher().notifyWgCompl(wf);
wf->setStatus(Wavefront::S_STOPPED);
wf->computeUnit->completedWGs++;
}
/**
- * If it is a kernel end, inject a memory sync and retire the
- * workgroup after receving response.
+ * If this is a kernel end and a release is needed, inject a memory
+ * sync and retire the workgroup after receiving all acks.
*/
setFlag(MemSync);
setFlag(GlobalSegment);
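For readability, here is how the kernel-end path reads once the hunk above is applied. It is a sketch assembled only from the lines shown in this change; the enclosing method and the if/else structuring of the sync branch are assumptions, since the surrounding control flow is not part of this excerpt.

    bool kernelEnd =
        wf->computeUnit->shader->dispatcher().isReachingKernelEnd(wf);
    // further check whether 'release @ kernel end' is needed
    bool relNeeded =
        wf->computeUnit->shader->impl_kern_end_rel;

    if (!kernelEnd || !relNeeded) {
        // not a kernel end, or no release requested: retire the
        // workgroup immediately
        wf->computeUnit->shader->dispatcher().notifyWgCompl(wf);
        wf->setStatus(Wavefront::S_STOPPED);
        wf->computeUnit->completedWGs++;
    } else {
        // kernel end with release: inject a memory sync targeting the
        // global segment and retire the workgroup only after all acks
        // have been received
        setFlag(MemSync);
        setFlag(GlobalSegment);
    }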
gpu_cmd_proc = Param.GPUCommandProcessor('Command processor for GPU')
dispatcher = Param.GPUDispatcher('GPU workgroup dispatcher')
n_wf = Param.Int(10, 'Number of wavefront slots per SIMD')
- impl_kern_boundary_sync = Param.Bool(True, """Insert acq/rel packets into
- ruby at kernel boundaries""")
+ impl_kern_launch_acq = Param.Bool(True, """Insert acq packet into
+ ruby at kernel launch""")
+ impl_kern_end_rel = Param.Bool(False, """Insert rel packet into
+ ruby at kernel end""")
globalmem = Param.MemorySize('64kB', 'Memory size')
timing = Param.Bool(False, 'timing memory accesses')
auto task = hsaQueueEntries[exec_id];
bool launched(false);
- // invalidate is needed before starting dispatch
- if (shader->impl_kern_boundary_sync) {
+ // an acquire is needed before starting dispatch
+ if (shader->impl_kern_launch_acq) {
// try to invalidate cache
shader->prepareInvalidate(task);
} else {
- // kern boundary sync is not set, skip invalidate
+ // kern launch acquire is not set, skip invalidate
task->markInvDone();
}
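Similarly, a sketch of the dispatch-side path once the hunk above is applied, assembled only from the lines shown here; the enclosing dispatcher routine is not shown in this excerpt and is assumed.

    auto task = hsaQueueEntries[exec_id];
    bool launched(false);

    if (shader->impl_kern_launch_acq) {
        // acquire at kernel launch: try to invalidate the cache
        // before the workgroups are dispatched
        shader->prepareInvalidate(task);
    } else {
        // kernel-launch acquire disabled: mark the invalidate as
        // already done, i.e. skip it
        task->markInvDone();
    }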
tickEvent([this]{ execScheduledAdds(); }, "Shader scheduled adds event",
false, Event::CPU_Tick_Pri),
timingSim(p->timing), hsail_mode(SIMT),
- impl_kern_boundary_sync(p->impl_kern_boundary_sync),
+ impl_kern_launch_acq(p->impl_kern_launch_acq),
+ impl_kern_end_rel(p->impl_kern_end_rel),
coissue_return(1),
trace_vgpr_all(1), n_cu((p->CUs).size()), n_wf(p->n_wf),
globalMemSize(p->globalmem),
hsail_mode_e hsail_mode;
// If set, issue acq packet @ kernel launch
- int impl_kern_boundary_sync;
+ int impl_kern_launch_acq;
+ // If set, issue rel packet @ kernel end
+ int impl_kern_end_rel;
// If set, fetch returns may be coissued with instructions
int coissue_return;
// If set, always dump all 64 gprs to trace