From 2c1e9c4e81cc80c39d05f2bc547dfb3392d7751f Mon Sep 17 00:00:00 2001 From: Xianwei Zhang Date: Mon, 18 Jun 2018 13:50:11 -0400 Subject: [PATCH] gpu-compute: enable flexible control of kernel boundary syncs Kernel end release was turned on for the VIPER protocol, which is in fact write-through based and thus has no need for a release operation. This changeset splits the option 'impl_kern_boundary_sync' into 'impl_kern_launch_acq' and 'impl_kern_end_rel', and turns off release on VIPER. Change-Id: I5490019b6765a25bd801cc78fb7445b90eb02a3d Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29917 Reviewed-by: Anthony Gutierrez Reviewed-by: Xianwei Zhang Maintainer: Anthony Gutierrez Tested-by: kokoro --- src/arch/gcn3/insts/instructions.cc | 12 ++++++++---- src/gpu-compute/GPU.py | 6 ++++-- src/gpu-compute/dispatcher.cc | 6 +++--- src/gpu-compute/shader.cc | 3 ++- src/gpu-compute/shader.hh | 4 +++- 5 files changed, 20 insertions(+), 11 deletions(-) diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc index 7578694b6..8d63296bf 100644 --- a/src/arch/gcn3/insts/instructions.cc +++ b/src/arch/gcn3/insts/instructions.cc @@ -3759,9 +3759,13 @@ namespace Gcn3ISA // the last workgroup in the kernel). bool kernelEnd = wf->computeUnit->shader->dispatcher().isReachingKernelEnd(wf); + // further check whether 'release @ kernel end' is needed + bool relNeeded = + wf->computeUnit->shader->impl_kern_end_rel; - // if it is not a kernel end, then retire the workgroup directly - if (!kernelEnd) { + // if not a kernel end or no release needed, retire the workgroup + // directly + if (!kernelEnd || !relNeeded) { wf->computeUnit->shader->dispatcher().notifyWgCompl(wf); wf->setStatus(Wavefront::S_STOPPED); wf->computeUnit->completedWGs++; @@ -3770,8 +3774,8 @@ namespace Gcn3ISA } /** - * If it is a kernel end, inject a memory sync and retire the - * workgroup after receving response. 
+ * If a kernel end and release needed, inject a memory sync and + * retire the workgroup after receiving all acks. */ setFlag(MemSync); setFlag(GlobalSegment); diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py index 6b033f403..8a2ad812e 100644 --- a/src/gpu-compute/GPU.py +++ b/src/gpu-compute/GPU.py @@ -213,8 +213,10 @@ class Shader(ClockedObject): gpu_cmd_proc = Param.GPUCommandProcessor('Command processor for GPU') dispatcher = Param.GPUDispatcher('GPU workgroup dispatcher') n_wf = Param.Int(10, 'Number of wavefront slots per SIMD') - impl_kern_boundary_sync = Param.Bool(True, """Insert acq/rel packets into - ruby at kernel boundaries""") + impl_kern_launch_acq = Param.Bool(True, """Insert acq packet into + ruby at kernel launch""") + impl_kern_end_rel = Param.Bool(False, """Insert rel packet into + ruby at kernel end""") globalmem = Param.MemorySize('64kB', 'Memory size') timing = Param.Bool(False, 'timing memory accesses') diff --git a/src/gpu-compute/dispatcher.cc b/src/gpu-compute/dispatcher.cc index 51f5e97fe..6a8242f11 100644 --- a/src/gpu-compute/dispatcher.cc +++ b/src/gpu-compute/dispatcher.cc @@ -166,12 +166,12 @@ GPUDispatcher::exec() auto task = hsaQueueEntries[exec_id]; bool launched(false); - // invalidate is needed before starting dispatch - if (shader->impl_kern_boundary_sync) { + // acq is needed before starting dispatch + if (shader->impl_kern_launch_acq) { // try to invalidate cache shader->prepareInvalidate(task); } else { - // kern boundary sync is not set, skip invalidate + // kern launch acquire is not set, skip invalidate task->markInvDone(); } diff --git a/src/gpu-compute/shader.cc b/src/gpu-compute/shader.cc index 4be2fbfbd..aa7a6dd2f 100644 --- a/src/gpu-compute/shader.cc +++ b/src/gpu-compute/shader.cc @@ -56,7 +56,8 @@ Shader::Shader(const Params *p) : ClockedObject(p), tickEvent([this]{ execScheduledAdds(); }, "Shader scheduled adds event", false, Event::CPU_Tick_Pri), timingSim(p->timing), hsail_mode(SIMT), - 
impl_kern_boundary_sync(p->impl_kern_boundary_sync), + impl_kern_launch_acq(p->impl_kern_launch_acq), + impl_kern_end_rel(p->impl_kern_end_rel), coissue_return(1), trace_vgpr_all(1), n_cu((p->CUs).size()), n_wf(p->n_wf), globalMemSize(p->globalmem), diff --git a/src/gpu-compute/shader.hh b/src/gpu-compute/shader.hh index 72063a4a5..eeaf3437f 100644 --- a/src/gpu-compute/shader.hh +++ b/src/gpu-compute/shader.hh @@ -216,7 +216,9 @@ class Shader : public ClockedObject hsail_mode_e hsail_mode; // If set, issue acq packet @ kernel launch - int impl_kern_boundary_sync; + int impl_kern_launch_acq; + // If set, issue rel packet @ kernel end + int impl_kern_end_rel; // If set, fetch returns may be coissued with instructions int coissue_return; // If set, always dump all 64 gprs to trace -- 2.30.2