From 4d737462c29149342c9f8a5bce560bebc2e196d4 Mon Sep 17 00:00:00 2001 From: Tony Gutierrez Date: Fri, 10 Aug 2018 15:01:12 -0400 Subject: [PATCH] gpu-compute, arch-gcn3: Change how waitcnts are implemented Use single counters per memory operation type and increment them upon issue, not execute. Change-Id: I6afc0b66b21882538ef90a14a57a3ab3cc7bd6f3 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29973 Reviewed-by: Anthony Gutierrez Maintainer: Anthony Gutierrez Tested-by: kokoro --- src/arch/gcn3/insts/instructions.cc | 36 +++++++++++++ src/gpu-compute/global_memory_pipeline.cc | 1 + src/gpu-compute/gpu_dyn_inst.cc | 3 ++ src/gpu-compute/local_memory_pipeline.cc | 1 + src/gpu-compute/scalar_memory_pipeline.cc | 1 + src/gpu-compute/schedule_stage.cc | 9 ++++ src/gpu-compute/wavefront.cc | 63 ++++++++++++++++------- src/gpu-compute/wavefront.hh | 10 ++++ 8 files changed, 106 insertions(+), 18 deletions(-) diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc index 9987fade0..7c2cf0e49 100644 --- a/src/arch/gcn3/insts/instructions.cc +++ b/src/arch/gcn3/insts/instructions.cc @@ -32565,6 +32565,7 @@ namespace Gcn3ISA vdst.write(); + wf->decLGKMInstsIssued(); wf->rdLmReqsInPipe--; wf->validateRequestCounters(); } // execute @@ -32635,6 +32636,7 @@ namespace Gcn3ISA vdst.write(); + wf->decLGKMInstsIssued(); wf->rdLmReqsInPipe--; wf->validateRequestCounters(); } // execute @@ -39400,6 +39402,8 @@ namespace Gcn3ISA Wavefront *wf = gpuDynInst->wavefront(); if (wf->execMask().none()) { + wf->decVMemInstsIssued(); + wf->decLGKMInstsIssued(); wf->rdGmReqsInPipe--; wf->rdLmReqsInPipe--; return; @@ -39496,6 +39500,8 @@ namespace Gcn3ISA Wavefront *wf = gpuDynInst->wavefront(); if (wf->execMask().none()) { + wf->decVMemInstsIssued(); + wf->decLGKMInstsIssued(); wf->rdGmReqsInPipe--; wf->rdLmReqsInPipe--; return; @@ -39592,6 +39598,8 @@ namespace Gcn3ISA Wavefront *wf = gpuDynInst->wavefront(); if (wf->execMask().none()) { + wf->decVMemInstsIssued(); + wf->decLGKMInstsIssued(); wf->rdGmReqsInPipe--; wf->rdLmReqsInPipe--; return; @@ -39660,6 +39668,8 @@ namespace Gcn3ISA Wavefront *wf = gpuDynInst->wavefront(); if (wf->execMask().none()) { + wf->decVMemInstsIssued(); + wf->decLGKMInstsIssued(); wf->rdGmReqsInPipe--; wf->rdLmReqsInPipe--; return; @@ -39728,6 +39738,8 @@ namespace Gcn3ISA Wavefront *wf = gpuDynInst->wavefront(); if (wf->execMask().none()) { + wf->decVMemInstsIssued(); + wf->decLGKMInstsIssued(); wf->rdGmReqsInPipe--; wf->rdLmReqsInPipe--; return; @@ -39805,6 +39817,8 @@ namespace Gcn3ISA Wavefront *wf = gpuDynInst->wavefront(); if (wf->execMask().none()) { + wf->decVMemInstsIssued(); + wf->decLGKMInstsIssued(); wf->rdGmReqsInPipe--; wf->rdLmReqsInPipe--; } @@ -39884,6 +39898,8 @@ namespace Gcn3ISA Wavefront *wf = gpuDynInst->wavefront(); if (wf->execMask().none()) { + wf->decVMemInstsIssued(); + wf->decLGKMInstsIssued(); wf->wrGmReqsInPipe--; wf->wrLmReqsInPipe--; return; @@ -39952,6 +39968,8 @@ namespace Gcn3ISA Wavefront *wf = gpuDynInst->wavefront(); if (wf->execMask().none()) { + wf->decVMemInstsIssued(); + wf->decLGKMInstsIssued(); wf->wrGmReqsInPipe--; wf->wrLmReqsInPipe--; return; @@ -40021,6 +40039,8 @@ namespace Gcn3ISA Wavefront *wf = gpuDynInst->wavefront(); if (wf->execMask().none()) { + wf->decVMemInstsIssued(); + wf->decLGKMInstsIssued(); wf->wrGmReqsInPipe--; wf->wrLmReqsInPipe--; return; @@ -40090,6 +40110,8 @@ namespace Gcn3ISA Wavefront *wf = gpuDynInst->wavefront(); if (wf->execMask().none()) { + wf->decVMemInstsIssued(); + wf->decLGKMInstsIssued(); wf->wrGmReqsInPipe--; wf->wrLmReqsInPipe--; return; @@ -40159,6 +40181,8 @@ namespace Gcn3ISA Wavefront *wf = gpuDynInst->wavefront(); if (wf->execMask().none()) { + wf->decVMemInstsIssued(); + wf->decLGKMInstsIssued(); wf->wrGmReqsInPipe--; wf->wrLmReqsInPipe--; return; @@ -40237,6 +40261,8 @@ namespace Gcn3ISA Wavefront *wf = gpuDynInst->wavefront(); if (wf->execMask().none()) { + wf->decVMemInstsIssued(); + wf->decLGKMInstsIssued(); wf->wrGmReqsInPipe--; wf->wrLmReqsInPipe--; return; @@ -40325,6 +40351,8 @@ namespace Gcn3ISA Wavefront *wf = gpuDynInst->wavefront(); if (wf->execMask().none()) { + wf->decVMemInstsIssued(); + wf->decLGKMInstsIssued(); wf->wrGmReqsInPipe--; wf->rdGmReqsInPipe--; return; @@ -40425,6 +40453,8 @@ namespace Gcn3ISA Wavefront *wf = gpuDynInst->wavefront(); if (wf->execMask().none()) { + wf->decVMemInstsIssued(); + wf->decLGKMInstsIssued(); wf->wrGmReqsInPipe--; wf->rdGmReqsInPipe--; return; @@ -40526,6 +40556,8 @@ namespace Gcn3ISA Wavefront *wf = gpuDynInst->wavefront(); if (wf->execMask().none()) { + wf->decVMemInstsIssued(); + wf->decLGKMInstsIssued(); wf->wrGmReqsInPipe--; wf->rdGmReqsInPipe--; return; @@ -40893,6 +40925,8 @@ namespace Gcn3ISA Wavefront *wf = gpuDynInst->wavefront(); if (wf->execMask().none()) { + wf->decVMemInstsIssued(); + wf->decLGKMInstsIssued(); wf->wrGmReqsInPipe--; wf->rdGmReqsInPipe--; return; @@ -40995,6 +41029,8 @@ namespace Gcn3ISA Wavefront *wf = gpuDynInst->wavefront(); if (wf->execMask().none()) { + wf->decVMemInstsIssued(); + wf->decLGKMInstsIssued(); wf->wrGmReqsInPipe--; wf->rdGmReqsInPipe--; return; diff --git a/src/gpu-compute/global_memory_pipeline.cc b/src/gpu-compute/global_memory_pipeline.cc index dcc80f061..9fc515aef 100644 --- a/src/gpu-compute/global_memory_pipeline.cc +++ b/src/gpu-compute/global_memory_pipeline.cc @@ -130,6 +130,7 @@ GlobalMemPipeline::exec() DPRINTF(GPUMem, "CU%d: WF[%d][%d]: Completing global mem instr %s\n", m->cu_id, m->simdId, m->wfSlotId, m->disassemble()); m->completeAcc(m); + w->decVMemInstsIssued(); if (m->isLoad() || m->isAtomicRet()) { w->computeUnit->vrf[w->simdId]-> diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc index 2a49522da..03ed68951 100644 --- a/src/gpu-compute/gpu_dyn_inst.cc +++ b/src/gpu-compute/gpu_dyn_inst.cc @@ -819,6 +819,7 @@ GPUDynInst::resolveFlatSegment(const VectorMask &mask) if (executedAs() == Enums::SC_GLOBAL) { // no transormation for global segment wavefront()->execUnitId = wavefront()->flatGmUnitId; + wavefront()->decLGKMInstsIssued(); if (isLoad()) { wavefront()->rdLmReqsInPipe--; } else if (isStore()) { @@ -838,6 +839,7 @@ GPUDynInst::resolveFlatSegment(const VectorMask &mask) } } wavefront()->execUnitId = wavefront()->flatLmUnitId; + wavefront()->decVMemInstsIssued(); if (isLoad()) { wavefront()->rdGmReqsInPipe--; } else if (isStore()) { @@ -897,6 +899,7 @@ GPUDynInst::resolveFlatSegment(const VectorMask &mask) } } wavefront()->execUnitId = wavefront()->flatLmUnitId; + wavefront()->decLGKMInstsIssued(); if (isLoad()) { wavefront()->rdGmReqsInPipe--; } else if (isStore()) { diff --git a/src/gpu-compute/local_memory_pipeline.cc b/src/gpu-compute/local_memory_pipeline.cc index df576907c..ca090e956 100644 --- a/src/gpu-compute/local_memory_pipeline.cc +++ b/src/gpu-compute/local_memory_pipeline.cc @@ -76,6 +76,7 @@ LocalMemPipeline::exec() DPRINTF(GPUMem, "CU%d: WF[%d][%d]: Completing local mem instr %s\n", m->cu_id, m->simdId, m->wfSlotId, m->disassemble()); m->completeAcc(m); + w->decLGKMInstsIssued(); if (m->isLoad() || m->isAtomicRet()) { w->computeUnit->vrf[w->simdId]-> diff --git a/src/gpu-compute/scalar_memory_pipeline.cc b/src/gpu-compute/scalar_memory_pipeline.cc index 35b4ca5e5..5e4496d51 100644 --- a/src/gpu-compute/scalar_memory_pipeline.cc +++ b/src/gpu-compute/scalar_memory_pipeline.cc @@ -85,6 +85,7 @@ ScalarMemPipeline::exec() } m->completeAcc(m); + w->decLGKMInstsIssued(); if (m->isLoad() || m->isAtomic()) { returnedLoads.pop(); diff --git a/src/gpu-compute/schedule_stage.cc b/src/gpu-compute/schedule_stage.cc index fb52b6dd1..005e6f61e 100644 --- a/src/gpu-compute/schedule_stage.cc +++ b/src/gpu-compute/schedule_stage.cc @@ -135,6 +135,15 @@ ScheduleStage::exec() // this wave spends in SCH stage. wf->schCycles++; addToSchListStalls[j]++; + } else { + if (gpu_dyn_inst->isScalar() || gpu_dyn_inst->isGroupSeg()) { + wf->incLGKMInstsIssued(); + } else { + wf->incVMemInstsIssued(); + if (gpu_dyn_inst->isFlat()) { + wf->incLGKMInstsIssued(); + } + } } } diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc index f72cd50fd..0e737db0e 100644 --- a/src/gpu-compute/wavefront.cc +++ b/src/gpu-compute/wavefront.cc @@ -53,6 +53,7 @@ Wavefront::Wavefront(const Params *p) : SimObject(p), wfSlotId(p->wf_slot_id), simdId(p->simdId), maxIbSize(p->max_ib_size), _gpuISA(*this), vmWaitCnt(-1), expWaitCnt(-1), lgkmWaitCnt(-1), + vmemInstsIssued(0), expInstsIssued(0), lgkmInstsIssued(0), barId(WFBarrier::InvalidID) { lastTrace = 0; @@ -1253,37 +1254,27 @@ Wavefront::waitCntsSatisfied() return false; } - // If we reach here, that means waitCnt instruction is executed and - // the waitcnts are set by the execute method. Check if waitcnts are - // satisfied. - - // current number of vector memory ops in flight - int vm_cnt = outstandingReqsWrGm + outstandingReqsRdGm; - - // current number of export insts or vector memory writes in flight - int exp_cnt = outstandingReqsWrGm; - - // current number of scalar/LDS memory ops in flight - // we do not consider GDS/message ops - int lgkm_cnt = outstandingReqsWrLm + outstandingReqsRdLm + - scalarOutstandingReqsRdGm + scalarOutstandingReqsWrGm; - + /** + * If we reach here, that means an s_waitcnt instruction was executed + * and the waitcnts are set by the execute method. Check if waitcnts + * are satisfied. + */ if (vmWaitCnt != -1) { - if (vm_cnt > vmWaitCnt) { + if (vmemInstsIssued > vmWaitCnt) { // vmWaitCnt not satisfied return false; } } if (expWaitCnt != -1) { - if (exp_cnt > expWaitCnt) { + if (expInstsIssued > expWaitCnt) { // expWaitCnt not satisfied return false; } } if (lgkmWaitCnt != -1) { - if (lgkm_cnt > lgkmWaitCnt) { + if (lgkmInstsIssued > lgkmWaitCnt) { // lgkmWaitCnt not satisfied return false; } @@ -1355,6 +1346,42 @@ Wavefront::clearWaitCnts() status = S_RUNNING; } +void +Wavefront::incVMemInstsIssued() +{ + ++vmemInstsIssued; +} + +void +Wavefront::incExpInstsIssued() +{ + ++expInstsIssued; +} + +void +Wavefront::incLGKMInstsIssued() +{ + ++lgkmInstsIssued; +} + +void +Wavefront::decVMemInstsIssued() +{ + --vmemInstsIssued; +} + +void +Wavefront::decExpInstsIssued() +{ + --expInstsIssued; +} + +void +Wavefront::decLGKMInstsIssued() +{ + --lgkmInstsIssued; +} + Addr Wavefront::pc() const { diff --git a/src/gpu-compute/wavefront.hh b/src/gpu-compute/wavefront.hh index e07af0ecc..34e45facf 100644 --- a/src/gpu-compute/wavefront.hh +++ b/src/gpu-compute/wavefront.hh @@ -304,6 +304,13 @@ class Wavefront : public SimObject void setWaitCnts(int vm_wait_cnt, int exp_wait_cnt, int lgkm_wait_cnt); void clearWaitCnts(); + void incVMemInstsIssued(); + void incExpInstsIssued(); + void incLGKMInstsIssued(); + void decVMemInstsIssued(); + void decExpInstsIssued(); + void decLGKMInstsIssued(); + /** Freeing VRF space */ void freeRegisterFile(); @@ -343,6 +350,9 @@ class Wavefront : public SimObject int vmWaitCnt; int expWaitCnt; int lgkmWaitCnt; + int vmemInstsIssued; + int expInstsIssued; + int lgkmInstsIssued; status_e status; Addr _pc; VectorMask _execMask; -- 2.30.2