gpu-compute: Simplify LGKM decrementing for Flat instructions
[gem5.git] / src / gpu-compute / global_memory_pipeline.cc
index cfd7c3db1ced3af80c3618d2936cfc25b50ac140..2f251e864f2f1c352c27f7cd8f94c8375c121c8b 100644 (file)
 #include "gpu-compute/vector_register_file.hh"
 #include "gpu-compute/wavefront.hh"
 
-GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams* p) :
-    computeUnit(nullptr), gmQueueSize(p->global_mem_queue_size),
-    maxWaveRequests(p->max_wave_requests), inflightStores(0),
-    inflightLoads(0)
+GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams &p,
+                                     ComputeUnit &cu)
+    : computeUnit(cu), _name(cu.name() + ".GlobalMemPipeline"),
+      gmQueueSize(p.global_mem_queue_size),
+      maxWaveRequests(p.max_wave_requests), inflightStores(0),
+      inflightLoads(0), stats(&cu)
 {
     // Intentionally empty: every member is set in the initializer list.
     // The owning compute unit is bound at construction (previously it was
     // nullptr until init(cu) ran), the name is derived from cu.name(), the
     // queue limits are read from the params object, both in-flight
     // counters start at zero, and the stats member is handed &cu.
 }
 
 /**
  * Copy the shader's globalMemSize into this pipeline's globalMemSize.
  *
  * The owning compute unit and the pipeline name are now set by the
  * constructor, so init() no longer takes a ComputeUnit pointer and only
  * performs this one assignment, which dereferences computeUnit.shader.
  */
 void
-GlobalMemPipeline::init(ComputeUnit *cu)
+GlobalMemPipeline::init()
 {
-    computeUnit = cu;
-    globalMemSize = computeUnit->shader->globalMemSize;
-    _name = computeUnit->name() + ".GlobalMemPipeline";
+    globalMemSize = computeUnit.shader->globalMemSize;
 }
 
 bool
@@ -121,15 +121,19 @@ GlobalMemPipeline::exec()
 
     }
 
-    if (m && m->latency.rdy() && computeUnit->glbMemToVrfBus.rdy() &&
-        accessVrf && (computeUnit->shader->coissue_return ||
-        computeUnit->vectorGlobalMemUnit.rdy())) {
+    if (m && m->latency.rdy() && computeUnit.glbMemToVrfBus.rdy() &&
+        accessVrf && (computeUnit.shader->coissue_return ||
+        computeUnit.vectorGlobalMemUnit.rdy())) {
 
         w = m->wavefront();
 
         DPRINTF(GPUMem, "CU%d: WF[%d][%d]: Completing global mem instr %s\n",
                 m->cu_id, m->simdId, m->wfSlotId, m->disassemble());
         m->completeAcc(m);
+        if (m->isFlat()) {
+            w->decLGKMInstsIssued();
+        }
+        w->decVMemInstsIssued();
 
         if (m->isLoad() || m->isAtomicRet()) {
             w->computeUnit->vrf[w->simdId]->
@@ -141,16 +145,16 @@ GlobalMemPipeline::exec()
         Tick accessTime = curTick() - m->getAccessTime();
 
         // Decrement outstanding requests count
-        computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
+        computeUnit.shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
         if (m->isStore() || m->isAtomic() || m->isMemSync()) {
-            computeUnit->shader->sampleStore(accessTime);
-            computeUnit->shader->ScheduleAdd(&w->outstandingReqsWrGm,
+            computeUnit.shader->sampleStore(accessTime);
+            computeUnit.shader->ScheduleAdd(&w->outstandingReqsWrGm,
                                              m->time, -1);
         }
 
         if (m->isLoad() || m->isAtomic() || m->isMemSync()) {
-            computeUnit->shader->sampleLoad(accessTime);
-            computeUnit->shader->ScheduleAdd(&w->outstandingReqsRdGm,
+            computeUnit.shader->sampleLoad(accessTime);
+            computeUnit.shader->ScheduleAdd(&w->outstandingReqsRdGm,
                                              m->time, -1);
         }
 
@@ -160,12 +164,12 @@ GlobalMemPipeline::exec()
         // going all the way to memory and stats for individual cache
         // blocks generated by the instruction.
         m->profileRoundTripTime(curTick(), InstMemoryHop::Complete);
-        computeUnit->shader->sampleInstRoundTrip(m->getRoundTripTime());
-        computeUnit->shader->sampleLineRoundTrip(m->getLineAddressTime());
+        computeUnit.shader->sampleInstRoundTrip(m->getRoundTripTime());
+        computeUnit.shader->sampleLineRoundTrip(m->getLineAddressTime());
 
         // Mark write bus busy for appropriate amount of time
-        computeUnit->glbMemToVrfBus.set(m->time);
-        if (!computeUnit->shader->coissue_return)
+        computeUnit.glbMemToVrfBus.set(m->time);
+        if (!computeUnit.shader->coissue_return)
             w->computeUnit->vectorGlobalMemUnit.set(m->time);
     }
 
@@ -192,6 +196,10 @@ GlobalMemPipeline::exec()
                 mp->disassemble(), mp->seqNum());
         mp->initiateAcc(mp);
 
+        if (mp->isStore() && mp->isGlobalSeg()) {
+            mp->wavefront()->decExpInstsIssued();
+        }
+
         if (((mp->isMemSync() && !mp->isEndOfKernel()) || !mp->isMemSync())) {
             /**
              * if we are not in out-of-order data delivery mode
@@ -217,13 +225,13 @@ GlobalMemPipeline::exec()
             * correctly.
             */
             handleResponse(mp);
-            computeUnit->getTokenManager()->recvTokens(1);
+            computeUnit.getTokenManager()->recvTokens(1);
         }
 
         gmIssuedRequests.pop();
 
         DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping 0 mem_op = \n",
-                computeUnit->cu_id, mp->simdId, mp->wfSlotId);
+                computeUnit.cu_id, mp->simdId, mp->wfSlotId);
     }
 }
 
@@ -281,12 +289,10 @@ GlobalMemPipeline::handleResponse(GPUDynInstPtr gpuDynInst)
     mem_req->second.second = true;
 }
 
-void
-GlobalMemPipeline::regStats()
+GlobalMemPipeline::
+GlobalMemPipelineStats::GlobalMemPipelineStats(Stats::Group *parent)
+    : Stats::Group(parent, "GlobalMemPipeline"),
+      ADD_STAT(loadVrfBankConflictCycles, "total number of cycles GM data "
+               "are delayed before updating the VRF")
 {
     // Intentionally empty: loadVrfBankConflictCycles and its description
     // are supplied through ADD_STAT in the initializer list, replacing the
     // .name()/.desc() chain that the removed regStats() body used. The
     // group is constructed with the given parent and the name
     // "GlobalMemPipeline".
-    loadVrfBankConflictCycles
-        .name(name() + ".load_vrf_bank_conflict_cycles")
-        .desc("total number of cycles GM data are delayed before updating "
-              "the VRF")
-        ;
 }