gpu-compute: Simplify LGKM decrementing for Flat instructions

[gem5.git] / src / gpu-compute / global_memory_pipeline.cc
diff --git a/src/gpu-compute/global_memory_pipeline.cc b/src/gpu-compute/global_memory_pipeline.cc

index cfd7c3db1ced3af80c3618d2936cfc25b50ac140..2f251e864f2f1c352c27f7cd8f94c8375c121c8b 100644 (file)
--- a/src/gpu-compute/global_memory_pipeline.cc
+++ b/src/gpu-compute/global_memory_pipeline.cc
@@ -43,19 +43,19 @@
  #include "gpu-compute/vector_register_file.hh"
  #include "gpu-compute/wavefront.hh"
  
-GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams* p) :
-    computeUnit(nullptr), gmQueueSize(p->global_mem_queue_size),
-    maxWaveRequests(p->max_wave_requests), inflightStores(0),
-    inflightLoads(0)
+GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams &p,
+                                     ComputeUnit &cu)
+    : computeUnit(cu), _name(cu.name() + ".GlobalMemPipeline"),
+      gmQueueSize(p.global_mem_queue_size),
+      maxWaveRequests(p.max_wave_requests), inflightStores(0),
+      inflightLoads(0), stats(&cu)
  {
  }
  
  void
-GlobalMemPipeline::init(ComputeUnit *cu)
+GlobalMemPipeline::init()
  {
-    computeUnit = cu;
-    globalMemSize = computeUnit->shader->globalMemSize;
-    _name = computeUnit->name() + ".GlobalMemPipeline";
+    globalMemSize = computeUnit.shader->globalMemSize;
  }
  
  bool
@@ -121,15 +121,19 @@ GlobalMemPipeline::exec()
  
      }
  
-    if (m && m->latency.rdy() && computeUnit->glbMemToVrfBus.rdy() &&
-        accessVrf && (computeUnit->shader->coissue_return ||
-        computeUnit->vectorGlobalMemUnit.rdy())) {
+    if (m && m->latency.rdy() && computeUnit.glbMemToVrfBus.rdy() &&
+        accessVrf && (computeUnit.shader->coissue_return ||
+        computeUnit.vectorGlobalMemUnit.rdy())) {
  
          w = m->wavefront();
  
          DPRINTF(GPUMem, "CU%d: WF[%d][%d]: Completing global mem instr %s\n",
                  m->cu_id, m->simdId, m->wfSlotId, m->disassemble());
          m->completeAcc(m);
+        if (m->isFlat()) {
+            w->decLGKMInstsIssued();
+        }
+        w->decVMemInstsIssued();
  
          if (m->isLoad() || m->isAtomicRet()) {
              w->computeUnit->vrf[w->simdId]->
@@ -141,16 +145,16 @@ GlobalMemPipeline::exec()
          Tick accessTime = curTick() - m->getAccessTime();
  
          // Decrement outstanding requests count
-        computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
+        computeUnit.shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
          if (m->isStore() || m->isAtomic() || m->isMemSync()) {
-            computeUnit->shader->sampleStore(accessTime);
-            computeUnit->shader->ScheduleAdd(&w->outstandingReqsWrGm,
+            computeUnit.shader->sampleStore(accessTime);
+            computeUnit.shader->ScheduleAdd(&w->outstandingReqsWrGm,
                                               m->time, -1);
          }
  
          if (m->isLoad() || m->isAtomic() || m->isMemSync()) {
-            computeUnit->shader->sampleLoad(accessTime);
-            computeUnit->shader->ScheduleAdd(&w->outstandingReqsRdGm,
+            computeUnit.shader->sampleLoad(accessTime);
+            computeUnit.shader->ScheduleAdd(&w->outstandingReqsRdGm,
                                               m->time, -1);
          }
  
@@ -160,12 +164,12 @@ GlobalMemPipeline::exec()
          // going all the way to memory and stats for individual cache
          // blocks generated by the instruction.
          m->profileRoundTripTime(curTick(), InstMemoryHop::Complete);
-        computeUnit->shader->sampleInstRoundTrip(m->getRoundTripTime());
-        computeUnit->shader->sampleLineRoundTrip(m->getLineAddressTime());
+        computeUnit.shader->sampleInstRoundTrip(m->getRoundTripTime());
+        computeUnit.shader->sampleLineRoundTrip(m->getLineAddressTime());
  
          // Mark write bus busy for appropriate amount of time
-        computeUnit->glbMemToVrfBus.set(m->time);
-        if (!computeUnit->shader->coissue_return)
+        computeUnit.glbMemToVrfBus.set(m->time);
+        if (!computeUnit.shader->coissue_return)
              w->computeUnit->vectorGlobalMemUnit.set(m->time);
      }
  
@@ -192,6 +196,10 @@ GlobalMemPipeline::exec()
                  mp->disassemble(), mp->seqNum());
          mp->initiateAcc(mp);
  
+        if (mp->isStore() && mp->isGlobalSeg()) {
+            mp->wavefront()->decExpInstsIssued();
+        }
+
          if (((mp->isMemSync() && !mp->isEndOfKernel()) || !mp->isMemSync())) {
              /**
               * if we are not in out-of-order data delivery mode
@@ -217,13 +225,13 @@ GlobalMemPipeline::exec()
              * correctly.
              */
              handleResponse(mp);
-            computeUnit->getTokenManager()->recvTokens(1);
+            computeUnit.getTokenManager()->recvTokens(1);
          }
  
          gmIssuedRequests.pop();
  
          DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping 0 mem_op = \n",
-                computeUnit->cu_id, mp->simdId, mp->wfSlotId);
+                computeUnit.cu_id, mp->simdId, mp->wfSlotId);
      }
  }
  
@@ -281,12 +289,10 @@ GlobalMemPipeline::handleResponse(GPUDynInstPtr gpuDynInst)
      mem_req->second.second = true;
  }
  
-void
-GlobalMemPipeline::regStats()
+GlobalMemPipeline::
+GlobalMemPipelineStats::GlobalMemPipelineStats(Stats::Group *parent)
+    : Stats::Group(parent, "GlobalMemPipeline"),
+      ADD_STAT(loadVrfBankConflictCycles, "total number of cycles GM data "
+               "are delayed before updating the VRF")
  {
-    loadVrfBankConflictCycles
-        .name(name() + ".load_vrf_bank_conflict_cycles")
-        .desc("total number of cycles GM data are delayed before updating "
-              "the VRF")
-        ;
  }