gpu-compute: Simplify LGKM decrementing for Flat instructions
index 64778f011651e929db3868b5b441ef0056414573..2f251e864f2f1c352c27f7cd8f94c8375c121c8b 100644
--- a/src/gpu-compute/global_memory_pipeline.cc
+++ b/src/gpu-compute/global_memory_pipeline.cc
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
-#include "gpu-compute/global_memory_pipeline.hh"
-
+#define __STDC_FORMAT_MACROS
+#include <cinttypes>
 #include "debug/GPUCoalescer.hh"
 #include "debug/GPUMem.hh"
 #include "debug/GPUReg.hh"
 #include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/global_memory_pipeline.hh"
 #include "gpu-compute/gpu_dyn_inst.hh"
 #include "gpu-compute/shader.hh"
 #include "gpu-compute/vector_register_file.hh"
 #include "gpu-compute/wavefront.hh"
 
-GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams* p) :
-    computeUnit(nullptr), gmQueueSize(p->global_mem_queue_size),
-    outOfOrderDataDelivery(p->out_of_order_data_delivery), inflightStores(0),
-    inflightLoads(0)
+GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams &p,
+                                     ComputeUnit &cu)
+    : computeUnit(cu), _name(cu.name() + ".GlobalMemPipeline"),
+      gmQueueSize(p.global_mem_queue_size),
+      maxWaveRequests(p.max_wave_requests), inflightStores(0),
+      inflightLoads(0), stats(&cu)
 {
 }
 
 void
-GlobalMemPipeline::init(ComputeUnit *cu)
+GlobalMemPipeline::init()
 {
-    computeUnit = cu;
-    globalMemSize = computeUnit->shader->globalMemSize;
-    _name = computeUnit->name() + ".GlobalMemPipeline";
+    globalMemSize = computeUnit.shader->globalMemSize;
 }
 
 bool
@@ -76,6 +77,31 @@ GlobalMemPipeline::coalescerReady(GPUDynInstPtr mp) const
     return true;
 }
 
+void
+GlobalMemPipeline::acqCoalescerToken(GPUDynInstPtr mp)
+{
+    // We require one token from the coalescer's uncoalesced table to
+    // proceed
+    int token_count = 1;
+
+    DPRINTF(GPUCoalescer, "Acquiring %d token(s)\n", token_count);
+    assert(mp->computeUnit()->getTokenManager()->haveTokens(token_count));
+    mp->computeUnit()->getTokenManager()->acquireTokens(token_count);
+}
+
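
For reference, the token protocol this helper participates in can be modeled in isolation. Below is a minimal standalone sketch; the TokenManager here is a simplified stand-in built only from the calls visible in this diff (haveTokens(), acquireTokens(), and the recvTokens() seen later in exec()), not gem5's actual class:

    // Simplified stand-in for the coalescer token pool; illustrative only.
    #include <cassert>

    class TokenManager
    {
        int available;

      public:
        explicit TokenManager(int n) : available(n) { }

        bool haveTokens(int n) const { return available >= n; }
        // Issue side: acqCoalescerToken() reserves one uncoalesced-table slot.
        void acquireTokens(int n) { assert(haveTokens(n)); available -= n; }
        // Response side: the coalescer hands the slot back.
        void recvTokens(int n) { available += n; }
    };

    int main()
    {
        TokenManager tm(4);      // four free slots in the uncoalesced table
        tm.acquireTokens(1);     // what acqCoalescerToken() does per request
        assert(!tm.haveTokens(4));
        tm.recvTokens(1);        // slot returned when the request retires
        assert(tm.haveTokens(4));
        return 0;
    }
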
+bool
+GlobalMemPipeline::outstandingReqsCheck(GPUDynInstPtr mp) const
+{
+    // Ensure we haven't exceeded the maximum number of vmem requests
+    // for this wavefront
+    if ((mp->wavefront()->outstandingReqsRdGm
+         + mp->wavefront()->outstandingReqsWrGm) >= maxWaveRequests) {
+        return false;
+    }
+
+    return true;
+}
+
 void
 GlobalMemPipeline::exec()
 {
@@ -87,42 +113,64 @@ GlobalMemPipeline::exec()
 
     // check the VRF to see if the operands of a load (or load component
     // of an atomic) are accessible
-    if ((m) && (m->isLoad() || m->isAtomicRet())) {
+    if (m && (m->isLoad() || m->isAtomicRet())) {
         w = m->wavefront();
 
-        accessVrf =
-            w->computeUnit->vrf[w->simdId]->
-                vrfOperandAccessReady(m->seqNum(), w, m, VrfAccessType::WRITE);
+        accessVrf = w->computeUnit->vrf[w->simdId]->
+            canScheduleWriteOperandsFromLoad(w, m);
     }
 
-    if (m && m->latency.rdy() && computeUnit->glbMemToVrfBus.rdy() &&
-        accessVrf && m->statusBitVector == VectorMask(0) &&
-        (computeUnit->shader->coissue_return ||
-        computeUnit->wfWait.at(m->pipeId).rdy())) {
+    if (m && m->latency.rdy() && computeUnit.glbMemToVrfBus.rdy() &&
+        accessVrf && (computeUnit.shader->coissue_return ||
+        computeUnit.vectorGlobalMemUnit.rdy())) {
 
         w = m->wavefront();
 
+        DPRINTF(GPUMem, "CU%d: WF[%d][%d]: Completing global mem instr %s\n",
+                m->cu_id, m->simdId, m->wfSlotId, m->disassemble());
         m->completeAcc(m);
+        if (m->isFlat()) {
+            w->decLGKMInstsIssued();
+        }
+        w->decVMemInstsIssued();
+
+        if (m->isLoad() || m->isAtomicRet()) {
+            w->computeUnit->vrf[w->simdId]->
+                scheduleWriteOperandsFromLoad(w, m);
+        }
 
         completeRequest(m);
 
-        // Decrement outstanding register count
-        computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
+        Tick accessTime = curTick() - m->getAccessTime();
 
-        if (m->isStore() || m->isAtomic()) {
-            computeUnit->shader->ScheduleAdd(&w->outstandingReqsWrGm,
+        // Decrement outstanding requests count
+        computeUnit.shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
+        if (m->isStore() || m->isAtomic() || m->isMemSync()) {
+            computeUnit.shader->sampleStore(accessTime);
+            computeUnit.shader->ScheduleAdd(&w->outstandingReqsWrGm,
                                              m->time, -1);
         }
 
-        if (m->isLoad() || m->isAtomic()) {
-            computeUnit->shader->ScheduleAdd(&w->outstandingReqsRdGm,
+        if (m->isLoad() || m->isAtomic() || m->isMemSync()) {
+            computeUnit.shader->sampleLoad(accessTime);
+            computeUnit.shader->ScheduleAdd(&w->outstandingReqsRdGm,
                                              m->time, -1);
         }
 
+        w->validateRequestCounters();
+
+        // Generate stats for the round-trip time of vector memory insts
+        // going all the way to memory, and for the individual cache
+        // blocks generated by the instruction.
+        m->profileRoundTripTime(curTick(), InstMemoryHop::Complete);
+        computeUnit.shader->sampleInstRoundTrip(m->getRoundTripTime());
+        computeUnit.shader->sampleLineRoundTrip(m->getLineAddressTime());
+
         // Mark write bus busy for appropriate amount of time
-        computeUnit->glbMemToVrfBus.set(m->time);
-        if (!computeUnit->shader->coissue_return)
-            w->computeUnit->wfWait.at(m->pipeId).set(m->time);
+        computeUnit.glbMemToVrfBus.set(m->time);
+        if (!computeUnit.shader->coissue_return)
+            w->computeUnit->vectorGlobalMemUnit.set(m->time);
     }
 
     // If pipeline has executed a global memory instruction
@@ -146,15 +194,13 @@ GlobalMemPipeline::exec()
 
         DPRINTF(GPUCoalescer, "initiateAcc for %s seqNum %d\n",
                 mp->disassemble(), mp->seqNum());
-        // Memfences will not return tokens and must be issued so we should
-        // not request one as this will deplete the token count until deadlock
-        if (!mp->isMemFence()) {
-            assert(mp->computeUnit()->getTokenManager()->haveTokens(1));
-            mp->computeUnit()->getTokenManager()->acquireTokens(1);
-        }
         mp->initiateAcc(mp);
 
-        if (!outOfOrderDataDelivery && !mp->isMemFence()) {
+        if (mp->isStore() && mp->isGlobalSeg()) {
+            mp->wavefront()->decExpInstsIssued();
+        }
+
+        if ((mp->isMemSync() && !mp->isEndOfKernel()) || !mp->isMemSync()) {
             /**
              * if we are not in out-of-order data delivery mode
              * then we keep the responses sorted in program order.
@@ -168,29 +214,35 @@ GlobalMemPipeline::exec()
                 std::make_pair(mp, false)));
         }
 
+        if (!mp->isMemSync() && !mp->isEndOfKernel() && mp->allLanesZero()) {
+            /**
+             * Memory access instructions that do not generate any memory
+             * requests (such as out-of-bounds buffer accesses where all
+             * lanes are out of bounds) will not trigger a callback to
+             * complete the request, so we need to mark it as completed as
+             * soon as it is issued. Note that this will still insert an
+             * entry in the ordered return FIFO such that waitcnt is still
+             * resolved correctly.
+             */
+            handleResponse(mp);
+            computeUnit.getTokenManager()->recvTokens(1);
+        }
+
         gmIssuedRequests.pop();
 
         DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping 0 mem_op = \n",
-                computeUnit->cu_id, mp->simdId, mp->wfSlotId);
+                computeUnit.cu_id, mp->simdId, mp->wfSlotId);
     }
 }
 
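The change named in the commit title is visible above: a Flat instruction holds both a vector-memory slot and an LGKM slot, so its completion in this pipeline decrements both counters via decVMemInstsIssued() and decLGKMInstsIssued(). A minimal standalone sketch of that bookkeeping, with an assumed Wavefront stand-in (not gem5's class):

    #include <cassert>

    struct Wavefront
    {
        int vmemInstsIssued = 0;   // vector memory (VMCNT) bookkeeping
        int lgkmInstsIssued = 0;   // LDS/GDS/constant (LGKMCNT) bookkeeping

        // Flat instructions count against both counters at issue.
        void issueFlat() { ++vmemInstsIssued; ++lgkmInstsIssued; }

        // Completion in the global memory pipeline, as in exec() above.
        void completeGlobal(bool isFlat)
        {
            if (isFlat)
                --lgkmInstsIssued;   // the simplification: decrement here
            --vmemInstsIssued;
            assert(vmemInstsIssued >= 0 && lgkmInstsIssued >= 0);
        }
    };

    int main()
    {
        Wavefront w;
        w.issueFlat();
        w.completeGlobal(true);   // both counters return to zero together
        assert(w.vmemInstsIssued == 0 && w.lgkmInstsIssued == 0);
        return 0;
    }
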
 GPUDynInstPtr
 GlobalMemPipeline::getNextReadyResp()
 {
-    if (outOfOrderDataDelivery) {
-        if (!gmReturnedLoads.empty()) {
-            return gmReturnedLoads.front();
-        } else if (!gmReturnedStores.empty()) {
-            return gmReturnedStores.front();
-        }
-    } else {
-        if (!gmOrderedRespBuffer.empty()) {
-            auto mem_req = gmOrderedRespBuffer.begin();
+    if (!gmOrderedRespBuffer.empty()) {
+        auto mem_req = gmOrderedRespBuffer.begin();
 
-            if (mem_req->second.second) {
-                return mem_req->second.first;
-            }
+        if (mem_req->second.second) {
+            return mem_req->second.first;
         }
     }
 
@@ -208,59 +260,39 @@ GlobalMemPipeline::completeRequest(GPUDynInstPtr gpuDynInst)
         --inflightStores;
     }
 
-    if (outOfOrderDataDelivery) {
-        if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
-            assert(!gmReturnedLoads.empty());
-            gmReturnedLoads.pop();
-        } else if (gpuDynInst->isStore()) {
-            assert(!gmReturnedStores.empty());
-            gmReturnedStores.pop();
-        }
-    } else {
-        // we should only pop the oldest requst, and it
-        // should be marked as done if we are here
-        assert(gmOrderedRespBuffer.begin()->first == gpuDynInst->seqNum());
-        assert(gmOrderedRespBuffer.begin()->second.first == gpuDynInst);
-        assert(gmOrderedRespBuffer.begin()->second.second);
-        // remove this instruction from the buffer by its
-        // unique seq ID
-        gmOrderedRespBuffer.erase(gpuDynInst->seqNum());
-    }
+    // we should only pop the oldest request, and it
+    // should be marked as done if we are here
+    assert(gmOrderedRespBuffer.begin()->first == gpuDynInst->seqNum());
+    assert(gmOrderedRespBuffer.begin()->second.first == gpuDynInst);
+    assert(gmOrderedRespBuffer.begin()->second.second);
+    // remove this instruction from the buffer by its
+    // unique seq ID
+    gmOrderedRespBuffer.erase(gpuDynInst->seqNum());
 }
 
 void
 GlobalMemPipeline::issueRequest(GPUDynInstPtr gpuDynInst)
 {
+    gpuDynInst->setAccessTime(curTick());
+    gpuDynInst->profileRoundTripTime(curTick(), InstMemoryHop::Initiate);
     gmIssuedRequests.push(gpuDynInst);
 }
 
 void
 GlobalMemPipeline::handleResponse(GPUDynInstPtr gpuDynInst)
 {
-    if (outOfOrderDataDelivery) {
-        if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
-            assert(isGMLdRespFIFOWrRdy());
-            gmReturnedLoads.push(gpuDynInst);
-        } else {
-            assert(isGMStRespFIFOWrRdy());
-            gmReturnedStores.push(gpuDynInst);
-        }
-    } else {
-        auto mem_req = gmOrderedRespBuffer.find(gpuDynInst->seqNum());
-        // if we are getting a response for this mem request,
-        // then it ought to already be in the ordered response
-        // buffer
-        assert(mem_req != gmOrderedRespBuffer.end());
-        mem_req->second.second = true;
-    }
+    auto mem_req = gmOrderedRespBuffer.find(gpuDynInst->seqNum());
+    // if we are getting a response for this mem request,
+    // then it ought to already be in the ordered response
+    // buffer
+    assert(mem_req != gmOrderedRespBuffer.end());
+    mem_req->second.second = true;
 }
 
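Taken together, issueRequest(), handleResponse(), getNextReadyResp(), and completeRequest() implement in-order delivery over a sequence-number-keyed map: entries are reserved in program order at issue, marked done as responses arrive (possibly out of order), and only the oldest done entry may complete. A condensed standalone model of that scheme (InstPtr and the literal seq numbers are stand-ins for GPUDynInstPtr and seqNum()):

    #include <cassert>
    #include <cstdint>
    #include <map>
    #include <utility>

    using SeqNum = uint64_t;
    using InstPtr = int;   // stand-in for GPUDynInstPtr

    std::map<SeqNum, std::pair<InstPtr, bool>> orderedRespBuffer;

    // issue time: reserve a slot in program order (done flag == false)
    void issue(SeqNum s, InstPtr m) { orderedRespBuffer.insert({s, {m, false}}); }

    // response time: mark done; the entry must already exist
    void respond(SeqNum s)
    {
        auto it = orderedRespBuffer.find(s);
        assert(it != orderedRespBuffer.end());
        it->second.second = true;
    }

    // completion: only the oldest entry, and only once marked done
    bool nextReady(InstPtr &m)
    {
        if (!orderedRespBuffer.empty() &&
            orderedRespBuffer.begin()->second.second) {
            m = orderedRespBuffer.begin()->second.first;
            return true;
        }
        return false;
    }

    int main()
    {
        issue(1, 10);
        issue(2, 20);
        respond(2);                // younger response arrives first
        InstPtr m;
        assert(!nextReady(m));     // still blocked behind seq 1
        respond(1);
        assert(nextReady(m) && m == 10);   // oldest completes first
        return 0;
    }
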
-void
-GlobalMemPipeline::regStats()
+GlobalMemPipeline::
+GlobalMemPipelineStats::GlobalMemPipelineStats(Stats::Group *parent)
+    : Stats::Group(parent, "GlobalMemPipeline"),
+      ADD_STAT(loadVrfBankConflictCycles, "total number of cycles GM data "
+               "are delayed before updating the VRF")
 {
-    loadVrfBankConflictCycles
-        .name(name() + ".load_vrf_bank_conflict_cycles")
-        .desc("total number of cycles GM data are delayed before updating "
-              "the VRF")
-        ;
 }
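
The stat itself is unchanged; this hunk only migrates it from the legacy regStats() registration to the Stats::Group idiom, where each stat registers itself against a parent group in its constructor via ADD_STAT. A sketch of the matching header-side declaration, assuming the usual layout of such gem5 stat groups (the actual header is not part of this diff):

    struct GlobalMemPipelineStats : public Stats::Group
    {
        GlobalMemPipelineStats(Stats::Group *parent);

        // cycles GM data are delayed before updating the VRF
        Stats::Scalar loadVrfBankConflictCycles;
    };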