gpu-compute: Use refs to CU in pipe stages/mem pipes

author Tony Gutierrez <anthony.gutierrez@amd.com>
Fri, 29 Jun 2018 21:39:53 +0000 (17:39 -0400)

committer Anthony Gutierrez <anthony.gutierrez@amd.com>
Fri, 17 Jul 2020 16:34:36 +0000 (16:34 +0000)

diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc

index 653c074bc6ee41fcc3922a032956b9dee45db5dc..a59a7fd6e24e2f6d91ca1c85f4a06d3017bdc348 100644 (file)
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -67,13 +67,13 @@ ComputeUnit::ComputeUnit(const Params *p) : ClockedObject(p),
      vrfToCoalescerBusWidth(p->vrf_to_coalescer_bus_width),
      coalescerToVrfBusWidth(p->coalescer_to_vrf_bus_width),
      registerManager(p->register_manager),
-    fetchStage(p, this),
-    scoreboardCheckStage(p, this),
-    scheduleStage(p, this),
-    execStage(p, this),
-    globalMemoryPipe(p, this),
-    localMemoryPipe(p, this),
-    scalarMemoryPipe(p, this),
+    fetchStage(p, *this),
+    scoreboardCheckStage(p, *this),
+    scheduleStage(p, *this),
+    execStage(p, *this),
+    globalMemoryPipe(p, *this),
+    localMemoryPipe(p, *this),
+    scalarMemoryPipe(p, *this),
      tickEvent([this]{ exec(); }, "Compute unit tick event",
            false, Event::CPU_Tick_Pri),
      cu_id(p->cu_id),
diff --git a/src/gpu-compute/exec_stage.cc b/src/gpu-compute/exec_stage.cc

index e420579c97da2dbe843717a00d8b4dcd14f9ffe7..2b0a79785c6795d9d6b62c2756da6ada30fda51f 100644 (file)
--- a/src/gpu-compute/exec_stage.cc
+++ b/src/gpu-compute/exec_stage.cc
@@ -41,10 +41,10 @@
  #include "gpu-compute/vector_register_file.hh"
  #include "gpu-compute/wavefront.hh"
  
-ExecStage::ExecStage(const ComputeUnitParams *p, ComputeUnit *cu)
+ExecStage::ExecStage(const ComputeUnitParams *p, ComputeUnit &cu)
      : computeUnit(cu), lastTimeInstExecuted(false),
        thisTimeInstExecuted(false), instrExecuted (false),
-      executionResourcesUsed(0), _name(cu->name() + ".ExecStage")
+      executionResourcesUsed(0), _name(cu.name() + ".ExecStage")
  
  {
      numTransActiveIdle = 0;
@@ -54,7 +54,7 @@ ExecStage::ExecStage(const ComputeUnitParams *p, ComputeUnit *cu)
  void
  ExecStage::init()
  {
-    dispatchList = &computeUnit->dispatchList;
+    dispatchList = &computeUnit.dispatchList;
      idle_dur = 0;
  }
  
@@ -127,7 +127,7 @@ ExecStage::dumpDispList()
  {
      std::stringstream ss;
      bool empty = true;
-    for (int i = 0; i < computeUnit->numExeUnits(); i++) {
+    for (int i = 0; i < computeUnit.numExeUnits(); i++) {
          DISPATCH_STATUS s = dispatchList->at(i).second;
          ss << i << ": " << dispStatusToStr(s);
          if (s != EMPTY) {
@@ -151,7 +151,7 @@ ExecStage::exec()
      if (Debug::GPUSched) {
          dumpDispList();
      }
-    for (int unitId = 0; unitId < computeUnit->numExeUnits(); ++unitId) {
+    for (int unitId = 0; unitId < computeUnit.numExeUnits(); ++unitId) {
          DISPATCH_STATUS s = dispatchList->at(unitId).second;
          switch (s) {
          case EMPTY:
@@ -168,7 +168,7 @@ ExecStage::exec()
                      (w->instructionBuffer.front())->disassemble());
              DPRINTF(GPUSched, "dispatchList[%d] EXREADY->EMPTY\n", unitId);
              dispatchList->at(unitId).first->exec();
-            (computeUnit->scheduleStage).deleteFromSch(w);
+            (computeUnit.scheduleStage).deleteFromSch(w);
              dispatchList->at(unitId).second = EMPTY;
              dispatchList->at(unitId).first->freeResources();
              dispatchList->at(unitId).first = nullptr;
@@ -208,7 +208,7 @@ ExecStage::regStats()
          ;
  
      spc
-        .init(0, computeUnit->numExeUnits(), 1)
+        .init(0, computeUnit.numExeUnits(), 1)
          .name(name() + ".spc")
          .desc("Execution units active per cycle (Exec unit=SIMD,MemPipe)")
          ;
@@ -220,26 +220,26 @@ ExecStage::regStats()
          ;
  
      numCyclesWithInstrTypeIssued
-        .init(computeUnit->numExeUnits())
+        .init(computeUnit.numExeUnits())
          .name(name() + ".num_cycles_issue_exec_rsrc")
          .desc("Number of cycles at least one instruction issued to "
                "execution resource type")
          ;
  
      numCyclesWithNoInstrTypeIssued
-        .init(computeUnit->numExeUnits())
+        .init(computeUnit.numExeUnits())
         .name(name() + ".num_cycles_no_issue_exec_rsrc")
         .desc("Number of clks no instructions issued to execution "
               "resource type")
         ;
  
      int c = 0;
-    for (int i = 0; i < computeUnit->numVectorALUs; i++,c++) {
+    for (int i = 0; i < computeUnit.numVectorALUs; i++,c++) {
          std::string s = "VectorALU" + std::to_string(i);
          numCyclesWithNoInstrTypeIssued.subname(c, s);
          numCyclesWithInstrTypeIssued.subname(c, s);
      }
-    for (int i = 0; i < computeUnit->numScalarALUs; i++,c++) {
+    for (int i = 0; i < computeUnit.numScalarALUs; i++,c++) {
          std::string s = "ScalarALU" + std::to_string(i);
          numCyclesWithNoInstrTypeIssued.subname(c, s);
          numCyclesWithInstrTypeIssued.subname(c, s);
diff --git a/src/gpu-compute/exec_stage.hh b/src/gpu-compute/exec_stage.hh

index f984d729cdde5f7fcaa3e9ca47114edbb0464271..cd4343e6d54098b63fb5cb0fdff6e56b47a1f2bf 100644 (file)
--- a/src/gpu-compute/exec_stage.hh
+++ b/src/gpu-compute/exec_stage.hh
@@ -69,7 +69,7 @@ enum DISPATCH_STATUS
  class ExecStage
  {
    public:
-    ExecStage(const ComputeUnitParams* p, ComputeUnit *cu);
+    ExecStage(const ComputeUnitParams* p, ComputeUnit &cu);
      ~ExecStage() { }
      void init();
      void exec();
@@ -77,7 +77,7 @@ class ExecStage
      std::string dispStatusToStr(int j);
      void dumpDispList();
  
-    std::string name() { return _name; }
+    const std::string& name() const { return _name; }
      void regStats();
      // number of idle cycles
      Stats::Scalar numCyclesWithNoIssue;
@@ -96,7 +96,7 @@ class ExecStage
    private:
      void collectStatistics(enum STAT_STATUS stage, int unitId);
      void initStatistics();
-    ComputeUnit *computeUnit;
+    ComputeUnit &computeUnit;
  
      // List of waves which will be dispatched to
      // each execution resource. A FILLED implies
@@ -115,7 +115,7 @@ class ExecStage
      Stats::Distribution idleDur;
      int executionResourcesUsed;
      uint64_t idle_dur;
-    std::string _name;
+    const std::string _name;
  };
  
  #endif // __EXEC_STAGE_HH__
diff --git a/src/gpu-compute/fetch_stage.cc b/src/gpu-compute/fetch_stage.cc

index b9df6ce4f8e5aebbe9140d7565e4bccd9d614ee3..6c3b8f40ca2f6c89a4aa4242828c3960549502de 100644 (file)
--- a/src/gpu-compute/fetch_stage.cc
+++ b/src/gpu-compute/fetch_stage.cc
@@ -36,9 +36,9 @@
  #include "gpu-compute/compute_unit.hh"
  #include "gpu-compute/wavefront.hh"
  
-FetchStage::FetchStage(const ComputeUnitParams* p, ComputeUnit *cu)
+FetchStage::FetchStage(const ComputeUnitParams* p, ComputeUnit &cu)
      : numVectorALUs(p->num_SIMDs), computeUnit(cu),
-      _name(cu->name() + ".FetchStage")
+      _name(cu.name() + ".FetchStage")
  {
      for (int j = 0; j < numVectorALUs; ++j) {
          FetchUnit newFetchUnit(p, cu);
@@ -55,7 +55,7 @@ void
  FetchStage::init()
  {
      for (int j = 0; j < numVectorALUs; ++j) {
-        _fetchUnit[j].bindWaveList(&computeUnit->wfList[j]);
+        _fetchUnit[j].bindWaveList(&computeUnit.wfList[j]);
          _fetchUnit[j].init();
      }
  }
diff --git a/src/gpu-compute/fetch_stage.hh b/src/gpu-compute/fetch_stage.hh

index afecce7cb6f4f8c5181314754d95cc1ed3a04795..ea556611a2761d9fb7f29b9e3000a26797900c87 100644 (file)
--- a/src/gpu-compute/fetch_stage.hh
+++ b/src/gpu-compute/fetch_stage.hh
@@ -51,7 +51,7 @@ class Wavefront;
  class FetchStage
  {
    public:
-    FetchStage(const ComputeUnitParams* p, ComputeUnit *cu);
+    FetchStage(const ComputeUnitParams* p, ComputeUnit &cu);
      ~FetchStage();
      void init();
      void exec();
@@ -59,19 +59,19 @@ class FetchStage
      void fetch(PacketPtr pkt, Wavefront *wave);
  
      // Stats related variables and methods
-    std::string name() { return _name; }
+    const std::string& name() const { return _name; }
      void regStats();
      Stats::Distribution instFetchInstReturned;
      FetchUnit &fetchUnit(int simdId) { return _fetchUnit.at(simdId); }
  
    private:
      int numVectorALUs;
-    ComputeUnit *computeUnit;
+    ComputeUnit &computeUnit;
  
      // List of fetch units. A fetch unit is
      // instantiated per VALU/SIMD
      std::vector<FetchUnit> _fetchUnit;
-    std::string _name;
+    const std::string _name;
  };
  
  #endif // __FETCH_STAGE_HH__
diff --git a/src/gpu-compute/fetch_unit.cc b/src/gpu-compute/fetch_unit.cc

index e0127e868c5be59418fdcbc16777e95f3bc2991d..ac9a5a656a84c7c4d1befab047b6b82856d02e6b 100644 (file)
--- a/src/gpu-compute/fetch_unit.cc
+++ b/src/gpu-compute/fetch_unit.cc
@@ -45,7 +45,7 @@
  
  uint32_t FetchUnit::globalFetchUnitID;
  
-FetchUnit::FetchUnit(const ComputeUnitParams *p, ComputeUnit *cu)
+FetchUnit::FetchUnit(const ComputeUnitParams *p, ComputeUnit &cu)
      : timingSim(true), computeUnit(cu), fetchScheduler(p),
        waveList(nullptr), fetchDepth(p->fetch_depth)
  {
@@ -60,16 +60,16 @@ FetchUnit::~FetchUnit()
  void
  FetchUnit::init()
  {
-    timingSim = computeUnit->shader->timingSim;
+    timingSim = computeUnit.shader->timingSim;
      fetchQueue.clear();
-    fetchStatusQueue.resize(computeUnit->shader->n_wf);
-    fetchBuf.resize(computeUnit->shader->n_wf, FetchBufDesc());
+    fetchStatusQueue.resize(computeUnit.shader->n_wf);
+    fetchBuf.resize(computeUnit.shader->n_wf, FetchBufDesc());
  
-    for (int i = 0; i < computeUnit->shader->n_wf; ++i) {
+    for (int i = 0; i < computeUnit.shader->n_wf; ++i) {
          Wavefront *wf = waveList->at(i);
          assert(wf->wfSlotId == i);
          fetchStatusQueue[i] = std::make_pair(wf, false);
-        fetchBuf[i].allocateBuf(fetchDepth, computeUnit->cacheLineSize(), wf);
+        fetchBuf[i].allocateBuf(fetchDepth, computeUnit.cacheLineSize(), wf);
          fetchBuf[i].decoder(&decoder);
      }
  
@@ -97,7 +97,7 @@ FetchUnit::exec()
      }
  
      // re-evaluate waves which are marked as not ready for fetch
-    for (int j = 0; j < computeUnit->shader->n_wf; ++j) {
+    for (int j = 0; j < computeUnit.shader->n_wf; ++j) {
          // Following code assumes 64-bit opertaion and all insts are
          // represented by 64-bit pointers to inst objects.
          Wavefront *curWave = fetchStatusQueue[j].first;
@@ -143,7 +143,7 @@ FetchUnit::initiateFetch(Wavefront *wavefront)
  
      // this should already be aligned to a cache line
      assert(vaddr == makeLineAddress(vaddr,
-           computeUnit->getCacheLineBits()));
+           computeUnit.getCacheLineBits()));
  
      // shouldn't be fetching a line that is already buffered
      assert(!fetchBuf.at(wavefront->wfSlotId).pcBuffered(vaddr));
@@ -151,16 +151,16 @@ FetchUnit::initiateFetch(Wavefront *wavefront)
      fetchBuf.at(wavefront->wfSlotId).reserveBuf(vaddr);
  
      DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Id%d: Initiate fetch "
-            "from pc: %d %#x\n", computeUnit->cu_id, wavefront->simdId,
+            "from pc: %d %#x\n", computeUnit.cu_id, wavefront->simdId,
              wavefront->wfSlotId, wavefront->wfDynId, wavefront->pc(), vaddr);
  
      DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Initiating fetch translation: %#x\n",
-            computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, vaddr);
+            computeUnit.cu_id, wavefront->simdId, wavefront->wfSlotId, vaddr);
  
      // set up virtual request
      RequestPtr req = std::make_shared<Request>(
-        vaddr, computeUnit->cacheLineSize(), Request::INST_FETCH,
-        computeUnit->masterId(), 0, 0, nullptr);
+        vaddr, computeUnit.cacheLineSize(), Request::INST_FETCH,
+        computeUnit.masterId(), 0, 0, nullptr);
  
      PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
  
@@ -171,36 +171,36 @@ FetchUnit::initiateFetch(Wavefront *wavefront)
          // Sender State needed by TLB hierarchy
          pkt->senderState =
              new TheISA::GpuTLB::TranslationState(BaseTLB::Execute,
-                                                 computeUnit->shader->gpuTc,
+                                                 computeUnit.shader->gpuTc,
                                                   false, pkt->senderState);
  
-        if (computeUnit->sqcTLBPort->isStalled()) {
-            assert(computeUnit->sqcTLBPort->retries.size() > 0);
+        if (computeUnit.sqcTLBPort->isStalled()) {
+            assert(computeUnit.sqcTLBPort->retries.size() > 0);
  
              DPRINTF(GPUTLB, "Failed to send TLB req for FETCH addr %#x\n",
                      vaddr);
  
-            computeUnit->sqcTLBPort->retries.push_back(pkt);
-        } else if (!computeUnit->sqcTLBPort->sendTimingReq(pkt)) {
+            computeUnit.sqcTLBPort->retries.push_back(pkt);
+        } else if (!computeUnit.sqcTLBPort->sendTimingReq(pkt)) {
              // Stall the data port;
              // No more packet is issued till
              // ruby indicates resources are freed by
              // a recvReqRetry() call back on this port.
-            computeUnit->sqcTLBPort->stallPort();
+            computeUnit.sqcTLBPort->stallPort();
  
              DPRINTF(GPUTLB, "Failed to send TLB req for FETCH addr %#x\n",
                      vaddr);
  
-            computeUnit->sqcTLBPort->retries.push_back(pkt);
+            computeUnit.sqcTLBPort->retries.push_back(pkt);
          } else {
              DPRINTF(GPUTLB, "sent FETCH translation request for %#x\n", vaddr);
          }
      } else {
          pkt->senderState =
              new TheISA::GpuTLB::TranslationState(BaseTLB::Execute,
-                                                 computeUnit->shader->gpuTc);
+                                                 computeUnit.shader->gpuTc);
  
-        computeUnit->sqcTLBPort->sendFunctional(pkt);
+        computeUnit.sqcTLBPort->sendFunctional(pkt);
  
          TheISA::GpuTLB::TranslationState *sender_state =
               safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
@@ -220,7 +220,7 @@ FetchUnit::fetch(PacketPtr pkt, Wavefront *wavefront)
      assert(pkt->req->hasSize());
  
      DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch Access: %#x\n",
-            computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
+            computeUnit.cu_id, wavefront->simdId, wavefront->wfSlotId,
              pkt->req->getPaddr());
  
      /**
@@ -257,20 +257,20 @@ FetchUnit::fetch(PacketPtr pkt, Wavefront *wavefront)
      if (timingSim) {
          // translation is done. Send the appropriate timing memory request.
  
-        if (!computeUnit->sqcPort->sendTimingReq(pkt)) {
-            computeUnit->sqcPort->retries.push_back(std::make_pair(pkt,
+        if (!computeUnit.sqcPort->sendTimingReq(pkt)) {
+            computeUnit.sqcPort->retries.push_back(std::make_pair(pkt,
                                                                     wavefront));
  
              DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Fetch addr %#x failed!\n",
-                    computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
+                    computeUnit.cu_id, wavefront->simdId, wavefront->wfSlotId,
                      pkt->req->getPaddr());
          } else {
              DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Fetch addr %#x sent!\n",
-                    computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
+                    computeUnit.cu_id, wavefront->simdId, wavefront->wfSlotId,
                      pkt->req->getPaddr());
          }
      } else {
-        computeUnit->sqcPort->sendFunctional(pkt);
+        computeUnit.sqcPort->sendFunctional(pkt);
          processFetchReturn(pkt);
      }
  }
@@ -284,7 +284,7 @@ FetchUnit::processFetchReturn(PacketPtr pkt)
      Wavefront *wavefront = sender_state->wavefront;
  
      DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch addr %#x returned "
-            "%d bytes!\n", computeUnit->cu_id, wavefront->simdId,
+            "%d bytes!\n", computeUnit.cu_id, wavefront->simdId,
              wavefront->wfSlotId, pkt->req->getPaddr(), pkt->req->getSize());
  
      if (wavefront->dropFetch) {
@@ -553,7 +553,7 @@ FetchUnit::FetchBufDesc::decodeInsts()
                  = std::make_shared<GPUDynInst>(wavefront->computeUnit,
                                                 wavefront, gpu_static_inst,
                                                 wavefront->computeUnit->
-                                                   getAndIncSeqNum());
+                                                getAndIncSeqNum());
              wavefront->instructionBuffer.push_back(gpu_dyn_inst);
  
              DPRINTF(GPUFetch, "WF[%d][%d]: Id%ld decoded %s (%d bytes). "
diff --git a/src/gpu-compute/fetch_unit.hh b/src/gpu-compute/fetch_unit.hh

index 1615d81bb43ca019c7da96d5dc23b7d64983da7e..36ab9c3656cf22c5abf1c7e3e9b7bddcd3f14241 100644 (file)
--- a/src/gpu-compute/fetch_unit.hh
+++ b/src/gpu-compute/fetch_unit.hh
@@ -49,7 +49,7 @@ class Wavefront;
  class FetchUnit
  {
    public:
-    FetchUnit(const ComputeUnitParams* p, ComputeUnit *cu);
+    FetchUnit(const ComputeUnitParams* p, ComputeUnit &cu);
      ~FetchUnit();
      void init();
      void exec();
@@ -234,7 +234,7 @@ class FetchUnit
      };
  
      bool timingSim;
-    ComputeUnit *computeUnit;
+    ComputeUnit &computeUnit;
      TheGpuISA::Decoder decoder;
  
      // Fetch scheduler; Selects one wave from
diff --git a/src/gpu-compute/global_memory_pipeline.cc b/src/gpu-compute/global_memory_pipeline.cc

index 2619360a4367e3ca21b1a60f2080b5747f2f65e9..dcc80f061c0bc69921e4cae73154dc2b75887dee 100644 (file)
--- a/src/gpu-compute/global_memory_pipeline.cc
+++ b/src/gpu-compute/global_memory_pipeline.cc
@@ -44,8 +44,8 @@
  #include "gpu-compute/wavefront.hh"
  
  GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams* p,
-                                     ComputeUnit *cu)
-    : computeUnit(cu), _name(cu->name() + ".GlobalMemPipeline"),
+                                     ComputeUnit &cu)
+    : computeUnit(cu), _name(cu.name() + ".GlobalMemPipeline"),
        gmQueueSize(p->global_mem_queue_size),
        maxWaveRequests(p->max_wave_requests), inflightStores(0),
        inflightLoads(0)
@@ -55,7 +55,7 @@ GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams* p,
  void
  GlobalMemPipeline::init()
  {
-    globalMemSize = computeUnit->shader->globalMemSize;
+    globalMemSize = computeUnit.shader->globalMemSize;
  }
  
  bool
@@ -121,9 +121,9 @@ GlobalMemPipeline::exec()
  
      }
  
-    if (m && m->latency.rdy() && computeUnit->glbMemToVrfBus.rdy() &&
-        accessVrf && (computeUnit->shader->coissue_return ||
-        computeUnit->vectorGlobalMemUnit.rdy())) {
+    if (m && m->latency.rdy() && computeUnit.glbMemToVrfBus.rdy() &&
+        accessVrf && (computeUnit.shader->coissue_return ||
+        computeUnit.vectorGlobalMemUnit.rdy())) {
  
          w = m->wavefront();
  
@@ -141,16 +141,16 @@ GlobalMemPipeline::exec()
          Tick accessTime = curTick() - m->getAccessTime();
  
          // Decrement outstanding requests count
-        computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
+        computeUnit.shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
          if (m->isStore() || m->isAtomic() || m->isMemSync()) {
-            computeUnit->shader->sampleStore(accessTime);
-            computeUnit->shader->ScheduleAdd(&w->outstandingReqsWrGm,
+            computeUnit.shader->sampleStore(accessTime);
+            computeUnit.shader->ScheduleAdd(&w->outstandingReqsWrGm,
                                               m->time, -1);
          }
  
          if (m->isLoad() || m->isAtomic() || m->isMemSync()) {
-            computeUnit->shader->sampleLoad(accessTime);
-            computeUnit->shader->ScheduleAdd(&w->outstandingReqsRdGm,
+            computeUnit.shader->sampleLoad(accessTime);
+            computeUnit.shader->ScheduleAdd(&w->outstandingReqsRdGm,
                                               m->time, -1);
          }
  
@@ -160,12 +160,12 @@ GlobalMemPipeline::exec()
          // going all the way to memory and stats for individual cache
          // blocks generated by the instruction.
          m->profileRoundTripTime(curTick(), InstMemoryHop::Complete);
-        computeUnit->shader->sampleInstRoundTrip(m->getRoundTripTime());
-        computeUnit->shader->sampleLineRoundTrip(m->getLineAddressTime());
+        computeUnit.shader->sampleInstRoundTrip(m->getRoundTripTime());
+        computeUnit.shader->sampleLineRoundTrip(m->getLineAddressTime());
  
          // Mark write bus busy for appropriate amount of time
-        computeUnit->glbMemToVrfBus.set(m->time);
-        if (!computeUnit->shader->coissue_return)
+        computeUnit.glbMemToVrfBus.set(m->time);
+        if (!computeUnit.shader->coissue_return)
              w->computeUnit->vectorGlobalMemUnit.set(m->time);
      }
  
@@ -217,13 +217,13 @@ GlobalMemPipeline::exec()
              * correctly.
              */
              handleResponse(mp);
-            computeUnit->getTokenManager()->recvTokens(1);
+            computeUnit.getTokenManager()->recvTokens(1);
          }
  
          gmIssuedRequests.pop();
  
          DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping 0 mem_op = \n",
-                computeUnit->cu_id, mp->simdId, mp->wfSlotId);
+                computeUnit.cu_id, mp->simdId, mp->wfSlotId);
      }
  }
  
diff --git a/src/gpu-compute/global_memory_pipeline.hh b/src/gpu-compute/global_memory_pipeline.hh

index 97c0e8d5955fad907225677e0641fe81c5a690ab..c53789ee509865f50d95225d0f0bc29f9a1415f5 100644 (file)
--- a/src/gpu-compute/global_memory_pipeline.hh
+++ b/src/gpu-compute/global_memory_pipeline.hh
@@ -56,7 +56,7 @@ class ComputeUnit;
  class GlobalMemPipeline
  {
    public:
-    GlobalMemPipeline(const ComputeUnitParams *p, ComputeUnit *cu);
+    GlobalMemPipeline(const ComputeUnitParams *p, ComputeUnit &cu);
      void init();
      void exec();
  
@@ -108,8 +108,8 @@ class GlobalMemPipeline
      void acqCoalescerToken(GPUDynInstPtr mp);
  
    private:
-    ComputeUnit *computeUnit;
-    std::string _name;
+    ComputeUnit &computeUnit;
+    const std::string _name;
      int gmQueueSize;
      int maxWaveRequests;
  
diff --git a/src/gpu-compute/local_memory_pipeline.cc b/src/gpu-compute/local_memory_pipeline.cc

index 6644a0bf454981b06783d552a54ffd8135117073..df576907cca34102abecf9bbbd306ee380d1b809 100644 (file)
--- a/src/gpu-compute/local_memory_pipeline.cc
+++ b/src/gpu-compute/local_memory_pipeline.cc
@@ -41,8 +41,8 @@
  #include "gpu-compute/vector_register_file.hh"
  #include "gpu-compute/wavefront.hh"
  
-LocalMemPipeline::LocalMemPipeline(const ComputeUnitParams* p, ComputeUnit *cu)
-    : computeUnit(cu), _name(cu->name() + ".LocalMemPipeline"),
+LocalMemPipeline::LocalMemPipeline(const ComputeUnitParams* p, ComputeUnit &cu)
+    : computeUnit(cu), _name(cu.name() + ".LocalMemPipeline"),
        lmQueueSize(p->local_mem_queue_size)
  {
  }
@@ -66,9 +66,9 @@ LocalMemPipeline::exec()
      }
  
      if (!lmReturnedRequests.empty() && m->latency.rdy() && accessVrf &&
-        computeUnit->locMemToVrfBus.rdy()
-        && (computeUnit->shader->coissue_return
-        || computeUnit->vectorSharedMemUnit.rdy())) {
+        computeUnit.locMemToVrfBus.rdy()
+        && (computeUnit.shader->coissue_return
+        || computeUnit.vectorSharedMemUnit.rdy())) {
  
          lmReturnedRequests.pop();
          w = m->wavefront();
@@ -83,21 +83,21 @@ LocalMemPipeline::exec()
          }
  
          // Decrement outstanding request count
-        computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
+        computeUnit.shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
  
          if (m->isStore() || m->isAtomic()) {
-            computeUnit->shader->ScheduleAdd(&w->outstandingReqsWrLm,
+            computeUnit.shader->ScheduleAdd(&w->outstandingReqsWrLm,
                                               m->time, -1);
          }
  
          if (m->isLoad() || m->isAtomic()) {
-            computeUnit->shader->ScheduleAdd(&w->outstandingReqsRdLm,
+            computeUnit.shader->ScheduleAdd(&w->outstandingReqsRdLm,
                                               m->time, -1);
          }
  
          // Mark write bus busy for appropriate amount of time
-        computeUnit->locMemToVrfBus.set(m->time);
-        if (computeUnit->shader->coissue_return == 0)
+        computeUnit.locMemToVrfBus.set(m->time);
+        if (computeUnit.shader->coissue_return == 0)
              w->computeUnit->vectorSharedMemUnit.set(m->time);
      }
  
@@ -108,7 +108,7 @@ LocalMemPipeline::exec()
  
          GPUDynInstPtr m = lmIssuedRequests.front();
  
-        bool returnVal = computeUnit->sendToLds(m);
+        bool returnVal = computeUnit.sendToLds(m);
          if (!returnVal) {
              DPRINTF(GPUPort, "packet was nack'd and put in retry queue");
          }
diff --git a/src/gpu-compute/local_memory_pipeline.hh b/src/gpu-compute/local_memory_pipeline.hh

index 7821785c29483fb5be81fe981500823aedc6a4fa..3ff3b79eccd2c7688460fc32b3a86c7245b7f713 100644 (file)
--- a/src/gpu-compute/local_memory_pipeline.hh
+++ b/src/gpu-compute/local_memory_pipeline.hh
@@ -55,7 +55,7 @@ class Wavefront;
  class LocalMemPipeline
  {
    public:
-    LocalMemPipeline(const ComputeUnitParams *p, ComputeUnit *cu);
+    LocalMemPipeline(const ComputeUnitParams *p, ComputeUnit &cu);
      void exec();
      std::queue<GPUDynInstPtr> &getLMRespFIFO() { return lmReturnedRequests; }
  
@@ -84,8 +84,8 @@ class LocalMemPipeline
      }
  
    private:
-    ComputeUnit *computeUnit;
-    std::string _name;
+    ComputeUnit &computeUnit;
+    const std::string _name;
      int lmQueueSize;
      Stats::Scalar loadVrfBankConflictCycles;
      // Local Memory Request Fifo: all shared memory requests
diff --git a/src/gpu-compute/scalar_memory_pipeline.cc b/src/gpu-compute/scalar_memory_pipeline.cc

index 9ec354c1b1be7b74c78ab6573d8af4068edbdf6b..35b4ca5e596f1c9d5d5cabfeb6b64455e9acfff6 100644 (file)
--- a/src/gpu-compute/scalar_memory_pipeline.cc
+++ b/src/gpu-compute/scalar_memory_pipeline.cc
@@ -44,8 +44,8 @@
  #include "gpu-compute/wavefront.hh"
  
  ScalarMemPipeline::ScalarMemPipeline(const ComputeUnitParams* p,
-                                     ComputeUnit *cu)
-    : computeUnit(cu), _name(cu->name() + ".ScalarMemPipeline"),
+                                     ComputeUnit &cu)
+    : computeUnit(cu), _name(cu.name() + ".ScalarMemPipeline"),
        queueSize(p->scalar_mem_queue_size),
        inflightStores(0), inflightLoads(0)
  {
@@ -72,10 +72,10 @@ ScalarMemPipeline::exec()
      }
  
      if ((!returnedStores.empty() || !returnedLoads.empty()) &&
-        m->latency.rdy() && computeUnit->scalarMemToSrfBus.rdy() &&
+        m->latency.rdy() && computeUnit.scalarMemToSrfBus.rdy() &&
          accessSrf &&
-        (computeUnit->shader->coissue_return ||
-         computeUnit->scalarMemUnit.rdy())) {
+        (computeUnit.shader->coissue_return ||
+         computeUnit.scalarMemUnit.rdy())) {
  
          w = m->wavefront();
  
@@ -97,21 +97,21 @@ ScalarMemPipeline::exec()
          }
  
          // Decrement outstanding register count
-        computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
+        computeUnit.shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
  
          if (m->isStore() || m->isAtomic()) {
-            computeUnit->shader->ScheduleAdd(&w->scalarOutstandingReqsWrGm,
+            computeUnit.shader->ScheduleAdd(&w->scalarOutstandingReqsWrGm,
                                               m->time, -1);
          }
  
          if (m->isLoad() || m->isAtomic()) {
-            computeUnit->shader->ScheduleAdd(&w->scalarOutstandingReqsRdGm,
+            computeUnit.shader->ScheduleAdd(&w->scalarOutstandingReqsRdGm,
                                               m->time, -1);
          }
  
          // Mark write bus busy for appropriate amount of time
-        computeUnit->scalarMemToSrfBus.set(m->time);
-        if (!computeUnit->shader->coissue_return)
+        computeUnit.scalarMemToSrfBus.set(m->time);
+        if (!computeUnit.shader->coissue_return)
              w->computeUnit->scalarMemUnit.set(m->time);
      }
  
@@ -138,7 +138,7 @@ ScalarMemPipeline::exec()
          issuedRequests.pop();
  
          DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping scalar mem_op\n",
-                computeUnit->cu_id, mp->simdId, mp->wfSlotId);
+                computeUnit.cu_id, mp->simdId, mp->wfSlotId);
      }
  }
  
diff --git a/src/gpu-compute/scalar_memory_pipeline.hh b/src/gpu-compute/scalar_memory_pipeline.hh

index 03211a291113474605864e67a548f06eca7b5ce1..b839701ae41b4d9be1cb67c4fc5024f167be3550 100644 (file)
--- a/src/gpu-compute/scalar_memory_pipeline.hh
+++ b/src/gpu-compute/scalar_memory_pipeline.hh
@@ -59,7 +59,7 @@ class ComputeUnit;
  class ScalarMemPipeline
  {
    public:
-    ScalarMemPipeline(const ComputeUnitParams *p, ComputeUnit *cu);
+    ScalarMemPipeline(const ComputeUnitParams *p, ComputeUnit &cu);
      void exec();
  
      std::queue<GPUDynInstPtr> &getGMReqFIFO() { return issuedRequests; }
@@ -84,12 +84,12 @@ class ScalarMemPipeline
          return (issuedRequests.size() + pendReqs) < queueSize;
      }
  
-    const std::string &name() const { return _name; }
+    const std::string& name() const { return _name; }
      void regStats();
  
    private:
-    ComputeUnit *computeUnit;
-    std::string _name;
+    ComputeUnit &computeUnit;
+    const std::string _name;
      int queueSize;
  
      // Counters to track and limit the inflight scalar loads and stores
diff --git a/src/gpu-compute/schedule_stage.cc b/src/gpu-compute/schedule_stage.cc

index 510d3f34775093314f38d11cecacafd2445217fc..0785aa03d8b0f2447c2d9173305ca33c4f481bda 100644 (file)
--- a/src/gpu-compute/schedule_stage.cc
+++ b/src/gpu-compute/schedule_stage.cc
@@ -43,17 +43,17 @@
  #include "gpu-compute/vector_register_file.hh"
  #include "gpu-compute/wavefront.hh"
  
-ScheduleStage::ScheduleStage(const ComputeUnitParams *p, ComputeUnit *cu)
-    : computeUnit(cu), _name(cu->name() + ".ScheduleStage"),
+ScheduleStage::ScheduleStage(const ComputeUnitParams *p, ComputeUnit &cu)
+    : computeUnit(cu), _name(cu.name() + ".ScheduleStage"),
        vectorAluRdy(false), scalarAluRdy(false), scalarMemBusRdy(false),
        scalarMemIssueRdy(false), glbMemBusRdy(false), glbMemIssueRdy(false),
        locMemBusRdy(false), locMemIssueRdy(false)
  {
-    for (int j = 0; j < cu->numExeUnits(); ++j) {
+    for (int j = 0; j < cu.numExeUnits(); ++j) {
          scheduler.emplace_back(p);
      }
      wavesInSch.clear();
-    schList.resize(cu->numExeUnits());
+    schList.resize(cu.numExeUnits());
      for (auto &dq : schList) {
          dq.clear();
      }
@@ -70,36 +70,36 @@ void
  ScheduleStage::init()
  {
  
-    fatal_if(scheduler.size() != computeUnit->readyList.size(),
+    fatal_if(scheduler.size() != computeUnit.readyList.size(),
               "Scheduler should have same number of entries as CU's readyList");
-    for (int j = 0; j < computeUnit->numExeUnits(); ++j) {
-        scheduler[j].bindList(&computeUnit->readyList[j]);
+    for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
+        scheduler[j].bindList(&computeUnit.readyList[j]);
      }
  
-    dispatchList = &computeUnit->dispatchList;
+    dispatchList = &computeUnit.dispatchList;
  
-    assert(computeUnit->numVectorGlobalMemUnits == 1);
-    assert(computeUnit->numVectorSharedMemUnits == 1);
+    assert(computeUnit.numVectorGlobalMemUnits == 1);
+    assert(computeUnit.numVectorSharedMemUnits == 1);
  }
  
  void
  ScheduleStage::exec()
  {
      // Update readyList
-    for (int j = 0; j < computeUnit->numExeUnits(); ++j) {
+    for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
          // delete all ready wavefronts whose instruction buffers are now
          // empty because the last instruction was executed
-        computeUnit->updateReadyList(j);
+        computeUnit.updateReadyList(j);
          /**
           * Remove any wave that already has an instruction present in SCH
           * waiting for RF reads to complete. This prevents out of order
           * execution within a wave.
           */
-        for (auto wIt = computeUnit->readyList.at(j).begin();
-             wIt != computeUnit->readyList.at(j).end();) {
+        for (auto wIt = computeUnit.readyList.at(j).begin();
+             wIt != computeUnit.readyList.at(j).end();) {
              if (wavesInSch.find((*wIt)->wfDynId) != wavesInSch.end()) {
                  *wIt = nullptr;
-                wIt = computeUnit->readyList.at(j).erase(wIt);
+                wIt = computeUnit.readyList.at(j).erase(wIt);
              } else {
                  wIt++;
              }
@@ -112,10 +112,10 @@ ScheduleStage::exec()
      // Scalar Memory are iterated after VMEM
  
      // Iterate VMEM and SMEM
-    int firstMemUnit = computeUnit->firstMemUnit();
-    int lastMemUnit = computeUnit->lastMemUnit();
+    int firstMemUnit = computeUnit.firstMemUnit();
+    int lastMemUnit = computeUnit.lastMemUnit();
      for (int j = firstMemUnit; j <= lastMemUnit; j++) {
-        int readyListSize = computeUnit->readyList[j].size();
+        int readyListSize = computeUnit.readyList[j].size();
          // If no wave is ready to be scheduled on the execution resource
          // then skip scheduling for this execution resource
          if (!readyListSize) {
@@ -135,12 +135,12 @@ ScheduleStage::exec()
      }
  
      // Iterate everything else
-    for (int j = 0; j < computeUnit->numExeUnits(); ++j) {
+    for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
          // skip the VMEM resources
          if (j >= firstMemUnit && j <= lastMemUnit) {
              continue;
          }
-        int readyListSize = computeUnit->readyList[j].size();
+        int readyListSize = computeUnit.readyList[j].size();
          // If no wave is ready to be scheduled on the execution resource
          // then skip scheduling for this execution resource
          if (!readyListSize) {
@@ -205,16 +205,16 @@ ScheduleStage::schedRfWrites(int exeType, Wavefront *w)
      bool accessVrfWr = true;
      if (!ii->isScalar()) {
          accessVrfWr =
-            computeUnit->vrf[w->simdId]->canScheduleWriteOperands(w, ii);
+            computeUnit.vrf[w->simdId]->canScheduleWriteOperands(w, ii);
      }
      bool accessSrfWr =
-        computeUnit->srf[w->simdId]->canScheduleWriteOperands(w, ii);
+        computeUnit.srf[w->simdId]->canScheduleWriteOperands(w, ii);
      bool accessRf = accessVrfWr && accessSrfWr;
      if (accessRf) {
          if (!ii->isScalar()) {
-            computeUnit->vrf[w->simdId]->scheduleWriteOperands(w, ii);
+            computeUnit.vrf[w->simdId]->scheduleWriteOperands(w, ii);
          }
-        computeUnit->srf[w->simdId]->scheduleWriteOperands(w, ii);
+        computeUnit.srf[w->simdId]->scheduleWriteOperands(w, ii);
          return true;
      } else {
          rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
@@ -235,7 +235,7 @@ ScheduleStage::schedRfWrites(int exeType, Wavefront *w)
  void
  ScheduleStage::scheduleRfDestOperands()
  {
-    for (int j = 0; j < computeUnit->numExeUnits(); ++j) {
+    for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
          if (!dispatchList->at(j).first) {
              continue;
          }
@@ -269,10 +269,10 @@ ScheduleStage::addToSchList(int exeType, Wavefront *w)
      bool accessVrf = true;
      if (!ii->isScalar()) {
          accessVrf =
-            computeUnit->vrf[w->simdId]->canScheduleReadOperands(w, ii);
+            computeUnit.vrf[w->simdId]->canScheduleReadOperands(w, ii);
      }
      bool accessSrf =
-        computeUnit->srf[w->simdId]->canScheduleReadOperands(w, ii);
+        computeUnit.srf[w->simdId]->canScheduleReadOperands(w, ii);
      // If RFs can support instruction, add to schList in RFBUSY state,
      // place wave in wavesInSch and pipeMap, and schedule Rd/Wr operands
      // to the VRF
@@ -282,16 +282,16 @@ ScheduleStage::addToSchList(int exeType, Wavefront *w)
                  exeType, w->simdId, w->wfDynId,
                  ii->seqNum(), ii->disassemble());
  
-        computeUnit->insertInPipeMap(w);
+        computeUnit.insertInPipeMap(w);
          wavesInSch.emplace(w->wfDynId);
          schList.at(exeType).push_back(std::make_pair(w, RFBUSY));
          if (w->isOldestInstWaitcnt()) {
              w->setStatus(Wavefront::S_WAITCNT);
          }
          if (!ii->isScalar()) {
-            computeUnit->vrf[w->simdId]->scheduleReadOperands(w, ii);
+            computeUnit.vrf[w->simdId]->scheduleReadOperands(w, ii);
          }
-        computeUnit->srf[w->simdId]->scheduleReadOperands(w, ii);
+        computeUnit.srf[w->simdId]->scheduleReadOperands(w, ii);
  
          DPRINTF(GPUSched, "schList[%d]: Added: SIMD[%d] WV[%d]: %d: %s\n",
                  exeType, w->simdId, w->wfDynId,
@@ -341,33 +341,33 @@ ScheduleStage::checkMemResources()
      scalarMemBusRdy = false;
      scalarMemIssueRdy = false;
      // check if there is a SRF->Global Memory bus available and
-    if (computeUnit->srfToScalarMemPipeBus.rdy(Cycles(1))) {
+    if (computeUnit.srfToScalarMemPipeBus.rdy(Cycles(1))) {
          scalarMemBusRdy = true;
      }
      // check if we can issue a scalar memory instruction
-    if (computeUnit->scalarMemUnit.rdy(Cycles(1))) {
+    if (computeUnit.scalarMemUnit.rdy(Cycles(1))) {
          scalarMemIssueRdy = true;
      }
  
      glbMemBusRdy = false;
      glbMemIssueRdy = false;
      // check if there is a VRF->Global Memory bus available
-    if (computeUnit->vrfToGlobalMemPipeBus.rdy(Cycles(1))) {
+    if (computeUnit.vrfToGlobalMemPipeBus.rdy(Cycles(1))) {
          glbMemBusRdy = true;
      }
      // check if we can issue a Global memory instruction
-    if (computeUnit->vectorGlobalMemUnit.rdy(Cycles(1))) {
+    if (computeUnit.vectorGlobalMemUnit.rdy(Cycles(1))) {
          glbMemIssueRdy = true;
      }
  
      locMemBusRdy = false;
      locMemIssueRdy = false;
      // check if there is a VRF->LDS bus available
-    if (computeUnit->vrfToLocalMemPipeBus.rdy(Cycles(1))) {
+    if (computeUnit.vrfToLocalMemPipeBus.rdy(Cycles(1))) {
          locMemBusRdy = true;
      }
      // check if we can issue a LDS instruction
-    if (computeUnit->vectorSharedMemUnit.rdy(Cycles(1))) {
+    if (computeUnit.vectorSharedMemUnit.rdy(Cycles(1))) {
          locMemIssueRdy = true;
      }
  }
@@ -378,10 +378,10 @@ ScheduleStage::dispatchReady(Wavefront *w)
      vectorAluRdy = false;
      scalarAluRdy = false;
      // check for available vector/scalar ALUs in the next cycle
-    if (computeUnit->vectorALUs[w->simdId].rdy(Cycles(1))) {
+    if (computeUnit.vectorALUs[w->simdId].rdy(Cycles(1))) {
          vectorAluRdy = true;
      }
-    if (computeUnit->scalarALUs[w->scalarAlu].rdy(Cycles(1))) {
+    if (computeUnit.scalarALUs[w->scalarAlu].rdy(Cycles(1))) {
          scalarAluRdy = true;
      }
      GPUDynInstPtr ii = w->instructionBuffer.front();
@@ -423,11 +423,11 @@ ScheduleStage::dispatchReady(Wavefront *w)
              rdy = false;
              dispNrdyStalls[SCH_VECTOR_MEM_BUS_BUSY_NRDY]++;
          }
-        if (!computeUnit->globalMemoryPipe.coalescerReady(ii)) {
+        if (!computeUnit.globalMemoryPipe.coalescerReady(ii)) {
              rdy = false;
              dispNrdyStalls[SCH_VECTOR_MEM_COALESCER_NRDY]++;
          }
-        if (!computeUnit->globalMemoryPipe.outstandingReqsCheck(ii)) {
+        if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(ii)) {
              rdy = false;
              dispNrdyStalls[SCH_VECTOR_MEM_REQS_NRDY]++;
          }
@@ -445,7 +445,7 @@ ScheduleStage::dispatchReady(Wavefront *w)
              rdy = false;
              dispNrdyStalls[SCH_SCALAR_MEM_BUS_BUSY_NRDY]++;
          }
-        if (!computeUnit->scalarMemoryPipe.
+        if (!computeUnit.scalarMemoryPipe.
                  isGMReqFIFOWrRdy(w->scalarRdGmReqsInPipe +
                                   w->scalarWrGmReqsInPipe)) {
              rdy = false;
@@ -465,7 +465,7 @@ ScheduleStage::dispatchReady(Wavefront *w)
              rdy = false;
              dispNrdyStalls[SCH_LOCAL_MEM_BUS_BUSY_NRDY]++;
          }
-        if (!computeUnit->localMemoryPipe.
+        if (!computeUnit.localMemoryPipe.
                  isLMReqFIFOWrRdy(w->rdLmReqsInPipe + w->wrLmReqsInPipe)) {
              rdy = false;
              dispNrdyStalls[SCH_LOCAL_MEM_FIFO_NRDY]++;
@@ -484,15 +484,15 @@ ScheduleStage::dispatchReady(Wavefront *w)
              rdy = false;
              dispNrdyStalls[SCH_FLAT_MEM_BUS_BUSY_NRDY]++;
          }
-        if (!computeUnit->globalMemoryPipe.coalescerReady(ii)) {
+        if (!computeUnit.globalMemoryPipe.coalescerReady(ii)) {
              rdy = false;
              dispNrdyStalls[SCH_FLAT_MEM_COALESCER_NRDY]++;
          }
-        if (!computeUnit->globalMemoryPipe.outstandingReqsCheck(ii)) {
+        if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(ii)) {
              rdy = false;
              dispNrdyStalls[SCH_FLAT_MEM_REQS_NRDY]++;
          }
-        if (!computeUnit->localMemoryPipe.
+        if (!computeUnit.localMemoryPipe.
                  isLMReqFIFOWrRdy(w->rdLmReqsInPipe + w->wrLmReqsInPipe)) {
              rdy = false;
              dispNrdyStalls[SCH_FLAT_MEM_FIFO_NRDY]++;
@@ -514,7 +514,7 @@ ScheduleStage::fillDispatchList()
      // update execution resource status
      checkMemResources();
      // iterate execution resources
-    for (int j = 0; j < computeUnit->numExeUnits(); j++) {
+    for (int j = 0; j < computeUnit.numExeUnits(); j++) {
          assert(dispatchList->at(j).second == EMPTY);
  
          // iterate waves in schList to pick one for dispatch
@@ -537,7 +537,7 @@ ScheduleStage::fillDispatchList()
                                         instructionBuffer.front();
                      if (!mp->isMemSync() && !mp->isScalar() &&
                          (mp->isGlobalMem() || mp->isFlat())) {
-                        computeUnit->globalMemoryPipe.acqCoalescerToken(mp);
+                        computeUnit.globalMemoryPipe.acqCoalescerToken(mp);
                      }
  
                      doDispatchListTransition(j, EXREADY, schIter->first);
@@ -581,9 +581,9 @@ ScheduleStage::arbitrateVrfToLdsBus()
      // and a VRF->LDS bus. In GFx9, this is not the case.
  
      // iterate the GM pipelines
-    for (int i = 0; i < computeUnit->numVectorGlobalMemUnits; i++) {
+    for (int i = 0; i < computeUnit.numVectorGlobalMemUnits; i++) {
          // get the GM pipe index in the dispatchList
-        int gm_exe_unit = computeUnit->firstMemUnit() + i;
+        int gm_exe_unit = computeUnit.firstMemUnit() + i;
          // get the wave in the dispatchList
          Wavefront *w = dispatchList->at(gm_exe_unit).first;
          // If the WF is valid, ready to execute, and the instruction
@@ -617,7 +617,7 @@ ScheduleStage::checkRfOperandReadComplete()
      // Iterate the schList queues and check if operand reads
      // have completed in the RFs. If so, mark the wave as ready for
      // selection for dispatchList
-    for (int j = 0; j < computeUnit->numExeUnits(); ++j) {
+    for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
          for (auto &p : schList.at(j)) {
              Wavefront *w = p.first;
              assert(w);
@@ -630,10 +630,10 @@ ScheduleStage::checkRfOperandReadComplete()
              bool vrfRdy = true;
              if (!ii->isScalar()) {
                  vrfRdy =
-                    computeUnit->vrf[w->simdId]->operandReadComplete(w, ii);
+                    computeUnit.vrf[w->simdId]->operandReadComplete(w, ii);
              }
              bool srfRdy =
-                computeUnit->srf[w->simdId]->operandReadComplete(w, ii);
+                computeUnit.srf[w->simdId]->operandReadComplete(w, ii);
              bool operandsReady = vrfRdy && srfRdy;
              if (operandsReady) {
                  DPRINTF(GPUSched,
@@ -671,9 +671,9 @@ void
  ScheduleStage::reserveResources()
  {
      std::vector<bool> exeUnitReservations;
-    exeUnitReservations.resize(computeUnit->numExeUnits(), false);
+    exeUnitReservations.resize(computeUnit.numExeUnits(), false);
  
-    for (int j = 0; j < computeUnit->numExeUnits(); ++j) {
+    for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
          Wavefront *dispatchedWave = dispatchList->at(j).first;
          if (dispatchedWave) {
              DISPATCH_STATUS s = dispatchList->at(j).second;
@@ -686,10 +686,10 @@ ScheduleStage::reserveResources()
                  GPUDynInstPtr ii = dispatchedWave->instructionBuffer.front();
  
                  if (!ii->isScalar()) {
-                    computeUnit->vrf[dispatchedWave->simdId]->
+                    computeUnit.vrf[dispatchedWave->simdId]->
                          dispatchInstruction(ii);
                  }
-                computeUnit->srf[dispatchedWave->simdId]->
+                computeUnit.srf[dispatchedWave->simdId]->
                      dispatchInstruction(ii);
  
                  std::stringstream ss;
@@ -743,35 +743,35 @@ void
  ScheduleStage::regStats()
  {
      rdyListNotEmpty
-        .init(computeUnit->numExeUnits())
+        .init(computeUnit.numExeUnits())
          .name(name() + ".rdy_list_not_empty")
          .desc("number of cycles one or more wave on ready list per "
                "execution resource")
          ;
  
      rdyListEmpty
-        .init(computeUnit->numExeUnits())
+        .init(computeUnit.numExeUnits())
          .name(name() + ".rdy_list_empty")
          .desc("number of cycles no wave on ready list per "
                "execution resource")
          ;
  
      addToSchListStalls
-        .init(computeUnit->numExeUnits())
+        .init(computeUnit.numExeUnits())
          .name(name() + ".sch_list_add_stalls")
          .desc("number of cycles a wave is not added to schList per "
                "execution resource when ready list is not empty")
          ;
  
      schListToDispList
-        .init(computeUnit->numExeUnits())
+        .init(computeUnit.numExeUnits())
          .name(name() + ".sch_list_to_disp_list")
          .desc("number of cycles a wave is added to dispatchList per "
                "execution resource")
          ;
  
      schListToDispListStalls
-        .init(computeUnit->numExeUnits())
+        .init(computeUnit.numExeUnits())
          .name(name() + ".sch_list_to_disp_list_stalls")
          .desc("number of cycles no wave is added to dispatchList per "
                "execution resource")
diff --git a/src/gpu-compute/schedule_stage.hh b/src/gpu-compute/schedule_stage.hh

index be2691b28a262d92243449132c066ba19236a30b..6ec4a8ddda5f5c5dab204ebf2f9aa86c7e11f241 100644 (file)
--- a/src/gpu-compute/schedule_stage.hh
+++ b/src/gpu-compute/schedule_stage.hh
@@ -57,13 +57,13 @@ struct ComputeUnitParams;
  class ScheduleStage
  {
    public:
-    ScheduleStage(const ComputeUnitParams *p, ComputeUnit *cu);
+    ScheduleStage(const ComputeUnitParams *p, ComputeUnit &cu);
      ~ScheduleStage();
      void init();
      void exec();
  
      // Stats related variables and methods
-    std::string name() { return _name; }
+    const std::string& name() const { return _name; }
      enum SchNonRdyType {
          SCH_SCALAR_ALU_NRDY,
          SCH_VECTOR_ALU_NRDY,
@@ -114,7 +114,7 @@ class ScheduleStage
      };
  
    private:
-    ComputeUnit *computeUnit;
+    ComputeUnit &computeUnit;
      // Each execution resource will have its own
      // scheduler and a dispatch list
      std::vector<Scheduler> scheduler;
@@ -168,7 +168,7 @@ class ScheduleStage
      // to dispatchList
      Stats::Vector dispNrdyStalls;
  
-    std::string _name;
+    const std::string _name;
  
      // called by exec() to add a wave to schList if the RFs can support it
      bool addToSchList(int exeType, Wavefront *w);
diff --git a/src/gpu-compute/scoreboard_check_stage.cc b/src/gpu-compute/scoreboard_check_stage.cc

index e14a2f2b21d8dd0bfd9d31f06ffc4c5df3fe9b67..fb99e69122d1028da1c98aaa401bae7e4a826173 100644 (file)
--- a/src/gpu-compute/scoreboard_check_stage.cc
+++ b/src/gpu-compute/scoreboard_check_stage.cc
@@ -45,8 +45,8 @@
  #include "params/ComputeUnit.hh"
  
  ScoreboardCheckStage::ScoreboardCheckStage(const ComputeUnitParams *p,
-                                           ComputeUnit *cu)
-    : computeUnit(cu), _name(cu->name() + ".ScoreboardCheckStage")
+                                           ComputeUnit &cu)
+    : computeUnit(cu), _name(cu.name() + ".ScoreboardCheckStage")
  {
  }
  
@@ -58,8 +58,8 @@ ScoreboardCheckStage::~ScoreboardCheckStage()
  void
  ScoreboardCheckStage::init()
  {
-    for (int unitId = 0; unitId < computeUnit->numExeUnits(); ++unitId) {
-        readyList.push_back(&computeUnit->readyList[unitId]);
+    for (int unitId = 0; unitId < computeUnit.numExeUnits(); ++unitId) {
+        readyList.push_back(&computeUnit.readyList[unitId]);
      }
  }
  
@@ -104,7 +104,7 @@ ScoreboardCheckStage::ready(Wavefront *w, nonrdytype_e *rdyStatus,
      if (w->getStatus() == Wavefront::S_BARRIER) {
          assert(w->hasBarrier());
          int bar_id = w->barrierId();
-        if (!computeUnit->allAtBarrier(bar_id)) {
+        if (!computeUnit.allAtBarrier(bar_id)) {
              DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - Stalled at "
                      "barrier Id%d. %d waves remain.\n", w->computeUnit->cu_id,
                      w->simdId, w->wfSlotId, w->wfDynId, bar_id,
@@ -116,8 +116,8 @@ ScoreboardCheckStage::ready(Wavefront *w, nonrdytype_e *rdyStatus,
          DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - All waves at barrier "
                  "Id%d. Resetting barrier resources.\n", w->computeUnit->cu_id,
                  w->simdId, w->wfSlotId, w->wfDynId, bar_id);
-        computeUnit->resetBarrier(bar_id);
-        computeUnit->releaseWFsFromBarrier(bar_id);
+        computeUnit.resetBarrier(bar_id);
+        computeUnit.releaseWFsFromBarrier(bar_id);
      }
  
      // Check WF status: it has to be running
@@ -154,17 +154,17 @@ ScoreboardCheckStage::ready(Wavefront *w, nonrdytype_e *rdyStatus,
      }
  
      DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Checking Ready for Inst : %s\n",
-            computeUnit->cu_id, w->simdId, w->wfSlotId, ii->disassemble());
+            computeUnit.cu_id, w->simdId, w->wfSlotId, ii->disassemble());
  
      // Non-scalar (i.e., vector) instructions may use VGPRs
      if (!ii->isScalar()) {
-        if (!computeUnit->vrf[w->simdId]->operandsReady(w, ii)) {
+        if (!computeUnit.vrf[w->simdId]->operandsReady(w, ii)) {
              *rdyStatus = NRDY_VGPR_NRDY;
              return false;
          }
      }
      // Scalar and non-scalar instructions may use SGPR
-    if (!computeUnit->srf[w->simdId]->operandsReady(w, ii)) {
+    if (!computeUnit.srf[w->simdId]->operandsReady(w, ii)) {
          *rdyStatus = NRDY_SGPR_NRDY;
          return false;
      }
@@ -190,7 +190,7 @@ ScoreboardCheckStage::ready(Wavefront *w, nonrdytype_e *rdyStatus,
              return false;
          }
      }
-    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Ready Inst : %s\n", computeUnit->cu_id,
+    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Ready Inst : %s\n", computeUnit.cu_id,
              w->simdId, w->wfSlotId, ii->disassemble());
      *exeResType = mapWaveToExeUnit(w);
      *rdyStatus = INST_RDY;
@@ -236,7 +236,7 @@ ScoreboardCheckStage::mapWaveToExeUnit(Wavefront *w)
          }
      }
      panic("%s: unmapped to an execution resource", ii->disassemble());
-    return computeUnit->numExeUnits();
+    return computeUnit.numExeUnits();
  }
  
  void
@@ -244,7 +244,7 @@ ScoreboardCheckStage::exec()
  {
      // reset the ready list for all execution units; it will be
      // constructed every cycle since resource availability may change
-    for (int unitId = 0; unitId < computeUnit->numExeUnits(); ++unitId) {
+    for (int unitId = 0; unitId < computeUnit.numExeUnits(); ++unitId) {
          // Reset wavefront pointers to nullptr so clear() on the vector
          // does not accidentally destruct the wavefront object
          for (int i = 0; i < readyList[unitId]->size(); i++) {
@@ -253,10 +253,10 @@ ScoreboardCheckStage::exec()
          readyList[unitId]->clear();
      }
      // iterate over all WF slots across all vector ALUs
-    for (int simdId = 0; simdId < computeUnit->numVectorALUs; ++simdId) {
-        for (int wfSlot = 0; wfSlot < computeUnit->shader->n_wf; ++wfSlot) {
+    for (int simdId = 0; simdId < computeUnit.numVectorALUs; ++simdId) {
+        for (int wfSlot = 0; wfSlot < computeUnit.shader->n_wf; ++wfSlot) {
              // reset the ready status of each wavefront
-            Wavefront *curWave = computeUnit->wfList[simdId][wfSlot];
+            Wavefront *curWave = computeUnit.wfList[simdId][wfSlot];
              nonrdytype_e rdyStatus = NRDY_ILLEGAL;
              int exeResType = -1;
              // check WF readiness: If the WF's oldest
diff --git a/src/gpu-compute/scoreboard_check_stage.hh b/src/gpu-compute/scoreboard_check_stage.hh

index 3cdae278c4185048ed2e4c2298a98497ccde3ce1..6953c4c66f2c058f0b3794f49fea33b7621b1b3f 100644 (file)
--- a/src/gpu-compute/scoreboard_check_stage.hh
+++ b/src/gpu-compute/scoreboard_check_stage.hh
@@ -70,7 +70,7 @@ class ScoreboardCheckStage
          NRDY_CONDITIONS
      };
  
-    ScoreboardCheckStage(const ComputeUnitParams* p, ComputeUnit *cu);
+    ScoreboardCheckStage(const ComputeUnitParams* p, ComputeUnit &cu);
      ~ScoreboardCheckStage();
      void init();
      void exec();
@@ -84,7 +84,7 @@ class ScoreboardCheckStage
      int mapWaveToExeUnit(Wavefront *w);
      bool ready(Wavefront *w, nonrdytype_e *rdyStatus,
                 int *exeResType, int wfSlot);
-    ComputeUnit *computeUnit;
+    ComputeUnit &computeUnit;
  
      // List of waves which are ready to be scheduled.
      // Each execution resource has a ready list
@@ -93,7 +93,7 @@ class ScoreboardCheckStage
      // Stats
      Stats::Vector stallCycles;
  
-    std::string _name;
+    const std::string _name;
  };
  
  #endif // __SCOREBOARD_CHECK_STAGE_HH__
author	Tony Gutierrez <anthony.gutierrez@amd.com>
	Fri, 29 Jun 2018 21:39:53 +0000 (17:39 -0400)
committer	Anthony Gutierrez <anthony.gutierrez@amd.com>
	Fri, 17 Jul 2020 16:34:36 +0000 (16:34 +0000)
src/gpu-compute/compute_unit.cc		patch | blob | history
src/gpu-compute/exec_stage.cc		patch | blob | history
src/gpu-compute/exec_stage.hh		patch | blob | history
src/gpu-compute/fetch_stage.cc		patch | blob | history
src/gpu-compute/fetch_stage.hh		patch | blob | history
src/gpu-compute/fetch_unit.cc		patch | blob | history
src/gpu-compute/fetch_unit.hh		patch | blob | history
src/gpu-compute/global_memory_pipeline.cc		patch | blob | history
src/gpu-compute/global_memory_pipeline.hh		patch | blob | history
src/gpu-compute/local_memory_pipeline.cc		patch | blob | history
src/gpu-compute/local_memory_pipeline.hh		patch | blob | history
src/gpu-compute/scalar_memory_pipeline.cc		patch | blob | history
src/gpu-compute/scalar_memory_pipeline.hh		patch | blob | history
src/gpu-compute/schedule_stage.cc		patch | blob | history
src/gpu-compute/schedule_stage.hh		patch | blob | history
src/gpu-compute/scoreboard_check_stage.cc		patch | blob | history
src/gpu-compute/scoreboard_check_stage.hh		patch | blob | history