gpu-compute, arch-gcn3: refactor barriers

author Tony Gutierrez <anthony.gutierrez@amd.com>

Fri, 15 Jun 2018 20:00:58 +0000 (16:00 -0400)

committer Anthony Gutierrez <anthony.gutierrez@amd.com>

Thu, 16 Jul 2020 20:37:22 +0000 (20:37 +0000)
author Tony Gutierrez <anthony.gutierrez@amd.com>
Fri, 15 Jun 2018 20:00:58 +0000 (16:00 -0400)
committer Anthony Gutierrez <anthony.gutierrez@amd.com>
Thu, 16 Jul 2020 20:37:22 +0000 (20:37 +0000)
diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc

index 607e3c6f20eee6edf1129e5f6b0b2fda893eaa48..817b339164ca3e96c3710d95cf7813ce687ff2fa 100644 (file)
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -39,6 +39,7 @@
  
  #include "arch/gcn3/insts/inst_util.hh"
  #include "debug/GCN3.hh"
+#include "debug/GPUSync.hh"
  #include "gpu-compute/shader.hh"
  
  namespace Gcn3ISA
@@ -3709,6 +3710,7 @@ namespace Gcn3ISA
      Inst_SOPP__S_ENDPGM::execute(GPUDynInstPtr gpuDynInst)
      {
          Wavefront *wf = gpuDynInst->wavefront();
+        ComputeUnit *cu = gpuDynInst->computeUnit();
  
          // delete extra instructions fetched for completed work-items
          wf->instructionBuffer.erase(wf->instructionBuffer.begin() + 1,
@@ -3725,6 +3727,25 @@ namespace Gcn3ISA
          int refCount = wf->computeUnit->getLds()
              .decreaseRefCounter(wf->dispatchId, wf->wgId);
  
+        /**
+         * The parent WF of this instruction is exiting, therefore
+         * it should not participate in this barrier any longer. This
+         * prevents possible deadlock issues if WFs exit early.
+         */
+        int bar_id = WFBarrier::InvalidID;
+        if (wf->hasBarrier()) {
+            assert(wf->getStatus() != Wavefront::S_BARRIER);
+            bar_id = wf->barrierId();
+            assert(bar_id != WFBarrier::InvalidID);
+            wf->releaseBarrier();
+            cu->decMaxBarrierCnt(bar_id);
+            DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - Exiting the "
+                    "program and decrementing max barrier count for "
+                    "barrier Id%d. New max count: %d.\n", cu->cu_id,
+                    wf->simdId, wf->wfSlotId, wf->wfDynId, bar_id,
+                    cu->maxBarrierCnt(bar_id));
+        }
+
          DPRINTF(GPUExec, "CU%d: decrease ref ctr WG[%d] to [%d]\n",
              wf->computeUnit->cu_id, wf->wgId, refCount);
  
@@ -3748,6 +3769,20 @@ namespace Gcn3ISA
          wf->lastInstExec = 0;
  
          if (!refCount) {
+            /**
+             * If all WFs have finished, and hence the WG has finished,
+             * then we can free up the barrier belonging to the parent
+             * WG, but only if we actually used a barrier (i.e., more
+             * than one WF in the WG).
+             *
+             * This WF already gave up its barrier ID through
+             * wf->releaseBarrier() earlier in this function, so
+             * wf->barrierId() would report WFBarrier::InvalidID here.
+             * The saved bar_id is therefore used for both the trace
+             * message and the release itself.
+             */
+            if (bar_id != WFBarrier::InvalidID) {
+                DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - All waves are "
+                        "now complete. Releasing barrier Id%d.\n", cu->cu_id,
+                        wf->simdId, wf->wfSlotId, wf->wfDynId, bar_id);
+                cu->releaseBarrier(bar_id);
+            }
+
             /**
               * Last wavefront of the workgroup has executed return. If the
               * workgroup is not the final one in the kernel, then simply
@@ -4027,12 +4062,21 @@ namespace Gcn3ISA
      Inst_SOPP__S_BARRIER::execute(GPUDynInstPtr gpuDynInst)
      {
          Wavefront *wf = gpuDynInst->wavefront();
+        ComputeUnit *cu = gpuDynInst->computeUnit();
  
-        assert(wf->barrierCnt == wf->oldBarrierCnt);
-
-        wf->barrierCnt = wf->oldBarrierCnt + 1;
-        wf->stalledAtBarrier = true;
-    }
+        if (wf->hasBarrier()) {
+            int bar_id = wf->barrierId();
+            assert(wf->getStatus() != Wavefront::S_BARRIER);
+            wf->setStatus(Wavefront::S_BARRIER);
+            cu->incNumAtBarrier(bar_id);
+            DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - Stalling at "
+                    "barrier Id%d. %d waves now at barrier, %d waves "
+                    "remain.\n", cu->cu_id, wf->simdId, wf->wfSlotId,
+                    wf->wfDynId, bar_id, cu->numAtBarrier(bar_id),
+                    cu->numYetToReachBarrier(bar_id));
+        }
+    } // execute
+    // --- Inst_SOPP__S_SETKILL class methods ---
  
      Inst_SOPP__S_SETKILL::Inst_SOPP__S_SETKILL(InFmt_SOPP *iFmt)
          : Inst_SOPP(iFmt, "s_setkill")
diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py

index 8a2ad812e8be55794252afeb3d260bbcfb521c16..7408bf9634aabbf850102bfce80d79199bc1d59a 100644 (file)
--- a/src/gpu-compute/GPU.py
+++ b/src/gpu-compute/GPU.py
@@ -117,6 +117,7 @@ class ComputeUnit(ClockedObject):
      # Wavefront size is 64. This is configurable, however changing
      # this value to anything other than 64 will likely cause errors.
      wf_size = Param.Int(64, 'Wavefront size (in work items)')
+    num_barrier_slots = Param.Int(4, 'Number of barrier slots in a CU')
      num_SIMDs = Param.Int(4, 'number of SIMD units per CU')
      num_scalar_cores = Param.Int(1, 'number of Scalar cores per CU')
      num_scalar_mem_pipes = Param.Int(1, 'number of Scalar memory pipelines '\
diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc

index 178fd6e9605bbcc16df86f7f4ffca6bf28ad2dcd..0fcbb1ac6f02905b1332d52b9b8872c9d135269f 100644 (file)
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -86,13 +86,14 @@ ComputeUnit::ComputeUnit(const Params *p) : ClockedObject(p),
      prefetchStride(p->prefetch_stride), prefetchType(p->prefetch_prev_type),
      debugSegFault(p->debugSegFault),
      functionalTLB(p->functionalTLB), localMemBarrier(p->localMemBarrier),
-    countPages(p->countPages), barrier_id(0),
+    countPages(p->countPages),
      req_tick_latency(p->mem_req_latency * p->clk_domain->clockPeriod()),
      resp_tick_latency(p->mem_resp_latency * p->clk_domain->clockPeriod()),
      _masterId(p->system->getMasterId(this, "ComputeUnit")),
      lds(*p->localDataStore), gmTokenPort(name() + ".gmTokenPort", this),
-    _cacheLineSize(p->system->cacheLineSize()), globalSeqNum(0),
-    wavefrontSize(p->wf_size)
+    _cacheLineSize(p->system->cacheLineSize()),
+    _numBarrierSlots(p->num_barrier_slots),
+    globalSeqNum(0), wavefrontSize(p->wf_size)
  {
      /**
       * This check is necessary because std::bitset only provides conversion
@@ -122,6 +123,12 @@ ComputeUnit::ComputeUnit(const Params *p) : ClockedObject(p),
      lastVaddrWF.resize(numVectorALUs);
      wfList.resize(numVectorALUs);
  
+    wfBarrierSlots.resize(p->num_barrier_slots, WFBarrier());
+
+    for (int i = 0; i < p->num_barrier_slots; ++i) {
+        freeBarrierIds.insert(i);
+    }
+
      for (int j = 0; j < numVectorALUs; ++j) {
          lastVaddrWF[j].resize(p->n_wf);
  
@@ -305,7 +312,7 @@ ComputeUnit::updateReadyList(int unitId)
  
  void
  ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
-                            HSAQueueEntry *task, bool fetchContext)
+                            HSAQueueEntry *task, int bar_id, bool fetchContext)
  {
      static int _n_wave = 0;
  
@@ -323,6 +330,12 @@ ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
      w->wfId = waveId;
      w->initMask = init_mask.to_ullong();
  
+    if (bar_id > WFBarrier::InvalidID) {
+        w->barrierId(bar_id);
+    } else {
+        assert(!w->hasBarrier());
+    }
+
      for (int k = 0; k < wfSize(); ++k) {
          w->workItemId[0][k] = (k + waveId * wfSize()) % w->actualWgSz[0];
          w->workItemId[1][k] = ((k + waveId * wfSize()) / w->actualWgSz[0]) %
@@ -335,14 +348,6 @@ ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
              w->workItemId[0][k];
      }
  
-    w->barrierSlots = divCeil(w->actualWgSzTotal, wfSize());
-
-    w->barCnt.resize(wfSize(), 0);
-
-    w->maxBarCnt = 0;
-    w->oldBarrierCnt = 0;
-    w->barrierCnt = 0;
-
      // WG state
      w->wgId = task->globalWgId();
      w->dispatchId = task->dispatchId();
@@ -350,9 +355,6 @@ ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
      w->workGroupId[1] = (w->wgId / task->numWg(0)) % task->numWg(1);
      w->workGroupId[2] = w->wgId / (task->numWg(0) * task->numWg(1));
  
-    w->barrierId = barrier_id;
-    w->stalledAtBarrier = (w->oldBarrierCnt == w->barrierCnt) ? false : true;
-
      // set the wavefront context to have a pointer to this section of the LDS
      w->ldsChunk = ldsChunk;
  
@@ -367,8 +369,8 @@ ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
          w->dropFetch = true;
  
      DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: "
-            "WF[%d][%d]\n", _n_wave, w->barrierId, cu_id, w->simdId,
-            w->wfSlotId);
+            "WF[%d][%d]. Ref cnt:%d\n", _n_wave, w->barrierId(), cu_id,
+            w->simdId, w->wfSlotId, refCount);
  
      w->initRegState(task, w->actualWgSzTotal);
      w->start(_n_wave++, task->codeAddr());
@@ -407,7 +409,7 @@ ComputeUnit::doFlush(GPUDynInstPtr gpuDynInst) {
  }
  
  void
-ComputeUnit::dispWorkgroup(HSAQueueEntry *task, bool startFromScheduler)
+ComputeUnit::dispWorkgroup(HSAQueueEntry *task, int num_wfs_in_wg)
  {
      // If we aren't ticking, start it up!
      if (!tickEvent.scheduled()) {
@@ -433,6 +435,28 @@ ComputeUnit::dispWorkgroup(HSAQueueEntry *task, bool startFromScheduler)
      int sregDemand = task->numScalarRegs();
      int wave_id = 0;
  
+    int barrier_id = WFBarrier::InvalidID;
+
+    /**
+     * If this WG only has one WF it will not consume any barrier
+     * resources because it has no need of them.
+     */
+    if (num_wfs_in_wg > 1) {
+        /**
+         * Find a free barrier slot for this WG. Each WF in the WG will
+         * receive the same barrier ID.
+         */
+        barrier_id = getFreeBarrierId();
+        auto &wf_barrier = barrierSlot(barrier_id);
+        assert(!wf_barrier.maxBarrierCnt());
+        assert(!wf_barrier.numAtBarrier());
+        wf_barrier.setMaxBarrierCnt(num_wfs_in_wg);
+
+        DPRINTF(GPUSync, "CU[%d] - Dispatching WG with barrier Id%d. "
+                "%d waves using this barrier.\n", cu_id, barrier_id,
+                num_wfs_in_wg);
+    }
+
      // Assign WFs according to numWfsToSched vector, which is computed by
      // hasDispResources()
      for (int j = 0; j < shader->n_wf; ++j) {
@@ -455,12 +479,11 @@ ComputeUnit::dispWorkgroup(HSAQueueEntry *task, bool startFromScheduler)
  
                  registerManager->allocateRegisters(w, vregDemand, sregDemand);
  
-                startWavefront(w, wave_id, ldsChunk, task);
+                startWavefront(w, wave_id, ldsChunk, task, barrier_id);
                  ++wave_id;
              }
          }
      }
-    ++barrier_id;
  }
  
  void
@@ -485,7 +508,7 @@ ComputeUnit::deleteFromPipeMap(Wavefront *w)
  }
  
  bool
-ComputeUnit::hasDispResources(HSAQueueEntry *task)
+ComputeUnit::hasDispResources(HSAQueueEntry *task, int &num_wfs_in_wg)
  {
      // compute true size of workgroup (after clamping to grid size)
      int trueWgSize[HSAQueueEntry::MAX_DIM];
@@ -503,6 +526,13 @@ ComputeUnit::hasDispResources(HSAQueueEntry *task)
  
      // calculate the number of WFs in this WG
      int numWfs = (trueWgSizeTotal + wfSize() - 1) / wfSize();
+    num_wfs_in_wg = numWfs;
+
+    bool barrier_avail = true;
+
+    if (numWfs > 1 && !freeBarrierIds.size()) {
+        barrier_avail = false;
+    }
  
      // calculate the number of 32-bit vector registers required by each
      // work item of the work group
@@ -591,54 +621,89 @@ ComputeUnit::hasDispResources(HSAQueueEntry *task)
          wgBlockedDueLdsAllocation++;
      }
  
+    if (!barrier_avail) {
+        wgBlockedDueBarrierAllocation++;
+    }
+
      // Return true if the following are all true:
      // (a) all WFs of the WG were mapped to free WF slots
      // (b) there are enough VGPRs to schedule all WFs to their SIMD units
      // (c) there are enough SGPRs on the CU to schedule all WFs
      // (d) there is enough space in LDS to allocate for all WFs
+    // (e) a barrier slot is free, if the WG has more than one WF
      bool can_dispatch = numMappedWfs == numWfs && vregAvail && sregAvail
-                        && ldsAvail;
+                        && ldsAvail && barrier_avail;
      return can_dispatch;
  }
  
  int
-ComputeUnit::AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots)
+ComputeUnit::numYetToReachBarrier(int bar_id)
  {
-    DPRINTF(GPUSync, "CU%d: Checking for All At Barrier\n", cu_id);
-    int ccnt = 0;
+    auto &wf_barrier = barrierSlot(bar_id);
+    return wf_barrier.numYetToReachBarrier();
+}
  
-    for (int i_simd = 0; i_simd < numVectorALUs; ++i_simd) {
-        for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf) {
-            Wavefront *w = wfList[i_simd][i_wf];
+/**
+ * Ask the barrier slot selected by bar_id whether every participating
+ * WF has arrived at the barrier.
+ */
+bool
+ComputeUnit::allAtBarrier(int bar_id)
+{
+    return barrierSlot(bar_id).allAtBarrier();
+}
  
-            if (w->getStatus() == Wavefront::S_RUNNING) {
-                DPRINTF(GPUSync, "Checking WF[%d][%d]\n", i_simd, i_wf);
+/**
+ * Record in the barrier slot selected by bar_id that one more WF has
+ * arrived at the barrier.
+ */
+void
+ComputeUnit::incNumAtBarrier(int bar_id)
+{
+    barrierSlot(bar_id).incNumAtBarrier();
+}
  
-                DPRINTF(GPUSync, "wf->barrier_id = %d, _barrier_id = %d\n",
-                        w->barrierId, _barrier_id);
+/**
+ * Number of WFs currently waiting at the barrier held in slot bar_id.
+ */
+int
+ComputeUnit::numAtBarrier(int bar_id)
+{
+    return barrierSlot(bar_id).numAtBarrier();
+}
  
-                DPRINTF(GPUSync, "wf->barrierCnt %d, bcnt = %d\n",
-                        w->barrierCnt, bcnt);
+/**
+ * Number of WFs participating in the barrier held in slot bar_id.
+ */
+int
+ComputeUnit::maxBarrierCnt(int bar_id)
+{
+    return barrierSlot(bar_id).maxBarrierCnt();
+}
  
-                DPRINTF(GPUSync, "outstanding Reqs = %d\n",
-                         w->outstandingReqs);
-            }
+/**
+ * Clear the arrival count of the barrier in slot bar_id while keeping
+ * its participant count, so the slot can serve the next barrier.
+ */
+void
+ComputeUnit::resetBarrier(int bar_id)
+{
+    barrierSlot(bar_id).reset();
+}
+
+/**
+ * Remove one participant from the barrier held in slot bar_id.
+ */
+void
+ComputeUnit::decMaxBarrierCnt(int bar_id)
+{
+    barrierSlot(bar_id).decMaxBarrierCnt();
+}
  
-            if (w->getStatus() == Wavefront::S_RUNNING &&
-                w->barrierId == _barrier_id && w->barrierCnt == bcnt &&
-                !w->outstandingReqs) {
-                ++ccnt;
+/**
+ * Give the barrier slot bar_id back to the CU: the slot's counters are
+ * cleared and the ID is returned to the set of free barrier IDs.
+ */
+void
+ComputeUnit::releaseBarrier(int bar_id)
+{
+    barrierSlot(bar_id).release();
+    freeBarrierIds.insert(bar_id);
+}
  
-                DPRINTF(GPUSync, "WF[%d][%d] at barrier, increment ccnt to "
-                        "%d\n", i_simd, i_wf, ccnt);
+void
+ComputeUnit::releaseWFsFromBarrier(int bar_id)
+{
+    for (int i = 0; i < numVectorALUs; ++i) {
+        for (int j = 0; j < shader->n_wf; ++j) {
+            Wavefront *wf = wfList[i][j];
+            if (wf->barrierId() == bar_id) {
+                assert(wf->getStatus() == Wavefront::S_BARRIER);
+                wf->setStatus(Wavefront::S_RUNNING);
              }
          }
      }
-
-    DPRINTF(GPUSync, "CU%d: returning allAtBarrier ccnt = %d, bslots = %d\n",
-            cu_id, ccnt, bslots);
-
-    return ccnt == bslots;
  }
  
  // Execute one clock worth of work on the ComputeUnit.
@@ -813,10 +878,6 @@ ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt)
              computeUnit->globalMemoryPipe.handleResponse(gpuDynInst);
          }
  
-        DPRINTF(GPUSync, "CU%d: WF[%d][%d]: barrierCnt = %d\n",
-                computeUnit->cu_id, gpuDynInst->simdId,
-                gpuDynInst->wfSlotId, w->barrierCnt);
-
          delete pkt->senderState;
          delete pkt;
          return true;
@@ -2204,6 +2265,11 @@ ComputeUnit::regStats()
          .desc("Number of dynamic non-GM memory insts executed")
          ;
  
+    wgBlockedDueBarrierAllocation
+        .name(name() + ".wg_blocked_due_barrier_alloc")
+        .desc("WG dispatch was blocked due to lack of barrier resources")
+        ;
+
      wgBlockedDueLdsAllocation
          .name(name() + ".wg_blocked_due_lds_alloc")
          .desc("Workgroup blocked due to LDS capacity")
diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh

index 110097e0c519c8891398b2d8897f8c56f44705dd..d4e978b40894aa7ad60e8e9dd9c2d1484a85d50d 100644 (file)
--- a/src/gpu-compute/compute_unit.hh
+++ b/src/gpu-compute/compute_unit.hh
@@ -79,6 +79,121 @@ enum TLB_CACHE
      TLB_HIT_CACHE_HIT
  };
  
+/**
+ * One WF-level barrier slot, i.e., the resource used to synchronize the
+ * WFs that belong to the same WG. A slot keeps two counters: how many
+ * WFs participate in the barrier and how many of them have arrived.
+ */
+class WFBarrier
+{
+  public:
+    /** Barrier ID value that does not name any slot. */
+    static const int InvalidID = -1;
+
+    /** A fresh slot has no participants and no arrivals. */
+    WFBarrier() = default;
+
+    /** @return how many WFs have arrived at the barrier so far. */
+    int
+    numAtBarrier() const
+    {
+        return _numAtBarrier;
+    }
+
+    /** @return how many WFs participate in the barrier. */
+    int
+    maxBarrierCnt() const
+    {
+        return _maxBarrierCnt;
+    }
+
+    /** @return how many participating WFs have not arrived yet. */
+    int
+    numYetToReachBarrier() const
+    {
+        return _maxBarrierCnt - _numAtBarrier;
+    }
+
+    /**
+     * The barrier is satisfied, and WFs may move past it, once no
+     * participating WF is still outstanding.
+     */
+    bool
+    allAtBarrier() const
+    {
+        return numYetToReachBarrier() == 0;
+    }
+
+    /**
+     * Define how many WFs take part in the barrier.
+     */
+    void
+    setMaxBarrierCnt(int max_barrier_cnt)
+    {
+        _maxBarrierCnt = max_barrier_cnt;
+    }
+
+    /**
+     * Note the arrival of one more WF. At least one participant must
+     * still be outstanding.
+     */
+    void
+    incNumAtBarrier()
+    {
+        assert(numYetToReachBarrier() > 0);
+        _numAtBarrier += 1;
+    }
+
+    /**
+     * Drop one participant from the barrier; meant for a WF that exits.
+     * There must be at least one participant left to drop.
+     */
+    void
+    decMaxBarrierCnt()
+    {
+        assert(0 < _maxBarrierCnt);
+        _maxBarrierCnt -= 1;
+    }
+
+    /**
+     * Forget all arrivals but keep the participant count. Typically
+     * done after one dynamic instance of the barrier was satisfied.
+     */
+    void
+    reset()
+    {
+        _numAtBarrier = 0;
+    }
+
+    /**
+     * Return the slot to its initial state so that another WG can use
+     * it. Typically done when the owning WG has finished.
+     */
+    void
+    release()
+    {
+        reset();
+        _maxBarrierCnt = 0;
+    }
+
+  private:
+    /**
+     * Count of WFs that have arrived at the barrier. The barrier is
+     * satisfied when this matches _maxBarrierCnt.
+     */
+    int _numAtBarrier = 0;
+
+    /**
+     * Count of WFs that participate in the barrier. A WF that exits
+     * early has to decrement it so the barrier no longer waits for it.
+     */
+    int _maxBarrierCnt = 0;
+};
+
  class ComputeUnit : public ClockedObject
  {
    public:
@@ -277,7 +392,6 @@ class ComputeUnit : public ClockedObject
      bool countPages;
  
      Shader *shader;
-    uint32_t barrier_id;
  
      Tick req_tick_latency;
      Tick resp_tick_latency;
@@ -328,24 +442,47 @@ class ComputeUnit : public ClockedObject
      void fillKernelState(Wavefront *w, HSAQueueEntry *task);
  
      void startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
-                        HSAQueueEntry *task, bool fetchContext=false);
+                        HSAQueueEntry *task, int bar_id,
+                        bool fetchContext=false);
  
      void doInvalidate(RequestPtr req, int kernId);
      void doFlush(GPUDynInstPtr gpuDynInst);
  
-    void dispWorkgroup(HSAQueueEntry *task, bool startFromScheduler=false);
-    bool hasDispResources(HSAQueueEntry *task);
+    void dispWorkgroup(HSAQueueEntry *task, int num_wfs_in_wg);
+    bool hasDispResources(HSAQueueEntry *task, int &num_wfs_in_wg);
  
      int cacheLineSize() const { return _cacheLineSize; }
      int getCacheLineBits() const { return cacheLineBits; }
  
-    /* This function cycles through all the wavefronts in all the phases to see
-     * if all of the wavefronts which should be associated with one barrier
-     * (denoted with _barrier_id), are all at the same barrier in the program
-     * (denoted by bcnt). When the number at the barrier matches bslots, then
-     * return true.
-     */
-    int AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots);
+  private:
+    /**
+     * Look up the barrier slot for a barrier ID. The ID must be greater
+     * than WFBarrier::InvalidID; the slot vector is accessed with at().
+     */
+    WFBarrier&
+    barrierSlot(int bar_id)
+    {
+        assert(WFBarrier::InvalidID < bar_id);
+        WFBarrier &slot = wfBarrierSlots.at(bar_id);
+        return slot;
+    }
+
+    /**
+     * Take one ID out of the set of free barrier IDs and hand it to the
+     * caller. The set must not be empty.
+     */
+    int
+    getFreeBarrierId()
+    {
+        assert(!freeBarrierIds.empty());
+        const int bar_id = *freeBarrierIds.begin();
+        // IDs are unique within the set, so erasing by value removes
+        // exactly the element that was just read.
+        freeBarrierIds.erase(bar_id);
+        return bar_id;
+    }
+
+  public:
+    int numYetToReachBarrier(int bar_id);
+    bool allAtBarrier(int bar_id);
+    void incNumAtBarrier(int bar_id);
+    int numAtBarrier(int bar_id);
+    int maxBarrierCnt(int bar_id);
+    void resetBarrier(int bar_id);
+    void decMaxBarrierCnt(int bar_id);
+    void releaseBarrier(int bar_id);
+    void releaseWFsFromBarrier(int bar_id);
+    int numBarrierSlots() const { return _numBarrierSlots; }
  
      template<typename c0, typename c1>
      void doSmReturn(GPUDynInstPtr gpuDynInst);
@@ -455,6 +592,7 @@ class ComputeUnit : public ClockedObject
      Stats::Scalar dynamicFlatMemInstrCnt;
      Stats::Scalar dynamicLMemInstrCnt;
  
+    Stats::Scalar wgBlockedDueBarrierAllocation;
      Stats::Scalar wgBlockedDueLdsAllocation;
      // Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are
      // active when the instruction is committed, this number is still
@@ -974,10 +1112,20 @@ class ComputeUnit : public ClockedObject
  
    private:
      const int _cacheLineSize;
+    const int _numBarrierSlots;
      int cacheLineBits;
      InstSeqNum globalSeqNum;
      int wavefrontSize;
  
+    /**
+     * The barrier slots for this CU.
+     */
+    std::vector<WFBarrier> wfBarrierSlots;
+    /**
+     * A set used to easily retrieve a free barrier ID.
+     */
+    std::unordered_set<int> freeBarrierIds;
+
      // hold the time of the arrival of the first cache block related to
      // a particular GPUDynInst. This is used to calculate the difference
      // between the first and last chace block arrival times.
diff --git a/src/gpu-compute/scoreboard_check_stage.cc b/src/gpu-compute/scoreboard_check_stage.cc

index c4b9b9fb66849c801d6654b0a3093212eb991464..7b715cebf977ab59a361a25a5a23c03105328160 100644 (file)
--- a/src/gpu-compute/scoreboard_check_stage.cc
+++ b/src/gpu-compute/scoreboard_check_stage.cc
@@ -35,6 +35,7 @@
  
  #include "debug/GPUExec.hh"
  #include "debug/GPUSched.hh"
+#include "debug/GPUSync.hh"
  #include "gpu-compute/compute_unit.hh"
  #include "gpu-compute/gpu_static_inst.hh"
  #include "gpu-compute/scalar_register_file.hh"
@@ -101,15 +102,23 @@ ScoreboardCheckStage::ready(Wavefront *w, nonrdytype_e *rdyStatus,
      // Is the wave waiting at a barrier. Check this condition BEFORE checking
      // for instruction buffer occupancy to avoid a deadlock when the barrier is
      // the last instruction in the instruction buffer.
-    if (w->stalledAtBarrier) {
-        if (!computeUnit->AllAtBarrier(w->barrierId,w->barrierCnt,
-                        computeUnit->getRefCounter(w->dispatchId, w->wgId))) {
+    if (w->getStatus() == Wavefront::S_BARRIER) {
+        assert(w->hasBarrier());
+        int bar_id = w->barrierId();
+        if (!computeUnit->allAtBarrier(bar_id)) {
+            DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - Stalled at "
+                    "barrier Id%d. %d waves remain.\n", w->computeUnit->cu_id,
+                    w->simdId, w->wfSlotId, w->wfDynId, bar_id,
+                    w->computeUnit->numYetToReachBarrier(bar_id));
              // Are all threads at barrier?
              *rdyStatus = NRDY_BARRIER_WAIT;
              return false;
          }
-        w->oldBarrierCnt = w->barrierCnt;
-        w->stalledAtBarrier = false;
+        DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - All waves at barrier "
+                "Id%d. Resetting barrier resources.\n", w->computeUnit->cu_id,
+                w->simdId, w->wfSlotId, w->wfDynId, bar_id);
+        computeUnit->resetBarrier(bar_id);
+        computeUnit->releaseWFsFromBarrier(bar_id);
      }
  
      // Check WF status: it has to be running
diff --git a/src/gpu-compute/shader.cc b/src/gpu-compute/shader.cc

index 59ce23971fe34eeb9fbe6ee59eeba501dd18d6ec..1d88e855a47529aec1b08fa7606bccb776a63f78 100644 (file)
--- a/src/gpu-compute/shader.cc
+++ b/src/gpu-compute/shader.cc
@@ -244,7 +244,9 @@ Shader::dispatchWorkgroups(HSAQueueEntry *task)
          // dispatch workgroup iff the following two conditions are met:
          // (a) wg_rem is true - there are unassigned workgroups in the grid
          // (b) there are enough free slots in cu cuList[i] for this wg
-        if (!task->dispComplete() && cuList[curCu]->hasDispResources(task)) {
+        int num_wfs_in_wg = 0;
+        bool can_disp = cuList[curCu]->hasDispResources(task, num_wfs_in_wg);
+        if (!task->dispComplete() && can_disp) {
              scheduledSomething = true;
              DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d: WG %d\n",
                              curCu, task->globalWgId());
@@ -259,7 +261,7 @@ Shader::dispatchWorkgroups(HSAQueueEntry *task)
  
              panic_if(_activeCus <= 0 || _activeCus > cuList.size(),
                       "Invalid activeCu size\n");
-            cuList[curCu]->dispWorkgroup(task);
+            cuList[curCu]->dispWorkgroup(task, num_wfs_in_wg);
  
              task->markWgDispatch();
          }
diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc

index c2c98ba0c3d6ea2fbea8d34ccc9ed54ffe9be16c..f72cd50fda03bd69a18644071aa7f1046b209f1f 100644 (file)
--- a/src/gpu-compute/wavefront.cc
+++ b/src/gpu-compute/wavefront.cc
@@ -52,7 +52,8 @@ WavefrontParams::create()
  Wavefront::Wavefront(const Params *p)
    : SimObject(p), wfSlotId(p->wf_slot_id), simdId(p->simdId),
      maxIbSize(p->max_ib_size), _gpuISA(*this),
-    vmWaitCnt(-1), expWaitCnt(-1), lgkmWaitCnt(-1)
+    vmWaitCnt(-1), expWaitCnt(-1), lgkmWaitCnt(-1),
+    barId(WFBarrier::InvalidID)
  {
      lastTrace = 0;
      execUnitId = -1;
@@ -75,9 +76,6 @@ Wavefront::Wavefront(const Params *p)
      scalarOutstandingReqsRdGm = 0;
      scalarOutstandingReqsWrGm = 0;
      lastNonIdleTick = 0;
-    barrierCnt = 0;
-    oldBarrierCnt = 0;
-    stalledAtBarrier = false;
      ldsChunk = nullptr;
  
      memTraceBusy = 0;
@@ -93,7 +91,6 @@ Wavefront::Wavefront(const Params *p)
      lastAddr.resize(p->wf_size);
      workItemFlatId.resize(p->wf_size);
      oldDgpr.resize(p->wf_size);
-    barCnt.resize(p->wf_size);
      for (int i = 0; i < 3; ++i) {
          workItemId[i].resize(p->wf_size);
      }
@@ -595,7 +592,7 @@ Wavefront::setStatus(status_e newStatus)
      if (computeUnit->idleCUTimeout > 0) {
          // Wavefront's status transitions to stalled or stopped
          if ((newStatus == S_STOPPED || newStatus == S_STALLED ||
-             newStatus == S_WAITCNT) &&
+             newStatus == S_WAITCNT || newStatus == S_BARRIER) &&
              (status != newStatus)) {
              computeUnit->idleWfs++;
              assert(computeUnit->idleWfs <=
@@ -607,7 +604,7 @@ Wavefront::setStatus(status_e newStatus)
              // Wavefront's status transitions to an active state (from
              // a stopped or stalled state)
          } else if ((status == S_STOPPED || status == S_STALLED ||
-                    status == S_WAITCNT) &&
+                    status == S_WAITCNT || status == S_BARRIER) &&
                     (status != newStatus)) {
              // if all WFs in the CU were idle then check if the idleness
              // period exceeded the timeout threshold
@@ -1214,12 +1211,6 @@ Wavefront::exec()
      }
  }
  
-bool
-Wavefront::waitingAtBarrier(int lane)
-{
-    return barCnt[lane] < maxBarCnt;
-}
-
  GPUDynInstPtr
  Wavefront::nextInstr()
  {
@@ -1414,3 +1405,29 @@ Wavefront::computeActualWgSz(HSAQueueEntry *task)
          actualWgSzTotal *= actualWgSz[d];
      }
  }
+
+/**
+ * Store the barrier ID this WF uses. The value may be
+ * WFBarrier::InvalidID or larger, and must be below the number of
+ * barrier slots reported by the parent CU.
+ */
+void
+Wavefront::barrierId(int bar_id)
+{
+    assert(WFBarrier::InvalidID <= bar_id);
+    assert(computeUnit->numBarrierSlots() > bar_id);
+    barId = bar_id;
+}
+
+/**
+ * @return the barrier ID stored for this WF, which is
+ * WFBarrier::InvalidID when none is assigned.
+ */
+int
+Wavefront::barrierId() const
+{
+    return this->barId;
+}
+
+/**
+ * @return true when the stored barrier ID is greater than
+ * WFBarrier::InvalidID, i.e., this WF has a barrier assigned.
+ */
+bool
+Wavefront::hasBarrier() const
+{
+    return WFBarrier::InvalidID < barId;
+}
+
+/**
+ * Detach this WF from its barrier by setting the stored barrier ID back
+ * to WFBarrier::InvalidID. The barrier slot itself is not touched here.
+ */
+void
+Wavefront::releaseBarrier()
+{
+    this->barId = WFBarrier::InvalidID;
+}
diff --git a/src/gpu-compute/wavefront.hh b/src/gpu-compute/wavefront.hh

index 451e5dfcbd44f09bf0b1ee5dbb2a12fb2e7061c3..e07af0eccfa112c08578b09cdd56f941552c0473 100644 (file)
--- a/src/gpu-compute/wavefront.hh
+++ b/src/gpu-compute/wavefront.hh
@@ -78,13 +78,13 @@ class Wavefront : public SimObject
           * and once they are satisfied, it will resume normal
           * operation.
           */
-        S_WAITCNT
+        S_WAITCNT,
+        /**
+         * WF is stalled at a barrier.
+         */
+        S_BARRIER
      };
  
-    uint32_t oldBarrierCnt;
-    uint32_t barrierCnt;
-    uint32_t barrierId;
-    uint32_t barrierSlots;
      // HW slot id where the WF is mapped to inside a SIMD unit
      const int wfSlotId;
      int kernId;
@@ -210,12 +210,6 @@ class Wavefront : public SimObject
      // Execution mask at wavefront start
      VectorMask initMask;
  
-    // number of barriers this WF has joined
-    std::vector<int> barCnt;
-    int maxBarCnt;
-    // Flag to stall a wave on barrier
-    bool stalledAtBarrier;
-
      // a pointer to the fraction of the LDS allocated
      // to this workgroup (thus this wavefront)
      LdsChunk *ldsChunk;
@@ -297,8 +291,6 @@ class Wavefront : public SimObject
      bool stopFetch();
      void regStats();
  
-    bool waitingAtBarrier(int lane);
-
      Addr pc() const;
      void pc(Addr new_pc);
  
@@ -321,6 +313,11 @@ class Wavefront : public SimObject
          return _gpuISA;
      }
  
+    void barrierId(int bar_id);
+    int barrierId() const;
+    bool hasBarrier() const;
+    void releaseBarrier();
+
    private:
      TheGpuISA::GPUISA _gpuISA;
  
@@ -349,6 +346,7 @@ class Wavefront : public SimObject
      status_e status;
      Addr _pc;
      VectorMask _execMask;
+    int barId;
  };
  
  #endif // __GPU_COMPUTE_WAVEFRONT_HH__
author	Tony Gutierrez <anthony.gutierrez@amd.com>
	Fri, 15 Jun 2018 20:00:58 +0000 (16:00 -0400)
committer	Anthony Gutierrez <anthony.gutierrez@amd.com>
	Thu, 16 Jul 2020 20:37:22 +0000 (20:37 +0000)
src/arch/gcn3/insts/instructions.cc		patch \| blob \| history
src/gpu-compute/GPU.py		patch \| blob \| history
src/gpu-compute/compute_unit.cc		patch \| blob \| history
src/gpu-compute/compute_unit.hh		patch \| blob \| history
src/gpu-compute/scoreboard_check_stage.cc		patch \| blob \| history
src/gpu-compute/shader.cc		patch \| blob \| history
src/gpu-compute/wavefront.cc		patch \| blob \| history
src/gpu-compute/wavefront.hh		patch \| blob \| history