From: Tony Gutierrez
Date: Fri, 15 Jun 2018 20:00:58 +0000 (-0400)
Subject: gpu-compute, arch-gcn3: refactor barriers
X-Git-Tag: v20.1.0.0~453
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=af621cd6e66921b0b5890d72c2ccf3d7ef6f3ac3;p=gem5.git

gpu-compute, arch-gcn3: refactor barriers

Barriers were not modeled properly. Firstly, barriers were allocated
to every WG that was launched, which is not correct: the CU would
provide an infinite number of barrier slots, whereas a real CU has
only a limited number of barrier slots. In addition, the CU should not
allocate a barrier slot to a WG with a single WF, since there is
nothing to sync when there is only one WF.

Beyond the modeling problems, there was also the issue of deadlock.
The barrier could deadlock because not all WFs were freed from the
barrier once it had been satisfied; instead, we relied on the
scoreboard stage to release them lazily, one-by-one. Under this
implementation the scoreboard may not fully release all WFs
participating in a barrier, because the first WF to be freed from the
barrier could reach an s_barrier instruction again, forever leaving
the barrier counts across WFs out-of-sync.

This change refactors the barrier logic to:

1) Create a proper barrier slot implementation.
2) Enforce (via a parameter) the number of barrier slots on the CU.
3) Simplify the logic and clean up the code (i.e., we no longer
   iterate through the entire WF list each time we check whether a
   barrier is satisfied).
4) Fix the deadlock issues.

Change-Id: If53955b54931886baaae322640a7b9da7a1595e0
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29943
Reviewed-by: Anthony Gutierrez
Maintainer: Anthony Gutierrez
Tested-by: kokoro
---

diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc
index 607e3c6f2..817b33916 100644
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -39,6 +39,7 @@
 
 #include "arch/gcn3/insts/inst_util.hh"
 #include "debug/GCN3.hh"
+#include "debug/GPUSync.hh"
 #include "gpu-compute/shader.hh"
 
 namespace Gcn3ISA
@@ -3709,6 +3710,7 @@ namespace Gcn3ISA
     Inst_SOPP__S_ENDPGM::execute(GPUDynInstPtr gpuDynInst)
     {
         Wavefront *wf = gpuDynInst->wavefront();
+        ComputeUnit *cu = gpuDynInst->computeUnit();
 
         // delete extra instructions fetched for completed work-items
         wf->instructionBuffer.erase(wf->instructionBuffer.begin() + 1,
@@ -3725,6 +3727,25 @@
         int refCount = wf->computeUnit->getLds()
             .decreaseRefCounter(wf->dispatchId, wf->wgId);
 
+        /**
+         * The parent WF of this instruction is exiting, therefore
+         * it should not participate in this barrier any longer. This
+         * prevents possible deadlock issues if WFs exit early.
+         */
+        int bar_id = WFBarrier::InvalidID;
+        if (wf->hasBarrier()) {
+            assert(wf->getStatus() != Wavefront::S_BARRIER);
+            bar_id = wf->barrierId();
+            assert(bar_id != WFBarrier::InvalidID);
+            wf->releaseBarrier();
+            cu->decMaxBarrierCnt(bar_id);
+            DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - Exiting the "
+                    "program and decrementing max barrier count for "
+                    "barrier Id%d. 
New max count: %d.\n", cu->cu_id, + wf->simdId, wf->wfSlotId, wf->wfDynId, bar_id, + cu->maxBarrierCnt(bar_id)); + } + DPRINTF(GPUExec, "CU%d: decrease ref ctr WG[%d] to [%d]\n", wf->computeUnit->cu_id, wf->wgId, refCount); @@ -3748,6 +3769,20 @@ namespace Gcn3ISA wf->lastInstExec = 0; if (!refCount) { + /** + * If all WFs have finished, and hence the WG has finished, + * then we can free up the barrier belonging to the parent + * WG, but only if we actually used a barrier (i.e., more + * than one WF in the WG). + */ + if (bar_id != WFBarrier::InvalidID) { + DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - All waves are " + "now complete. Releasing barrier Id%d.\n", cu->cu_id, + wf->simdId, wf->wfSlotId, wf->wfDynId, + wf->barrierId()); + cu->releaseBarrier(bar_id); + } + /** * Last wavefront of the workgroup has executed return. If the * workgroup is not the final one in the kernel, then simply @@ -4027,12 +4062,21 @@ namespace Gcn3ISA Inst_SOPP__S_BARRIER::execute(GPUDynInstPtr gpuDynInst) { Wavefront *wf = gpuDynInst->wavefront(); + ComputeUnit *cu = gpuDynInst->computeUnit(); - assert(wf->barrierCnt == wf->oldBarrierCnt); - - wf->barrierCnt = wf->oldBarrierCnt + 1; - wf->stalledAtBarrier = true; - } + if (wf->hasBarrier()) { + int bar_id = wf->barrierId(); + assert(wf->getStatus() != Wavefront::S_BARRIER); + wf->setStatus(Wavefront::S_BARRIER); + cu->incNumAtBarrier(bar_id); + DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - Stalling at " + "barrier Id%d. %d waves now at barrier, %d waves " + "remain.\n", cu->cu_id, wf->simdId, wf->wfSlotId, + wf->wfDynId, bar_id, cu->numAtBarrier(bar_id), + cu->numYetToReachBarrier(bar_id)); + } + } // execute + // --- Inst_SOPP__S_SETKILL class methods --- Inst_SOPP__S_SETKILL::Inst_SOPP__S_SETKILL(InFmt_SOPP *iFmt) : Inst_SOPP(iFmt, "s_setkill") diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py index 8a2ad812e..7408bf963 100644 --- a/src/gpu-compute/GPU.py +++ b/src/gpu-compute/GPU.py @@ -117,6 +117,7 @@ class ComputeUnit(ClockedObject): # Wavefront size is 64. This is configurable, however changing # this value to anything other than 64 will likely cause errors. 
wf_size = Param.Int(64, 'Wavefront size (in work items)') + num_barrier_slots = Param.Int(4, 'Number of barrier slots in a CU') num_SIMDs = Param.Int(4, 'number of SIMD units per CU') num_scalar_cores = Param.Int(1, 'number of Scalar cores per CU') num_scalar_mem_pipes = Param.Int(1, 'number of Scalar memory pipelines '\ diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc index 178fd6e96..0fcbb1ac6 100644 --- a/src/gpu-compute/compute_unit.cc +++ b/src/gpu-compute/compute_unit.cc @@ -86,13 +86,14 @@ ComputeUnit::ComputeUnit(const Params *p) : ClockedObject(p), prefetchStride(p->prefetch_stride), prefetchType(p->prefetch_prev_type), debugSegFault(p->debugSegFault), functionalTLB(p->functionalTLB), localMemBarrier(p->localMemBarrier), - countPages(p->countPages), barrier_id(0), + countPages(p->countPages), req_tick_latency(p->mem_req_latency * p->clk_domain->clockPeriod()), resp_tick_latency(p->mem_resp_latency * p->clk_domain->clockPeriod()), _masterId(p->system->getMasterId(this, "ComputeUnit")), lds(*p->localDataStore), gmTokenPort(name() + ".gmTokenPort", this), - _cacheLineSize(p->system->cacheLineSize()), globalSeqNum(0), - wavefrontSize(p->wf_size) + _cacheLineSize(p->system->cacheLineSize()), + _numBarrierSlots(p->num_barrier_slots), + globalSeqNum(0), wavefrontSize(p->wf_size) { /** * This check is necessary because std::bitset only provides conversion @@ -122,6 +123,12 @@ ComputeUnit::ComputeUnit(const Params *p) : ClockedObject(p), lastVaddrWF.resize(numVectorALUs); wfList.resize(numVectorALUs); + wfBarrierSlots.resize(p->num_barrier_slots, WFBarrier()); + + for (int i = 0; i < p->num_barrier_slots; ++i) { + freeBarrierIds.insert(i); + } + for (int j = 0; j < numVectorALUs; ++j) { lastVaddrWF[j].resize(p->n_wf); @@ -305,7 +312,7 @@ ComputeUnit::updateReadyList(int unitId) void ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk, - HSAQueueEntry *task, bool fetchContext) + HSAQueueEntry *task, int bar_id, bool fetchContext) { static int _n_wave = 0; @@ -323,6 +330,12 @@ ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk, w->wfId = waveId; w->initMask = init_mask.to_ullong(); + if (bar_id > WFBarrier::InvalidID) { + w->barrierId(bar_id); + } else { + assert(!w->hasBarrier()); + } + for (int k = 0; k < wfSize(); ++k) { w->workItemId[0][k] = (k + waveId * wfSize()) % w->actualWgSz[0]; w->workItemId[1][k] = ((k + waveId * wfSize()) / w->actualWgSz[0]) % @@ -335,14 +348,6 @@ ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk, w->workItemId[0][k]; } - w->barrierSlots = divCeil(w->actualWgSzTotal, wfSize()); - - w->barCnt.resize(wfSize(), 0); - - w->maxBarCnt = 0; - w->oldBarrierCnt = 0; - w->barrierCnt = 0; - // WG state w->wgId = task->globalWgId(); w->dispatchId = task->dispatchId(); @@ -350,9 +355,6 @@ ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk, w->workGroupId[1] = (w->wgId / task->numWg(0)) % task->numWg(1); w->workGroupId[2] = w->wgId / (task->numWg(0) * task->numWg(1)); - w->barrierId = barrier_id; - w->stalledAtBarrier = (w->oldBarrierCnt == w->barrierCnt) ? false : true; - // set the wavefront context to have a pointer to this section of the LDS w->ldsChunk = ldsChunk; @@ -367,8 +369,8 @@ ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk, w->dropFetch = true; DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: " - "WF[%d][%d]\n", _n_wave, w->barrierId, cu_id, w->simdId, - w->wfSlotId); + "WF[%d][%d]. 
Ref cnt:%d\n", _n_wave, w->barrierId(), cu_id, + w->simdId, w->wfSlotId, refCount); w->initRegState(task, w->actualWgSzTotal); w->start(_n_wave++, task->codeAddr()); @@ -407,7 +409,7 @@ ComputeUnit::doFlush(GPUDynInstPtr gpuDynInst) { } void -ComputeUnit::dispWorkgroup(HSAQueueEntry *task, bool startFromScheduler) +ComputeUnit::dispWorkgroup(HSAQueueEntry *task, int num_wfs_in_wg) { // If we aren't ticking, start it up! if (!tickEvent.scheduled()) { @@ -433,6 +435,28 @@ ComputeUnit::dispWorkgroup(HSAQueueEntry *task, bool startFromScheduler) int sregDemand = task->numScalarRegs(); int wave_id = 0; + int barrier_id = WFBarrier::InvalidID; + + /** + * If this WG only has one WF it will not consume any barrier + * resources because it has no need of them. + */ + if (num_wfs_in_wg > 1) { + /** + * Find a free barrier slot for this WG. Each WF in the WG will + * receive the same barrier ID. + */ + barrier_id = getFreeBarrierId(); + auto &wf_barrier = barrierSlot(barrier_id); + assert(!wf_barrier.maxBarrierCnt()); + assert(!wf_barrier.numAtBarrier()); + wf_barrier.setMaxBarrierCnt(num_wfs_in_wg); + + DPRINTF(GPUSync, "CU[%d] - Dispatching WG with barrier Id%d. " + "%d waves using this barrier.\n", cu_id, barrier_id, + num_wfs_in_wg); + } + // Assign WFs according to numWfsToSched vector, which is computed by // hasDispResources() for (int j = 0; j < shader->n_wf; ++j) { @@ -455,12 +479,11 @@ ComputeUnit::dispWorkgroup(HSAQueueEntry *task, bool startFromScheduler) registerManager->allocateRegisters(w, vregDemand, sregDemand); - startWavefront(w, wave_id, ldsChunk, task); + startWavefront(w, wave_id, ldsChunk, task, barrier_id); ++wave_id; } } } - ++barrier_id; } void @@ -485,7 +508,7 @@ ComputeUnit::deleteFromPipeMap(Wavefront *w) } bool -ComputeUnit::hasDispResources(HSAQueueEntry *task) +ComputeUnit::hasDispResources(HSAQueueEntry *task, int &num_wfs_in_wg) { // compute true size of workgroup (after clamping to grid size) int trueWgSize[HSAQueueEntry::MAX_DIM]; @@ -503,6 +526,13 @@ ComputeUnit::hasDispResources(HSAQueueEntry *task) // calculate the number of WFs in this WG int numWfs = (trueWgSizeTotal + wfSize() - 1) / wfSize(); + num_wfs_in_wg = numWfs; + + bool barrier_avail = true; + + if (numWfs > 1 && !freeBarrierIds.size()) { + barrier_avail = false; + } // calculate the number of 32-bit vector registers required by each // work item of the work group @@ -591,54 +621,89 @@ ComputeUnit::hasDispResources(HSAQueueEntry *task) wgBlockedDueLdsAllocation++; } + if (!barrier_avail) { + wgBlockedDueBarrierAllocation++; + } + // Return true if the following are all true: // (a) all WFs of the WG were mapped to free WF slots // (b) there are enough VGPRs to schedule all WFs to their SIMD units // (c) there are enough SGPRs on the CU to schedule all WFs // (d) there is enough space in LDS to allocate for all WFs bool can_dispatch = numMappedWfs == numWfs && vregAvail && sregAvail - && ldsAvail; + && ldsAvail && barrier_avail; return can_dispatch; } int -ComputeUnit::AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots) +ComputeUnit::numYetToReachBarrier(int bar_id) { - DPRINTF(GPUSync, "CU%d: Checking for All At Barrier\n", cu_id); - int ccnt = 0; + auto &wf_barrier = barrierSlot(bar_id); + return wf_barrier.numYetToReachBarrier(); +} - for (int i_simd = 0; i_simd < numVectorALUs; ++i_simd) { - for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf) { - Wavefront *w = wfList[i_simd][i_wf]; +bool +ComputeUnit::allAtBarrier(int bar_id) +{ + auto &wf_barrier = barrierSlot(bar_id); + return 
wf_barrier.allAtBarrier(); +} - if (w->getStatus() == Wavefront::S_RUNNING) { - DPRINTF(GPUSync, "Checking WF[%d][%d]\n", i_simd, i_wf); +void +ComputeUnit::incNumAtBarrier(int bar_id) +{ + auto &wf_barrier = barrierSlot(bar_id); + wf_barrier.incNumAtBarrier(); +} - DPRINTF(GPUSync, "wf->barrier_id = %d, _barrier_id = %d\n", - w->barrierId, _barrier_id); +int +ComputeUnit::numAtBarrier(int bar_id) +{ + auto &wf_barrier = barrierSlot(bar_id); + return wf_barrier.numAtBarrier(); +} - DPRINTF(GPUSync, "wf->barrierCnt %d, bcnt = %d\n", - w->barrierCnt, bcnt); +int +ComputeUnit::maxBarrierCnt(int bar_id) +{ + auto &wf_barrier = barrierSlot(bar_id); + return wf_barrier.maxBarrierCnt(); +} - DPRINTF(GPUSync, "outstanding Reqs = %d\n", - w->outstandingReqs); - } +void +ComputeUnit::resetBarrier(int bar_id) +{ + auto &wf_barrier = barrierSlot(bar_id); + wf_barrier.reset(); +} + +void +ComputeUnit::decMaxBarrierCnt(int bar_id) +{ + auto &wf_barrier = barrierSlot(bar_id); + wf_barrier.decMaxBarrierCnt(); +} - if (w->getStatus() == Wavefront::S_RUNNING && - w->barrierId == _barrier_id && w->barrierCnt == bcnt && - !w->outstandingReqs) { - ++ccnt; +void +ComputeUnit::releaseBarrier(int bar_id) +{ + auto &wf_barrier = barrierSlot(bar_id); + wf_barrier.release(); + freeBarrierIds.insert(bar_id); +} - DPRINTF(GPUSync, "WF[%d][%d] at barrier, increment ccnt to " - "%d\n", i_simd, i_wf, ccnt); +void +ComputeUnit::releaseWFsFromBarrier(int bar_id) +{ + for (int i = 0; i < numVectorALUs; ++i) { + for (int j = 0; j < shader->n_wf; ++j) { + Wavefront *wf = wfList[i][j]; + if (wf->barrierId() == bar_id) { + assert(wf->getStatus() == Wavefront::S_BARRIER); + wf->setStatus(Wavefront::S_RUNNING); } } } - - DPRINTF(GPUSync, "CU%d: returning allAtBarrier ccnt = %d, bslots = %d\n", - cu_id, ccnt, bslots); - - return ccnt == bslots; } // Execute one clock worth of work on the ComputeUnit. @@ -813,10 +878,6 @@ ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt) computeUnit->globalMemoryPipe.handleResponse(gpuDynInst); } - DPRINTF(GPUSync, "CU%d: WF[%d][%d]: barrierCnt = %d\n", - computeUnit->cu_id, gpuDynInst->simdId, - gpuDynInst->wfSlotId, w->barrierCnt); - delete pkt->senderState; delete pkt; return true; @@ -2204,6 +2265,11 @@ ComputeUnit::regStats() .desc("Number of dynamic non-GM memory insts executed") ; + wgBlockedDueBarrierAllocation + .name(name() + ".wg_blocked_due_barrier_alloc") + .desc("WG dispatch was blocked due to lack of barrier resources") + ; + wgBlockedDueLdsAllocation .name(name() + ".wg_blocked_due_lds_alloc") .desc("Workgroup blocked due to LDS capacity") diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh index 110097e0c..d4e978b40 100644 --- a/src/gpu-compute/compute_unit.hh +++ b/src/gpu-compute/compute_unit.hh @@ -79,6 +79,121 @@ enum TLB_CACHE TLB_HIT_CACHE_HIT }; +/** + * WF barrier slots. This represents the barrier resource for + * WF-level barriers (i.e., barriers to sync WFs within a WG). + */ +class WFBarrier +{ + public: + WFBarrier() : _numAtBarrier(0), _maxBarrierCnt(0) + { + } + + static const int InvalidID = -1; + + int + numAtBarrier() const + { + return _numAtBarrier; + } + + /** + * Number of WFs that have not yet reached the barrier. + */ + int + numYetToReachBarrier() const + { + return _maxBarrierCnt - _numAtBarrier; + } + + int + maxBarrierCnt() const + { + return _maxBarrierCnt; + } + + /** + * Set the maximum barrier count (i.e., the number of WFs that are + * participating in the barrier). 
+ */ + void + setMaxBarrierCnt(int max_barrier_cnt) + { + _maxBarrierCnt = max_barrier_cnt; + } + + /** + * Mark that a WF has reached the barrier. + */ + void + incNumAtBarrier() + { + assert(_numAtBarrier < _maxBarrierCnt); + ++_numAtBarrier; + } + + /** + * Have all WFs participating in this barrier reached the barrier? + * If so, then the barrier is satisfied and WFs may proceed past + * the barrier. + */ + bool + allAtBarrier() const + { + return _numAtBarrier == _maxBarrierCnt; + } + + /** + * Decrement the number of WFs that are participating in this barrier. + * This should be called when a WF exits. + */ + void + decMaxBarrierCnt() + { + assert(_maxBarrierCnt > 0); + --_maxBarrierCnt; + } + + /** + * Release this barrier resource so it can be used by other WGs. This + * is generally called when a WG has finished. + */ + void + release() + { + _numAtBarrier = 0; + _maxBarrierCnt = 0; + } + + /** + * Reset the barrier. This is used to reset the barrier, usually when + * a dynamic instance of a barrier has been satisfied. + */ + void + reset() + { + _numAtBarrier = 0; + } + + private: + /** + * The number of WFs in the WG that have reached the barrier. Once + * the number of WFs that reach a barrier matches the number of WFs + * in the WG, the barrier is satisfied. + */ + int _numAtBarrier; + + /** + * The maximum number of WFs that can reach this barrier. This is + * essentially the number of WFs in the WG, and a barrier is satisfied + * when the number of WFs that reach the barrier equal this value. If + * a WF exits early it must decrement this value so that it is no + * longer considered for this barrier. + */ + int _maxBarrierCnt; +}; + class ComputeUnit : public ClockedObject { public: @@ -277,7 +392,6 @@ class ComputeUnit : public ClockedObject bool countPages; Shader *shader; - uint32_t barrier_id; Tick req_tick_latency; Tick resp_tick_latency; @@ -328,24 +442,47 @@ class ComputeUnit : public ClockedObject void fillKernelState(Wavefront *w, HSAQueueEntry *task); void startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk, - HSAQueueEntry *task, bool fetchContext=false); + HSAQueueEntry *task, int bar_id, + bool fetchContext=false); void doInvalidate(RequestPtr req, int kernId); void doFlush(GPUDynInstPtr gpuDynInst); - void dispWorkgroup(HSAQueueEntry *task, bool startFromScheduler=false); - bool hasDispResources(HSAQueueEntry *task); + void dispWorkgroup(HSAQueueEntry *task, int num_wfs_in_wg); + bool hasDispResources(HSAQueueEntry *task, int &num_wfs_in_wg); int cacheLineSize() const { return _cacheLineSize; } int getCacheLineBits() const { return cacheLineBits; } - /* This function cycles through all the wavefronts in all the phases to see - * if all of the wavefronts which should be associated with one barrier - * (denoted with _barrier_id), are all at the same barrier in the program - * (denoted by bcnt). When the number at the barrier matches bslots, then - * return true. 
- */ - int AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots); + private: + WFBarrier& + barrierSlot(int bar_id) + { + assert(bar_id > WFBarrier::InvalidID); + return wfBarrierSlots.at(bar_id); + } + + int + getFreeBarrierId() + { + assert(freeBarrierIds.size()); + auto free_bar_id = freeBarrierIds.begin(); + int bar_id = *free_bar_id; + freeBarrierIds.erase(free_bar_id); + return bar_id; + } + + public: + int numYetToReachBarrier(int bar_id); + bool allAtBarrier(int bar_id); + void incNumAtBarrier(int bar_id); + int numAtBarrier(int bar_id); + int maxBarrierCnt(int bar_id); + void resetBarrier(int bar_id); + void decMaxBarrierCnt(int bar_id); + void releaseBarrier(int bar_id); + void releaseWFsFromBarrier(int bar_id); + int numBarrierSlots() const { return _numBarrierSlots; } template void doSmReturn(GPUDynInstPtr gpuDynInst); @@ -455,6 +592,7 @@ class ComputeUnit : public ClockedObject Stats::Scalar dynamicFlatMemInstrCnt; Stats::Scalar dynamicLMemInstrCnt; + Stats::Scalar wgBlockedDueBarrierAllocation; Stats::Scalar wgBlockedDueLdsAllocation; // Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are // active when the instruction is committed, this number is still @@ -974,10 +1112,20 @@ class ComputeUnit : public ClockedObject private: const int _cacheLineSize; + const int _numBarrierSlots; int cacheLineBits; InstSeqNum globalSeqNum; int wavefrontSize; + /** + * The barrier slots for this CU. + */ + std::vector wfBarrierSlots; + /** + * A set used to easily retrieve a free barrier ID. + */ + std::unordered_set freeBarrierIds; + // hold the time of the arrival of the first cache block related to // a particular GPUDynInst. This is used to calculate the difference // between the first and last chace block arrival times. diff --git a/src/gpu-compute/scoreboard_check_stage.cc b/src/gpu-compute/scoreboard_check_stage.cc index c4b9b9fb6..7b715cebf 100644 --- a/src/gpu-compute/scoreboard_check_stage.cc +++ b/src/gpu-compute/scoreboard_check_stage.cc @@ -35,6 +35,7 @@ #include "debug/GPUExec.hh" #include "debug/GPUSched.hh" +#include "debug/GPUSync.hh" #include "gpu-compute/compute_unit.hh" #include "gpu-compute/gpu_static_inst.hh" #include "gpu-compute/scalar_register_file.hh" @@ -101,15 +102,23 @@ ScoreboardCheckStage::ready(Wavefront *w, nonrdytype_e *rdyStatus, // Is the wave waiting at a barrier. Check this condition BEFORE checking // for instruction buffer occupancy to avoid a deadlock when the barrier is // the last instruction in the instruction buffer. - if (w->stalledAtBarrier) { - if (!computeUnit->AllAtBarrier(w->barrierId,w->barrierCnt, - computeUnit->getRefCounter(w->dispatchId, w->wgId))) { + if (w->getStatus() == Wavefront::S_BARRIER) { + assert(w->hasBarrier()); + int bar_id = w->barrierId(); + if (!computeUnit->allAtBarrier(bar_id)) { + DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - Stalled at " + "barrier Id%d. %d waves remain.\n", w->computeUnit->cu_id, + w->simdId, w->wfSlotId, w->wfDynId, bar_id, + w->computeUnit->numYetToReachBarrier(bar_id)); // Are all threads at barrier? *rdyStatus = NRDY_BARRIER_WAIT; return false; } - w->oldBarrierCnt = w->barrierCnt; - w->stalledAtBarrier = false; + DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - All waves at barrier " + "Id%d. 
Resetting barrier resources.\n", w->computeUnit->cu_id, + w->simdId, w->wfSlotId, w->wfDynId, bar_id); + computeUnit->resetBarrier(bar_id); + computeUnit->releaseWFsFromBarrier(bar_id); } // Check WF status: it has to be running diff --git a/src/gpu-compute/shader.cc b/src/gpu-compute/shader.cc index 59ce23971..1d88e855a 100644 --- a/src/gpu-compute/shader.cc +++ b/src/gpu-compute/shader.cc @@ -244,7 +244,9 @@ Shader::dispatchWorkgroups(HSAQueueEntry *task) // dispatch workgroup iff the following two conditions are met: // (a) wg_rem is true - there are unassigned workgroups in the grid // (b) there are enough free slots in cu cuList[i] for this wg - if (!task->dispComplete() && cuList[curCu]->hasDispResources(task)) { + int num_wfs_in_wg = 0; + bool can_disp = cuList[curCu]->hasDispResources(task, num_wfs_in_wg); + if (!task->dispComplete() && can_disp) { scheduledSomething = true; DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d: WG %d\n", curCu, task->globalWgId()); @@ -259,7 +261,7 @@ Shader::dispatchWorkgroups(HSAQueueEntry *task) panic_if(_activeCus <= 0 || _activeCus > cuList.size(), "Invalid activeCu size\n"); - cuList[curCu]->dispWorkgroup(task); + cuList[curCu]->dispWorkgroup(task, num_wfs_in_wg); task->markWgDispatch(); } diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc index c2c98ba0c..f72cd50fd 100644 --- a/src/gpu-compute/wavefront.cc +++ b/src/gpu-compute/wavefront.cc @@ -52,7 +52,8 @@ WavefrontParams::create() Wavefront::Wavefront(const Params *p) : SimObject(p), wfSlotId(p->wf_slot_id), simdId(p->simdId), maxIbSize(p->max_ib_size), _gpuISA(*this), - vmWaitCnt(-1), expWaitCnt(-1), lgkmWaitCnt(-1) + vmWaitCnt(-1), expWaitCnt(-1), lgkmWaitCnt(-1), + barId(WFBarrier::InvalidID) { lastTrace = 0; execUnitId = -1; @@ -75,9 +76,6 @@ Wavefront::Wavefront(const Params *p) scalarOutstandingReqsRdGm = 0; scalarOutstandingReqsWrGm = 0; lastNonIdleTick = 0; - barrierCnt = 0; - oldBarrierCnt = 0; - stalledAtBarrier = false; ldsChunk = nullptr; memTraceBusy = 0; @@ -93,7 +91,6 @@ Wavefront::Wavefront(const Params *p) lastAddr.resize(p->wf_size); workItemFlatId.resize(p->wf_size); oldDgpr.resize(p->wf_size); - barCnt.resize(p->wf_size); for (int i = 0; i < 3; ++i) { workItemId[i].resize(p->wf_size); } @@ -595,7 +592,7 @@ Wavefront::setStatus(status_e newStatus) if (computeUnit->idleCUTimeout > 0) { // Wavefront's status transitions to stalled or stopped if ((newStatus == S_STOPPED || newStatus == S_STALLED || - newStatus == S_WAITCNT) && + newStatus == S_WAITCNT || newStatus == S_BARRIER) && (status != newStatus)) { computeUnit->idleWfs++; assert(computeUnit->idleWfs <= @@ -607,7 +604,7 @@ Wavefront::setStatus(status_e newStatus) // Wavefront's status transitions to an active state (from // a stopped or stalled state) } else if ((status == S_STOPPED || status == S_STALLED || - status == S_WAITCNT) && + status == S_WAITCNT || status == S_BARRIER) && (status != newStatus)) { // if all WFs in the CU were idle then check if the idleness // period exceeded the timeout threshold @@ -1214,12 +1211,6 @@ Wavefront::exec() } } -bool -Wavefront::waitingAtBarrier(int lane) -{ - return barCnt[lane] < maxBarCnt; -} - GPUDynInstPtr Wavefront::nextInstr() { @@ -1414,3 +1405,29 @@ Wavefront::computeActualWgSz(HSAQueueEntry *task) actualWgSzTotal *= actualWgSz[d]; } } + +void +Wavefront::barrierId(int bar_id) +{ + assert(bar_id >= WFBarrier::InvalidID); + assert(bar_id < computeUnit->numBarrierSlots()); + barId = bar_id; +} + +int +Wavefront::barrierId() const +{ + return barId; +} 
+ +bool +Wavefront::hasBarrier() const +{ + return barId > WFBarrier::InvalidID; +} + +void +Wavefront::releaseBarrier() +{ + barId = WFBarrier::InvalidID; +} diff --git a/src/gpu-compute/wavefront.hh b/src/gpu-compute/wavefront.hh index 451e5dfcb..e07af0ecc 100644 --- a/src/gpu-compute/wavefront.hh +++ b/src/gpu-compute/wavefront.hh @@ -78,13 +78,13 @@ class Wavefront : public SimObject * and once they are satisfied, it will resume normal * operation. */ - S_WAITCNT + S_WAITCNT, + /** + * WF is stalled at a barrier. + */ + S_BARRIER }; - uint32_t oldBarrierCnt; - uint32_t barrierCnt; - uint32_t barrierId; - uint32_t barrierSlots; // HW slot id where the WF is mapped to inside a SIMD unit const int wfSlotId; int kernId; @@ -210,12 +210,6 @@ class Wavefront : public SimObject // Execution mask at wavefront start VectorMask initMask; - // number of barriers this WF has joined - std::vector barCnt; - int maxBarCnt; - // Flag to stall a wave on barrier - bool stalledAtBarrier; - // a pointer to the fraction of the LDS allocated // to this workgroup (thus this wavefront) LdsChunk *ldsChunk; @@ -297,8 +291,6 @@ class Wavefront : public SimObject bool stopFetch(); void regStats(); - bool waitingAtBarrier(int lane); - Addr pc() const; void pc(Addr new_pc); @@ -321,6 +313,11 @@ class Wavefront : public SimObject return _gpuISA; } + void barrierId(int bar_id); + int barrierId() const; + bool hasBarrier() const; + void releaseBarrier(); + private: TheGpuISA::GPUISA _gpuISA; @@ -349,6 +346,7 @@ class Wavefront : public SimObject status_e status; Addr _pc; VectorMask _execMask; + int barId; }; #endif // __GPU_COMPUTE_WAVEFRONT_HH__
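
For readers who want the new barrier protocol at a glance without tracing the
diff: the sketch below is a minimal, standalone model of the barrier-slot
lifecycle described in the commit message. It is not part of the commit; the
WFBarrier methods mirror the ones added to src/gpu-compute/compute_unit.hh,
while the surrounding driver code (the slot pool and the call sequence in
main()) is purely illustrative.

```cpp
// Illustrative sketch of the barrier-slot lifecycle (not part of the commit).
#include <cassert>
#include <iostream>
#include <unordered_set>
#include <vector>

class WFBarrier
{
  public:
    void setMaxBarrierCnt(int n) { _maxBarrierCnt = n; }       // WG dispatch
    void incNumAtBarrier()                                     // s_barrier
    { assert(_numAtBarrier < _maxBarrierCnt); ++_numAtBarrier; }
    void decMaxBarrierCnt()                                    // s_endpgm (early exit)
    { assert(_maxBarrierCnt > 0); --_maxBarrierCnt; }
    bool allAtBarrier() const { return _numAtBarrier == _maxBarrierCnt; }
    void reset() { _numAtBarrier = 0; }                        // dynamic barrier satisfied
    void release() { _numAtBarrier = 0; _maxBarrierCnt = 0; }  // WG finished

  private:
    int _numAtBarrier = 0;
    int _maxBarrierCnt = 0;
};

int main()
{
    // A CU with a fixed number of barrier slots (cf. num_barrier_slots = 4).
    std::vector<WFBarrier> slots(4);
    std::unordered_set<int> freeIds{0, 1, 2, 3};

    // Dispatch: a WG with 3 WFs grabs one free slot; single-WF WGs skip this.
    int barId = *freeIds.begin();
    freeIds.erase(barId);
    slots[barId].setMaxBarrierCnt(3);

    // Two WFs reach s_barrier; the barrier is not yet satisfied.
    slots[barId].incNumAtBarrier();
    slots[barId].incNumAtBarrier();
    std::cout << "satisfied: " << slots[barId].allAtBarrier() << '\n'; // 0

    // The third WF exits early (s_endpgm), so it stops participating and the
    // barrier becomes satisfied without it -- this is the deadlock fix.
    slots[barId].decMaxBarrierCnt();
    std::cout << "satisfied: " << slots[barId].allAtBarrier() << '\n'; // 1

    // Scoreboard stage: reset the dynamic barrier and wake all waiters at
    // once; when the whole WG completes, return the slot to the free pool.
    slots[barId].reset();
    slots[barId].release();
    freeIds.insert(barId);
    return 0;
}
```

The key design point the sketch illustrates is that the barrier slot is the
single source of truth: WFs no longer carry per-lane barrier counters, the
scoreboard releases every waiter in one step, and an exiting WF shrinks the
participant count so the remaining WFs can still satisfy the barrier.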