#include "arch/gcn3/insts/inst_util.hh"
#include "debug/GCN3.hh"
+#include "debug/GPUSync.hh"
#include "gpu-compute/shader.hh"
namespace Gcn3ISA
Inst_SOPP__S_ENDPGM::execute(GPUDynInstPtr gpuDynInst)
{
Wavefront *wf = gpuDynInst->wavefront();
+ ComputeUnit *cu = gpuDynInst->computeUnit();
// delete extra instructions fetched for completed work-items
wf->instructionBuffer.erase(wf->instructionBuffer.begin() + 1,
                            wf->instructionBuffer.end());
int refCount = wf->computeUnit->getLds()
.decreaseRefCounter(wf->dispatchId, wf->wgId);
+ /**
+ * The parent WF of this instruction is exiting; therefore, it
+ * should no longer participate in this barrier. This prevents
+ * possible deadlocks if WFs exit early.
+ */
+ int bar_id = WFBarrier::InvalidID;
+ if (wf->hasBarrier()) {
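+ // A WF that is still parked at a barrier cannot be the one executing
+ // s_endpgm, so its status must not be S_BARRIER here.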
+ assert(wf->getStatus() != Wavefront::S_BARRIER);
+ bar_id = wf->barrierId();
+ assert(bar_id != WFBarrier::InvalidID);
+ wf->releaseBarrier();
+ cu->decMaxBarrierCnt(bar_id);
+ DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - Exiting the "
+ "program and decrementing max barrier count for "
+ "barrier Id%d. New max count: %d.\n", cu->cu_id,
+ wf->simdId, wf->wfSlotId, wf->wfDynId, bar_id,
+ cu->maxBarrierCnt(bar_id));
+ }
+
DPRINTF(GPUExec, "CU%d: decrease ref ctr WG[%d] to [%d]\n",
wf->computeUnit->cu_id, wf->wgId, refCount);
wf->lastInstExec = 0;
if (!refCount) {
+ /**
+ * If all WFs have finished, and hence the WG has finished,
+ * then we can free up the barrier belonging to the parent
+ * WG, but only if we actually used a barrier (i.e., more
+ * than one WF in the WG).
+ */
+ if (bar_id != WFBarrier::InvalidID) {
+ DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - All waves are "
+ "now complete. Releasing barrier Id%d.\n", cu->cu_id,
+ wf->simdId, wf->wfSlotId, wf->wfDynId,
+ wf->barrierId());
+ cu->releaseBarrier(bar_id);
+ }
+
/**
* Last wavefront of the workgroup has executed return. If the
* workgroup is not the final one in the kernel, then simply
Inst_SOPP__S_BARRIER::execute(GPUDynInstPtr gpuDynInst)
{
Wavefront *wf = gpuDynInst->wavefront();
+ ComputeUnit *cu = gpuDynInst->computeUnit();
- assert(wf->barrierCnt == wf->oldBarrierCnt);
-
- wf->barrierCnt = wf->oldBarrierCnt + 1;
- wf->stalledAtBarrier = true;
- }
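+ // Single-WF WGs are dispatched without a barrier ID and fall
+ // straight through; otherwise, stall this WF and count it toward
+ // its WG's barrier.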
+ if (wf->hasBarrier()) {
+ int bar_id = wf->barrierId();
+ assert(wf->getStatus() != Wavefront::S_BARRIER);
+ wf->setStatus(Wavefront::S_BARRIER);
+ cu->incNumAtBarrier(bar_id);
+ DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - Stalling at "
+ "barrier Id%d. %d waves now at barrier, %d waves "
+ "remain.\n", cu->cu_id, wf->simdId, wf->wfSlotId,
+ wf->wfDynId, bar_id, cu->numAtBarrier(bar_id),
+ cu->numYetToReachBarrier(bar_id));
+ }
+ } // execute
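+ // Note: s_barrier only implements the arrival half of the handshake.
+ // The release half is in the wavefront ready check below: once
+ // allAtBarrier(bar_id) holds, the CU resets the slot and flips every
+ // participating WF from S_BARRIER back to S_RUNNING.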
+ // --- Inst_SOPP__S_SETKILL class methods ---
Inst_SOPP__S_SETKILL::Inst_SOPP__S_SETKILL(InFmt_SOPP *iFmt)
: Inst_SOPP(iFmt, "s_setkill")
# Wavefront size is 64. This is configurable; however, changing
# this value to anything other than 64 will likely cause errors.
wf_size = Param.Int(64, 'Wavefront size (in work items)')
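+# A WG with more than one WF holds a barrier slot for its lifetime, so
+# this bounds how many multi-WF WGs can be resident on a CU at once.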
+ num_barrier_slots = Param.Int(4, 'Number of barrier slots in a CU')
num_SIMDs = Param.Int(4, 'number of SIMD units per CU')
num_scalar_cores = Param.Int(1, 'number of Scalar cores per CU')
num_scalar_mem_pipes = Param.Int(1, 'number of Scalar memory pipelines '\
prefetchStride(p->prefetch_stride), prefetchType(p->prefetch_prev_type),
debugSegFault(p->debugSegFault),
functionalTLB(p->functionalTLB), localMemBarrier(p->localMemBarrier),
- countPages(p->countPages), barrier_id(0),
+ countPages(p->countPages),
req_tick_latency(p->mem_req_latency * p->clk_domain->clockPeriod()),
resp_tick_latency(p->mem_resp_latency * p->clk_domain->clockPeriod()),
_masterId(p->system->getMasterId(this, "ComputeUnit")),
lds(*p->localDataStore), gmTokenPort(name() + ".gmTokenPort", this),
- _cacheLineSize(p->system->cacheLineSize()), globalSeqNum(0),
- wavefrontSize(p->wf_size)
+ _cacheLineSize(p->system->cacheLineSize()),
+ _numBarrierSlots(p->num_barrier_slots),
+ globalSeqNum(0), wavefrontSize(p->wf_size)
{
/**
* This check is necessary because std::bitset only provides conversion
lastVaddrWF.resize(numVectorALUs);
wfList.resize(numVectorALUs);
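+ // Construct all barrier slots up front and mark every ID as free;
+ // IDs are handed out at WG dispatch and returned on WG completion.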
+ wfBarrierSlots.resize(p->num_barrier_slots, WFBarrier());
+
+ for (int i = 0; i < p->num_barrier_slots; ++i) {
+ freeBarrierIds.insert(i);
+ }
+
for (int j = 0; j < numVectorALUs; ++j) {
lastVaddrWF[j].resize(p->n_wf);
void
ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
- HSAQueueEntry *task, bool fetchContext)
+ HSAQueueEntry *task, int bar_id, bool fetchContext)
{
static int _n_wave = 0;
w->wfId = waveId;
w->initMask = init_mask.to_ullong();
+ if (bar_id > WFBarrier::InvalidID) {
+ w->barrierId(bar_id);
+ } else {
+ assert(!w->hasBarrier());
+ }
+
for (int k = 0; k < wfSize(); ++k) {
w->workItemId[0][k] = (k + waveId * wfSize()) % w->actualWgSz[0];
w->workItemId[1][k] = ((k + waveId * wfSize()) / w->actualWgSz[0]) %
    w->actualWgSz[1];
w->workItemId[2][k] = (k + waveId * wfSize()) /
    (w->actualWgSz[0] * w->actualWgSz[1]);
w->workItemFlatId[k] = w->workItemId[2][k] * w->actualWgSz[0] *
    w->actualWgSz[1] + w->workItemId[1][k] * w->actualWgSz[0] +
    w->workItemId[0][k];
}
- w->barrierSlots = divCeil(w->actualWgSzTotal, wfSize());
-
- w->barCnt.resize(wfSize(), 0);
-
- w->maxBarCnt = 0;
- w->oldBarrierCnt = 0;
- w->barrierCnt = 0;
-
// WG state
w->wgId = task->globalWgId();
w->dispatchId = task->dispatchId();
w->workGroupId[1] = (w->wgId / task->numWg(0)) % task->numWg(1);
w->workGroupId[2] = w->wgId / (task->numWg(0) * task->numWg(1));
- w->barrierId = barrier_id;
- w->stalledAtBarrier = (w->oldBarrierCnt == w->barrierCnt) ? false : true;
-
// set the wavefront context to have a pointer to this section of the LDS
w->ldsChunk = ldsChunk;
w->dropFetch = true;
DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: "
- "WF[%d][%d]\n", _n_wave, w->barrierId, cu_id, w->simdId,
- w->wfSlotId);
+ "WF[%d][%d]. Ref cnt:%d\n", _n_wave, w->barrierId(), cu_id,
+ w->simdId, w->wfSlotId, refCount);
w->initRegState(task, w->actualWgSzTotal);
w->start(_n_wave++, task->codeAddr());
}
void
-ComputeUnit::dispWorkgroup(HSAQueueEntry *task, bool startFromScheduler)
+ComputeUnit::dispWorkgroup(HSAQueueEntry *task, int num_wfs_in_wg)
{
// If we aren't ticking, start it up!
if (!tickEvent.scheduled()) {
int sregDemand = task->numScalarRegs();
int wave_id = 0;
+ int barrier_id = WFBarrier::InvalidID;
+
+ /**
+ * If this WG only has one WF it will not consume any barrier
+ * resources because it has no need of them.
+ */
+ if (num_wfs_in_wg > 1) {
+ /**
+ * Find a free barrier slot for this WG. Each WF in the WG will
+ * receive the same barrier ID.
+ */
+ barrier_id = getFreeBarrierId();
+ auto &wf_barrier = barrierSlot(barrier_id);
+ assert(!wf_barrier.maxBarrierCnt());
+ assert(!wf_barrier.numAtBarrier());
+ wf_barrier.setMaxBarrierCnt(num_wfs_in_wg);
+
+ DPRINTF(GPUSync, "CU[%d] - Dispatching WG with barrier Id%d. "
+ "%d waves using this barrier.\n", cu_id, barrier_id,
+ num_wfs_in_wg);
+ }
+
// Assign WFs according to numWfsToSched vector, which is computed by
// hasDispResources()
for (int j = 0; j < shader->n_wf; ++j) {
registerManager->allocateRegisters(w, vregDemand, sregDemand);
- startWavefront(w, wave_id, ldsChunk, task);
+ startWavefront(w, wave_id, ldsChunk, task, barrier_id);
++wave_id;
}
}
}
- ++barrier_id;
}
void
}
bool
-ComputeUnit::hasDispResources(HSAQueueEntry *task)
+ComputeUnit::hasDispResources(HSAQueueEntry *task, int &num_wfs_in_wg)
{
// compute true size of workgroup (after clamping to grid size)
int trueWgSize[HSAQueueEntry::MAX_DIM];
// calculate the number of WFs in this WG
int numWfs = (trueWgSizeTotal + wfSize() - 1) / wfSize();
+ num_wfs_in_wg = numWfs;
+
+ bool barrier_avail = true;
+
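+ // Only multi-WF WGs need a barrier slot; a WG with a single WF can
+ // always dispatch as far as barrier resources are concerned.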
+ if (numWfs > 1 && !freeBarrierIds.size()) {
+ barrier_avail = false;
+ }
// calculate the number of 32-bit vector registers required by each
// work item of the work group
wgBlockedDueLdsAllocation++;
}
+ if (!barrier_avail) {
+ wgBlockedDueBarrierAllocation++;
+ }
+
// Return true if the following are all true:
// (a) all WFs of the WG were mapped to free WF slots
// (b) there are enough VGPRs to schedule all WFs to their SIMD units
// (c) there are enough SGPRs on the CU to schedule all WFs
// (d) there is enough space in LDS to allocate for all WFs
+// (e) a free barrier slot exists (only needed for multi-WF WGs)
bool can_dispatch = numMappedWfs == numWfs && vregAvail && sregAvail
- && ldsAvail;
+ && ldsAvail && barrier_avail;
return can_dispatch;
}
int
-ComputeUnit::AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots)
+ComputeUnit::numYetToReachBarrier(int bar_id)
{
- DPRINTF(GPUSync, "CU%d: Checking for All At Barrier\n", cu_id);
- int ccnt = 0;
+ auto &wf_barrier = barrierSlot(bar_id);
+ return wf_barrier.numYetToReachBarrier();
+}
- for (int i_simd = 0; i_simd < numVectorALUs; ++i_simd) {
- for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf) {
- Wavefront *w = wfList[i_simd][i_wf];
+bool
+ComputeUnit::allAtBarrier(int bar_id)
+{
+ auto &wf_barrier = barrierSlot(bar_id);
+ return wf_barrier.allAtBarrier();
+}
- if (w->getStatus() == Wavefront::S_RUNNING) {
- DPRINTF(GPUSync, "Checking WF[%d][%d]\n", i_simd, i_wf);
+void
+ComputeUnit::incNumAtBarrier(int bar_id)
+{
+ auto &wf_barrier = barrierSlot(bar_id);
+ wf_barrier.incNumAtBarrier();
+}
- DPRINTF(GPUSync, "wf->barrier_id = %d, _barrier_id = %d\n",
- w->barrierId, _barrier_id);
+int
+ComputeUnit::numAtBarrier(int bar_id)
+{
+ auto &wf_barrier = barrierSlot(bar_id);
+ return wf_barrier.numAtBarrier();
+}
- DPRINTF(GPUSync, "wf->barrierCnt %d, bcnt = %d\n",
- w->barrierCnt, bcnt);
+int
+ComputeUnit::maxBarrierCnt(int bar_id)
+{
+ auto &wf_barrier = barrierSlot(bar_id);
+ return wf_barrier.maxBarrierCnt();
+}
- DPRINTF(GPUSync, "outstanding Reqs = %d\n",
- w->outstandingReqs);
- }
+void
+ComputeUnit::resetBarrier(int bar_id)
+{
+ auto &wf_barrier = barrierSlot(bar_id);
+ wf_barrier.reset();
+}
+
+void
+ComputeUnit::decMaxBarrierCnt(int bar_id)
+{
+ auto &wf_barrier = barrierSlot(bar_id);
+ wf_barrier.decMaxBarrierCnt();
+}
- if (w->getStatus() == Wavefront::S_RUNNING &&
- w->barrierId == _barrier_id && w->barrierCnt == bcnt &&
- !w->outstandingReqs) {
- ++ccnt;
+void
+ComputeUnit::releaseBarrier(int bar_id)
+{
+ auto &wf_barrier = barrierSlot(bar_id);
+ wf_barrier.release();
+ freeBarrierIds.insert(bar_id);
+}
- DPRINTF(GPUSync, "WF[%d][%d] at barrier, increment ccnt to "
- "%d\n", i_simd, i_wf, ccnt);
+void
+ComputeUnit::releaseWFsFromBarrier(int bar_id)
+{
+ for (int i = 0; i < numVectorALUs; ++i) {
+ for (int j = 0; j < shader->n_wf; ++j) {
+ Wavefront *wf = wfList[i][j];
+ if (wf->barrierId() == bar_id) {
+ assert(wf->getStatus() == Wavefront::S_BARRIER);
+ wf->setStatus(Wavefront::S_RUNNING);
}
}
}
-
- DPRINTF(GPUSync, "CU%d: returning allAtBarrier ccnt = %d, bslots = %d\n",
- cu_id, ccnt, bslots);
-
- return ccnt == bslots;
}
// Execute one clock worth of work on the ComputeUnit.
computeUnit->globalMemoryPipe.handleResponse(gpuDynInst);
}
- DPRINTF(GPUSync, "CU%d: WF[%d][%d]: barrierCnt = %d\n",
- computeUnit->cu_id, gpuDynInst->simdId,
- gpuDynInst->wfSlotId, w->barrierCnt);
-
delete pkt->senderState;
delete pkt;
return true;
.desc("Number of dynamic non-GM memory insts executed")
;
+ wgBlockedDueBarrierAllocation
+ .name(name() + ".wg_blocked_due_barrier_alloc")
+ .desc("WG dispatch was blocked due to lack of barrier resources")
+ ;
+
wgBlockedDueLdsAllocation
.name(name() + ".wg_blocked_due_lds_alloc")
.desc("Workgroup blocked due to LDS capacity")
TLB_HIT_CACHE_HIT
};
+/**
+ * A WF barrier slot. This represents the barrier resource for
+ * WF-level barriers (i.e., barriers to sync WFs within a WG).
+ */
+class WFBarrier
+{
+ public:
+ WFBarrier() : _numAtBarrier(0), _maxBarrierCnt(0)
+ {
+ }
+
+ static const int InvalidID = -1;
+
+ int
+ numAtBarrier() const
+ {
+ return _numAtBarrier;
+ }
+
+ /**
+ * Number of WFs that have not yet reached the barrier.
+ */
+ int
+ numYetToReachBarrier() const
+ {
+ return _maxBarrierCnt - _numAtBarrier;
+ }
+
+ int
+ maxBarrierCnt() const
+ {
+ return _maxBarrierCnt;
+ }
+
+ /**
+ * Set the maximum barrier count (i.e., the number of WFs that are
+ * participating in the barrier).
+ */
+ void
+ setMaxBarrierCnt(int max_barrier_cnt)
+ {
+ _maxBarrierCnt = max_barrier_cnt;
+ }
+
+ /**
+ * Mark that a WF has reached the barrier.
+ */
+ void
+ incNumAtBarrier()
+ {
+ assert(_numAtBarrier < _maxBarrierCnt);
+ ++_numAtBarrier;
+ }
+
+ /**
+ * Have all WFs participating in this barrier reached the barrier?
+ * If so, then the barrier is satisfied and WFs may proceed past
+ * the barrier.
+ */
+ bool
+ allAtBarrier() const
+ {
+ return _numAtBarrier == _maxBarrierCnt;
+ }
+
+ /**
+ * Decrement the number of WFs participating in this barrier. This
+ * should be called when a WF exits the program early (s_endpgm), so
+ * the remaining WFs are not left waiting for it.
+ */
+ void
+ decMaxBarrierCnt()
+ {
+ assert(_maxBarrierCnt > 0);
+ --_maxBarrierCnt;
+ }
+
+ /**
+ * Release this barrier resource so it can be used by other WGs. This
+ * is generally called when a WG has finished.
+ */
+ void
+ release()
+ {
+ _numAtBarrier = 0;
+ _maxBarrierCnt = 0;
+ }
+
+ /**
+ * Reset the dynamic state of the barrier. This is typically called
+ * once a dynamic instance of a barrier has been satisfied, so the
+ * slot starts fresh for the WG's next barrier instruction.
+ */
+ void
+ reset()
+ {
+ _numAtBarrier = 0;
+ }
+
+ private:
+ /**
+ * The number of WFs in the WG that have reached the barrier. Once
+ * the number of WFs that reach a barrier matches the number of WFs
+ * in the WG, the barrier is satisfied.
+ */
+ int _numAtBarrier;
+
+ /**
+ * The maximum number of WFs that can reach this barrier. This is
+ * essentially the number of WFs in the WG, and a barrier is satisfied
+ * when the number of WFs that reach the barrier equal this value. If
+ * a WF exits early it must decrement this value so that it is no
+ * longer considered for this barrier.
+ */
+ int _maxBarrierCnt;
+};
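+
+/**
+ * A minimal sketch of the intended slot lifecycle, assuming a WG with
+ * two WFs (illustrative only; the real call sites are dispWorkgroup()
+ * and the SOPP instructions above):
+ *
+ *   WFBarrier bar;
+ *   bar.setMaxBarrierCnt(2);      // WG dispatch: two WFs share the slot
+ *   bar.incNumAtBarrier();        // first WF executes s_barrier
+ *   assert(!bar.allAtBarrier());  // still waiting on the second WF
+ *   bar.incNumAtBarrier();        // second WF executes s_barrier
+ *   assert(bar.allAtBarrier());   // barrier satisfied
+ *   bar.reset();                  // slot is fresh for the next s_barrier
+ *   bar.release();                // WG finishes; slot returns to the pool
+ */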
+
class ComputeUnit : public ClockedObject
{
public:
bool countPages;
Shader *shader;
- uint32_t barrier_id;
Tick req_tick_latency;
Tick resp_tick_latency;
void fillKernelState(Wavefront *w, HSAQueueEntry *task);
void startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
- HSAQueueEntry *task, bool fetchContext=false);
+ HSAQueueEntry *task, int bar_id,
+ bool fetchContext=false);
void doInvalidate(RequestPtr req, int kernId);
void doFlush(GPUDynInstPtr gpuDynInst);
- void dispWorkgroup(HSAQueueEntry *task, bool startFromScheduler=false);
- bool hasDispResources(HSAQueueEntry *task);
+ void dispWorkgroup(HSAQueueEntry *task, int num_wfs_in_wg);
+ bool hasDispResources(HSAQueueEntry *task, int &num_wfs_in_wg);
int cacheLineSize() const { return _cacheLineSize; }
int getCacheLineBits() const { return cacheLineBits; }
- /* This function cycles through all the wavefronts in all the phases to see
- * if all of the wavefronts which should be associated with one barrier
- * (denoted with _barrier_id), are all at the same barrier in the program
- * (denoted by bcnt). When the number at the barrier matches bslots, then
- * return true.
- */
- int AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots);
+ private:
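+ /**
+ * Get a reference to the barrier slot with the given (valid) ID.
+ */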
+ WFBarrier&
+ barrierSlot(int bar_id)
+ {
+ assert(bar_id > WFBarrier::InvalidID);
+ return wfBarrierSlots.at(bar_id);
+ }
+
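+ /**
+ * Pop an unused barrier ID from the free pool. The caller owns the
+ * ID until it is handed back via releaseBarrier().
+ */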
+ int
+ getFreeBarrierId()
+ {
+ assert(freeBarrierIds.size());
+ auto free_bar_id = freeBarrierIds.begin();
+ int bar_id = *free_bar_id;
+ freeBarrierIds.erase(free_bar_id);
+ return bar_id;
+ }
+
+ public:
+ int numYetToReachBarrier(int bar_id);
+ bool allAtBarrier(int bar_id);
+ void incNumAtBarrier(int bar_id);
+ int numAtBarrier(int bar_id);
+ int maxBarrierCnt(int bar_id);
+ void resetBarrier(int bar_id);
+ void decMaxBarrierCnt(int bar_id);
+ void releaseBarrier(int bar_id);
+ void releaseWFsFromBarrier(int bar_id);
+ int numBarrierSlots() const { return _numBarrierSlots; }
template<typename c0, typename c1>
void doSmReturn(GPUDynInstPtr gpuDynInst);
Stats::Scalar dynamicFlatMemInstrCnt;
Stats::Scalar dynamicLMemInstrCnt;
+ Stats::Scalar wgBlockedDueBarrierAllocation;
Stats::Scalar wgBlockedDueLdsAllocation;
// Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are
// active when the instruction is committed, this number is still
private:
const int _cacheLineSize;
+ const int _numBarrierSlots;
int cacheLineBits;
InstSeqNum globalSeqNum;
int wavefrontSize;
+ /**
+ * The barrier slots for this CU.
+ */
+ std::vector<WFBarrier> wfBarrierSlots;
+ /**
+ * A set used to easily retrieve a free barrier ID.
+ */
+ std::unordered_set<int> freeBarrierIds;
+
// hold the time of the arrival of the first cache block related to
// a particular GPUDynInst. This is used to calculate the difference
// between the first and last cache block arrival times.
#include "debug/GPUExec.hh"
#include "debug/GPUSched.hh"
+#include "debug/GPUSync.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/scalar_register_file.hh"
// Is the wave waiting at a barrier? Check this condition BEFORE checking
// for instruction buffer occupancy to avoid a deadlock when the barrier is
// the last instruction in the instruction buffer.
- if (w->stalledAtBarrier) {
- if (!computeUnit->AllAtBarrier(w->barrierId,w->barrierCnt,
- computeUnit->getRefCounter(w->dispatchId, w->wgId))) {
+ if (w->getStatus() == Wavefront::S_BARRIER) {
+ assert(w->hasBarrier());
+ int bar_id = w->barrierId();
+ if (!computeUnit->allAtBarrier(bar_id)) {
+ DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - Stalled at "
+ "barrier Id%d. %d waves remain.\n", w->computeUnit->cu_id,
+ w->simdId, w->wfSlotId, w->wfDynId, bar_id,
+ w->computeUnit->numYetToReachBarrier(bar_id));
// Not all participating WFs have reached the barrier yet.
*rdyStatus = NRDY_BARRIER_WAIT;
return false;
}
- w->oldBarrierCnt = w->barrierCnt;
- w->stalledAtBarrier = false;
+ DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - All waves at barrier "
+ "Id%d. Resetting barrier resources.\n", w->computeUnit->cu_id,
+ w->simdId, w->wfSlotId, w->wfDynId, bar_id);
+ computeUnit->resetBarrier(bar_id);
+ computeUnit->releaseWFsFromBarrier(bar_id);
}
// Check WF status: it has to be running
// dispatch workgroup iff the following two conditions are met:
// (a) wg_rem is true - there are unassigned workgroups in the grid
// (b) there are enough free slots in cu cuList[i] for this wg
- if (!task->dispComplete() && cuList[curCu]->hasDispResources(task)) {
+ int num_wfs_in_wg = 0;
+ bool can_disp = cuList[curCu]->hasDispResources(task, num_wfs_in_wg);
+ if (!task->dispComplete() && can_disp) {
scheduledSomething = true;
DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d: WG %d\n",
curCu, task->globalWgId());
panic_if(_activeCus <= 0 || _activeCus > cuList.size(),
"Invalid activeCu size\n");
- cuList[curCu]->dispWorkgroup(task);
+ cuList[curCu]->dispWorkgroup(task, num_wfs_in_wg);
task->markWgDispatch();
}
Wavefront::Wavefront(const Params *p)
: SimObject(p), wfSlotId(p->wf_slot_id), simdId(p->simdId),
maxIbSize(p->max_ib_size), _gpuISA(*this),
- vmWaitCnt(-1), expWaitCnt(-1), lgkmWaitCnt(-1)
+ vmWaitCnt(-1), expWaitCnt(-1), lgkmWaitCnt(-1),
+ barId(WFBarrier::InvalidID)
{
lastTrace = 0;
execUnitId = -1;
scalarOutstandingReqsRdGm = 0;
scalarOutstandingReqsWrGm = 0;
lastNonIdleTick = 0;
- barrierCnt = 0;
- oldBarrierCnt = 0;
- stalledAtBarrier = false;
ldsChunk = nullptr;
memTraceBusy = 0;
lastAddr.resize(p->wf_size);
workItemFlatId.resize(p->wf_size);
oldDgpr.resize(p->wf_size);
- barCnt.resize(p->wf_size);
for (int i = 0; i < 3; ++i) {
workItemId[i].resize(p->wf_size);
}
if (computeUnit->idleCUTimeout > 0) {
// Wavefront's status transitions to stalled or stopped
if ((newStatus == S_STOPPED || newStatus == S_STALLED ||
- newStatus == S_WAITCNT) &&
+ newStatus == S_WAITCNT || newStatus == S_BARRIER) &&
(status != newStatus)) {
computeUnit->idleWfs++;
assert(computeUnit->idleWfs <=
// Wavefront's status transitions to an active state (from
// a stopped or stalled state)
} else if ((status == S_STOPPED || status == S_STALLED ||
- status == S_WAITCNT) &&
+ status == S_WAITCNT || status == S_BARRIER) &&
(status != newStatus)) {
// if all WFs in the CU were idle then check if the idleness
// period exceeded the timeout threshold
}
}
-bool
-Wavefront::waitingAtBarrier(int lane)
-{
- return barCnt[lane] < maxBarCnt;
-}
-
GPUDynInstPtr
Wavefront::nextInstr()
{
actualWgSzTotal *= actualWgSz[d];
}
}
+
+void
+Wavefront::barrierId(int bar_id)
+{
+ assert(bar_id >= WFBarrier::InvalidID);
+ assert(bar_id < computeUnit->numBarrierSlots());
+ barId = bar_id;
+}
+
+int
+Wavefront::barrierId() const
+{
+ return barId;
+}
+
+bool
+Wavefront::hasBarrier() const
+{
+ return barId > WFBarrier::InvalidID;
+}
+
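+/**
+ * Detach this WF from its barrier. Note this only clears the WF-side
+ * ID; the CU-side slot is freed separately via
+ * ComputeUnit::releaseBarrier().
+ */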
+void
+Wavefront::releaseBarrier()
+{
+ barId = WFBarrier::InvalidID;
+}
* and once they are satisfied, it will resume normal
* operation.
*/
- S_WAITCNT
+ S_WAITCNT,
+ /**
+ * WF is stalled at a barrier.
+ */
+ S_BARRIER
};
- uint32_t oldBarrierCnt;
- uint32_t barrierCnt;
- uint32_t barrierId;
- uint32_t barrierSlots;
// HW slot id where the WF is mapped to inside a SIMD unit
const int wfSlotId;
int kernId;
// Execution mask at wavefront start
VectorMask initMask;
- // number of barriers this WF has joined
- std::vector<int> barCnt;
- int maxBarCnt;
- // Flag to stall a wave on barrier
- bool stalledAtBarrier;
-
// a pointer to the fraction of the LDS allocated
// to this workgroup (thus this wavefront)
LdsChunk *ldsChunk;
bool stopFetch();
void regStats();
- bool waitingAtBarrier(int lane);
-
Addr pc() const;
void pc(Addr new_pc);
return _gpuISA;
}
+ void barrierId(int bar_id);
+ int barrierId() const;
+ bool hasBarrier() const;
+ void releaseBarrier();
+
private:
TheGpuISA::GPUISA _gpuISA;
status_e status;
Addr _pc;
VectorMask _execMask;
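+ /**
+ * ID of the CU barrier slot this WF participates in, or
+ * WFBarrier::InvalidID if its WG has only a single WF.
+ */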
+ int barId;
};
#endif // __GPU_COMPUTE_WAVEFRONT_HH__