vrfToCoalescerBusWidth(p->vrf_to_coalescer_bus_width),
coalescerToVrfBusWidth(p->coalescer_to_vrf_bus_width),
registerManager(p->register_manager),
- fetchStage(p, this),
- scoreboardCheckStage(p, this),
- scheduleStage(p, this),
- execStage(p, this),
- globalMemoryPipe(p, this),
- localMemoryPipe(p, this),
- scalarMemoryPipe(p, this),
+ fetchStage(p, *this),
+ scoreboardCheckStage(p, *this),
+ scheduleStage(p, *this),
+ execStage(p, *this),
+ globalMemoryPipe(p, *this),
+ localMemoryPipe(p, *this),
+ scalarMemoryPipe(p, *this),
tickEvent([this]{ exec(); }, "Compute unit tick event",
false, Event::CPU_Tick_Pri),
cu_id(p->cu_id),
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"
-ExecStage::ExecStage(const ComputeUnitParams *p, ComputeUnit *cu)
+ExecStage::ExecStage(const ComputeUnitParams *p, ComputeUnit &cu)
: computeUnit(cu), lastTimeInstExecuted(false),
thisTimeInstExecuted(false), instrExecuted (false),
- executionResourcesUsed(0), _name(cu->name() + ".ExecStage")
+ executionResourcesUsed(0), _name(cu.name() + ".ExecStage")
{
numTransActiveIdle = 0;
void
ExecStage::init()
{
    // Cache the CU's dispatch list: one (Wavefront*, DISPATCH_STATUS)
    // entry per execution resource, filled by the schedule stage.
    dispatchList = &computeUnit.dispatchList;
    idle_dur = 0;
}
{
std::stringstream ss;
bool empty = true;
- for (int i = 0; i < computeUnit->numExeUnits(); i++) {
+ for (int i = 0; i < computeUnit.numExeUnits(); i++) {
DISPATCH_STATUS s = dispatchList->at(i).second;
ss << i << ": " << dispStatusToStr(s);
if (s != EMPTY) {
if (Debug::GPUSched) {
dumpDispList();
}
- for (int unitId = 0; unitId < computeUnit->numExeUnits(); ++unitId) {
+ for (int unitId = 0; unitId < computeUnit.numExeUnits(); ++unitId) {
DISPATCH_STATUS s = dispatchList->at(unitId).second;
switch (s) {
case EMPTY:
(w->instructionBuffer.front())->disassemble());
DPRINTF(GPUSched, "dispatchList[%d] EXREADY->EMPTY\n", unitId);
dispatchList->at(unitId).first->exec();
- (computeUnit->scheduleStage).deleteFromSch(w);
+ (computeUnit.scheduleStage).deleteFromSch(w);
dispatchList->at(unitId).second = EMPTY;
dispatchList->at(unitId).first->freeResources();
dispatchList->at(unitId).first = nullptr;
;
spc
- .init(0, computeUnit->numExeUnits(), 1)
+ .init(0, computeUnit.numExeUnits(), 1)
.name(name() + ".spc")
.desc("Execution units active per cycle (Exec unit=SIMD,MemPipe)")
;
;
numCyclesWithInstrTypeIssued
- .init(computeUnit->numExeUnits())
+ .init(computeUnit.numExeUnits())
.name(name() + ".num_cycles_issue_exec_rsrc")
.desc("Number of cycles at least one instruction issued to "
"execution resource type")
;
numCyclesWithNoInstrTypeIssued
- .init(computeUnit->numExeUnits())
+ .init(computeUnit.numExeUnits())
.name(name() + ".num_cycles_no_issue_exec_rsrc")
.desc("Number of clks no instructions issued to execution "
"resource type")
;
int c = 0;
- for (int i = 0; i < computeUnit->numVectorALUs; i++,c++) {
+ for (int i = 0; i < computeUnit.numVectorALUs; i++,c++) {
std::string s = "VectorALU" + std::to_string(i);
numCyclesWithNoInstrTypeIssued.subname(c, s);
numCyclesWithInstrTypeIssued.subname(c, s);
}
- for (int i = 0; i < computeUnit->numScalarALUs; i++,c++) {
+ for (int i = 0; i < computeUnit.numScalarALUs; i++,c++) {
std::string s = "ScalarALU" + std::to_string(i);
numCyclesWithNoInstrTypeIssued.subname(c, s);
numCyclesWithInstrTypeIssued.subname(c, s);
class ExecStage
{
public:
- ExecStage(const ComputeUnitParams* p, ComputeUnit *cu);
+ ExecStage(const ComputeUnitParams* p, ComputeUnit &cu);
~ExecStage() { }
void init();
void exec();
std::string dispStatusToStr(int j);
void dumpDispList();
- std::string name() { return _name; }
+ const std::string& name() const { return _name; }
void regStats();
// number of idle cycles
Stats::Scalar numCyclesWithNoIssue;
private:
void collectStatistics(enum STAT_STATUS stage, int unitId);
void initStatistics();
- ComputeUnit *computeUnit;
+ ComputeUnit &computeUnit;
// List of waves which will be dispatched to
// each execution resource. A FILLED implies
Stats::Distribution idleDur;
int executionResourcesUsed;
uint64_t idle_dur;
- std::string _name;
+ const std::string _name;
};
#endif // __EXEC_STAGE_HH__
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/wavefront.hh"
-FetchStage::FetchStage(const ComputeUnitParams* p, ComputeUnit *cu)
+FetchStage::FetchStage(const ComputeUnitParams* p, ComputeUnit &cu)
: numVectorALUs(p->num_SIMDs), computeUnit(cu),
- _name(cu->name() + ".FetchStage")
+ _name(cu.name() + ".FetchStage")
{
for (int j = 0; j < numVectorALUs; ++j) {
FetchUnit newFetchUnit(p, cu);
FetchStage::init()
{
for (int j = 0; j < numVectorALUs; ++j) {
- _fetchUnit[j].bindWaveList(&computeUnit->wfList[j]);
+ _fetchUnit[j].bindWaveList(&computeUnit.wfList[j]);
_fetchUnit[j].init();
}
}
class FetchStage
{
public:
- FetchStage(const ComputeUnitParams* p, ComputeUnit *cu);
+ FetchStage(const ComputeUnitParams* p, ComputeUnit &cu);
~FetchStage();
void init();
void exec();
void fetch(PacketPtr pkt, Wavefront *wave);
// Stats related variables and methods
- std::string name() { return _name; }
+ const std::string& name() const { return _name; }
void regStats();
Stats::Distribution instFetchInstReturned;
FetchUnit &fetchUnit(int simdId) { return _fetchUnit.at(simdId); }
private:
int numVectorALUs;
- ComputeUnit *computeUnit;
+ ComputeUnit &computeUnit;
// List of fetch units. A fetch unit is
// instantiated per VALU/SIMD
std::vector<FetchUnit> _fetchUnit;
- std::string _name;
+ const std::string _name;
};
#endif // __FETCH_STAGE_HH__
uint32_t FetchUnit::globalFetchUnitID;
-FetchUnit::FetchUnit(const ComputeUnitParams *p, ComputeUnit *cu)
+FetchUnit::FetchUnit(const ComputeUnitParams *p, ComputeUnit &cu)
: timingSim(true), computeUnit(cu), fetchScheduler(p),
waveList(nullptr), fetchDepth(p->fetch_depth)
{
void
FetchUnit::init()
{
- timingSim = computeUnit->shader->timingSim;
+ timingSim = computeUnit.shader->timingSim;
fetchQueue.clear();
- fetchStatusQueue.resize(computeUnit->shader->n_wf);
- fetchBuf.resize(computeUnit->shader->n_wf, FetchBufDesc());
+ fetchStatusQueue.resize(computeUnit.shader->n_wf);
+ fetchBuf.resize(computeUnit.shader->n_wf, FetchBufDesc());
- for (int i = 0; i < computeUnit->shader->n_wf; ++i) {
+ for (int i = 0; i < computeUnit.shader->n_wf; ++i) {
Wavefront *wf = waveList->at(i);
assert(wf->wfSlotId == i);
fetchStatusQueue[i] = std::make_pair(wf, false);
- fetchBuf[i].allocateBuf(fetchDepth, computeUnit->cacheLineSize(), wf);
+ fetchBuf[i].allocateBuf(fetchDepth, computeUnit.cacheLineSize(), wf);
fetchBuf[i].decoder(&decoder);
}
}
// re-evaluate waves which are marked as not ready for fetch
- for (int j = 0; j < computeUnit->shader->n_wf; ++j) {
+ for (int j = 0; j < computeUnit.shader->n_wf; ++j) {
// Following code assumes 64-bit opertaion and all insts are
// represented by 64-bit pointers to inst objects.
Wavefront *curWave = fetchStatusQueue[j].first;
// this should already be aligned to a cache line
assert(vaddr == makeLineAddress(vaddr,
- computeUnit->getCacheLineBits()));
+ computeUnit.getCacheLineBits()));
// shouldn't be fetching a line that is already buffered
assert(!fetchBuf.at(wavefront->wfSlotId).pcBuffered(vaddr));
fetchBuf.at(wavefront->wfSlotId).reserveBuf(vaddr);
DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Id%d: Initiate fetch "
- "from pc: %d %#x\n", computeUnit->cu_id, wavefront->simdId,
+ "from pc: %d %#x\n", computeUnit.cu_id, wavefront->simdId,
wavefront->wfSlotId, wavefront->wfDynId, wavefront->pc(), vaddr);
DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Initiating fetch translation: %#x\n",
- computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, vaddr);
+ computeUnit.cu_id, wavefront->simdId, wavefront->wfSlotId, vaddr);
// set up virtual request
RequestPtr req = std::make_shared<Request>(
- vaddr, computeUnit->cacheLineSize(), Request::INST_FETCH,
- computeUnit->masterId(), 0, 0, nullptr);
+ vaddr, computeUnit.cacheLineSize(), Request::INST_FETCH,
+ computeUnit.masterId(), 0, 0, nullptr);
PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
// Sender State needed by TLB hierarchy
pkt->senderState =
new TheISA::GpuTLB::TranslationState(BaseTLB::Execute,
- computeUnit->shader->gpuTc,
+ computeUnit.shader->gpuTc,
false, pkt->senderState);
- if (computeUnit->sqcTLBPort->isStalled()) {
- assert(computeUnit->sqcTLBPort->retries.size() > 0);
+ if (computeUnit.sqcTLBPort->isStalled()) {
+ assert(computeUnit.sqcTLBPort->retries.size() > 0);
DPRINTF(GPUTLB, "Failed to send TLB req for FETCH addr %#x\n",
vaddr);
- computeUnit->sqcTLBPort->retries.push_back(pkt);
- } else if (!computeUnit->sqcTLBPort->sendTimingReq(pkt)) {
+ computeUnit.sqcTLBPort->retries.push_back(pkt);
+ } else if (!computeUnit.sqcTLBPort->sendTimingReq(pkt)) {
// Stall the data port;
// No more packet is issued till
// ruby indicates resources are freed by
// a recvReqRetry() call back on this port.
- computeUnit->sqcTLBPort->stallPort();
+ computeUnit.sqcTLBPort->stallPort();
DPRINTF(GPUTLB, "Failed to send TLB req for FETCH addr %#x\n",
vaddr);
- computeUnit->sqcTLBPort->retries.push_back(pkt);
+ computeUnit.sqcTLBPort->retries.push_back(pkt);
} else {
DPRINTF(GPUTLB, "sent FETCH translation request for %#x\n", vaddr);
}
} else {
pkt->senderState =
new TheISA::GpuTLB::TranslationState(BaseTLB::Execute,
- computeUnit->shader->gpuTc);
+ computeUnit.shader->gpuTc);
- computeUnit->sqcTLBPort->sendFunctional(pkt);
+ computeUnit.sqcTLBPort->sendFunctional(pkt);
TheISA::GpuTLB::TranslationState *sender_state =
safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
assert(pkt->req->hasSize());
DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch Access: %#x\n",
- computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
+ computeUnit.cu_id, wavefront->simdId, wavefront->wfSlotId,
pkt->req->getPaddr());
/**
if (timingSim) {
// translation is done. Send the appropriate timing memory request.
- if (!computeUnit->sqcPort->sendTimingReq(pkt)) {
- computeUnit->sqcPort->retries.push_back(std::make_pair(pkt,
+ if (!computeUnit.sqcPort->sendTimingReq(pkt)) {
+ computeUnit.sqcPort->retries.push_back(std::make_pair(pkt,
wavefront));
DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Fetch addr %#x failed!\n",
- computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
+ computeUnit.cu_id, wavefront->simdId, wavefront->wfSlotId,
pkt->req->getPaddr());
} else {
DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Fetch addr %#x sent!\n",
- computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
+ computeUnit.cu_id, wavefront->simdId, wavefront->wfSlotId,
pkt->req->getPaddr());
}
} else {
- computeUnit->sqcPort->sendFunctional(pkt);
+ computeUnit.sqcPort->sendFunctional(pkt);
processFetchReturn(pkt);
}
}
Wavefront *wavefront = sender_state->wavefront;
DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch addr %#x returned "
- "%d bytes!\n", computeUnit->cu_id, wavefront->simdId,
+ "%d bytes!\n", computeUnit.cu_id, wavefront->simdId,
wavefront->wfSlotId, pkt->req->getPaddr(), pkt->req->getSize());
if (wavefront->dropFetch) {
= std::make_shared<GPUDynInst>(wavefront->computeUnit,
wavefront, gpu_static_inst,
wavefront->computeUnit->
- getAndIncSeqNum());
+ getAndIncSeqNum());
wavefront->instructionBuffer.push_back(gpu_dyn_inst);
DPRINTF(GPUFetch, "WF[%d][%d]: Id%ld decoded %s (%d bytes). "
class FetchUnit
{
public:
- FetchUnit(const ComputeUnitParams* p, ComputeUnit *cu);
+ FetchUnit(const ComputeUnitParams* p, ComputeUnit &cu);
~FetchUnit();
void init();
void exec();
};
bool timingSim;
- ComputeUnit *computeUnit;
+ ComputeUnit &computeUnit;
TheGpuISA::Decoder decoder;
// Fetch scheduler; Selects one wave from
#include "gpu-compute/wavefront.hh"
GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams* p,
- ComputeUnit *cu)
- : computeUnit(cu), _name(cu->name() + ".GlobalMemPipeline"),
+ ComputeUnit &cu)
+ : computeUnit(cu), _name(cu.name() + ".GlobalMemPipeline"),
gmQueueSize(p->global_mem_queue_size),
maxWaveRequests(p->max_wave_requests), inflightStores(0),
inflightLoads(0)
void
GlobalMemPipeline::init()
{
    // Global memory size is a shader-wide property; cache it locally.
    globalMemSize = computeUnit.shader->globalMemSize;
}
bool
}
- if (m && m->latency.rdy() && computeUnit->glbMemToVrfBus.rdy() &&
- accessVrf && (computeUnit->shader->coissue_return ||
- computeUnit->vectorGlobalMemUnit.rdy())) {
+ if (m && m->latency.rdy() && computeUnit.glbMemToVrfBus.rdy() &&
+ accessVrf && (computeUnit.shader->coissue_return ||
+ computeUnit.vectorGlobalMemUnit.rdy())) {
w = m->wavefront();
Tick accessTime = curTick() - m->getAccessTime();
// Decrement outstanding requests count
- computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
+ computeUnit.shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
if (m->isStore() || m->isAtomic() || m->isMemSync()) {
- computeUnit->shader->sampleStore(accessTime);
- computeUnit->shader->ScheduleAdd(&w->outstandingReqsWrGm,
+ computeUnit.shader->sampleStore(accessTime);
+ computeUnit.shader->ScheduleAdd(&w->outstandingReqsWrGm,
m->time, -1);
}
if (m->isLoad() || m->isAtomic() || m->isMemSync()) {
- computeUnit->shader->sampleLoad(accessTime);
- computeUnit->shader->ScheduleAdd(&w->outstandingReqsRdGm,
+ computeUnit.shader->sampleLoad(accessTime);
+ computeUnit.shader->ScheduleAdd(&w->outstandingReqsRdGm,
m->time, -1);
}
// going all the way to memory and stats for individual cache
// blocks generated by the instruction.
m->profileRoundTripTime(curTick(), InstMemoryHop::Complete);
- computeUnit->shader->sampleInstRoundTrip(m->getRoundTripTime());
- computeUnit->shader->sampleLineRoundTrip(m->getLineAddressTime());
+ computeUnit.shader->sampleInstRoundTrip(m->getRoundTripTime());
+ computeUnit.shader->sampleLineRoundTrip(m->getLineAddressTime());
// Mark write bus busy for appropriate amount of time
- computeUnit->glbMemToVrfBus.set(m->time);
- if (!computeUnit->shader->coissue_return)
+ computeUnit.glbMemToVrfBus.set(m->time);
+ if (!computeUnit.shader->coissue_return)
w->computeUnit->vectorGlobalMemUnit.set(m->time);
}
* correctly.
*/
handleResponse(mp);
- computeUnit->getTokenManager()->recvTokens(1);
+ computeUnit.getTokenManager()->recvTokens(1);
}
gmIssuedRequests.pop();
DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping 0 mem_op = \n",
- computeUnit->cu_id, mp->simdId, mp->wfSlotId);
+ computeUnit.cu_id, mp->simdId, mp->wfSlotId);
}
}
class GlobalMemPipeline
{
public:
- GlobalMemPipeline(const ComputeUnitParams *p, ComputeUnit *cu);
+ GlobalMemPipeline(const ComputeUnitParams *p, ComputeUnit &cu);
void init();
void exec();
void acqCoalescerToken(GPUDynInstPtr mp);
private:
- ComputeUnit *computeUnit;
- std::string _name;
+ ComputeUnit &computeUnit;
+ const std::string _name;
int gmQueueSize;
int maxWaveRequests;
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"
-LocalMemPipeline::LocalMemPipeline(const ComputeUnitParams* p, ComputeUnit *cu)
- : computeUnit(cu), _name(cu->name() + ".LocalMemPipeline"),
+LocalMemPipeline::LocalMemPipeline(const ComputeUnitParams* p, ComputeUnit &cu)
+ : computeUnit(cu), _name(cu.name() + ".LocalMemPipeline"),
lmQueueSize(p->local_mem_queue_size)
{
}
}
if (!lmReturnedRequests.empty() && m->latency.rdy() && accessVrf &&
- computeUnit->locMemToVrfBus.rdy()
- && (computeUnit->shader->coissue_return
- || computeUnit->vectorSharedMemUnit.rdy())) {
+ computeUnit.locMemToVrfBus.rdy()
+ && (computeUnit.shader->coissue_return
+ || computeUnit.vectorSharedMemUnit.rdy())) {
lmReturnedRequests.pop();
w = m->wavefront();
}
// Decrement outstanding request count
- computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
+ computeUnit.shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
if (m->isStore() || m->isAtomic()) {
- computeUnit->shader->ScheduleAdd(&w->outstandingReqsWrLm,
+ computeUnit.shader->ScheduleAdd(&w->outstandingReqsWrLm,
m->time, -1);
}
if (m->isLoad() || m->isAtomic()) {
- computeUnit->shader->ScheduleAdd(&w->outstandingReqsRdLm,
+ computeUnit.shader->ScheduleAdd(&w->outstandingReqsRdLm,
m->time, -1);
}
// Mark write bus busy for appropriate amount of time
- computeUnit->locMemToVrfBus.set(m->time);
- if (computeUnit->shader->coissue_return == 0)
+ computeUnit.locMemToVrfBus.set(m->time);
+ if (computeUnit.shader->coissue_return == 0)
w->computeUnit->vectorSharedMemUnit.set(m->time);
}
GPUDynInstPtr m = lmIssuedRequests.front();
- bool returnVal = computeUnit->sendToLds(m);
+ bool returnVal = computeUnit.sendToLds(m);
if (!returnVal) {
DPRINTF(GPUPort, "packet was nack'd and put in retry queue");
}
class LocalMemPipeline
{
public:
- LocalMemPipeline(const ComputeUnitParams *p, ComputeUnit *cu);
+ LocalMemPipeline(const ComputeUnitParams *p, ComputeUnit &cu);
void exec();
std::queue<GPUDynInstPtr> &getLMRespFIFO() { return lmReturnedRequests; }
}
private:
- ComputeUnit *computeUnit;
- std::string _name;
+ ComputeUnit &computeUnit;
+ const std::string _name;
int lmQueueSize;
Stats::Scalar loadVrfBankConflictCycles;
// Local Memory Request Fifo: all shared memory requests
#include "gpu-compute/wavefront.hh"
// Scalar memory pipeline for one compute unit. The CU is taken by
// reference, as it must outlive this pipeline stage.
ScalarMemPipeline::ScalarMemPipeline(const ComputeUnitParams* p,
                                     ComputeUnit &cu)
    : computeUnit(cu), _name(cu.name() + ".ScalarMemPipeline"),
      queueSize(p->scalar_mem_queue_size),
      inflightStores(0), inflightLoads(0)
{
}
if ((!returnedStores.empty() || !returnedLoads.empty()) &&
- m->latency.rdy() && computeUnit->scalarMemToSrfBus.rdy() &&
+ m->latency.rdy() && computeUnit.scalarMemToSrfBus.rdy() &&
accessSrf &&
- (computeUnit->shader->coissue_return ||
- computeUnit->scalarMemUnit.rdy())) {
+ (computeUnit.shader->coissue_return ||
+ computeUnit.scalarMemUnit.rdy())) {
w = m->wavefront();
}
// Decrement outstanding register count
- computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
+ computeUnit.shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
if (m->isStore() || m->isAtomic()) {
- computeUnit->shader->ScheduleAdd(&w->scalarOutstandingReqsWrGm,
+ computeUnit.shader->ScheduleAdd(&w->scalarOutstandingReqsWrGm,
m->time, -1);
}
if (m->isLoad() || m->isAtomic()) {
- computeUnit->shader->ScheduleAdd(&w->scalarOutstandingReqsRdGm,
+ computeUnit.shader->ScheduleAdd(&w->scalarOutstandingReqsRdGm,
m->time, -1);
}
// Mark write bus busy for appropriate amount of time
- computeUnit->scalarMemToSrfBus.set(m->time);
- if (!computeUnit->shader->coissue_return)
+ computeUnit.scalarMemToSrfBus.set(m->time);
+ if (!computeUnit.shader->coissue_return)
w->computeUnit->scalarMemUnit.set(m->time);
}
issuedRequests.pop();
DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping scalar mem_op\n",
- computeUnit->cu_id, mp->simdId, mp->wfSlotId);
+ computeUnit.cu_id, mp->simdId, mp->wfSlotId);
}
}
class ScalarMemPipeline
{
public:
- ScalarMemPipeline(const ComputeUnitParams *p, ComputeUnit *cu);
+ ScalarMemPipeline(const ComputeUnitParams *p, ComputeUnit &cu);
void exec();
std::queue<GPUDynInstPtr> &getGMReqFIFO() { return issuedRequests; }
return (issuedRequests.size() + pendReqs) < queueSize;
}
- const std::string &name() const { return _name; }
+ const std::string& name() const { return _name; }
void regStats();
private:
- ComputeUnit *computeUnit;
- std::string _name;
+ ComputeUnit &computeUnit;
+ const std::string _name;
int queueSize;
// Counters to track and limit the inflight scalar loads and stores
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"
-ScheduleStage::ScheduleStage(const ComputeUnitParams *p, ComputeUnit *cu)
- : computeUnit(cu), _name(cu->name() + ".ScheduleStage"),
+ScheduleStage::ScheduleStage(const ComputeUnitParams *p, ComputeUnit &cu)
+ : computeUnit(cu), _name(cu.name() + ".ScheduleStage"),
vectorAluRdy(false), scalarAluRdy(false), scalarMemBusRdy(false),
scalarMemIssueRdy(false), glbMemBusRdy(false), glbMemIssueRdy(false),
locMemBusRdy(false), locMemIssueRdy(false)
{
- for (int j = 0; j < cu->numExeUnits(); ++j) {
+ for (int j = 0; j < cu.numExeUnits(); ++j) {
scheduler.emplace_back(p);
}
wavesInSch.clear();
- schList.resize(cu->numExeUnits());
+ schList.resize(cu.numExeUnits());
for (auto &dq : schList) {
dq.clear();
}
ScheduleStage::init()
{
- fatal_if(scheduler.size() != computeUnit->readyList.size(),
+ fatal_if(scheduler.size() != computeUnit.readyList.size(),
"Scheduler should have same number of entries as CU's readyList");
- for (int j = 0; j < computeUnit->numExeUnits(); ++j) {
- scheduler[j].bindList(&computeUnit->readyList[j]);
+ for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
+ scheduler[j].bindList(&computeUnit.readyList[j]);
}
- dispatchList = &computeUnit->dispatchList;
+ dispatchList = &computeUnit.dispatchList;
- assert(computeUnit->numVectorGlobalMemUnits == 1);
- assert(computeUnit->numVectorSharedMemUnits == 1);
+ assert(computeUnit.numVectorGlobalMemUnits == 1);
+ assert(computeUnit.numVectorSharedMemUnits == 1);
}
void
ScheduleStage::exec()
{
// Update readyList
- for (int j = 0; j < computeUnit->numExeUnits(); ++j) {
+ for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
// delete all ready wavefronts whose instruction buffers are now
// empty because the last instruction was executed
- computeUnit->updateReadyList(j);
+ computeUnit.updateReadyList(j);
/**
* Remove any wave that already has an instruction present in SCH
* waiting for RF reads to complete. This prevents out of order
* execution within a wave.
*/
- for (auto wIt = computeUnit->readyList.at(j).begin();
- wIt != computeUnit->readyList.at(j).end();) {
+ for (auto wIt = computeUnit.readyList.at(j).begin();
+ wIt != computeUnit.readyList.at(j).end();) {
if (wavesInSch.find((*wIt)->wfDynId) != wavesInSch.end()) {
*wIt = nullptr;
- wIt = computeUnit->readyList.at(j).erase(wIt);
+ wIt = computeUnit.readyList.at(j).erase(wIt);
} else {
wIt++;
}
// Scalar Memory are iterated after VMEM
// Iterate VMEM and SMEM
- int firstMemUnit = computeUnit->firstMemUnit();
- int lastMemUnit = computeUnit->lastMemUnit();
+ int firstMemUnit = computeUnit.firstMemUnit();
+ int lastMemUnit = computeUnit.lastMemUnit();
for (int j = firstMemUnit; j <= lastMemUnit; j++) {
- int readyListSize = computeUnit->readyList[j].size();
+ int readyListSize = computeUnit.readyList[j].size();
// If no wave is ready to be scheduled on the execution resource
// then skip scheduling for this execution resource
if (!readyListSize) {
}
// Iterate everything else
- for (int j = 0; j < computeUnit->numExeUnits(); ++j) {
+ for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
// skip the VMEM resources
if (j >= firstMemUnit && j <= lastMemUnit) {
continue;
}
- int readyListSize = computeUnit->readyList[j].size();
+ int readyListSize = computeUnit.readyList[j].size();
// If no wave is ready to be scheduled on the execution resource
// then skip scheduling for this execution resource
if (!readyListSize) {
bool accessVrfWr = true;
if (!ii->isScalar()) {
accessVrfWr =
- computeUnit->vrf[w->simdId]->canScheduleWriteOperands(w, ii);
+ computeUnit.vrf[w->simdId]->canScheduleWriteOperands(w, ii);
}
bool accessSrfWr =
- computeUnit->srf[w->simdId]->canScheduleWriteOperands(w, ii);
+ computeUnit.srf[w->simdId]->canScheduleWriteOperands(w, ii);
bool accessRf = accessVrfWr && accessSrfWr;
if (accessRf) {
if (!ii->isScalar()) {
- computeUnit->vrf[w->simdId]->scheduleWriteOperands(w, ii);
+ computeUnit.vrf[w->simdId]->scheduleWriteOperands(w, ii);
}
- computeUnit->srf[w->simdId]->scheduleWriteOperands(w, ii);
+ computeUnit.srf[w->simdId]->scheduleWriteOperands(w, ii);
return true;
} else {
rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
void
ScheduleStage::scheduleRfDestOperands()
{
- for (int j = 0; j < computeUnit->numExeUnits(); ++j) {
+ for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
if (!dispatchList->at(j).first) {
continue;
}
bool accessVrf = true;
if (!ii->isScalar()) {
accessVrf =
- computeUnit->vrf[w->simdId]->canScheduleReadOperands(w, ii);
+ computeUnit.vrf[w->simdId]->canScheduleReadOperands(w, ii);
}
bool accessSrf =
- computeUnit->srf[w->simdId]->canScheduleReadOperands(w, ii);
+ computeUnit.srf[w->simdId]->canScheduleReadOperands(w, ii);
// If RFs can support instruction, add to schList in RFBUSY state,
// place wave in wavesInSch and pipeMap, and schedule Rd/Wr operands
// to the VRF
exeType, w->simdId, w->wfDynId,
ii->seqNum(), ii->disassemble());
- computeUnit->insertInPipeMap(w);
+ computeUnit.insertInPipeMap(w);
wavesInSch.emplace(w->wfDynId);
schList.at(exeType).push_back(std::make_pair(w, RFBUSY));
if (w->isOldestInstWaitcnt()) {
w->setStatus(Wavefront::S_WAITCNT);
}
if (!ii->isScalar()) {
- computeUnit->vrf[w->simdId]->scheduleReadOperands(w, ii);
+ computeUnit.vrf[w->simdId]->scheduleReadOperands(w, ii);
}
- computeUnit->srf[w->simdId]->scheduleReadOperands(w, ii);
+ computeUnit.srf[w->simdId]->scheduleReadOperands(w, ii);
DPRINTF(GPUSched, "schList[%d]: Added: SIMD[%d] WV[%d]: %d: %s\n",
exeType, w->simdId, w->wfDynId,
scalarMemBusRdy = false;
scalarMemIssueRdy = false;
// check if there is a SRF->Global Memory bus available and
- if (computeUnit->srfToScalarMemPipeBus.rdy(Cycles(1))) {
+ if (computeUnit.srfToScalarMemPipeBus.rdy(Cycles(1))) {
scalarMemBusRdy = true;
}
// check if we can issue a scalar memory instruction
- if (computeUnit->scalarMemUnit.rdy(Cycles(1))) {
+ if (computeUnit.scalarMemUnit.rdy(Cycles(1))) {
scalarMemIssueRdy = true;
}
glbMemBusRdy = false;
glbMemIssueRdy = false;
// check if there is a VRF->Global Memory bus available
- if (computeUnit->vrfToGlobalMemPipeBus.rdy(Cycles(1))) {
+ if (computeUnit.vrfToGlobalMemPipeBus.rdy(Cycles(1))) {
glbMemBusRdy = true;
}
// check if we can issue a Global memory instruction
- if (computeUnit->vectorGlobalMemUnit.rdy(Cycles(1))) {
+ if (computeUnit.vectorGlobalMemUnit.rdy(Cycles(1))) {
glbMemIssueRdy = true;
}
locMemBusRdy = false;
locMemIssueRdy = false;
// check if there is a VRF->LDS bus available
- if (computeUnit->vrfToLocalMemPipeBus.rdy(Cycles(1))) {
+ if (computeUnit.vrfToLocalMemPipeBus.rdy(Cycles(1))) {
locMemBusRdy = true;
}
// check if we can issue a LDS instruction
- if (computeUnit->vectorSharedMemUnit.rdy(Cycles(1))) {
+ if (computeUnit.vectorSharedMemUnit.rdy(Cycles(1))) {
locMemIssueRdy = true;
}
}
vectorAluRdy = false;
scalarAluRdy = false;
// check for available vector/scalar ALUs in the next cycle
- if (computeUnit->vectorALUs[w->simdId].rdy(Cycles(1))) {
+ if (computeUnit.vectorALUs[w->simdId].rdy(Cycles(1))) {
vectorAluRdy = true;
}
- if (computeUnit->scalarALUs[w->scalarAlu].rdy(Cycles(1))) {
+ if (computeUnit.scalarALUs[w->scalarAlu].rdy(Cycles(1))) {
scalarAluRdy = true;
}
GPUDynInstPtr ii = w->instructionBuffer.front();
rdy = false;
dispNrdyStalls[SCH_VECTOR_MEM_BUS_BUSY_NRDY]++;
}
- if (!computeUnit->globalMemoryPipe.coalescerReady(ii)) {
+ if (!computeUnit.globalMemoryPipe.coalescerReady(ii)) {
rdy = false;
dispNrdyStalls[SCH_VECTOR_MEM_COALESCER_NRDY]++;
}
- if (!computeUnit->globalMemoryPipe.outstandingReqsCheck(ii)) {
+ if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(ii)) {
rdy = false;
dispNrdyStalls[SCH_VECTOR_MEM_REQS_NRDY]++;
}
rdy = false;
dispNrdyStalls[SCH_SCALAR_MEM_BUS_BUSY_NRDY]++;
}
- if (!computeUnit->scalarMemoryPipe.
+ if (!computeUnit.scalarMemoryPipe.
isGMReqFIFOWrRdy(w->scalarRdGmReqsInPipe +
w->scalarWrGmReqsInPipe)) {
rdy = false;
rdy = false;
dispNrdyStalls[SCH_LOCAL_MEM_BUS_BUSY_NRDY]++;
}
- if (!computeUnit->localMemoryPipe.
+ if (!computeUnit.localMemoryPipe.
isLMReqFIFOWrRdy(w->rdLmReqsInPipe + w->wrLmReqsInPipe)) {
rdy = false;
dispNrdyStalls[SCH_LOCAL_MEM_FIFO_NRDY]++;
rdy = false;
dispNrdyStalls[SCH_FLAT_MEM_BUS_BUSY_NRDY]++;
}
- if (!computeUnit->globalMemoryPipe.coalescerReady(ii)) {
+ if (!computeUnit.globalMemoryPipe.coalescerReady(ii)) {
rdy = false;
dispNrdyStalls[SCH_FLAT_MEM_COALESCER_NRDY]++;
}
- if (!computeUnit->globalMemoryPipe.outstandingReqsCheck(ii)) {
+ if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(ii)) {
rdy = false;
dispNrdyStalls[SCH_FLAT_MEM_REQS_NRDY]++;
}
- if (!computeUnit->localMemoryPipe.
+ if (!computeUnit.localMemoryPipe.
isLMReqFIFOWrRdy(w->rdLmReqsInPipe + w->wrLmReqsInPipe)) {
rdy = false;
dispNrdyStalls[SCH_FLAT_MEM_FIFO_NRDY]++;
// update execution resource status
checkMemResources();
// iterate execution resources
- for (int j = 0; j < computeUnit->numExeUnits(); j++) {
+ for (int j = 0; j < computeUnit.numExeUnits(); j++) {
assert(dispatchList->at(j).second == EMPTY);
// iterate waves in schList to pick one for dispatch
instructionBuffer.front();
if (!mp->isMemSync() && !mp->isScalar() &&
(mp->isGlobalMem() || mp->isFlat())) {
- computeUnit->globalMemoryPipe.acqCoalescerToken(mp);
+ computeUnit.globalMemoryPipe.acqCoalescerToken(mp);
}
doDispatchListTransition(j, EXREADY, schIter->first);
// and a VRF->LDS bus. In GFx9, this is not the case.
// iterate the GM pipelines
- for (int i = 0; i < computeUnit->numVectorGlobalMemUnits; i++) {
+ for (int i = 0; i < computeUnit.numVectorGlobalMemUnits; i++) {
// get the GM pipe index in the dispatchList
- int gm_exe_unit = computeUnit->firstMemUnit() + i;
+ int gm_exe_unit = computeUnit.firstMemUnit() + i;
// get the wave in the dispatchList
Wavefront *w = dispatchList->at(gm_exe_unit).first;
// If the WF is valid, ready to execute, and the instruction
// Iterate the schList queues and check if operand reads
// have completed in the RFs. If so, mark the wave as ready for
// selection for dispatchList
- for (int j = 0; j < computeUnit->numExeUnits(); ++j) {
+ for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
for (auto &p : schList.at(j)) {
Wavefront *w = p.first;
assert(w);
bool vrfRdy = true;
if (!ii->isScalar()) {
vrfRdy =
- computeUnit->vrf[w->simdId]->operandReadComplete(w, ii);
+ computeUnit.vrf[w->simdId]->operandReadComplete(w, ii);
}
bool srfRdy =
- computeUnit->srf[w->simdId]->operandReadComplete(w, ii);
+ computeUnit.srf[w->simdId]->operandReadComplete(w, ii);
bool operandsReady = vrfRdy && srfRdy;
if (operandsReady) {
DPRINTF(GPUSched,
ScheduleStage::reserveResources()
{
std::vector<bool> exeUnitReservations;
- exeUnitReservations.resize(computeUnit->numExeUnits(), false);
+ exeUnitReservations.resize(computeUnit.numExeUnits(), false);
- for (int j = 0; j < computeUnit->numExeUnits(); ++j) {
+ for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
Wavefront *dispatchedWave = dispatchList->at(j).first;
if (dispatchedWave) {
DISPATCH_STATUS s = dispatchList->at(j).second;
GPUDynInstPtr ii = dispatchedWave->instructionBuffer.front();
if (!ii->isScalar()) {
- computeUnit->vrf[dispatchedWave->simdId]->
+ computeUnit.vrf[dispatchedWave->simdId]->
dispatchInstruction(ii);
}
- computeUnit->srf[dispatchedWave->simdId]->
+ computeUnit.srf[dispatchedWave->simdId]->
dispatchInstruction(ii);
std::stringstream ss;
ScheduleStage::regStats()
{
rdyListNotEmpty
- .init(computeUnit->numExeUnits())
+ .init(computeUnit.numExeUnits())
.name(name() + ".rdy_list_not_empty")
.desc("number of cycles one or more wave on ready list per "
"execution resource")
;
rdyListEmpty
- .init(computeUnit->numExeUnits())
+ .init(computeUnit.numExeUnits())
.name(name() + ".rdy_list_empty")
.desc("number of cycles no wave on ready list per "
"execution resource")
;
addToSchListStalls
- .init(computeUnit->numExeUnits())
+ .init(computeUnit.numExeUnits())
.name(name() + ".sch_list_add_stalls")
.desc("number of cycles a wave is not added to schList per "
"execution resource when ready list is not empty")
;
schListToDispList
- .init(computeUnit->numExeUnits())
+ .init(computeUnit.numExeUnits())
.name(name() + ".sch_list_to_disp_list")
.desc("number of cycles a wave is added to dispatchList per "
"execution resource")
;
schListToDispListStalls
- .init(computeUnit->numExeUnits())
+ .init(computeUnit.numExeUnits())
.name(name() + ".sch_list_to_disp_list_stalls")
.desc("number of cycles no wave is added to dispatchList per "
"execution resource")
class ScheduleStage
{
public:
- ScheduleStage(const ComputeUnitParams *p, ComputeUnit *cu);
+ ScheduleStage(const ComputeUnitParams *p, ComputeUnit &cu);
~ScheduleStage();
void init();
void exec();
// Stats related variables and methods
- std::string name() { return _name; }
+ const std::string& name() const { return _name; }
enum SchNonRdyType {
SCH_SCALAR_ALU_NRDY,
SCH_VECTOR_ALU_NRDY,
};
private:
- ComputeUnit *computeUnit;
+ ComputeUnit &computeUnit;
// Each execution resource will have its own
// scheduler and a dispatch list
std::vector<Scheduler> scheduler;
// to dispatchList
Stats::Vector dispNrdyStalls;
- std::string _name;
+ const std::string _name;
// called by exec() to add a wave to schList if the RFs can support it
bool addToSchList(int exeType, Wavefront *w);
#include "params/ComputeUnit.hh"
ScoreboardCheckStage::ScoreboardCheckStage(const ComputeUnitParams *p,
- ComputeUnit *cu)
- : computeUnit(cu), _name(cu->name() + ".ScoreboardCheckStage")
+ ComputeUnit &cu)
+ : computeUnit(cu), _name(cu.name() + ".ScoreboardCheckStage")
{
}
void
ScoreboardCheckStage::init()
{
- for (int unitId = 0; unitId < computeUnit->numExeUnits(); ++unitId) {
- readyList.push_back(&computeUnit->readyList[unitId]);
+ for (int unitId = 0; unitId < computeUnit.numExeUnits(); ++unitId) {
+ readyList.push_back(&computeUnit.readyList[unitId]);
}
}
if (w->getStatus() == Wavefront::S_BARRIER) {
assert(w->hasBarrier());
int bar_id = w->barrierId();
- if (!computeUnit->allAtBarrier(bar_id)) {
+ if (!computeUnit.allAtBarrier(bar_id)) {
DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - Stalled at "
"barrier Id%d. %d waves remain.\n", w->computeUnit->cu_id,
w->simdId, w->wfSlotId, w->wfDynId, bar_id,
DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - All waves at barrier "
"Id%d. Resetting barrier resources.\n", w->computeUnit->cu_id,
w->simdId, w->wfSlotId, w->wfDynId, bar_id);
- computeUnit->resetBarrier(bar_id);
- computeUnit->releaseWFsFromBarrier(bar_id);
+ computeUnit.resetBarrier(bar_id);
+ computeUnit.releaseWFsFromBarrier(bar_id);
}
// Check WF status: it has to be running
}
DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Checking Ready for Inst : %s\n",
- computeUnit->cu_id, w->simdId, w->wfSlotId, ii->disassemble());
+ computeUnit.cu_id, w->simdId, w->wfSlotId, ii->disassemble());
// Non-scalar (i.e., vector) instructions may use VGPRs
if (!ii->isScalar()) {
- if (!computeUnit->vrf[w->simdId]->operandsReady(w, ii)) {
+ if (!computeUnit.vrf[w->simdId]->operandsReady(w, ii)) {
*rdyStatus = NRDY_VGPR_NRDY;
return false;
}
}
// Scalar and non-scalar instructions may use SGPR
- if (!computeUnit->srf[w->simdId]->operandsReady(w, ii)) {
+ if (!computeUnit.srf[w->simdId]->operandsReady(w, ii)) {
*rdyStatus = NRDY_SGPR_NRDY;
return false;
}
return false;
}
}
- DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Ready Inst : %s\n", computeUnit->cu_id,
+ DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Ready Inst : %s\n", computeUnit.cu_id,
w->simdId, w->wfSlotId, ii->disassemble());
*exeResType = mapWaveToExeUnit(w);
*rdyStatus = INST_RDY;
}
}
panic("%s: unmapped to an execution resource", ii->disassemble());
- return computeUnit->numExeUnits();
+ return computeUnit.numExeUnits();
}
void
{
// reset the ready list for all execution units; it will be
// constructed every cycle since resource availability may change
- for (int unitId = 0; unitId < computeUnit->numExeUnits(); ++unitId) {
+ for (int unitId = 0; unitId < computeUnit.numExeUnits(); ++unitId) {
// Reset wavefront pointers to nullptr so clear() on the vector
// does not accidentally destruct the wavefront object
for (int i = 0; i < readyList[unitId]->size(); i++) {
readyList[unitId]->at(i) = nullptr;
}
readyList[unitId]->clear();
// iterate over all WF slots across all vector ALUs
- for (int simdId = 0; simdId < computeUnit->numVectorALUs; ++simdId) {
- for (int wfSlot = 0; wfSlot < computeUnit->shader->n_wf; ++wfSlot) {
+ for (int simdId = 0; simdId < computeUnit.numVectorALUs; ++simdId) {
+ for (int wfSlot = 0; wfSlot < computeUnit.shader->n_wf; ++wfSlot) {
// reset the ready status of each wavefront
- Wavefront *curWave = computeUnit->wfList[simdId][wfSlot];
+ Wavefront *curWave = computeUnit.wfList[simdId][wfSlot];
nonrdytype_e rdyStatus = NRDY_ILLEGAL;
int exeResType = -1;
// check WF readiness: If the WF's oldest
NRDY_CONDITIONS
};
- ScoreboardCheckStage(const ComputeUnitParams* p, ComputeUnit *cu);
+ ScoreboardCheckStage(const ComputeUnitParams* p, ComputeUnit &cu);
~ScoreboardCheckStage();
void init();
void exec();
int mapWaveToExeUnit(Wavefront *w);
bool ready(Wavefront *w, nonrdytype_e *rdyStatus,
int *exeResType, int wfSlot);
- ComputeUnit *computeUnit;
+ ComputeUnit &computeUnit;
// List of waves which are ready to be scheduled.
// Each execution resource has a ready list
// Stats
Stats::Vector stallCycles;
- std::string _name;
+ const std::string _name;
};
#endif // __SCOREBOARD_CHECK_STAGE_HH__