From 5f0378b8d00ef7b133e0232fad409b2b65093a8b Mon Sep 17 00:00:00 2001 From: Tony Gutierrez Date: Fri, 29 Jun 2018 17:39:53 -0400 Subject: [PATCH] gpu-compute: Use refs to CU in pipe stages/mem pipes The pipe stages and memory pipes are changed to store a reference to their parent CU as opposed to a pointer. These objects will never change which CU they belong to, and they are constructed by their parent CU. Change-Id: Ie5476e1e2e124a024c2efebceb28cb3a9baa78c1 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29969 Reviewed-by: Anthony Gutierrez Maintainer: Anthony Gutierrez Tested-by: kokoro --- src/gpu-compute/compute_unit.cc | 14 +-- src/gpu-compute/exec_stage.cc | 22 ++-- src/gpu-compute/exec_stage.hh | 8 +- src/gpu-compute/fetch_stage.cc | 6 +- src/gpu-compute/fetch_stage.hh | 8 +- src/gpu-compute/fetch_unit.cc | 58 +++++----- src/gpu-compute/fetch_unit.hh | 4 +- src/gpu-compute/global_memory_pipeline.cc | 34 +++--- src/gpu-compute/global_memory_pipeline.hh | 6 +- src/gpu-compute/local_memory_pipeline.cc | 22 ++-- src/gpu-compute/local_memory_pipeline.hh | 6 +- src/gpu-compute/scalar_memory_pipeline.cc | 22 ++-- src/gpu-compute/scalar_memory_pipeline.hh | 8 +- src/gpu-compute/schedule_stage.cc | 122 +++++++++++----------- src/gpu-compute/schedule_stage.hh | 8 +- src/gpu-compute/scoreboard_check_stage.cc | 32 +++--- src/gpu-compute/scoreboard_check_stage.hh | 6 +- 17 files changed, 193 insertions(+), 193 deletions(-) diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc index 653c074bc..a59a7fd6e 100644 --- a/src/gpu-compute/compute_unit.cc +++ b/src/gpu-compute/compute_unit.cc @@ -67,13 +67,13 @@ ComputeUnit::ComputeUnit(const Params *p) : ClockedObject(p), vrfToCoalescerBusWidth(p->vrf_to_coalescer_bus_width), coalescerToVrfBusWidth(p->coalescer_to_vrf_bus_width), registerManager(p->register_manager), - fetchStage(p, this), - scoreboardCheckStage(p, this), - scheduleStage(p, this), - execStage(p, this), - globalMemoryPipe(p, this), - localMemoryPipe(p, this), - scalarMemoryPipe(p, this), + fetchStage(p, *this), + scoreboardCheckStage(p, *this), + scheduleStage(p, *this), + execStage(p, *this), + globalMemoryPipe(p, *this), + localMemoryPipe(p, *this), + scalarMemoryPipe(p, *this), tickEvent([this]{ exec(); }, "Compute unit tick event", false, Event::CPU_Tick_Pri), cu_id(p->cu_id), diff --git a/src/gpu-compute/exec_stage.cc b/src/gpu-compute/exec_stage.cc index e420579c9..2b0a79785 100644 --- a/src/gpu-compute/exec_stage.cc +++ b/src/gpu-compute/exec_stage.cc @@ -41,10 +41,10 @@ #include "gpu-compute/vector_register_file.hh" #include "gpu-compute/wavefront.hh" -ExecStage::ExecStage(const ComputeUnitParams *p, ComputeUnit *cu) +ExecStage::ExecStage(const ComputeUnitParams *p, ComputeUnit &cu) : computeUnit(cu), lastTimeInstExecuted(false), thisTimeInstExecuted(false), instrExecuted (false), - executionResourcesUsed(0), _name(cu->name() + ".ExecStage") + executionResourcesUsed(0), _name(cu.name() + ".ExecStage") { numTransActiveIdle = 0; @@ -54,7 +54,7 @@ ExecStage::ExecStage(const ComputeUnitParams *p, ComputeUnit *cu) void ExecStage::init() { - dispatchList = &computeUnit->dispatchList; + dispatchList = &computeUnit.dispatchList; idle_dur = 0; } @@ -127,7 +127,7 @@ ExecStage::dumpDispList() { std::stringstream ss; bool empty = true; - for (int i = 0; i < computeUnit->numExeUnits(); i++) { + for (int i = 0; i < computeUnit.numExeUnits(); i++) { DISPATCH_STATUS s = dispatchList->at(i).second; ss << i << ": " << dispStatusToStr(s); if (s != EMPTY) { @@ -151,7 +151,7 @@ ExecStage::exec() if (Debug::GPUSched) { dumpDispList(); } - for (int unitId = 0; unitId < computeUnit->numExeUnits(); ++unitId) { + for (int unitId = 0; unitId < computeUnit.numExeUnits(); ++unitId) { DISPATCH_STATUS s = dispatchList->at(unitId).second; switch (s) { case EMPTY: @@ -168,7 +168,7 @@ ExecStage::exec() (w->instructionBuffer.front())->disassemble()); DPRINTF(GPUSched, "dispatchList[%d] EXREADY->EMPTY\n", unitId); dispatchList->at(unitId).first->exec(); - (computeUnit->scheduleStage).deleteFromSch(w); + (computeUnit.scheduleStage).deleteFromSch(w); dispatchList->at(unitId).second = EMPTY; dispatchList->at(unitId).first->freeResources(); dispatchList->at(unitId).first = nullptr; @@ -208,7 +208,7 @@ ExecStage::regStats() ; spc - .init(0, computeUnit->numExeUnits(), 1) + .init(0, computeUnit.numExeUnits(), 1) .name(name() + ".spc") .desc("Execution units active per cycle (Exec unit=SIMD,MemPipe)") ; @@ -220,26 +220,26 @@ ExecStage::regStats() ; numCyclesWithInstrTypeIssued - .init(computeUnit->numExeUnits()) + .init(computeUnit.numExeUnits()) .name(name() + ".num_cycles_issue_exec_rsrc") .desc("Number of cycles at least one instruction issued to " "execution resource type") ; numCyclesWithNoInstrTypeIssued - .init(computeUnit->numExeUnits()) + .init(computeUnit.numExeUnits()) .name(name() + ".num_cycles_no_issue_exec_rsrc") .desc("Number of clks no instructions issued to execution " "resource type") ; int c = 0; - for (int i = 0; i < computeUnit->numVectorALUs; i++,c++) { + for (int i = 0; i < computeUnit.numVectorALUs; i++,c++) { std::string s = "VectorALU" + std::to_string(i); numCyclesWithNoInstrTypeIssued.subname(c, s); numCyclesWithInstrTypeIssued.subname(c, s); } - for (int i = 0; i < computeUnit->numScalarALUs; i++,c++) { + for (int i = 0; i < computeUnit.numScalarALUs; i++,c++) { std::string s = "ScalarALU" + std::to_string(i); numCyclesWithNoInstrTypeIssued.subname(c, s); numCyclesWithInstrTypeIssued.subname(c, s); diff --git a/src/gpu-compute/exec_stage.hh b/src/gpu-compute/exec_stage.hh index f984d729c..cd4343e6d 100644 --- a/src/gpu-compute/exec_stage.hh +++ b/src/gpu-compute/exec_stage.hh @@ -69,7 +69,7 @@ enum DISPATCH_STATUS class ExecStage { public: - ExecStage(const ComputeUnitParams* p, ComputeUnit *cu); + ExecStage(const ComputeUnitParams* p, ComputeUnit &cu); ~ExecStage() { } void init(); void exec(); @@ -77,7 +77,7 @@ class ExecStage std::string dispStatusToStr(int j); void dumpDispList(); - std::string name() { return _name; } + const std::string& name() const { return _name; } void regStats(); // number of idle cycles Stats::Scalar numCyclesWithNoIssue; @@ -96,7 +96,7 @@ class ExecStage private: void collectStatistics(enum STAT_STATUS stage, int unitId); void initStatistics(); - ComputeUnit *computeUnit; + ComputeUnit &computeUnit; // List of waves which will be dispatched to // each execution resource. A FILLED implies @@ -115,7 +115,7 @@ class ExecStage Stats::Distribution idleDur; int executionResourcesUsed; uint64_t idle_dur; - std::string _name; + const std::string _name; }; #endif // __EXEC_STAGE_HH__ diff --git a/src/gpu-compute/fetch_stage.cc b/src/gpu-compute/fetch_stage.cc index b9df6ce4f..6c3b8f40c 100644 --- a/src/gpu-compute/fetch_stage.cc +++ b/src/gpu-compute/fetch_stage.cc @@ -36,9 +36,9 @@ #include "gpu-compute/compute_unit.hh" #include "gpu-compute/wavefront.hh" -FetchStage::FetchStage(const ComputeUnitParams* p, ComputeUnit *cu) +FetchStage::FetchStage(const ComputeUnitParams* p, ComputeUnit &cu) : numVectorALUs(p->num_SIMDs), computeUnit(cu), - _name(cu->name() + ".FetchStage") + _name(cu.name() + ".FetchStage") { for (int j = 0; j < numVectorALUs; ++j) { FetchUnit newFetchUnit(p, cu); @@ -55,7 +55,7 @@ void FetchStage::init() { for (int j = 0; j < numVectorALUs; ++j) { - _fetchUnit[j].bindWaveList(&computeUnit->wfList[j]); + _fetchUnit[j].bindWaveList(&computeUnit.wfList[j]); _fetchUnit[j].init(); } } diff --git a/src/gpu-compute/fetch_stage.hh b/src/gpu-compute/fetch_stage.hh index afecce7cb..ea556611a 100644 --- a/src/gpu-compute/fetch_stage.hh +++ b/src/gpu-compute/fetch_stage.hh @@ -51,7 +51,7 @@ class Wavefront; class FetchStage { public: - FetchStage(const ComputeUnitParams* p, ComputeUnit *cu); + FetchStage(const ComputeUnitParams* p, ComputeUnit &cu); ~FetchStage(); void init(); void exec(); @@ -59,19 +59,19 @@ class FetchStage void fetch(PacketPtr pkt, Wavefront *wave); // Stats related variables and methods - std::string name() { return _name; } + const std::string& name() const { return _name; } void regStats(); Stats::Distribution instFetchInstReturned; FetchUnit &fetchUnit(int simdId) { return _fetchUnit.at(simdId); } private: int numVectorALUs; - ComputeUnit *computeUnit; + ComputeUnit &computeUnit; // List of fetch units. A fetch unit is // instantiated per VALU/SIMD std::vector _fetchUnit; - std::string _name; + const std::string _name; }; #endif // __FETCH_STAGE_HH__ diff --git a/src/gpu-compute/fetch_unit.cc b/src/gpu-compute/fetch_unit.cc index e0127e868..ac9a5a656 100644 --- a/src/gpu-compute/fetch_unit.cc +++ b/src/gpu-compute/fetch_unit.cc @@ -45,7 +45,7 @@ uint32_t FetchUnit::globalFetchUnitID; -FetchUnit::FetchUnit(const ComputeUnitParams *p, ComputeUnit *cu) +FetchUnit::FetchUnit(const ComputeUnitParams *p, ComputeUnit &cu) : timingSim(true), computeUnit(cu), fetchScheduler(p), waveList(nullptr), fetchDepth(p->fetch_depth) { @@ -60,16 +60,16 @@ FetchUnit::~FetchUnit() void FetchUnit::init() { - timingSim = computeUnit->shader->timingSim; + timingSim = computeUnit.shader->timingSim; fetchQueue.clear(); - fetchStatusQueue.resize(computeUnit->shader->n_wf); - fetchBuf.resize(computeUnit->shader->n_wf, FetchBufDesc()); + fetchStatusQueue.resize(computeUnit.shader->n_wf); + fetchBuf.resize(computeUnit.shader->n_wf, FetchBufDesc()); - for (int i = 0; i < computeUnit->shader->n_wf; ++i) { + for (int i = 0; i < computeUnit.shader->n_wf; ++i) { Wavefront *wf = waveList->at(i); assert(wf->wfSlotId == i); fetchStatusQueue[i] = std::make_pair(wf, false); - fetchBuf[i].allocateBuf(fetchDepth, computeUnit->cacheLineSize(), wf); + fetchBuf[i].allocateBuf(fetchDepth, computeUnit.cacheLineSize(), wf); fetchBuf[i].decoder(&decoder); } @@ -97,7 +97,7 @@ FetchUnit::exec() } // re-evaluate waves which are marked as not ready for fetch - for (int j = 0; j < computeUnit->shader->n_wf; ++j) { + for (int j = 0; j < computeUnit.shader->n_wf; ++j) { // Following code assumes 64-bit opertaion and all insts are // represented by 64-bit pointers to inst objects. Wavefront *curWave = fetchStatusQueue[j].first; @@ -143,7 +143,7 @@ FetchUnit::initiateFetch(Wavefront *wavefront) // this should already be aligned to a cache line assert(vaddr == makeLineAddress(vaddr, - computeUnit->getCacheLineBits())); + computeUnit.getCacheLineBits())); // shouldn't be fetching a line that is already buffered assert(!fetchBuf.at(wavefront->wfSlotId).pcBuffered(vaddr)); @@ -151,16 +151,16 @@ FetchUnit::initiateFetch(Wavefront *wavefront) fetchBuf.at(wavefront->wfSlotId).reserveBuf(vaddr); DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Id%d: Initiate fetch " - "from pc: %d %#x\n", computeUnit->cu_id, wavefront->simdId, + "from pc: %d %#x\n", computeUnit.cu_id, wavefront->simdId, wavefront->wfSlotId, wavefront->wfDynId, wavefront->pc(), vaddr); DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Initiating fetch translation: %#x\n", - computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, vaddr); + computeUnit.cu_id, wavefront->simdId, wavefront->wfSlotId, vaddr); // set up virtual request RequestPtr req = std::make_shared( - vaddr, computeUnit->cacheLineSize(), Request::INST_FETCH, - computeUnit->masterId(), 0, 0, nullptr); + vaddr, computeUnit.cacheLineSize(), Request::INST_FETCH, + computeUnit.masterId(), 0, 0, nullptr); PacketPtr pkt = new Packet(req, MemCmd::ReadReq); @@ -171,36 +171,36 @@ FetchUnit::initiateFetch(Wavefront *wavefront) // Sender State needed by TLB hierarchy pkt->senderState = new TheISA::GpuTLB::TranslationState(BaseTLB::Execute, - computeUnit->shader->gpuTc, + computeUnit.shader->gpuTc, false, pkt->senderState); - if (computeUnit->sqcTLBPort->isStalled()) { - assert(computeUnit->sqcTLBPort->retries.size() > 0); + if (computeUnit.sqcTLBPort->isStalled()) { + assert(computeUnit.sqcTLBPort->retries.size() > 0); DPRINTF(GPUTLB, "Failed to send TLB req for FETCH addr %#x\n", vaddr); - computeUnit->sqcTLBPort->retries.push_back(pkt); - } else if (!computeUnit->sqcTLBPort->sendTimingReq(pkt)) { + computeUnit.sqcTLBPort->retries.push_back(pkt); + } else if (!computeUnit.sqcTLBPort->sendTimingReq(pkt)) { // Stall the data port; // No more packet is issued till // ruby indicates resources are freed by // a recvReqRetry() call back on this port. - computeUnit->sqcTLBPort->stallPort(); + computeUnit.sqcTLBPort->stallPort(); DPRINTF(GPUTLB, "Failed to send TLB req for FETCH addr %#x\n", vaddr); - computeUnit->sqcTLBPort->retries.push_back(pkt); + computeUnit.sqcTLBPort->retries.push_back(pkt); } else { DPRINTF(GPUTLB, "sent FETCH translation request for %#x\n", vaddr); } } else { pkt->senderState = new TheISA::GpuTLB::TranslationState(BaseTLB::Execute, - computeUnit->shader->gpuTc); + computeUnit.shader->gpuTc); - computeUnit->sqcTLBPort->sendFunctional(pkt); + computeUnit.sqcTLBPort->sendFunctional(pkt); TheISA::GpuTLB::TranslationState *sender_state = safe_cast(pkt->senderState); @@ -220,7 +220,7 @@ FetchUnit::fetch(PacketPtr pkt, Wavefront *wavefront) assert(pkt->req->hasSize()); DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch Access: %#x\n", - computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, + computeUnit.cu_id, wavefront->simdId, wavefront->wfSlotId, pkt->req->getPaddr()); /** @@ -257,20 +257,20 @@ FetchUnit::fetch(PacketPtr pkt, Wavefront *wavefront) if (timingSim) { // translation is done. Send the appropriate timing memory request. - if (!computeUnit->sqcPort->sendTimingReq(pkt)) { - computeUnit->sqcPort->retries.push_back(std::make_pair(pkt, + if (!computeUnit.sqcPort->sendTimingReq(pkt)) { + computeUnit.sqcPort->retries.push_back(std::make_pair(pkt, wavefront)); DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Fetch addr %#x failed!\n", - computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, + computeUnit.cu_id, wavefront->simdId, wavefront->wfSlotId, pkt->req->getPaddr()); } else { DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Fetch addr %#x sent!\n", - computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, + computeUnit.cu_id, wavefront->simdId, wavefront->wfSlotId, pkt->req->getPaddr()); } } else { - computeUnit->sqcPort->sendFunctional(pkt); + computeUnit.sqcPort->sendFunctional(pkt); processFetchReturn(pkt); } } @@ -284,7 +284,7 @@ FetchUnit::processFetchReturn(PacketPtr pkt) Wavefront *wavefront = sender_state->wavefront; DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch addr %#x returned " - "%d bytes!\n", computeUnit->cu_id, wavefront->simdId, + "%d bytes!\n", computeUnit.cu_id, wavefront->simdId, wavefront->wfSlotId, pkt->req->getPaddr(), pkt->req->getSize()); if (wavefront->dropFetch) { @@ -553,7 +553,7 @@ FetchUnit::FetchBufDesc::decodeInsts() = std::make_shared(wavefront->computeUnit, wavefront, gpu_static_inst, wavefront->computeUnit-> - getAndIncSeqNum()); + getAndIncSeqNum()); wavefront->instructionBuffer.push_back(gpu_dyn_inst); DPRINTF(GPUFetch, "WF[%d][%d]: Id%ld decoded %s (%d bytes). " diff --git a/src/gpu-compute/fetch_unit.hh b/src/gpu-compute/fetch_unit.hh index 1615d81bb..36ab9c365 100644 --- a/src/gpu-compute/fetch_unit.hh +++ b/src/gpu-compute/fetch_unit.hh @@ -49,7 +49,7 @@ class Wavefront; class FetchUnit { public: - FetchUnit(const ComputeUnitParams* p, ComputeUnit *cu); + FetchUnit(const ComputeUnitParams* p, ComputeUnit &cu); ~FetchUnit(); void init(); void exec(); @@ -234,7 +234,7 @@ class FetchUnit }; bool timingSim; - ComputeUnit *computeUnit; + ComputeUnit &computeUnit; TheGpuISA::Decoder decoder; // Fetch scheduler; Selects one wave from diff --git a/src/gpu-compute/global_memory_pipeline.cc b/src/gpu-compute/global_memory_pipeline.cc index 2619360a4..dcc80f061 100644 --- a/src/gpu-compute/global_memory_pipeline.cc +++ b/src/gpu-compute/global_memory_pipeline.cc @@ -44,8 +44,8 @@ #include "gpu-compute/wavefront.hh" GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams* p, - ComputeUnit *cu) - : computeUnit(cu), _name(cu->name() + ".GlobalMemPipeline"), + ComputeUnit &cu) + : computeUnit(cu), _name(cu.name() + ".GlobalMemPipeline"), gmQueueSize(p->global_mem_queue_size), maxWaveRequests(p->max_wave_requests), inflightStores(0), inflightLoads(0) @@ -55,7 +55,7 @@ GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams* p, void GlobalMemPipeline::init() { - globalMemSize = computeUnit->shader->globalMemSize; + globalMemSize = computeUnit.shader->globalMemSize; } bool @@ -121,9 +121,9 @@ GlobalMemPipeline::exec() } - if (m && m->latency.rdy() && computeUnit->glbMemToVrfBus.rdy() && - accessVrf && (computeUnit->shader->coissue_return || - computeUnit->vectorGlobalMemUnit.rdy())) { + if (m && m->latency.rdy() && computeUnit.glbMemToVrfBus.rdy() && + accessVrf && (computeUnit.shader->coissue_return || + computeUnit.vectorGlobalMemUnit.rdy())) { w = m->wavefront(); @@ -141,16 +141,16 @@ GlobalMemPipeline::exec() Tick accessTime = curTick() - m->getAccessTime(); // Decrement outstanding requests count - computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1); + computeUnit.shader->ScheduleAdd(&w->outstandingReqs, m->time, -1); if (m->isStore() || m->isAtomic() || m->isMemSync()) { - computeUnit->shader->sampleStore(accessTime); - computeUnit->shader->ScheduleAdd(&w->outstandingReqsWrGm, + computeUnit.shader->sampleStore(accessTime); + computeUnit.shader->ScheduleAdd(&w->outstandingReqsWrGm, m->time, -1); } if (m->isLoad() || m->isAtomic() || m->isMemSync()) { - computeUnit->shader->sampleLoad(accessTime); - computeUnit->shader->ScheduleAdd(&w->outstandingReqsRdGm, + computeUnit.shader->sampleLoad(accessTime); + computeUnit.shader->ScheduleAdd(&w->outstandingReqsRdGm, m->time, -1); } @@ -160,12 +160,12 @@ GlobalMemPipeline::exec() // going all the way to memory and stats for individual cache // blocks generated by the instruction. m->profileRoundTripTime(curTick(), InstMemoryHop::Complete); - computeUnit->shader->sampleInstRoundTrip(m->getRoundTripTime()); - computeUnit->shader->sampleLineRoundTrip(m->getLineAddressTime()); + computeUnit.shader->sampleInstRoundTrip(m->getRoundTripTime()); + computeUnit.shader->sampleLineRoundTrip(m->getLineAddressTime()); // Mark write bus busy for appropriate amount of time - computeUnit->glbMemToVrfBus.set(m->time); - if (!computeUnit->shader->coissue_return) + computeUnit.glbMemToVrfBus.set(m->time); + if (!computeUnit.shader->coissue_return) w->computeUnit->vectorGlobalMemUnit.set(m->time); } @@ -217,13 +217,13 @@ GlobalMemPipeline::exec() * correctly. */ handleResponse(mp); - computeUnit->getTokenManager()->recvTokens(1); + computeUnit.getTokenManager()->recvTokens(1); } gmIssuedRequests.pop(); DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping 0 mem_op = \n", - computeUnit->cu_id, mp->simdId, mp->wfSlotId); + computeUnit.cu_id, mp->simdId, mp->wfSlotId); } } diff --git a/src/gpu-compute/global_memory_pipeline.hh b/src/gpu-compute/global_memory_pipeline.hh index 97c0e8d59..c53789ee5 100644 --- a/src/gpu-compute/global_memory_pipeline.hh +++ b/src/gpu-compute/global_memory_pipeline.hh @@ -56,7 +56,7 @@ class ComputeUnit; class GlobalMemPipeline { public: - GlobalMemPipeline(const ComputeUnitParams *p, ComputeUnit *cu); + GlobalMemPipeline(const ComputeUnitParams *p, ComputeUnit &cu); void init(); void exec(); @@ -108,8 +108,8 @@ class GlobalMemPipeline void acqCoalescerToken(GPUDynInstPtr mp); private: - ComputeUnit *computeUnit; - std::string _name; + ComputeUnit &computeUnit; + const std::string _name; int gmQueueSize; int maxWaveRequests; diff --git a/src/gpu-compute/local_memory_pipeline.cc b/src/gpu-compute/local_memory_pipeline.cc index 6644a0bf4..df576907c 100644 --- a/src/gpu-compute/local_memory_pipeline.cc +++ b/src/gpu-compute/local_memory_pipeline.cc @@ -41,8 +41,8 @@ #include "gpu-compute/vector_register_file.hh" #include "gpu-compute/wavefront.hh" -LocalMemPipeline::LocalMemPipeline(const ComputeUnitParams* p, ComputeUnit *cu) - : computeUnit(cu), _name(cu->name() + ".LocalMemPipeline"), +LocalMemPipeline::LocalMemPipeline(const ComputeUnitParams* p, ComputeUnit &cu) + : computeUnit(cu), _name(cu.name() + ".LocalMemPipeline"), lmQueueSize(p->local_mem_queue_size) { } @@ -66,9 +66,9 @@ LocalMemPipeline::exec() } if (!lmReturnedRequests.empty() && m->latency.rdy() && accessVrf && - computeUnit->locMemToVrfBus.rdy() - && (computeUnit->shader->coissue_return - || computeUnit->vectorSharedMemUnit.rdy())) { + computeUnit.locMemToVrfBus.rdy() + && (computeUnit.shader->coissue_return + || computeUnit.vectorSharedMemUnit.rdy())) { lmReturnedRequests.pop(); w = m->wavefront(); @@ -83,21 +83,21 @@ LocalMemPipeline::exec() } // Decrement outstanding request count - computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1); + computeUnit.shader->ScheduleAdd(&w->outstandingReqs, m->time, -1); if (m->isStore() || m->isAtomic()) { - computeUnit->shader->ScheduleAdd(&w->outstandingReqsWrLm, + computeUnit.shader->ScheduleAdd(&w->outstandingReqsWrLm, m->time, -1); } if (m->isLoad() || m->isAtomic()) { - computeUnit->shader->ScheduleAdd(&w->outstandingReqsRdLm, + computeUnit.shader->ScheduleAdd(&w->outstandingReqsRdLm, m->time, -1); } // Mark write bus busy for appropriate amount of time - computeUnit->locMemToVrfBus.set(m->time); - if (computeUnit->shader->coissue_return == 0) + computeUnit.locMemToVrfBus.set(m->time); + if (computeUnit.shader->coissue_return == 0) w->computeUnit->vectorSharedMemUnit.set(m->time); } @@ -108,7 +108,7 @@ LocalMemPipeline::exec() GPUDynInstPtr m = lmIssuedRequests.front(); - bool returnVal = computeUnit->sendToLds(m); + bool returnVal = computeUnit.sendToLds(m); if (!returnVal) { DPRINTF(GPUPort, "packet was nack'd and put in retry queue"); } diff --git a/src/gpu-compute/local_memory_pipeline.hh b/src/gpu-compute/local_memory_pipeline.hh index 7821785c2..3ff3b79ec 100644 --- a/src/gpu-compute/local_memory_pipeline.hh +++ b/src/gpu-compute/local_memory_pipeline.hh @@ -55,7 +55,7 @@ class Wavefront; class LocalMemPipeline { public: - LocalMemPipeline(const ComputeUnitParams *p, ComputeUnit *cu); + LocalMemPipeline(const ComputeUnitParams *p, ComputeUnit &cu); void exec(); std::queue &getLMRespFIFO() { return lmReturnedRequests; } @@ -84,8 +84,8 @@ class LocalMemPipeline } private: - ComputeUnit *computeUnit; - std::string _name; + ComputeUnit &computeUnit; + const std::string _name; int lmQueueSize; Stats::Scalar loadVrfBankConflictCycles; // Local Memory Request Fifo: all shared memory requests diff --git a/src/gpu-compute/scalar_memory_pipeline.cc b/src/gpu-compute/scalar_memory_pipeline.cc index 9ec354c1b..35b4ca5e5 100644 --- a/src/gpu-compute/scalar_memory_pipeline.cc +++ b/src/gpu-compute/scalar_memory_pipeline.cc @@ -44,8 +44,8 @@ #include "gpu-compute/wavefront.hh" ScalarMemPipeline::ScalarMemPipeline(const ComputeUnitParams* p, - ComputeUnit *cu) - : computeUnit(cu), _name(cu->name() + ".ScalarMemPipeline"), + ComputeUnit &cu) + : computeUnit(cu), _name(cu.name() + ".ScalarMemPipeline"), queueSize(p->scalar_mem_queue_size), inflightStores(0), inflightLoads(0) { @@ -72,10 +72,10 @@ ScalarMemPipeline::exec() } if ((!returnedStores.empty() || !returnedLoads.empty()) && - m->latency.rdy() && computeUnit->scalarMemToSrfBus.rdy() && + m->latency.rdy() && computeUnit.scalarMemToSrfBus.rdy() && accessSrf && - (computeUnit->shader->coissue_return || - computeUnit->scalarMemUnit.rdy())) { + (computeUnit.shader->coissue_return || + computeUnit.scalarMemUnit.rdy())) { w = m->wavefront(); @@ -97,21 +97,21 @@ ScalarMemPipeline::exec() } // Decrement outstanding register count - computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1); + computeUnit.shader->ScheduleAdd(&w->outstandingReqs, m->time, -1); if (m->isStore() || m->isAtomic()) { - computeUnit->shader->ScheduleAdd(&w->scalarOutstandingReqsWrGm, + computeUnit.shader->ScheduleAdd(&w->scalarOutstandingReqsWrGm, m->time, -1); } if (m->isLoad() || m->isAtomic()) { - computeUnit->shader->ScheduleAdd(&w->scalarOutstandingReqsRdGm, + computeUnit.shader->ScheduleAdd(&w->scalarOutstandingReqsRdGm, m->time, -1); } // Mark write bus busy for appropriate amount of time - computeUnit->scalarMemToSrfBus.set(m->time); - if (!computeUnit->shader->coissue_return) + computeUnit.scalarMemToSrfBus.set(m->time); + if (!computeUnit.shader->coissue_return) w->computeUnit->scalarMemUnit.set(m->time); } @@ -138,7 +138,7 @@ ScalarMemPipeline::exec() issuedRequests.pop(); DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping scalar mem_op\n", - computeUnit->cu_id, mp->simdId, mp->wfSlotId); + computeUnit.cu_id, mp->simdId, mp->wfSlotId); } } diff --git a/src/gpu-compute/scalar_memory_pipeline.hh b/src/gpu-compute/scalar_memory_pipeline.hh index 03211a291..b839701ae 100644 --- a/src/gpu-compute/scalar_memory_pipeline.hh +++ b/src/gpu-compute/scalar_memory_pipeline.hh @@ -59,7 +59,7 @@ class ComputeUnit; class ScalarMemPipeline { public: - ScalarMemPipeline(const ComputeUnitParams *p, ComputeUnit *cu); + ScalarMemPipeline(const ComputeUnitParams *p, ComputeUnit &cu); void exec(); std::queue &getGMReqFIFO() { return issuedRequests; } @@ -84,12 +84,12 @@ class ScalarMemPipeline return (issuedRequests.size() + pendReqs) < queueSize; } - const std::string &name() const { return _name; } + const std::string& name() const { return _name; } void regStats(); private: - ComputeUnit *computeUnit; - std::string _name; + ComputeUnit &computeUnit; + const std::string _name; int queueSize; // Counters to track and limit the inflight scalar loads and stores diff --git a/src/gpu-compute/schedule_stage.cc b/src/gpu-compute/schedule_stage.cc index 510d3f347..0785aa03d 100644 --- a/src/gpu-compute/schedule_stage.cc +++ b/src/gpu-compute/schedule_stage.cc @@ -43,17 +43,17 @@ #include "gpu-compute/vector_register_file.hh" #include "gpu-compute/wavefront.hh" -ScheduleStage::ScheduleStage(const ComputeUnitParams *p, ComputeUnit *cu) - : computeUnit(cu), _name(cu->name() + ".ScheduleStage"), +ScheduleStage::ScheduleStage(const ComputeUnitParams *p, ComputeUnit &cu) + : computeUnit(cu), _name(cu.name() + ".ScheduleStage"), vectorAluRdy(false), scalarAluRdy(false), scalarMemBusRdy(false), scalarMemIssueRdy(false), glbMemBusRdy(false), glbMemIssueRdy(false), locMemBusRdy(false), locMemIssueRdy(false) { - for (int j = 0; j < cu->numExeUnits(); ++j) { + for (int j = 0; j < cu.numExeUnits(); ++j) { scheduler.emplace_back(p); } wavesInSch.clear(); - schList.resize(cu->numExeUnits()); + schList.resize(cu.numExeUnits()); for (auto &dq : schList) { dq.clear(); } @@ -70,36 +70,36 @@ void ScheduleStage::init() { - fatal_if(scheduler.size() != computeUnit->readyList.size(), + fatal_if(scheduler.size() != computeUnit.readyList.size(), "Scheduler should have same number of entries as CU's readyList"); - for (int j = 0; j < computeUnit->numExeUnits(); ++j) { - scheduler[j].bindList(&computeUnit->readyList[j]); + for (int j = 0; j < computeUnit.numExeUnits(); ++j) { + scheduler[j].bindList(&computeUnit.readyList[j]); } - dispatchList = &computeUnit->dispatchList; + dispatchList = &computeUnit.dispatchList; - assert(computeUnit->numVectorGlobalMemUnits == 1); - assert(computeUnit->numVectorSharedMemUnits == 1); + assert(computeUnit.numVectorGlobalMemUnits == 1); + assert(computeUnit.numVectorSharedMemUnits == 1); } void ScheduleStage::exec() { // Update readyList - for (int j = 0; j < computeUnit->numExeUnits(); ++j) { + for (int j = 0; j < computeUnit.numExeUnits(); ++j) { // delete all ready wavefronts whose instruction buffers are now // empty because the last instruction was executed - computeUnit->updateReadyList(j); + computeUnit.updateReadyList(j); /** * Remove any wave that already has an instruction present in SCH * waiting for RF reads to complete. This prevents out of order * execution within a wave. */ - for (auto wIt = computeUnit->readyList.at(j).begin(); - wIt != computeUnit->readyList.at(j).end();) { + for (auto wIt = computeUnit.readyList.at(j).begin(); + wIt != computeUnit.readyList.at(j).end();) { if (wavesInSch.find((*wIt)->wfDynId) != wavesInSch.end()) { *wIt = nullptr; - wIt = computeUnit->readyList.at(j).erase(wIt); + wIt = computeUnit.readyList.at(j).erase(wIt); } else { wIt++; } @@ -112,10 +112,10 @@ ScheduleStage::exec() // Scalar Memory are iterated after VMEM // Iterate VMEM and SMEM - int firstMemUnit = computeUnit->firstMemUnit(); - int lastMemUnit = computeUnit->lastMemUnit(); + int firstMemUnit = computeUnit.firstMemUnit(); + int lastMemUnit = computeUnit.lastMemUnit(); for (int j = firstMemUnit; j <= lastMemUnit; j++) { - int readyListSize = computeUnit->readyList[j].size(); + int readyListSize = computeUnit.readyList[j].size(); // If no wave is ready to be scheduled on the execution resource // then skip scheduling for this execution resource if (!readyListSize) { @@ -135,12 +135,12 @@ ScheduleStage::exec() } // Iterate everything else - for (int j = 0; j < computeUnit->numExeUnits(); ++j) { + for (int j = 0; j < computeUnit.numExeUnits(); ++j) { // skip the VMEM resources if (j >= firstMemUnit && j <= lastMemUnit) { continue; } - int readyListSize = computeUnit->readyList[j].size(); + int readyListSize = computeUnit.readyList[j].size(); // If no wave is ready to be scheduled on the execution resource // then skip scheduling for this execution resource if (!readyListSize) { @@ -205,16 +205,16 @@ ScheduleStage::schedRfWrites(int exeType, Wavefront *w) bool accessVrfWr = true; if (!ii->isScalar()) { accessVrfWr = - computeUnit->vrf[w->simdId]->canScheduleWriteOperands(w, ii); + computeUnit.vrf[w->simdId]->canScheduleWriteOperands(w, ii); } bool accessSrfWr = - computeUnit->srf[w->simdId]->canScheduleWriteOperands(w, ii); + computeUnit.srf[w->simdId]->canScheduleWriteOperands(w, ii); bool accessRf = accessVrfWr && accessSrfWr; if (accessRf) { if (!ii->isScalar()) { - computeUnit->vrf[w->simdId]->scheduleWriteOperands(w, ii); + computeUnit.vrf[w->simdId]->scheduleWriteOperands(w, ii); } - computeUnit->srf[w->simdId]->scheduleWriteOperands(w, ii); + computeUnit.srf[w->simdId]->scheduleWriteOperands(w, ii); return true; } else { rfAccessStalls[SCH_RF_ACCESS_NRDY]++; @@ -235,7 +235,7 @@ ScheduleStage::schedRfWrites(int exeType, Wavefront *w) void ScheduleStage::scheduleRfDestOperands() { - for (int j = 0; j < computeUnit->numExeUnits(); ++j) { + for (int j = 0; j < computeUnit.numExeUnits(); ++j) { if (!dispatchList->at(j).first) { continue; } @@ -269,10 +269,10 @@ ScheduleStage::addToSchList(int exeType, Wavefront *w) bool accessVrf = true; if (!ii->isScalar()) { accessVrf = - computeUnit->vrf[w->simdId]->canScheduleReadOperands(w, ii); + computeUnit.vrf[w->simdId]->canScheduleReadOperands(w, ii); } bool accessSrf = - computeUnit->srf[w->simdId]->canScheduleReadOperands(w, ii); + computeUnit.srf[w->simdId]->canScheduleReadOperands(w, ii); // If RFs can support instruction, add to schList in RFBUSY state, // place wave in wavesInSch and pipeMap, and schedule Rd/Wr operands // to the VRF @@ -282,16 +282,16 @@ ScheduleStage::addToSchList(int exeType, Wavefront *w) exeType, w->simdId, w->wfDynId, ii->seqNum(), ii->disassemble()); - computeUnit->insertInPipeMap(w); + computeUnit.insertInPipeMap(w); wavesInSch.emplace(w->wfDynId); schList.at(exeType).push_back(std::make_pair(w, RFBUSY)); if (w->isOldestInstWaitcnt()) { w->setStatus(Wavefront::S_WAITCNT); } if (!ii->isScalar()) { - computeUnit->vrf[w->simdId]->scheduleReadOperands(w, ii); + computeUnit.vrf[w->simdId]->scheduleReadOperands(w, ii); } - computeUnit->srf[w->simdId]->scheduleReadOperands(w, ii); + computeUnit.srf[w->simdId]->scheduleReadOperands(w, ii); DPRINTF(GPUSched, "schList[%d]: Added: SIMD[%d] WV[%d]: %d: %s\n", exeType, w->simdId, w->wfDynId, @@ -341,33 +341,33 @@ ScheduleStage::checkMemResources() scalarMemBusRdy = false; scalarMemIssueRdy = false; // check if there is a SRF->Global Memory bus available and - if (computeUnit->srfToScalarMemPipeBus.rdy(Cycles(1))) { + if (computeUnit.srfToScalarMemPipeBus.rdy(Cycles(1))) { scalarMemBusRdy = true; } // check if we can issue a scalar memory instruction - if (computeUnit->scalarMemUnit.rdy(Cycles(1))) { + if (computeUnit.scalarMemUnit.rdy(Cycles(1))) { scalarMemIssueRdy = true; } glbMemBusRdy = false; glbMemIssueRdy = false; // check if there is a VRF->Global Memory bus available - if (computeUnit->vrfToGlobalMemPipeBus.rdy(Cycles(1))) { + if (computeUnit.vrfToGlobalMemPipeBus.rdy(Cycles(1))) { glbMemBusRdy = true; } // check if we can issue a Global memory instruction - if (computeUnit->vectorGlobalMemUnit.rdy(Cycles(1))) { + if (computeUnit.vectorGlobalMemUnit.rdy(Cycles(1))) { glbMemIssueRdy = true; } locMemBusRdy = false; locMemIssueRdy = false; // check if there is a VRF->LDS bus available - if (computeUnit->vrfToLocalMemPipeBus.rdy(Cycles(1))) { + if (computeUnit.vrfToLocalMemPipeBus.rdy(Cycles(1))) { locMemBusRdy = true; } // check if we can issue a LDS instruction - if (computeUnit->vectorSharedMemUnit.rdy(Cycles(1))) { + if (computeUnit.vectorSharedMemUnit.rdy(Cycles(1))) { locMemIssueRdy = true; } } @@ -378,10 +378,10 @@ ScheduleStage::dispatchReady(Wavefront *w) vectorAluRdy = false; scalarAluRdy = false; // check for available vector/scalar ALUs in the next cycle - if (computeUnit->vectorALUs[w->simdId].rdy(Cycles(1))) { + if (computeUnit.vectorALUs[w->simdId].rdy(Cycles(1))) { vectorAluRdy = true; } - if (computeUnit->scalarALUs[w->scalarAlu].rdy(Cycles(1))) { + if (computeUnit.scalarALUs[w->scalarAlu].rdy(Cycles(1))) { scalarAluRdy = true; } GPUDynInstPtr ii = w->instructionBuffer.front(); @@ -423,11 +423,11 @@ ScheduleStage::dispatchReady(Wavefront *w) rdy = false; dispNrdyStalls[SCH_VECTOR_MEM_BUS_BUSY_NRDY]++; } - if (!computeUnit->globalMemoryPipe.coalescerReady(ii)) { + if (!computeUnit.globalMemoryPipe.coalescerReady(ii)) { rdy = false; dispNrdyStalls[SCH_VECTOR_MEM_COALESCER_NRDY]++; } - if (!computeUnit->globalMemoryPipe.outstandingReqsCheck(ii)) { + if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(ii)) { rdy = false; dispNrdyStalls[SCH_VECTOR_MEM_REQS_NRDY]++; } @@ -445,7 +445,7 @@ ScheduleStage::dispatchReady(Wavefront *w) rdy = false; dispNrdyStalls[SCH_SCALAR_MEM_BUS_BUSY_NRDY]++; } - if (!computeUnit->scalarMemoryPipe. + if (!computeUnit.scalarMemoryPipe. isGMReqFIFOWrRdy(w->scalarRdGmReqsInPipe + w->scalarWrGmReqsInPipe)) { rdy = false; @@ -465,7 +465,7 @@ ScheduleStage::dispatchReady(Wavefront *w) rdy = false; dispNrdyStalls[SCH_LOCAL_MEM_BUS_BUSY_NRDY]++; } - if (!computeUnit->localMemoryPipe. + if (!computeUnit.localMemoryPipe. isLMReqFIFOWrRdy(w->rdLmReqsInPipe + w->wrLmReqsInPipe)) { rdy = false; dispNrdyStalls[SCH_LOCAL_MEM_FIFO_NRDY]++; @@ -484,15 +484,15 @@ ScheduleStage::dispatchReady(Wavefront *w) rdy = false; dispNrdyStalls[SCH_FLAT_MEM_BUS_BUSY_NRDY]++; } - if (!computeUnit->globalMemoryPipe.coalescerReady(ii)) { + if (!computeUnit.globalMemoryPipe.coalescerReady(ii)) { rdy = false; dispNrdyStalls[SCH_FLAT_MEM_COALESCER_NRDY]++; } - if (!computeUnit->globalMemoryPipe.outstandingReqsCheck(ii)) { + if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(ii)) { rdy = false; dispNrdyStalls[SCH_FLAT_MEM_REQS_NRDY]++; } - if (!computeUnit->localMemoryPipe. + if (!computeUnit.localMemoryPipe. isLMReqFIFOWrRdy(w->rdLmReqsInPipe + w->wrLmReqsInPipe)) { rdy = false; dispNrdyStalls[SCH_FLAT_MEM_FIFO_NRDY]++; @@ -514,7 +514,7 @@ ScheduleStage::fillDispatchList() // update execution resource status checkMemResources(); // iterate execution resources - for (int j = 0; j < computeUnit->numExeUnits(); j++) { + for (int j = 0; j < computeUnit.numExeUnits(); j++) { assert(dispatchList->at(j).second == EMPTY); // iterate waves in schList to pick one for dispatch @@ -537,7 +537,7 @@ ScheduleStage::fillDispatchList() instructionBuffer.front(); if (!mp->isMemSync() && !mp->isScalar() && (mp->isGlobalMem() || mp->isFlat())) { - computeUnit->globalMemoryPipe.acqCoalescerToken(mp); + computeUnit.globalMemoryPipe.acqCoalescerToken(mp); } doDispatchListTransition(j, EXREADY, schIter->first); @@ -581,9 +581,9 @@ ScheduleStage::arbitrateVrfToLdsBus() // and a VRF->LDS bus. In GFx9, this is not the case. // iterate the GM pipelines - for (int i = 0; i < computeUnit->numVectorGlobalMemUnits; i++) { + for (int i = 0; i < computeUnit.numVectorGlobalMemUnits; i++) { // get the GM pipe index in the dispatchList - int gm_exe_unit = computeUnit->firstMemUnit() + i; + int gm_exe_unit = computeUnit.firstMemUnit() + i; // get the wave in the dispatchList Wavefront *w = dispatchList->at(gm_exe_unit).first; // If the WF is valid, ready to execute, and the instruction @@ -617,7 +617,7 @@ ScheduleStage::checkRfOperandReadComplete() // Iterate the schList queues and check if operand reads // have completed in the RFs. If so, mark the wave as ready for // selection for dispatchList - for (int j = 0; j < computeUnit->numExeUnits(); ++j) { + for (int j = 0; j < computeUnit.numExeUnits(); ++j) { for (auto &p : schList.at(j)) { Wavefront *w = p.first; assert(w); @@ -630,10 +630,10 @@ ScheduleStage::checkRfOperandReadComplete() bool vrfRdy = true; if (!ii->isScalar()) { vrfRdy = - computeUnit->vrf[w->simdId]->operandReadComplete(w, ii); + computeUnit.vrf[w->simdId]->operandReadComplete(w, ii); } bool srfRdy = - computeUnit->srf[w->simdId]->operandReadComplete(w, ii); + computeUnit.srf[w->simdId]->operandReadComplete(w, ii); bool operandsReady = vrfRdy && srfRdy; if (operandsReady) { DPRINTF(GPUSched, @@ -671,9 +671,9 @@ void ScheduleStage::reserveResources() { std::vector exeUnitReservations; - exeUnitReservations.resize(computeUnit->numExeUnits(), false); + exeUnitReservations.resize(computeUnit.numExeUnits(), false); - for (int j = 0; j < computeUnit->numExeUnits(); ++j) { + for (int j = 0; j < computeUnit.numExeUnits(); ++j) { Wavefront *dispatchedWave = dispatchList->at(j).first; if (dispatchedWave) { DISPATCH_STATUS s = dispatchList->at(j).second; @@ -686,10 +686,10 @@ ScheduleStage::reserveResources() GPUDynInstPtr ii = dispatchedWave->instructionBuffer.front(); if (!ii->isScalar()) { - computeUnit->vrf[dispatchedWave->simdId]-> + computeUnit.vrf[dispatchedWave->simdId]-> dispatchInstruction(ii); } - computeUnit->srf[dispatchedWave->simdId]-> + computeUnit.srf[dispatchedWave->simdId]-> dispatchInstruction(ii); std::stringstream ss; @@ -743,35 +743,35 @@ void ScheduleStage::regStats() { rdyListNotEmpty - .init(computeUnit->numExeUnits()) + .init(computeUnit.numExeUnits()) .name(name() + ".rdy_list_not_empty") .desc("number of cycles one or more wave on ready list per " "execution resource") ; rdyListEmpty - .init(computeUnit->numExeUnits()) + .init(computeUnit.numExeUnits()) .name(name() + ".rdy_list_empty") .desc("number of cycles no wave on ready list per " "execution resource") ; addToSchListStalls - .init(computeUnit->numExeUnits()) + .init(computeUnit.numExeUnits()) .name(name() + ".sch_list_add_stalls") .desc("number of cycles a wave is not added to schList per " "execution resource when ready list is not empty") ; schListToDispList - .init(computeUnit->numExeUnits()) + .init(computeUnit.numExeUnits()) .name(name() + ".sch_list_to_disp_list") .desc("number of cycles a wave is added to dispatchList per " "execution resource") ; schListToDispListStalls - .init(computeUnit->numExeUnits()) + .init(computeUnit.numExeUnits()) .name(name() + ".sch_list_to_disp_list_stalls") .desc("number of cycles no wave is added to dispatchList per " "execution resource") diff --git a/src/gpu-compute/schedule_stage.hh b/src/gpu-compute/schedule_stage.hh index be2691b28..6ec4a8ddd 100644 --- a/src/gpu-compute/schedule_stage.hh +++ b/src/gpu-compute/schedule_stage.hh @@ -57,13 +57,13 @@ struct ComputeUnitParams; class ScheduleStage { public: - ScheduleStage(const ComputeUnitParams *p, ComputeUnit *cu); + ScheduleStage(const ComputeUnitParams *p, ComputeUnit &cu); ~ScheduleStage(); void init(); void exec(); // Stats related variables and methods - std::string name() { return _name; } + const std::string& name() const { return _name; } enum SchNonRdyType { SCH_SCALAR_ALU_NRDY, SCH_VECTOR_ALU_NRDY, @@ -114,7 +114,7 @@ class ScheduleStage }; private: - ComputeUnit *computeUnit; + ComputeUnit &computeUnit; // Each execution resource will have its own // scheduler and a dispatch list std::vector scheduler; @@ -168,7 +168,7 @@ class ScheduleStage // to dispatchList Stats::Vector dispNrdyStalls; - std::string _name; + const std::string _name; // called by exec() to add a wave to schList if the RFs can support it bool addToSchList(int exeType, Wavefront *w); diff --git a/src/gpu-compute/scoreboard_check_stage.cc b/src/gpu-compute/scoreboard_check_stage.cc index e14a2f2b2..fb99e6912 100644 --- a/src/gpu-compute/scoreboard_check_stage.cc +++ b/src/gpu-compute/scoreboard_check_stage.cc @@ -45,8 +45,8 @@ #include "params/ComputeUnit.hh" ScoreboardCheckStage::ScoreboardCheckStage(const ComputeUnitParams *p, - ComputeUnit *cu) - : computeUnit(cu), _name(cu->name() + ".ScoreboardCheckStage") + ComputeUnit &cu) + : computeUnit(cu), _name(cu.name() + ".ScoreboardCheckStage") { } @@ -58,8 +58,8 @@ ScoreboardCheckStage::~ScoreboardCheckStage() void ScoreboardCheckStage::init() { - for (int unitId = 0; unitId < computeUnit->numExeUnits(); ++unitId) { - readyList.push_back(&computeUnit->readyList[unitId]); + for (int unitId = 0; unitId < computeUnit.numExeUnits(); ++unitId) { + readyList.push_back(&computeUnit.readyList[unitId]); } } @@ -104,7 +104,7 @@ ScoreboardCheckStage::ready(Wavefront *w, nonrdytype_e *rdyStatus, if (w->getStatus() == Wavefront::S_BARRIER) { assert(w->hasBarrier()); int bar_id = w->barrierId(); - if (!computeUnit->allAtBarrier(bar_id)) { + if (!computeUnit.allAtBarrier(bar_id)) { DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - Stalled at " "barrier Id%d. %d waves remain.\n", w->computeUnit->cu_id, w->simdId, w->wfSlotId, w->wfDynId, bar_id, @@ -116,8 +116,8 @@ ScoreboardCheckStage::ready(Wavefront *w, nonrdytype_e *rdyStatus, DPRINTF(GPUSync, "CU[%d] WF[%d][%d] Wave[%d] - All waves at barrier " "Id%d. Resetting barrier resources.\n", w->computeUnit->cu_id, w->simdId, w->wfSlotId, w->wfDynId, bar_id); - computeUnit->resetBarrier(bar_id); - computeUnit->releaseWFsFromBarrier(bar_id); + computeUnit.resetBarrier(bar_id); + computeUnit.releaseWFsFromBarrier(bar_id); } // Check WF status: it has to be running @@ -154,17 +154,17 @@ ScoreboardCheckStage::ready(Wavefront *w, nonrdytype_e *rdyStatus, } DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Checking Ready for Inst : %s\n", - computeUnit->cu_id, w->simdId, w->wfSlotId, ii->disassemble()); + computeUnit.cu_id, w->simdId, w->wfSlotId, ii->disassemble()); // Non-scalar (i.e., vector) instructions may use VGPRs if (!ii->isScalar()) { - if (!computeUnit->vrf[w->simdId]->operandsReady(w, ii)) { + if (!computeUnit.vrf[w->simdId]->operandsReady(w, ii)) { *rdyStatus = NRDY_VGPR_NRDY; return false; } } // Scalar and non-scalar instructions may use SGPR - if (!computeUnit->srf[w->simdId]->operandsReady(w, ii)) { + if (!computeUnit.srf[w->simdId]->operandsReady(w, ii)) { *rdyStatus = NRDY_SGPR_NRDY; return false; } @@ -190,7 +190,7 @@ ScoreboardCheckStage::ready(Wavefront *w, nonrdytype_e *rdyStatus, return false; } } - DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Ready Inst : %s\n", computeUnit->cu_id, + DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Ready Inst : %s\n", computeUnit.cu_id, w->simdId, w->wfSlotId, ii->disassemble()); *exeResType = mapWaveToExeUnit(w); *rdyStatus = INST_RDY; @@ -236,7 +236,7 @@ ScoreboardCheckStage::mapWaveToExeUnit(Wavefront *w) } } panic("%s: unmapped to an execution resource", ii->disassemble()); - return computeUnit->numExeUnits(); + return computeUnit.numExeUnits(); } void @@ -244,7 +244,7 @@ ScoreboardCheckStage::exec() { // reset the ready list for all execution units; it will be // constructed every cycle since resource availability may change - for (int unitId = 0; unitId < computeUnit->numExeUnits(); ++unitId) { + for (int unitId = 0; unitId < computeUnit.numExeUnits(); ++unitId) { // Reset wavefront pointers to nullptr so clear() on the vector // does not accidentally destruct the wavefront object for (int i = 0; i < readyList[unitId]->size(); i++) { @@ -253,10 +253,10 @@ ScoreboardCheckStage::exec() readyList[unitId]->clear(); } // iterate over all WF slots across all vector ALUs - for (int simdId = 0; simdId < computeUnit->numVectorALUs; ++simdId) { - for (int wfSlot = 0; wfSlot < computeUnit->shader->n_wf; ++wfSlot) { + for (int simdId = 0; simdId < computeUnit.numVectorALUs; ++simdId) { + for (int wfSlot = 0; wfSlot < computeUnit.shader->n_wf; ++wfSlot) { // reset the ready status of each wavefront - Wavefront *curWave = computeUnit->wfList[simdId][wfSlot]; + Wavefront *curWave = computeUnit.wfList[simdId][wfSlot]; nonrdytype_e rdyStatus = NRDY_ILLEGAL; int exeResType = -1; // check WF readiness: If the WF's oldest diff --git a/src/gpu-compute/scoreboard_check_stage.hh b/src/gpu-compute/scoreboard_check_stage.hh index 3cdae278c..6953c4c66 100644 --- a/src/gpu-compute/scoreboard_check_stage.hh +++ b/src/gpu-compute/scoreboard_check_stage.hh @@ -70,7 +70,7 @@ class ScoreboardCheckStage NRDY_CONDITIONS }; - ScoreboardCheckStage(const ComputeUnitParams* p, ComputeUnit *cu); + ScoreboardCheckStage(const ComputeUnitParams* p, ComputeUnit &cu); ~ScoreboardCheckStage(); void init(); void exec(); @@ -84,7 +84,7 @@ class ScoreboardCheckStage int mapWaveToExeUnit(Wavefront *w); bool ready(Wavefront *w, nonrdytype_e *rdyStatus, int *exeResType, int wfSlot); - ComputeUnit *computeUnit; + ComputeUnit &computeUnit; // List of waves which are ready to be scheduled. // Each execution resource has a ready list @@ -93,7 +93,7 @@ class ScoreboardCheckStage // Stats Stats::Vector stallCycles; - std::string _name; + const std::string _name; }; #endif // __SCOREBOARD_CHECK_STAGE_HH__ -- 2.30.2