From 63c76448ebd1411af02ff6c0fce5a6afccfb9c87 Mon Sep 17 00:00:00 2001 From: Tony Gutierrez Date: Mon, 2 Jul 2018 15:56:22 -0400 Subject: [PATCH] gpu-compute: Add pipeline stage interface classes This change separates the pipeline stage interfaces for the GPU's compute unit into their own classes with a well-defined interface. This helps to create a cleaner interface for users to extend the CU pipeline's capabilities and also helps consolidate all the pipeline communication code in one place in the source. Change-Id: I569d52bce84dc1b9fbf8f0f96d53a81a2b6773c6 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29972 Reviewed-by: Anthony Gutierrez Maintainer: Anthony Gutierrez Tested-by: kokoro --- src/gpu-compute/SConscript | 1 + src/gpu-compute/comm.cc | 154 ++++++++++ src/gpu-compute/comm.hh | 123 ++++++++ src/gpu-compute/compute_unit.cc | 38 +-- src/gpu-compute/compute_unit.hh | 72 ++--- src/gpu-compute/exec_stage.cc | 73 ++--- src/gpu-compute/exec_stage.hh | 16 +- src/gpu-compute/schedule_stage.cc | 328 ++++++++++++---------- src/gpu-compute/schedule_stage.hh | 31 +- src/gpu-compute/scoreboard_check_stage.cc | 36 +-- src/gpu-compute/scoreboard_check_stage.hh | 14 +- 11 files changed, 578 insertions(+), 308 deletions(-) create mode 100644 src/gpu-compute/comm.cc create mode 100644 src/gpu-compute/comm.hh diff --git a/src/gpu-compute/SConscript b/src/gpu-compute/SConscript index 244791b9b..0f1afbcca 100644 --- a/src/gpu-compute/SConscript +++ b/src/gpu-compute/SConscript @@ -41,6 +41,7 @@ SimObject('GPUStaticInstFlags.py') SimObject('LdsState.py') SimObject('X86GPUTLB.py') +Source('comm.cc') Source('compute_unit.cc') Source('dispatcher.cc') Source('exec_stage.cc') diff --git a/src/gpu-compute/comm.cc b/src/gpu-compute/comm.cc new file mode 100644 index 000000000..b1dd03143 --- /dev/null +++ b/src/gpu-compute/comm.cc @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2018 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Authors: Anthony Gutierrez + */ + +#include "gpu-compute/comm.hh" + +#include + +#include "gpu-compute/wavefront.hh" +#include "params/ComputeUnit.hh" + +/** + * Scoreboard/Schedule stage interface. + */ +ScoreboardCheckToSchedule::ScoreboardCheckToSchedule(const ComputeUnitParams + *p) +{ + int num_func_units = p->num_SIMDs + p->num_scalar_cores + + p->num_global_mem_pipes + p->num_shared_mem_pipes + + p->num_scalar_mem_pipes; + _readyWFs.resize(num_func_units); + + for (auto &func_unit_wf_list : _readyWFs) { + func_unit_wf_list.reserve(p->n_wf); + } +} + +void +ScoreboardCheckToSchedule::reset() +{ + for (auto &func_unit_wf_list : _readyWFs) { + func_unit_wf_list.resize(0); + } +} + +void +ScoreboardCheckToSchedule::markWFReady(Wavefront *wf, int func_unit_id) +{ + _readyWFs[func_unit_id].push_back(wf); +} + +int +ScoreboardCheckToSchedule::numReadyLists() const +{ + return _readyWFs.size(); +} + +std::vector& +ScoreboardCheckToSchedule::readyWFs(int func_unit_id) +{ + return _readyWFs[func_unit_id]; +} + +/** + * Delete all wavefronts that have been marked as ready at scoreboard stage + * but are found to have empty instruction buffers at schedule stage. + */ +void +ScoreboardCheckToSchedule::updateReadyList(int func_unit_id) +{ + std::vector &func_unit_wf_list = _readyWFs[func_unit_id]; + + for (auto it = func_unit_wf_list.begin(); it != func_unit_wf_list.end();) { + if ((*it)->instructionBuffer.empty()) { + it = func_unit_wf_list.erase(it); + } else { + ++it; + } + } +} + +/** + * Schedule/Execute stage interface. + */ +ScheduleToExecute::ScheduleToExecute(const ComputeUnitParams *p) +{ + int num_func_units = p->num_SIMDs + p->num_scalar_cores + + p->num_global_mem_pipes + p->num_shared_mem_pipes + + p->num_scalar_mem_pipes; + _readyInsts.resize(num_func_units, nullptr); + _dispatchStatus.resize(num_func_units, EMPTY); +} + +void +ScheduleToExecute::reset() +{ + for (auto &func_unit_ready_inst : _readyInsts) { + func_unit_ready_inst = nullptr; + } + + for (auto &func_unit_status : _dispatchStatus) { + func_unit_status = EMPTY; + } +} + +GPUDynInstPtr& +ScheduleToExecute::readyInst(int func_unit_id) +{ + return _readyInsts[func_unit_id]; +} + +void +ScheduleToExecute::dispatchTransition(const GPUDynInstPtr &gpu_dyn_inst, + int func_unit_id, + DISPATCH_STATUS disp_status) +{ + _readyInsts[func_unit_id] = gpu_dyn_inst; + _dispatchStatus[func_unit_id] = disp_status; +} + +void +ScheduleToExecute::dispatchTransition(int func_unit_id, + DISPATCH_STATUS disp_status) +{ + _readyInsts[func_unit_id] = nullptr; + _dispatchStatus[func_unit_id] = disp_status; +} + +DISPATCH_STATUS +ScheduleToExecute::dispatchStatus(int func_unit_id) const +{ + return _dispatchStatus[func_unit_id]; +} diff --git a/src/gpu-compute/comm.hh b/src/gpu-compute/comm.hh new file mode 100644 index 000000000..bc3ec7b86 --- /dev/null +++ b/src/gpu-compute/comm.hh @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2018 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Authors: Anthony Gutierrez + */ + +#ifndef __GPU_COMPUTE_COMM_HH__ +#define __GPU_COMPUTE_COMM_HH__ + +#include +#include + +#include "gpu-compute/exec_stage.hh" +#include "gpu-compute/misc.hh" + +struct ComputeUnitParams; +class Wavefront; + +class PipeStageIFace +{ + public: + /** + * Reset the pipe stage interface. This is called to remove + * any stale state from the pipe stage that is leftover from + * the prior cycle. This is needed when stages do not actually + * consume the information passed via the stage interfaces. + */ + virtual void reset() = 0; +}; + +/** + * Communication interface between ScoreboardCheck and Schedule stages. + */ +class ScoreboardCheckToSchedule : public PipeStageIFace +{ + public: + ScoreboardCheckToSchedule() = delete; + ScoreboardCheckToSchedule(const ComputeUnitParams *p); + void reset() override; + /** + * Mark the WF as ready for execution on a particular functional + * unit. + */ + void markWFReady(Wavefront *wf, int func_unit_id); + /** + * Returns the number of ready lists (i.e., the number of functional + * units). Each functional unit has its own list of ready WFs to + * consider for arbitration. + */ + int numReadyLists() const; + /** + * TODO: These methods expose this class' implementation too much by + * returning references to its internal data structures directly. + * These are to support legacy functionality in the CU pipeline. + * They should be removed eventually for an API that hides such + * implementation details. + */ + std::vector& readyWFs(int func_unit_id); + + // TODO: Leftover from old CU code, needs to go away. + void updateReadyList(int func_unit_id); + + private: + std::vector> _readyWFs; +}; + +/** + * Communication interface between Schedule and Execute stages. + */ +class ScheduleToExecute : public PipeStageIFace +{ + public: + ScheduleToExecute() = delete; + ScheduleToExecute(const ComputeUnitParams *p); + void reset() override; + GPUDynInstPtr& readyInst(int func_unit_id); + /** + * Once the scheduler has chosen a winning WF for execution, and + * after the WF's oldest instruction's operands have been read, + * this method is used to mark the instruction as ready to execute. + * This puts it on the dispatch list to be consumed by the execute + * stage. + */ + void dispatchTransition(const GPUDynInstPtr &gpu_dyn_inst, + int func_unit_id, DISPATCH_STATUS disp_status); + void dispatchTransition(int func_unit_id, DISPATCH_STATUS disp_status); + DISPATCH_STATUS dispatchStatus(int func_unit_id) const; + + private: + std::vector _readyInsts; + std::vector _dispatchStatus; +}; + +#endif // __GPU_COMPUTE_COMM_HH__ diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc index a59a7fd6e..067c25469 100644 --- a/src/gpu-compute/compute_unit.cc +++ b/src/gpu-compute/compute_unit.cc @@ -68,9 +68,9 @@ ComputeUnit::ComputeUnit(const Params *p) : ClockedObject(p), coalescerToVrfBusWidth(p->coalescer_to_vrf_bus_width), registerManager(p->register_manager), fetchStage(p, *this), - scoreboardCheckStage(p, *this), - scheduleStage(p, *this), - execStage(p, *this), + scoreboardCheckStage(p, *this, scoreboardCheckToSchedule), + scheduleStage(p, *this, scoreboardCheckToSchedule, scheduleToExecute), + execStage(p, *this, scheduleToExecute), globalMemoryPipe(p, *this), localMemoryPipe(p, *this), scalarMemoryPipe(p, *this), @@ -98,7 +98,9 @@ ComputeUnit::ComputeUnit(const Params *p) : ClockedObject(p), lds(*p->localDataStore), gmTokenPort(name() + ".gmTokenPort", this), _cacheLineSize(p->system->cacheLineSize()), _numBarrierSlots(p->num_barrier_slots), - globalSeqNum(0), wavefrontSize(p->wf_size) + globalSeqNum(0), wavefrontSize(p->wf_size), + scoreboardCheckToSchedule(p), + scheduleToExecute(p) { /** * This check is necessary because std::bitset only provides conversion @@ -213,8 +215,6 @@ ComputeUnit::~ComputeUnit() lastVaddrSimd[j].clear(); } lastVaddrCU.clear(); - readyList.clear(); - dispatchList.clear(); delete cuExitCallback; delete ldsPort; } @@ -297,24 +297,6 @@ ComputeUnit::fillKernelState(Wavefront *w, HSAQueueEntry *task) w->computeActualWgSz(task); } -// delete all wavefronts that have been marked as ready at SCB stage -// but are found to have empty instruction buffers at SCH stage -void -ComputeUnit::updateReadyList(int unitId) -{ - if (!readyList[unitId].empty()) { - for (std::vector::iterator it = readyList[unitId].begin(); - it != readyList[unitId].end();) { - if ((*it)->instructionBuffer.empty()) { - it = readyList[unitId].erase(it); - } - else { - ++it; - } - } - } -} - void ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk, HSAQueueEntry *task, int bar_id, bool fetchContext) @@ -786,15 +768,7 @@ ComputeUnit::init() vectorRegsReserved.resize(numVectorALUs, 0); scalarRegsReserved.resize(numVectorALUs, 0); - // Initializing pipeline resources - readyList.resize(numExeUnits()); - - for (int j = 0; j < numExeUnits(); ++j) { - dispatchList.push_back(std::make_pair(nullptr, EMPTY)); - } - fetchStage.init(); - scoreboardCheckStage.init(); scheduleStage.init(); execStage.init(); globalMemoryPipe.init(); diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh index d4e978b40..22960c0c9 100644 --- a/src/gpu-compute/compute_unit.hh +++ b/src/gpu-compute/compute_unit.hh @@ -44,6 +44,7 @@ #include "base/types.hh" #include "config/the_gpu_isa.hh" #include "enums/PrefetchType.hh" +#include "gpu-compute/comm.hh" #include "gpu-compute/exec_stage.hh" #include "gpu-compute/fetch_stage.hh" #include "gpu-compute/global_memory_pipeline.hh" @@ -266,40 +267,6 @@ class ComputeUnit : public ClockedObject int numCyclesPerStoreTransfer; // number of cycles per vector store int numCyclesPerLoadTransfer; // number of cycles per vector load - // Buffers used to communicate between various pipeline stages - - // At a high level, the following intra-/inter-stage communication occurs: - // SCB to SCH: readyList provides per exec resource list of waves that - // passed dependency and readiness checks. If selected by - // scheduler, attempt to add wave to schList conditional on - // RF support. - // SCH: schList holds waves that are gathering operands or waiting - // for execution resource availability. Once ready, waves are - // placed on the dispatchList as candidates for execution. A wave - // may spend multiple cycles in SCH stage, on the schList due to - // RF access conflicts or execution resource contention. - // SCH to EX: dispatchList holds waves that are ready to be executed. - // LM/FLAT arbitration may remove an LM wave and place it - // back on the schList. RF model may also force a wave back - // to the schList if using the detailed model. - - // List of waves which are ready to be scheduled. - // Each execution resource has a ready list. readyList is - // used to communicate between scoreboardCheck stage and - // schedule stage - std::vector> readyList; - - // List of waves which will be dispatched to - // each execution resource. An EXREADY implies - // dispatch list is non-empty and - // execution unit has something to execute - // this cycle. Currently, the dispatch list of - // an execution resource can hold only one wave because - // an execution resource can execute only one wave in a cycle. - // dispatchList is used to communicate between schedule - // and exec stage - // TODO: convert std::pair to a class to increase readability - std::vector> dispatchList; // track presence of dynamic instructions in the Schedule pipeline // stage. This is used to check the readiness of the oldest, // non-dispatched instruction of every WF in the Scoreboard stage. @@ -413,8 +380,6 @@ class ComputeUnit : public ClockedObject // number of available scalar registers per SIMD unit int numScalarRegsPerSimd; - void updateReadyList(int unitId); - // this hash map will keep track of page divergence // per memory instruction per wavefront. The hash map // is cleared in GPUDynInst::updateStats() in gpu_dyn_inst.cc. @@ -1117,6 +1082,41 @@ class ComputeUnit : public ClockedObject InstSeqNum globalSeqNum; int wavefrontSize; + /** + * TODO: Update these comments once the pipe stage interface has + * been fully refactored. + * + * Pipeline stage interfaces. + * + * Buffers used to communicate between various pipeline stages + * List of waves which will be dispatched to + * each execution resource. An EXREADY implies + * dispatch list is non-empty and + * execution unit has something to execute + * this cycle. Currently, the dispatch list of + * an execution resource can hold only one wave because + * an execution resource can execute only one wave in a cycle. + * dispatchList is used to communicate between schedule + * and exec stage + * + * At a high level, the following intra-/inter-stage communication occurs: + * SCB to SCH: readyList provides per exec resource list of waves that + * passed dependency and readiness checks. If selected by + * scheduler, attempt to add wave to schList conditional on + * RF support. + * SCH: schList holds waves that are gathering operands or waiting + * for execution resource availability. Once ready, waves are + * placed on the dispatchList as candidates for execution. A wave + * may spend multiple cycles in SCH stage, on the schList due to + * RF access conflicts or execution resource contention. + * SCH to EX: dispatchList holds waves that are ready to be executed. + * LM/FLAT arbitration may remove an LM wave and place it + * back on the schList. RF model may also force a wave back + * to the schList if using the detailed model. + */ + ScoreboardCheckToSchedule scoreboardCheckToSchedule; + ScheduleToExecute scheduleToExecute; + /** * The barrier slots for this CU. */ diff --git a/src/gpu-compute/exec_stage.cc b/src/gpu-compute/exec_stage.cc index 2b0a79785..79fca724f 100644 --- a/src/gpu-compute/exec_stage.cc +++ b/src/gpu-compute/exec_stage.cc @@ -41,8 +41,10 @@ #include "gpu-compute/vector_register_file.hh" #include "gpu-compute/wavefront.hh" -ExecStage::ExecStage(const ComputeUnitParams *p, ComputeUnit &cu) - : computeUnit(cu), lastTimeInstExecuted(false), +ExecStage::ExecStage(const ComputeUnitParams *p, ComputeUnit &cu, + ScheduleToExecute &from_schedule) + : computeUnit(cu), fromSchedule(from_schedule), + lastTimeInstExecuted(false), thisTimeInstExecuted(false), instrExecuted (false), executionResourcesUsed(0), _name(cu.name() + ".ExecStage") @@ -54,7 +56,6 @@ ExecStage::ExecStage(const ComputeUnitParams *p, ComputeUnit &cu) void ExecStage::init() { - dispatchList = &computeUnit.dispatchList; idle_dur = 0; } @@ -128,14 +129,15 @@ ExecStage::dumpDispList() std::stringstream ss; bool empty = true; for (int i = 0; i < computeUnit.numExeUnits(); i++) { - DISPATCH_STATUS s = dispatchList->at(i).second; + DISPATCH_STATUS s = fromSchedule.dispatchStatus(i); ss << i << ": " << dispStatusToStr(s); if (s != EMPTY) { empty = false; - Wavefront *w = dispatchList->at(i).first; - ss << " SIMD[" << w->simdId << "] WV[" << w->wfDynId << "]: "; - ss << (w->instructionBuffer.front())->seqNum() << ": "; - ss << (w->instructionBuffer.front())->disassemble(); + GPUDynInstPtr &gpu_dyn_inst = fromSchedule.readyInst(i); + Wavefront *wf = gpu_dyn_inst->wavefront(); + ss << " SIMD[" << wf->simdId << "] WV[" << wf->wfDynId << "]: "; + ss << (wf->instructionBuffer.front())->seqNum() << ": "; + ss << (wf->instructionBuffer.front())->disassemble(); } ss << "\n"; } @@ -152,36 +154,41 @@ ExecStage::exec() dumpDispList(); } for (int unitId = 0; unitId < computeUnit.numExeUnits(); ++unitId) { - DISPATCH_STATUS s = dispatchList->at(unitId).second; + DISPATCH_STATUS s = fromSchedule.dispatchStatus(unitId); switch (s) { - case EMPTY: + case EMPTY: // Do not execute if empty, waiting for VRF reads, // or LM tied to GM waiting for VRF reads collectStatistics(IdleExec, unitId); break; - case EXREADY: - { - collectStatistics(BusyExec, unitId); - Wavefront *w = dispatchList->at(unitId).first; - DPRINTF(GPUSched, "Exec[%d]: SIMD[%d] WV[%d]: %s\n", - unitId, w->simdId, w->wfDynId, - (w->instructionBuffer.front())->disassemble()); - DPRINTF(GPUSched, "dispatchList[%d] EXREADY->EMPTY\n", unitId); - dispatchList->at(unitId).first->exec(); - (computeUnit.scheduleStage).deleteFromSch(w); - dispatchList->at(unitId).second = EMPTY; - dispatchList->at(unitId).first->freeResources(); - dispatchList->at(unitId).first = nullptr; - break; - } - case SKIP: - collectStatistics(BusyExec, unitId); - DPRINTF(GPUSched, "dispatchList[%d] SKIP->EMPTY\n", unitId); - dispatchList->at(unitId).second = EMPTY; - dispatchList->at(unitId).first->freeResources(); - dispatchList->at(unitId).first = nullptr; - break; - default: + case EXREADY: + { + collectStatistics(BusyExec, unitId); + GPUDynInstPtr &gpu_dyn_inst = fromSchedule.readyInst(unitId); + assert(gpu_dyn_inst); + Wavefront *wf = gpu_dyn_inst->wavefront(); + DPRINTF(GPUSched, "Exec[%d]: SIMD[%d] WV[%d]: %s\n", + unitId, wf->simdId, wf->wfDynId, + gpu_dyn_inst->disassemble()); + DPRINTF(GPUSched, "dispatchList[%d] EXREADY->EMPTY\n", unitId); + wf->exec(); + (computeUnit.scheduleStage).deleteFromSch(wf); + fromSchedule.dispatchTransition(unitId, EMPTY); + wf->freeResources(); + break; + } + case SKIP: + { + collectStatistics(BusyExec, unitId); + GPUDynInstPtr &gpu_dyn_inst = fromSchedule.readyInst(unitId); + assert(gpu_dyn_inst); + Wavefront *wf = gpu_dyn_inst->wavefront(); + DPRINTF(GPUSched, "dispatchList[%d] SKIP->EMPTY\n", unitId); + fromSchedule.dispatchTransition(unitId, EMPTY); + wf->freeResources(); + break; + } + default: panic("Unknown dispatch status in exec()\n"); } } diff --git a/src/gpu-compute/exec_stage.hh b/src/gpu-compute/exec_stage.hh index cd4343e6d..23e9369b3 100644 --- a/src/gpu-compute/exec_stage.hh +++ b/src/gpu-compute/exec_stage.hh @@ -42,7 +42,9 @@ #include "sim/stats.hh" class ComputeUnit; +class ScheduleToExecute; class Wavefront; + struct ComputeUnitParams; enum STAT_STATUS @@ -69,7 +71,8 @@ enum DISPATCH_STATUS class ExecStage { public: - ExecStage(const ComputeUnitParams* p, ComputeUnit &cu); + ExecStage(const ComputeUnitParams* p, ComputeUnit &cu, + ScheduleToExecute &from_schedule); ~ExecStage() { } void init(); void exec(); @@ -97,17 +100,8 @@ class ExecStage void collectStatistics(enum STAT_STATUS stage, int unitId); void initStatistics(); ComputeUnit &computeUnit; + ScheduleToExecute &fromSchedule; - // List of waves which will be dispatched to - // each execution resource. A FILLED implies - // dispatch list is non-empty and - // execution unit has something to execute - // this cycle. Currently, the dispatch list of - // an execution resource can hold only one wave because - // an execution resource can execute only one wave in a cycle. - // dispatchList is used to communicate between schedule - // and exec stage - std::vector> *dispatchList; bool lastTimeInstExecuted; bool thisTimeInstExecuted; bool instrExecuted; diff --git a/src/gpu-compute/schedule_stage.cc b/src/gpu-compute/schedule_stage.cc index e0600a623..fb52b6dd1 100644 --- a/src/gpu-compute/schedule_stage.cc +++ b/src/gpu-compute/schedule_stage.cc @@ -43,8 +43,12 @@ #include "gpu-compute/vector_register_file.hh" #include "gpu-compute/wavefront.hh" -ScheduleStage::ScheduleStage(const ComputeUnitParams *p, ComputeUnit &cu) - : computeUnit(cu), _name(cu.name() + ".ScheduleStage"), +ScheduleStage::ScheduleStage(const ComputeUnitParams *p, ComputeUnit &cu, + ScoreboardCheckToSchedule &from_scoreboard_check, + ScheduleToExecute &to_execute) + : computeUnit(cu), fromScoreboardCheck(from_scoreboard_check), + toExecute(to_execute), + _name(cu.name() + ".ScheduleStage"), vectorAluRdy(false), scalarAluRdy(false), scalarMemBusRdy(false), scalarMemIssueRdy(false), glbMemBusRdy(false), glbMemIssueRdy(false), locMemBusRdy(false), locMemIssueRdy(false) @@ -70,14 +74,12 @@ void ScheduleStage::init() { - fatal_if(scheduler.size() != computeUnit.readyList.size(), + fatal_if(scheduler.size() != fromScoreboardCheck.numReadyLists(), "Scheduler should have same number of entries as CU's readyList"); for (int j = 0; j < computeUnit.numExeUnits(); ++j) { - scheduler[j].bindList(&computeUnit.readyList[j]); + scheduler[j].bindList(&fromScoreboardCheck.readyWFs(j)); } - dispatchList = &computeUnit.dispatchList; - assert(computeUnit.numVectorGlobalMemUnits == 1); assert(computeUnit.numVectorSharedMemUnits == 1); } @@ -85,21 +87,21 @@ ScheduleStage::init() void ScheduleStage::exec() { + toExecute.reset(); + // Update readyList for (int j = 0; j < computeUnit.numExeUnits(); ++j) { - // delete all ready wavefronts whose instruction buffers are now - // empty because the last instruction was executed - computeUnit.updateReadyList(j); /** * Remove any wave that already has an instruction present in SCH * waiting for RF reads to complete. This prevents out of order * execution within a wave. */ - for (auto wIt = computeUnit.readyList.at(j).begin(); - wIt != computeUnit.readyList.at(j).end();) { + fromScoreboardCheck.updateReadyList(j); + for (auto wIt = fromScoreboardCheck.readyWFs(j).begin(); + wIt != fromScoreboardCheck.readyWFs(j).end();) { if (wavesInSch.find((*wIt)->wfDynId) != wavesInSch.end()) { *wIt = nullptr; - wIt = computeUnit.readyList.at(j).erase(wIt); + wIt = fromScoreboardCheck.readyWFs(j).erase(wIt); } else { wIt++; } @@ -115,7 +117,7 @@ ScheduleStage::exec() int firstMemUnit = computeUnit.firstMemUnit(); int lastMemUnit = computeUnit.lastMemUnit(); for (int j = firstMemUnit; j <= lastMemUnit; j++) { - int readyListSize = computeUnit.readyList[j].size(); + int readyListSize = fromScoreboardCheck.readyWFs(j).size(); // If no wave is ready to be scheduled on the execution resource // then skip scheduling for this execution resource if (!readyListSize) { @@ -125,11 +127,13 @@ ScheduleStage::exec() rdyListNotEmpty[j]++; // Pick a wave and attempt to add it to schList - Wavefront *w = scheduler[j].chooseWave(); - if (!addToSchList(j, w)) { + Wavefront *wf = scheduler[j].chooseWave(); + GPUDynInstPtr &gpu_dyn_inst = wf->instructionBuffer.front(); + assert(gpu_dyn_inst); + if (!addToSchList(j, gpu_dyn_inst)) { // For waves not added to schList, increment count of cycles // this wave spends in SCH stage. - w->schCycles++; + wf->schCycles++; addToSchListStalls[j]++; } } @@ -140,7 +144,7 @@ ScheduleStage::exec() if (j >= firstMemUnit && j <= lastMemUnit) { continue; } - int readyListSize = computeUnit.readyList[j].size(); + int readyListSize = fromScoreboardCheck.readyWFs(j).size(); // If no wave is ready to be scheduled on the execution resource // then skip scheduling for this execution resource if (!readyListSize) { @@ -150,11 +154,13 @@ ScheduleStage::exec() rdyListNotEmpty[j]++; // Pick a wave and attempt to add it to schList - Wavefront *w = scheduler[j].chooseWave(); - if (!addToSchList(j, w)) { + Wavefront *wf = scheduler[j].chooseWave(); + GPUDynInstPtr &gpu_dyn_inst = wf->instructionBuffer.front(); + assert(gpu_dyn_inst); + if (!addToSchList(j, gpu_dyn_inst)) { // For waves not added to schList, increment count of cycles // this wave spends in SCH stage. - w->schCycles++; + wf->schCycles++; addToSchListStalls[j]++; } } @@ -191,30 +197,36 @@ ScheduleStage::exec() void ScheduleStage::doDispatchListTransition(int unitId, DISPATCH_STATUS s, - Wavefront *w) + const GPUDynInstPtr &gpu_dyn_inst) { - dispatchList->at(unitId).first = w; - dispatchList->at(unitId).second = s; + toExecute.dispatchTransition(gpu_dyn_inst, unitId, s); +} + +void +ScheduleStage::doDispatchListTransition(int unitId, DISPATCH_STATUS s) +{ + toExecute.dispatchTransition(unitId, s); } bool -ScheduleStage::schedRfWrites(int exeType, Wavefront *w) +ScheduleStage::schedRfWrites(int exeType, const GPUDynInstPtr &gpu_dyn_inst) { - GPUDynInstPtr ii = w->instructionBuffer.front(); - assert(ii); + assert(gpu_dyn_inst); + Wavefront *wf = gpu_dyn_inst->wavefront(); bool accessVrfWr = true; - if (!ii->isScalar()) { - accessVrfWr = - computeUnit.vrf[w->simdId]->canScheduleWriteOperands(w, ii); + if (!gpu_dyn_inst->isScalar()) { + accessVrfWr = computeUnit.vrf[wf->simdId] + ->canScheduleWriteOperands(wf, gpu_dyn_inst); } - bool accessSrfWr = - computeUnit.srf[w->simdId]->canScheduleWriteOperands(w, ii); + bool accessSrfWr = computeUnit.srf[wf->simdId] + ->canScheduleWriteOperands(wf, gpu_dyn_inst); bool accessRf = accessVrfWr && accessSrfWr; if (accessRf) { - if (!ii->isScalar()) { - computeUnit.vrf[w->simdId]->scheduleWriteOperands(w, ii); + if (!gpu_dyn_inst->isScalar()) { + computeUnit.vrf[wf->simdId]->scheduleWriteOperands(wf, + gpu_dyn_inst); } - computeUnit.srf[w->simdId]->scheduleWriteOperands(w, ii); + computeUnit.srf[wf->simdId]->scheduleWriteOperands(wf, gpu_dyn_inst); return true; } else { rfAccessStalls[SCH_RF_ACCESS_NRDY]++; @@ -226,8 +238,8 @@ ScheduleStage::schedRfWrites(int exeType, Wavefront *w) } // Increment stall counts for WF - w->schStalls++; - w->schRfAccessStalls++; + wf->schStalls++; + wf->schRfAccessStalls++; } return false; } @@ -236,18 +248,18 @@ void ScheduleStage::scheduleRfDestOperands() { for (int j = 0; j < computeUnit.numExeUnits(); ++j) { - if (dispatchList->at(j).second == EMPTY || - dispatchList->at(j).second == SKIP) { + if (toExecute.dispatchStatus(j) == EMPTY || + toExecute.dispatchStatus(j) == SKIP) { continue; } - assert(dispatchList->at(j).first); - // get the wave on dispatch list and attempt to allocate write // resources in the RFs - Wavefront *w = dispatchList->at(j).first; - if (!schedRfWrites(j, w)) { - reinsertToSchList(j, w); + const GPUDynInstPtr &gpu_dyn_inst = toExecute.readyInst(j); + assert(gpu_dyn_inst); + Wavefront *wf = gpu_dyn_inst->wavefront(); + if (!schedRfWrites(j, gpu_dyn_inst)) { + reinsertToSchList(j, gpu_dyn_inst); doDispatchListTransition(j, EMPTY); // if this is a flat inst, also transition the LM pipe to empty // Note: since FLAT/LM arbitration occurs before scheduling @@ -255,51 +267,53 @@ ScheduleStage::scheduleRfDestOperands() // instruction lost arbitration, but would have been able to // pass the RF destination operand check here, and execute // instead of the FLAT. - if (w->instructionBuffer.front()->isFlat()) { - assert(dispatchList->at(w->localMem).second == SKIP); - doDispatchListTransition(w->localMem, EMPTY); + if (wf->instructionBuffer.front()->isFlat()) { + assert(toExecute.dispatchStatus(wf->localMem) + == SKIP); + doDispatchListTransition(wf->localMem, EMPTY); } } } } bool -ScheduleStage::addToSchList(int exeType, Wavefront *w) +ScheduleStage::addToSchList(int exeType, const GPUDynInstPtr &gpu_dyn_inst) { // Attempt to add the wave to the schList if the VRF can support the // wave's next instruction - GPUDynInstPtr ii = w->instructionBuffer.front(); - assert(ii); + assert(gpu_dyn_inst); + Wavefront *wf = gpu_dyn_inst->wavefront(); bool accessVrf = true; - if (!ii->isScalar()) { - accessVrf = - computeUnit.vrf[w->simdId]->canScheduleReadOperands(w, ii); + if (!gpu_dyn_inst->isScalar()) { + accessVrf = computeUnit.vrf[wf->simdId] + ->canScheduleReadOperands(wf, gpu_dyn_inst); } - bool accessSrf = - computeUnit.srf[w->simdId]->canScheduleReadOperands(w, ii); + bool accessSrf = computeUnit.srf[wf->simdId] + ->canScheduleReadOperands(wf, gpu_dyn_inst); // If RFs can support instruction, add to schList in RFBUSY state, // place wave in wavesInSch and pipeMap, and schedule Rd/Wr operands // to the VRF bool accessRf = accessVrf && accessSrf; if (accessRf) { DPRINTF(GPUSched, "schList[%d]: Adding: SIMD[%d] WV[%d]: %d: %s\n", - exeType, w->simdId, w->wfDynId, - ii->seqNum(), ii->disassemble()); + exeType, wf->simdId, wf->wfDynId, + gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble()); - computeUnit.insertInPipeMap(w); - wavesInSch.emplace(w->wfDynId); - schList.at(exeType).push_back(std::make_pair(w, RFBUSY)); - if (w->isOldestInstWaitcnt()) { - w->setStatus(Wavefront::S_WAITCNT); + computeUnit.insertInPipeMap(wf); + wavesInSch.emplace(wf->wfDynId); + schList.at(exeType).push_back(std::make_pair(gpu_dyn_inst, RFBUSY)); + if (wf->isOldestInstWaitcnt()) { + wf->setStatus(Wavefront::S_WAITCNT); } - if (!ii->isScalar()) { - computeUnit.vrf[w->simdId]->scheduleReadOperands(w, ii); + if (!gpu_dyn_inst->isScalar()) { + computeUnit.vrf[wf->simdId] + ->scheduleReadOperands(wf, gpu_dyn_inst); } - computeUnit.srf[w->simdId]->scheduleReadOperands(w, ii); + computeUnit.srf[wf->simdId]->scheduleReadOperands(wf, gpu_dyn_inst); DPRINTF(GPUSched, "schList[%d]: Added: SIMD[%d] WV[%d]: %d: %s\n", - exeType, w->simdId, w->wfDynId, - ii->seqNum(), ii->disassemble()); + exeType, wf->simdId, wf->wfDynId, + gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble()); return true; } else { // Number of stall cycles due to RF access denied @@ -314,28 +328,30 @@ ScheduleStage::addToSchList(int exeType, Wavefront *w) } // Increment stall counts for WF - w->schStalls++; - w->schRfAccessStalls++; + wf->schStalls++; + wf->schRfAccessStalls++; DPRINTF(GPUSched, "schList[%d]: Could not add: " "SIMD[%d] WV[%d]: %d: %s\n", - exeType, w->simdId, w->wfDynId, - ii->seqNum(), ii->disassemble()); + exeType, wf->simdId, wf->wfDynId, + gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble()); } return false; } void -ScheduleStage::reinsertToSchList(int exeType, Wavefront *w) +ScheduleStage::reinsertToSchList(int exeType, + const GPUDynInstPtr &gpu_dyn_inst) { // Insert wave w into schList for specified exeType. // Wave is inserted in age order, with oldest wave being at the // front of the schList + assert(gpu_dyn_inst); auto schIter = schList.at(exeType).begin(); while (schIter != schList.at(exeType).end() - && schIter->first->wfDynId < w->wfDynId) { + && schIter->first->wfDynId < gpu_dyn_inst->wfDynId) { schIter++; } - schList.at(exeType).insert(schIter, std::make_pair(w, RFREADY)); + schList.at(exeType).insert(schIter, std::make_pair(gpu_dyn_inst, RFREADY)); } void @@ -377,46 +393,48 @@ ScheduleStage::checkMemResources() } bool -ScheduleStage::dispatchReady(Wavefront *w) +ScheduleStage::dispatchReady(const GPUDynInstPtr &gpu_dyn_inst) { + assert(gpu_dyn_inst); + Wavefront *wf = gpu_dyn_inst->wavefront(); vectorAluRdy = false; scalarAluRdy = false; // check for available vector/scalar ALUs in the next cycle - if (computeUnit.vectorALUs[w->simdId].rdy(Cycles(1))) { + if (computeUnit.vectorALUs[wf->simdId].rdy(Cycles(1))) { vectorAluRdy = true; } - if (computeUnit.scalarALUs[w->scalarAlu].rdy(Cycles(1))) { + if (computeUnit.scalarALUs[wf->scalarAlu].rdy(Cycles(1))) { scalarAluRdy = true; } - GPUDynInstPtr ii = w->instructionBuffer.front(); - if (ii->isNop()) { + if (gpu_dyn_inst->isNop()) { // S_NOP requires SALU. V_NOP requires VALU. // TODO: Scalar NOP does not require SALU in hardware, // and is executed out of IB directly. - if (ii->isScalar() && !scalarAluRdy) { + if (gpu_dyn_inst->isScalar() && !scalarAluRdy) { dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++; return false; - } else if (!ii->isScalar() && !vectorAluRdy) { + } else if (!gpu_dyn_inst->isScalar() && !vectorAluRdy) { dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++; return false; } - } else if (ii->isEndOfKernel()) { + } else if (gpu_dyn_inst->isEndOfKernel()) { // EndPgm instruction - if (ii->isScalar() && !scalarAluRdy) { + if (gpu_dyn_inst->isScalar() && !scalarAluRdy) { dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++; return false; } - } else if (ii->isBarrier() || ii->isBranch() || ii->isALU()) { + } else if (gpu_dyn_inst->isBarrier() || gpu_dyn_inst->isBranch() + || gpu_dyn_inst->isALU()) { // Barrier, Branch, or ALU instruction - if (ii->isScalar() && !scalarAluRdy) { + if (gpu_dyn_inst->isScalar() && !scalarAluRdy) { dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++; return false; - } else if (!ii->isScalar() && !vectorAluRdy) { + } else if (!gpu_dyn_inst->isScalar() && !vectorAluRdy) { dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++; return false; } - } else if (!ii->isScalar() && ii->isGlobalMem()) { + } else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isGlobalMem()) { // Vector Global Memory instruction bool rdy = true; if (!glbMemIssueRdy) { @@ -427,18 +445,18 @@ ScheduleStage::dispatchReady(Wavefront *w) rdy = false; dispNrdyStalls[SCH_VECTOR_MEM_BUS_BUSY_NRDY]++; } - if (!computeUnit.globalMemoryPipe.coalescerReady(ii)) { + if (!computeUnit.globalMemoryPipe.coalescerReady(gpu_dyn_inst)) { rdy = false; dispNrdyStalls[SCH_VECTOR_MEM_COALESCER_NRDY]++; } - if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(ii)) { + if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(gpu_dyn_inst)) { rdy = false; dispNrdyStalls[SCH_VECTOR_MEM_REQS_NRDY]++; } if (!rdy) { return false; } - } else if (ii->isScalar() && ii->isGlobalMem()) { + } else if (gpu_dyn_inst->isScalar() && gpu_dyn_inst->isGlobalMem()) { // Scalar Global Memory instruction bool rdy = true; if (!scalarMemIssueRdy) { @@ -449,16 +467,17 @@ ScheduleStage::dispatchReady(Wavefront *w) rdy = false; dispNrdyStalls[SCH_SCALAR_MEM_BUS_BUSY_NRDY]++; } - if (!computeUnit.scalarMemoryPipe. - isGMReqFIFOWrRdy(w->scalarRdGmReqsInPipe + - w->scalarWrGmReqsInPipe)) { + if (!computeUnit.scalarMemoryPipe + .isGMReqFIFOWrRdy(wf->scalarRdGmReqsInPipe + + wf->scalarWrGmReqsInPipe)) + { rdy = false; dispNrdyStalls[SCH_SCALAR_MEM_FIFO_NRDY]++; } if (!rdy) { return false; } - } else if (!ii->isScalar() && ii->isLocalMem()) { + } else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isLocalMem()) { // Vector Local Memory instruction bool rdy = true; if (!locMemIssueRdy) { @@ -470,14 +489,14 @@ ScheduleStage::dispatchReady(Wavefront *w) dispNrdyStalls[SCH_LOCAL_MEM_BUS_BUSY_NRDY]++; } if (!computeUnit.localMemoryPipe. - isLMReqFIFOWrRdy(w->rdLmReqsInPipe + w->wrLmReqsInPipe)) { + isLMReqFIFOWrRdy(wf->rdLmReqsInPipe + wf->wrLmReqsInPipe)) { rdy = false; dispNrdyStalls[SCH_LOCAL_MEM_FIFO_NRDY]++; } if (!rdy) { return false; } - } else if (!ii->isScalar() && ii->isFlat()) { + } else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isFlat()) { // Vector Flat memory instruction bool rdy = true; if (!glbMemIssueRdy || !locMemIssueRdy) { @@ -488,16 +507,16 @@ ScheduleStage::dispatchReady(Wavefront *w) rdy = false; dispNrdyStalls[SCH_FLAT_MEM_BUS_BUSY_NRDY]++; } - if (!computeUnit.globalMemoryPipe.coalescerReady(ii)) { + if (!computeUnit.globalMemoryPipe.coalescerReady(gpu_dyn_inst)) { rdy = false; dispNrdyStalls[SCH_FLAT_MEM_COALESCER_NRDY]++; } - if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(ii)) { + if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(gpu_dyn_inst)) { rdy = false; dispNrdyStalls[SCH_FLAT_MEM_REQS_NRDY]++; } if (!computeUnit.localMemoryPipe. - isLMReqFIFOWrRdy(w->rdLmReqsInPipe + w->wrLmReqsInPipe)) { + isLMReqFIFOWrRdy(wf->rdLmReqsInPipe + wf->wrLmReqsInPipe)) { rdy = false; dispNrdyStalls[SCH_FLAT_MEM_FIFO_NRDY]++; } @@ -505,7 +524,8 @@ ScheduleStage::dispatchReady(Wavefront *w) return false; } } else { - panic("%s: unknown instr checked for readiness", ii->disassemble()); + panic("%s: unknown instr checked for readiness", + gpu_dyn_inst->disassemble()); return false; } dispNrdyStalls[SCH_RDY]++; @@ -519,7 +539,7 @@ ScheduleStage::fillDispatchList() checkMemResources(); // iterate execution resources for (int j = 0; j < computeUnit.numExeUnits(); j++) { - assert(dispatchList->at(j).second == EMPTY); + assert(toExecute.dispatchStatus(j) == EMPTY); // iterate waves in schList to pick one for dispatch auto schIter = schList.at(j).begin(); @@ -537,8 +557,7 @@ ScheduleStage::fillDispatchList() // Acquire a coalescer token if it is a global mem // operation. - GPUDynInstPtr mp = schIter->first-> - instructionBuffer.front(); + GPUDynInstPtr mp = schIter->first; if (!mp->isMemSync() && !mp->isScalar() && (mp->isGlobalMem() || mp->isFlat())) { computeUnit.globalMemoryPipe.acqCoalescerToken(mp); @@ -553,10 +572,10 @@ ScheduleStage::fillDispatchList() } else { // Either another wave has been dispatched, or this wave // was not ready, so it is stalled this cycle - schIter->first->schStalls++; + schIter->first->wavefront()->schStalls++; if (!dispRdy) { // not ready for dispatch, increment stall stat - schIter->first->schResourceStalls++; + schIter->first->wavefront()->schResourceStalls++; } // Examine next wave for this resource schIter++; @@ -589,28 +608,31 @@ ScheduleStage::arbitrateVrfToLdsBus() // get the GM pipe index in the dispatchList int gm_exe_unit = computeUnit.firstMemUnit() + i; // get the wave in the dispatchList - Wavefront *w = dispatchList->at(gm_exe_unit).first; + GPUDynInstPtr &gpu_dyn_inst + = toExecute.readyInst(gm_exe_unit); // If the WF is valid, ready to execute, and the instruction // is a flat access, arbitrate with the WF's assigned LM pipe - if (w && dispatchList->at(gm_exe_unit).second == EXREADY && - w->instructionBuffer.front()->isFlat()) { + if (gpu_dyn_inst && toExecute.dispatchStatus(gm_exe_unit) + == EXREADY && gpu_dyn_inst->isFlat()) { + Wavefront *wf = gpu_dyn_inst->wavefront(); // If the associated LM pipe also has a wave selected, block // that wave and let the Flat instruction issue. The WF in the // LM pipe is added back to the schList for consideration next // cycle. - if (dispatchList->at(w->localMem).second == EXREADY) { - reinsertToSchList(w->localMem, - dispatchList->at(w->localMem).first); + if (toExecute.dispatchStatus(wf->localMem) == EXREADY) { + reinsertToSchList(wf->localMem, toExecute + .readyInst(wf->localMem)); // Increment stall stats for LDS-VRF arbitration ldsBusArbStalls++; - dispatchList->at(w->localMem).first->schLdsArbStalls++; + toExecute.readyInst(wf->localMem) + ->wavefront()->schLdsArbStalls++; } // With arbitration of LM pipe complete, transition the // LM pipe to SKIP state in the dispatchList to inform EX stage // that a Flat instruction is executing next cycle - doDispatchListTransition(w->localMem, SKIP, w); + doDispatchListTransition(wf->localMem, SKIP, gpu_dyn_inst); DPRINTF(GPUSched, "dispatchList[%d]: arbVrfLds: " - "EXREADY->SKIP\n", w->localMem); + "EXREADY->SKIP\n", wf->localMem); } } } @@ -623,41 +645,41 @@ ScheduleStage::checkRfOperandReadComplete() // selection for dispatchList for (int j = 0; j < computeUnit.numExeUnits(); ++j) { for (auto &p : schList.at(j)) { - Wavefront *w = p.first; - assert(w); + const GPUDynInstPtr &gpu_dyn_inst = p.first; + assert(gpu_dyn_inst); + Wavefront *wf = gpu_dyn_inst->wavefront(); // Increment the number of cycles the wave spends in the // SCH stage, since this loop visits every wave in SCH. - w->schCycles++; + wf->schCycles++; - GPUDynInstPtr ii = w->instructionBuffer.front(); bool vrfRdy = true; - if (!ii->isScalar()) { - vrfRdy = - computeUnit.vrf[w->simdId]->operandReadComplete(w, ii); + if (!gpu_dyn_inst->isScalar()) { + vrfRdy = computeUnit.vrf[wf->simdId] + ->operandReadComplete(wf, gpu_dyn_inst); } - bool srfRdy = - computeUnit.srf[w->simdId]->operandReadComplete(w, ii); + bool srfRdy = computeUnit.srf[wf->simdId] + ->operandReadComplete(wf, gpu_dyn_inst); bool operandsReady = vrfRdy && srfRdy; if (operandsReady) { - DPRINTF(GPUSched, - "schList[%d]: WV[%d] operands ready for: %d: %s\n", - j, w->wfDynId, ii->seqNum(), ii->disassemble()); + DPRINTF(GPUSched, "schList[%d]: WV[%d] operands ready for: " + "%d: %s\n", j, wf->wfDynId, gpu_dyn_inst->seqNum(), + gpu_dyn_inst->disassemble()); DPRINTF(GPUSched, "schList[%d]: WV[%d] RFBUSY->RFREADY\n", - j, w->wfDynId); + j, wf->wfDynId); p.second = RFREADY; } else { - DPRINTF(GPUSched, - "schList[%d]: WV[%d] operands not ready for: %d: %s\n", - j, w->wfDynId, ii->seqNum(), ii->disassemble()); + DPRINTF(GPUSched, "schList[%d]: WV[%d] operands not ready " + "for: %d: %s\n", j, wf->wfDynId, + gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble()); // operands not ready yet, increment SCH stage stats // aggregate to all wavefronts on the CU p.second = RFBUSY; // Increment stall stats - w->schStalls++; - w->schOpdNrdyStalls++; + wf->schStalls++; + wf->schOpdNrdyStalls++; opdNrdyStalls[SCH_RF_OPD_NRDY]++; if (!vrfRdy) { @@ -678,23 +700,21 @@ ScheduleStage::reserveResources() exeUnitReservations.resize(computeUnit.numExeUnits(), false); for (int j = 0; j < computeUnit.numExeUnits(); ++j) { - Wavefront *dispatchedWave = dispatchList->at(j).first; - if (dispatchedWave) { - DISPATCH_STATUS s = dispatchList->at(j).second; + GPUDynInstPtr &gpu_dyn_inst = toExecute.readyInst(j); + if (gpu_dyn_inst) { + DISPATCH_STATUS s = toExecute.dispatchStatus(j); + Wavefront *wf = gpu_dyn_inst->wavefront(); if (s == EMPTY) { continue; } else if (s == EXREADY) { // Wave is ready for execution - std::vector execUnitIds = - dispatchedWave->reserveResources(); - GPUDynInstPtr ii = dispatchedWave->instructionBuffer.front(); + std::vector execUnitIds = wf->reserveResources(); - if (!ii->isScalar()) { - computeUnit.vrf[dispatchedWave->simdId]-> - dispatchInstruction(ii); + if (!gpu_dyn_inst->isScalar()) { + computeUnit.vrf[wf->simdId] + ->dispatchInstruction(gpu_dyn_inst); } - computeUnit.srf[dispatchedWave->simdId]-> - dispatchInstruction(ii); + computeUnit.srf[wf->simdId]->dispatchInstruction(gpu_dyn_inst); std::stringstream ss; for (auto id : execUnitIds) { @@ -702,16 +722,16 @@ ScheduleStage::reserveResources() } DPRINTF(GPUSched, "dispatchList[%d]: SIMD[%d] WV[%d]: %d: %s" " Reserving ExeRes[ %s]\n", - j, dispatchedWave->simdId, dispatchedWave->wfDynId, - ii->seqNum(), ii->disassemble(), ss.str()); + j, wf->simdId, wf->wfDynId, gpu_dyn_inst->seqNum(), + gpu_dyn_inst->disassemble(), ss.str()); // mark the resources as reserved for this cycle for (auto execUnitId : execUnitIds) { panic_if(exeUnitReservations.at(execUnitId), "Execution unit %d is reserved!!!\n" "SIMD[%d] WV[%d]: %d: %s", - execUnitId, dispatchedWave->simdId, - dispatchedWave->wfDynId, - ii->seqNum(), ii->disassemble()); + execUnitId, wf->simdId, wf->wfDynId, + gpu_dyn_inst->seqNum(), + gpu_dyn_inst->disassemble()); exeUnitReservations.at(execUnitId) = true; } @@ -720,18 +740,20 @@ ScheduleStage::reserveResources() // that we've reserved a global and local memory unit. Thus, // we need to mark the latter execution unit as not available. if (execUnitIds.size() > 1) { - int lm_exec_unit M5_VAR_USED = dispatchedWave->localMem; - assert(dispatchList->at(lm_exec_unit).second == SKIP); + int lm_exec_unit M5_VAR_USED = wf->localMem; + assert(toExecute.dispatchStatus(lm_exec_unit) + == SKIP); } } else if (s == SKIP) { // Shared Memory pipe reserved for FLAT instruction. // Verify the GM pipe for this wave is ready to execute // and the wave in the GM pipe is the same as the wave // in the LM pipe - int gm_exec_unit M5_VAR_USED = dispatchedWave->globalMem; - assert(dispatchList->at(gm_exec_unit).first->wfDynId == - dispatchedWave->wfDynId); - assert(dispatchList->at(gm_exec_unit).second == EXREADY); + int gm_exec_unit M5_VAR_USED = wf->globalMem; + assert(wf->wfDynId == toExecute + .readyInst(gm_exec_unit)->wfDynId); + assert(toExecute.dispatchStatus(gm_exec_unit) + == EXREADY); } } } diff --git a/src/gpu-compute/schedule_stage.hh b/src/gpu-compute/schedule_stage.hh index 6ec4a8ddd..c4dc28237 100644 --- a/src/gpu-compute/schedule_stage.hh +++ b/src/gpu-compute/schedule_stage.hh @@ -41,8 +41,8 @@ #include #include "gpu-compute/exec_stage.hh" +#include "gpu-compute/misc.hh" #include "gpu-compute/scheduler.hh" -#include "gpu-compute/scoreboard_check_stage.hh" // Schedule or execution arbitration stage. // From the pool of ready waves in the ready list, @@ -50,6 +50,8 @@ // The selection is made based on a scheduling policy class ComputeUnit; +class ScheduleToExecute; +class ScoreboardCheckToSchedule; class Wavefront; struct ComputeUnitParams; @@ -57,7 +59,9 @@ struct ComputeUnitParams; class ScheduleStage { public: - ScheduleStage(const ComputeUnitParams *p, ComputeUnit &cu); + ScheduleStage(const ComputeUnitParams *p, ComputeUnit &cu, + ScoreboardCheckToSchedule &from_scoreboard_check, + ScheduleToExecute &to_execute); ~ScheduleStage(); void init(); void exec(); @@ -115,17 +119,13 @@ class ScheduleStage private: ComputeUnit &computeUnit; + ScoreboardCheckToSchedule &fromScoreboardCheck; + ScheduleToExecute &toExecute; + // Each execution resource will have its own // scheduler and a dispatch list std::vector scheduler; - // List of waves which will be dispatched to - // each execution resource. - // Currently, the dispatch list of - // an execution resource can hold only one wave because - // an execution resource can execute only one wave in a cycle. - std::vector> *dispatchList; - // Stats // Number of cycles with empty (or not empty) readyList, per execution @@ -171,10 +171,10 @@ class ScheduleStage const std::string _name; // called by exec() to add a wave to schList if the RFs can support it - bool addToSchList(int exeType, Wavefront *w); + bool addToSchList(int exeType, const GPUDynInstPtr &gpu_dyn_inst); // re-insert a wave to schList if wave lost arbitration // wave is inserted such that age order (oldest to youngest) is preserved - void reinsertToSchList(int exeType, Wavefront *w); + void reinsertToSchList(int exeType, const GPUDynInstPtr &gpu_dyn_inst); // check waves in schList to see if RF reads complete void checkRfOperandReadComplete(); // check execution resources for readiness @@ -189,7 +189,7 @@ class ScheduleStage // check status of memory pipes and RF to Mem buses void checkMemResources(); // resource ready check called by fillDispatchList - bool dispatchReady(Wavefront *w); + bool dispatchReady(const GPUDynInstPtr &gpu_dyn_inst); // pick waves from schList and populate dispatchList with one wave // per EXE resource type void fillDispatchList(); @@ -199,12 +199,13 @@ class ScheduleStage // dispatchList void scheduleRfDestOperands(); // invoked by scheduleRfDestOperands to schedule RF writes for a wave - bool schedRfWrites(int exeType, Wavefront *w); + bool schedRfWrites(int exeType, const GPUDynInstPtr &gpu_dyn_inst); // reserve resources for waves surviving arbitration in dispatchList void reserveResources(); void doDispatchListTransition(int unitId, DISPATCH_STATUS s, - Wavefront *w = nullptr); + const GPUDynInstPtr &gpu_dyn_inst); + void doDispatchListTransition(int unitId, DISPATCH_STATUS s); // Set tracking wfDynId for each wave present in schedule stage // Used to allow only one instruction per wave in schedule @@ -219,7 +220,7 @@ class ScheduleStage // The maximum number of waves per resource can be determined by either // the VRF/SRF availability or limits imposed by paremeters (to be added) // of the SCH stage or CU. - std::vector>> schList; + std::vector>> schList; }; #endif // __SCHEDULE_STAGE_HH__ diff --git a/src/gpu-compute/scoreboard_check_stage.cc b/src/gpu-compute/scoreboard_check_stage.cc index fb99e6912..0e52d310c 100644 --- a/src/gpu-compute/scoreboard_check_stage.cc +++ b/src/gpu-compute/scoreboard_check_stage.cc @@ -45,22 +45,16 @@ #include "params/ComputeUnit.hh" ScoreboardCheckStage::ScoreboardCheckStage(const ComputeUnitParams *p, - ComputeUnit &cu) - : computeUnit(cu), _name(cu.name() + ".ScoreboardCheckStage") + ComputeUnit &cu, + ScoreboardCheckToSchedule + &to_schedule) + : computeUnit(cu), toSchedule(to_schedule), + _name(cu.name() + ".ScoreboardCheckStage") { } ScoreboardCheckStage::~ScoreboardCheckStage() { - readyList.clear(); -} - -void -ScoreboardCheckStage::init() -{ - for (int unitId = 0; unitId < computeUnit.numExeUnits(); ++unitId) { - readyList.push_back(&computeUnit.readyList[unitId]); - } } void @@ -242,17 +236,13 @@ ScoreboardCheckStage::mapWaveToExeUnit(Wavefront *w) void ScoreboardCheckStage::exec() { - // reset the ready list for all execution units; it will be - // constructed every cycle since resource availability may change - for (int unitId = 0; unitId < computeUnit.numExeUnits(); ++unitId) { - // Reset wavefront pointers to nullptr so clear() on the vector - // does not accidentally destruct the wavefront object - for (int i = 0; i < readyList[unitId]->size(); i++) { - readyList[unitId]->at(i) = nullptr; - } - readyList[unitId]->clear(); - } - // iterate over all WF slots across all vector ALUs + /** + * Reset the ready list for all execution units; ready list will be + * constructed every cycle because resource availability may change. + */ + toSchedule.reset(); + + // Iterate over all WF slots across all SIMDs. for (int simdId = 0; simdId < computeUnit.numVectorALUs; ++simdId) { for (int wfSlot = 0; wfSlot < computeUnit.shader->n_wf; ++wfSlot) { // reset the ready status of each wavefront @@ -269,7 +259,7 @@ ScoreboardCheckStage::exec() curWave->simdId, curWave->wfDynId, curWave->nextInstr()->seqNum(), curWave->nextInstr()->disassemble()); - readyList.at(exeResType)->push_back(curWave); + toSchedule.markWFReady(curWave, exeResType); } collectStatistics(rdyStatus); } diff --git a/src/gpu-compute/scoreboard_check_stage.hh b/src/gpu-compute/scoreboard_check_stage.hh index 6953c4c66..87582759b 100644 --- a/src/gpu-compute/scoreboard_check_stage.hh +++ b/src/gpu-compute/scoreboard_check_stage.hh @@ -43,6 +43,7 @@ #include "sim/stats.hh" class ComputeUnit; +class ScoreboardCheckToSchedule; class Wavefront; struct ComputeUnitParams; @@ -70,9 +71,9 @@ class ScoreboardCheckStage NRDY_CONDITIONS }; - ScoreboardCheckStage(const ComputeUnitParams* p, ComputeUnit &cu); + ScoreboardCheckStage(const ComputeUnitParams* p, ComputeUnit &cu, + ScoreboardCheckToSchedule &to_schedule); ~ScoreboardCheckStage(); - void init(); void exec(); // Stats related variables and methods @@ -86,9 +87,12 @@ class ScoreboardCheckStage int *exeResType, int wfSlot); ComputeUnit &computeUnit; - // List of waves which are ready to be scheduled. - // Each execution resource has a ready list - std::vector*> readyList; + /** + * Interface between scoreboard check and schedule stages. Each + * cycle the scoreboard check stage populates this interface with + * information needed by the schedule stage. + */ + ScoreboardCheckToSchedule &toSchedule; // Stats Stats::Vector stallCycles; -- 2.30.2