SimObject('LdsState.py')
SimObject('X86GPUTLB.py')
+Source('comm.cc')
Source('compute_unit.cc')
Source('dispatcher.cc')
Source('exec_stage.cc')
--- /dev/null
+++ b/src/gpu-compute/comm.cc
+/*
+ * Copyright (c) 2018 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Anthony Gutierrez
+ */
+
+#include "gpu-compute/comm.hh"
+
+#include <cassert>
+
+#include "gpu-compute/wavefront.hh"
+#include "params/ComputeUnit.hh"
+
+/**
+ * Scoreboard/Schedule stage interface.
+ */
+ScoreboardCheckToSchedule::ScoreboardCheckToSchedule(const ComputeUnitParams
+ *p)
+{
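+    // One ready list per functional unit: the vector SIMDs, scalar
+    // cores, and the global, shared, and scalar memory pipelines.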
+ int num_func_units = p->num_SIMDs + p->num_scalar_cores
+ + p->num_global_mem_pipes + p->num_shared_mem_pipes
+ + p->num_scalar_mem_pipes;
+ _readyWFs.resize(num_func_units);
+
+ for (auto &func_unit_wf_list : _readyWFs) {
+ func_unit_wf_list.reserve(p->n_wf);
+ }
+}
+
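+/**
+ * Clear all ready lists; they are rebuilt from scratch by the
+ * scoreboard check stage every cycle.
+ */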
+void
+ScoreboardCheckToSchedule::reset()
+{
+ for (auto &func_unit_wf_list : _readyWFs) {
+        func_unit_wf_list.clear();
+ }
+}
+
+void
+ScoreboardCheckToSchedule::markWFReady(Wavefront *wf, int func_unit_id)
+{
+ _readyWFs[func_unit_id].push_back(wf);
+}
+
+int
+ScoreboardCheckToSchedule::numReadyLists() const
+{
+ return _readyWFs.size();
+}
+
+std::vector<Wavefront*>&
+ScoreboardCheckToSchedule::readyWFs(int func_unit_id)
+{
+ return _readyWFs[func_unit_id];
+}
+
+/**
+ * Delete all wavefronts that have been marked as ready at scoreboard stage
+ * but are found to have empty instruction buffers at schedule stage.
+ */
+void
+ScoreboardCheckToSchedule::updateReadyList(int func_unit_id)
+{
+ std::vector<Wavefront*> &func_unit_wf_list = _readyWFs[func_unit_id];
+
+ for (auto it = func_unit_wf_list.begin(); it != func_unit_wf_list.end();) {
+ if ((*it)->instructionBuffer.empty()) {
+ it = func_unit_wf_list.erase(it);
+ } else {
+ ++it;
+ }
+ }
+}
+
+/**
+ * Schedule/Execute stage interface.
+ */
+ScheduleToExecute::ScheduleToExecute(const ComputeUnitParams *p)
+{
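+    // One dispatch slot and status entry per functional unit; a slot
+    // holds at most one instruction, because each execution resource
+    // can begin at most one instruction per cycle.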
+ int num_func_units = p->num_SIMDs + p->num_scalar_cores
+ + p->num_global_mem_pipes + p->num_shared_mem_pipes
+ + p->num_scalar_mem_pipes;
+ _readyInsts.resize(num_func_units, nullptr);
+ _dispatchStatus.resize(num_func_units, EMPTY);
+}
+
+void
+ScheduleToExecute::reset()
+{
+ for (auto &func_unit_ready_inst : _readyInsts) {
+ func_unit_ready_inst = nullptr;
+ }
+
+ for (auto &func_unit_status : _dispatchStatus) {
+ func_unit_status = EMPTY;
+ }
+}
+
+GPUDynInstPtr&
+ScheduleToExecute::readyInst(int func_unit_id)
+{
+ return _readyInsts[func_unit_id];
+}
+
+void
+ScheduleToExecute::dispatchTransition(const GPUDynInstPtr &gpu_dyn_inst,
+ int func_unit_id,
+ DISPATCH_STATUS disp_status)
+{
+ _readyInsts[func_unit_id] = gpu_dyn_inst;
+ _dispatchStatus[func_unit_id] = disp_status;
+}
+
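+/**
+ * Clear a functional unit's dispatch slot while updating its status,
+ * e.g., when the execute stage consumes the slot's instruction and
+ * returns the slot to EMPTY.
+ */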
+void
+ScheduleToExecute::dispatchTransition(int func_unit_id,
+ DISPATCH_STATUS disp_status)
+{
+ _readyInsts[func_unit_id] = nullptr;
+ _dispatchStatus[func_unit_id] = disp_status;
+}
+
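+/**
+ * Return the dispatch status (e.g., EMPTY, EXREADY, or SKIP) of the
+ * given functional unit's dispatch slot.
+ */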
+DISPATCH_STATUS
+ScheduleToExecute::dispatchStatus(int func_unit_id) const
+{
+ return _dispatchStatus[func_unit_id];
+}
--- /dev/null
+++ b/src/gpu-compute/comm.hh
+/*
+ * Copyright (c) 2018 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Anthony Gutierrez
+ */
+
+#ifndef __GPU_COMPUTE_COMM_HH__
+#define __GPU_COMPUTE_COMM_HH__
+
+#include <array>
+#include <vector>
+
+#include "gpu-compute/exec_stage.hh"
+#include "gpu-compute/misc.hh"
+
+struct ComputeUnitParams;
+class Wavefront;
+
+class PipeStageIFace
+{
+ public:
+ /**
+ * Reset the pipe stage interface. This is called to remove
+ * any stale state from the pipe stage that is leftover from
+ * the prior cycle. This is needed when stages do not actually
+ * consume the information passed via the stage interfaces.
+ */
+ virtual void reset() = 0;
+};
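+
+/**
+ * A sketch of how the pipe stages are expected to drive these
+ * interfaces each cycle (mirroring the calls made in the stages'
+ * exec() methods in this patch; variable names are illustrative):
+ *
+ *     // ScoreboardCheck stage
+ *     toSchedule.reset();
+ *     toSchedule.markWFReady(wf, func_unit_id);
+ *
+ *     // Schedule stage
+ *     toExecute.reset();
+ *     fromScoreboardCheck.updateReadyList(func_unit_id);
+ *     toExecute.dispatchTransition(gpu_dyn_inst, func_unit_id, EXREADY);
+ *
+ *     // Execute stage
+ *     if (fromSchedule.dispatchStatus(func_unit_id) == EXREADY) {
+ *         fromSchedule.readyInst(func_unit_id)->wavefront()->exec();
+ *         fromSchedule.dispatchTransition(func_unit_id, EMPTY);
+ *     }
+ */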
+
+/**
+ * Communication interface between ScoreboardCheck and Schedule stages.
+ */
+class ScoreboardCheckToSchedule : public PipeStageIFace
+{
+ public:
+ ScoreboardCheckToSchedule() = delete;
+ ScoreboardCheckToSchedule(const ComputeUnitParams *p);
+ void reset() override;
+ /**
+ * Mark the WF as ready for execution on a particular functional
+ * unit.
+ */
+ void markWFReady(Wavefront *wf, int func_unit_id);
+ /**
+ * Returns the number of ready lists (i.e., the number of functional
+ * units). Each functional unit has its own list of ready WFs to
+ * consider for arbitration.
+ */
+ int numReadyLists() const;
+ /**
+ * TODO: These methods expose this class' implementation too much by
+ * returning references to its internal data structures directly.
+     * These exist to support legacy functionality in the CU pipeline;
+     * they should eventually be replaced by an API that hides such
+     * implementation details.
+ */
+ std::vector<Wavefront*>& readyWFs(int func_unit_id);
+
+ // TODO: Leftover from old CU code, needs to go away.
+ void updateReadyList(int func_unit_id);
+
+ private:
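+    // Ready WFs, grouped by the functional unit on which they are
+    // ready to execute. Filled by the scoreboard check stage each
+    // cycle.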
+ std::vector<std::vector<Wavefront*>> _readyWFs;
+};
+
+/**
+ * Communication interface between Schedule and Execute stages.
+ */
+class ScheduleToExecute : public PipeStageIFace
+{
+ public:
+ ScheduleToExecute() = delete;
+ ScheduleToExecute(const ComputeUnitParams *p);
+ void reset() override;
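+    /**
+     * Return a reference to the instruction, if any, that the
+     * schedule stage has dispatched to the given functional unit
+     * for execution this cycle.
+     */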
+ GPUDynInstPtr& readyInst(int func_unit_id);
+ /**
+ * Once the scheduler has chosen a winning WF for execution, and
+ * after the WF's oldest instruction's operands have been read,
+ * this method is used to mark the instruction as ready to execute.
+ * This puts it on the dispatch list to be consumed by the execute
+ * stage.
+ */
+ void dispatchTransition(const GPUDynInstPtr &gpu_dyn_inst,
+ int func_unit_id, DISPATCH_STATUS disp_status);
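+    /**
+     * Transition a functional unit's dispatch slot to a new status
+     * without an instruction, clearing its ready instruction slot.
+     */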
+ void dispatchTransition(int func_unit_id, DISPATCH_STATUS disp_status);
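+    /**
+     * Query the dispatch status of the given functional unit.
+     */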
+ DISPATCH_STATUS dispatchStatus(int func_unit_id) const;
+
+ private:
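+    // Instruction dispatched to each functional unit, and the status
+    // of each unit's dispatch slot, both indexed by functional unit
+    // ID and rebuilt by the schedule stage every cycle.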
+ std::vector<GPUDynInstPtr> _readyInsts;
+ std::vector<DISPATCH_STATUS> _dispatchStatus;
+};
+
+#endif // __GPU_COMPUTE_COMM_HH__
coalescerToVrfBusWidth(p->coalescer_to_vrf_bus_width),
registerManager(p->register_manager),
fetchStage(p, *this),
- scoreboardCheckStage(p, *this),
- scheduleStage(p, *this),
- execStage(p, *this),
+ scoreboardCheckStage(p, *this, scoreboardCheckToSchedule),
+ scheduleStage(p, *this, scoreboardCheckToSchedule, scheduleToExecute),
+ execStage(p, *this, scheduleToExecute),
globalMemoryPipe(p, *this),
localMemoryPipe(p, *this),
scalarMemoryPipe(p, *this),
lds(*p->localDataStore), gmTokenPort(name() + ".gmTokenPort", this),
_cacheLineSize(p->system->cacheLineSize()),
_numBarrierSlots(p->num_barrier_slots),
- globalSeqNum(0), wavefrontSize(p->wf_size)
+ globalSeqNum(0), wavefrontSize(p->wf_size),
+ scoreboardCheckToSchedule(p),
+ scheduleToExecute(p)
{
/**
* This check is necessary because std::bitset only provides conversion
lastVaddrSimd[j].clear();
}
lastVaddrCU.clear();
- readyList.clear();
- dispatchList.clear();
delete cuExitCallback;
delete ldsPort;
}
w->computeActualWgSz(task);
}
-// delete all wavefronts that have been marked as ready at SCB stage
-// but are found to have empty instruction buffers at SCH stage
-void
-ComputeUnit::updateReadyList(int unitId)
-{
- if (!readyList[unitId].empty()) {
- for (std::vector<Wavefront *>::iterator it = readyList[unitId].begin();
- it != readyList[unitId].end();) {
- if ((*it)->instructionBuffer.empty()) {
- it = readyList[unitId].erase(it);
- }
- else {
- ++it;
- }
- }
- }
-}
-
void
ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
HSAQueueEntry *task, int bar_id, bool fetchContext)
vectorRegsReserved.resize(numVectorALUs, 0);
scalarRegsReserved.resize(numVectorALUs, 0);
- // Initializing pipeline resources
- readyList.resize(numExeUnits());
-
- for (int j = 0; j < numExeUnits(); ++j) {
- dispatchList.push_back(std::make_pair(nullptr, EMPTY));
- }
-
fetchStage.init();
- scoreboardCheckStage.init();
scheduleStage.init();
execStage.init();
globalMemoryPipe.init();
#include "base/types.hh"
#include "config/the_gpu_isa.hh"
#include "enums/PrefetchType.hh"
+#include "gpu-compute/comm.hh"
#include "gpu-compute/exec_stage.hh"
#include "gpu-compute/fetch_stage.hh"
#include "gpu-compute/global_memory_pipeline.hh"
int numCyclesPerStoreTransfer; // number of cycles per vector store
int numCyclesPerLoadTransfer; // number of cycles per vector load
- // Buffers used to communicate between various pipeline stages
-
- // At a high level, the following intra-/inter-stage communication occurs:
- // SCB to SCH: readyList provides per exec resource list of waves that
- // passed dependency and readiness checks. If selected by
- // scheduler, attempt to add wave to schList conditional on
- // RF support.
- // SCH: schList holds waves that are gathering operands or waiting
- // for execution resource availability. Once ready, waves are
- // placed on the dispatchList as candidates for execution. A wave
- // may spend multiple cycles in SCH stage, on the schList due to
- // RF access conflicts or execution resource contention.
- // SCH to EX: dispatchList holds waves that are ready to be executed.
- // LM/FLAT arbitration may remove an LM wave and place it
- // back on the schList. RF model may also force a wave back
- // to the schList if using the detailed model.
-
- // List of waves which are ready to be scheduled.
- // Each execution resource has a ready list. readyList is
- // used to communicate between scoreboardCheck stage and
- // schedule stage
- std::vector<std::vector<Wavefront*>> readyList;
-
- // List of waves which will be dispatched to
- // each execution resource. An EXREADY implies
- // dispatch list is non-empty and
- // execution unit has something to execute
- // this cycle. Currently, the dispatch list of
- // an execution resource can hold only one wave because
- // an execution resource can execute only one wave in a cycle.
- // dispatchList is used to communicate between schedule
- // and exec stage
- // TODO: convert std::pair to a class to increase readability
- std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> dispatchList;
// track presence of dynamic instructions in the Schedule pipeline
// stage. This is used to check the readiness of the oldest,
// non-dispatched instruction of every WF in the Scoreboard stage.
// number of available scalar registers per SIMD unit
int numScalarRegsPerSimd;
- void updateReadyList(int unitId);
-
// this hash map will keep track of page divergence
// per memory instruction per wavefront. The hash map
// is cleared in GPUDynInst::updateStats() in gpu_dyn_inst.cc.
InstSeqNum globalSeqNum;
int wavefrontSize;
+ /**
+ * TODO: Update these comments once the pipe stage interface has
+ * been fully refactored.
+ *
+ * Pipeline stage interfaces.
+ *
+     * Buffers used to communicate between various pipeline stages.
+     *
+     * List of waves which will be dispatched to each execution
+     * resource. An EXREADY implies the dispatch list is non-empty and
+     * the execution unit has something to execute this cycle.
+     * Currently, the dispatch list of an execution resource can hold
+     * only one wave because an execution resource can execute only
+     * one wave in a cycle. dispatchList is used to communicate
+     * between the schedule and exec stages.
+ *
+ * At a high level, the following intra-/inter-stage communication occurs:
+ * SCB to SCH: readyList provides per exec resource list of waves that
+ * passed dependency and readiness checks. If selected by
+ * scheduler, attempt to add wave to schList conditional on
+ * RF support.
+ * SCH: schList holds waves that are gathering operands or waiting
+ * for execution resource availability. Once ready, waves are
+ * placed on the dispatchList as candidates for execution. A wave
+ * may spend multiple cycles in SCH stage, on the schList due to
+ * RF access conflicts or execution resource contention.
+ * SCH to EX: dispatchList holds waves that are ready to be executed.
+ * LM/FLAT arbitration may remove an LM wave and place it
+ * back on the schList. RF model may also force a wave back
+ * to the schList if using the detailed model.
+ */
+ ScoreboardCheckToSchedule scoreboardCheckToSchedule;
+ ScheduleToExecute scheduleToExecute;
+
/**
* The barrier slots for this CU.
*/
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"
-ExecStage::ExecStage(const ComputeUnitParams *p, ComputeUnit &cu)
- : computeUnit(cu), lastTimeInstExecuted(false),
+ExecStage::ExecStage(const ComputeUnitParams *p, ComputeUnit &cu,
+ ScheduleToExecute &from_schedule)
+ : computeUnit(cu), fromSchedule(from_schedule),
+ lastTimeInstExecuted(false),
thisTimeInstExecuted(false), instrExecuted (false),
executionResourcesUsed(0), _name(cu.name() + ".ExecStage")
void
ExecStage::init()
{
- dispatchList = &computeUnit.dispatchList;
idle_dur = 0;
}
std::stringstream ss;
bool empty = true;
for (int i = 0; i < computeUnit.numExeUnits(); i++) {
- DISPATCH_STATUS s = dispatchList->at(i).second;
+ DISPATCH_STATUS s = fromSchedule.dispatchStatus(i);
ss << i << ": " << dispStatusToStr(s);
if (s != EMPTY) {
empty = false;
- Wavefront *w = dispatchList->at(i).first;
- ss << " SIMD[" << w->simdId << "] WV[" << w->wfDynId << "]: ";
- ss << (w->instructionBuffer.front())->seqNum() << ": ";
- ss << (w->instructionBuffer.front())->disassemble();
+ GPUDynInstPtr &gpu_dyn_inst = fromSchedule.readyInst(i);
+ Wavefront *wf = gpu_dyn_inst->wavefront();
+ ss << " SIMD[" << wf->simdId << "] WV[" << wf->wfDynId << "]: ";
+ ss << (wf->instructionBuffer.front())->seqNum() << ": ";
+ ss << (wf->instructionBuffer.front())->disassemble();
}
ss << "\n";
}
dumpDispList();
}
for (int unitId = 0; unitId < computeUnit.numExeUnits(); ++unitId) {
- DISPATCH_STATUS s = dispatchList->at(unitId).second;
+ DISPATCH_STATUS s = fromSchedule.dispatchStatus(unitId);
switch (s) {
- case EMPTY:
+ case EMPTY:
// Do not execute if empty, waiting for VRF reads,
// or LM tied to GM waiting for VRF reads
collectStatistics(IdleExec, unitId);
break;
- case EXREADY:
- {
- collectStatistics(BusyExec, unitId);
- Wavefront *w = dispatchList->at(unitId).first;
- DPRINTF(GPUSched, "Exec[%d]: SIMD[%d] WV[%d]: %s\n",
- unitId, w->simdId, w->wfDynId,
- (w->instructionBuffer.front())->disassemble());
- DPRINTF(GPUSched, "dispatchList[%d] EXREADY->EMPTY\n", unitId);
- dispatchList->at(unitId).first->exec();
- (computeUnit.scheduleStage).deleteFromSch(w);
- dispatchList->at(unitId).second = EMPTY;
- dispatchList->at(unitId).first->freeResources();
- dispatchList->at(unitId).first = nullptr;
- break;
- }
- case SKIP:
- collectStatistics(BusyExec, unitId);
- DPRINTF(GPUSched, "dispatchList[%d] SKIP->EMPTY\n", unitId);
- dispatchList->at(unitId).second = EMPTY;
- dispatchList->at(unitId).first->freeResources();
- dispatchList->at(unitId).first = nullptr;
- break;
- default:
+ case EXREADY:
+ {
+ collectStatistics(BusyExec, unitId);
+ GPUDynInstPtr &gpu_dyn_inst = fromSchedule.readyInst(unitId);
+ assert(gpu_dyn_inst);
+ Wavefront *wf = gpu_dyn_inst->wavefront();
+ DPRINTF(GPUSched, "Exec[%d]: SIMD[%d] WV[%d]: %s\n",
+ unitId, wf->simdId, wf->wfDynId,
+ gpu_dyn_inst->disassemble());
+ DPRINTF(GPUSched, "dispatchList[%d] EXREADY->EMPTY\n", unitId);
+ wf->exec();
+ (computeUnit.scheduleStage).deleteFromSch(wf);
+ fromSchedule.dispatchTransition(unitId, EMPTY);
+ wf->freeResources();
+ break;
+ }
+ case SKIP:
+ {
+ collectStatistics(BusyExec, unitId);
+ GPUDynInstPtr &gpu_dyn_inst = fromSchedule.readyInst(unitId);
+ assert(gpu_dyn_inst);
+ Wavefront *wf = gpu_dyn_inst->wavefront();
+ DPRINTF(GPUSched, "dispatchList[%d] SKIP->EMPTY\n", unitId);
+ fromSchedule.dispatchTransition(unitId, EMPTY);
+ wf->freeResources();
+ break;
+ }
+ default:
panic("Unknown dispatch status in exec()\n");
}
}
#include "sim/stats.hh"
class ComputeUnit;
+class ScheduleToExecute;
class Wavefront;
+
struct ComputeUnitParams;
enum STAT_STATUS
class ExecStage
{
public:
- ExecStage(const ComputeUnitParams* p, ComputeUnit &cu);
+ ExecStage(const ComputeUnitParams* p, ComputeUnit &cu,
+ ScheduleToExecute &from_schedule);
~ExecStage() { }
void init();
void exec();
void collectStatistics(enum STAT_STATUS stage, int unitId);
void initStatistics();
ComputeUnit &computeUnit;
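+    // Interface from which this stage consumes the instructions
+    // dispatched by the schedule stage each cycle.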
+ ScheduleToExecute &fromSchedule;
- // List of waves which will be dispatched to
- // each execution resource. A FILLED implies
- // dispatch list is non-empty and
- // execution unit has something to execute
- // this cycle. Currently, the dispatch list of
- // an execution resource can hold only one wave because
- // an execution resource can execute only one wave in a cycle.
- // dispatchList is used to communicate between schedule
- // and exec stage
- std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> *dispatchList;
bool lastTimeInstExecuted;
bool thisTimeInstExecuted;
bool instrExecuted;
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"
-ScheduleStage::ScheduleStage(const ComputeUnitParams *p, ComputeUnit &cu)
- : computeUnit(cu), _name(cu.name() + ".ScheduleStage"),
+ScheduleStage::ScheduleStage(const ComputeUnitParams *p, ComputeUnit &cu,
+ ScoreboardCheckToSchedule &from_scoreboard_check,
+ ScheduleToExecute &to_execute)
+ : computeUnit(cu), fromScoreboardCheck(from_scoreboard_check),
+ toExecute(to_execute),
+ _name(cu.name() + ".ScheduleStage"),
vectorAluRdy(false), scalarAluRdy(false), scalarMemBusRdy(false),
scalarMemIssueRdy(false), glbMemBusRdy(false), glbMemIssueRdy(false),
locMemBusRdy(false), locMemIssueRdy(false)
ScheduleStage::init()
{
- fatal_if(scheduler.size() != computeUnit.readyList.size(),
+ fatal_if(scheduler.size() != fromScoreboardCheck.numReadyLists(),
"Scheduler should have same number of entries as CU's readyList");
for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
- scheduler[j].bindList(&computeUnit.readyList[j]);
+ scheduler[j].bindList(&fromScoreboardCheck.readyWFs(j));
}
- dispatchList = &computeUnit.dispatchList;
-
assert(computeUnit.numVectorGlobalMemUnits == 1);
assert(computeUnit.numVectorSharedMemUnits == 1);
}
void
ScheduleStage::exec()
{
+ toExecute.reset();
+
// Update readyList
for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
- // delete all ready wavefronts whose instruction buffers are now
- // empty because the last instruction was executed
- computeUnit.updateReadyList(j);
/**
* Remove any wave that already has an instruction present in SCH
* waiting for RF reads to complete. This prevents out of order
* execution within a wave.
*/
- for (auto wIt = computeUnit.readyList.at(j).begin();
- wIt != computeUnit.readyList.at(j).end();) {
+ fromScoreboardCheck.updateReadyList(j);
+ for (auto wIt = fromScoreboardCheck.readyWFs(j).begin();
+ wIt != fromScoreboardCheck.readyWFs(j).end();) {
if (wavesInSch.find((*wIt)->wfDynId) != wavesInSch.end()) {
*wIt = nullptr;
- wIt = computeUnit.readyList.at(j).erase(wIt);
+ wIt = fromScoreboardCheck.readyWFs(j).erase(wIt);
} else {
wIt++;
}
int firstMemUnit = computeUnit.firstMemUnit();
int lastMemUnit = computeUnit.lastMemUnit();
for (int j = firstMemUnit; j <= lastMemUnit; j++) {
- int readyListSize = computeUnit.readyList[j].size();
+ int readyListSize = fromScoreboardCheck.readyWFs(j).size();
// If no wave is ready to be scheduled on the execution resource
// then skip scheduling for this execution resource
if (!readyListSize) {
rdyListNotEmpty[j]++;
// Pick a wave and attempt to add it to schList
- Wavefront *w = scheduler[j].chooseWave();
- if (!addToSchList(j, w)) {
+ Wavefront *wf = scheduler[j].chooseWave();
+ GPUDynInstPtr &gpu_dyn_inst = wf->instructionBuffer.front();
+ assert(gpu_dyn_inst);
+ if (!addToSchList(j, gpu_dyn_inst)) {
// For waves not added to schList, increment count of cycles
// this wave spends in SCH stage.
- w->schCycles++;
+ wf->schCycles++;
addToSchListStalls[j]++;
}
}
if (j >= firstMemUnit && j <= lastMemUnit) {
continue;
}
- int readyListSize = computeUnit.readyList[j].size();
+ int readyListSize = fromScoreboardCheck.readyWFs(j).size();
// If no wave is ready to be scheduled on the execution resource
// then skip scheduling for this execution resource
if (!readyListSize) {
rdyListNotEmpty[j]++;
// Pick a wave and attempt to add it to schList
- Wavefront *w = scheduler[j].chooseWave();
- if (!addToSchList(j, w)) {
+ Wavefront *wf = scheduler[j].chooseWave();
+ GPUDynInstPtr &gpu_dyn_inst = wf->instructionBuffer.front();
+ assert(gpu_dyn_inst);
+ if (!addToSchList(j, gpu_dyn_inst)) {
// For waves not added to schList, increment count of cycles
// this wave spends in SCH stage.
- w->schCycles++;
+ wf->schCycles++;
addToSchListStalls[j]++;
}
}
void
ScheduleStage::doDispatchListTransition(int unitId, DISPATCH_STATUS s,
- Wavefront *w)
+ const GPUDynInstPtr &gpu_dyn_inst)
{
- dispatchList->at(unitId).first = w;
- dispatchList->at(unitId).second = s;
+ toExecute.dispatchTransition(gpu_dyn_inst, unitId, s);
+}
+
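+// Transition a dispatch slot's status without assigning an
+// instruction, e.g., to return the slot to EMPTY when its wave fails
+// to schedule RF write resources.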
+void
+ScheduleStage::doDispatchListTransition(int unitId, DISPATCH_STATUS s)
+{
+ toExecute.dispatchTransition(unitId, s);
}
bool
-ScheduleStage::schedRfWrites(int exeType, Wavefront *w)
+ScheduleStage::schedRfWrites(int exeType, const GPUDynInstPtr &gpu_dyn_inst)
{
- GPUDynInstPtr ii = w->instructionBuffer.front();
- assert(ii);
+ assert(gpu_dyn_inst);
+ Wavefront *wf = gpu_dyn_inst->wavefront();
bool accessVrfWr = true;
- if (!ii->isScalar()) {
- accessVrfWr =
- computeUnit.vrf[w->simdId]->canScheduleWriteOperands(w, ii);
+ if (!gpu_dyn_inst->isScalar()) {
+ accessVrfWr = computeUnit.vrf[wf->simdId]
+ ->canScheduleWriteOperands(wf, gpu_dyn_inst);
}
- bool accessSrfWr =
- computeUnit.srf[w->simdId]->canScheduleWriteOperands(w, ii);
+ bool accessSrfWr = computeUnit.srf[wf->simdId]
+ ->canScheduleWriteOperands(wf, gpu_dyn_inst);
bool accessRf = accessVrfWr && accessSrfWr;
if (accessRf) {
- if (!ii->isScalar()) {
- computeUnit.vrf[w->simdId]->scheduleWriteOperands(w, ii);
+ if (!gpu_dyn_inst->isScalar()) {
+ computeUnit.vrf[wf->simdId]->scheduleWriteOperands(wf,
+ gpu_dyn_inst);
}
- computeUnit.srf[w->simdId]->scheduleWriteOperands(w, ii);
+ computeUnit.srf[wf->simdId]->scheduleWriteOperands(wf, gpu_dyn_inst);
return true;
} else {
rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
}
// Increment stall counts for WF
- w->schStalls++;
- w->schRfAccessStalls++;
+ wf->schStalls++;
+ wf->schRfAccessStalls++;
}
return false;
}
ScheduleStage::scheduleRfDestOperands()
{
for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
- if (dispatchList->at(j).second == EMPTY ||
- dispatchList->at(j).second == SKIP) {
+ if (toExecute.dispatchStatus(j) == EMPTY ||
+ toExecute.dispatchStatus(j) == SKIP) {
continue;
}
- assert(dispatchList->at(j).first);
-
// get the wave on dispatch list and attempt to allocate write
// resources in the RFs
- Wavefront *w = dispatchList->at(j).first;
- if (!schedRfWrites(j, w)) {
- reinsertToSchList(j, w);
+ const GPUDynInstPtr &gpu_dyn_inst = toExecute.readyInst(j);
+ assert(gpu_dyn_inst);
+ Wavefront *wf = gpu_dyn_inst->wavefront();
+ if (!schedRfWrites(j, gpu_dyn_inst)) {
+ reinsertToSchList(j, gpu_dyn_inst);
doDispatchListTransition(j, EMPTY);
// if this is a flat inst, also transition the LM pipe to empty
// Note: since FLAT/LM arbitration occurs before scheduling
// instruction lost arbitration, but would have been able to
// pass the RF destination operand check here, and execute
// instead of the FLAT.
- if (w->instructionBuffer.front()->isFlat()) {
- assert(dispatchList->at(w->localMem).second == SKIP);
- doDispatchListTransition(w->localMem, EMPTY);
+ if (wf->instructionBuffer.front()->isFlat()) {
+                assert(toExecute.dispatchStatus(wf->localMem) == SKIP);
+ doDispatchListTransition(wf->localMem, EMPTY);
}
}
}
}
bool
-ScheduleStage::addToSchList(int exeType, Wavefront *w)
+ScheduleStage::addToSchList(int exeType, const GPUDynInstPtr &gpu_dyn_inst)
{
// Attempt to add the wave to the schList if the VRF can support the
// wave's next instruction
- GPUDynInstPtr ii = w->instructionBuffer.front();
- assert(ii);
+ assert(gpu_dyn_inst);
+ Wavefront *wf = gpu_dyn_inst->wavefront();
bool accessVrf = true;
- if (!ii->isScalar()) {
- accessVrf =
- computeUnit.vrf[w->simdId]->canScheduleReadOperands(w, ii);
+ if (!gpu_dyn_inst->isScalar()) {
+ accessVrf = computeUnit.vrf[wf->simdId]
+ ->canScheduleReadOperands(wf, gpu_dyn_inst);
}
- bool accessSrf =
- computeUnit.srf[w->simdId]->canScheduleReadOperands(w, ii);
+ bool accessSrf = computeUnit.srf[wf->simdId]
+ ->canScheduleReadOperands(wf, gpu_dyn_inst);
// If RFs can support instruction, add to schList in RFBUSY state,
// place wave in wavesInSch and pipeMap, and schedule Rd/Wr operands
// to the VRF
bool accessRf = accessVrf && accessSrf;
if (accessRf) {
DPRINTF(GPUSched, "schList[%d]: Adding: SIMD[%d] WV[%d]: %d: %s\n",
- exeType, w->simdId, w->wfDynId,
- ii->seqNum(), ii->disassemble());
+ exeType, wf->simdId, wf->wfDynId,
+ gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
- computeUnit.insertInPipeMap(w);
- wavesInSch.emplace(w->wfDynId);
- schList.at(exeType).push_back(std::make_pair(w, RFBUSY));
- if (w->isOldestInstWaitcnt()) {
- w->setStatus(Wavefront::S_WAITCNT);
+ computeUnit.insertInPipeMap(wf);
+ wavesInSch.emplace(wf->wfDynId);
+ schList.at(exeType).push_back(std::make_pair(gpu_dyn_inst, RFBUSY));
+ if (wf->isOldestInstWaitcnt()) {
+ wf->setStatus(Wavefront::S_WAITCNT);
}
- if (!ii->isScalar()) {
- computeUnit.vrf[w->simdId]->scheduleReadOperands(w, ii);
+ if (!gpu_dyn_inst->isScalar()) {
+ computeUnit.vrf[wf->simdId]
+ ->scheduleReadOperands(wf, gpu_dyn_inst);
}
- computeUnit.srf[w->simdId]->scheduleReadOperands(w, ii);
+ computeUnit.srf[wf->simdId]->scheduleReadOperands(wf, gpu_dyn_inst);
DPRINTF(GPUSched, "schList[%d]: Added: SIMD[%d] WV[%d]: %d: %s\n",
- exeType, w->simdId, w->wfDynId,
- ii->seqNum(), ii->disassemble());
+ exeType, wf->simdId, wf->wfDynId,
+ gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
return true;
} else {
// Number of stall cycles due to RF access denied
}
// Increment stall counts for WF
- w->schStalls++;
- w->schRfAccessStalls++;
+ wf->schStalls++;
+ wf->schRfAccessStalls++;
DPRINTF(GPUSched, "schList[%d]: Could not add: "
"SIMD[%d] WV[%d]: %d: %s\n",
- exeType, w->simdId, w->wfDynId,
- ii->seqNum(), ii->disassemble());
+ exeType, wf->simdId, wf->wfDynId,
+ gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
}
return false;
}
void
-ScheduleStage::reinsertToSchList(int exeType, Wavefront *w)
+ScheduleStage::reinsertToSchList(int exeType,
+ const GPUDynInstPtr &gpu_dyn_inst)
{
    // Insert the instruction into schList for the specified exeType.
    // Entries are kept in age order, with the oldest wave's
    // instruction at the front of schList.
+ assert(gpu_dyn_inst);
auto schIter = schList.at(exeType).begin();
while (schIter != schList.at(exeType).end()
- && schIter->first->wfDynId < w->wfDynId) {
+ && schIter->first->wfDynId < gpu_dyn_inst->wfDynId) {
schIter++;
}
- schList.at(exeType).insert(schIter, std::make_pair(w, RFREADY));
+ schList.at(exeType).insert(schIter, std::make_pair(gpu_dyn_inst, RFREADY));
}
void
}
bool
-ScheduleStage::dispatchReady(Wavefront *w)
+ScheduleStage::dispatchReady(const GPUDynInstPtr &gpu_dyn_inst)
{
+ assert(gpu_dyn_inst);
+ Wavefront *wf = gpu_dyn_inst->wavefront();
vectorAluRdy = false;
scalarAluRdy = false;
// check for available vector/scalar ALUs in the next cycle
- if (computeUnit.vectorALUs[w->simdId].rdy(Cycles(1))) {
+ if (computeUnit.vectorALUs[wf->simdId].rdy(Cycles(1))) {
vectorAluRdy = true;
}
- if (computeUnit.scalarALUs[w->scalarAlu].rdy(Cycles(1))) {
+ if (computeUnit.scalarALUs[wf->scalarAlu].rdy(Cycles(1))) {
scalarAluRdy = true;
}
- GPUDynInstPtr ii = w->instructionBuffer.front();
- if (ii->isNop()) {
+ if (gpu_dyn_inst->isNop()) {
// S_NOP requires SALU. V_NOP requires VALU.
// TODO: Scalar NOP does not require SALU in hardware,
// and is executed out of IB directly.
- if (ii->isScalar() && !scalarAluRdy) {
+ if (gpu_dyn_inst->isScalar() && !scalarAluRdy) {
dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
return false;
- } else if (!ii->isScalar() && !vectorAluRdy) {
+ } else if (!gpu_dyn_inst->isScalar() && !vectorAluRdy) {
dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
return false;
}
- } else if (ii->isEndOfKernel()) {
+ } else if (gpu_dyn_inst->isEndOfKernel()) {
// EndPgm instruction
- if (ii->isScalar() && !scalarAluRdy) {
+ if (gpu_dyn_inst->isScalar() && !scalarAluRdy) {
dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
return false;
}
- } else if (ii->isBarrier() || ii->isBranch() || ii->isALU()) {
+ } else if (gpu_dyn_inst->isBarrier() || gpu_dyn_inst->isBranch()
+ || gpu_dyn_inst->isALU()) {
// Barrier, Branch, or ALU instruction
- if (ii->isScalar() && !scalarAluRdy) {
+ if (gpu_dyn_inst->isScalar() && !scalarAluRdy) {
dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
return false;
- } else if (!ii->isScalar() && !vectorAluRdy) {
+ } else if (!gpu_dyn_inst->isScalar() && !vectorAluRdy) {
dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
return false;
}
- } else if (!ii->isScalar() && ii->isGlobalMem()) {
+ } else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isGlobalMem()) {
// Vector Global Memory instruction
bool rdy = true;
if (!glbMemIssueRdy) {
rdy = false;
dispNrdyStalls[SCH_VECTOR_MEM_BUS_BUSY_NRDY]++;
}
- if (!computeUnit.globalMemoryPipe.coalescerReady(ii)) {
+ if (!computeUnit.globalMemoryPipe.coalescerReady(gpu_dyn_inst)) {
rdy = false;
dispNrdyStalls[SCH_VECTOR_MEM_COALESCER_NRDY]++;
}
- if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(ii)) {
+ if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(gpu_dyn_inst)) {
rdy = false;
dispNrdyStalls[SCH_VECTOR_MEM_REQS_NRDY]++;
}
if (!rdy) {
return false;
}
- } else if (ii->isScalar() && ii->isGlobalMem()) {
+ } else if (gpu_dyn_inst->isScalar() && gpu_dyn_inst->isGlobalMem()) {
// Scalar Global Memory instruction
bool rdy = true;
if (!scalarMemIssueRdy) {
rdy = false;
dispNrdyStalls[SCH_SCALAR_MEM_BUS_BUSY_NRDY]++;
}
- if (!computeUnit.scalarMemoryPipe.
- isGMReqFIFOWrRdy(w->scalarRdGmReqsInPipe +
- w->scalarWrGmReqsInPipe)) {
+        if (!computeUnit.scalarMemoryPipe
+            .isGMReqFIFOWrRdy(wf->scalarRdGmReqsInPipe
+            + wf->scalarWrGmReqsInPipe)) {
rdy = false;
dispNrdyStalls[SCH_SCALAR_MEM_FIFO_NRDY]++;
}
if (!rdy) {
return false;
}
- } else if (!ii->isScalar() && ii->isLocalMem()) {
+ } else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isLocalMem()) {
// Vector Local Memory instruction
bool rdy = true;
        if (!locMemIssueRdy) {
            rdy = false;
            dispNrdyStalls[SCH_LOCAL_MEM_BUS_BUSY_NRDY]++;
}
if (!computeUnit.localMemoryPipe.
- isLMReqFIFOWrRdy(w->rdLmReqsInPipe + w->wrLmReqsInPipe)) {
+ isLMReqFIFOWrRdy(wf->rdLmReqsInPipe + wf->wrLmReqsInPipe)) {
rdy = false;
dispNrdyStalls[SCH_LOCAL_MEM_FIFO_NRDY]++;
}
if (!rdy) {
return false;
}
- } else if (!ii->isScalar() && ii->isFlat()) {
+ } else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isFlat()) {
// Vector Flat memory instruction
bool rdy = true;
if (!glbMemIssueRdy || !locMemIssueRdy) {
rdy = false;
dispNrdyStalls[SCH_FLAT_MEM_BUS_BUSY_NRDY]++;
}
- if (!computeUnit.globalMemoryPipe.coalescerReady(ii)) {
+ if (!computeUnit.globalMemoryPipe.coalescerReady(gpu_dyn_inst)) {
rdy = false;
dispNrdyStalls[SCH_FLAT_MEM_COALESCER_NRDY]++;
}
- if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(ii)) {
+ if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(gpu_dyn_inst)) {
rdy = false;
dispNrdyStalls[SCH_FLAT_MEM_REQS_NRDY]++;
}
if (!computeUnit.localMemoryPipe.
- isLMReqFIFOWrRdy(w->rdLmReqsInPipe + w->wrLmReqsInPipe)) {
+ isLMReqFIFOWrRdy(wf->rdLmReqsInPipe + wf->wrLmReqsInPipe)) {
rdy = false;
dispNrdyStalls[SCH_FLAT_MEM_FIFO_NRDY]++;
}
return false;
}
} else {
- panic("%s: unknown instr checked for readiness", ii->disassemble());
+ panic("%s: unknown instr checked for readiness",
+ gpu_dyn_inst->disassemble());
return false;
}
dispNrdyStalls[SCH_RDY]++;
checkMemResources();
// iterate execution resources
for (int j = 0; j < computeUnit.numExeUnits(); j++) {
- assert(dispatchList->at(j).second == EMPTY);
+ assert(toExecute.dispatchStatus(j) == EMPTY);
// iterate waves in schList to pick one for dispatch
auto schIter = schList.at(j).begin();
// Acquire a coalescer token if it is a global mem
// operation.
- GPUDynInstPtr mp = schIter->first->
- instructionBuffer.front();
+ GPUDynInstPtr mp = schIter->first;
if (!mp->isMemSync() && !mp->isScalar() &&
(mp->isGlobalMem() || mp->isFlat())) {
computeUnit.globalMemoryPipe.acqCoalescerToken(mp);
} else {
// Either another wave has been dispatched, or this wave
// was not ready, so it is stalled this cycle
- schIter->first->schStalls++;
+ schIter->first->wavefront()->schStalls++;
if (!dispRdy) {
// not ready for dispatch, increment stall stat
- schIter->first->schResourceStalls++;
+ schIter->first->wavefront()->schResourceStalls++;
}
// Examine next wave for this resource
schIter++;
// get the GM pipe index in the dispatchList
int gm_exe_unit = computeUnit.firstMemUnit() + i;
// get the wave in the dispatchList
- Wavefront *w = dispatchList->at(gm_exe_unit).first;
+ GPUDynInstPtr &gpu_dyn_inst
+ = toExecute.readyInst(gm_exe_unit);
// If the WF is valid, ready to execute, and the instruction
// is a flat access, arbitrate with the WF's assigned LM pipe
- if (w && dispatchList->at(gm_exe_unit).second == EXREADY &&
- w->instructionBuffer.front()->isFlat()) {
+            if (gpu_dyn_inst &&
+                toExecute.dispatchStatus(gm_exe_unit) == EXREADY &&
+                gpu_dyn_inst->isFlat()) {
+ Wavefront *wf = gpu_dyn_inst->wavefront();
// If the associated LM pipe also has a wave selected, block
// that wave and let the Flat instruction issue. The WF in the
// LM pipe is added back to the schList for consideration next
// cycle.
- if (dispatchList->at(w->localMem).second == EXREADY) {
- reinsertToSchList(w->localMem,
- dispatchList->at(w->localMem).first);
+ if (toExecute.dispatchStatus(wf->localMem) == EXREADY) {
+                    reinsertToSchList(wf->localMem,
+                                      toExecute.readyInst(wf->localMem));
// Increment stall stats for LDS-VRF arbitration
ldsBusArbStalls++;
- dispatchList->at(w->localMem).first->schLdsArbStalls++;
+ toExecute.readyInst(wf->localMem)
+ ->wavefront()->schLdsArbStalls++;
}
// With arbitration of LM pipe complete, transition the
// LM pipe to SKIP state in the dispatchList to inform EX stage
// that a Flat instruction is executing next cycle
- doDispatchListTransition(w->localMem, SKIP, w);
+ doDispatchListTransition(wf->localMem, SKIP, gpu_dyn_inst);
DPRINTF(GPUSched, "dispatchList[%d]: arbVrfLds: "
- "EXREADY->SKIP\n", w->localMem);
+ "EXREADY->SKIP\n", wf->localMem);
}
}
}
// selection for dispatchList
for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
for (auto &p : schList.at(j)) {
- Wavefront *w = p.first;
- assert(w);
+ const GPUDynInstPtr &gpu_dyn_inst = p.first;
+ assert(gpu_dyn_inst);
+ Wavefront *wf = gpu_dyn_inst->wavefront();
// Increment the number of cycles the wave spends in the
// SCH stage, since this loop visits every wave in SCH.
- w->schCycles++;
+ wf->schCycles++;
- GPUDynInstPtr ii = w->instructionBuffer.front();
bool vrfRdy = true;
- if (!ii->isScalar()) {
- vrfRdy =
- computeUnit.vrf[w->simdId]->operandReadComplete(w, ii);
+ if (!gpu_dyn_inst->isScalar()) {
+ vrfRdy = computeUnit.vrf[wf->simdId]
+ ->operandReadComplete(wf, gpu_dyn_inst);
}
- bool srfRdy =
- computeUnit.srf[w->simdId]->operandReadComplete(w, ii);
+ bool srfRdy = computeUnit.srf[wf->simdId]
+ ->operandReadComplete(wf, gpu_dyn_inst);
bool operandsReady = vrfRdy && srfRdy;
if (operandsReady) {
- DPRINTF(GPUSched,
- "schList[%d]: WV[%d] operands ready for: %d: %s\n",
- j, w->wfDynId, ii->seqNum(), ii->disassemble());
+ DPRINTF(GPUSched, "schList[%d]: WV[%d] operands ready for: "
+ "%d: %s\n", j, wf->wfDynId, gpu_dyn_inst->seqNum(),
+ gpu_dyn_inst->disassemble());
DPRINTF(GPUSched, "schList[%d]: WV[%d] RFBUSY->RFREADY\n",
- j, w->wfDynId);
+ j, wf->wfDynId);
p.second = RFREADY;
} else {
- DPRINTF(GPUSched,
- "schList[%d]: WV[%d] operands not ready for: %d: %s\n",
- j, w->wfDynId, ii->seqNum(), ii->disassemble());
+ DPRINTF(GPUSched, "schList[%d]: WV[%d] operands not ready "
+ "for: %d: %s\n", j, wf->wfDynId,
+ gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
// operands not ready yet, increment SCH stage stats
// aggregate to all wavefronts on the CU
p.second = RFBUSY;
// Increment stall stats
- w->schStalls++;
- w->schOpdNrdyStalls++;
+ wf->schStalls++;
+ wf->schOpdNrdyStalls++;
opdNrdyStalls[SCH_RF_OPD_NRDY]++;
if (!vrfRdy) {
exeUnitReservations.resize(computeUnit.numExeUnits(), false);
for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
- Wavefront *dispatchedWave = dispatchList->at(j).first;
- if (dispatchedWave) {
- DISPATCH_STATUS s = dispatchList->at(j).second;
+ GPUDynInstPtr &gpu_dyn_inst = toExecute.readyInst(j);
+ if (gpu_dyn_inst) {
+ DISPATCH_STATUS s = toExecute.dispatchStatus(j);
+ Wavefront *wf = gpu_dyn_inst->wavefront();
if (s == EMPTY) {
continue;
} else if (s == EXREADY) {
// Wave is ready for execution
- std::vector<int> execUnitIds =
- dispatchedWave->reserveResources();
- GPUDynInstPtr ii = dispatchedWave->instructionBuffer.front();
+ std::vector<int> execUnitIds = wf->reserveResources();
- if (!ii->isScalar()) {
- computeUnit.vrf[dispatchedWave->simdId]->
- dispatchInstruction(ii);
+ if (!gpu_dyn_inst->isScalar()) {
+ computeUnit.vrf[wf->simdId]
+ ->dispatchInstruction(gpu_dyn_inst);
}
- computeUnit.srf[dispatchedWave->simdId]->
- dispatchInstruction(ii);
+ computeUnit.srf[wf->simdId]->dispatchInstruction(gpu_dyn_inst);
std::stringstream ss;
for (auto id : execUnitIds) {
}
DPRINTF(GPUSched, "dispatchList[%d]: SIMD[%d] WV[%d]: %d: %s"
" Reserving ExeRes[ %s]\n",
- j, dispatchedWave->simdId, dispatchedWave->wfDynId,
- ii->seqNum(), ii->disassemble(), ss.str());
+ j, wf->simdId, wf->wfDynId, gpu_dyn_inst->seqNum(),
+ gpu_dyn_inst->disassemble(), ss.str());
// mark the resources as reserved for this cycle
for (auto execUnitId : execUnitIds) {
panic_if(exeUnitReservations.at(execUnitId),
"Execution unit %d is reserved!!!\n"
"SIMD[%d] WV[%d]: %d: %s",
- execUnitId, dispatchedWave->simdId,
- dispatchedWave->wfDynId,
- ii->seqNum(), ii->disassemble());
+ execUnitId, wf->simdId, wf->wfDynId,
+ gpu_dyn_inst->seqNum(),
+ gpu_dyn_inst->disassemble());
exeUnitReservations.at(execUnitId) = true;
}
// that we've reserved a global and local memory unit. Thus,
// we need to mark the latter execution unit as not available.
if (execUnitIds.size() > 1) {
- int lm_exec_unit M5_VAR_USED = dispatchedWave->localMem;
- assert(dispatchList->at(lm_exec_unit).second == SKIP);
+ int lm_exec_unit M5_VAR_USED = wf->localMem;
+                assert(toExecute.dispatchStatus(lm_exec_unit) == SKIP);
}
} else if (s == SKIP) {
// Shared Memory pipe reserved for FLAT instruction.
// Verify the GM pipe for this wave is ready to execute
// and the wave in the GM pipe is the same as the wave
// in the LM pipe
- int gm_exec_unit M5_VAR_USED = dispatchedWave->globalMem;
- assert(dispatchList->at(gm_exec_unit).first->wfDynId ==
- dispatchedWave->wfDynId);
- assert(dispatchList->at(gm_exec_unit).second == EXREADY);
+ int gm_exec_unit M5_VAR_USED = wf->globalMem;
+            assert(wf->wfDynId ==
+                   toExecute.readyInst(gm_exec_unit)->wfDynId);
+            assert(toExecute.dispatchStatus(gm_exec_unit) == EXREADY);
}
}
}
#include <vector>
#include "gpu-compute/exec_stage.hh"
+#include "gpu-compute/misc.hh"
#include "gpu-compute/scheduler.hh"
-#include "gpu-compute/scoreboard_check_stage.hh"
// Schedule or execution arbitration stage.
// From the pool of ready waves in the ready list,
// The selection is made based on a scheduling policy
class ComputeUnit;
+class ScheduleToExecute;
+class ScoreboardCheckToSchedule;
class Wavefront;
struct ComputeUnitParams;
class ScheduleStage
{
public:
- ScheduleStage(const ComputeUnitParams *p, ComputeUnit &cu);
+ ScheduleStage(const ComputeUnitParams *p, ComputeUnit &cu,
+ ScoreboardCheckToSchedule &from_scoreboard_check,
+ ScheduleToExecute &to_execute);
~ScheduleStage();
void init();
void exec();
private:
ComputeUnit &computeUnit;
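+    // Interfaces to the adjacent pipeline stages: ready WF lists
+    // coming in from the scoreboard check stage, and dispatched
+    // instructions going out to the execute stage.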
+ ScoreboardCheckToSchedule &fromScoreboardCheck;
+ ScheduleToExecute &toExecute;
+
// Each execution resource will have its own
// scheduler and a dispatch list
std::vector<Scheduler> scheduler;
- // List of waves which will be dispatched to
- // each execution resource.
- // Currently, the dispatch list of
- // an execution resource can hold only one wave because
- // an execution resource can execute only one wave in a cycle.
- std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> *dispatchList;
-
// Stats
// Number of cycles with empty (or not empty) readyList, per execution
const std::string _name;
// called by exec() to add a wave to schList if the RFs can support it
- bool addToSchList(int exeType, Wavefront *w);
+ bool addToSchList(int exeType, const GPUDynInstPtr &gpu_dyn_inst);
// re-insert a wave to schList if wave lost arbitration
// wave is inserted such that age order (oldest to youngest) is preserved
- void reinsertToSchList(int exeType, Wavefront *w);
+ void reinsertToSchList(int exeType, const GPUDynInstPtr &gpu_dyn_inst);
// check waves in schList to see if RF reads complete
void checkRfOperandReadComplete();
// check execution resources for readiness
// check status of memory pipes and RF to Mem buses
void checkMemResources();
// resource ready check called by fillDispatchList
- bool dispatchReady(Wavefront *w);
+ bool dispatchReady(const GPUDynInstPtr &gpu_dyn_inst);
// pick waves from schList and populate dispatchList with one wave
// per EXE resource type
void fillDispatchList();
// dispatchList
void scheduleRfDestOperands();
// invoked by scheduleRfDestOperands to schedule RF writes for a wave
- bool schedRfWrites(int exeType, Wavefront *w);
+ bool schedRfWrites(int exeType, const GPUDynInstPtr &gpu_dyn_inst);
// reserve resources for waves surviving arbitration in dispatchList
void reserveResources();
void doDispatchListTransition(int unitId, DISPATCH_STATUS s,
- Wavefront *w = nullptr);
+ const GPUDynInstPtr &gpu_dyn_inst);
+ void doDispatchListTransition(int unitId, DISPATCH_STATUS s);
// Set tracking wfDynId for each wave present in schedule stage
// Used to allow only one instruction per wave in schedule
// The maximum number of waves per resource can be determined by either
// the VRF/SRF availability or limits imposed by paremeters (to be added)
// of the SCH stage or CU.
- std::vector<std::deque<std::pair<Wavefront*, SCH_STATUS>>> schList;
+ std::vector<std::deque<std::pair<GPUDynInstPtr, SCH_STATUS>>> schList;
};
#endif // __SCHEDULE_STAGE_HH__
#include "params/ComputeUnit.hh"
ScoreboardCheckStage::ScoreboardCheckStage(const ComputeUnitParams *p,
- ComputeUnit &cu)
- : computeUnit(cu), _name(cu.name() + ".ScoreboardCheckStage")
+ ComputeUnit &cu,
+ ScoreboardCheckToSchedule
+ &to_schedule)
+ : computeUnit(cu), toSchedule(to_schedule),
+ _name(cu.name() + ".ScoreboardCheckStage")
{
}
ScoreboardCheckStage::~ScoreboardCheckStage()
{
- readyList.clear();
-}
-
-void
-ScoreboardCheckStage::init()
-{
- for (int unitId = 0; unitId < computeUnit.numExeUnits(); ++unitId) {
- readyList.push_back(&computeUnit.readyList[unitId]);
- }
}
void
void
ScoreboardCheckStage::exec()
{
- // reset the ready list for all execution units; it will be
- // constructed every cycle since resource availability may change
- for (int unitId = 0; unitId < computeUnit.numExeUnits(); ++unitId) {
- // Reset wavefront pointers to nullptr so clear() on the vector
- // does not accidentally destruct the wavefront object
- for (int i = 0; i < readyList[unitId]->size(); i++) {
- readyList[unitId]->at(i) = nullptr;
- }
- readyList[unitId]->clear();
- }
- // iterate over all WF slots across all vector ALUs
+ /**
+ * Reset the ready list for all execution units; ready list will be
+ * constructed every cycle because resource availability may change.
+ */
+ toSchedule.reset();
+
+ // Iterate over all WF slots across all SIMDs.
for (int simdId = 0; simdId < computeUnit.numVectorALUs; ++simdId) {
for (int wfSlot = 0; wfSlot < computeUnit.shader->n_wf; ++wfSlot) {
// reset the ready status of each wavefront
curWave->simdId, curWave->wfDynId,
curWave->nextInstr()->seqNum(),
curWave->nextInstr()->disassemble());
- readyList.at(exeResType)->push_back(curWave);
+ toSchedule.markWFReady(curWave, exeResType);
}
collectStatistics(rdyStatus);
}
#include "sim/stats.hh"
class ComputeUnit;
+class ScoreboardCheckToSchedule;
class Wavefront;
struct ComputeUnitParams;
NRDY_CONDITIONS
};
- ScoreboardCheckStage(const ComputeUnitParams* p, ComputeUnit &cu);
+ ScoreboardCheckStage(const ComputeUnitParams* p, ComputeUnit &cu,
+ ScoreboardCheckToSchedule &to_schedule);
~ScoreboardCheckStage();
- void init();
void exec();
// Stats related variables and methods
int *exeResType, int wfSlot);
ComputeUnit &computeUnit;
- // List of waves which are ready to be scheduled.
- // Each execution resource has a ready list
- std::vector<std::vector<Wavefront*>*> readyList;
+ /**
+ * Interface between scoreboard check and schedule stages. Each
+ * cycle the scoreboard check stage populates this interface with
+ * information needed by the schedule stage.
+ */
+ ScoreboardCheckToSchedule &toSchedule;
// Stats
Stats::Vector stallCycles;