* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
- * 3. Neither the name of the copyright holder nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
- *
- * Author: Sooraj Puthoor
*/
#include "gpu-compute/schedule_stage.hh"
+#include <unordered_set>
+
+#include "debug/GPUSched.hh"
+#include "debug/GPUVRF.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_static_inst.hh"
+#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"
-ScheduleStage::ScheduleStage(const ComputeUnitParams *p)
- : numSIMDs(p->num_SIMDs),
- numMemUnits(p->num_global_mem_pipes + p->num_shared_mem_pipes)
+ScheduleStage::ScheduleStage(const ComputeUnitParams *p, ComputeUnit &cu,
+ ScoreboardCheckToSchedule &from_scoreboard_check,
+ ScheduleToExecute &to_execute)
+ : computeUnit(cu), fromScoreboardCheck(from_scoreboard_check),
+ toExecute(to_execute),
+ _name(cu.name() + ".ScheduleStage"),
+ vectorAluRdy(false), scalarAluRdy(false), scalarMemBusRdy(false),
+ scalarMemIssueRdy(false), glbMemBusRdy(false), glbMemIssueRdy(false),
+ locMemBusRdy(false), locMemIssueRdy(false)
{
- for (int j = 0; j < numSIMDs + numMemUnits; ++j) {
- Scheduler newScheduler(p);
- scheduler.push_back(newScheduler);
+ for (int j = 0; j < cu.numExeUnits(); ++j) {
+ scheduler.emplace_back(p);
+ }
+ wavesInSch.clear();
+ schList.resize(cu.numExeUnits());
+ for (auto &dq : schList) {
+ dq.clear();
}
}
ScheduleStage::~ScheduleStage()
{
scheduler.clear();
- waveStatusList.clear();
+ wavesInSch.clear();
+ schList.clear();
+}
+
+void
+ScheduleStage::init()
+{
+ fatal_if(scheduler.size() != fromScoreboardCheck.numReadyLists(),
+ "Scheduler should have same number of entries as CU's readyList");
+ for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
+ scheduler[j].bindList(&fromScoreboardCheck.readyWFs(j));
+ }
+
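+    // The SCH stage as modeled here assumes a single vector global memory
+    // pipe and a single vector shared (LDS) memory pipe; the asserts below
+    // document that assumption.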
+ assert(computeUnit.numVectorGlobalMemUnits == 1);
+ assert(computeUnit.numVectorSharedMemUnits == 1);
+}
+
+void
+ScheduleStage::exec()
+{
+ toExecute.reset();
+
+ // Update readyList
+ for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
+ /**
+ * Remove any wave that already has an instruction present in SCH
+         * waiting for RF reads to complete. This prevents out-of-order
+ * execution within a wave.
+ */
+ fromScoreboardCheck.updateReadyList(j);
+ for (auto wIt = fromScoreboardCheck.readyWFs(j).begin();
+ wIt != fromScoreboardCheck.readyWFs(j).end();) {
+ if (wavesInSch.find((*wIt)->wfDynId) != wavesInSch.end()) {
+ *wIt = nullptr;
+ wIt = fromScoreboardCheck.readyWFs(j).erase(wIt);
+ } else {
+ wIt++;
+ }
+ }
+ }
+
+    // Attempt to add another wave for each EXE type to the schList queues.
+    // VMEM resources are iterated first, effectively giving priority to
+    // VMEM over VALU for scheduling operand reads at the RFs. Scalar
+    // memory is iterated after VMEM.
+
+ // Iterate VMEM and SMEM
+ int firstMemUnit = computeUnit.firstMemUnit();
+ int lastMemUnit = computeUnit.lastMemUnit();
+ for (int j = firstMemUnit; j <= lastMemUnit; j++) {
+ int readyListSize = fromScoreboardCheck.readyWFs(j).size();
+ // If no wave is ready to be scheduled on the execution resource
+ // then skip scheduling for this execution resource
+ if (!readyListSize) {
+ rdyListEmpty[j]++;
+ continue;
+ }
+ rdyListNotEmpty[j]++;
+
+ // Pick a wave and attempt to add it to schList
+ Wavefront *wf = scheduler[j].chooseWave();
+ GPUDynInstPtr &gpu_dyn_inst = wf->instructionBuffer.front();
+ assert(gpu_dyn_inst);
+ if (!addToSchList(j, gpu_dyn_inst)) {
+ // For waves not added to schList, increment count of cycles
+ // this wave spends in SCH stage.
+ wf->schCycles++;
+ addToSchListStalls[j]++;
+ } else {
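+            // Bookkeeping of issued memory instruction counts, used for
+            // waitcnt handling: scalar and LDS (group-segment) accesses
+            // count as LGKM, other vector memory as VMEM, and a Flat
+            // access counts toward both.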
+ if (gpu_dyn_inst->isScalar() || gpu_dyn_inst->isGroupSeg()) {
+ wf->incLGKMInstsIssued();
+ } else {
+ wf->incVMemInstsIssued();
+ if (gpu_dyn_inst->isFlat()) {
+ wf->incLGKMInstsIssued();
+ }
+ }
+ }
+ }
+
+ // Iterate everything else
+ for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
+ // skip the VMEM resources
+ if (j >= firstMemUnit && j <= lastMemUnit) {
+ continue;
+ }
+ int readyListSize = fromScoreboardCheck.readyWFs(j).size();
+ // If no wave is ready to be scheduled on the execution resource
+ // then skip scheduling for this execution resource
+ if (!readyListSize) {
+ rdyListEmpty[j]++;
+ continue;
+ }
+ rdyListNotEmpty[j]++;
+
+ // Pick a wave and attempt to add it to schList
+ Wavefront *wf = scheduler[j].chooseWave();
+ GPUDynInstPtr &gpu_dyn_inst = wf->instructionBuffer.front();
+ assert(gpu_dyn_inst);
+ if (!addToSchList(j, gpu_dyn_inst)) {
+ // For waves not added to schList, increment count of cycles
+ // this wave spends in SCH stage.
+ wf->schCycles++;
+ addToSchListStalls[j]++;
+ }
+ }
+
+    // At this point, the schList queue for each EXE type may contain
+    // multiple waves, in age order (oldest to youngest). Waves may be in
+    // RFBUSY, indicating that they are waiting for their register reads to
+    // complete, or in RFREADY, indicating that they are candidates for the
+    // dispatchList and execution.
+
+ // Iterate schList queues and check if any of the waves have finished
+ // reading their operands, moving those waves to RFREADY status
+ checkRfOperandReadComplete();
+
+    // Fill the dispatch list with the oldest wave of each EXE type that
+    // is ready to execute. A wave is picked if its status in the schList
+    // is RFREADY and it passes resource-ready checks similar to those in
+    // the scoreboard check (SCB) stage.
+ fillDispatchList();
+
+    // Resource arbitration on waves in the dispatchList. Losing waves
+    // are re-inserted into the schList at a position determined by
+    // wave age.
+
+ // Arbitrate access to the VRF->LDS bus
+ arbitrateVrfToLdsBus();
+
+ // Schedule write operations to the register files
+ scheduleRfDestOperands();
+
+ // Lastly, reserve resources for waves that are ready to execute.
+ reserveResources();
+}
+
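+// Record a dispatch status transition for the given execution unit in the
+// ScheduleToExecute communication interface, optionally along with the
+// instruction being dispatched.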
+void
+ScheduleStage::doDispatchListTransition(int unitId, DISPATCH_STATUS s,
+ const GPUDynInstPtr &gpu_dyn_inst)
+{
+ toExecute.dispatchTransition(gpu_dyn_inst, unitId, s);
+}
+
+void
+ScheduleStage::doDispatchListTransition(int unitId, DISPATCH_STATUS s)
+{
+ toExecute.dispatchTransition(unitId, s);
+}
+
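+// Attempt to schedule write operands for the instruction in the VRF and
+// SRF. Returns true if both register files can accept the writes;
+// otherwise increments the relevant RF access stall counters and fails.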
+bool
+ScheduleStage::schedRfWrites(int exeType, const GPUDynInstPtr &gpu_dyn_inst)
+{
+ assert(gpu_dyn_inst);
+ Wavefront *wf = gpu_dyn_inst->wavefront();
+ bool accessVrfWr = true;
+ if (!gpu_dyn_inst->isScalar()) {
+ accessVrfWr = computeUnit.vrf[wf->simdId]
+ ->canScheduleWriteOperands(wf, gpu_dyn_inst);
+ }
+ bool accessSrfWr = computeUnit.srf[wf->simdId]
+ ->canScheduleWriteOperands(wf, gpu_dyn_inst);
+ bool accessRf = accessVrfWr && accessSrfWr;
+ if (accessRf) {
+ if (!gpu_dyn_inst->isScalar()) {
+ computeUnit.vrf[wf->simdId]->scheduleWriteOperands(wf,
+ gpu_dyn_inst);
+ }
+ computeUnit.srf[wf->simdId]->scheduleWriteOperands(wf, gpu_dyn_inst);
+ return true;
+ } else {
+ rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
+ if (!accessSrfWr) {
+ rfAccessStalls[SCH_SRF_WR_ACCESS_NRDY]++;
+ }
+ if (!accessVrfWr) {
+ rfAccessStalls[SCH_VRF_WR_ACCESS_NRDY]++;
+ }
+
+ // Increment stall counts for WF
+ wf->schStalls++;
+ wf->schRfAccessStalls++;
+ }
+ return false;
+}
+
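+// For each wave on the dispatchList, attempt to allocate write resources
+// in the RFs for its destination operands. A wave that fails is removed
+// from the dispatchList and re-inserted into the schList for retry.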
+void
+ScheduleStage::scheduleRfDestOperands()
+{
+ for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
+ if (toExecute.dispatchStatus(j) == EMPTY ||
+ toExecute.dispatchStatus(j) == SKIP) {
+ continue;
+ }
+
+ // get the wave on dispatch list and attempt to allocate write
+ // resources in the RFs
+ const GPUDynInstPtr &gpu_dyn_inst = toExecute.readyInst(j);
+ assert(gpu_dyn_inst);
+ Wavefront *wf = gpu_dyn_inst->wavefront();
+ if (!schedRfWrites(j, gpu_dyn_inst)) {
+ reinsertToSchList(j, gpu_dyn_inst);
+ doDispatchListTransition(j, EMPTY);
+ // if this is a flat inst, also transition the LM pipe to empty
+ // Note: since FLAT/LM arbitration occurs before scheduling
+ // destination operands to the RFs, it is possible that a LM
+ // instruction lost arbitration, but would have been able to
+ // pass the RF destination operand check here, and execute
+ // instead of the FLAT.
+ if (wf->instructionBuffer.front()->isFlat()) {
+ assert(toExecute.dispatchStatus(wf->localMem)
+ == SKIP);
+ doDispatchListTransition(wf->localMem, EMPTY);
+ }
+ }
+ }
+}
+
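+// Attempt to add a wave's next instruction to the schList for the given
+// execution resource. The add succeeds only if the VRF and SRF can
+// schedule the instruction's operand reads, in which case the instruction
+// enters SCH in the RFBUSY state while those reads complete.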
+bool
+ScheduleStage::addToSchList(int exeType, const GPUDynInstPtr &gpu_dyn_inst)
+{
+ // Attempt to add the wave to the schList if the VRF can support the
+ // wave's next instruction
+ assert(gpu_dyn_inst);
+ Wavefront *wf = gpu_dyn_inst->wavefront();
+ bool accessVrf = true;
+ if (!gpu_dyn_inst->isScalar()) {
+ accessVrf = computeUnit.vrf[wf->simdId]
+ ->canScheduleReadOperands(wf, gpu_dyn_inst);
+ }
+ bool accessSrf = computeUnit.srf[wf->simdId]
+ ->canScheduleReadOperands(wf, gpu_dyn_inst);
+ // If RFs can support instruction, add to schList in RFBUSY state,
+ // place wave in wavesInSch and pipeMap, and schedule Rd/Wr operands
+ // to the VRF
+ bool accessRf = accessVrf && accessSrf;
+ if (accessRf) {
+ DPRINTF(GPUSched, "schList[%d]: Adding: SIMD[%d] WV[%d]: %d: %s\n",
+ exeType, wf->simdId, wf->wfDynId,
+ gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
+
+ computeUnit.insertInPipeMap(wf);
+ wavesInSch.emplace(wf->wfDynId);
+ schList.at(exeType).push_back(std::make_pair(gpu_dyn_inst, RFBUSY));
+ if (wf->isOldestInstWaitcnt()) {
+ wf->setStatus(Wavefront::S_WAITCNT);
+ }
+ if (!gpu_dyn_inst->isScalar()) {
+ computeUnit.vrf[wf->simdId]
+ ->scheduleReadOperands(wf, gpu_dyn_inst);
+ }
+ computeUnit.srf[wf->simdId]->scheduleReadOperands(wf, gpu_dyn_inst);
+
+ DPRINTF(GPUSched, "schList[%d]: Added: SIMD[%d] WV[%d]: %d: %s\n",
+ exeType, wf->simdId, wf->wfDynId,
+ gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
+ return true;
+ } else {
+ // Number of stall cycles due to RF access denied
+ rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
+ // Count number of denials due to each reason
+ // Multiple items may contribute to the denied request
+ if (!accessVrf) {
+ rfAccessStalls[SCH_VRF_RD_ACCESS_NRDY]++;
+ }
+ if (!accessSrf) {
+ rfAccessStalls[SCH_SRF_RD_ACCESS_NRDY]++;
+ }
+
+ // Increment stall counts for WF
+ wf->schStalls++;
+ wf->schRfAccessStalls++;
+ DPRINTF(GPUSched, "schList[%d]: Could not add: "
+ "SIMD[%d] WV[%d]: %d: %s\n",
+ exeType, wf->simdId, wf->wfDynId,
+ gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
+ }
+ return false;
+}
+
+void
+ScheduleStage::reinsertToSchList(int exeType,
+ const GPUDynInstPtr &gpu_dyn_inst)
+{
+ // Insert wave w into schList for specified exeType.
+ // Wave is inserted in age order, with oldest wave being at the
+ // front of the schList
+ assert(gpu_dyn_inst);
+ auto schIter = schList.at(exeType).begin();
+ while (schIter != schList.at(exeType).end()
+ && schIter->first->wfDynId < gpu_dyn_inst->wfDynId) {
+ schIter++;
+ }
+ schList.at(exeType).insert(schIter, std::make_pair(gpu_dyn_inst, RFREADY));
}
void
-ScheduleStage::init(ComputeUnit *cu)
+ScheduleStage::checkMemResources()
{
- computeUnit = cu;
- _name = computeUnit->name() + ".ScheduleStage";
+ // Check for resource availability in the next cycle
+ scalarMemBusRdy = false;
+ scalarMemIssueRdy = false;
+    // check if there is an SRF->Scalar Memory bus available
+ if (computeUnit.srfToScalarMemPipeBus.rdy(Cycles(1))) {
+ scalarMemBusRdy = true;
+ }
+ // check if we can issue a scalar memory instruction
+ if (computeUnit.scalarMemUnit.rdy(Cycles(1))) {
+ scalarMemIssueRdy = true;
+ }
- for (int j = 0; j < numSIMDs + numMemUnits; ++j) {
- scheduler[j].bindList(&computeUnit->readyList[j]);
+ glbMemBusRdy = false;
+ glbMemIssueRdy = false;
+ // check if there is a VRF->Global Memory bus available
+ if (computeUnit.vrfToGlobalMemPipeBus.rdy(Cycles(1))) {
+ glbMemBusRdy = true;
+ }
+ // check if we can issue a Global memory instruction
+ if (computeUnit.vectorGlobalMemUnit.rdy(Cycles(1))) {
+ glbMemIssueRdy = true;
}
- for (int j = 0; j < numSIMDs; ++j) {
- waveStatusList.push_back(&computeUnit->waveStatusList[j]);
+ locMemBusRdy = false;
+ locMemIssueRdy = false;
+ // check if there is a VRF->LDS bus available
+ if (computeUnit.vrfToLocalMemPipeBus.rdy(Cycles(1))) {
+ locMemBusRdy = true;
+ }
+ // check if we can issue a LDS instruction
+ if (computeUnit.vectorSharedMemUnit.rdy(Cycles(1))) {
+ locMemIssueRdy = true;
}
+}
- dispatchList = &computeUnit->dispatchList;
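+// Check whether the given instruction can be dispatched to its execution
+// resource next cycle, based on the readiness flags computed by
+// checkMemResources(). Each failing condition increments the matching
+// dispNrdyStalls counter.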
+bool
+ScheduleStage::dispatchReady(const GPUDynInstPtr &gpu_dyn_inst)
+{
+ assert(gpu_dyn_inst);
+ Wavefront *wf = gpu_dyn_inst->wavefront();
+ vectorAluRdy = false;
+ scalarAluRdy = false;
+ // check for available vector/scalar ALUs in the next cycle
+ if (computeUnit.vectorALUs[wf->simdId].rdy(Cycles(1))) {
+ vectorAluRdy = true;
+ }
+ if (computeUnit.scalarALUs[wf->scalarAlu].rdy(Cycles(1))) {
+ scalarAluRdy = true;
+ }
+
+ if (gpu_dyn_inst->isNop()) {
+ // S_NOP requires SALU. V_NOP requires VALU.
+ // TODO: Scalar NOP does not require SALU in hardware,
+ // and is executed out of IB directly.
+ if (gpu_dyn_inst->isScalar() && !scalarAluRdy) {
+ dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
+ return false;
+ } else if (!gpu_dyn_inst->isScalar() && !vectorAluRdy) {
+ dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
+ return false;
+ }
+ } else if (gpu_dyn_inst->isEndOfKernel()) {
+ // EndPgm instruction
+ if (gpu_dyn_inst->isScalar() && !scalarAluRdy) {
+ dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
+ return false;
+ }
+ } else if (gpu_dyn_inst->isBarrier() || gpu_dyn_inst->isBranch()
+ || gpu_dyn_inst->isALU()) {
+ // Barrier, Branch, or ALU instruction
+ if (gpu_dyn_inst->isScalar() && !scalarAluRdy) {
+ dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
+ return false;
+ } else if (!gpu_dyn_inst->isScalar() && !vectorAluRdy) {
+ dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
+ return false;
+ }
+ } else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isGlobalMem()) {
+ // Vector Global Memory instruction
+ bool rdy = true;
+ if (!glbMemIssueRdy) {
+ rdy = false;
+ dispNrdyStalls[SCH_VECTOR_MEM_ISSUE_NRDY]++;
+ }
+ if (!glbMemBusRdy) {
+ rdy = false;
+ dispNrdyStalls[SCH_VECTOR_MEM_BUS_BUSY_NRDY]++;
+ }
+ if (!computeUnit.globalMemoryPipe.coalescerReady(gpu_dyn_inst)) {
+ rdy = false;
+ dispNrdyStalls[SCH_VECTOR_MEM_COALESCER_NRDY]++;
+ }
+ if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(gpu_dyn_inst)) {
+ rdy = false;
+ dispNrdyStalls[SCH_VECTOR_MEM_REQS_NRDY]++;
+ }
+ if (!rdy) {
+ return false;
+ }
+ } else if (gpu_dyn_inst->isScalar() && gpu_dyn_inst->isGlobalMem()) {
+ // Scalar Global Memory instruction
+ bool rdy = true;
+ if (!scalarMemIssueRdy) {
+ rdy = false;
+ dispNrdyStalls[SCH_SCALAR_MEM_ISSUE_NRDY]++;
+ }
+ if (!scalarMemBusRdy) {
+ rdy = false;
+ dispNrdyStalls[SCH_SCALAR_MEM_BUS_BUSY_NRDY]++;
+ }
+ if (!computeUnit.scalarMemoryPipe
+ .isGMReqFIFOWrRdy(wf->scalarRdGmReqsInPipe
+ + wf->scalarWrGmReqsInPipe))
+ {
+ rdy = false;
+ dispNrdyStalls[SCH_SCALAR_MEM_FIFO_NRDY]++;
+ }
+ if (!rdy) {
+ return false;
+ }
+ } else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isLocalMem()) {
+ // Vector Local Memory instruction
+ bool rdy = true;
+ if (!locMemIssueRdy) {
+ rdy = false;
+ dispNrdyStalls[SCH_LOCAL_MEM_ISSUE_NRDY]++;
+ }
+ if (!locMemBusRdy) {
+ rdy = false;
+ dispNrdyStalls[SCH_LOCAL_MEM_BUS_BUSY_NRDY]++;
+ }
+ if (!computeUnit.localMemoryPipe.
+ isLMReqFIFOWrRdy(wf->rdLmReqsInPipe + wf->wrLmReqsInPipe)) {
+ rdy = false;
+ dispNrdyStalls[SCH_LOCAL_MEM_FIFO_NRDY]++;
+ }
+ if (!rdy) {
+ return false;
+ }
+ } else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isFlat()) {
+ // Vector Flat memory instruction
+ bool rdy = true;
+ if (!glbMemIssueRdy || !locMemIssueRdy) {
+ rdy = false;
+ dispNrdyStalls[SCH_FLAT_MEM_ISSUE_NRDY]++;
+ }
+ if (!glbMemBusRdy || !locMemBusRdy) {
+ rdy = false;
+ dispNrdyStalls[SCH_FLAT_MEM_BUS_BUSY_NRDY]++;
+ }
+ if (!computeUnit.globalMemoryPipe.coalescerReady(gpu_dyn_inst)) {
+ rdy = false;
+ dispNrdyStalls[SCH_FLAT_MEM_COALESCER_NRDY]++;
+ }
+ if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(gpu_dyn_inst)) {
+ rdy = false;
+ dispNrdyStalls[SCH_FLAT_MEM_REQS_NRDY]++;
+ }
+ if (!computeUnit.localMemoryPipe.
+ isLMReqFIFOWrRdy(wf->rdLmReqsInPipe + wf->wrLmReqsInPipe)) {
+ rdy = false;
+ dispNrdyStalls[SCH_FLAT_MEM_FIFO_NRDY]++;
+ }
+ if (!rdy) {
+ return false;
+ }
+ } else {
+ panic("%s: unknown instr checked for readiness",
+ gpu_dyn_inst->disassemble());
+ return false;
+ }
+ dispNrdyStalls[SCH_RDY]++;
+ return true;
}
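+// Select the oldest RFREADY wave in each schList queue that passes
+// dispatchReady(), remove it from the schList, and place it on the
+// dispatchList in the EXREADY state for execution next cycle.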
void
-ScheduleStage::arbitrate()
-{
- // iterate over all Memory pipelines
- for (int j = numSIMDs; j < numSIMDs + numMemUnits; ++j) {
- if (dispatchList->at(j).first) {
- Wavefront *waveToMemPipe = dispatchList->at(j).first;
- // iterate over all execution pipelines
- for (int i = 0; i < numSIMDs + numMemUnits; ++i) {
- if ((i != j) && (dispatchList->at(i).first)) {
- Wavefront *waveToExePipe = dispatchList->at(i).first;
- // if the two selected wavefronts are mapped to the same
- // SIMD unit then they share the VRF
- if (waveToMemPipe->simdId == waveToExePipe->simdId) {
- int simdId = waveToMemPipe->simdId;
- // Read VRF port arbitration:
- // If there are read VRF port conflicts between the
- // a memory and another instruction we drop the other
- // instruction. We don't need to check for write VRF
- // port conflicts because the memory instruction either
- // does not need to write to the VRF (store) or will
- // write to the VRF when the data comes back (load) in
- // which case the arbiter of the memory pipes will
- // resolve any conflicts
- if (computeUnit->vrf[simdId]->
- isReadConflict(waveToMemPipe->wfSlotId,
- waveToExePipe->wfSlotId)) {
- // FIXME: The "second" member variable is never
- // used in the model. I am setting it to READY
- // simply to follow the protocol of setting it
- // when the WF has an instruction ready to issue
- waveStatusList[simdId]->at(waveToExePipe->wfSlotId)
- .second = READY;
-
- dispatchList->at(i).first = nullptr;
- dispatchList->at(i).second = EMPTY;
- break;
- }
+ScheduleStage::fillDispatchList()
+{
+ // update execution resource status
+ checkMemResources();
+ // iterate execution resources
+ for (int j = 0; j < computeUnit.numExeUnits(); j++) {
+ assert(toExecute.dispatchStatus(j) == EMPTY);
+
+ // iterate waves in schList to pick one for dispatch
+ auto schIter = schList.at(j).begin();
+ bool dispatched = false;
+ while (schIter != schList.at(j).end()) {
+ // only attempt to dispatch if status is RFREADY
+ if (schIter->second == RFREADY) {
+ // Check if this wave is ready for dispatch
+ bool dispRdy = dispatchReady(schIter->first);
+ if (!dispatched && dispRdy) {
+ // No other wave has been dispatched for this exe
+ // resource, and this wave is ready. Place this wave
+ // on dispatchList and make it ready for execution
+ // next cycle.
+
+ // Acquire a coalescer token if it is a global mem
+ // operation.
+ GPUDynInstPtr mp = schIter->first;
+ if (!mp->isMemSync() && !mp->isScalar() &&
+ (mp->isGlobalMem() || mp->isFlat())) {
+ computeUnit.globalMemoryPipe.acqCoalescerToken(mp);
}
+
+ doDispatchListTransition(j, EXREADY, schIter->first);
+ DPRINTF(GPUSched, "dispatchList[%d]: fillDispatchList: "
+ "EMPTY->EXREADY\n", j);
+ schIter->first = nullptr;
+ schIter = schList.at(j).erase(schIter);
+ dispatched = true;
+ } else {
+ // Either another wave has been dispatched, or this wave
+ // was not ready, so it is stalled this cycle
+ schIter->first->wavefront()->schStalls++;
+ if (!dispRdy) {
+ // not ready for dispatch, increment stall stat
+ schIter->first->wavefront()->schResourceStalls++;
+ }
+ // Examine next wave for this resource
+ schIter++;
}
+ } else {
+ // Wave not in RFREADY, try next wave
+ schIter++;
}
}
+
+ // Increment stall count if no wave sent to dispatchList for
+ // current execution resource
+ if (!dispatched) {
+ schListToDispListStalls[j]++;
+ } else {
+ schListToDispList[j]++;
+ }
}
}
void
-ScheduleStage::exec()
+ScheduleStage::arbitrateVrfToLdsBus()
{
- for (int j = 0; j < numSIMDs + numMemUnits; ++j) {
- uint32_t readyListSize = computeUnit->readyList[j].size();
+    // Arbitrate the VRF->GM and VRF->LDS buses for Flat memory ops
+    // Note: a Flat instruction in GFX8 reserves both the VRF->Global Memory
+    // bus and a VRF->LDS bus. In GFX9, this is not the case.
- // If no wave is ready to be scheduled on the execution resource
- // then skip scheduling for this execution resource
- if (!readyListSize) {
- continue;
- }
+ // iterate the GM pipelines
+ for (int i = 0; i < computeUnit.numVectorGlobalMemUnits; i++) {
+ // get the GM pipe index in the dispatchList
+ int gm_exe_unit = computeUnit.firstMemUnit() + i;
+ // get the wave in the dispatchList
+ GPUDynInstPtr &gpu_dyn_inst
+ = toExecute.readyInst(gm_exe_unit);
+ // If the WF is valid, ready to execute, and the instruction
+ // is a flat access, arbitrate with the WF's assigned LM pipe
+ if (gpu_dyn_inst && toExecute.dispatchStatus(gm_exe_unit)
+ == EXREADY && gpu_dyn_inst->isFlat()) {
+ Wavefront *wf = gpu_dyn_inst->wavefront();
+ // If the associated LM pipe also has a wave selected, block
+ // that wave and let the Flat instruction issue. The WF in the
+ // LM pipe is added back to the schList for consideration next
+ // cycle.
+ if (toExecute.dispatchStatus(wf->localMem) == EXREADY) {
+ reinsertToSchList(wf->localMem, toExecute
+ .readyInst(wf->localMem));
+ // Increment stall stats for LDS-VRF arbitration
+ ldsBusArbStalls++;
+ toExecute.readyInst(wf->localMem)
+ ->wavefront()->schLdsArbStalls++;
+ }
+ // With arbitration of LM pipe complete, transition the
+ // LM pipe to SKIP state in the dispatchList to inform EX stage
+ // that a Flat instruction is executing next cycle
+ doDispatchListTransition(wf->localMem, SKIP, gpu_dyn_inst);
+ DPRINTF(GPUSched, "dispatchList[%d]: arbVrfLds: "
+ "EXREADY->SKIP\n", wf->localMem);
+ }
+ }
+}
- Wavefront *waveToBeDispatched = scheduler[j].chooseWave();
- dispatchList->at(j).first = waveToBeDispatched;
- waveToBeDispatched->updateResources();
- dispatchList->at(j).second = FILLED;
+void
+ScheduleStage::checkRfOperandReadComplete()
+{
+ // Iterate the schList queues and check if operand reads
+ // have completed in the RFs. If so, mark the wave as ready for
+ // selection for dispatchList
+ for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
+ for (auto &p : schList.at(j)) {
+ const GPUDynInstPtr &gpu_dyn_inst = p.first;
+ assert(gpu_dyn_inst);
+ Wavefront *wf = gpu_dyn_inst->wavefront();
- waveStatusList[waveToBeDispatched->simdId]->at(
- waveToBeDispatched->wfSlotId).second = BLOCKED;
+ // Increment the number of cycles the wave spends in the
+ // SCH stage, since this loop visits every wave in SCH.
+ wf->schCycles++;
- assert(computeUnit->readyList[j].size() == readyListSize - 1);
+ bool vrfRdy = true;
+ if (!gpu_dyn_inst->isScalar()) {
+ vrfRdy = computeUnit.vrf[wf->simdId]
+ ->operandReadComplete(wf, gpu_dyn_inst);
+ }
+ bool srfRdy = computeUnit.srf[wf->simdId]
+ ->operandReadComplete(wf, gpu_dyn_inst);
+ bool operandsReady = vrfRdy && srfRdy;
+ if (operandsReady) {
+ DPRINTF(GPUSched, "schList[%d]: WV[%d] operands ready for: "
+ "%d: %s\n", j, wf->wfDynId, gpu_dyn_inst->seqNum(),
+ gpu_dyn_inst->disassemble());
+ DPRINTF(GPUSched, "schList[%d]: WV[%d] RFBUSY->RFREADY\n",
+ j, wf->wfDynId);
+ p.second = RFREADY;
+ } else {
+ DPRINTF(GPUSched, "schList[%d]: WV[%d] operands not ready "
+ "for: %d: %s\n", j, wf->wfDynId,
+ gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
+
+                // Operands not ready yet; increment SCH stage stall
+                // stats, aggregated across all wavefronts on the CU
+ p.second = RFBUSY;
+
+ // Increment stall stats
+ wf->schStalls++;
+ wf->schOpdNrdyStalls++;
+
+ opdNrdyStalls[SCH_RF_OPD_NRDY]++;
+ if (!vrfRdy) {
+ opdNrdyStalls[SCH_VRF_OPD_NRDY]++;
+ }
+ if (!srfRdy) {
+ opdNrdyStalls[SCH_SRF_OPD_NRDY]++;
+ }
+ }
+ }
}
- // arbitrate over all shared resources among instructions being issued
- // simultaneously
- arbitrate();
+}
+
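+// For each wave on the dispatchList that is ready to execute, reserve its
+// execution resources and dispatch its instruction to the register files.
+// Panics if two waves attempt to reserve the same execution unit in the
+// same cycle.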
+void
+ScheduleStage::reserveResources()
+{
+ std::vector<bool> exeUnitReservations;
+ exeUnitReservations.resize(computeUnit.numExeUnits(), false);
+
+ for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
+ GPUDynInstPtr &gpu_dyn_inst = toExecute.readyInst(j);
+ if (gpu_dyn_inst) {
+ DISPATCH_STATUS s = toExecute.dispatchStatus(j);
+ Wavefront *wf = gpu_dyn_inst->wavefront();
+ if (s == EMPTY) {
+ continue;
+ } else if (s == EXREADY) {
+ // Wave is ready for execution
+ std::vector<int> execUnitIds = wf->reserveResources();
+
+ if (!gpu_dyn_inst->isScalar()) {
+ computeUnit.vrf[wf->simdId]
+ ->dispatchInstruction(gpu_dyn_inst);
+ }
+ computeUnit.srf[wf->simdId]->dispatchInstruction(gpu_dyn_inst);
+
+ std::stringstream ss;
+ for (auto id : execUnitIds) {
+ ss << id << " ";
+ }
+ DPRINTF(GPUSched, "dispatchList[%d]: SIMD[%d] WV[%d]: %d: %s"
+ " Reserving ExeRes[ %s]\n",
+ j, wf->simdId, wf->wfDynId, gpu_dyn_inst->seqNum(),
+ gpu_dyn_inst->disassemble(), ss.str());
+ // mark the resources as reserved for this cycle
+ for (auto execUnitId : execUnitIds) {
+ panic_if(exeUnitReservations.at(execUnitId),
+ "Execution unit %d is reserved!!!\n"
+ "SIMD[%d] WV[%d]: %d: %s",
+ execUnitId, wf->simdId, wf->wfDynId,
+ gpu_dyn_inst->seqNum(),
+ gpu_dyn_inst->disassemble());
+ exeUnitReservations.at(execUnitId) = true;
+ }
+
+ // If wavefront::reserveResources reserved multiple resources,
+ // then we're executing a flat memory instruction. This means
+ // that we've reserved a global and local memory unit. Thus,
+ // we need to mark the latter execution unit as not available.
+ if (execUnitIds.size() > 1) {
+ int lm_exec_unit M5_VAR_USED = wf->localMem;
+ assert(toExecute.dispatchStatus(lm_exec_unit)
+ == SKIP);
+ }
+ } else if (s == SKIP) {
+ // Shared Memory pipe reserved for FLAT instruction.
+ // Verify the GM pipe for this wave is ready to execute
+ // and the wave in the GM pipe is the same as the wave
+ // in the LM pipe
+ int gm_exec_unit M5_VAR_USED = wf->globalMem;
+ assert(wf->wfDynId == toExecute
+ .readyInst(gm_exec_unit)->wfDynId);
+ assert(toExecute.dispatchStatus(gm_exec_unit)
+ == EXREADY);
+ }
+ }
+ }
+}
+
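+// Remove a wavefront from the wavesInSch set, allowing its next
+// instruction to be considered for scheduling into the SCH stage.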
+void
+ScheduleStage::deleteFromSch(Wavefront *w)
+{
+ wavesInSch.erase(w->wfDynId);
}
void
ScheduleStage::regStats()
{
+ rdyListNotEmpty
+ .init(computeUnit.numExeUnits())
+ .name(name() + ".rdy_list_not_empty")
+        .desc("number of cycles with one or more waves on the ready list, "
+              "per execution resource")
+ ;
+
+ rdyListEmpty
+ .init(computeUnit.numExeUnits())
+ .name(name() + ".rdy_list_empty")
+        .desc("number of cycles with no waves on the ready list, per "
+              "execution resource")
+ ;
+
+ addToSchListStalls
+ .init(computeUnit.numExeUnits())
+ .name(name() + ".sch_list_add_stalls")
+        .desc("number of cycles a wave is not added to the schList, per "
+              "execution resource, when the ready list is not empty")
+ ;
+
+ schListToDispList
+ .init(computeUnit.numExeUnits())
+ .name(name() + ".sch_list_to_disp_list")
+        .desc("number of cycles a wave is added to the dispatchList, per "
+              "execution resource")
+ ;
+
+ schListToDispListStalls
+ .init(computeUnit.numExeUnits())
+ .name(name() + ".sch_list_to_disp_list_stalls")
+        .desc("number of cycles no wave is added to the dispatchList, "
+              "per execution resource")
+ ;
+
+ // Operand Readiness Stall Cycles
+ opdNrdyStalls
+ .init(SCH_RF_OPD_NRDY_CONDITIONS)
+ .name(name() + ".opd_nrdy_stalls")
+ .desc("number of stalls in SCH due to operands not ready")
+ ;
+ opdNrdyStalls.subname(SCH_VRF_OPD_NRDY, csprintf("VRF"));
+ opdNrdyStalls.subname(SCH_SRF_OPD_NRDY, csprintf("SRF"));
+ opdNrdyStalls.subname(SCH_RF_OPD_NRDY, csprintf("RF"));
+
+ // dispatchReady Stall Cycles
+ dispNrdyStalls
+ .init(SCH_NRDY_CONDITIONS)
+ .name(name() + ".disp_nrdy_stalls")
+ .desc("number of stalls in SCH due to resource not ready")
+ ;
+ dispNrdyStalls.subname(SCH_SCALAR_ALU_NRDY, csprintf("ScalarAlu"));
+ dispNrdyStalls.subname(SCH_VECTOR_ALU_NRDY, csprintf("VectorAlu"));
+ dispNrdyStalls.subname(SCH_VECTOR_MEM_ISSUE_NRDY,
+ csprintf("VectorMemIssue"));
+ dispNrdyStalls.subname(SCH_VECTOR_MEM_BUS_BUSY_NRDY,
+ csprintf("VectorMemBusBusy"));
+ dispNrdyStalls.subname(SCH_VECTOR_MEM_COALESCER_NRDY,
+ csprintf("VectorMemCoalescer"));
+ dispNrdyStalls.subname(SCH_CEDE_SIMD_NRDY, csprintf("CedeSimd"));
+ dispNrdyStalls.subname(SCH_SCALAR_MEM_ISSUE_NRDY,
+ csprintf("ScalarMemIssue"));
+ dispNrdyStalls.subname(SCH_SCALAR_MEM_BUS_BUSY_NRDY,
+ csprintf("ScalarMemBusBusy"));
+ dispNrdyStalls.subname(SCH_SCALAR_MEM_FIFO_NRDY,
+ csprintf("ScalarMemFIFO"));
+ dispNrdyStalls.subname(SCH_LOCAL_MEM_ISSUE_NRDY,
+ csprintf("LocalMemIssue"));
+ dispNrdyStalls.subname(SCH_LOCAL_MEM_BUS_BUSY_NRDY,
+ csprintf("LocalMemBusBusy"));
+ dispNrdyStalls.subname(SCH_LOCAL_MEM_FIFO_NRDY,
+ csprintf("LocalMemFIFO"));
+ dispNrdyStalls.subname(SCH_FLAT_MEM_ISSUE_NRDY,
+ csprintf("FlatMemIssue"));
+ dispNrdyStalls.subname(SCH_FLAT_MEM_BUS_BUSY_NRDY,
+ csprintf("FlatMemBusBusy"));
+ dispNrdyStalls.subname(SCH_FLAT_MEM_COALESCER_NRDY,
+ csprintf("FlatMemCoalescer"));
+ dispNrdyStalls.subname(SCH_FLAT_MEM_FIFO_NRDY,
+ csprintf("FlatMemFIFO"));
+ dispNrdyStalls.subname(SCH_RDY, csprintf("Ready"));
+
+ // RF Access Stall Cycles
+ rfAccessStalls
+ .init(SCH_RF_ACCESS_NRDY_CONDITIONS)
+ .name(name() + ".rf_access_stalls")
+ .desc("number of stalls due to RF access denied")
+ ;
+ rfAccessStalls.subname(SCH_VRF_RD_ACCESS_NRDY, csprintf("VrfRd"));
+ rfAccessStalls.subname(SCH_VRF_WR_ACCESS_NRDY, csprintf("VrfWr"));
+ rfAccessStalls.subname(SCH_SRF_RD_ACCESS_NRDY, csprintf("SrfRd"));
+ rfAccessStalls.subname(SCH_SRF_WR_ACCESS_NRDY, csprintf("SrfWr"));
+ rfAccessStalls.subname(SCH_RF_ACCESS_NRDY, csprintf("Any"));
+
+ // Stall cycles due to wave losing LDS bus arbitration
+ ldsBusArbStalls
+ .name(name() + ".lds_bus_arb_stalls")
+ .desc("number of stalls due to VRF->LDS bus conflicts")
+ ;
}