/*
 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
#include "gpu-compute/schedule_stage.hh"

#include <cassert>
#include <sstream>
#include <unordered_set>
#include <utility>
#include <vector>

#include "debug/GPUSched.hh"
#include "debug/GPUVRF.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"
46 ScheduleStage::ScheduleStage(const ComputeUnitParams
*p
, ComputeUnit
&cu
)
47 : computeUnit(cu
), _name(cu
.name() + ".ScheduleStage"),
48 vectorAluRdy(false), scalarAluRdy(false), scalarMemBusRdy(false),
49 scalarMemIssueRdy(false), glbMemBusRdy(false), glbMemIssueRdy(false),
50 locMemBusRdy(false), locMemIssueRdy(false)
52 for (int j
= 0; j
< cu
.numExeUnits(); ++j
) {
53 scheduler
.emplace_back(p
);
56 schList
.resize(cu
.numExeUnits());
57 for (auto &dq
: schList
) {
62 ScheduleStage::~ScheduleStage()
73 fatal_if(scheduler
.size() != computeUnit
.readyList
.size(),
74 "Scheduler should have same number of entries as CU's readyList");
75 for (int j
= 0; j
< computeUnit
.numExeUnits(); ++j
) {
76 scheduler
[j
].bindList(&computeUnit
.readyList
[j
]);
79 dispatchList
= &computeUnit
.dispatchList
;
81 assert(computeUnit
.numVectorGlobalMemUnits
== 1);
82 assert(computeUnit
.numVectorSharedMemUnits
== 1);
89 for (int j
= 0; j
< computeUnit
.numExeUnits(); ++j
) {
90 // delete all ready wavefronts whose instruction buffers are now
91 // empty because the last instruction was executed
92 computeUnit
.updateReadyList(j
);
94 * Remove any wave that already has an instruction present in SCH
95 * waiting for RF reads to complete. This prevents out of order
96 * execution within a wave.
98 for (auto wIt
= computeUnit
.readyList
.at(j
).begin();
99 wIt
!= computeUnit
.readyList
.at(j
).end();) {
100 if (wavesInSch
.find((*wIt
)->wfDynId
) != wavesInSch
.end()) {
102 wIt
= computeUnit
.readyList
.at(j
).erase(wIt
);
109 // Attempt to add another wave for each EXE type to schList queues
110 // VMEM resources are iterated first, effectively giving priority
111 // to VMEM over VALU for scheduling read of operands to the RFs.
112 // Scalar Memory are iterated after VMEM
114 // Iterate VMEM and SMEM
115 int firstMemUnit
= computeUnit
.firstMemUnit();
116 int lastMemUnit
= computeUnit
.lastMemUnit();
117 for (int j
= firstMemUnit
; j
<= lastMemUnit
; j
++) {
118 int readyListSize
= computeUnit
.readyList
[j
].size();
119 // If no wave is ready to be scheduled on the execution resource
120 // then skip scheduling for this execution resource
121 if (!readyListSize
) {
125 rdyListNotEmpty
[j
]++;
127 // Pick a wave and attempt to add it to schList
128 Wavefront
*w
= scheduler
[j
].chooseWave();
129 if (!addToSchList(j
, w
)) {
130 // For waves not added to schList, increment count of cycles
131 // this wave spends in SCH stage.
133 addToSchListStalls
[j
]++;
137 // Iterate everything else
138 for (int j
= 0; j
< computeUnit
.numExeUnits(); ++j
) {
139 // skip the VMEM resources
140 if (j
>= firstMemUnit
&& j
<= lastMemUnit
) {
143 int readyListSize
= computeUnit
.readyList
[j
].size();
144 // If no wave is ready to be scheduled on the execution resource
145 // then skip scheduling for this execution resource
146 if (!readyListSize
) {
150 rdyListNotEmpty
[j
]++;
152 // Pick a wave and attempt to add it to schList
153 Wavefront
*w
= scheduler
[j
].chooseWave();
154 if (!addToSchList(j
, w
)) {
155 // For waves not added to schList, increment count of cycles
156 // this wave spends in SCH stage.
158 addToSchListStalls
[j
]++;
162 // At this point, the schList queue per EXE type may contain
163 // multiple waves, in order of age (oldest to youngest).
164 // Wave may be in RFBUSY, indicating they are waiting for registers
165 // to be read, or in RFREADY, indicating they are candidates for
166 // the dispatchList and execution
168 // Iterate schList queues and check if any of the waves have finished
169 // reading their operands, moving those waves to RFREADY status
170 checkRfOperandReadComplete();
172 // Fill the dispatch list with the oldest wave of each EXE type that
173 // is ready to execute
174 // Wave is picked if status in schList is RFREADY and it passes resource
175 // ready checks similar to those currently in SCB
178 // Resource arbitration on waves in dispatchList
179 // Losing waves are re-inserted to the schList at a location determined
182 // Arbitrate access to the VRF->LDS bus
183 arbitrateVrfToLdsBus();
185 // Schedule write operations to the register files
186 scheduleRfDestOperands();
188 // Lastly, reserve resources for waves that are ready to execute.
193 ScheduleStage::doDispatchListTransition(int unitId
, DISPATCH_STATUS s
,
196 dispatchList
->at(unitId
).first
= w
;
197 dispatchList
->at(unitId
).second
= s
;
201 ScheduleStage::schedRfWrites(int exeType
, Wavefront
*w
)
203 GPUDynInstPtr ii
= w
->instructionBuffer
.front();
205 bool accessVrfWr
= true;
206 if (!ii
->isScalar()) {
208 computeUnit
.vrf
[w
->simdId
]->canScheduleWriteOperands(w
, ii
);
211 computeUnit
.srf
[w
->simdId
]->canScheduleWriteOperands(w
, ii
);
212 bool accessRf
= accessVrfWr
&& accessSrfWr
;
214 if (!ii
->isScalar()) {
215 computeUnit
.vrf
[w
->simdId
]->scheduleWriteOperands(w
, ii
);
217 computeUnit
.srf
[w
->simdId
]->scheduleWriteOperands(w
, ii
);
220 rfAccessStalls
[SCH_RF_ACCESS_NRDY
]++;
222 rfAccessStalls
[SCH_SRF_WR_ACCESS_NRDY
]++;
225 rfAccessStalls
[SCH_VRF_WR_ACCESS_NRDY
]++;
228 // Increment stall counts for WF
230 w
->schRfAccessStalls
++;
236 ScheduleStage::scheduleRfDestOperands()
238 for (int j
= 0; j
< computeUnit
.numExeUnits(); ++j
) {
239 if (!dispatchList
->at(j
).first
) {
242 // get the wave on dispatch list and attempt to allocate write
243 // resources in the RFs
244 Wavefront
*w
= dispatchList
->at(j
).first
;
245 if (!schedRfWrites(j
, w
)) {
246 reinsertToSchList(j
, w
);
247 doDispatchListTransition(j
, EMPTY
);
248 // if this is a flat inst, also transition the LM pipe to empty
249 // Note: since FLAT/LM arbitration occurs before scheduling
250 // destination operands to the RFs, it is possible that a LM
251 // instruction lost arbitration, but would have been able to
252 // pass the RF destination operand check here, and execute
253 // instead of the FLAT.
254 if (w
->instructionBuffer
.front()->isFlat()) {
255 assert(dispatchList
->at(w
->localMem
).second
== SKIP
);
256 doDispatchListTransition(w
->localMem
, EMPTY
);
263 ScheduleStage::addToSchList(int exeType
, Wavefront
*w
)
265 // Attempt to add the wave to the schList if the VRF can support the
266 // wave's next instruction
267 GPUDynInstPtr ii
= w
->instructionBuffer
.front();
269 bool accessVrf
= true;
270 if (!ii
->isScalar()) {
272 computeUnit
.vrf
[w
->simdId
]->canScheduleReadOperands(w
, ii
);
275 computeUnit
.srf
[w
->simdId
]->canScheduleReadOperands(w
, ii
);
276 // If RFs can support instruction, add to schList in RFBUSY state,
277 // place wave in wavesInSch and pipeMap, and schedule Rd/Wr operands
279 bool accessRf
= accessVrf
&& accessSrf
;
281 DPRINTF(GPUSched
, "schList[%d]: Adding: SIMD[%d] WV[%d]: %d: %s\n",
282 exeType
, w
->simdId
, w
->wfDynId
,
283 ii
->seqNum(), ii
->disassemble());
285 computeUnit
.insertInPipeMap(w
);
286 wavesInSch
.emplace(w
->wfDynId
);
287 schList
.at(exeType
).push_back(std::make_pair(w
, RFBUSY
));
288 if (w
->isOldestInstWaitcnt()) {
289 w
->setStatus(Wavefront::S_WAITCNT
);
291 if (!ii
->isScalar()) {
292 computeUnit
.vrf
[w
->simdId
]->scheduleReadOperands(w
, ii
);
294 computeUnit
.srf
[w
->simdId
]->scheduleReadOperands(w
, ii
);
296 DPRINTF(GPUSched
, "schList[%d]: Added: SIMD[%d] WV[%d]: %d: %s\n",
297 exeType
, w
->simdId
, w
->wfDynId
,
298 ii
->seqNum(), ii
->disassemble());
301 // Number of stall cycles due to RF access denied
302 rfAccessStalls
[SCH_RF_ACCESS_NRDY
]++;
303 // Count number of denials due to each reason
304 // Multiple items may contribute to the denied request
306 rfAccessStalls
[SCH_VRF_RD_ACCESS_NRDY
]++;
309 rfAccessStalls
[SCH_SRF_RD_ACCESS_NRDY
]++;
312 // Increment stall counts for WF
314 w
->schRfAccessStalls
++;
315 DPRINTF(GPUSched
, "schList[%d]: Could not add: "
316 "SIMD[%d] WV[%d]: %d: %s\n",
317 exeType
, w
->simdId
, w
->wfDynId
,
318 ii
->seqNum(), ii
->disassemble());
324 ScheduleStage::reinsertToSchList(int exeType
, Wavefront
*w
)
326 // Insert wave w into schList for specified exeType.
327 // Wave is inserted in age order, with oldest wave being at the
328 // front of the schList
329 auto schIter
= schList
.at(exeType
).begin();
330 while (schIter
!= schList
.at(exeType
).end()
331 && schIter
->first
->wfDynId
< w
->wfDynId
) {
334 schList
.at(exeType
).insert(schIter
, std::make_pair(w
, RFREADY
));
338 ScheduleStage::checkMemResources()
340 // Check for resource availability in the next cycle
341 scalarMemBusRdy
= false;
342 scalarMemIssueRdy
= false;
343 // check if there is a SRF->Global Memory bus available and
344 if (computeUnit
.srfToScalarMemPipeBus
.rdy(Cycles(1))) {
345 scalarMemBusRdy
= true;
347 // check if we can issue a scalar memory instruction
348 if (computeUnit
.scalarMemUnit
.rdy(Cycles(1))) {
349 scalarMemIssueRdy
= true;
352 glbMemBusRdy
= false;
353 glbMemIssueRdy
= false;
354 // check if there is a VRF->Global Memory bus available
355 if (computeUnit
.vrfToGlobalMemPipeBus
.rdy(Cycles(1))) {
358 // check if we can issue a Global memory instruction
359 if (computeUnit
.vectorGlobalMemUnit
.rdy(Cycles(1))) {
360 glbMemIssueRdy
= true;
363 locMemBusRdy
= false;
364 locMemIssueRdy
= false;
365 // check if there is a VRF->LDS bus available
366 if (computeUnit
.vrfToLocalMemPipeBus
.rdy(Cycles(1))) {
369 // check if we can issue a LDS instruction
370 if (computeUnit
.vectorSharedMemUnit
.rdy(Cycles(1))) {
371 locMemIssueRdy
= true;
376 ScheduleStage::dispatchReady(Wavefront
*w
)
378 vectorAluRdy
= false;
379 scalarAluRdy
= false;
380 // check for available vector/scalar ALUs in the next cycle
381 if (computeUnit
.vectorALUs
[w
->simdId
].rdy(Cycles(1))) {
384 if (computeUnit
.scalarALUs
[w
->scalarAlu
].rdy(Cycles(1))) {
387 GPUDynInstPtr ii
= w
->instructionBuffer
.front();
390 // S_NOP requires SALU. V_NOP requires VALU.
391 // TODO: Scalar NOP does not require SALU in hardware,
392 // and is executed out of IB directly.
393 if (ii
->isScalar() && !scalarAluRdy
) {
394 dispNrdyStalls
[SCH_SCALAR_ALU_NRDY
]++;
396 } else if (!ii
->isScalar() && !vectorAluRdy
) {
397 dispNrdyStalls
[SCH_VECTOR_ALU_NRDY
]++;
400 } else if (ii
->isEndOfKernel()) {
401 // EndPgm instruction
402 if (ii
->isScalar() && !scalarAluRdy
) {
403 dispNrdyStalls
[SCH_SCALAR_ALU_NRDY
]++;
406 } else if (ii
->isBarrier() || ii
->isBranch() || ii
->isALU()) {
407 // Barrier, Branch, or ALU instruction
408 if (ii
->isScalar() && !scalarAluRdy
) {
409 dispNrdyStalls
[SCH_SCALAR_ALU_NRDY
]++;
411 } else if (!ii
->isScalar() && !vectorAluRdy
) {
412 dispNrdyStalls
[SCH_VECTOR_ALU_NRDY
]++;
415 } else if (!ii
->isScalar() && ii
->isGlobalMem()) {
416 // Vector Global Memory instruction
418 if (!glbMemIssueRdy
) {
420 dispNrdyStalls
[SCH_VECTOR_MEM_ISSUE_NRDY
]++;
424 dispNrdyStalls
[SCH_VECTOR_MEM_BUS_BUSY_NRDY
]++;
426 if (!computeUnit
.globalMemoryPipe
.coalescerReady(ii
)) {
428 dispNrdyStalls
[SCH_VECTOR_MEM_COALESCER_NRDY
]++;
430 if (!computeUnit
.globalMemoryPipe
.outstandingReqsCheck(ii
)) {
432 dispNrdyStalls
[SCH_VECTOR_MEM_REQS_NRDY
]++;
437 } else if (ii
->isScalar() && ii
->isGlobalMem()) {
438 // Scalar Global Memory instruction
440 if (!scalarMemIssueRdy
) {
442 dispNrdyStalls
[SCH_SCALAR_MEM_ISSUE_NRDY
]++;
444 if (!scalarMemBusRdy
) {
446 dispNrdyStalls
[SCH_SCALAR_MEM_BUS_BUSY_NRDY
]++;
448 if (!computeUnit
.scalarMemoryPipe
.
449 isGMReqFIFOWrRdy(w
->scalarRdGmReqsInPipe
+
450 w
->scalarWrGmReqsInPipe
)) {
452 dispNrdyStalls
[SCH_SCALAR_MEM_FIFO_NRDY
]++;
457 } else if (!ii
->isScalar() && ii
->isLocalMem()) {
458 // Vector Local Memory instruction
460 if (!locMemIssueRdy
) {
462 dispNrdyStalls
[SCH_LOCAL_MEM_ISSUE_NRDY
]++;
466 dispNrdyStalls
[SCH_LOCAL_MEM_BUS_BUSY_NRDY
]++;
468 if (!computeUnit
.localMemoryPipe
.
469 isLMReqFIFOWrRdy(w
->rdLmReqsInPipe
+ w
->wrLmReqsInPipe
)) {
471 dispNrdyStalls
[SCH_LOCAL_MEM_FIFO_NRDY
]++;
476 } else if (!ii
->isScalar() && ii
->isFlat()) {
477 // Vector Flat memory instruction
479 if (!glbMemIssueRdy
|| !locMemIssueRdy
) {
481 dispNrdyStalls
[SCH_FLAT_MEM_ISSUE_NRDY
]++;
483 if (!glbMemBusRdy
|| !locMemBusRdy
) {
485 dispNrdyStalls
[SCH_FLAT_MEM_BUS_BUSY_NRDY
]++;
487 if (!computeUnit
.globalMemoryPipe
.coalescerReady(ii
)) {
489 dispNrdyStalls
[SCH_FLAT_MEM_COALESCER_NRDY
]++;
491 if (!computeUnit
.globalMemoryPipe
.outstandingReqsCheck(ii
)) {
493 dispNrdyStalls
[SCH_FLAT_MEM_REQS_NRDY
]++;
495 if (!computeUnit
.localMemoryPipe
.
496 isLMReqFIFOWrRdy(w
->rdLmReqsInPipe
+ w
->wrLmReqsInPipe
)) {
498 dispNrdyStalls
[SCH_FLAT_MEM_FIFO_NRDY
]++;
504 panic("%s: unknown instr checked for readiness", ii
->disassemble());
507 dispNrdyStalls
[SCH_RDY
]++;
512 ScheduleStage::fillDispatchList()
514 // update execution resource status
516 // iterate execution resources
517 for (int j
= 0; j
< computeUnit
.numExeUnits(); j
++) {
518 assert(dispatchList
->at(j
).second
== EMPTY
);
520 // iterate waves in schList to pick one for dispatch
521 auto schIter
= schList
.at(j
).begin();
522 bool dispatched
= false;
523 while (schIter
!= schList
.at(j
).end()) {
524 // only attempt to dispatch if status is RFREADY
525 if (schIter
->second
== RFREADY
) {
526 // Check if this wave is ready for dispatch
527 bool dispRdy
= dispatchReady(schIter
->first
);
528 if (!dispatched
&& dispRdy
) {
529 // No other wave has been dispatched for this exe
530 // resource, and this wave is ready. Place this wave
531 // on dispatchList and make it ready for execution
534 // Acquire a coalescer token if it is a global mem
536 GPUDynInstPtr mp
= schIter
->first
->
537 instructionBuffer
.front();
538 if (!mp
->isMemSync() && !mp
->isScalar() &&
539 (mp
->isGlobalMem() || mp
->isFlat())) {
540 computeUnit
.globalMemoryPipe
.acqCoalescerToken(mp
);
543 doDispatchListTransition(j
, EXREADY
, schIter
->first
);
544 DPRINTF(GPUSched
, "dispatchList[%d]: fillDispatchList: "
545 "EMPTY->EXREADY\n", j
);
546 schIter
->first
= nullptr;
547 schIter
= schList
.at(j
).erase(schIter
);
550 // Either another wave has been dispatched, or this wave
551 // was not ready, so it is stalled this cycle
552 schIter
->first
->schStalls
++;
554 // not ready for dispatch, increment stall stat
555 schIter
->first
->schResourceStalls
++;
557 // Examine next wave for this resource
561 // Wave not in RFREADY, try next wave
566 // Increment stall count if no wave sent to dispatchList for
567 // current execution resource
569 schListToDispListStalls
[j
]++;
571 schListToDispList
[j
]++;
577 ScheduleStage::arbitrateVrfToLdsBus()
579 // Arbitrate the VRF->GM and VRF->LDS buses for Flat memory ops
580 // Note: a Flat instruction in GFx8 reserves both VRF->Glb memory bus
581 // and a VRF->LDS bus. In GFx9, this is not the case.
583 // iterate the GM pipelines
584 for (int i
= 0; i
< computeUnit
.numVectorGlobalMemUnits
; i
++) {
585 // get the GM pipe index in the dispatchList
586 int gm_exe_unit
= computeUnit
.firstMemUnit() + i
;
587 // get the wave in the dispatchList
588 Wavefront
*w
= dispatchList
->at(gm_exe_unit
).first
;
589 // If the WF is valid, ready to execute, and the instruction
590 // is a flat access, arbitrate with the WF's assigned LM pipe
591 if (w
&& dispatchList
->at(gm_exe_unit
).second
== EXREADY
&&
592 w
->instructionBuffer
.front()->isFlat()) {
593 // If the associated LM pipe also has a wave selected, block
594 // that wave and let the Flat instruction issue. The WF in the
595 // LM pipe is added back to the schList for consideration next
597 if (dispatchList
->at(w
->localMem
).second
== EXREADY
) {
598 reinsertToSchList(w
->localMem
,
599 dispatchList
->at(w
->localMem
).first
);
600 // Increment stall stats for LDS-VRF arbitration
602 dispatchList
->at(w
->localMem
).first
->schLdsArbStalls
++;
604 // With arbitration of LM pipe complete, transition the
605 // LM pipe to SKIP state in the dispatchList to inform EX stage
606 // that a Flat instruction is executing next cycle
607 doDispatchListTransition(w
->localMem
, SKIP
, w
);
608 DPRINTF(GPUSched
, "dispatchList[%d]: arbVrfLds: "
609 "EXREADY->SKIP\n", w
->localMem
);
615 ScheduleStage::checkRfOperandReadComplete()
617 // Iterate the schList queues and check if operand reads
618 // have completed in the RFs. If so, mark the wave as ready for
619 // selection for dispatchList
620 for (int j
= 0; j
< computeUnit
.numExeUnits(); ++j
) {
621 for (auto &p
: schList
.at(j
)) {
622 Wavefront
*w
= p
.first
;
625 // Increment the number of cycles the wave spends in the
626 // SCH stage, since this loop visits every wave in SCH.
629 GPUDynInstPtr ii
= w
->instructionBuffer
.front();
631 if (!ii
->isScalar()) {
633 computeUnit
.vrf
[w
->simdId
]->operandReadComplete(w
, ii
);
636 computeUnit
.srf
[w
->simdId
]->operandReadComplete(w
, ii
);
637 bool operandsReady
= vrfRdy
&& srfRdy
;
640 "schList[%d]: WV[%d] operands ready for: %d: %s\n",
641 j
, w
->wfDynId
, ii
->seqNum(), ii
->disassemble());
642 DPRINTF(GPUSched
, "schList[%d]: WV[%d] RFBUSY->RFREADY\n",
647 "schList[%d]: WV[%d] operands not ready for: %d: %s\n",
648 j
, w
->wfDynId
, ii
->seqNum(), ii
->disassemble());
650 // operands not ready yet, increment SCH stage stats
651 // aggregate to all wavefronts on the CU
654 // Increment stall stats
656 w
->schOpdNrdyStalls
++;
658 opdNrdyStalls
[SCH_RF_OPD_NRDY
]++;
660 opdNrdyStalls
[SCH_VRF_OPD_NRDY
]++;
663 opdNrdyStalls
[SCH_SRF_OPD_NRDY
]++;
671 ScheduleStage::reserveResources()
673 std::vector
<bool> exeUnitReservations
;
674 exeUnitReservations
.resize(computeUnit
.numExeUnits(), false);
676 for (int j
= 0; j
< computeUnit
.numExeUnits(); ++j
) {
677 Wavefront
*dispatchedWave
= dispatchList
->at(j
).first
;
678 if (dispatchedWave
) {
679 DISPATCH_STATUS s
= dispatchList
->at(j
).second
;
682 } else if (s
== EXREADY
) {
683 // Wave is ready for execution
684 std::vector
<int> execUnitIds
=
685 dispatchedWave
->reserveResources();
686 GPUDynInstPtr ii
= dispatchedWave
->instructionBuffer
.front();
688 if (!ii
->isScalar()) {
689 computeUnit
.vrf
[dispatchedWave
->simdId
]->
690 dispatchInstruction(ii
);
692 computeUnit
.srf
[dispatchedWave
->simdId
]->
693 dispatchInstruction(ii
);
695 std::stringstream ss
;
696 for (auto id
: execUnitIds
) {
699 DPRINTF(GPUSched
, "dispatchList[%d]: SIMD[%d] WV[%d]: %d: %s"
700 " Reserving ExeRes[ %s]\n",
701 j
, dispatchedWave
->simdId
, dispatchedWave
->wfDynId
,
702 ii
->seqNum(), ii
->disassemble(), ss
.str());
703 // mark the resources as reserved for this cycle
704 for (auto execUnitId
: execUnitIds
) {
705 panic_if(exeUnitReservations
.at(execUnitId
),
706 "Execution unit %d is reserved!!!\n"
707 "SIMD[%d] WV[%d]: %d: %s",
708 execUnitId
, dispatchedWave
->simdId
,
709 dispatchedWave
->wfDynId
,
710 ii
->seqNum(), ii
->disassemble());
711 exeUnitReservations
.at(execUnitId
) = true;
714 // If wavefront::reserveResources reserved multiple resources,
715 // then we're executing a flat memory instruction. This means
716 // that we've reserved a global and local memory unit. Thus,
717 // we need to mark the latter execution unit as not available.
718 if (execUnitIds
.size() > 1) {
719 int lm_exec_unit M5_VAR_USED
= dispatchedWave
->localMem
;
720 assert(dispatchList
->at(lm_exec_unit
).second
== SKIP
);
722 } else if (s
== SKIP
) {
723 // Shared Memory pipe reserved for FLAT instruction.
724 // Verify the GM pipe for this wave is ready to execute
725 // and the wave in the GM pipe is the same as the wave
727 int gm_exec_unit M5_VAR_USED
= dispatchedWave
->globalMem
;
728 assert(dispatchList
->at(gm_exec_unit
).first
->wfDynId
==
729 dispatchedWave
->wfDynId
);
730 assert(dispatchList
->at(gm_exec_unit
).second
== EXREADY
);
737 ScheduleStage::deleteFromSch(Wavefront
*w
)
739 wavesInSch
.erase(w
->wfDynId
);
743 ScheduleStage::regStats()
746 .init(computeUnit
.numExeUnits())
747 .name(name() + ".rdy_list_not_empty")
748 .desc("number of cycles one or more wave on ready list per "
749 "execution resource")
753 .init(computeUnit
.numExeUnits())
754 .name(name() + ".rdy_list_empty")
755 .desc("number of cycles no wave on ready list per "
756 "execution resource")
760 .init(computeUnit
.numExeUnits())
761 .name(name() + ".sch_list_add_stalls")
762 .desc("number of cycles a wave is not added to schList per "
763 "execution resource when ready list is not empty")
767 .init(computeUnit
.numExeUnits())
768 .name(name() + ".sch_list_to_disp_list")
769 .desc("number of cycles a wave is added to dispatchList per "
770 "execution resource")
773 schListToDispListStalls
774 .init(computeUnit
.numExeUnits())
775 .name(name() + ".sch_list_to_disp_list_stalls")
776 .desc("number of cycles no wave is added to dispatchList per "
777 "execution resource")
780 // Operand Readiness Stall Cycles
782 .init(SCH_RF_OPD_NRDY_CONDITIONS
)
783 .name(name() + ".opd_nrdy_stalls")
784 .desc("number of stalls in SCH due to operands not ready")
786 opdNrdyStalls
.subname(SCH_VRF_OPD_NRDY
, csprintf("VRF"));
787 opdNrdyStalls
.subname(SCH_SRF_OPD_NRDY
, csprintf("SRF"));
788 opdNrdyStalls
.subname(SCH_RF_OPD_NRDY
, csprintf("RF"));
790 // dispatchReady Stall Cycles
792 .init(SCH_NRDY_CONDITIONS
)
793 .name(name() + ".disp_nrdy_stalls")
794 .desc("number of stalls in SCH due to resource not ready")
796 dispNrdyStalls
.subname(SCH_SCALAR_ALU_NRDY
, csprintf("ScalarAlu"));
797 dispNrdyStalls
.subname(SCH_VECTOR_ALU_NRDY
, csprintf("VectorAlu"));
798 dispNrdyStalls
.subname(SCH_VECTOR_MEM_ISSUE_NRDY
,
799 csprintf("VectorMemIssue"));
800 dispNrdyStalls
.subname(SCH_VECTOR_MEM_BUS_BUSY_NRDY
,
801 csprintf("VectorMemBusBusy"));
802 dispNrdyStalls
.subname(SCH_VECTOR_MEM_COALESCER_NRDY
,
803 csprintf("VectorMemCoalescer"));
804 dispNrdyStalls
.subname(SCH_CEDE_SIMD_NRDY
, csprintf("CedeSimd"));
805 dispNrdyStalls
.subname(SCH_SCALAR_MEM_ISSUE_NRDY
,
806 csprintf("ScalarMemIssue"));
807 dispNrdyStalls
.subname(SCH_SCALAR_MEM_BUS_BUSY_NRDY
,
808 csprintf("ScalarMemBusBusy"));
809 dispNrdyStalls
.subname(SCH_SCALAR_MEM_FIFO_NRDY
,
810 csprintf("ScalarMemFIFO"));
811 dispNrdyStalls
.subname(SCH_LOCAL_MEM_ISSUE_NRDY
,
812 csprintf("LocalMemIssue"));
813 dispNrdyStalls
.subname(SCH_LOCAL_MEM_BUS_BUSY_NRDY
,
814 csprintf("LocalMemBusBusy"));
815 dispNrdyStalls
.subname(SCH_LOCAL_MEM_FIFO_NRDY
,
816 csprintf("LocalMemFIFO"));
817 dispNrdyStalls
.subname(SCH_FLAT_MEM_ISSUE_NRDY
,
818 csprintf("FlatMemIssue"));
819 dispNrdyStalls
.subname(SCH_FLAT_MEM_BUS_BUSY_NRDY
,
820 csprintf("FlatMemBusBusy"));
821 dispNrdyStalls
.subname(SCH_FLAT_MEM_COALESCER_NRDY
,
822 csprintf("FlatMemCoalescer"));
823 dispNrdyStalls
.subname(SCH_FLAT_MEM_FIFO_NRDY
,
824 csprintf("FlatMemFIFO"));
825 dispNrdyStalls
.subname(SCH_RDY
, csprintf("Ready"));
827 // RF Access Stall Cycles
829 .init(SCH_RF_ACCESS_NRDY_CONDITIONS
)
830 .name(name() + ".rf_access_stalls")
831 .desc("number of stalls due to RF access denied")
833 rfAccessStalls
.subname(SCH_VRF_RD_ACCESS_NRDY
, csprintf("VrfRd"));
834 rfAccessStalls
.subname(SCH_VRF_WR_ACCESS_NRDY
, csprintf("VrfWr"));
835 rfAccessStalls
.subname(SCH_SRF_RD_ACCESS_NRDY
, csprintf("SrfRd"));
836 rfAccessStalls
.subname(SCH_SRF_WR_ACCESS_NRDY
, csprintf("SrfWr"));
837 rfAccessStalls
.subname(SCH_RF_ACCESS_NRDY
, csprintf("Any"));
839 // Stall cycles due to wave losing LDS bus arbitration
841 .name(name() + ".lds_bus_arb_stalls")
842 .desc("number of stalls due to VRF->LDS bus conflicts")