gpu-compute: Use refs to CU in pipe stages/mem pipes
[gem5.git] / src / gpu-compute / schedule_stage.cc
/*
 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "gpu-compute/schedule_stage.hh"

#include <unordered_set>

#include "debug/GPUSched.hh"
#include "debug/GPUVRF.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"

ScheduleStage::ScheduleStage(const ComputeUnitParams *p, ComputeUnit &cu)
    : computeUnit(cu), _name(cu.name() + ".ScheduleStage"),
      vectorAluRdy(false), scalarAluRdy(false), scalarMemBusRdy(false),
      scalarMemIssueRdy(false), glbMemBusRdy(false), glbMemIssueRdy(false),
      locMemBusRdy(false), locMemIssueRdy(false)
{
    for (int j = 0; j < cu.numExeUnits(); ++j) {
        scheduler.emplace_back(p);
    }
    wavesInSch.clear();
    schList.resize(cu.numExeUnits());
    for (auto &dq : schList) {
        dq.clear();
    }
}
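
// Illustrative note: schList holds, per execution unit, a deque of
// (wavefront, status) pairs. A hypothetical snapshot with one GM pipe
// and one SIMD unit might look like:
//     schList[GM pipe]: (WV[12], RFBUSY), (WV[17], RFREADY)
//     schList[SIMD0]:   (WV[3],  RFREADY)
// Waves are kept oldest-first (smallest wfDynId at the front).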

ScheduleStage::~ScheduleStage()
{
    scheduler.clear();
    wavesInSch.clear();
    schList.clear();
}

void
ScheduleStage::init()
{
    fatal_if(scheduler.size() != computeUnit.readyList.size(),
             "Scheduler should have same number of entries as CU's readyList");
    for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
        scheduler[j].bindList(&computeUnit.readyList[j]);
    }

    dispatchList = &computeUnit.dispatchList;

    assert(computeUnit.numVectorGlobalMemUnits == 1);
    assert(computeUnit.numVectorSharedMemUnits == 1);
}

void
ScheduleStage::exec()
{
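    // Per-cycle flow implemented below (summary):
    //   1) refresh the readyLists, filtering out waves already in SCH
    //   2) per execution resource (memory units first), pick one ready
    //      wave and try to add it to schList, scheduling its RF reads
    //   3) promote waves whose RF reads completed (RFBUSY->RFREADY)
    //   4) fill dispatchList with the oldest dispatch-ready wave per
    //      resource
    //   5) arbitrate FLAT ops between the GM and LM pipes, schedule RF
    //      writes, and reserve execution resources
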
    // Update readyList
    for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
        // delete all ready wavefronts whose instruction buffers are now
        // empty because the last instruction was executed
        computeUnit.updateReadyList(j);
        /**
         * Remove any wave that already has an instruction present in SCH
         * waiting for RF reads to complete. This prevents out of order
         * execution within a wave.
         */
        for (auto wIt = computeUnit.readyList.at(j).begin();
             wIt != computeUnit.readyList.at(j).end();) {
            if (wavesInSch.find((*wIt)->wfDynId) != wavesInSch.end()) {
                *wIt = nullptr;
                wIt = computeUnit.readyList.at(j).erase(wIt);
            } else {
                wIt++;
            }
        }
    }

    // Attempt to add another wave for each EXE type to the schList queues.
    // VMEM resources are iterated first, effectively giving priority to
    // VMEM over VALU when scheduling operand reads at the RFs.
    // Scalar memory is iterated after VMEM.

    // Iterate VMEM and SMEM
    int firstMemUnit = computeUnit.firstMemUnit();
    int lastMemUnit = computeUnit.lastMemUnit();
    for (int j = firstMemUnit; j <= lastMemUnit; j++) {
        int readyListSize = computeUnit.readyList[j].size();
        // If no wave is ready to be scheduled on the execution resource
        // then skip scheduling for this execution resource
        if (!readyListSize) {
            rdyListEmpty[j]++;
            continue;
        }
        rdyListNotEmpty[j]++;

        // Pick a wave and attempt to add it to schList
        Wavefront *w = scheduler[j].chooseWave();
        if (!addToSchList(j, w)) {
            // For waves not added to schList, increment count of cycles
            // this wave spends in SCH stage.
            w->schCycles++;
            addToSchListStalls[j]++;
        }
    }

    // Iterate everything else
    for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
        // skip the VMEM resources
        if (j >= firstMemUnit && j <= lastMemUnit) {
            continue;
        }
        int readyListSize = computeUnit.readyList[j].size();
        // If no wave is ready to be scheduled on the execution resource
        // then skip scheduling for this execution resource
        if (!readyListSize) {
            rdyListEmpty[j]++;
            continue;
        }
        rdyListNotEmpty[j]++;

        // Pick a wave and attempt to add it to schList
        Wavefront *w = scheduler[j].chooseWave();
        if (!addToSchList(j, w)) {
            // For waves not added to schList, increment count of cycles
            // this wave spends in SCH stage.
            w->schCycles++;
            addToSchListStalls[j]++;
        }
    }

    // At this point, the schList queue for each EXE type may contain
    // multiple waves, in order of age (oldest to youngest).
    // A wave may be in RFBUSY, indicating it is waiting for its registers
    // to be read, or in RFREADY, indicating it is a candidate for the
    // dispatchList and execution.

    // Iterate schList queues and check if any of the waves have finished
    // reading their operands, moving those waves to RFREADY status
    checkRfOperandReadComplete();

    // Fill the dispatch list with the oldest wave of each EXE type that
    // is ready to execute.
    // A wave is picked if its status in schList is RFREADY and it passes
    // resource-ready checks similar to those currently in the SCB.
    fillDispatchList();

    // Resource arbitration on waves in dispatchList.
    // Losing waves are re-inserted into the schList at a location
    // determined by wave age.

    // Arbitrate access to the VRF->LDS bus
    arbitrateVrfToLdsBus();

    // Schedule write operations to the register files
    scheduleRfDestOperands();

    // Lastly, reserve resources for waves that are ready to execute.
    reserveResources();
}

void
ScheduleStage::doDispatchListTransition(int unitId, DISPATCH_STATUS s,
                                        Wavefront *w)
{
    dispatchList->at(unitId).first = w;
    dispatchList->at(unitId).second = s;
}

bool
ScheduleStage::schedRfWrites(int exeType, Wavefront *w)
{
    GPUDynInstPtr ii = w->instructionBuffer.front();
    assert(ii);
    bool accessVrfWr = true;
    if (!ii->isScalar()) {
        accessVrfWr =
            computeUnit.vrf[w->simdId]->canScheduleWriteOperands(w, ii);
    }
    bool accessSrfWr =
        computeUnit.srf[w->simdId]->canScheduleWriteOperands(w, ii);
    bool accessRf = accessVrfWr && accessSrfWr;
    if (accessRf) {
        if (!ii->isScalar()) {
            computeUnit.vrf[w->simdId]->scheduleWriteOperands(w, ii);
        }
        computeUnit.srf[w->simdId]->scheduleWriteOperands(w, ii);
        return true;
    } else {
        rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
        if (!accessSrfWr) {
            rfAccessStalls[SCH_SRF_WR_ACCESS_NRDY]++;
        }
        if (!accessVrfWr) {
            rfAccessStalls[SCH_VRF_WR_ACCESS_NRDY]++;
        }

        // Increment stall counts for WF
        w->schStalls++;
        w->schRfAccessStalls++;
    }
    return false;
}
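
// Note: operand reads are scheduled when a wave enters the schList (see
// addToSchList below), whereas destination writes are only scheduled in
// schedRfWrites above, once the wave has won a dispatchList slot. A wave
// can therefore stall twice on RF ports: once for reads, once for writes.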

void
ScheduleStage::scheduleRfDestOperands()
{
    for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
        if (!dispatchList->at(j).first) {
            continue;
        }
        // get the wave on dispatch list and attempt to allocate write
        // resources in the RFs
        Wavefront *w = dispatchList->at(j).first;
        if (!schedRfWrites(j, w)) {
            reinsertToSchList(j, w);
            doDispatchListTransition(j, EMPTY);
            // if this is a flat inst, also transition the LM pipe to empty.
            // Note: since FLAT/LM arbitration occurs before scheduling
            // destination operands to the RFs, it is possible that an LM
            // instruction lost arbitration, but would have been able to
            // pass the RF destination operand check here, and execute
            // instead of the FLAT.
            if (w->instructionBuffer.front()->isFlat()) {
                assert(dispatchList->at(w->localMem).second == SKIP);
                doDispatchListTransition(w->localMem, EMPTY);
            }
        }
    }
}

bool
ScheduleStage::addToSchList(int exeType, Wavefront *w)
{
    // Attempt to add the wave to the schList if the RFs can support the
    // wave's next instruction
    GPUDynInstPtr ii = w->instructionBuffer.front();
    assert(ii);
    bool accessVrf = true;
    if (!ii->isScalar()) {
        accessVrf =
            computeUnit.vrf[w->simdId]->canScheduleReadOperands(w, ii);
    }
    bool accessSrf =
        computeUnit.srf[w->simdId]->canScheduleReadOperands(w, ii);
    // If the RFs can support the instruction, add it to schList in RFBUSY
    // state, place the wave in wavesInSch and the pipeMap, and schedule
    // its operand reads at the RFs
    bool accessRf = accessVrf && accessSrf;
    if (accessRf) {
        DPRINTF(GPUSched, "schList[%d]: Adding: SIMD[%d] WV[%d]: %d: %s\n",
                exeType, w->simdId, w->wfDynId,
                ii->seqNum(), ii->disassemble());

        computeUnit.insertInPipeMap(w);
        wavesInSch.emplace(w->wfDynId);
        schList.at(exeType).push_back(std::make_pair(w, RFBUSY));
        if (w->isOldestInstWaitcnt()) {
            w->setStatus(Wavefront::S_WAITCNT);
        }
        if (!ii->isScalar()) {
            computeUnit.vrf[w->simdId]->scheduleReadOperands(w, ii);
        }
        computeUnit.srf[w->simdId]->scheduleReadOperands(w, ii);

        DPRINTF(GPUSched, "schList[%d]: Added: SIMD[%d] WV[%d]: %d: %s\n",
                exeType, w->simdId, w->wfDynId,
                ii->seqNum(), ii->disassemble());
        return true;
    } else {
        // Number of stall cycles due to RF access denied
        rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
        // Count number of denials due to each reason
        // Multiple items may contribute to the denied request
        if (!accessVrf) {
            rfAccessStalls[SCH_VRF_RD_ACCESS_NRDY]++;
        }
        if (!accessSrf) {
            rfAccessStalls[SCH_SRF_RD_ACCESS_NRDY]++;
        }

        // Increment stall counts for WF
        w->schStalls++;
        w->schRfAccessStalls++;
        DPRINTF(GPUSched, "schList[%d]: Could not add: "
                "SIMD[%d] WV[%d]: %d: %s\n",
                exeType, w->simdId, w->wfDynId,
                ii->seqNum(), ii->disassemble());
    }
    return false;
}

void
ScheduleStage::reinsertToSchList(int exeType, Wavefront *w)
{
    // Insert wave w into schList for the specified exeType.
    // Waves are inserted in age order, with the oldest wave at the
    // front of the schList
    auto schIter = schList.at(exeType).begin();
    while (schIter != schList.at(exeType).end()
           && schIter->first->wfDynId < w->wfDynId) {
        schIter++;
    }
    schList.at(exeType).insert(schIter, std::make_pair(w, RFREADY));
}
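
// Example (illustrative): if schList[exeType] already holds waves with
// wfDynId 3 and 9, re-inserting a losing wave with wfDynId 7 yields the
// order 3, 7, 9, so fillDispatchList() always considers the oldest
// RFREADY wave first.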

void
ScheduleStage::checkMemResources()
{
    // Check for resource availability in the next cycle
    scalarMemBusRdy = false;
    scalarMemIssueRdy = false;
    // check if there is an SRF->Global Memory bus available
    if (computeUnit.srfToScalarMemPipeBus.rdy(Cycles(1))) {
        scalarMemBusRdy = true;
    }
    // check if we can issue a scalar memory instruction
    if (computeUnit.scalarMemUnit.rdy(Cycles(1))) {
        scalarMemIssueRdy = true;
    }

    glbMemBusRdy = false;
    glbMemIssueRdy = false;
    // check if there is a VRF->Global Memory bus available
    if (computeUnit.vrfToGlobalMemPipeBus.rdy(Cycles(1))) {
        glbMemBusRdy = true;
    }
    // check if we can issue a Global memory instruction
    if (computeUnit.vectorGlobalMemUnit.rdy(Cycles(1))) {
        glbMemIssueRdy = true;
    }

    locMemBusRdy = false;
    locMemIssueRdy = false;
    // check if there is a VRF->LDS bus available
    if (computeUnit.vrfToLocalMemPipeBus.rdy(Cycles(1))) {
        locMemBusRdy = true;
    }
    // check if we can issue an LDS instruction
    if (computeUnit.vectorSharedMemUnit.rdy(Cycles(1))) {
        locMemIssueRdy = true;
    }
}
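
// Note: the rdy(Cycles(1)) checks above ask whether each resource will be
// free one cycle from now, since a wave placed on the dispatchList this
// cycle does not begin executing until the next cycle.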

bool
ScheduleStage::dispatchReady(Wavefront *w)
{
    vectorAluRdy = false;
    scalarAluRdy = false;
    // check for available vector/scalar ALUs in the next cycle
    if (computeUnit.vectorALUs[w->simdId].rdy(Cycles(1))) {
        vectorAluRdy = true;
    }
    if (computeUnit.scalarALUs[w->scalarAlu].rdy(Cycles(1))) {
        scalarAluRdy = true;
    }
    GPUDynInstPtr ii = w->instructionBuffer.front();

    if (ii->isNop()) {
        // S_NOP requires SALU. V_NOP requires VALU.
        // TODO: Scalar NOP does not require SALU in hardware,
        // and is executed out of IB directly.
        if (ii->isScalar() && !scalarAluRdy) {
            dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
            return false;
        } else if (!ii->isScalar() && !vectorAluRdy) {
            dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
            return false;
        }
    } else if (ii->isEndOfKernel()) {
        // EndPgm instruction
        if (ii->isScalar() && !scalarAluRdy) {
            dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
            return false;
        }
    } else if (ii->isBarrier() || ii->isBranch() || ii->isALU()) {
        // Barrier, Branch, or ALU instruction
        if (ii->isScalar() && !scalarAluRdy) {
            dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
            return false;
        } else if (!ii->isScalar() && !vectorAluRdy) {
            dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
            return false;
        }
    } else if (!ii->isScalar() && ii->isGlobalMem()) {
        // Vector Global Memory instruction
        bool rdy = true;
        if (!glbMemIssueRdy) {
            rdy = false;
            dispNrdyStalls[SCH_VECTOR_MEM_ISSUE_NRDY]++;
        }
        if (!glbMemBusRdy) {
            rdy = false;
            dispNrdyStalls[SCH_VECTOR_MEM_BUS_BUSY_NRDY]++;
        }
        if (!computeUnit.globalMemoryPipe.coalescerReady(ii)) {
            rdy = false;
            dispNrdyStalls[SCH_VECTOR_MEM_COALESCER_NRDY]++;
        }
        if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(ii)) {
            rdy = false;
            dispNrdyStalls[SCH_VECTOR_MEM_REQS_NRDY]++;
        }
        if (!rdy) {
            return false;
        }
    } else if (ii->isScalar() && ii->isGlobalMem()) {
        // Scalar Global Memory instruction
        bool rdy = true;
        if (!scalarMemIssueRdy) {
            rdy = false;
            dispNrdyStalls[SCH_SCALAR_MEM_ISSUE_NRDY]++;
        }
        if (!scalarMemBusRdy) {
            rdy = false;
            dispNrdyStalls[SCH_SCALAR_MEM_BUS_BUSY_NRDY]++;
        }
        if (!computeUnit.scalarMemoryPipe.
                isGMReqFIFOWrRdy(w->scalarRdGmReqsInPipe +
                                 w->scalarWrGmReqsInPipe)) {
            rdy = false;
            dispNrdyStalls[SCH_SCALAR_MEM_FIFO_NRDY]++;
        }
        if (!rdy) {
            return false;
        }
    } else if (!ii->isScalar() && ii->isLocalMem()) {
        // Vector Local Memory instruction
        bool rdy = true;
        if (!locMemIssueRdy) {
            rdy = false;
            dispNrdyStalls[SCH_LOCAL_MEM_ISSUE_NRDY]++;
        }
        if (!locMemBusRdy) {
            rdy = false;
            dispNrdyStalls[SCH_LOCAL_MEM_BUS_BUSY_NRDY]++;
        }
        if (!computeUnit.localMemoryPipe.
                isLMReqFIFOWrRdy(w->rdLmReqsInPipe + w->wrLmReqsInPipe)) {
            rdy = false;
            dispNrdyStalls[SCH_LOCAL_MEM_FIFO_NRDY]++;
        }
        if (!rdy) {
            return false;
        }
    } else if (!ii->isScalar() && ii->isFlat()) {
        // Vector Flat memory instruction
        bool rdy = true;
        if (!glbMemIssueRdy || !locMemIssueRdy) {
            rdy = false;
            dispNrdyStalls[SCH_FLAT_MEM_ISSUE_NRDY]++;
        }
        if (!glbMemBusRdy || !locMemBusRdy) {
            rdy = false;
            dispNrdyStalls[SCH_FLAT_MEM_BUS_BUSY_NRDY]++;
        }
        if (!computeUnit.globalMemoryPipe.coalescerReady(ii)) {
            rdy = false;
            dispNrdyStalls[SCH_FLAT_MEM_COALESCER_NRDY]++;
        }
        if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(ii)) {
            rdy = false;
            dispNrdyStalls[SCH_FLAT_MEM_REQS_NRDY]++;
        }
        if (!computeUnit.localMemoryPipe.
                isLMReqFIFOWrRdy(w->rdLmReqsInPipe + w->wrLmReqsInPipe)) {
            rdy = false;
            dispNrdyStalls[SCH_FLAT_MEM_FIFO_NRDY]++;
        }
        if (!rdy) {
            return false;
        }
    } else {
        panic("%s: unknown instr checked for readiness", ii->disassemble());
        return false;
    }
    dispNrdyStalls[SCH_RDY]++;
    return true;
}
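
// Readiness requirements checked above, summarized (derived from the code
// paths, not an exhaustive hardware model):
//   NOP/EndPgm/Barrier/Branch/ALU : scalar or vector ALU free next cycle
//   vector global memory          : GM issue unit, VRF->GM bus, coalescer,
//                                   outstanding-request slots
//   scalar global memory          : scalar mem unit, SRF->GM bus, GM FIFO
//   vector local memory (LDS)     : LM issue unit, VRF->LDS bus, LM FIFO
//   FLAT                          : both the GM and LM resource sets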

void
ScheduleStage::fillDispatchList()
{
    // update execution resource status
    checkMemResources();
    // iterate execution resources
    for (int j = 0; j < computeUnit.numExeUnits(); j++) {
        assert(dispatchList->at(j).second == EMPTY);

        // iterate waves in schList to pick one for dispatch
        auto schIter = schList.at(j).begin();
        bool dispatched = false;
        while (schIter != schList.at(j).end()) {
            // only attempt to dispatch if status is RFREADY
            if (schIter->second == RFREADY) {
                // Check if this wave is ready for dispatch
                bool dispRdy = dispatchReady(schIter->first);
                if (!dispatched && dispRdy) {
                    // No other wave has been dispatched for this exe
                    // resource, and this wave is ready. Place this wave
                    // on dispatchList and make it ready for execution
                    // next cycle.

                    // Acquire a coalescer token if it is a global mem
                    // operation.
                    GPUDynInstPtr mp = schIter->first->
                        instructionBuffer.front();
                    if (!mp->isMemSync() && !mp->isScalar() &&
                        (mp->isGlobalMem() || mp->isFlat())) {
                        computeUnit.globalMemoryPipe.acqCoalescerToken(mp);
                    }

                    doDispatchListTransition(j, EXREADY, schIter->first);
                    DPRINTF(GPUSched, "dispatchList[%d]: fillDispatchList: "
                            "EMPTY->EXREADY\n", j);
                    schIter->first = nullptr;
                    schIter = schList.at(j).erase(schIter);
                    dispatched = true;
                } else {
                    // Either another wave has been dispatched, or this wave
                    // was not ready, so it is stalled this cycle
                    schIter->first->schStalls++;
                    if (!dispRdy) {
                        // not ready for dispatch, increment stall stat
                        schIter->first->schResourceStalls++;
                    }
                    // Examine next wave for this resource
                    schIter++;
                }
            } else {
                // Wave not in RFREADY, try next wave
                schIter++;
            }
        }

        // Increment stall count if no wave sent to dispatchList for
        // current execution resource
        if (!dispatched) {
            schListToDispListStalls[j]++;
        } else {
            schListToDispList[j]++;
        }
    }
}

void
ScheduleStage::arbitrateVrfToLdsBus()
{
    // Arbitrate the VRF->GM and VRF->LDS buses for Flat memory ops.
    // Note: a Flat instruction in GFx8 reserves both the VRF->Glb memory
    // bus and a VRF->LDS bus. In GFx9, this is not the case.

    // iterate the GM pipelines
    for (int i = 0; i < computeUnit.numVectorGlobalMemUnits; i++) {
        // get the GM pipe index in the dispatchList
        int gm_exe_unit = computeUnit.firstMemUnit() + i;
        // get the wave in the dispatchList
        Wavefront *w = dispatchList->at(gm_exe_unit).first;
        // If the WF is valid, ready to execute, and the instruction
        // is a flat access, arbitrate with the WF's assigned LM pipe
        if (w && dispatchList->at(gm_exe_unit).second == EXREADY &&
            w->instructionBuffer.front()->isFlat()) {
            // If the associated LM pipe also has a wave selected, block
            // that wave and let the Flat instruction issue. The WF in the
            // LM pipe is added back to the schList for consideration next
            // cycle.
            if (dispatchList->at(w->localMem).second == EXREADY) {
                reinsertToSchList(w->localMem,
                                  dispatchList->at(w->localMem).first);
                // Increment stall stats for LDS-VRF arbitration
                ldsBusArbStalls++;
                dispatchList->at(w->localMem).first->schLdsArbStalls++;
            }
            // With arbitration of the LM pipe complete, transition the
            // LM pipe to SKIP state in the dispatchList to inform the EX
            // stage that a Flat instruction is executing next cycle
            doDispatchListTransition(w->localMem, SKIP, w);
            DPRINTF(GPUSched, "dispatchList[%d]: arbVrfLds: "
                    "EXREADY->SKIP\n", w->localMem);
        }
    }
}
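
// Example (illustrative): if WV[5] holds a FLAT op in a GM pipe while
// WV[8] holds an LDS op in WV[5]'s LM pipe, WV[8] is pushed back onto the
// schList and the LM pipe is marked SKIP, telling the EX stage that the
// FLAT op owns both buses next cycle.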

void
ScheduleStage::checkRfOperandReadComplete()
{
    // Iterate the schList queues and check if operand reads
    // have completed in the RFs. If so, mark the wave as ready for
    // selection for the dispatchList
    for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
        for (auto &p : schList.at(j)) {
            Wavefront *w = p.first;
            assert(w);

            // Increment the number of cycles the wave spends in the
            // SCH stage, since this loop visits every wave in SCH.
            w->schCycles++;

            GPUDynInstPtr ii = w->instructionBuffer.front();
            bool vrfRdy = true;
            if (!ii->isScalar()) {
                vrfRdy =
                    computeUnit.vrf[w->simdId]->operandReadComplete(w, ii);
            }
            bool srfRdy =
                computeUnit.srf[w->simdId]->operandReadComplete(w, ii);
            bool operandsReady = vrfRdy && srfRdy;
            if (operandsReady) {
                DPRINTF(GPUSched,
                        "schList[%d]: WV[%d] operands ready for: %d: %s\n",
                        j, w->wfDynId, ii->seqNum(), ii->disassemble());
                DPRINTF(GPUSched, "schList[%d]: WV[%d] RFBUSY->RFREADY\n",
                        j, w->wfDynId);
                p.second = RFREADY;
            } else {
                DPRINTF(GPUSched,
                        "schList[%d]: WV[%d] operands not ready for: %d: %s\n",
                        j, w->wfDynId, ii->seqNum(), ii->disassemble());

                // operands not ready yet; increment SCH stage stats
                // aggregated over all wavefronts on the CU
                p.second = RFBUSY;

                // Increment stall stats
                w->schStalls++;
                w->schOpdNrdyStalls++;

                opdNrdyStalls[SCH_RF_OPD_NRDY]++;
                if (!vrfRdy) {
                    opdNrdyStalls[SCH_VRF_OPD_NRDY]++;
                }
                if (!srfRdy) {
                    opdNrdyStalls[SCH_SRF_OPD_NRDY]++;
                }
            }
        }
    }
}

void
ScheduleStage::reserveResources()
{
    std::vector<bool> exeUnitReservations;
    exeUnitReservations.resize(computeUnit.numExeUnits(), false);

    for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
        Wavefront *dispatchedWave = dispatchList->at(j).first;
        if (dispatchedWave) {
            DISPATCH_STATUS s = dispatchList->at(j).second;
            if (s == EMPTY) {
                continue;
            } else if (s == EXREADY) {
                // Wave is ready for execution
                std::vector<int> execUnitIds =
                    dispatchedWave->reserveResources();
                GPUDynInstPtr ii = dispatchedWave->instructionBuffer.front();

                if (!ii->isScalar()) {
                    computeUnit.vrf[dispatchedWave->simdId]->
                        dispatchInstruction(ii);
                }
                computeUnit.srf[dispatchedWave->simdId]->
                    dispatchInstruction(ii);

                std::stringstream ss;
                for (auto id : execUnitIds) {
                    ss << id << " ";
                }
                DPRINTF(GPUSched, "dispatchList[%d]: SIMD[%d] WV[%d]: %d: %s"
                        " Reserving ExeRes[ %s]\n",
                        j, dispatchedWave->simdId, dispatchedWave->wfDynId,
                        ii->seqNum(), ii->disassemble(), ss.str());
                // mark the resources as reserved for this cycle
                for (auto execUnitId : execUnitIds) {
                    panic_if(exeUnitReservations.at(execUnitId),
                             "Execution unit %d is reserved!!!\n"
                             "SIMD[%d] WV[%d]: %d: %s",
                             execUnitId, dispatchedWave->simdId,
                             dispatchedWave->wfDynId,
                             ii->seqNum(), ii->disassemble());
                    exeUnitReservations.at(execUnitId) = true;
                }

                // If wavefront::reserveResources reserved multiple resources,
                // then we're executing a flat memory instruction. This means
                // that we've reserved a global and local memory unit. Thus,
                // we need to mark the latter execution unit as not available.
                if (execUnitIds.size() > 1) {
                    int lm_exec_unit M5_VAR_USED = dispatchedWave->localMem;
                    assert(dispatchList->at(lm_exec_unit).second == SKIP);
                }
            } else if (s == SKIP) {
                // Shared Memory pipe reserved for FLAT instruction.
                // Verify the GM pipe for this wave is ready to execute
                // and the wave in the GM pipe is the same as the wave
                // in the LM pipe
                int gm_exec_unit M5_VAR_USED = dispatchedWave->globalMem;
                assert(dispatchList->at(gm_exec_unit).first->wfDynId ==
                       dispatchedWave->wfDynId);
                assert(dispatchList->at(gm_exec_unit).second == EXREADY);
            }
        }
    }
}

void
ScheduleStage::deleteFromSch(Wavefront *w)
{
    wavesInSch.erase(w->wfDynId);
}
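
// Note (assumption; callers are outside this file): deleteFromSch is
// expected to run once a wave's instruction leaves the SCH stage, allowing
// the wave to reappear on a readyList. Until then, the wavesInSch check in
// exec() filters the wave out to keep execution in order within a wave.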

void
ScheduleStage::regStats()
{
    rdyListNotEmpty
        .init(computeUnit.numExeUnits())
        .name(name() + ".rdy_list_not_empty")
        .desc("number of cycles one or more waves are on the ready list, "
              "per execution resource")
    ;

    rdyListEmpty
        .init(computeUnit.numExeUnits())
        .name(name() + ".rdy_list_empty")
        .desc("number of cycles no wave is on the ready list, "
              "per execution resource")
    ;

    addToSchListStalls
        .init(computeUnit.numExeUnits())
        .name(name() + ".sch_list_add_stalls")
        .desc("number of cycles a wave is not added to schList per "
              "execution resource when ready list is not empty")
    ;

    schListToDispList
        .init(computeUnit.numExeUnits())
        .name(name() + ".sch_list_to_disp_list")
        .desc("number of cycles a wave is added to dispatchList per "
              "execution resource")
    ;

    schListToDispListStalls
        .init(computeUnit.numExeUnits())
        .name(name() + ".sch_list_to_disp_list_stalls")
        .desc("number of cycles no wave is added to dispatchList per "
              "execution resource")
    ;

    // Operand Readiness Stall Cycles
    opdNrdyStalls
        .init(SCH_RF_OPD_NRDY_CONDITIONS)
        .name(name() + ".opd_nrdy_stalls")
        .desc("number of stalls in SCH due to operands not ready")
    ;
    opdNrdyStalls.subname(SCH_VRF_OPD_NRDY, csprintf("VRF"));
    opdNrdyStalls.subname(SCH_SRF_OPD_NRDY, csprintf("SRF"));
    opdNrdyStalls.subname(SCH_RF_OPD_NRDY, csprintf("RF"));

    // dispatchReady Stall Cycles
    dispNrdyStalls
        .init(SCH_NRDY_CONDITIONS)
        .name(name() + ".disp_nrdy_stalls")
        .desc("number of stalls in SCH due to resource not ready")
    ;
    dispNrdyStalls.subname(SCH_SCALAR_ALU_NRDY, csprintf("ScalarAlu"));
    dispNrdyStalls.subname(SCH_VECTOR_ALU_NRDY, csprintf("VectorAlu"));
    dispNrdyStalls.subname(SCH_VECTOR_MEM_ISSUE_NRDY,
                           csprintf("VectorMemIssue"));
    dispNrdyStalls.subname(SCH_VECTOR_MEM_BUS_BUSY_NRDY,
                           csprintf("VectorMemBusBusy"));
    dispNrdyStalls.subname(SCH_VECTOR_MEM_COALESCER_NRDY,
                           csprintf("VectorMemCoalescer"));
    dispNrdyStalls.subname(SCH_CEDE_SIMD_NRDY, csprintf("CedeSimd"));
    dispNrdyStalls.subname(SCH_SCALAR_MEM_ISSUE_NRDY,
                           csprintf("ScalarMemIssue"));
    dispNrdyStalls.subname(SCH_SCALAR_MEM_BUS_BUSY_NRDY,
                           csprintf("ScalarMemBusBusy"));
    dispNrdyStalls.subname(SCH_SCALAR_MEM_FIFO_NRDY,
                           csprintf("ScalarMemFIFO"));
    dispNrdyStalls.subname(SCH_LOCAL_MEM_ISSUE_NRDY,
                           csprintf("LocalMemIssue"));
    dispNrdyStalls.subname(SCH_LOCAL_MEM_BUS_BUSY_NRDY,
                           csprintf("LocalMemBusBusy"));
    dispNrdyStalls.subname(SCH_LOCAL_MEM_FIFO_NRDY,
                           csprintf("LocalMemFIFO"));
    dispNrdyStalls.subname(SCH_FLAT_MEM_ISSUE_NRDY,
                           csprintf("FlatMemIssue"));
    dispNrdyStalls.subname(SCH_FLAT_MEM_BUS_BUSY_NRDY,
                           csprintf("FlatMemBusBusy"));
    dispNrdyStalls.subname(SCH_FLAT_MEM_COALESCER_NRDY,
                           csprintf("FlatMemCoalescer"));
    dispNrdyStalls.subname(SCH_FLAT_MEM_FIFO_NRDY,
                           csprintf("FlatMemFIFO"));
    dispNrdyStalls.subname(SCH_RDY, csprintf("Ready"));

    // RF Access Stall Cycles
    rfAccessStalls
        .init(SCH_RF_ACCESS_NRDY_CONDITIONS)
        .name(name() + ".rf_access_stalls")
        .desc("number of stalls due to RF access denied")
    ;
    rfAccessStalls.subname(SCH_VRF_RD_ACCESS_NRDY, csprintf("VrfRd"));
    rfAccessStalls.subname(SCH_VRF_WR_ACCESS_NRDY, csprintf("VrfWr"));
    rfAccessStalls.subname(SCH_SRF_RD_ACCESS_NRDY, csprintf("SrfRd"));
    rfAccessStalls.subname(SCH_SRF_WR_ACCESS_NRDY, csprintf("SrfWr"));
    rfAccessStalls.subname(SCH_RF_ACCESS_NRDY, csprintf("Any"));

    // Stall cycles due to wave losing LDS bus arbitration
    ldsBusArbStalls
        .name(name() + ".lds_bus_arb_stalls")
        .desc("number of stalls due to VRF->LDS bus conflicts")
    ;
}