gpu-compute: Use refs to CU in pipe stages/mem pipes
[gem5.git] / src / gpu-compute / schedule_stage.cc
/*
 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "gpu-compute/schedule_stage.hh"

#include <unordered_set>

#include "debug/GPUSched.hh"
#include "debug/GPUVRF.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"

ScheduleStage::ScheduleStage(const ComputeUnitParams *p, ComputeUnit &cu)
    : computeUnit(cu), _name(cu.name() + ".ScheduleStage"),
      vectorAluRdy(false), scalarAluRdy(false), scalarMemBusRdy(false),
      scalarMemIssueRdy(false), glbMemBusRdy(false), glbMemIssueRdy(false),
      locMemBusRdy(false), locMemIssueRdy(false)
{
    for (int j = 0; j < cu.numExeUnits(); ++j) {
        scheduler.emplace_back(p);
    }
    wavesInSch.clear();
    schList.resize(cu.numExeUnits());
    for (auto &dq : schList) {
        dq.clear();
    }
}
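
// Illustrative note: schList holds, per execution unit, a deque of
// (wavefront, status) pairs. A hypothetical snapshot with one GM pipe
// and one SIMD unit might look like:
//     schList[GM pipe]: (WV[12], RFBUSY), (WV[17], RFREADY)
//     schList[SIMD0]:   (WV[3],  RFREADY)
// Waves are kept oldest-first (smallest wfDynId at the front).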

ScheduleStage::~ScheduleStage()
{
    scheduler.clear();
    wavesInSch.clear();
    schList.clear();
}

void
ScheduleStage::init()
{
    fatal_if(scheduler.size() != computeUnit.readyList.size(),
             "Scheduler should have same number of entries as CU's readyList");
    for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
        scheduler[j].bindList(&computeUnit.readyList[j]);
    }

    dispatchList = &computeUnit.dispatchList;

    assert(computeUnit.numVectorGlobalMemUnits == 1);
    assert(computeUnit.numVectorSharedMemUnits == 1);
}

void
ScheduleStage::exec()
{
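    // Per-cycle flow implemented below (summary):
    //   1) refresh the readyLists, filtering out waves already in SCH
    //   2) per execution resource (memory units first), pick one ready
    //      wave and try to add it to schList, scheduling its RF reads
    //   3) promote waves whose RF reads completed (RFBUSY->RFREADY)
    //   4) fill dispatchList with the oldest dispatch-ready wave per
    //      resource
    //   5) arbitrate FLAT ops between the GM and LM pipes, schedule RF
    //      writes, and reserve execution resources
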
    // Update readyList
    for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
        // delete all ready wavefronts whose instruction buffers are now
        // empty because the last instruction was executed
        computeUnit.updateReadyList(j);
        /**
         * Remove any wave that already has an instruction present in SCH
         * waiting for RF reads to complete. This prevents out of order
         * execution within a wave.
         */
        for (auto wIt = computeUnit.readyList.at(j).begin();
             wIt != computeUnit.readyList.at(j).end();) {
            if (wavesInSch.find((*wIt)->wfDynId) != wavesInSch.end()) {
                *wIt = nullptr;
                wIt = computeUnit.readyList.at(j).erase(wIt);
            } else {
                wIt++;
            }
        }
    }

    // Attempt to add another wave for each EXE type to the schList queues.
    // VMEM resources are iterated first, effectively giving priority to
    // VMEM over VALU when scheduling operand reads at the RFs.
    // Scalar memory is iterated after VMEM.

    // Iterate VMEM and SMEM
    int firstMemUnit = computeUnit.firstMemUnit();
    int lastMemUnit = computeUnit.lastMemUnit();
    for (int j = firstMemUnit; j <= lastMemUnit; j++) {
        int readyListSize = computeUnit.readyList[j].size();
        // If no wave is ready to be scheduled on the execution resource
        // then skip scheduling for this execution resource
        if (!readyListSize) {
            rdyListEmpty[j]++;
            continue;
        }
        rdyListNotEmpty[j]++;

        // Pick a wave and attempt to add it to schList
        Wavefront *w = scheduler[j].chooseWave();
        if (!addToSchList(j, w)) {
            // For waves not added to schList, increment count of cycles
            // this wave spends in SCH stage.
            w->schCycles++;
            addToSchListStalls[j]++;
        }
    }

    // Iterate everything else
    for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
        // skip the VMEM resources
        if (j >= firstMemUnit && j <= lastMemUnit) {
            continue;
        }
        int readyListSize = computeUnit.readyList[j].size();
        // If no wave is ready to be scheduled on the execution resource
        // then skip scheduling for this execution resource
        if (!readyListSize) {
            rdyListEmpty[j]++;
            continue;
        }
        rdyListNotEmpty[j]++;

        // Pick a wave and attempt to add it to schList
        Wavefront *w = scheduler[j].chooseWave();
        if (!addToSchList(j, w)) {
            // For waves not added to schList, increment count of cycles
            // this wave spends in SCH stage.
            w->schCycles++;
            addToSchListStalls[j]++;
        }
    }

    // At this point, the schList queue for each EXE type may contain
    // multiple waves, in order of age (oldest to youngest).
    // A wave may be in RFBUSY, indicating it is waiting for its registers
    // to be read, or in RFREADY, indicating it is a candidate for the
    // dispatchList and execution.

    // Iterate schList queues and check if any of the waves have finished
    // reading their operands, moving those waves to RFREADY status
    checkRfOperandReadComplete();

    // Fill the dispatch list with the oldest wave of each EXE type that
    // is ready to execute.
    // A wave is picked if its status in schList is RFREADY and it passes
    // resource-ready checks similar to those currently in the SCB.
    fillDispatchList();

    // Resource arbitration on waves in dispatchList.
    // Losing waves are re-inserted into the schList at a location
    // determined by wave age.

    // Arbitrate access to the VRF->LDS bus
    arbitrateVrfToLdsBus();

    // Schedule write operations to the register files
    scheduleRfDestOperands();

    // Lastly, reserve resources for waves that are ready to execute.
    reserveResources();
}

void
ScheduleStage::doDispatchListTransition(int unitId, DISPATCH_STATUS s,
                                        Wavefront *w)
{
    dispatchList->at(unitId).first = w;
    dispatchList->at(unitId).second = s;
}

bool
ScheduleStage::schedRfWrites(int exeType, Wavefront *w)
{
    GPUDynInstPtr ii = w->instructionBuffer.front();
    assert(ii);
    bool accessVrfWr = true;
    if (!ii->isScalar()) {
        accessVrfWr =
            computeUnit.vrf[w->simdId]->canScheduleWriteOperands(w, ii);
    }
    bool accessSrfWr =
        computeUnit.srf[w->simdId]->canScheduleWriteOperands(w, ii);
    bool accessRf = accessVrfWr && accessSrfWr;
    if (accessRf) {
        if (!ii->isScalar()) {
            computeUnit.vrf[w->simdId]->scheduleWriteOperands(w, ii);
        }
        computeUnit.srf[w->simdId]->scheduleWriteOperands(w, ii);
        return true;
    } else {
        rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
        if (!accessSrfWr) {
            rfAccessStalls[SCH_SRF_WR_ACCESS_NRDY]++;
        }
        if (!accessVrfWr) {
            rfAccessStalls[SCH_VRF_WR_ACCESS_NRDY]++;
        }

        // Increment stall counts for WF
        w->schStalls++;
        w->schRfAccessStalls++;
    }
    return false;
}
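
// Note: operand reads are scheduled when a wave enters the schList (see
// addToSchList below), whereas destination writes are only scheduled in
// schedRfWrites above, once the wave has won a dispatchList slot. A wave
// can therefore stall twice on RF ports: once for reads, once for writes.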

void
ScheduleStage::scheduleRfDestOperands()
{
    for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
        if (!dispatchList->at(j).first) {
            continue;
        }
        // get the wave on dispatch list and attempt to allocate write
        // resources in the RFs
        Wavefront *w = dispatchList->at(j).first;
        if (!schedRfWrites(j, w)) {
            reinsertToSchList(j, w);
            doDispatchListTransition(j, EMPTY);
            // if this is a flat inst, also transition the LM pipe to empty.
            // Note: since FLAT/LM arbitration occurs before scheduling
            // destination operands to the RFs, it is possible that an LM
            // instruction lost arbitration, but would have been able to
            // pass the RF destination operand check here, and execute
            // instead of the FLAT.
            if (w->instructionBuffer.front()->isFlat()) {
                assert(dispatchList->at(w->localMem).second == SKIP);
                doDispatchListTransition(w->localMem, EMPTY);
            }
        }
    }
}

bool
ScheduleStage::addToSchList(int exeType, Wavefront *w)
{
    // Attempt to add the wave to the schList if the RFs can support the
    // wave's next instruction
    GPUDynInstPtr ii = w->instructionBuffer.front();
    assert(ii);
    bool accessVrf = true;
    if (!ii->isScalar()) {
        accessVrf =
            computeUnit.vrf[w->simdId]->canScheduleReadOperands(w, ii);
    }
    bool accessSrf =
        computeUnit.srf[w->simdId]->canScheduleReadOperands(w, ii);
    // If the RFs can support the instruction, add it to schList in RFBUSY
    // state, place the wave in wavesInSch and the pipeMap, and schedule
    // its operand reads at the RFs
    bool accessRf = accessVrf && accessSrf;
    if (accessRf) {
        DPRINTF(GPUSched, "schList[%d]: Adding: SIMD[%d] WV[%d]: %d: %s\n",
                exeType, w->simdId, w->wfDynId,
                ii->seqNum(), ii->disassemble());

        computeUnit.insertInPipeMap(w);
        wavesInSch.emplace(w->wfDynId);
        schList.at(exeType).push_back(std::make_pair(w, RFBUSY));
        if (w->isOldestInstWaitcnt()) {
            w->setStatus(Wavefront::S_WAITCNT);
        }
        if (!ii->isScalar()) {
            computeUnit.vrf[w->simdId]->scheduleReadOperands(w, ii);
        }
        computeUnit.srf[w->simdId]->scheduleReadOperands(w, ii);

        DPRINTF(GPUSched, "schList[%d]: Added: SIMD[%d] WV[%d]: %d: %s\n",
                exeType, w->simdId, w->wfDynId,
                ii->seqNum(), ii->disassemble());
        return true;
    } else {
        // Number of stall cycles due to RF access denied
        rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
        // Count number of denials due to each reason
        // Multiple items may contribute to the denied request
        if (!accessVrf) {
            rfAccessStalls[SCH_VRF_RD_ACCESS_NRDY]++;
        }
        if (!accessSrf) {
            rfAccessStalls[SCH_SRF_RD_ACCESS_NRDY]++;
        }

        // Increment stall counts for WF
        w->schStalls++;
        w->schRfAccessStalls++;
        DPRINTF(GPUSched, "schList[%d]: Could not add: "
                "SIMD[%d] WV[%d]: %d: %s\n",
                exeType, w->simdId, w->wfDynId,
                ii->seqNum(), ii->disassemble());
    }
    return false;
}

void
ScheduleStage::reinsertToSchList(int exeType, Wavefront *w)
{
    // Insert wave w into schList for the specified exeType.
    // Waves are inserted in age order, with the oldest wave at the
    // front of the schList
    auto schIter = schList.at(exeType).begin();
    while (schIter != schList.at(exeType).end()
           && schIter->first->wfDynId < w->wfDynId) {
        schIter++;
    }
    schList.at(exeType).insert(schIter, std::make_pair(w, RFREADY));
}
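
// Example (illustrative): if schList[exeType] already holds waves with
// wfDynId 3 and 9, re-inserting a losing wave with wfDynId 7 yields the
// order 3, 7, 9, so fillDispatchList() always considers the oldest
// RFREADY wave first.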

void
ScheduleStage::checkMemResources()
{
    // Check for resource availability in the next cycle
    scalarMemBusRdy = false;
    scalarMemIssueRdy = false;
    // check if there is an SRF->Global Memory bus available
    if (computeUnit.srfToScalarMemPipeBus.rdy(Cycles(1))) {
        scalarMemBusRdy = true;
    }
    // check if we can issue a scalar memory instruction
    if (computeUnit.scalarMemUnit.rdy(Cycles(1))) {
        scalarMemIssueRdy = true;
    }

    glbMemBusRdy = false;
    glbMemIssueRdy = false;
    // check if there is a VRF->Global Memory bus available
    if (computeUnit.vrfToGlobalMemPipeBus.rdy(Cycles(1))) {
        glbMemBusRdy = true;
    }
    // check if we can issue a Global memory instruction
    if (computeUnit.vectorGlobalMemUnit.rdy(Cycles(1))) {
        glbMemIssueRdy = true;
    }

    locMemBusRdy = false;
    locMemIssueRdy = false;
    // check if there is a VRF->LDS bus available
    if (computeUnit.vrfToLocalMemPipeBus.rdy(Cycles(1))) {
        locMemBusRdy = true;
    }
    // check if we can issue an LDS instruction
    if (computeUnit.vectorSharedMemUnit.rdy(Cycles(1))) {
        locMemIssueRdy = true;
    }
}
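
// Note: the rdy(Cycles(1)) checks above ask whether each resource will be
// free one cycle from now, since a wave placed on the dispatchList this
// cycle does not begin executing until the next cycle.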

bool
ScheduleStage::dispatchReady(Wavefront *w)
{
    vectorAluRdy = false;
    scalarAluRdy = false;
    // check for available vector/scalar ALUs in the next cycle
    if (computeUnit.vectorALUs[w->simdId].rdy(Cycles(1))) {
        vectorAluRdy = true;
    }
    if (computeUnit.scalarALUs[w->scalarAlu].rdy(Cycles(1))) {
        scalarAluRdy = true;
    }
    GPUDynInstPtr ii = w->instructionBuffer.front();

    if (ii->isNop()) {
        // S_NOP requires SALU. V_NOP requires VALU.
        // TODO: Scalar NOP does not require SALU in hardware,
        // and is executed out of IB directly.
        if (ii->isScalar() && !scalarAluRdy) {
            dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
            return false;
        } else if (!ii->isScalar() && !vectorAluRdy) {
            dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
            return false;
        }
    } else if (ii->isEndOfKernel()) {
        // EndPgm instruction
        if (ii->isScalar() && !scalarAluRdy) {
            dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
            return false;
        }
    } else if (ii->isBarrier() || ii->isBranch() || ii->isALU()) {
        // Barrier, Branch, or ALU instruction
        if (ii->isScalar() && !scalarAluRdy) {
            dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
            return false;
        } else if (!ii->isScalar() && !vectorAluRdy) {
            dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
            return false;
        }
    } else if (!ii->isScalar() && ii->isGlobalMem()) {
        // Vector Global Memory instruction
        bool rdy = true;
        if (!glbMemIssueRdy) {
            rdy = false;
            dispNrdyStalls[SCH_VECTOR_MEM_ISSUE_NRDY]++;
        }
        if (!glbMemBusRdy) {
            rdy = false;
            dispNrdyStalls[SCH_VECTOR_MEM_BUS_BUSY_NRDY]++;
        }
        if (!computeUnit.globalMemoryPipe.coalescerReady(ii)) {
            rdy = false;
            dispNrdyStalls[SCH_VECTOR_MEM_COALESCER_NRDY]++;
        }
        if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(ii)) {
            rdy = false;
            dispNrdyStalls[SCH_VECTOR_MEM_REQS_NRDY]++;
        }
        if (!rdy) {
            return false;
        }
    } else if (ii->isScalar() && ii->isGlobalMem()) {
        // Scalar Global Memory instruction
        bool rdy = true;
        if (!scalarMemIssueRdy) {
            rdy = false;
            dispNrdyStalls[SCH_SCALAR_MEM_ISSUE_NRDY]++;
        }
        if (!scalarMemBusRdy) {
            rdy = false;
            dispNrdyStalls[SCH_SCALAR_MEM_BUS_BUSY_NRDY]++;
        }
        if (!computeUnit.scalarMemoryPipe.
                isGMReqFIFOWrRdy(w->scalarRdGmReqsInPipe +
                                 w->scalarWrGmReqsInPipe)) {
            rdy = false;
            dispNrdyStalls[SCH_SCALAR_MEM_FIFO_NRDY]++;
        }
        if (!rdy) {
            return false;
        }
    } else if (!ii->isScalar() && ii->isLocalMem()) {
        // Vector Local Memory instruction
        bool rdy = true;
        if (!locMemIssueRdy) {
            rdy = false;
            dispNrdyStalls[SCH_LOCAL_MEM_ISSUE_NRDY]++;
        }
        if (!locMemBusRdy) {
            rdy = false;
            dispNrdyStalls[SCH_LOCAL_MEM_BUS_BUSY_NRDY]++;
        }
        if (!computeUnit.localMemoryPipe.
                isLMReqFIFOWrRdy(w->rdLmReqsInPipe + w->wrLmReqsInPipe)) {
            rdy = false;
            dispNrdyStalls[SCH_LOCAL_MEM_FIFO_NRDY]++;
        }
        if (!rdy) {
            return false;
        }
    } else if (!ii->isScalar() && ii->isFlat()) {
        // Vector Flat memory instruction
        bool rdy = true;
        if (!glbMemIssueRdy || !locMemIssueRdy) {
            rdy = false;
            dispNrdyStalls[SCH_FLAT_MEM_ISSUE_NRDY]++;
        }
        if (!glbMemBusRdy || !locMemBusRdy) {
            rdy = false;
            dispNrdyStalls[SCH_FLAT_MEM_BUS_BUSY_NRDY]++;
        }
        if (!computeUnit.globalMemoryPipe.coalescerReady(ii)) {
            rdy = false;
            dispNrdyStalls[SCH_FLAT_MEM_COALESCER_NRDY]++;
        }
        if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(ii)) {
            rdy = false;
            dispNrdyStalls[SCH_FLAT_MEM_REQS_NRDY]++;
        }
        if (!computeUnit.localMemoryPipe.
                isLMReqFIFOWrRdy(w->rdLmReqsInPipe + w->wrLmReqsInPipe)) {
            rdy = false;
            dispNrdyStalls[SCH_FLAT_MEM_FIFO_NRDY]++;
        }
        if (!rdy) {
            return false;
        }
    } else {
        panic("%s: unknown instr checked for readiness", ii->disassemble());
        return false;
    }
    dispNrdyStalls[SCH_RDY]++;
    return true;
}
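
// Readiness requirements checked above, summarized (derived from the code
// paths, not an exhaustive hardware model):
//   NOP/EndPgm/Barrier/Branch/ALU : scalar or vector ALU free next cycle
//   vector global memory          : GM issue unit, VRF->GM bus, coalescer,
//                                   outstanding-request slots
//   scalar global memory          : scalar mem unit, SRF->GM bus, GM FIFO
//   vector local memory (LDS)     : LM issue unit, VRF->LDS bus, LM FIFO
//   FLAT                          : both the GM and LM resource sets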

void
ScheduleStage::fillDispatchList()
{
    // update execution resource status
    checkMemResources();
    // iterate execution resources
    for (int j = 0; j < computeUnit.numExeUnits(); j++) {
        assert(dispatchList->at(j).second == EMPTY);

        // iterate waves in schList to pick one for dispatch
        auto schIter = schList.at(j).begin();
        bool dispatched = false;
        while (schIter != schList.at(j).end()) {
            // only attempt to dispatch if status is RFREADY
            if (schIter->second == RFREADY) {
                // Check if this wave is ready for dispatch
                bool dispRdy = dispatchReady(schIter->first);
                if (!dispatched && dispRdy) {
                    // No other wave has been dispatched for this exe
                    // resource, and this wave is ready. Place this wave
                    // on dispatchList and make it ready for execution
                    // next cycle.

                    // Acquire a coalescer token if it is a global mem
                    // operation.
                    GPUDynInstPtr mp = schIter->first->
                        instructionBuffer.front();
                    if (!mp->isMemSync() && !mp->isScalar() &&
                        (mp->isGlobalMem() || mp->isFlat())) {
                        computeUnit.globalMemoryPipe.acqCoalescerToken(mp);
                    }

                    doDispatchListTransition(j, EXREADY, schIter->first);
                    DPRINTF(GPUSched, "dispatchList[%d]: fillDispatchList: "
                            "EMPTY->EXREADY\n", j);
                    schIter->first = nullptr;
                    schIter = schList.at(j).erase(schIter);
                    dispatched = true;
                } else {
                    // Either another wave has been dispatched, or this wave
                    // was not ready, so it is stalled this cycle
                    schIter->first->schStalls++;
                    if (!dispRdy) {
                        // not ready for dispatch, increment stall stat
                        schIter->first->schResourceStalls++;
                    }
                    // Examine next wave for this resource
                    schIter++;
                }
            } else {
                // Wave not in RFREADY, try next wave
                schIter++;
            }
        }

        // Increment stall count if no wave sent to dispatchList for
        // current execution resource
        if (!dispatched) {
            schListToDispListStalls[j]++;
        } else {
            schListToDispList[j]++;
        }
    }
}

void
ScheduleStage::arbitrateVrfToLdsBus()
{
    // Arbitrate the VRF->GM and VRF->LDS buses for Flat memory ops.
    // Note: a Flat instruction in GFx8 reserves both the VRF->Glb memory
    // bus and a VRF->LDS bus. In GFx9, this is not the case.

    // iterate the GM pipelines
    for (int i = 0; i < computeUnit.numVectorGlobalMemUnits; i++) {
        // get the GM pipe index in the dispatchList
        int gm_exe_unit = computeUnit.firstMemUnit() + i;
        // get the wave in the dispatchList
        Wavefront *w = dispatchList->at(gm_exe_unit).first;
        // If the WF is valid, ready to execute, and the instruction
        // is a flat access, arbitrate with the WF's assigned LM pipe
        if (w && dispatchList->at(gm_exe_unit).second == EXREADY &&
            w->instructionBuffer.front()->isFlat()) {
            // If the associated LM pipe also has a wave selected, block
            // that wave and let the Flat instruction issue. The WF in the
            // LM pipe is added back to the schList for consideration next
            // cycle.
            if (dispatchList->at(w->localMem).second == EXREADY) {
                reinsertToSchList(w->localMem,
                                  dispatchList->at(w->localMem).first);
                // Increment stall stats for LDS-VRF arbitration
                ldsBusArbStalls++;
                dispatchList->at(w->localMem).first->schLdsArbStalls++;
            }
            // With arbitration of the LM pipe complete, transition the
            // LM pipe to SKIP state in the dispatchList to inform the EX
            // stage that a Flat instruction is executing next cycle
            doDispatchListTransition(w->localMem, SKIP, w);
            DPRINTF(GPUSched, "dispatchList[%d]: arbVrfLds: "
                    "EXREADY->SKIP\n", w->localMem);
        }
    }
}
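
// Example (illustrative): if WV[5] holds a FLAT op in a GM pipe while
// WV[8] holds an LDS op in WV[5]'s LM pipe, WV[8] is pushed back onto the
// schList and the LM pipe is marked SKIP, telling the EX stage that the
// FLAT op owns both buses next cycle.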

void
ScheduleStage::checkRfOperandReadComplete()
{
    // Iterate the schList queues and check if operand reads
    // have completed in the RFs. If so, mark the wave as ready for
    // selection for the dispatchList
    for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
        for (auto &p : schList.at(j)) {
            Wavefront *w = p.first;
            assert(w);

            // Increment the number of cycles the wave spends in the
            // SCH stage, since this loop visits every wave in SCH.
            w->schCycles++;

            GPUDynInstPtr ii = w->instructionBuffer.front();
            bool vrfRdy = true;
            if (!ii->isScalar()) {
                vrfRdy =
                    computeUnit.vrf[w->simdId]->operandReadComplete(w, ii);
            }
            bool srfRdy =
                computeUnit.srf[w->simdId]->operandReadComplete(w, ii);
            bool operandsReady = vrfRdy && srfRdy;
            if (operandsReady) {
                DPRINTF(GPUSched,
                        "schList[%d]: WV[%d] operands ready for: %d: %s\n",
                        j, w->wfDynId, ii->seqNum(), ii->disassemble());
                DPRINTF(GPUSched, "schList[%d]: WV[%d] RFBUSY->RFREADY\n",
                        j, w->wfDynId);
                p.second = RFREADY;
            } else {
                DPRINTF(GPUSched,
                        "schList[%d]: WV[%d] operands not ready for: %d: %s\n",
                        j, w->wfDynId, ii->seqNum(), ii->disassemble());

                // operands not ready yet; increment SCH stage stats
                // aggregated over all wavefronts on the CU
                p.second = RFBUSY;

                // Increment stall stats
                w->schStalls++;
                w->schOpdNrdyStalls++;

                opdNrdyStalls[SCH_RF_OPD_NRDY]++;
                if (!vrfRdy) {
                    opdNrdyStalls[SCH_VRF_OPD_NRDY]++;
                }
                if (!srfRdy) {
                    opdNrdyStalls[SCH_SRF_OPD_NRDY]++;
                }
            }
        }
    }
}

void
ScheduleStage::reserveResources()
{
    std::vector<bool> exeUnitReservations;
    exeUnitReservations.resize(computeUnit.numExeUnits(), false);

    for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
        Wavefront *dispatchedWave = dispatchList->at(j).first;
        if (dispatchedWave) {
            DISPATCH_STATUS s = dispatchList->at(j).second;
            if (s == EMPTY) {
                continue;
            } else if (s == EXREADY) {
                // Wave is ready for execution
                std::vector<int> execUnitIds =
                    dispatchedWave->reserveResources();
                GPUDynInstPtr ii = dispatchedWave->instructionBuffer.front();

                if (!ii->isScalar()) {
                    computeUnit.vrf[dispatchedWave->simdId]->
                        dispatchInstruction(ii);
                }
                computeUnit.srf[dispatchedWave->simdId]->
                    dispatchInstruction(ii);

                std::stringstream ss;
                for (auto id : execUnitIds) {
                    ss << id << " ";
                }
                DPRINTF(GPUSched, "dispatchList[%d]: SIMD[%d] WV[%d]: %d: %s"
                        " Reserving ExeRes[ %s]\n",
                        j, dispatchedWave->simdId, dispatchedWave->wfDynId,
                        ii->seqNum(), ii->disassemble(), ss.str());
                // mark the resources as reserved for this cycle
                for (auto execUnitId : execUnitIds) {
                    panic_if(exeUnitReservations.at(execUnitId),
                             "Execution unit %d is reserved!!!\n"
                             "SIMD[%d] WV[%d]: %d: %s",
                             execUnitId, dispatchedWave->simdId,
                             dispatchedWave->wfDynId,
                             ii->seqNum(), ii->disassemble());
                    exeUnitReservations.at(execUnitId) = true;
                }

                // If wavefront::reserveResources reserved multiple resources,
                // then we're executing a flat memory instruction. This means
                // that we've reserved a global and local memory unit. Thus,
                // we need to mark the latter execution unit as not available.
                if (execUnitIds.size() > 1) {
                    int lm_exec_unit M5_VAR_USED = dispatchedWave->localMem;
                    assert(dispatchList->at(lm_exec_unit).second == SKIP);
                }
            } else if (s == SKIP) {
                // Shared Memory pipe reserved for FLAT instruction.
                // Verify the GM pipe for this wave is ready to execute
                // and the wave in the GM pipe is the same as the wave
                // in the LM pipe
                int gm_exec_unit M5_VAR_USED = dispatchedWave->globalMem;
                assert(dispatchList->at(gm_exec_unit).first->wfDynId ==
                       dispatchedWave->wfDynId);
                assert(dispatchList->at(gm_exec_unit).second == EXREADY);
            }
        }
    }
}

void
ScheduleStage::deleteFromSch(Wavefront *w)
{
    wavesInSch.erase(w->wfDynId);
}
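
// Note (assumption; callers are outside this file): deleteFromSch is
// expected to run once a wave's instruction leaves the SCH stage, allowing
// the wave to reappear on a readyList. Until then, the wavesInSch check in
// exec() filters the wave out to keep execution in order within a wave.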

void
ScheduleStage::regStats()
{
    rdyListNotEmpty
        .init(computeUnit.numExeUnits())
        .name(name() + ".rdy_list_not_empty")
        .desc("number of cycles one or more waves are on the ready list, "
              "per execution resource")
    ;

    rdyListEmpty
        .init(computeUnit.numExeUnits())
        .name(name() + ".rdy_list_empty")
        .desc("number of cycles no wave is on the ready list, "
              "per execution resource")
    ;

    addToSchListStalls
        .init(computeUnit.numExeUnits())
        .name(name() + ".sch_list_add_stalls")
        .desc("number of cycles a wave is not added to schList per "
              "execution resource when ready list is not empty")
    ;

    schListToDispList
        .init(computeUnit.numExeUnits())
        .name(name() + ".sch_list_to_disp_list")
        .desc("number of cycles a wave is added to dispatchList per "
              "execution resource")
    ;

    schListToDispListStalls
        .init(computeUnit.numExeUnits())
        .name(name() + ".sch_list_to_disp_list_stalls")
        .desc("number of cycles no wave is added to dispatchList per "
              "execution resource")
    ;

    // Operand Readiness Stall Cycles
    opdNrdyStalls
        .init(SCH_RF_OPD_NRDY_CONDITIONS)
        .name(name() + ".opd_nrdy_stalls")
        .desc("number of stalls in SCH due to operands not ready")
    ;
    opdNrdyStalls.subname(SCH_VRF_OPD_NRDY, csprintf("VRF"));
    opdNrdyStalls.subname(SCH_SRF_OPD_NRDY, csprintf("SRF"));
    opdNrdyStalls.subname(SCH_RF_OPD_NRDY, csprintf("RF"));

    // dispatchReady Stall Cycles
    dispNrdyStalls
        .init(SCH_NRDY_CONDITIONS)
        .name(name() + ".disp_nrdy_stalls")
        .desc("number of stalls in SCH due to resource not ready")
    ;
    dispNrdyStalls.subname(SCH_SCALAR_ALU_NRDY, csprintf("ScalarAlu"));
    dispNrdyStalls.subname(SCH_VECTOR_ALU_NRDY, csprintf("VectorAlu"));
    dispNrdyStalls.subname(SCH_VECTOR_MEM_ISSUE_NRDY,
                           csprintf("VectorMemIssue"));
    dispNrdyStalls.subname(SCH_VECTOR_MEM_BUS_BUSY_NRDY,
                           csprintf("VectorMemBusBusy"));
    dispNrdyStalls.subname(SCH_VECTOR_MEM_COALESCER_NRDY,
                           csprintf("VectorMemCoalescer"));
    dispNrdyStalls.subname(SCH_CEDE_SIMD_NRDY, csprintf("CedeSimd"));
    dispNrdyStalls.subname(SCH_SCALAR_MEM_ISSUE_NRDY,
                           csprintf("ScalarMemIssue"));
    dispNrdyStalls.subname(SCH_SCALAR_MEM_BUS_BUSY_NRDY,
                           csprintf("ScalarMemBusBusy"));
    dispNrdyStalls.subname(SCH_SCALAR_MEM_FIFO_NRDY,
                           csprintf("ScalarMemFIFO"));
    dispNrdyStalls.subname(SCH_LOCAL_MEM_ISSUE_NRDY,
                           csprintf("LocalMemIssue"));
    dispNrdyStalls.subname(SCH_LOCAL_MEM_BUS_BUSY_NRDY,
                           csprintf("LocalMemBusBusy"));
    dispNrdyStalls.subname(SCH_LOCAL_MEM_FIFO_NRDY,
                           csprintf("LocalMemFIFO"));
    dispNrdyStalls.subname(SCH_FLAT_MEM_ISSUE_NRDY,
                           csprintf("FlatMemIssue"));
    dispNrdyStalls.subname(SCH_FLAT_MEM_BUS_BUSY_NRDY,
                           csprintf("FlatMemBusBusy"));
    dispNrdyStalls.subname(SCH_FLAT_MEM_COALESCER_NRDY,
                           csprintf("FlatMemCoalescer"));
    dispNrdyStalls.subname(SCH_FLAT_MEM_FIFO_NRDY,
                           csprintf("FlatMemFIFO"));
    dispNrdyStalls.subname(SCH_RDY, csprintf("Ready"));

    // RF Access Stall Cycles
    rfAccessStalls
        .init(SCH_RF_ACCESS_NRDY_CONDITIONS)
        .name(name() + ".rf_access_stalls")
        .desc("number of stalls due to RF access denied")
    ;
    rfAccessStalls.subname(SCH_VRF_RD_ACCESS_NRDY, csprintf("VrfRd"));
    rfAccessStalls.subname(SCH_VRF_WR_ACCESS_NRDY, csprintf("VrfWr"));
    rfAccessStalls.subname(SCH_SRF_RD_ACCESS_NRDY, csprintf("SrfRd"));
    rfAccessStalls.subname(SCH_SRF_WR_ACCESS_NRDY, csprintf("SrfWr"));
    rfAccessStalls.subname(SCH_RF_ACCESS_NRDY, csprintf("Any"));

    // Stall cycles due to wave losing LDS bus arbitration
    ldsBusArbStalls
        .name(name() + ".lds_bus_arb_stalls")
        .desc("number of stalls due to VRF->LDS bus conflicts")
    ;
}