ext,tests: Copy test's output files from /tmp to testing-results
diff --git a/src/gpu-compute/schedule_stage.cc b/src/gpu-compute/schedule_stage.cc
index 068136026dd0913bfac59c07fa2a65662e87719b..005e6f61e2fb06be7f4d8f644f3100be7662ef32 100644
--- a/src/gpu-compute/schedule_stage.cc
+++ b/src/gpu-compute/schedule_stage.cc
@@ -14,9 +14,9 @@
  * this list of conditions and the following disclaimer in the documentation
  * and/or other materials provided with the distribution.
  *
- * 3. Neither the name of the copyright holder nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
- *
- * Author: Sooraj Puthoor
  */
 
 #include "gpu-compute/schedule_stage.hh"
 
+#include <unordered_set>
+
+#include "debug/GPUSched.hh"
+#include "debug/GPUVRF.hh"
 #include "gpu-compute/compute_unit.hh"
 #include "gpu-compute/gpu_static_inst.hh"
+#include "gpu-compute/scalar_register_file.hh"
 #include "gpu-compute/vector_register_file.hh"
 #include "gpu-compute/wavefront.hh"
 
-ScheduleStage::ScheduleStage(const ComputeUnitParams *p)
-    : numSIMDs(p->num_SIMDs),
-      numMemUnits(p->num_global_mem_pipes + p->num_shared_mem_pipes)
+ScheduleStage::ScheduleStage(const ComputeUnitParams *p, ComputeUnit &cu,
+                             ScoreboardCheckToSchedule &from_scoreboard_check,
+                             ScheduleToExecute &to_execute)
+    : computeUnit(cu), fromScoreboardCheck(from_scoreboard_check),
+      toExecute(to_execute),
+      _name(cu.name() + ".ScheduleStage"),
+      vectorAluRdy(false), scalarAluRdy(false), scalarMemBusRdy(false),
+      scalarMemIssueRdy(false), glbMemBusRdy(false), glbMemIssueRdy(false),
+      locMemBusRdy(false), locMemIssueRdy(false)
 {
-    for (int j = 0; j < numSIMDs + numMemUnits; ++j) {
-        Scheduler newScheduler(p);
-        scheduler.push_back(newScheduler);
+    for (int j = 0; j < cu.numExeUnits(); ++j) {
+        scheduler.emplace_back(p);
+    }
+    wavesInSch.clear();
+    schList.resize(cu.numExeUnits());
+    for (auto &dq : schList) {
+        dq.clear();
     }
 }
 
 ScheduleStage::~ScheduleStage()
 {
     scheduler.clear();
-    waveStatusList.clear();
+    wavesInSch.clear();
+    schList.clear();
+}
+
+void
+ScheduleStage::init()
+{
+    fatal_if(scheduler.size() != fromScoreboardCheck.numReadyLists(),
+             "Scheduler should have same number of entries as CU's readyList");
+    for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
+        scheduler[j].bindList(&fromScoreboardCheck.readyWFs(j));
+    }
+
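+    // the SCH stage currently assumes a single vector global memory
+    // pipe and a single vector shared memory pipe per CU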
+    assert(computeUnit.numVectorGlobalMemUnits == 1);
+    assert(computeUnit.numVectorSharedMemUnits == 1);
+}
+
+void
+ScheduleStage::exec()
+{
+    toExecute.reset();
+
+    // Update readyList
+    for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
+        /**
+         * Remove any wave that already has an instruction present in SCH
+         * waiting for RF reads to complete. This prevents out of order
+         * execution within a wave.
+         */
+        fromScoreboardCheck.updateReadyList(j);
+        for (auto wIt = fromScoreboardCheck.readyWFs(j).begin();
+             wIt != fromScoreboardCheck.readyWFs(j).end();) {
+            if (wavesInSch.find((*wIt)->wfDynId) != wavesInSch.end()) {
+                *wIt = nullptr;
+                wIt = fromScoreboardCheck.readyWFs(j).erase(wIt);
+            } else {
+                wIt++;
+            }
+        }
+    }
+
+    // Attempt to add another wave for each EXE type to the schList queues.
+    // VMEM resources are iterated first, effectively giving priority
+    // to VMEM over VALU for scheduling operand reads from the RFs.
+    // Scalar memory is iterated after VMEM.
+
+    // Iterate VMEM and SMEM
+    int firstMemUnit = computeUnit.firstMemUnit();
+    int lastMemUnit = computeUnit.lastMemUnit();
+    for (int j = firstMemUnit; j <= lastMemUnit; j++) {
+        int readyListSize = fromScoreboardCheck.readyWFs(j).size();
+        // If no wave is ready to be scheduled on the execution resource
+        // then skip scheduling for this execution resource
+        if (!readyListSize) {
+            rdyListEmpty[j]++;
+            continue;
+        }
+        rdyListNotEmpty[j]++;
+
+        // Pick a wave and attempt to add it to schList
+        Wavefront *wf = scheduler[j].chooseWave();
+        GPUDynInstPtr &gpu_dyn_inst = wf->instructionBuffer.front();
+        assert(gpu_dyn_inst);
+        if (!addToSchList(j, gpu_dyn_inst)) {
+            // For waves not added to schList, increment count of cycles
+            // this wave spends in SCH stage.
+            wf->schCycles++;
+            addToSchListStalls[j]++;
+        } else {
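+            // Bump the wave's issued-instruction counters: scalar and
+            // LDS (group segment) accesses count toward LGKM, vector
+            // global accesses toward VMEM, and FLAT toward both.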
+            if (gpu_dyn_inst->isScalar() || gpu_dyn_inst->isGroupSeg()) {
+                wf->incLGKMInstsIssued();
+            } else {
+                wf->incVMemInstsIssued();
+                if (gpu_dyn_inst->isFlat()) {
+                    wf->incLGKMInstsIssued();
+                }
+            }
+        }
+    }
+
+    // Iterate everything else
+    for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
+        // skip the memory resources, which were handled above
+        if (j >= firstMemUnit && j <= lastMemUnit) {
+            continue;
+        }
+        int readyListSize = fromScoreboardCheck.readyWFs(j).size();
+        // If no wave is ready to be scheduled on the execution resource
+        // then skip scheduling for this execution resource
+        if (!readyListSize) {
+            rdyListEmpty[j]++;
+            continue;
+        }
+        rdyListNotEmpty[j]++;
+
+        // Pick a wave and attempt to add it to schList
+        Wavefront *wf = scheduler[j].chooseWave();
+        GPUDynInstPtr &gpu_dyn_inst = wf->instructionBuffer.front();
+        assert(gpu_dyn_inst);
+        if (!addToSchList(j, gpu_dyn_inst)) {
+            // For waves not added to schList, increment count of cycles
+            // this wave spends in SCH stage.
+            wf->schCycles++;
+            addToSchListStalls[j]++;
+        }
+    }
+
+    // At this point, the schList queue for each EXE type may contain
+    // multiple waves, in age order (oldest to youngest).
+    // A wave may be in RFBUSY, indicating it is waiting for its register
+    // operands to be read, or in RFREADY, indicating it is a candidate
+    // for the dispatchList and execution
+
+    // Iterate schList queues and check if any of the waves have finished
+    // reading their operands, moving those waves to RFREADY status
+    checkRfOperandReadComplete();
+
+    // Fill the dispatch list with the oldest wave of each EXE type that
+    // is ready to execute.
+    // A wave is picked if its status in schList is RFREADY and it passes
+    // resource-ready checks similar to those in the scoreboard check stage
+    fillDispatchList();
+
+    // Resource arbitration on waves in dispatchList.
+    // Losing waves are re-inserted into the schList at a location
+    // determined by wave age
+
+    // Arbitrate access to the VRF->LDS bus
+    arbitrateVrfToLdsBus();
+
+    // Schedule write operations to the register files
+    scheduleRfDestOperands();
+
+    // Lastly, reserve resources for waves that are ready to execute.
+    reserveResources();
+}
+
+void
+ScheduleStage::doDispatchListTransition(int unitId, DISPATCH_STATUS s,
+                                        const GPUDynInstPtr &gpu_dyn_inst)
+{
+    toExecute.dispatchTransition(gpu_dyn_inst, unitId, s);
+}
+
+void
+ScheduleStage::doDispatchListTransition(int unitId, DISPATCH_STATUS s)
+{
+    toExecute.dispatchTransition(unitId, s);
+}
+
+bool
+ScheduleStage::schedRfWrites(int exeType, const GPUDynInstPtr &gpu_dyn_inst)
+{
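+    // Attempt to reserve write slots in the VRF/SRF for this
+    // instruction's destination operands. On failure, record stall
+    // statistics and return false so the caller can re-insert the
+    // wave into schList.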
+    assert(gpu_dyn_inst);
+    Wavefront *wf = gpu_dyn_inst->wavefront();
+    bool accessVrfWr = true;
+    if (!gpu_dyn_inst->isScalar()) {
+        accessVrfWr = computeUnit.vrf[wf->simdId]
+            ->canScheduleWriteOperands(wf, gpu_dyn_inst);
+    }
+    bool accessSrfWr = computeUnit.srf[wf->simdId]
+        ->canScheduleWriteOperands(wf, gpu_dyn_inst);
+    bool accessRf = accessVrfWr && accessSrfWr;
+    if (accessRf) {
+        if (!gpu_dyn_inst->isScalar()) {
+            computeUnit.vrf[wf->simdId]->scheduleWriteOperands(wf,
+                                                               gpu_dyn_inst);
+        }
+        computeUnit.srf[wf->simdId]->scheduleWriteOperands(wf, gpu_dyn_inst);
+        return true;
+    } else {
+        rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
+        if (!accessSrfWr) {
+            rfAccessStalls[SCH_SRF_WR_ACCESS_NRDY]++;
+        }
+        if (!accessVrfWr) {
+            rfAccessStalls[SCH_VRF_WR_ACCESS_NRDY]++;
+        }
+
+        // Increment stall counts for WF
+        wf->schStalls++;
+        wf->schRfAccessStalls++;
+    }
+    return false;
+}
+
+void
+ScheduleStage::scheduleRfDestOperands()
+{
+    for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
+        if (toExecute.dispatchStatus(j) == EMPTY ||
+            toExecute.dispatchStatus(j) == SKIP) {
+            continue;
+        }
+
+        // get the wave on dispatch list and attempt to allocate write
+        // resources in the RFs
+        const GPUDynInstPtr &gpu_dyn_inst = toExecute.readyInst(j);
+        assert(gpu_dyn_inst);
+        Wavefront *wf = gpu_dyn_inst->wavefront();
+        if (!schedRfWrites(j, gpu_dyn_inst)) {
+            reinsertToSchList(j, gpu_dyn_inst);
+            doDispatchListTransition(j, EMPTY);
+            // if this is a flat inst, also transition the LM pipe to empty
+            // Note: since FLAT/LM arbitration occurs before scheduling
+            // destination operands to the RFs, it is possible that a LM
+            // instruction lost arbitration, but would have been able to
+            // pass the RF destination operand check here, and execute
+            // instead of the FLAT.
+            if (wf->instructionBuffer.front()->isFlat()) {
+                assert(toExecute.dispatchStatus(wf->localMem)
+                       == SKIP);
+                doDispatchListTransition(wf->localMem, EMPTY);
+            }
+        }
+    }
+}
+
+bool
+ScheduleStage::addToSchList(int exeType, const GPUDynInstPtr &gpu_dyn_inst)
+{
+    // Attempt to add the wave to the schList if the RFs can support the
+    // wave's next instruction
+    assert(gpu_dyn_inst);
+    Wavefront *wf = gpu_dyn_inst->wavefront();
+    bool accessVrf = true;
+    if (!gpu_dyn_inst->isScalar()) {
+        accessVrf = computeUnit.vrf[wf->simdId]
+            ->canScheduleReadOperands(wf, gpu_dyn_inst);
+    }
+    bool accessSrf = computeUnit.srf[wf->simdId]
+        ->canScheduleReadOperands(wf, gpu_dyn_inst);
+    // If the RFs can support the instruction, add it to schList in RFBUSY
+    // state, place the wave in wavesInSch and the pipeMap, and schedule
+    // its operand reads from the VRF and SRF
+    bool accessRf = accessVrf && accessSrf;
+    if (accessRf) {
+        DPRINTF(GPUSched, "schList[%d]: Adding: SIMD[%d] WV[%d]: %d: %s\n",
+                exeType, wf->simdId, wf->wfDynId,
+                gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
+
+        computeUnit.insertInPipeMap(wf);
+        wavesInSch.emplace(wf->wfDynId);
+        schList.at(exeType).push_back(std::make_pair(gpu_dyn_inst, RFBUSY));
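+        // If the wave's oldest instruction is a waitcnt, set the wave's
+        // status to S_WAITCNT so it waits for its outstanding memory
+        // counts to be satisfied.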
+        if (wf->isOldestInstWaitcnt()) {
+            wf->setStatus(Wavefront::S_WAITCNT);
+        }
+        if (!gpu_dyn_inst->isScalar()) {
+            computeUnit.vrf[wf->simdId]
+                ->scheduleReadOperands(wf, gpu_dyn_inst);
+        }
+        computeUnit.srf[wf->simdId]->scheduleReadOperands(wf, gpu_dyn_inst);
+
+        DPRINTF(GPUSched, "schList[%d]: Added: SIMD[%d] WV[%d]: %d: %s\n",
+                exeType, wf->simdId, wf->wfDynId,
+                gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
+        return true;
+    } else {
+        // Number of stall cycles due to RF access denied
+        rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
+        // Count number of denials due to each reason
+        // Multiple items may contribute to the denied request
+        if (!accessVrf) {
+            rfAccessStalls[SCH_VRF_RD_ACCESS_NRDY]++;
+        }
+        if (!accessSrf) {
+            rfAccessStalls[SCH_SRF_RD_ACCESS_NRDY]++;
+        }
+
+        // Increment stall counts for WF
+        wf->schStalls++;
+        wf->schRfAccessStalls++;
+        DPRINTF(GPUSched, "schList[%d]: Could not add: "
+                "SIMD[%d] WV[%d]: %d: %s\n",
+                exeType, wf->simdId, wf->wfDynId,
+                gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
+    }
+    return false;
+}
+
+void
+ScheduleStage::reinsertToSchList(int exeType,
+                                 const GPUDynInstPtr &gpu_dyn_inst)
+{
+    // Insert wave w into schList for specified exeType.
+    // Wave is inserted in age order, with oldest wave being at the
+    // front of the schList
+    assert(gpu_dyn_inst);
+    auto schIter = schList.at(exeType).begin();
+    while (schIter != schList.at(exeType).end()
+           && schIter->first->wfDynId < gpu_dyn_inst->wfDynId) {
+        schIter++;
+    }
+    schList.at(exeType).insert(schIter, std::make_pair(gpu_dyn_inst, RFREADY));
 }
 
 void
-ScheduleStage::init(ComputeUnit *cu)
+ScheduleStage::checkMemResources()
 {
-    computeUnit = cu;
-    _name = computeUnit->name() + ".ScheduleStage";
+    // Check for resource availability in the next cycle
+    scalarMemBusRdy = false;
+    scalarMemIssueRdy = false;
+    // check if there is an SRF->Global Memory bus available
+    if (computeUnit.srfToScalarMemPipeBus.rdy(Cycles(1))) {
+        scalarMemBusRdy = true;
+    }
+    // check if we can issue a scalar memory instruction
+    if (computeUnit.scalarMemUnit.rdy(Cycles(1))) {
+        scalarMemIssueRdy = true;
+    }
 
-    for (int j = 0; j < numSIMDs + numMemUnits; ++j) {
-        scheduler[j].bindList(&computeUnit->readyList[j]);
+    glbMemBusRdy = false;
+    glbMemIssueRdy = false;
+    // check if there is a VRF->Global Memory bus available
+    if (computeUnit.vrfToGlobalMemPipeBus.rdy(Cycles(1))) {
+        glbMemBusRdy = true;
+    }
+    // check if we can issue a Global memory instruction
+    if (computeUnit.vectorGlobalMemUnit.rdy(Cycles(1))) {
+        glbMemIssueRdy = true;
     }
 
-    for (int j = 0; j < numSIMDs; ++j) {
-        waveStatusList.push_back(&computeUnit->waveStatusList[j]);
+    locMemBusRdy = false;
+    locMemIssueRdy = false;
+    // check if there is a VRF->LDS bus available
+    if (computeUnit.vrfToLocalMemPipeBus.rdy(Cycles(1))) {
+        locMemBusRdy = true;
+    }
+    // check if we can issue a LDS instruction
+    if (computeUnit.vectorSharedMemUnit.rdy(Cycles(1))) {
+        locMemIssueRdy = true;
     }
+}
 
-    dispatchList = &computeUnit->dispatchList;
+bool
+ScheduleStage::dispatchReady(const GPUDynInstPtr &gpu_dyn_inst)
+{
+    assert(gpu_dyn_inst);
+    Wavefront *wf = gpu_dyn_inst->wavefront();
+    vectorAluRdy = false;
+    scalarAluRdy = false;
+    // check for available vector/scalar ALUs in the next cycle
+    if (computeUnit.vectorALUs[wf->simdId].rdy(Cycles(1))) {
+        vectorAluRdy = true;
+    }
+    if (computeUnit.scalarALUs[wf->scalarAlu].rdy(Cycles(1))) {
+        scalarAluRdy = true;
+    }
+
+    if (gpu_dyn_inst->isNop()) {
+        // S_NOP requires SALU. V_NOP requires VALU.
+        // TODO: Scalar NOP does not require SALU in hardware,
+        // and is executed out of IB directly.
+        if (gpu_dyn_inst->isScalar() && !scalarAluRdy) {
+            dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
+            return false;
+        } else if (!gpu_dyn_inst->isScalar() && !vectorAluRdy) {
+            dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
+            return false;
+        }
+    } else if (gpu_dyn_inst->isEndOfKernel()) {
+        // EndPgm instruction
+        if (gpu_dyn_inst->isScalar() && !scalarAluRdy) {
+            dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
+            return false;
+        }
+    } else if (gpu_dyn_inst->isBarrier() || gpu_dyn_inst->isBranch()
+               || gpu_dyn_inst->isALU()) {
+        // Barrier, Branch, or ALU instruction
+        if (gpu_dyn_inst->isScalar() && !scalarAluRdy) {
+            dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
+            return false;
+        } else if (!gpu_dyn_inst->isScalar() && !vectorAluRdy) {
+            dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
+            return false;
+        }
+    } else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isGlobalMem()) {
+        // Vector Global Memory instruction
+        bool rdy = true;
+        if (!glbMemIssueRdy) {
+            rdy = false;
+            dispNrdyStalls[SCH_VECTOR_MEM_ISSUE_NRDY]++;
+        }
+        if (!glbMemBusRdy) {
+            rdy = false;
+            dispNrdyStalls[SCH_VECTOR_MEM_BUS_BUSY_NRDY]++;
+        }
+        if (!computeUnit.globalMemoryPipe.coalescerReady(gpu_dyn_inst)) {
+            rdy = false;
+            dispNrdyStalls[SCH_VECTOR_MEM_COALESCER_NRDY]++;
+        }
+        if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(gpu_dyn_inst)) {
+            rdy = false;
+            dispNrdyStalls[SCH_VECTOR_MEM_REQS_NRDY]++;
+        }
+        if (!rdy) {
+            return false;
+        }
+    } else if (gpu_dyn_inst->isScalar() && gpu_dyn_inst->isGlobalMem()) {
+        // Scalar Global Memory instruction
+        bool rdy = true;
+        if (!scalarMemIssueRdy) {
+            rdy = false;
+            dispNrdyStalls[SCH_SCALAR_MEM_ISSUE_NRDY]++;
+        }
+        if (!scalarMemBusRdy) {
+            rdy = false;
+            dispNrdyStalls[SCH_SCALAR_MEM_BUS_BUSY_NRDY]++;
+        }
+        if (!computeUnit.scalarMemoryPipe
+            .isGMReqFIFOWrRdy(wf->scalarRdGmReqsInPipe
+            + wf->scalarWrGmReqsInPipe))
+        {
+            rdy = false;
+            dispNrdyStalls[SCH_SCALAR_MEM_FIFO_NRDY]++;
+        }
+        if (!rdy) {
+            return false;
+        }
+    } else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isLocalMem()) {
+        // Vector Local Memory instruction
+        bool rdy = true;
+        if (!locMemIssueRdy) {
+            rdy = false;
+            dispNrdyStalls[SCH_LOCAL_MEM_ISSUE_NRDY]++;
+        }
+        if (!locMemBusRdy) {
+            rdy = false;
+            dispNrdyStalls[SCH_LOCAL_MEM_BUS_BUSY_NRDY]++;
+        }
+        if (!computeUnit.localMemoryPipe.
+                isLMReqFIFOWrRdy(wf->rdLmReqsInPipe + wf->wrLmReqsInPipe)) {
+            rdy = false;
+            dispNrdyStalls[SCH_LOCAL_MEM_FIFO_NRDY]++;
+        }
+        if (!rdy) {
+            return false;
+        }
+    } else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isFlat()) {
+        // Vector Flat memory instruction
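+        // A flat address may resolve to either global memory or the LDS,
+        // so both global and local resources must be ready before issue.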
+        bool rdy = true;
+        if (!glbMemIssueRdy || !locMemIssueRdy) {
+            rdy = false;
+            dispNrdyStalls[SCH_FLAT_MEM_ISSUE_NRDY]++;
+        }
+        if (!glbMemBusRdy || !locMemBusRdy) {
+            rdy = false;
+            dispNrdyStalls[SCH_FLAT_MEM_BUS_BUSY_NRDY]++;
+        }
+        if (!computeUnit.globalMemoryPipe.coalescerReady(gpu_dyn_inst)) {
+            rdy = false;
+            dispNrdyStalls[SCH_FLAT_MEM_COALESCER_NRDY]++;
+        }
+        if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(gpu_dyn_inst)) {
+            rdy = false;
+            dispNrdyStalls[SCH_FLAT_MEM_REQS_NRDY]++;
+        }
+        if (!computeUnit.localMemoryPipe.
+                isLMReqFIFOWrRdy(wf->rdLmReqsInPipe + wf->wrLmReqsInPipe)) {
+            rdy = false;
+            dispNrdyStalls[SCH_FLAT_MEM_FIFO_NRDY]++;
+        }
+        if (!rdy) {
+            return false;
+        }
+    } else {
+        panic("%s: unknown instr checked for readiness",
+              gpu_dyn_inst->disassemble());
+        return false;
+    }
+    dispNrdyStalls[SCH_RDY]++;
+    return true;
 }
 
 void
-ScheduleStage::arbitrate()
-{
-    // iterate over all Memory pipelines
-    for (int j = numSIMDs; j < numSIMDs + numMemUnits; ++j) {
-        if (dispatchList->at(j).first) {
-            Wavefront *waveToMemPipe = dispatchList->at(j).first;
-            // iterate over all execution pipelines
-            for (int i = 0; i < numSIMDs + numMemUnits; ++i) {
-                if ((i != j) && (dispatchList->at(i).first)) {
-                    Wavefront *waveToExePipe = dispatchList->at(i).first;
-                    // if the two selected wavefronts are mapped to the same
-                    // SIMD unit then they share the VRF
-                    if (waveToMemPipe->simdId == waveToExePipe->simdId) {
-                        int simdId = waveToMemPipe->simdId;
-                        // Read VRF port arbitration:
-                        // If there are read VRF port conflicts between the
-                        // a memory and another instruction we drop the other
-                        // instruction. We don't need to check for write VRF
-                        // port conflicts because the memory instruction either
-                        // does not need to write to the VRF (store) or will
-                        // write to the VRF when the data comes back (load) in
-                        // which case the arbiter of the memory pipes will
-                        // resolve any conflicts
-                        if (computeUnit->vrf[simdId]->
-                            isReadConflict(waveToMemPipe->wfSlotId,
-                            waveToExePipe->wfSlotId)) {
-                            // FIXME: The "second" member variable is never
-                            // used in the model. I am setting it to READY
-                            // simply to follow the protocol of setting it
-                            // when the WF has an instruction ready to issue
-                            waveStatusList[simdId]->at(waveToExePipe->wfSlotId)
-                                                    .second = READY;
-
-                            dispatchList->at(i).first = nullptr;
-                            dispatchList->at(i).second = EMPTY;
-                            break;
-                        }
+ScheduleStage::fillDispatchList()
+{
+    // update execution resource status
+    checkMemResources();
+    // iterate execution resources
+    for (int j = 0; j < computeUnit.numExeUnits(); j++) {
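+        // each dispatch slot should have been reset to EMPTY by
+        // toExecute.reset() at the top of exec()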
+        assert(toExecute.dispatchStatus(j) == EMPTY);
+
+        // iterate waves in schList to pick one for dispatch
+        auto schIter = schList.at(j).begin();
+        bool dispatched = false;
+        while (schIter != schList.at(j).end()) {
+            // only attempt to dispatch if status is RFREADY
+            if (schIter->second == RFREADY) {
+                // Check if this wave is ready for dispatch
+                bool dispRdy = dispatchReady(schIter->first);
+                if (!dispatched && dispRdy) {
+                    // No other wave has been dispatched for this exe
+                    // resource, and this wave is ready. Place this wave
+                    // on dispatchList and make it ready for execution
+                    // next cycle.
+
+                    // Acquire a coalescer token if it is a global mem
+                    // operation.
+                    GPUDynInstPtr mp = schIter->first;
+                    if (!mp->isMemSync() && !mp->isScalar() &&
+                        (mp->isGlobalMem() || mp->isFlat())) {
+                        computeUnit.globalMemoryPipe.acqCoalescerToken(mp);
                     }
+
+                    doDispatchListTransition(j, EXREADY, schIter->first);
+                    DPRINTF(GPUSched, "dispatchList[%d]: fillDispatchList: "
+                            "EMPTY->EXREADY\n", j);
+                    schIter->first = nullptr;
+                    schIter = schList.at(j).erase(schIter);
+                    dispatched = true;
+                } else {
+                    // Either another wave has been dispatched, or this wave
+                    // was not ready, so it is stalled this cycle
+                    schIter->first->wavefront()->schStalls++;
+                    if (!dispRdy) {
+                        // not ready for dispatch, increment stall stat
+                        schIter->first->wavefront()->schResourceStalls++;
+                    }
+                    // Examine next wave for this resource
+                    schIter++;
                 }
+            } else {
+                // Wave not in RFREADY, try next wave
+                schIter++;
             }
         }
+
+        // Increment stall count if no wave sent to dispatchList for
+        // current execution resource
+        if (!dispatched) {
+            schListToDispListStalls[j]++;
+        } else {
+            schListToDispList[j]++;
+        }
     }
 }
 
 void
-ScheduleStage::exec()
+ScheduleStage::arbitrateVrfToLdsBus()
 {
-    for (int j = 0; j < numSIMDs + numMemUnits; ++j) {
-         uint32_t readyListSize = computeUnit->readyList[j].size();
+    // Arbitrate the VRF->GM and VRF->LDS buses for Flat memory ops.
+    // Note: a Flat instruction in GFX8 reserves both a VRF->Global Memory
+    // bus and a VRF->LDS bus. In GFX9, this is not the case.
 
-         // If no wave is ready to be scheduled on the execution resource
-         // then skip scheduling for this execution resource
-         if (!readyListSize) {
-             continue;
-         }
+    // iterate the GM pipelines
+    for (int i = 0; i < computeUnit.numVectorGlobalMemUnits; i++) {
+        // get the GM pipe index in the dispatchList
+        int gm_exe_unit = computeUnit.firstMemUnit() + i;
+        // get the wave in the dispatchList
+        GPUDynInstPtr &gpu_dyn_inst
+            = toExecute.readyInst(gm_exe_unit);
+        // If the WF is valid, ready to execute, and the instruction
+        // is a flat access, arbitrate with the WF's assigned LM pipe
+        if (gpu_dyn_inst && toExecute.dispatchStatus(gm_exe_unit)
+            == EXREADY && gpu_dyn_inst->isFlat()) {
+            Wavefront *wf = gpu_dyn_inst->wavefront();
+            // If the associated LM pipe also has a wave selected, block
+            // that wave and let the Flat instruction issue. The WF in the
+            // LM pipe is added back to the schList for consideration next
+            // cycle.
+            if (toExecute.dispatchStatus(wf->localMem) == EXREADY) {
+                reinsertToSchList(wf->localMem, toExecute
+                                  .readyInst(wf->localMem));
+                // Increment stall stats for LDS-VRF arbitration
+                ldsBusArbStalls++;
+                toExecute.readyInst(wf->localMem)
+                    ->wavefront()->schLdsArbStalls++;
+            }
+            // With arbitration of LM pipe complete, transition the
+            // LM pipe to SKIP state in the dispatchList to inform EX stage
+            // that a Flat instruction is executing next cycle
+            doDispatchListTransition(wf->localMem, SKIP, gpu_dyn_inst);
+            DPRINTF(GPUSched, "dispatchList[%d]: arbVrfLds: "
+                    "EXREADY->SKIP\n", wf->localMem);
+        }
+    }
+}
 
-         Wavefront *waveToBeDispatched = scheduler[j].chooseWave();
-         dispatchList->at(j).first = waveToBeDispatched;
-         waveToBeDispatched->updateResources();
-         dispatchList->at(j).second = FILLED;
+void
+ScheduleStage::checkRfOperandReadComplete()
+{
+    // Iterate the schList queues and check if operand reads
+    // have completed in the RFs. If so, mark the wave as ready for
+    // selection for dispatchList
+    for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
+        for (auto &p : schList.at(j)) {
+            const GPUDynInstPtr &gpu_dyn_inst = p.first;
+            assert(gpu_dyn_inst);
+            Wavefront *wf = gpu_dyn_inst->wavefront();
 
-         waveStatusList[waveToBeDispatched->simdId]->at(
-                 waveToBeDispatched->wfSlotId).second = BLOCKED;
+            // Increment the number of cycles the wave spends in the
+            // SCH stage, since this loop visits every wave in SCH.
+            wf->schCycles++;
 
-         assert(computeUnit->readyList[j].size() == readyListSize - 1);
+            bool vrfRdy = true;
+            if (!gpu_dyn_inst->isScalar()) {
+                vrfRdy = computeUnit.vrf[wf->simdId]
+                    ->operandReadComplete(wf, gpu_dyn_inst);
+            }
+            bool srfRdy = computeUnit.srf[wf->simdId]
+                ->operandReadComplete(wf, gpu_dyn_inst);
+            bool operandsReady = vrfRdy && srfRdy;
+            if (operandsReady) {
+                DPRINTF(GPUSched, "schList[%d]: WV[%d] operands ready for: "
+                        "%d: %s\n", j, wf->wfDynId, gpu_dyn_inst->seqNum(),
+                        gpu_dyn_inst->disassemble());
+                DPRINTF(GPUSched, "schList[%d]: WV[%d] RFBUSY->RFREADY\n",
+                        j, wf->wfDynId);
+                p.second = RFREADY;
+            } else {
+                DPRINTF(GPUSched, "schList[%d]: WV[%d] operands not ready "
+                        "for: %d: %s\n", j, wf->wfDynId,
+                        gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
+
+                // operands not ready yet; increment SCH stage stats
+                // aggregated over all wavefronts on the CU
+                p.second = RFBUSY;
+
+                // Increment stall stats
+                wf->schStalls++;
+                wf->schOpdNrdyStalls++;
+
+                opdNrdyStalls[SCH_RF_OPD_NRDY]++;
+                if (!vrfRdy) {
+                    opdNrdyStalls[SCH_VRF_OPD_NRDY]++;
+                }
+                if (!srfRdy) {
+                    opdNrdyStalls[SCH_SRF_OPD_NRDY]++;
+                }
+            }
+        }
     }
-    // arbitrate over all shared resources among instructions being issued
-    // simultaneously
-    arbitrate();
+}
+
+void
+ScheduleStage::reserveResources()
+{
+    std::vector<bool> exeUnitReservations;
+    exeUnitReservations.resize(computeUnit.numExeUnits(), false);
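+    // track which execution units are reserved this cycle so duplicate
+    // reservations trigger the panic below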
+
+    for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
+        GPUDynInstPtr &gpu_dyn_inst = toExecute.readyInst(j);
+        if (gpu_dyn_inst) {
+            DISPATCH_STATUS s = toExecute.dispatchStatus(j);
+            Wavefront *wf = gpu_dyn_inst->wavefront();
+            if (s == EMPTY) {
+                continue;
+            } else if (s == EXREADY) {
+                // Wave is ready for execution
+                std::vector<int> execUnitIds = wf->reserveResources();
+
+                if (!gpu_dyn_inst->isScalar()) {
+                    computeUnit.vrf[wf->simdId]
+                        ->dispatchInstruction(gpu_dyn_inst);
+                }
+                computeUnit.srf[wf->simdId]->dispatchInstruction(gpu_dyn_inst);
+
+                std::stringstream ss;
+                for (auto id : execUnitIds) {
+                    ss << id << " ";
+                }
+                DPRINTF(GPUSched, "dispatchList[%d]: SIMD[%d] WV[%d]: %d: %s"
+                        "    Reserving ExeRes[ %s]\n",
+                        j, wf->simdId, wf->wfDynId, gpu_dyn_inst->seqNum(),
+                        gpu_dyn_inst->disassemble(), ss.str());
+                // mark the resources as reserved for this cycle
+                for (auto execUnitId : execUnitIds) {
+                    panic_if(exeUnitReservations.at(execUnitId),
+                             "Execution unit %d is reserved!!!\n"
+                             "SIMD[%d] WV[%d]: %d: %s",
+                             execUnitId, wf->simdId, wf->wfDynId,
+                             gpu_dyn_inst->seqNum(),
+                             gpu_dyn_inst->disassemble());
+                    exeUnitReservations.at(execUnitId) = true;
+                }
+
+                // If wavefront::reserveResources reserved multiple resources,
+                // then we're executing a flat memory instruction. This means
+                // that we've reserved a global and local memory unit. Thus,
+                // we need to mark the latter execution unit as not available.
+                if (execUnitIds.size() > 1) {
+                    int lm_exec_unit M5_VAR_USED = wf->localMem;
+                    assert(toExecute.dispatchStatus(lm_exec_unit)
+                           == SKIP);
+                }
+            } else if (s == SKIP) {
+                // Shared Memory pipe reserved for FLAT instruction.
+                // Verify the GM pipe for this wave is ready to execute
+                // and the wave in the GM pipe is the same as the wave
+                // in the LM pipe
+                int gm_exec_unit M5_VAR_USED = wf->globalMem;
+                assert(wf->wfDynId == toExecute
+                       .readyInst(gm_exec_unit)->wfDynId);
+                assert(toExecute.dispatchStatus(gm_exec_unit)
+                       == EXREADY);
+            }
+        }
+    }
+}
+
+void
+ScheduleStage::deleteFromSch(Wavefront *w)
+{
+    wavesInSch.erase(w->wfDynId);
 }
 
 void
 ScheduleStage::regStats()
 {
+    rdyListNotEmpty
+        .init(computeUnit.numExeUnits())
+        .name(name() + ".rdy_list_not_empty")
+        .desc("number of cycles one or more wave on ready list per "
+              "execution resource")
+        ;
+
+    rdyListEmpty
+        .init(computeUnit.numExeUnits())
+        .name(name() + ".rdy_list_empty")
+        .desc("number of cycles no wave on ready list per "
+              "execution resource")
+        ;
+
+    addToSchListStalls
+        .init(computeUnit.numExeUnits())
+        .name(name() + ".sch_list_add_stalls")
+        .desc("number of cycles a wave is not added to schList per "
+              "execution resource when ready list is not empty")
+        ;
+
+    schListToDispList
+        .init(computeUnit.numExeUnits())
+        .name(name() + ".sch_list_to_disp_list")
+        .desc("number of cycles a wave is added to dispatchList per "
+              "execution resource")
+        ;
+
+    schListToDispListStalls
+        .init(computeUnit.numExeUnits())
+        .name(name() + ".sch_list_to_disp_list_stalls")
+        .desc("number of cycles no wave is added to dispatchList per "
+              "execution resource")
+        ;
+
+    // Operand Readiness Stall Cycles
+    opdNrdyStalls
+        .init(SCH_RF_OPD_NRDY_CONDITIONS)
+        .name(name() + ".opd_nrdy_stalls")
+        .desc("number of stalls in SCH due to operands not ready")
+        ;
+    opdNrdyStalls.subname(SCH_VRF_OPD_NRDY, csprintf("VRF"));
+    opdNrdyStalls.subname(SCH_SRF_OPD_NRDY, csprintf("SRF"));
+    opdNrdyStalls.subname(SCH_RF_OPD_NRDY, csprintf("RF"));
+
+    // dispatchReady Stall Cycles
+    dispNrdyStalls
+        .init(SCH_NRDY_CONDITIONS)
+        .name(name() + ".disp_nrdy_stalls")
+        .desc("number of stalls in SCH due to resource not ready")
+        ;
+    dispNrdyStalls.subname(SCH_SCALAR_ALU_NRDY, csprintf("ScalarAlu"));
+    dispNrdyStalls.subname(SCH_VECTOR_ALU_NRDY, csprintf("VectorAlu"));
+    dispNrdyStalls.subname(SCH_VECTOR_MEM_ISSUE_NRDY,
+                                  csprintf("VectorMemIssue"));
+    dispNrdyStalls.subname(SCH_VECTOR_MEM_BUS_BUSY_NRDY,
+                                  csprintf("VectorMemBusBusy"));
+    dispNrdyStalls.subname(SCH_VECTOR_MEM_COALESCER_NRDY,
+                                  csprintf("VectorMemCoalescer"));
+    dispNrdyStalls.subname(SCH_CEDE_SIMD_NRDY, csprintf("CedeSimd"));
+    dispNrdyStalls.subname(SCH_SCALAR_MEM_ISSUE_NRDY,
+                                  csprintf("ScalarMemIssue"));
+    dispNrdyStalls.subname(SCH_SCALAR_MEM_BUS_BUSY_NRDY,
+                                  csprintf("ScalarMemBusBusy"));
+    dispNrdyStalls.subname(SCH_SCALAR_MEM_FIFO_NRDY,
+                                  csprintf("ScalarMemFIFO"));
+    dispNrdyStalls.subname(SCH_LOCAL_MEM_ISSUE_NRDY,
+                                  csprintf("LocalMemIssue"));
+    dispNrdyStalls.subname(SCH_LOCAL_MEM_BUS_BUSY_NRDY,
+                                  csprintf("LocalMemBusBusy"));
+    dispNrdyStalls.subname(SCH_LOCAL_MEM_FIFO_NRDY,
+                                  csprintf("LocalMemFIFO"));
+    dispNrdyStalls.subname(SCH_FLAT_MEM_ISSUE_NRDY,
+                                  csprintf("FlatMemIssue"));
+    dispNrdyStalls.subname(SCH_FLAT_MEM_BUS_BUSY_NRDY,
+                                  csprintf("FlatMemBusBusy"));
+    dispNrdyStalls.subname(SCH_FLAT_MEM_COALESCER_NRDY,
+                                  csprintf("FlatMemCoalescer"));
+    dispNrdyStalls.subname(SCH_FLAT_MEM_FIFO_NRDY,
+                                  csprintf("FlatMemFIFO"));
+    dispNrdyStalls.subname(SCH_RDY, csprintf("Ready"));
+
+    // RF Access Stall Cycles
+    rfAccessStalls
+        .init(SCH_RF_ACCESS_NRDY_CONDITIONS)
+        .name(name() + ".rf_access_stalls")
+        .desc("number of stalls due to RF access denied")
+        ;
+    rfAccessStalls.subname(SCH_VRF_RD_ACCESS_NRDY, csprintf("VrfRd"));
+    rfAccessStalls.subname(SCH_VRF_WR_ACCESS_NRDY, csprintf("VrfWr"));
+    rfAccessStalls.subname(SCH_SRF_RD_ACCESS_NRDY, csprintf("SrfRd"));
+    rfAccessStalls.subname(SCH_SRF_WR_ACCESS_NRDY, csprintf("SrfWr"));
+    rfAccessStalls.subname(SCH_RF_ACCESS_NRDY, csprintf("Any"));
+
+    // Stall cycles due to wave losing LDS bus arbitration
+    ldsBusArbStalls
+        .name(name() + ".lds_bus_arb_stalls")
+        .desc("number of stalls due to VRF->LDS bus conflicts")
+        ;
 }