gpu-compute: Add pipeline stage interface classes

author Tony Gutierrez <anthony.gutierrez@amd.com>

Mon, 2 Jul 2018 19:56:22 +0000 (15:56 -0400)

committer Anthony Gutierrez <anthony.gutierrez@amd.com>

Fri, 17 Jul 2020 16:36:09 +0000 (16:36 +0000)
author Tony Gutierrez <anthony.gutierrez@amd.com>
Mon, 2 Jul 2018 19:56:22 +0000 (15:56 -0400)
committer Anthony Gutierrez <anthony.gutierrez@amd.com>
Fri, 17 Jul 2020 16:36:09 +0000 (16:36 +0000)
diff --git a/src/gpu-compute/SConscript b/src/gpu-compute/SConscript

index 244791b9b29afc83c300d266ececdbf9e0be16de..0f1afbcca35aa8f042996642c4d9bbb03207abde 100644 (file)
--- a/src/gpu-compute/SConscript
+++ b/src/gpu-compute/SConscript
@@ -41,6 +41,7 @@ SimObject('GPUStaticInstFlags.py')
  SimObject('LdsState.py')
  SimObject('X86GPUTLB.py')
  
+Source('comm.cc')
  Source('compute_unit.cc')
  Source('dispatcher.cc')
  Source('exec_stage.cc')
diff --git a/src/gpu-compute/comm.cc b/src/gpu-compute/comm.cc

new file mode 100644 (file)

index 0000000..b1dd031
--- /dev/null
+++ b/src/gpu-compute/comm.cc
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2018 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Anthony Gutierrez
+ */
+
+#include "gpu-compute/comm.hh"
+
+#include <cassert>
+
+#include "gpu-compute/wavefront.hh"
+#include "params/ComputeUnit.hh"
+
+/**
+ * Scoreboard/Schedule stage interface.
+ */
+ScoreboardCheckToSchedule::ScoreboardCheckToSchedule(const ComputeUnitParams
+                                                     *p)
+{
+    int num_func_units = p->num_SIMDs + p->num_scalar_cores
+        + p->num_global_mem_pipes + p->num_shared_mem_pipes
+        + p->num_scalar_mem_pipes;
+    _readyWFs.resize(num_func_units);
+
+    for (auto &func_unit_wf_list : _readyWFs) {
+        func_unit_wf_list.reserve(p->n_wf);
+    }
+}
+
+void
+ScoreboardCheckToSchedule::reset()
+{
+    for (auto &func_unit_wf_list : _readyWFs) {
+        func_unit_wf_list.resize(0);
+    }
+}
+
+void
+ScoreboardCheckToSchedule::markWFReady(Wavefront *wf, int func_unit_id)
+{
+    _readyWFs[func_unit_id].push_back(wf);
+}
+
+int
+ScoreboardCheckToSchedule::numReadyLists() const
+{
+    return _readyWFs.size();
+}
+
+std::vector<Wavefront*>&
+ScoreboardCheckToSchedule::readyWFs(int func_unit_id)
+{
+    return _readyWFs[func_unit_id];
+}
+
+/**
+ * Delete all wavefronts that have been marked as ready at scoreboard stage
+ * but are found to have empty instruction buffers at schedule stage.
+ */
+void
+ScoreboardCheckToSchedule::updateReadyList(int func_unit_id)
+{
+    std::vector<Wavefront*> &func_unit_wf_list = _readyWFs[func_unit_id];
+
+    for (auto it = func_unit_wf_list.begin(); it != func_unit_wf_list.end();) {
+        if ((*it)->instructionBuffer.empty()) {
+            it = func_unit_wf_list.erase(it);
+        } else {
+            ++it;
+        }
+    }
+}
+
+/**
+ * Schedule/Execute stage interface.
+ */
+ScheduleToExecute::ScheduleToExecute(const ComputeUnitParams *p)
+{
+    int num_func_units = p->num_SIMDs + p->num_scalar_cores
+        + p->num_global_mem_pipes + p->num_shared_mem_pipes
+        + p->num_scalar_mem_pipes;
+    _readyInsts.resize(num_func_units, nullptr);
+    _dispatchStatus.resize(num_func_units, EMPTY);
+}
+
+void
+ScheduleToExecute::reset()
+{
+    for (auto &func_unit_ready_inst : _readyInsts) {
+        func_unit_ready_inst = nullptr;
+    }
+
+    for (auto &func_unit_status : _dispatchStatus) {
+        func_unit_status = EMPTY;
+    }
+}
+
+GPUDynInstPtr&
+ScheduleToExecute::readyInst(int func_unit_id)
+{
+    return _readyInsts[func_unit_id];
+}
+
+void
+ScheduleToExecute::dispatchTransition(const GPUDynInstPtr &gpu_dyn_inst,
+                                      int func_unit_id,
+                                      DISPATCH_STATUS disp_status)
+{
+    _readyInsts[func_unit_id] = gpu_dyn_inst;
+    _dispatchStatus[func_unit_id] = disp_status;
+}
+
+void
+ScheduleToExecute::dispatchTransition(int func_unit_id,
+                                      DISPATCH_STATUS disp_status)
+{
+    _readyInsts[func_unit_id] = nullptr;
+    _dispatchStatus[func_unit_id] = disp_status;
+}
+
+DISPATCH_STATUS
+ScheduleToExecute::dispatchStatus(int func_unit_id) const
+{
+    return _dispatchStatus[func_unit_id];
+}
diff --git a/src/gpu-compute/comm.hh b/src/gpu-compute/comm.hh

new file mode 100644 (file)

index 0000000..bc3ec7b
--- /dev/null
+++ b/src/gpu-compute/comm.hh
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2018 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Anthony Gutierrez
+ */
+
+#ifndef __GPU_COMPUTE_COMM_HH__
+#define __GPU_COMPUTE_COMM_HH__
+
+#include <array>
+#include <vector>
+
+#include "gpu-compute/exec_stage.hh"
+#include "gpu-compute/misc.hh"
+
+struct ComputeUnitParams;
+class Wavefront;
+
+class PipeStageIFace
+{
+  public:
+    /**
+     * Reset the pipe stage interface. This is called to remove
+     * any stale state from the pipe stage that is leftover from
+     * the prior cycle. This is needed when stages do not actually
+     * consume the information passed via the stage interfaces.
+     */
+    virtual void reset() = 0;
+};
+
+/**
+ * Communication interface between ScoreboardCheck and Schedule stages.
+ */
+class ScoreboardCheckToSchedule : public PipeStageIFace
+{
+  public:
+    ScoreboardCheckToSchedule() = delete;
+    ScoreboardCheckToSchedule(const ComputeUnitParams *p);
+    void reset() override;
+    /**
+     * Mark the WF as ready for execution on a particular functional
+     * unit.
+     */
+    void markWFReady(Wavefront *wf, int func_unit_id);
+    /**
+     * Returns the number of ready lists (i.e., the number of functional
+     * units). Each functional unit has its own list of ready WFs to
+     * consider for arbitration.
+     */
+    int numReadyLists() const;
+    /**
+     * TODO: These methods expose this class' implementation too much by
+     *       returning references to its internal data structures directly.
+     *       These are to support legacy functionality in the CU pipeline.
+     *       They should be removed eventually for an API that hides such
+     *       implementation details.
+     */
+    std::vector<Wavefront*>& readyWFs(int func_unit_id);
+
+    // TODO: Leftover from old CU code, needs to go away.
+    void updateReadyList(int func_unit_id);
+
+  private:
+    std::vector<std::vector<Wavefront*>> _readyWFs;
+};
+
+/**
+ * Communication interface between Schedule and Execute stages.
+ */
+class ScheduleToExecute : public PipeStageIFace
+{
+  public:
+    ScheduleToExecute() = delete;
+    ScheduleToExecute(const ComputeUnitParams *p);
+    void reset() override;
+    GPUDynInstPtr& readyInst(int func_unit_id);
+    /**
+     * Once the scheduler has chosen a winning WF for execution, and
+     * after the WF's oldest instruction's operands have been read,
+     * this method is used to mark the instruction as ready to execute.
+     * This puts it on the dispatch list to be consumed by the execute
+     * stage.
+     */
+    void dispatchTransition(const GPUDynInstPtr &gpu_dyn_inst,
+                            int func_unit_id, DISPATCH_STATUS disp_status);
+    void dispatchTransition(int func_unit_id, DISPATCH_STATUS disp_status);
+    DISPATCH_STATUS dispatchStatus(int func_unit_id) const;
+
+  private:
+    std::vector<GPUDynInstPtr> _readyInsts;
+    std::vector<DISPATCH_STATUS> _dispatchStatus;
+};
+
+#endif // __GPU_COMPUTE_COMM_HH__
diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc

index a59a7fd6e24e2f6d91ca1c85f4a06d3017bdc348..067c254696f83e1f46cdb510dc87e8407427709b 100644 (file)
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -68,9 +68,9 @@ ComputeUnit::ComputeUnit(const Params *p) : ClockedObject(p),
      coalescerToVrfBusWidth(p->coalescer_to_vrf_bus_width),
      registerManager(p->register_manager),
      fetchStage(p, *this),
-    scoreboardCheckStage(p, *this),
-    scheduleStage(p, *this),
-    execStage(p, *this),
+    scoreboardCheckStage(p, *this, scoreboardCheckToSchedule),
+    scheduleStage(p, *this, scoreboardCheckToSchedule, scheduleToExecute),
+    execStage(p, *this, scheduleToExecute),
      globalMemoryPipe(p, *this),
      localMemoryPipe(p, *this),
      scalarMemoryPipe(p, *this),
@@ -98,7 +98,9 @@ ComputeUnit::ComputeUnit(const Params *p) : ClockedObject(p),
      lds(*p->localDataStore), gmTokenPort(name() + ".gmTokenPort", this),
      _cacheLineSize(p->system->cacheLineSize()),
      _numBarrierSlots(p->num_barrier_slots),
-    globalSeqNum(0), wavefrontSize(p->wf_size)
+    globalSeqNum(0), wavefrontSize(p->wf_size),
+    scoreboardCheckToSchedule(p),
+    scheduleToExecute(p)
  {
      /**
       * This check is necessary because std::bitset only provides conversion
@@ -213,8 +215,6 @@ ComputeUnit::~ComputeUnit()
          lastVaddrSimd[j].clear();
      }
      lastVaddrCU.clear();
-    readyList.clear();
-    dispatchList.clear();
      delete cuExitCallback;
      delete ldsPort;
  }
@@ -297,24 +297,6 @@ ComputeUnit::fillKernelState(Wavefront *w, HSAQueueEntry *task)
      w->computeActualWgSz(task);
  }
  
-// delete all wavefronts that have been marked as ready at SCB stage
-// but are found to have empty instruction buffers at SCH stage
-void
-ComputeUnit::updateReadyList(int unitId)
-{
-    if (!readyList[unitId].empty()) {
-        for (std::vector<Wavefront *>::iterator it = readyList[unitId].begin();
-             it != readyList[unitId].end();) {
-            if ((*it)->instructionBuffer.empty()) {
-                it = readyList[unitId].erase(it);
-            }
-            else {
-                ++it;
-            }
-        }
-    }
-}
-
  void
  ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
                              HSAQueueEntry *task, int bar_id, bool fetchContext)
@@ -786,15 +768,7 @@ ComputeUnit::init()
      vectorRegsReserved.resize(numVectorALUs, 0);
      scalarRegsReserved.resize(numVectorALUs, 0);
  
-    // Initializing pipeline resources
-    readyList.resize(numExeUnits());
-
-    for (int j = 0; j < numExeUnits(); ++j) {
-        dispatchList.push_back(std::make_pair(nullptr, EMPTY));
-    }
-
      fetchStage.init();
-    scoreboardCheckStage.init();
      scheduleStage.init();
      execStage.init();
      globalMemoryPipe.init();
diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh

index d4e978b40894aa7ad60e8e9dd9c2d1484a85d50d..22960c0c9166e4011be48139b2ee1e50f3cb92f8 100644 (file)
--- a/src/gpu-compute/compute_unit.hh
+++ b/src/gpu-compute/compute_unit.hh
@@ -44,6 +44,7 @@
  #include "base/types.hh"
  #include "config/the_gpu_isa.hh"
  #include "enums/PrefetchType.hh"
+#include "gpu-compute/comm.hh"
  #include "gpu-compute/exec_stage.hh"
  #include "gpu-compute/fetch_stage.hh"
  #include "gpu-compute/global_memory_pipeline.hh"
@@ -266,40 +267,6 @@ class ComputeUnit : public ClockedObject
      int numCyclesPerStoreTransfer;  // number of cycles per vector store
      int numCyclesPerLoadTransfer;  // number of cycles per vector load
  
-    // Buffers used to communicate between various pipeline stages
-
-    // At a high level, the following intra-/inter-stage communication occurs:
-    // SCB to SCH: readyList provides per exec resource list of waves that
-    //             passed dependency and readiness checks. If selected by
-    //             scheduler, attempt to add wave to schList conditional on
-    //             RF support.
-    // SCH: schList holds waves that are gathering operands or waiting
-    //      for execution resource availability. Once ready, waves are
-    //      placed on the dispatchList as candidates for execution. A wave
-    //      may spend multiple cycles in SCH stage, on the schList due to
-    //      RF access conflicts or execution resource contention.
-    // SCH to EX: dispatchList holds waves that are ready to be executed.
-    //            LM/FLAT arbitration may remove an LM wave and place it
-    //            back on the schList. RF model may also force a wave back
-    //            to the schList if using the detailed model.
-
-    // List of waves which are ready to be scheduled.
-    // Each execution resource has a ready list. readyList is
-    // used to communicate between scoreboardCheck stage and
-    // schedule stage
-    std::vector<std::vector<Wavefront*>> readyList;
-
-    // List of waves which will be dispatched to
-    // each execution resource. An EXREADY implies
-    // dispatch list is non-empty and
-    // execution unit has something to execute
-    // this cycle. Currently, the dispatch list of
-    // an execution resource can hold only one wave because
-    // an execution resource can execute only one wave in a cycle.
-    // dispatchList is used to communicate between schedule
-    // and exec stage
-    // TODO: convert std::pair to a class to increase readability
-    std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> dispatchList;
      // track presence of dynamic instructions in the Schedule pipeline
      // stage. This is used to check the readiness of the oldest,
      // non-dispatched instruction of every WF in the Scoreboard stage.
@@ -413,8 +380,6 @@ class ComputeUnit : public ClockedObject
      // number of available scalar registers per SIMD unit
      int numScalarRegsPerSimd;
  
-    void updateReadyList(int unitId);
-
      // this hash map will keep track of page divergence
      // per memory instruction per wavefront. The hash map
      // is cleared in GPUDynInst::updateStats() in gpu_dyn_inst.cc.
@@ -1117,6 +1082,41 @@ class ComputeUnit : public ClockedObject
      InstSeqNum globalSeqNum;
      int wavefrontSize;
  
+    /**
+     * TODO: Update these comments once the pipe stage interface has
+     *       been fully refactored.
+     *
+     * Pipeline stage interfaces.
+     *
+     * Buffers used to communicate between various pipeline stages
+     * List of waves which will be dispatched to
+     * each execution resource. An EXREADY implies
+     * dispatch list is non-empty and
+     * execution unit has something to execute
+     * this cycle. Currently, the dispatch list of
+     * an execution resource can hold only one wave because
+     * an execution resource can execute only one wave in a cycle.
+     * dispatchList is used to communicate between schedule
+     * and exec stage
+     *
+     * At a high level, the following intra-/inter-stage communication occurs:
+     * SCB to SCH: readyList provides per exec resource list of waves that
+     *             passed dependency and readiness checks. If selected by
+     *             scheduler, attempt to add wave to schList conditional on
+     *             RF support.
+     * SCH: schList holds waves that are gathering operands or waiting
+     *      for execution resource availability. Once ready, waves are
+     *      placed on the dispatchList as candidates for execution. A wave
+     *      may spend multiple cycles in SCH stage, on the schList due to
+     *      RF access conflicts or execution resource contention.
+     * SCH to EX: dispatchList holds waves that are ready to be executed.
+     *            LM/FLAT arbitration may remove an LM wave and place it
+     *            back on the schList. RF model may also force a wave back
+     *            to the schList if using the detailed model.
+     */
+    ScoreboardCheckToSchedule scoreboardCheckToSchedule;
+    ScheduleToExecute scheduleToExecute;
+
      /**
       * The barrier slots for this CU.
       */
diff --git a/src/gpu-compute/exec_stage.cc b/src/gpu-compute/exec_stage.cc

index 2b0a79785c6795d9d6b62c2756da6ada30fda51f..79fca724f9784657efbbd10f34c7aab81a6eaf54 100644 (file)
--- a/src/gpu-compute/exec_stage.cc
+++ b/src/gpu-compute/exec_stage.cc
@@ -41,8 +41,10 @@
  #include "gpu-compute/vector_register_file.hh"
  #include "gpu-compute/wavefront.hh"
  
-ExecStage::ExecStage(const ComputeUnitParams *p, ComputeUnit &cu)
-    : computeUnit(cu), lastTimeInstExecuted(false),
+ExecStage::ExecStage(const ComputeUnitParams *p, ComputeUnit &cu,
+                     ScheduleToExecute &from_schedule)
+    : computeUnit(cu), fromSchedule(from_schedule),
+      lastTimeInstExecuted(false),
        thisTimeInstExecuted(false), instrExecuted (false),
        executionResourcesUsed(0), _name(cu.name() + ".ExecStage")
  
@@ -54,7 +56,6 @@ ExecStage::ExecStage(const ComputeUnitParams *p, ComputeUnit &cu)
  void
  ExecStage::init()
  {
-    dispatchList = &computeUnit.dispatchList;
      idle_dur = 0;
  }
  
@@ -128,14 +129,15 @@ ExecStage::dumpDispList()
      std::stringstream ss;
      bool empty = true;
      for (int i = 0; i < computeUnit.numExeUnits(); i++) {
-        DISPATCH_STATUS s = dispatchList->at(i).second;
+        DISPATCH_STATUS s = fromSchedule.dispatchStatus(i);
          ss << i << ": " << dispStatusToStr(s);
          if (s != EMPTY) {
              empty = false;
-            Wavefront *w = dispatchList->at(i).first;
-            ss << " SIMD[" << w->simdId << "] WV[" << w->wfDynId << "]: ";
-            ss << (w->instructionBuffer.front())->seqNum() << ": ";
-            ss << (w->instructionBuffer.front())->disassemble();
+            GPUDynInstPtr &gpu_dyn_inst = fromSchedule.readyInst(i);
+            Wavefront *wf = gpu_dyn_inst->wavefront();
+            ss << " SIMD[" << wf->simdId << "] WV[" << wf->wfDynId << "]: ";
+            ss << (wf->instructionBuffer.front())->seqNum() << ": ";
+            ss << (wf->instructionBuffer.front())->disassemble();
          }
          ss << "\n";
      }
@@ -152,36 +154,41 @@ ExecStage::exec()
          dumpDispList();
      }
      for (int unitId = 0; unitId < computeUnit.numExeUnits(); ++unitId) {
-        DISPATCH_STATUS s = dispatchList->at(unitId).second;
+        DISPATCH_STATUS s = fromSchedule.dispatchStatus(unitId);
          switch (s) {
-        case EMPTY:
+          case EMPTY:
              // Do not execute if empty, waiting for VRF reads,
              // or LM tied to GM waiting for VRF reads
              collectStatistics(IdleExec, unitId);
              break;
-        case EXREADY:
-        {
-            collectStatistics(BusyExec, unitId);
-            Wavefront *w = dispatchList->at(unitId).first;
-            DPRINTF(GPUSched, "Exec[%d]: SIMD[%d] WV[%d]: %s\n",
-                    unitId, w->simdId, w->wfDynId,
-                    (w->instructionBuffer.front())->disassemble());
-            DPRINTF(GPUSched, "dispatchList[%d] EXREADY->EMPTY\n", unitId);
-            dispatchList->at(unitId).first->exec();
-            (computeUnit.scheduleStage).deleteFromSch(w);
-            dispatchList->at(unitId).second = EMPTY;
-            dispatchList->at(unitId).first->freeResources();
-            dispatchList->at(unitId).first = nullptr;
-            break;
-        }
-        case SKIP:
-            collectStatistics(BusyExec, unitId);
-            DPRINTF(GPUSched, "dispatchList[%d] SKIP->EMPTY\n", unitId);
-            dispatchList->at(unitId).second = EMPTY;
-            dispatchList->at(unitId).first->freeResources();
-            dispatchList->at(unitId).first = nullptr;
-            break;
-        default:
+          case EXREADY:
+            {
+                collectStatistics(BusyExec, unitId);
+                GPUDynInstPtr &gpu_dyn_inst = fromSchedule.readyInst(unitId);
+                assert(gpu_dyn_inst);
+                Wavefront *wf = gpu_dyn_inst->wavefront();
+                DPRINTF(GPUSched, "Exec[%d]: SIMD[%d] WV[%d]: %s\n",
+                        unitId, wf->simdId, wf->wfDynId,
+                        gpu_dyn_inst->disassemble());
+                DPRINTF(GPUSched, "dispatchList[%d] EXREADY->EMPTY\n", unitId);
+                wf->exec();
+                (computeUnit.scheduleStage).deleteFromSch(wf);
+                fromSchedule.dispatchTransition(unitId, EMPTY);
+                wf->freeResources();
+                break;
+            }
+          case SKIP:
+            {
+                collectStatistics(BusyExec, unitId);
+                GPUDynInstPtr &gpu_dyn_inst = fromSchedule.readyInst(unitId);
+                assert(gpu_dyn_inst);
+                Wavefront *wf = gpu_dyn_inst->wavefront();
+                DPRINTF(GPUSched, "dispatchList[%d] SKIP->EMPTY\n", unitId);
+                fromSchedule.dispatchTransition(unitId, EMPTY);
+                wf->freeResources();
+                break;
+            }
+          default:
              panic("Unknown dispatch status in exec()\n");
          }
      }
diff --git a/src/gpu-compute/exec_stage.hh b/src/gpu-compute/exec_stage.hh

index cd4343e6d54098b63fb5cb0fdff6e56b47a1f2bf..23e9369b3d3285b2e927cc0340f14dc6bec3366f 100644 (file)
--- a/src/gpu-compute/exec_stage.hh
+++ b/src/gpu-compute/exec_stage.hh
@@ -42,7 +42,9 @@
  #include "sim/stats.hh"
  
  class ComputeUnit;
+class ScheduleToExecute;
  class Wavefront;
+
  struct ComputeUnitParams;
  
  enum STAT_STATUS
@@ -69,7 +71,8 @@ enum DISPATCH_STATUS
  class ExecStage
  {
    public:
-    ExecStage(const ComputeUnitParams* p, ComputeUnit &cu);
+    ExecStage(const ComputeUnitParams* p, ComputeUnit &cu,
+              ScheduleToExecute &from_schedule);
      ~ExecStage() { }
      void init();
      void exec();
@@ -97,17 +100,8 @@ class ExecStage
      void collectStatistics(enum STAT_STATUS stage, int unitId);
      void initStatistics();
      ComputeUnit &computeUnit;
+    ScheduleToExecute &fromSchedule;
  
-    // List of waves which will be dispatched to
-    // each execution resource. A FILLED implies
-    // dispatch list is non-empty and
-    // execution unit has something to execute
-    // this cycle. Currently, the dispatch list of
-    // an execution resource can hold only one wave because
-    // an execution resource can execute only one wave in a cycle.
-    // dispatchList is used to communicate between schedule
-    // and exec stage
-    std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> *dispatchList;
      bool lastTimeInstExecuted;
      bool thisTimeInstExecuted;
      bool instrExecuted;
diff --git a/src/gpu-compute/schedule_stage.cc b/src/gpu-compute/schedule_stage.cc

index e0600a623a357e66c0a74109e3ffd3a6c71420ff..fb52b6dd1c2e0a0f99f23681f3530c853d6803cc 100644 (file)
--- a/src/gpu-compute/schedule_stage.cc
+++ b/src/gpu-compute/schedule_stage.cc
@@ -43,8 +43,12 @@
  #include "gpu-compute/vector_register_file.hh"
  #include "gpu-compute/wavefront.hh"
  
-ScheduleStage::ScheduleStage(const ComputeUnitParams *p, ComputeUnit &cu)
-    : computeUnit(cu), _name(cu.name() + ".ScheduleStage"),
+ScheduleStage::ScheduleStage(const ComputeUnitParams *p, ComputeUnit &cu,
+                             ScoreboardCheckToSchedule &from_scoreboard_check,
+                             ScheduleToExecute &to_execute)
+    : computeUnit(cu), fromScoreboardCheck(from_scoreboard_check),
+      toExecute(to_execute),
+      _name(cu.name() + ".ScheduleStage"),
        vectorAluRdy(false), scalarAluRdy(false), scalarMemBusRdy(false),
        scalarMemIssueRdy(false), glbMemBusRdy(false), glbMemIssueRdy(false),
        locMemBusRdy(false), locMemIssueRdy(false)
@@ -70,14 +74,12 @@ void
  ScheduleStage::init()
  {
  
-    fatal_if(scheduler.size() != computeUnit.readyList.size(),
+    fatal_if(scheduler.size() != fromScoreboardCheck.numReadyLists(),
               "Scheduler should have same number of entries as CU's readyList");
      for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
-        scheduler[j].bindList(&computeUnit.readyList[j]);
+        scheduler[j].bindList(&fromScoreboardCheck.readyWFs(j));
      }
  
-    dispatchList = &computeUnit.dispatchList;
-
      assert(computeUnit.numVectorGlobalMemUnits == 1);
      assert(computeUnit.numVectorSharedMemUnits == 1);
  }
@@ -85,21 +87,21 @@ ScheduleStage::init()
  void
  ScheduleStage::exec()
  {
+    toExecute.reset();
+
      // Update readyList
      for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
-        // delete all ready wavefronts whose instruction buffers are now
-        // empty because the last instruction was executed
-        computeUnit.updateReadyList(j);
          /**
           * Remove any wave that already has an instruction present in SCH
           * waiting for RF reads to complete. This prevents out of order
           * execution within a wave.
           */
-        for (auto wIt = computeUnit.readyList.at(j).begin();
-             wIt != computeUnit.readyList.at(j).end();) {
+        fromScoreboardCheck.updateReadyList(j);
+        for (auto wIt = fromScoreboardCheck.readyWFs(j).begin();
+             wIt != fromScoreboardCheck.readyWFs(j).end();) {
              if (wavesInSch.find((*wIt)->wfDynId) != wavesInSch.end()) {
                  *wIt = nullptr;
-                wIt = computeUnit.readyList.at(j).erase(wIt);
+                wIt = fromScoreboardCheck.readyWFs(j).erase(wIt);
              } else {
                  wIt++;
              }
@@ -115,7 +117,7 @@ ScheduleStage::exec()
      int firstMemUnit = computeUnit.firstMemUnit();
      int lastMemUnit = computeUnit.lastMemUnit();
      for (int j = firstMemUnit; j <= lastMemUnit; j++) {
-        int readyListSize = computeUnit.readyList[j].size();
+        int readyListSize = fromScoreboardCheck.readyWFs(j).size();
          // If no wave is ready to be scheduled on the execution resource
          // then skip scheduling for this execution resource
          if (!readyListSize) {
@@ -125,11 +127,13 @@ ScheduleStage::exec()
          rdyListNotEmpty[j]++;
  
          // Pick a wave and attempt to add it to schList
-        Wavefront *w = scheduler[j].chooseWave();
-        if (!addToSchList(j, w)) {
+        Wavefront *wf = scheduler[j].chooseWave();
+        GPUDynInstPtr &gpu_dyn_inst = wf->instructionBuffer.front();
+        assert(gpu_dyn_inst);
+        if (!addToSchList(j, gpu_dyn_inst)) {
              // For waves not added to schList, increment count of cycles
              // this wave spends in SCH stage.
-            w->schCycles++;
+            wf->schCycles++;
              addToSchListStalls[j]++;
          }
      }
@@ -140,7 +144,7 @@ ScheduleStage::exec()
          if (j >= firstMemUnit && j <= lastMemUnit) {
              continue;
          }
-        int readyListSize = computeUnit.readyList[j].size();
+        int readyListSize = fromScoreboardCheck.readyWFs(j).size();
          // If no wave is ready to be scheduled on the execution resource
          // then skip scheduling for this execution resource
          if (!readyListSize) {
@@ -150,11 +154,13 @@ ScheduleStage::exec()
          rdyListNotEmpty[j]++;
  
          // Pick a wave and attempt to add it to schList
-        Wavefront *w = scheduler[j].chooseWave();
-        if (!addToSchList(j, w)) {
+        Wavefront *wf = scheduler[j].chooseWave();
+        GPUDynInstPtr &gpu_dyn_inst = wf->instructionBuffer.front();
+        assert(gpu_dyn_inst);
+        if (!addToSchList(j, gpu_dyn_inst)) {
              // For waves not added to schList, increment count of cycles
              // this wave spends in SCH stage.
-            w->schCycles++;
+            wf->schCycles++;
              addToSchListStalls[j]++;
          }
      }
@@ -191,30 +197,36 @@ ScheduleStage::exec()
  
  void
  ScheduleStage::doDispatchListTransition(int unitId, DISPATCH_STATUS s,
-                                        Wavefront *w)
+                                        const GPUDynInstPtr &gpu_dyn_inst)
  {
-    dispatchList->at(unitId).first = w;
-    dispatchList->at(unitId).second = s;
+    toExecute.dispatchTransition(gpu_dyn_inst, unitId, s);
+}
+
+void
+ScheduleStage::doDispatchListTransition(int unitId, DISPATCH_STATUS s)
+{
+    toExecute.dispatchTransition(unitId, s);
  }
  
  bool
-ScheduleStage::schedRfWrites(int exeType, Wavefront *w)
+ScheduleStage::schedRfWrites(int exeType, const GPUDynInstPtr &gpu_dyn_inst)
  {
-    GPUDynInstPtr ii = w->instructionBuffer.front();
-    assert(ii);
+    assert(gpu_dyn_inst);
+    Wavefront *wf = gpu_dyn_inst->wavefront();
      bool accessVrfWr = true;
-    if (!ii->isScalar()) {
-        accessVrfWr =
-            computeUnit.vrf[w->simdId]->canScheduleWriteOperands(w, ii);
+    if (!gpu_dyn_inst->isScalar()) {
+        accessVrfWr = computeUnit.vrf[wf->simdId]
+            ->canScheduleWriteOperands(wf, gpu_dyn_inst);
      }
-    bool accessSrfWr =
-        computeUnit.srf[w->simdId]->canScheduleWriteOperands(w, ii);
+    bool accessSrfWr = computeUnit.srf[wf->simdId]
+        ->canScheduleWriteOperands(wf, gpu_dyn_inst);
      bool accessRf = accessVrfWr && accessSrfWr;
      if (accessRf) {
-        if (!ii->isScalar()) {
-            computeUnit.vrf[w->simdId]->scheduleWriteOperands(w, ii);
+        if (!gpu_dyn_inst->isScalar()) {
+            computeUnit.vrf[wf->simdId]->scheduleWriteOperands(wf,
+                                                               gpu_dyn_inst);
          }
-        computeUnit.srf[w->simdId]->scheduleWriteOperands(w, ii);
+        computeUnit.srf[wf->simdId]->scheduleWriteOperands(wf, gpu_dyn_inst);
          return true;
      } else {
          rfAccessStalls[SCH_RF_ACCESS_NRDY]++;
@@ -226,8 +238,8 @@ ScheduleStage::schedRfWrites(int exeType, Wavefront *w)
          }
  
          // Increment stall counts for WF
-        w->schStalls++;
-        w->schRfAccessStalls++;
+        wf->schStalls++;
+        wf->schRfAccessStalls++;
      }
      return false;
  }
@@ -236,18 +248,18 @@ void
  ScheduleStage::scheduleRfDestOperands()
  {
      for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
-        if (dispatchList->at(j).second == EMPTY ||
-            dispatchList->at(j).second == SKIP) {
+        if (toExecute.dispatchStatus(j) == EMPTY ||
+            toExecute.dispatchStatus(j) == SKIP) {
              continue;
          }
  
-        assert(dispatchList->at(j).first);
-
          // get the wave on dispatch list and attempt to allocate write
          // resources in the RFs
-        Wavefront *w = dispatchList->at(j).first;
-        if (!schedRfWrites(j, w)) {
-            reinsertToSchList(j, w);
+        const GPUDynInstPtr &gpu_dyn_inst = toExecute.readyInst(j);
+        assert(gpu_dyn_inst);
+        Wavefront *wf = gpu_dyn_inst->wavefront();
+        if (!schedRfWrites(j, gpu_dyn_inst)) {
+            reinsertToSchList(j, gpu_dyn_inst);
              doDispatchListTransition(j, EMPTY);
              // if this is a flat inst, also transition the LM pipe to empty
              // Note: since FLAT/LM arbitration occurs before scheduling
@@ -255,51 +267,53 @@ ScheduleStage::scheduleRfDestOperands()
              // instruction lost arbitration, but would have been able to
              // pass the RF destination operand check here, and execute
              // instead of the FLAT.
-            if (w->instructionBuffer.front()->isFlat()) {
-                assert(dispatchList->at(w->localMem).second == SKIP);
-                doDispatchListTransition(w->localMem, EMPTY);
+            if (wf->instructionBuffer.front()->isFlat()) {
+                assert(toExecute.dispatchStatus(wf->localMem)
+                       == SKIP);
+                doDispatchListTransition(wf->localMem, EMPTY);
              }
          }
      }
  }
  
  bool
-ScheduleStage::addToSchList(int exeType, Wavefront *w)
+ScheduleStage::addToSchList(int exeType, const GPUDynInstPtr &gpu_dyn_inst)
  {
      // Attempt to add the wave to the schList if the VRF can support the
      // wave's next instruction
-    GPUDynInstPtr ii = w->instructionBuffer.front();
-    assert(ii);
+    assert(gpu_dyn_inst);
+    Wavefront *wf = gpu_dyn_inst->wavefront();
      bool accessVrf = true;
-    if (!ii->isScalar()) {
-        accessVrf =
-            computeUnit.vrf[w->simdId]->canScheduleReadOperands(w, ii);
+    if (!gpu_dyn_inst->isScalar()) {
+        accessVrf = computeUnit.vrf[wf->simdId]
+            ->canScheduleReadOperands(wf, gpu_dyn_inst);
      }
-    bool accessSrf =
-        computeUnit.srf[w->simdId]->canScheduleReadOperands(w, ii);
+    bool accessSrf = computeUnit.srf[wf->simdId]
+        ->canScheduleReadOperands(wf, gpu_dyn_inst);
      // If RFs can support instruction, add to schList in RFBUSY state,
      // place wave in wavesInSch and pipeMap, and schedule Rd/Wr operands
      // to the VRF
      bool accessRf = accessVrf && accessSrf;
      if (accessRf) {
          DPRINTF(GPUSched, "schList[%d]: Adding: SIMD[%d] WV[%d]: %d: %s\n",
-                exeType, w->simdId, w->wfDynId,
-                ii->seqNum(), ii->disassemble());
+                exeType, wf->simdId, wf->wfDynId,
+                gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
  
-        computeUnit.insertInPipeMap(w);
-        wavesInSch.emplace(w->wfDynId);
-        schList.at(exeType).push_back(std::make_pair(w, RFBUSY));
-        if (w->isOldestInstWaitcnt()) {
-            w->setStatus(Wavefront::S_WAITCNT);
+        computeUnit.insertInPipeMap(wf);
+        wavesInSch.emplace(wf->wfDynId);
+        schList.at(exeType).push_back(std::make_pair(gpu_dyn_inst, RFBUSY));
+        if (wf->isOldestInstWaitcnt()) {
+            wf->setStatus(Wavefront::S_WAITCNT);
          }
-        if (!ii->isScalar()) {
-            computeUnit.vrf[w->simdId]->scheduleReadOperands(w, ii);
+        if (!gpu_dyn_inst->isScalar()) {
+            computeUnit.vrf[wf->simdId]
+                ->scheduleReadOperands(wf, gpu_dyn_inst);
          }
-        computeUnit.srf[w->simdId]->scheduleReadOperands(w, ii);
+        computeUnit.srf[wf->simdId]->scheduleReadOperands(wf, gpu_dyn_inst);
  
          DPRINTF(GPUSched, "schList[%d]: Added: SIMD[%d] WV[%d]: %d: %s\n",
-                exeType, w->simdId, w->wfDynId,
-                ii->seqNum(), ii->disassemble());
+                exeType, wf->simdId, wf->wfDynId,
+                gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
          return true;
      } else {
          // Number of stall cycles due to RF access denied
@@ -314,28 +328,30 @@ ScheduleStage::addToSchList(int exeType, Wavefront *w)
          }
  
          // Increment stall counts for WF
-        w->schStalls++;
-        w->schRfAccessStalls++;
+        wf->schStalls++;
+        wf->schRfAccessStalls++;
          DPRINTF(GPUSched, "schList[%d]: Could not add: "
                  "SIMD[%d] WV[%d]: %d: %s\n",
-                exeType, w->simdId, w->wfDynId,
-                ii->seqNum(), ii->disassemble());
+                exeType, wf->simdId, wf->wfDynId,
+                gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
      }
      return false;
  }
  
  void
-ScheduleStage::reinsertToSchList(int exeType, Wavefront *w)
+ScheduleStage::reinsertToSchList(int exeType,
+                                 const GPUDynInstPtr &gpu_dyn_inst)
  {
      // Insert wave w into schList for specified exeType.
      // Wave is inserted in age order, with oldest wave being at the
      // front of the schList
+    assert(gpu_dyn_inst);
      auto schIter = schList.at(exeType).begin();
      while (schIter != schList.at(exeType).end()
-           && schIter->first->wfDynId < w->wfDynId) {
+           && schIter->first->wfDynId < gpu_dyn_inst->wfDynId) {
          schIter++;
      }
-    schList.at(exeType).insert(schIter, std::make_pair(w, RFREADY));
+    schList.at(exeType).insert(schIter, std::make_pair(gpu_dyn_inst, RFREADY));
  }
  
  void
@@ -377,46 +393,48 @@ ScheduleStage::checkMemResources()
  }
  
  bool
-ScheduleStage::dispatchReady(Wavefront *w)
+ScheduleStage::dispatchReady(const GPUDynInstPtr &gpu_dyn_inst)
  {
+    assert(gpu_dyn_inst);
+    Wavefront *wf = gpu_dyn_inst->wavefront();
      vectorAluRdy = false;
      scalarAluRdy = false;
      // check for available vector/scalar ALUs in the next cycle
-    if (computeUnit.vectorALUs[w->simdId].rdy(Cycles(1))) {
+    if (computeUnit.vectorALUs[wf->simdId].rdy(Cycles(1))) {
          vectorAluRdy = true;
      }
-    if (computeUnit.scalarALUs[w->scalarAlu].rdy(Cycles(1))) {
+    if (computeUnit.scalarALUs[wf->scalarAlu].rdy(Cycles(1))) {
          scalarAluRdy = true;
      }
-    GPUDynInstPtr ii = w->instructionBuffer.front();
  
-    if (ii->isNop()) {
+    if (gpu_dyn_inst->isNop()) {
          // S_NOP requires SALU. V_NOP requires VALU.
          // TODO: Scalar NOP does not require SALU in hardware,
          // and is executed out of IB directly.
-        if (ii->isScalar() && !scalarAluRdy) {
+        if (gpu_dyn_inst->isScalar() && !scalarAluRdy) {
              dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
              return false;
-        } else if (!ii->isScalar() && !vectorAluRdy) {
+        } else if (!gpu_dyn_inst->isScalar() && !vectorAluRdy) {
              dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
              return false;
          }
-    } else if (ii->isEndOfKernel()) {
+    } else if (gpu_dyn_inst->isEndOfKernel()) {
          // EndPgm instruction
-        if (ii->isScalar() && !scalarAluRdy) {
+        if (gpu_dyn_inst->isScalar() && !scalarAluRdy) {
              dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
              return false;
          }
-    } else if (ii->isBarrier() || ii->isBranch() || ii->isALU()) {
+    } else if (gpu_dyn_inst->isBarrier() || gpu_dyn_inst->isBranch()
+               || gpu_dyn_inst->isALU()) {
          // Barrier, Branch, or ALU instruction
-        if (ii->isScalar() && !scalarAluRdy) {
+        if (gpu_dyn_inst->isScalar() && !scalarAluRdy) {
              dispNrdyStalls[SCH_SCALAR_ALU_NRDY]++;
              return false;
-        } else if (!ii->isScalar() && !vectorAluRdy) {
+        } else if (!gpu_dyn_inst->isScalar() && !vectorAluRdy) {
              dispNrdyStalls[SCH_VECTOR_ALU_NRDY]++;
              return false;
          }
-    } else if (!ii->isScalar() && ii->isGlobalMem()) {
+    } else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isGlobalMem()) {
          // Vector Global Memory instruction
          bool rdy = true;
          if (!glbMemIssueRdy) {
@@ -427,18 +445,18 @@ ScheduleStage::dispatchReady(Wavefront *w)
              rdy = false;
              dispNrdyStalls[SCH_VECTOR_MEM_BUS_BUSY_NRDY]++;
          }
-        if (!computeUnit.globalMemoryPipe.coalescerReady(ii)) {
+        if (!computeUnit.globalMemoryPipe.coalescerReady(gpu_dyn_inst)) {
              rdy = false;
              dispNrdyStalls[SCH_VECTOR_MEM_COALESCER_NRDY]++;
          }
-        if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(ii)) {
+        if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(gpu_dyn_inst)) {
              rdy = false;
              dispNrdyStalls[SCH_VECTOR_MEM_REQS_NRDY]++;
          }
          if (!rdy) {
              return false;
          }
-    } else if (ii->isScalar() && ii->isGlobalMem()) {
+    } else if (gpu_dyn_inst->isScalar() && gpu_dyn_inst->isGlobalMem()) {
          // Scalar Global Memory instruction
          bool rdy = true;
          if (!scalarMemIssueRdy) {
@@ -449,16 +467,17 @@ ScheduleStage::dispatchReady(Wavefront *w)
              rdy = false;
              dispNrdyStalls[SCH_SCALAR_MEM_BUS_BUSY_NRDY]++;
          }
-        if (!computeUnit.scalarMemoryPipe.
-                isGMReqFIFOWrRdy(w->scalarRdGmReqsInPipe +
-                                 w->scalarWrGmReqsInPipe)) {
+        if (!computeUnit.scalarMemoryPipe
+            .isGMReqFIFOWrRdy(wf->scalarRdGmReqsInPipe
+            + wf->scalarWrGmReqsInPipe))
+        {
              rdy = false;
              dispNrdyStalls[SCH_SCALAR_MEM_FIFO_NRDY]++;
          }
          if (!rdy) {
              return false;
          }
-    } else if (!ii->isScalar() && ii->isLocalMem()) {
+    } else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isLocalMem()) {
          // Vector Local Memory instruction
          bool rdy = true;
          if (!locMemIssueRdy) {
@@ -470,14 +489,14 @@ ScheduleStage::dispatchReady(Wavefront *w)
              dispNrdyStalls[SCH_LOCAL_MEM_BUS_BUSY_NRDY]++;
          }
          if (!computeUnit.localMemoryPipe.
-                isLMReqFIFOWrRdy(w->rdLmReqsInPipe + w->wrLmReqsInPipe)) {
+                isLMReqFIFOWrRdy(wf->rdLmReqsInPipe + wf->wrLmReqsInPipe)) {
              rdy = false;
              dispNrdyStalls[SCH_LOCAL_MEM_FIFO_NRDY]++;
          }
          if (!rdy) {
              return false;
          }
-    } else if (!ii->isScalar() && ii->isFlat()) {
+    } else if (!gpu_dyn_inst->isScalar() && gpu_dyn_inst->isFlat()) {
          // Vector Flat memory instruction
          bool rdy = true;
          if (!glbMemIssueRdy || !locMemIssueRdy) {
@@ -488,16 +507,16 @@ ScheduleStage::dispatchReady(Wavefront *w)
              rdy = false;
              dispNrdyStalls[SCH_FLAT_MEM_BUS_BUSY_NRDY]++;
          }
-        if (!computeUnit.globalMemoryPipe.coalescerReady(ii)) {
+        if (!computeUnit.globalMemoryPipe.coalescerReady(gpu_dyn_inst)) {
              rdy = false;
              dispNrdyStalls[SCH_FLAT_MEM_COALESCER_NRDY]++;
          }
-        if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(ii)) {
+        if (!computeUnit.globalMemoryPipe.outstandingReqsCheck(gpu_dyn_inst)) {
              rdy = false;
              dispNrdyStalls[SCH_FLAT_MEM_REQS_NRDY]++;
          }
          if (!computeUnit.localMemoryPipe.
-                isLMReqFIFOWrRdy(w->rdLmReqsInPipe + w->wrLmReqsInPipe)) {
+                isLMReqFIFOWrRdy(wf->rdLmReqsInPipe + wf->wrLmReqsInPipe)) {
              rdy = false;
              dispNrdyStalls[SCH_FLAT_MEM_FIFO_NRDY]++;
          }
@@ -505,7 +524,8 @@ ScheduleStage::dispatchReady(Wavefront *w)
              return false;
          }
      } else {
-        panic("%s: unknown instr checked for readiness", ii->disassemble());
+        panic("%s: unknown instr checked for readiness",
+              gpu_dyn_inst->disassemble());
          return false;
      }
      dispNrdyStalls[SCH_RDY]++;
@@ -519,7 +539,7 @@ ScheduleStage::fillDispatchList()
      checkMemResources();
      // iterate execution resources
      for (int j = 0; j < computeUnit.numExeUnits(); j++) {
-        assert(dispatchList->at(j).second == EMPTY);
+        assert(toExecute.dispatchStatus(j) == EMPTY);
  
          // iterate waves in schList to pick one for dispatch
          auto schIter = schList.at(j).begin();
@@ -537,8 +557,7 @@ ScheduleStage::fillDispatchList()
  
                      // Acquire a coalescer token if it is a global mem
                      // operation.
-                    GPUDynInstPtr mp = schIter->first->
-                                       instructionBuffer.front();
+                    GPUDynInstPtr mp = schIter->first;
                      if (!mp->isMemSync() && !mp->isScalar() &&
                          (mp->isGlobalMem() || mp->isFlat())) {
                          computeUnit.globalMemoryPipe.acqCoalescerToken(mp);
@@ -553,10 +572,10 @@ ScheduleStage::fillDispatchList()
                  } else {
                      // Either another wave has been dispatched, or this wave
                      // was not ready, so it is stalled this cycle
-                    schIter->first->schStalls++;
+                    schIter->first->wavefront()->schStalls++;
                      if (!dispRdy) {
                          // not ready for dispatch, increment stall stat
-                        schIter->first->schResourceStalls++;
+                        schIter->first->wavefront()->schResourceStalls++;
                      }
                      // Examine next wave for this resource
                      schIter++;
@@ -589,28 +608,31 @@ ScheduleStage::arbitrateVrfToLdsBus()
          // get the GM pipe index in the dispatchList
          int gm_exe_unit = computeUnit.firstMemUnit() + i;
          // get the wave in the dispatchList
-        Wavefront *w = dispatchList->at(gm_exe_unit).first;
+        GPUDynInstPtr &gpu_dyn_inst
+            = toExecute.readyInst(gm_exe_unit);
          // If the WF is valid, ready to execute, and the instruction
          // is a flat access, arbitrate with the WF's assigned LM pipe
-        if (w && dispatchList->at(gm_exe_unit).second == EXREADY &&
-            w->instructionBuffer.front()->isFlat()) {
+        if (gpu_dyn_inst && toExecute.dispatchStatus(gm_exe_unit)
+            == EXREADY && gpu_dyn_inst->isFlat()) {
+            Wavefront *wf = gpu_dyn_inst->wavefront();
              // If the associated LM pipe also has a wave selected, block
              // that wave and let the Flat instruction issue. The WF in the
              // LM pipe is added back to the schList for consideration next
              // cycle.
-            if (dispatchList->at(w->localMem).second == EXREADY) {
-                reinsertToSchList(w->localMem,
-                                  dispatchList->at(w->localMem).first);
+            if (toExecute.dispatchStatus(wf->localMem) == EXREADY) {
+                reinsertToSchList(wf->localMem, toExecute
+                                  .readyInst(wf->localMem));
                  // Increment stall stats for LDS-VRF arbitration
                  ldsBusArbStalls++;
-                dispatchList->at(w->localMem).first->schLdsArbStalls++;
+                toExecute.readyInst(wf->localMem)
+                    ->wavefront()->schLdsArbStalls++;
              }
              // With arbitration of LM pipe complete, transition the
              // LM pipe to SKIP state in the dispatchList to inform EX stage
              // that a Flat instruction is executing next cycle
-            doDispatchListTransition(w->localMem, SKIP, w);
+            doDispatchListTransition(wf->localMem, SKIP, gpu_dyn_inst);
              DPRINTF(GPUSched, "dispatchList[%d]: arbVrfLds: "
-                    "EXREADY->SKIP\n", w->localMem);
+                    "EXREADY->SKIP\n", wf->localMem);
          }
      }
  }
@@ -623,41 +645,41 @@ ScheduleStage::checkRfOperandReadComplete()
      // selection for dispatchList
      for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
          for (auto &p : schList.at(j)) {
-            Wavefront *w = p.first;
-            assert(w);
+            const GPUDynInstPtr &gpu_dyn_inst = p.first;
+            assert(gpu_dyn_inst);
+            Wavefront *wf = gpu_dyn_inst->wavefront();
  
              // Increment the number of cycles the wave spends in the
              // SCH stage, since this loop visits every wave in SCH.
-            w->schCycles++;
+            wf->schCycles++;
  
-            GPUDynInstPtr ii = w->instructionBuffer.front();
              bool vrfRdy = true;
-            if (!ii->isScalar()) {
-                vrfRdy =
-                    computeUnit.vrf[w->simdId]->operandReadComplete(w, ii);
+            if (!gpu_dyn_inst->isScalar()) {
+                vrfRdy = computeUnit.vrf[wf->simdId]
+                    ->operandReadComplete(wf, gpu_dyn_inst);
              }
-            bool srfRdy =
-                computeUnit.srf[w->simdId]->operandReadComplete(w, ii);
+            bool srfRdy = computeUnit.srf[wf->simdId]
+                ->operandReadComplete(wf, gpu_dyn_inst);
              bool operandsReady = vrfRdy && srfRdy;
              if (operandsReady) {
-                DPRINTF(GPUSched,
-                        "schList[%d]: WV[%d] operands ready for: %d: %s\n",
-                         j, w->wfDynId, ii->seqNum(), ii->disassemble());
+                DPRINTF(GPUSched, "schList[%d]: WV[%d] operands ready for: "
+                        "%d: %s\n", j, wf->wfDynId, gpu_dyn_inst->seqNum(),
+                        gpu_dyn_inst->disassemble());
                  DPRINTF(GPUSched, "schList[%d]: WV[%d] RFBUSY->RFREADY\n",
-                        j, w->wfDynId);
+                        j, wf->wfDynId);
                  p.second = RFREADY;
              } else {
-                DPRINTF(GPUSched,
-                        "schList[%d]: WV[%d] operands not ready for: %d: %s\n",
-                         j, w->wfDynId, ii->seqNum(), ii->disassemble());
+                DPRINTF(GPUSched, "schList[%d]: WV[%d] operands not ready "
+                        "for: %d: %s\n", j, wf->wfDynId,
+                        gpu_dyn_inst->seqNum(), gpu_dyn_inst->disassemble());
  
                  // operands not ready yet, increment SCH stage stats
                  // aggregate to all wavefronts on the CU
                  p.second = RFBUSY;
  
                  // Increment stall stats
-                w->schStalls++;
-                w->schOpdNrdyStalls++;
+                wf->schStalls++;
+                wf->schOpdNrdyStalls++;
  
                  opdNrdyStalls[SCH_RF_OPD_NRDY]++;
                  if (!vrfRdy) {
@@ -678,23 +700,21 @@ ScheduleStage::reserveResources()
      exeUnitReservations.resize(computeUnit.numExeUnits(), false);
  
      for (int j = 0; j < computeUnit.numExeUnits(); ++j) {
-        Wavefront *dispatchedWave = dispatchList->at(j).first;
-        if (dispatchedWave) {
-            DISPATCH_STATUS s = dispatchList->at(j).second;
+        GPUDynInstPtr &gpu_dyn_inst = toExecute.readyInst(j);
+        if (gpu_dyn_inst) {
+            DISPATCH_STATUS s = toExecute.dispatchStatus(j);
+            Wavefront *wf = gpu_dyn_inst->wavefront();
              if (s == EMPTY) {
                  continue;
              } else if (s == EXREADY) {
                  // Wave is ready for execution
-                std::vector<int> execUnitIds =
-                    dispatchedWave->reserveResources();
-                GPUDynInstPtr ii = dispatchedWave->instructionBuffer.front();
+                std::vector<int> execUnitIds = wf->reserveResources();
  
-                if (!ii->isScalar()) {
-                    computeUnit.vrf[dispatchedWave->simdId]->
-                        dispatchInstruction(ii);
+                if (!gpu_dyn_inst->isScalar()) {
+                    computeUnit.vrf[wf->simdId]
+                        ->dispatchInstruction(gpu_dyn_inst);
                  }
-                computeUnit.srf[dispatchedWave->simdId]->
-                    dispatchInstruction(ii);
+                computeUnit.srf[wf->simdId]->dispatchInstruction(gpu_dyn_inst);
  
                  std::stringstream ss;
                  for (auto id : execUnitIds) {
@@ -702,16 +722,16 @@ ScheduleStage::reserveResources()
                  }
                  DPRINTF(GPUSched, "dispatchList[%d]: SIMD[%d] WV[%d]: %d: %s"
                          "    Reserving ExeRes[ %s]\n",
-                        j, dispatchedWave->simdId, dispatchedWave->wfDynId,
-                        ii->seqNum(), ii->disassemble(), ss.str());
+                        j, wf->simdId, wf->wfDynId, gpu_dyn_inst->seqNum(),
+                        gpu_dyn_inst->disassemble(), ss.str());
                  // mark the resources as reserved for this cycle
                  for (auto execUnitId : execUnitIds) {
                      panic_if(exeUnitReservations.at(execUnitId),
                               "Execution unit %d is reserved!!!\n"
                               "SIMD[%d] WV[%d]: %d: %s",
-                             execUnitId, dispatchedWave->simdId,
-                             dispatchedWave->wfDynId,
-                             ii->seqNum(), ii->disassemble());
+                             execUnitId, wf->simdId, wf->wfDynId,
+                             gpu_dyn_inst->seqNum(),
+                             gpu_dyn_inst->disassemble());
                      exeUnitReservations.at(execUnitId) = true;
                  }
  
@@ -720,18 +740,20 @@ ScheduleStage::reserveResources()
                  // that we've reserved a global and local memory unit. Thus,
                  // we need to mark the latter execution unit as not available.
                  if (execUnitIds.size() > 1) {
-                    int lm_exec_unit M5_VAR_USED = dispatchedWave->localMem;
-                    assert(dispatchList->at(lm_exec_unit).second == SKIP);
+                    int lm_exec_unit M5_VAR_USED = wf->localMem;
+                    assert(toExecute.dispatchStatus(lm_exec_unit)
+                           == SKIP);
                  }
              } else if (s == SKIP) {
                  // Shared Memory pipe reserved for FLAT instruction.
                  // Verify the GM pipe for this wave is ready to execute
                  // and the wave in the GM pipe is the same as the wave
                  // in the LM pipe
-                int gm_exec_unit M5_VAR_USED = dispatchedWave->globalMem;
-                assert(dispatchList->at(gm_exec_unit).first->wfDynId ==
-                       dispatchedWave->wfDynId);
-                assert(dispatchList->at(gm_exec_unit).second == EXREADY);
+                int gm_exec_unit M5_VAR_USED = wf->globalMem;
+                assert(wf->wfDynId == toExecute
+                       .readyInst(gm_exec_unit)->wfDynId);
+                assert(toExecute.dispatchStatus(gm_exec_unit)
+                       == EXREADY);
              }
          }
      }
diff --git a/src/gpu-compute/schedule_stage.hh b/src/gpu-compute/schedule_stage.hh

index 6ec4a8ddda5f5c5dab204ebf2f9aa86c7e11f241..c4dc28237845371ea77f2e4846acb2575a4a1ba2 100644 (file)
--- a/src/gpu-compute/schedule_stage.hh
+++ b/src/gpu-compute/schedule_stage.hh
@@ -41,8 +41,8 @@
  #include <vector>
  
  #include "gpu-compute/exec_stage.hh"
+#include "gpu-compute/misc.hh"
  #include "gpu-compute/scheduler.hh"
-#include "gpu-compute/scoreboard_check_stage.hh"
  
  // Schedule or execution arbitration stage.
  // From the pool of ready waves in the ready list,
@@ -50,6 +50,8 @@
  // The selection is made based on a scheduling policy
  
  class ComputeUnit;
+class ScheduleToExecute;
+class ScoreboardCheckToSchedule;
  class Wavefront;
  
  struct ComputeUnitParams;
@@ -57,7 +59,9 @@ struct ComputeUnitParams;
  class ScheduleStage
  {
    public:
-    ScheduleStage(const ComputeUnitParams *p, ComputeUnit &cu);
+    ScheduleStage(const ComputeUnitParams *p, ComputeUnit &cu,
+                  ScoreboardCheckToSchedule &from_scoreboard_check,
+                  ScheduleToExecute &to_execute);
      ~ScheduleStage();
      void init();
      void exec();
@@ -115,17 +119,13 @@ class ScheduleStage
  
    private:
      ComputeUnit &computeUnit;
+    ScoreboardCheckToSchedule &fromScoreboardCheck;
+    ScheduleToExecute &toExecute;
+
      // Each execution resource will have its own
      // scheduler and a dispatch list
      std::vector<Scheduler> scheduler;
  
-    // List of waves which will be dispatched to
-    // each execution resource.
-    // Currently, the dispatch list of
-    // an execution resource can hold only one wave because
-    // an execution resource can execute only one wave in a cycle.
-    std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> *dispatchList;
-
      // Stats
  
      // Number of cycles with empty (or not empty) readyList, per execution
@@ -171,10 +171,10 @@ class ScheduleStage
      const std::string _name;
  
      // called by exec() to add a wave to schList if the RFs can support it
-    bool addToSchList(int exeType, Wavefront *w);
+    bool addToSchList(int exeType, const GPUDynInstPtr &gpu_dyn_inst);
      // re-insert a wave to schList if wave lost arbitration
      // wave is inserted such that age order (oldest to youngest) is preserved
-    void reinsertToSchList(int exeType, Wavefront *w);
+    void reinsertToSchList(int exeType, const GPUDynInstPtr &gpu_dyn_inst);
      // check waves in schList to see if RF reads complete
      void checkRfOperandReadComplete();
      // check execution resources for readiness
@@ -189,7 +189,7 @@ class ScheduleStage
      // check status of memory pipes and RF to Mem buses
      void checkMemResources();
      // resource ready check called by fillDispatchList
-    bool dispatchReady(Wavefront *w);
+    bool dispatchReady(const GPUDynInstPtr &gpu_dyn_inst);
      // pick waves from schList and populate dispatchList with one wave
      // per EXE resource type
      void fillDispatchList();
@@ -199,12 +199,13 @@ class ScheduleStage
      // dispatchList
      void scheduleRfDestOperands();
      // invoked by scheduleRfDestOperands to schedule RF writes for a wave
-    bool schedRfWrites(int exeType, Wavefront *w);
+    bool schedRfWrites(int exeType, const GPUDynInstPtr &gpu_dyn_inst);
      // reserve resources for waves surviving arbitration in dispatchList
      void reserveResources();
  
      void doDispatchListTransition(int unitId, DISPATCH_STATUS s,
-                                  Wavefront *w = nullptr);
+                                  const GPUDynInstPtr &gpu_dyn_inst);
+    void doDispatchListTransition(int unitId, DISPATCH_STATUS s);
  
      // Set tracking wfDynId for each wave present in schedule stage
      // Used to allow only one instruction per wave in schedule
@@ -219,7 +220,7 @@ class ScheduleStage
      // The maximum number of waves per resource can be determined by either
      // the VRF/SRF availability or limits imposed by paremeters (to be added)
      // of the SCH stage or CU.
-    std::vector<std::deque<std::pair<Wavefront*, SCH_STATUS>>> schList;
+    std::vector<std::deque<std::pair<GPUDynInstPtr, SCH_STATUS>>> schList;
  };
  
  #endif // __SCHEDULE_STAGE_HH__
diff --git a/src/gpu-compute/scoreboard_check_stage.cc b/src/gpu-compute/scoreboard_check_stage.cc

index fb99e69122d1028da1c98aaa401bae7e4a826173..0e52d310cec808d6fec1151ab681a5fb9abcd165 100644 (file)
--- a/src/gpu-compute/scoreboard_check_stage.cc
+++ b/src/gpu-compute/scoreboard_check_stage.cc
@@ -45,22 +45,16 @@
  #include "params/ComputeUnit.hh"
  
  ScoreboardCheckStage::ScoreboardCheckStage(const ComputeUnitParams *p,
-                                           ComputeUnit &cu)
-    : computeUnit(cu), _name(cu.name() + ".ScoreboardCheckStage")
+                                           ComputeUnit &cu,
+                                           ScoreboardCheckToSchedule
+                                           &to_schedule)
+    : computeUnit(cu), toSchedule(to_schedule),
+      _name(cu.name() + ".ScoreboardCheckStage")
  {
  }
  
  ScoreboardCheckStage::~ScoreboardCheckStage()
  {
-    readyList.clear();
-}
-
-void
-ScoreboardCheckStage::init()
-{
-    for (int unitId = 0; unitId < computeUnit.numExeUnits(); ++unitId) {
-        readyList.push_back(&computeUnit.readyList[unitId]);
-    }
  }
  
  void
@@ -242,17 +236,13 @@ ScoreboardCheckStage::mapWaveToExeUnit(Wavefront *w)
  void
  ScoreboardCheckStage::exec()
  {
-    // reset the ready list for all execution units; it will be
-    // constructed every cycle since resource availability may change
-    for (int unitId = 0; unitId < computeUnit.numExeUnits(); ++unitId) {
-        // Reset wavefront pointers to nullptr so clear() on the vector
-        // does not accidentally destruct the wavefront object
-        for (int i = 0; i < readyList[unitId]->size(); i++) {
-            readyList[unitId]->at(i) = nullptr;
-        }
-        readyList[unitId]->clear();
-    }
-    // iterate over all WF slots across all vector ALUs
+    /**
+     * Reset the ready list for all execution units; ready list will be
+     * constructed every cycle because resource availability may change.
+     */
+    toSchedule.reset();
+
+    // Iterate over all WF slots across all SIMDs.
      for (int simdId = 0; simdId < computeUnit.numVectorALUs; ++simdId) {
          for (int wfSlot = 0; wfSlot < computeUnit.shader->n_wf; ++wfSlot) {
              // reset the ready status of each wavefront
@@ -269,7 +259,7 @@ ScoreboardCheckStage::exec()
                          curWave->simdId, curWave->wfDynId,
                          curWave->nextInstr()->seqNum(),
                          curWave->nextInstr()->disassemble());
-                readyList.at(exeResType)->push_back(curWave);
+                toSchedule.markWFReady(curWave, exeResType);
              }
              collectStatistics(rdyStatus);
          }
diff --git a/src/gpu-compute/scoreboard_check_stage.hh b/src/gpu-compute/scoreboard_check_stage.hh

index 6953c4c66f2c058f0b3794f49fea33b7621b1b3f..87582759bdadeea76ed41724954e15dd031ba817 100644 (file)
--- a/src/gpu-compute/scoreboard_check_stage.hh
+++ b/src/gpu-compute/scoreboard_check_stage.hh
@@ -43,6 +43,7 @@
  #include "sim/stats.hh"
  
  class ComputeUnit;
+class ScoreboardCheckToSchedule;
  class Wavefront;
  
  struct ComputeUnitParams;
@@ -70,9 +71,9 @@ class ScoreboardCheckStage
          NRDY_CONDITIONS
      };
  
-    ScoreboardCheckStage(const ComputeUnitParams* p, ComputeUnit &cu);
+    ScoreboardCheckStage(const ComputeUnitParams* p, ComputeUnit &cu,
+                         ScoreboardCheckToSchedule &to_schedule);
      ~ScoreboardCheckStage();
-    void init();
      void exec();
  
      // Stats related variables and methods
@@ -86,9 +87,12 @@ class ScoreboardCheckStage
                 int *exeResType, int wfSlot);
      ComputeUnit &computeUnit;
  
-    // List of waves which are ready to be scheduled.
-    // Each execution resource has a ready list
-    std::vector<std::vector<Wavefront*>*> readyList;
+    /**
+     * Interface between scoreboard check and schedule stages. Each
+     * cycle the scoreboard check stage populates this interface with
+     * information needed by the schedule stage.
+     */
+    ScoreboardCheckToSchedule &toSchedule;
  
      // Stats
      Stats::Vector stallCycles;
author	Tony Gutierrez <anthony.gutierrez@amd.com>
	Mon, 2 Jul 2018 19:56:22 +0000 (15:56 -0400)
committer	Anthony Gutierrez <anthony.gutierrez@amd.com>
	Fri, 17 Jul 2020 16:36:09 +0000 (16:36 +0000)
src/gpu-compute/SConscript		patch \| blob \| history
src/gpu-compute/comm.cc	[new file with mode: 0644]	patch \| blob
src/gpu-compute/comm.hh	[new file with mode: 0644]	patch \| blob
src/gpu-compute/compute_unit.cc		patch \| blob \| history
src/gpu-compute/compute_unit.hh		patch \| blob \| history
src/gpu-compute/exec_stage.cc		patch \| blob \| history
src/gpu-compute/exec_stage.hh		patch \| blob \| history
src/gpu-compute/schedule_stage.cc		patch \| blob \| history
src/gpu-compute/schedule_stage.hh		patch \| blob \| history
src/gpu-compute/scoreboard_check_stage.cc		patch \| blob \| history
src/gpu-compute/scoreboard_check_stage.hh		patch \| blob \| history