gpu-compute: support in-order data delivery in GM pipe

author Tony Gutierrez <anthony.gutierrez@amd.com>

Thu, 27 Oct 2016 02:48:28 +0000 (22:48 -0400)

committer Tony Gutierrez <anthony.gutierrez@amd.com>

Thu, 27 Oct 2016 02:48:28 +0000 (22:48 -0400)
author Tony Gutierrez <anthony.gutierrez@amd.com>
Thu, 27 Oct 2016 02:48:28 +0000 (22:48 -0400)
committer Tony Gutierrez <anthony.gutierrez@amd.com>
Thu, 27 Oct 2016 02:48:28 +0000 (22:48 -0400)
diff --git a/configs/example/apu_se.py b/configs/example/apu_se.py

index b8ec149d5d2918f0260e46380d5b4c4807a1981d..5ec3289d25f7fe69d6e319e791dab2fad03422f4 100644 (file)
--- a/configs/example/apu_se.py
+++ b/configs/example/apu_se.py
@@ -153,7 +153,9 @@ parser.add_option('--fast-forward-pseudo-op', action='store_true',
                    help = 'fast forward using kvm until the m5_switchcpu'
                    ' pseudo-op is encountered, then switch cpus. subsequent'
                    ' m5_switchcpu pseudo-ops will toggle back and forth')
-
+parser.add_option('--outOfOrderDataDelivery', action='store_true',
+                  default=False, help='enable OoO data delivery in the GM'
+                  ' pipeline')
  
  Ruby.define_options(parser)
  
@@ -248,7 +250,9 @@ for i in xrange(n_cu):
                                       localDataStore = \
                                       LdsState(banks = options.numLdsBanks,
                                                bankConflictPenalty = \
-                                              options.ldsBankConflictPenalty)))
+                                              options.ldsBankConflictPenalty),
+                                     out_of_order_data_delivery =
+                                             options.outOfOrderDataDelivery))
      wavefronts = []
      vrfs = []
      for j in xrange(options.simds_per_cu):
diff --git a/src/arch/hsail/insts/decl.hh b/src/arch/hsail/insts/decl.hh

index c40411ace14aa54f9ac44d6ecc1a5177d2b328a2..4c0bc9ce1efc4daf7d5d30a1accd382052057241 100644 (file)
--- a/src/arch/hsail/insts/decl.hh
+++ b/src/arch/hsail/insts/decl.hh
@@ -1082,7 +1082,7 @@ namespace HsailISA
  
                  gpuDynInst->useContinuation = false;
                  GlobalMemPipeline* gmp = &(w->computeUnit->globalMemoryPipe);
-                gmp->getGMReqFIFO().push(gpuDynInst);
+                gmp->issueRequest(gpuDynInst);
  
                  w->wrGmReqsInPipe--;
                  w->rdGmReqsInPipe--;
diff --git a/src/arch/hsail/insts/mem_impl.hh b/src/arch/hsail/insts/mem_impl.hh

index c175f278250fe96256021571827b2b5bd8af3e66..dbda6643bac72f27695aa184812fdd019272e4eb 100644 (file)
--- a/src/arch/hsail/insts/mem_impl.hh
+++ b/src/arch/hsail/insts/mem_impl.hh
@@ -263,7 +263,7 @@ namespace HsailISA
                  }
              }
  
-            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+            w->computeUnit->globalMemoryPipe.issueRequest(m);
              w->outstandingReqsRdGm++;
              w->rdGmReqsInPipe--;
              break;
@@ -288,7 +288,7 @@ namespace HsailISA
                  }
              }
  
-            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+            w->computeUnit->globalMemoryPipe.issueRequest(m);
              w->outstandingReqsRdGm++;
              w->rdGmReqsInPipe--;
              break;
@@ -312,7 +312,7 @@ namespace HsailISA
                  }
              }
  
-            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+            w->computeUnit->globalMemoryPipe.issueRequest(m);
              w->outstandingReqsRdGm++;
              w->rdGmReqsInPipe--;
              break;
@@ -330,7 +330,7 @@ namespace HsailISA
                      }
                  }
              }
-            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+            w->computeUnit->globalMemoryPipe.issueRequest(m);
              w->outstandingReqsRdGm++;
              w->rdGmReqsInPipe--;
              break;
@@ -440,7 +440,7 @@ namespace HsailISA
                  }
              }
  
-            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+            w->computeUnit->globalMemoryPipe.issueRequest(m);
              w->outstandingReqsWrGm++;
              w->wrGmReqsInPipe--;
              break;
@@ -460,7 +460,7 @@ namespace HsailISA
                  }
              }
  
-            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+            w->computeUnit->globalMemoryPipe.issueRequest(m);
              w->outstandingReqsWrGm++;
              w->wrGmReqsInPipe--;
              break;
@@ -486,7 +486,7 @@ namespace HsailISA
                  }
              }
  
-            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+            w->computeUnit->globalMemoryPipe.issueRequest(m);
              w->outstandingReqsWrGm++;
              w->wrGmReqsInPipe--;
              break;
@@ -591,7 +591,7 @@ namespace HsailISA
              m->latency.set(w->computeUnit->shader->ticks(64));
              m->pipeId = GLBMEM_PIPE;
  
-            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+            w->computeUnit->globalMemoryPipe.issueRequest(m);
              w->outstandingReqsWrGm++;
              w->wrGmReqsInPipe--;
              w->outstandingReqsRdGm++;
diff --git a/src/arch/hsail/insts/pseudo_inst.cc b/src/arch/hsail/insts/pseudo_inst.cc

index bfffb7d8f66f744a86766e418dceb99c551fb2d1..580328aedfb49e494c071d2a056e28b8930fd094 100644 (file)
--- a/src/arch/hsail/insts/pseudo_inst.cc
+++ b/src/arch/hsail/insts/pseudo_inst.cc
@@ -648,7 +648,7 @@ namespace HsailISA
  
          m->pipeId = GLBMEM_PIPE;
          m->latency.set(w->computeUnit->shader->ticks(64));
-        w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+        w->computeUnit->globalMemoryPipe.issueRequest(m);
          w->outstandingReqsWrGm++;
          w->wrGmReqsInPipe--;
          w->outstandingReqsRdGm++;
@@ -688,7 +688,7 @@ namespace HsailISA
  
          m->pipeId = GLBMEM_PIPE;
          m->latency.set(w->computeUnit->shader->ticks(64));
-        w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+        w->computeUnit->globalMemoryPipe.issueRequest(m);
          w->outstandingReqsWrGm++;
          w->wrGmReqsInPipe--;
          w->outstandingReqsRdGm++;
@@ -727,7 +727,7 @@ namespace HsailISA
  
          m->pipeId = GLBMEM_PIPE;
          m->latency.set(w->computeUnit->shader->ticks(1));
-        w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+        w->computeUnit->globalMemoryPipe.issueRequest(m);
          w->outstandingReqsRdGm++;
          w->rdGmReqsInPipe--;
          w->outstandingReqs++;
diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py

index b672f616cf21a2b209256516e93ae08840e38e5d..0cb9e76a41fde6f23b494ea9a003340628fee500 100644 (file)
--- a/src/gpu-compute/GPU.py
+++ b/src/gpu-compute/GPU.py
@@ -135,6 +135,8 @@ class ComputeUnit(MemObject):
  
      vector_register_file = VectorParam.VectorRegisterFile("Vector register "\
                                                            "file")
+    out_of_order_data_delivery = Param.Bool(False, "enable OoO data delivery"
+                                            " in the GM pipeline")
  
  class Shader(ClockedObject):
      type = 'Shader'
diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc

index 93cffbe1ebdd77e0e0807f0b4fd2002904b8ad94..ffa5243d2f9a4924f757f0b287ff1495c65298cb 100644 (file)
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -1033,17 +1033,7 @@ ComputeUnit::DataPort::MemRespEvent::process()
                  if (gpuDynInst->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST)
                      gpuDynInst->statusVector.clear();
  
-                if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
-                    assert(compute_unit->globalMemoryPipe.isGMLdRespFIFOWrRdy());
-
-                    compute_unit->globalMemoryPipe.getGMLdRespFIFO()
-                        .push(gpuDynInst);
-                } else {
-                    assert(compute_unit->globalMemoryPipe.isGMStRespFIFOWrRdy());
-
-                    compute_unit->globalMemoryPipe.getGMStRespFIFO()
-                        .push(gpuDynInst);
-                }
+                compute_unit->globalMemoryPipe.handleResponse(gpuDynInst);
  
                  DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n",
                          compute_unit->cu_id, gpuDynInst->simdId,
diff --git a/src/gpu-compute/global_memory_pipeline.cc b/src/gpu-compute/global_memory_pipeline.cc

index f48af5a6fdfe4bf7f5dfc4a5961d6025f629d0f5..7583ebb9b94c8d8e6dea4dcd117ff50737413bae 100644 (file)
--- a/src/gpu-compute/global_memory_pipeline.cc
+++ b/src/gpu-compute/global_memory_pipeline.cc
@@ -45,7 +45,8 @@
  
  GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams* p) :
      computeUnit(nullptr), gmQueueSize(p->global_mem_queue_size),
-    inflightStores(0), inflightLoads(0)
+    outOfOrderDataDelivery(p->out_of_order_data_delivery), inflightStores(0),
+    inflightLoads(0)
  {
  }
  
@@ -61,8 +62,7 @@ void
  GlobalMemPipeline::exec()
  {
      // apply any returned global memory operations
-    GPUDynInstPtr m = !gmReturnedLoads.empty() ? gmReturnedLoads.front() :
-        !gmReturnedStores.empty() ? gmReturnedStores.front() : nullptr;
+    GPUDynInstPtr m = getNextReadyResp();
  
      bool accessVrf = true;
      Wavefront *w = nullptr;
@@ -74,30 +74,19 @@ GlobalMemPipeline::exec()
  
          accessVrf =
              w->computeUnit->vrf[w->simdId]->
-            vrfOperandAccessReady(m->seqNum(), w, m,
-                                  VrfAccessType::WRITE);
+                vrfOperandAccessReady(m->seqNum(), w, m, VrfAccessType::WRITE);
      }
  
-    if ((!gmReturnedStores.empty() || !gmReturnedLoads.empty()) &&
-        m->latency.rdy() && computeUnit->glbMemToVrfBus.rdy() &&
+    if (m && m->latency.rdy() && computeUnit->glbMemToVrfBus.rdy() &&
          accessVrf && m->statusBitVector == VectorMask(0) &&
          (computeUnit->shader->coissue_return ||
-         computeUnit->wfWait.at(m->pipeId).rdy())) {
+        computeUnit->wfWait.at(m->pipeId).rdy())) {
  
          w = m->wavefront();
  
          m->completeAcc(m);
  
-        if (m->isLoad() || m->isAtomic()) {
-            gmReturnedLoads.pop();
-            assert(inflightLoads > 0);
-            --inflightLoads;
-        } else {
-            assert(m->isStore());
-            gmReturnedStores.pop();
-            assert(inflightStores > 0);
-            --inflightStores;
-        }
+        completeRequest(m);
  
          // Decrement outstanding register count
          computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
@@ -129,15 +118,30 @@ GlobalMemPipeline::exec()
              } else {
                  ++inflightLoads;
              }
-        } else {
+        } else if (mp->isStore()) {
              if (inflightStores >= gmQueueSize) {
                  return;
-            } else if (mp->isStore()) {
+            } else {
                  ++inflightStores;
              }
          }
  
          mp->initiateAcc(mp);
+
+        if (!outOfOrderDataDelivery && !mp->isMemFence()) {
+            /**
+             * if we are not in out-of-order data delivery mode
+             * then we keep the responses sorted in program order.
+             * in order to do so we must reserve an entry in the
+             * resp buffer before we issue the request to the mem
+             * system. mem fence requests will not be stored here
+             * because once they are issued from the GM pipeline,
+             * they do not send any response back to it.
+             */
+            gmOrderedRespBuffer.insert(std::make_pair(mp->seqNum(),
+                std::make_pair(mp, false)));
+        }
+
          gmIssuedRequests.pop();
  
          DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping 0 mem_op = \n",
@@ -145,6 +149,86 @@ GlobalMemPipeline::exec()
      }
  }
  
+GPUDynInstPtr
+GlobalMemPipeline::getNextReadyResp()
+{
+    if (outOfOrderDataDelivery) {
+        if (!gmReturnedLoads.empty()) {
+            return gmReturnedLoads.front();
+        } else if (!gmReturnedStores.empty()) {
+            return gmReturnedStores.front();
+        }
+    } else {
+        if (!gmOrderedRespBuffer.empty()) {
+            auto mem_req = gmOrderedRespBuffer.begin();
+
+            if (mem_req->second.second) {
+                return mem_req->second.first;
+            }
+        }
+    }
+
+    return nullptr;
+}
+
+void
+GlobalMemPipeline::completeRequest(GPUDynInstPtr gpuDynInst)
+{
+    if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
+        assert(inflightLoads > 0);
+        --inflightLoads;
+    } else if (gpuDynInst->isStore()) {
+        assert(inflightStores > 0);
+        --inflightStores;
+    }
+
+    if (outOfOrderDataDelivery) {
+        if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
+            assert(!gmReturnedLoads.empty());
+            gmReturnedLoads.pop();
+        } else if (gpuDynInst->isStore()) {
+            assert(!gmReturnedStores.empty());
+            gmReturnedStores.pop();
+        }
+    } else {
+        // we should only pop the oldest request, and it
+        // should be marked as done if we are here
+        assert(gmOrderedRespBuffer.begin()->first == gpuDynInst->seqNum());
+        assert(gmOrderedRespBuffer.begin()->second.first == gpuDynInst);
+        assert(gmOrderedRespBuffer.begin()->second.second);
+        // remove this instruction from the buffer by its
+        // unique seq ID
+        gmOrderedRespBuffer.erase(gpuDynInst->seqNum());
+    }
+}
+
+void
+GlobalMemPipeline::issueRequest(GPUDynInstPtr gpuDynInst)
+{
+    gmIssuedRequests.push(gpuDynInst);
+}
+
+void
+GlobalMemPipeline::handleResponse(GPUDynInstPtr gpuDynInst)
+{
+    if (outOfOrderDataDelivery) {
+        if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
+            assert(isGMLdRespFIFOWrRdy());
+            gmReturnedLoads.push(gpuDynInst);
+        } else {
+            assert(isGMStRespFIFOWrRdy());
+            gmReturnedStores.push(gpuDynInst);
+        }
+    } else {
+        auto mem_req = gmOrderedRespBuffer.find(gpuDynInst->seqNum());
+        // if we are getting a response for this mem request,
+        // then it ought to already be in the ordered response
+        // buffer
+        assert(mem_req != gmOrderedRespBuffer.end());
+        mem_req->second.second = true;
+    }
+}
+
  void
  GlobalMemPipeline::regStats()
  {
diff --git a/src/gpu-compute/global_memory_pipeline.hh b/src/gpu-compute/global_memory_pipeline.hh

index 368a150791d943422481e207a43d999fe7731322..d10b7c1a2801b52c5f7b1d6e6d9178af38dad8db 100644 (file)
--- a/src/gpu-compute/global_memory_pipeline.hh
+++ b/src/gpu-compute/global_memory_pipeline.hh
@@ -62,10 +62,40 @@ class GlobalMemPipeline
      void init(ComputeUnit *cu);
      void exec();
  
-    std::queue<GPUDynInstPtr> &getGMReqFIFO() { return gmIssuedRequests; }
      std::queue<GPUDynInstPtr> &getGMStRespFIFO() { return gmReturnedStores; }
      std::queue<GPUDynInstPtr> &getGMLdRespFIFO() { return gmReturnedLoads; }
  
+    /**
+     * find the next ready response to service. for OoO mode we
+     * simply pop the oldest (based on when the response was
+     * received) response in the response FIFOs. for in-order mode
+     * we pop the oldest (in program order) response, and only if
+     * it is marked as done.
+     */
+    GPUDynInstPtr getNextReadyResp();
+
+    /**
+     * once a memory request is finished we remove it from the
+     * buffer. this method determines which response buffer
+     * we're using based on the mode (in-order vs. OoO).
+     */
+    void completeRequest(GPUDynInstPtr gpuDynInst);
+
+    /**
+     * issues a request to the pipeline - i.e., enqueue it
+     * in the request buffer.
+     */
+    void issueRequest(GPUDynInstPtr gpuDynInst);
+
+    /**
+     * this method handles responses sent to this GM pipeline by the
+     * CU. in the case of in-order delivery it simply marks the request
+     * as done in the ordered buffer to indicate that the request is
+     * finished. for out-of-order data delivery, the requests are enqueued
+     * (in the order in which they are received) in the response FIFOs.
+     */
+    void handleResponse(GPUDynInstPtr gpuDynInst);
+
      bool
      isGMLdRespFIFOWrRdy() const
      {
@@ -97,6 +127,7 @@ class GlobalMemPipeline
      ComputeUnit *computeUnit;
      std::string _name;
      int gmQueueSize;
+    bool outOfOrderDataDelivery;
  
      // number of cycles of delaying the update of a VGPR that is the
      // target of a load instruction (or the load component of an atomic)
@@ -111,6 +142,22 @@ class GlobalMemPipeline
      // The size of global memory.
      int globalMemSize;
  
+    /*
+     * this buffer holds the memory responses when in-order data
+     * delivery is used - the responses are ordered by their unique
+     * sequence number, which is monotonically increasing. when a
+     * memory request returns its "done" flag is set to true. during
+     * each tick the GM pipeline will check if the oldest request
+     * is finished, and if so it will be removed from the queue.
+     *
+     * key:   memory instruction's sequence ID
+     *
+     * value: pair holding the instruction pointer and a bool that
+     *        is used to indicate whether or not the request has
+     *        completed
+     */
+    std::map<uint64_t, std::pair<GPUDynInstPtr, bool>> gmOrderedRespBuffer;
+
      // Global Memory Request FIFO: all global memory requests
      // are issued to this FIFO from the memory pipelines
      std::queue<GPUDynInstPtr> gmIssuedRequests;
author	Tony Gutierrez <anthony.gutierrez@amd.com>
	Thu, 27 Oct 2016 02:48:28 +0000 (22:48 -0400)
committer	Tony Gutierrez <anthony.gutierrez@amd.com>
	Thu, 27 Oct 2016 02:48:28 +0000 (22:48 -0400)
configs/example/apu_se.py		patch \| blob \| history
src/arch/hsail/insts/decl.hh		patch \| blob \| history
src/arch/hsail/insts/mem_impl.hh		patch \| blob \| history
src/arch/hsail/insts/pseudo_inst.cc		patch \| blob \| history
src/gpu-compute/GPU.py		patch \| blob \| history
src/gpu-compute/compute_unit.cc		patch \| blob \| history
src/gpu-compute/global_memory_pipeline.cc		patch \| blob \| history
src/gpu-compute/global_memory_pipeline.hh		patch \| blob \| history