gpu-compute: Remove WFContext

author Alexandru Dutu <alexandru.dutu@amd.com>

Fri, 16 Sep 2016 16:26:03 +0000 (12:26 -0400)

committer Alexandru Dutu <alexandru.dutu@amd.com>

Fri, 16 Sep 2016 16:26:03 +0000 (12:26 -0400)
author Alexandru Dutu <alexandru.dutu@amd.com>
Fri, 16 Sep 2016 16:26:03 +0000 (12:26 -0400)
committer Alexandru Dutu <alexandru.dutu@amd.com>
Fri, 16 Sep 2016 16:26:03 +0000 (12:26 -0400)
diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc

index 5ec061172aa3518ec51b04c82ec8eebfc0ad65bb..83e2414db7a278c918b53650cc6fdabaa4590e5c 100644 (file)
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -192,50 +192,6 @@ ComputeUnit::FillKernelState(Wavefront *w, NDRange *ndr)
      w->roSize = ndr->q.roMemTotal;
  }
  
-void
-ComputeUnit::InitializeWFContext(WFContext *wfCtx, NDRange *ndr, int cnt,
-                        int trueWgSize[], int trueWgSizeTotal,
-                        LdsChunk *ldsChunk, uint64_t origSpillMemStart)
-{
-    wfCtx->cnt = cnt;
-
-    VectorMask init_mask;
-    init_mask.reset();
-
-    for (int k = 0; k < wfSize(); ++k) {
-        if (k + cnt * wfSize() < trueWgSizeTotal)
-            init_mask[k] = 1;
-    }
-
-    wfCtx->init_mask = init_mask.to_ullong();
-    wfCtx->exec_mask = init_mask.to_ullong();
-
-    wfCtx->bar_cnt.resize(wfSize(), 0);
-
-    wfCtx->max_bar_cnt = 0;
-    wfCtx->old_barrier_cnt = 0;
-    wfCtx->barrier_cnt = 0;
-
-    wfCtx->privBase = ndr->q.privMemStart;
-    ndr->q.privMemStart += ndr->q.privMemPerItem * wfSize();
-
-    wfCtx->spillBase = ndr->q.spillMemStart;
-    ndr->q.spillMemStart += ndr->q.spillMemPerItem * wfSize();
-
-    wfCtx->pc = 0;
-    wfCtx->rpc = UINT32_MAX;
-
-    // set the wavefront context to have a pointer to this section of the LDS
-    wfCtx->ldsChunk = ldsChunk;
-
-    // WG state
-    wfCtx->wg_id = ndr->globalWgId;
-    wfCtx->barrier_id = barrier_id;
-
-    // Kernel wide state
-    wfCtx->ndr = ndr;
-}
-
  void
  ComputeUnit::updateEvents() {
  
@@ -264,19 +220,25 @@ ComputeUnit::updateEvents() {
  
  
  void
-ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
-                     int trueWgSizeTotal)
+ComputeUnit::StartWF(Wavefront *w, int trueWgSize[], int trueWgSizeTotal,
+                     int cnt, LdsChunk *ldsChunk, NDRange *ndr)
  {
      static int _n_wave = 0;
-    int cnt = wfCtx->cnt;
-    NDRange *ndr = wfCtx->ndr;
  
      // Fill in Kernel state
      FillKernelState(w, ndr);
  
+    VectorMask init_mask;
+    init_mask.reset();
+
+    for (int k = 0; k < wfSize(); ++k) {
+        if (k + cnt * wfSize() < trueWgSizeTotal)
+            init_mask[k] = 1;
+    }
+
      w->kern_id = ndr->dispatchId;
      w->dynwaveid = cnt;
-    w->init_mask = wfCtx->init_mask;
+    w->init_mask = init_mask.to_ullong();
  
      for (int k = 0; k < wfSize(); ++k) {
          w->workitemid[0][k] = (k+cnt*wfSize()) % trueWgSize[0];
@@ -290,32 +252,34 @@ ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
              w->workitemid[0][k];
      }
  
-    w->old_barrier_cnt = wfCtx->old_barrier_cnt;
-    w->barrier_cnt = wfCtx->barrier_cnt;
      w->barrier_slots = divCeil(trueWgSizeTotal, wfSize());
  
-    for (int i = 0; i < wfSize(); ++i) {
-        w->bar_cnt[i] = wfCtx->bar_cnt[i];
-    }
+    w->bar_cnt.resize(wfSize(), 0);
+
+    w->max_bar_cnt = 0;
+    w->old_barrier_cnt = 0;
+    w->barrier_cnt = 0;
+
+    w->privBase = ndr->q.privMemStart;
+    ndr->q.privMemStart += ndr->q.privMemPerItem * wfSize();
  
-    w->max_bar_cnt = wfCtx->max_bar_cnt;
-    w->privBase = wfCtx->privBase;
-    w->spillBase = wfCtx->spillBase;
+    w->spillBase = ndr->q.spillMemStart;
+    ndr->q.spillMemStart += ndr->q.spillMemPerItem * wfSize();
  
-    w->pushToReconvergenceStack(wfCtx->pc, wfCtx->rpc, wfCtx->exec_mask);
+    w->pushToReconvergenceStack(0, UINT32_MAX, init_mask.to_ullong());
  
      // WG state
-    w->wg_id = wfCtx->wg_id;
-    w->dispatchid = wfCtx->ndr->dispatchId;
+    w->wg_id = ndr->globalWgId;
+    w->dispatchid = ndr->dispatchId;
      w->workgroupid[0] = w->wg_id % ndr->numWg[0];
      w->workgroupid[1] = (w->wg_id / ndr->numWg[0]) % ndr->numWg[1];
      w->workgroupid[2] = w->wg_id / (ndr->numWg[0] * ndr->numWg[1]);
  
-    w->barrier_id = wfCtx->barrier_id;
+    w->barrier_id = barrier_id;
      w->stalledAtBarrier = false;
  
-    // move this from the context into the actual wavefront
-    w->ldsChunk = wfCtx->ldsChunk;
+    // set the wavefront to have a pointer to this section of the LDS
+    w->ldsChunk = ldsChunk;
  
      int32_t refCount M5_VAR_USED =
                      lds.increaseRefCounter(w->dispatchid, w->wg_id);
@@ -340,7 +304,6 @@ ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
              "WF[%d][%d]\n", _n_wave, barrier_id, cu_id, w->simdId, w->wfSlotId);
  
      w->start(++_n_wave, ndr->q.code_ptr);
-    wfCtx->bar_cnt.clear();
  }
  
  void
@@ -376,7 +339,6 @@ ComputeUnit::StartWorkgroup(NDRange *ndr)
          trueWgSizeTotal *= trueWgSize[d];
      }
  
-    uint64_t origSpillMemStart = ndr->q.spillMemStart;
      // calculate the number of 32-bit vector registers required by wavefront
      int vregDemand = ndr->q.sRegCount + (2 * ndr->q.dRegCount);
      int cnt = 0;
@@ -403,12 +365,7 @@ ComputeUnit::StartWorkgroup(NDRange *ndr)
              w->reservedVectorRegs = normSize;
              vectorRegsReserved[m % numSIMDs] += w->reservedVectorRegs;
  
-            WFContext wfCtx;
-
-            InitializeWFContext(&wfCtx, ndr, cnt, trueWgSize, trueWgSizeTotal,
-                                ldsChunk, origSpillMemStart);
-
-            StartWF(w, &wfCtx, trueWgSize, trueWgSizeTotal);
+            StartWF(w, trueWgSize, trueWgSizeTotal, cnt, ldsChunk, ndr);
              ++cnt;
          }
      }
diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh

index a234cbeb56886ab1f8b0285c36c4f37181ba8e16..34b710cd6ff8a29e359def0bc504bae81dc424f8 100644 (file)
--- a/src/gpu-compute/compute_unit.hh
+++ b/src/gpu-compute/compute_unit.hh
@@ -256,12 +256,8 @@ class ComputeUnit : public MemObject
      void fetch(PacketPtr pkt, Wavefront *wavefront);
      void FillKernelState(Wavefront *w, NDRange *ndr);
  
-    void StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
-                 int trueWgSizeTotal);
-
-    void InitializeWFContext(WFContext *wfCtx, NDRange *ndr, int cnt,
-                             int trueWgSize[], int trueWgSizeTotal,
-                             LdsChunk *ldsChunk, uint64_t origSpillMemStart);
+    void StartWF(Wavefront *w, int trueWgSize[], int trueWgSizeTotal,
+                     int cnt, LdsChunk *ldsChunk, NDRange *ndr);
  
      void StartWorkgroup(NDRange *ndr);
      int ReadyWorkgroup(NDRange *ndr);
diff --git a/src/gpu-compute/qstruct.hh b/src/gpu-compute/qstruct.hh

index 7bca757b83c3bb7eef7270d659db5783235dd284..b400dc0eec62892f48065df85afcbca28956b5df 100644 (file)
--- a/src/gpu-compute/qstruct.hh
+++ b/src/gpu-compute/qstruct.hh
@@ -95,59 +95,6 @@ struct HsaQueueEntry
      uint16_t num_args;
  };
  
-// State used to start (or restart) a WF
-struct WFContext
-{
-    // 32 bit values
-    // barrier state
-    std::vector<int> bar_cnt;
-
-    // id (which WF in the WG)
-    int cnt;
-
-    // more barrier state
-    int max_bar_cnt;
-    int old_barrier_cnt;
-    int barrier_cnt;
-
-    // More Program Counter Stuff
-    uint32_t pc;
-
-    // Program counter of the immediate post-dominator instruction
-    uint32_t rpc;
-
-    // WG wide state (I don't see how to avoid redundancy here)
-    int cu_id;
-    uint32_t wg_id;
-    uint32_t barrier_id;
-
-    // 64 bit values (these values depend on the wavefront size)
-    // masks
-    uint64_t init_mask;
-    uint64_t exec_mask;
-
-    // private memory;
-    Addr privBase;
-    Addr spillBase;
-
-    LdsChunk *ldsChunk;
-
-    /*
-     * Kernel wide state
-     * This is a hack. This state should be moved through simulated memory
-     * during a yield. Though not much is being used here, so it's probably
-     * probably not a big deal.
-     *
-     * Just to add to this comment... The ndr is derived from simulated
-     * memory when the cl-runtime allocates an HsaQueueEntry and populates it
-     * for a kernel launch. So in theory the runtime should be able to keep
-     * that state around. Then a WF can reference it upon restart to derive
-     * kernel wide state. The runtime can deallocate the state when the
-     * kernel completes.
-     */
-    NDRange *ndr;
-};
-
  // State that needs to be passed between the simulation and simulated app, a
  // pointer to this struct can be passed through the depends field in the
  // HsaQueueEntry struct
author	Alexandru Dutu <alexandru.dutu@amd.com>
	Fri, 16 Sep 2016 16:26:03 +0000 (12:26 -0400)
committer	Alexandru Dutu <alexandru.dutu@amd.com>
	Fri, 16 Sep 2016 16:26:03 +0000 (12:26 -0400)
src/gpu-compute/compute_unit.cc		patch \| blob \| history
src/gpu-compute/compute_unit.hh		patch \| blob \| history
src/gpu-compute/qstruct.hh		patch \| blob \| history