gpu-compute: Support for dynamic register alloc

author gauravjain14 <gjain6@wisc.edu>

Sat, 19 Oct 2019 23:51:41 +0000 (18:51 -0500)

committer Matt Sinclair <mattdsinclair@gmail.com>

Thu, 14 Jan 2021 17:04:27 +0000 (17:04 +0000)
author gauravjain14 <gjain6@wisc.edu>
Sat, 19 Oct 2019 23:51:41 +0000 (18:51 -0500)
committer Matt Sinclair <mattdsinclair@gmail.com>
Thu, 14 Jan 2021 17:04:27 +0000 (17:04 +0000)
diff --git a/configs/example/apu_se.py b/configs/example/apu_se.py

index 14f716317d72dea64d4efbb71790bb59d7516af9..0bcf99bc4436eae7127b41cc52e89311c42831b3 100644 (file)
--- a/configs/example/apu_se.py
+++ b/configs/example/apu_se.py
@@ -182,6 +182,8 @@ parser.add_option('--fast-forward-pseudo-op', action='store_true',
                    ' m5_switchcpu pseudo-ops will toggle back and forth')
  parser.add_option("--num-hw-queues", type="int", default=10,
                    help="number of hw queues in packet processor")
+parser.add_option("--reg-alloc-policy",type="string", default="simple",
+                  help="register allocation policy (simple/dynamic)")
  
  Ruby.define_options(parser)
  
@@ -295,18 +297,28 @@ for i in range(n_cu):
          for k in range(shader.n_wf):
              wavefronts.append(Wavefront(simdId = j, wf_slot_id = k,
                                          wf_size = options.wf_size))
-        vrf_pool_mgrs.append(SimplePoolManager(pool_size = \
+
+        if options.reg_alloc_policy == "simple":
+            vrf_pool_mgrs.append(SimplePoolManager(pool_size = \
                                                 options.vreg_file_size,
                                                 min_alloc = \
                                                 options.vreg_min_alloc))
-
-        vrfs.append(VectorRegisterFile(simd_id=j, wf_size=options.wf_size,
-                                       num_regs=options.vreg_file_size))
-
-        srf_pool_mgrs.append(SimplePoolManager(pool_size = \
+            srf_pool_mgrs.append(SimplePoolManager(pool_size = \
+                                               options.sreg_file_size,
+                                               min_alloc = \
+                                               options.vreg_min_alloc))
+        elif options.reg_alloc_policy == "dynamic":
+            vrf_pool_mgrs.append(DynPoolManager(pool_size = \
+                                               options.vreg_file_size,
+                                               min_alloc = \
+                                               options.vreg_min_alloc))
+            srf_pool_mgrs.append(DynPoolManager(pool_size = \
                                                 options.sreg_file_size,
                                                 min_alloc = \
                                                 options.vreg_min_alloc))
+
+        vrfs.append(VectorRegisterFile(simd_id=j, wf_size=options.wf_size,
+                                       num_regs=options.vreg_file_size))
          srfs.append(ScalarRegisterFile(simd_id=j, wf_size=options.wf_size,
                                         num_regs=options.sreg_file_size))
  
diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py

index b82ad184ff958e8e46c27548ca3a3855a4aa0f95..d2959ac6b84b2f033731732882f25615e21fe143 100644 (file)
--- a/src/gpu-compute/GPU.py
+++ b/src/gpu-compute/GPU.py
@@ -28,8 +28,6 @@
  # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  # POSSIBILITY OF SUCH DAMAGE.
-#
-# Authors: Steve Reinhardt
  
  from m5.defines import buildEnv
  from m5.params import *
@@ -67,6 +65,12 @@ class SimplePoolManager(PoolManager):
      cxx_class = 'SimplePoolManager'
      cxx_header = "gpu-compute/simple_pool_manager.hh"
  
+# Dynamic pool manager: lets multiple workgroups share one register pool
+# on a CU. Backed by the C++ DynPoolManager class declared in
+# gpu-compute/dyn_pool_manager.hh.
+class DynPoolManager(PoolManager):
+    type = 'DynPoolManager'
+    cxx_class = 'DynPoolManager'
+    cxx_header = "gpu-compute/dyn_pool_manager.hh"
+
  class RegisterFile(SimObject):
      type = 'RegisterFile'
      cxx_class = 'RegisterFile'
diff --git a/src/gpu-compute/SConscript b/src/gpu-compute/SConscript

index 416b9e9242fb38476cec9f36673f00ff01a888ec..e41e387c14a88e3f3927c162b0189a77d5009bd3 100644 (file)
--- a/src/gpu-compute/SConscript
+++ b/src/gpu-compute/SConscript
@@ -65,6 +65,7 @@ Source('schedule_stage.cc')
  Source('scheduler.cc')
  Source('scoreboard_check_stage.cc')
  Source('shader.cc')
+Source('dyn_pool_manager.cc')
  Source('simple_pool_manager.cc')
  Source('static_register_manager_policy.cc')
  Source('tlb_coalescer.cc')
diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc

index 1da5a45a027a4cb878f646a43dde7f287d1c7111..d460861e221a8fff3e6739e1707fcf76b25d3645 100644 (file)
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -400,6 +400,19 @@ ComputeUnit::doFlush(GPUDynInstPtr gpuDynInst) {
      injectGlobalMemFence(gpuDynInst, true);
  }
  
+// Reset the vector and scalar register pool managers of every SIMD unit
+// in this compute unit, handing each manager the per-SIMD register count
+// it should treat as its full capacity.
+// NOTE(review): the original author was unsure whether this is the best
+// place for the reset but considered it necessary for the dynamic pool
+// manager implementation.
+void
+ComputeUnit::resetRegisterPool()
+{
+    for (int simd = 0; simd < numVectorALUs; ++simd) {
+        auto &vrfPoolMgr = registerManager->vrfPoolMgrs[simd];
+        auto &srfPoolMgr = registerManager->srfPoolMgrs[simd];
+
+        vrfPoolMgr->resetRegion(numVecRegsPerSimd);
+        srfPoolMgr->resetRegion(numScalarRegsPerSimd);
+    }
+}
+
  void
  ComputeUnit::dispWorkgroup(HSAQueueEntry *task, int num_wfs_in_wg)
  {
diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh

index 3ca678e0aaf972d47a9594e3593cb8ba756034af..ecb6d06d8075be37f3ac27b9bad8167b1885da2b 100644 (file)
--- a/src/gpu-compute/compute_unit.hh
+++ b/src/gpu-compute/compute_unit.hh
@@ -415,6 +415,8 @@ class ComputeUnit : public ClockedObject
      int cacheLineSize() const { return _cacheLineSize; }
      int getCacheLineBits() const { return cacheLineBits; }
  
+    void resetRegisterPool();
+
    private:
      WFBarrier&
      barrierSlot(int bar_id)
diff --git a/src/gpu-compute/dyn_pool_manager.cc b/src/gpu-compute/dyn_pool_manager.cc

new file mode 100644 (file)

index 0000000..19b7cac
--- /dev/null
+++ b/src/gpu-compute/dyn_pool_manager.cc
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2020 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "base/logging.hh"
+#include "base/trace.hh"
+#include "debug/GPUVRF.hh"
+#include "gpu-compute/dyn_pool_manager.hh"
+
+// Round a request for "size" elements up to the allocation granularity:
+// the result is the smallest multiple of minAllocation() that is not
+// smaller than "size". A request of zero elements, or of more elements
+// than poolSize(), terminates the simulation via fatal_if.
+uint32_t
+DynPoolManager::minAllocatedElements(uint32_t size)
+{
+    fatal_if(size <= 0 || size > poolSize(), "Illegal VGPR region size=%d\n",
+             size);
+
+    // Number of elements by which the request overshoots the last full
+    // allocation granule.
+    const auto overshoot = size % minAllocation();
+    if (overshoot > 0) {
+        // Pad the request up to the next granule boundary.
+        return (minAllocation() - overshoot) + size;
+    }
+
+    // Already a whole number of granules.
+    return size;
+}
+
+std::string
+DynPoolManager::printRegion()
+{
+    std::string _cout;
+    uint32_t reservedEntries = 0;
+
+    /*
+      Iterate over all elements in freeSpaceRecord, checking first element
+      of each pair to see how much space in it has been allocated already.
+      This only counts the partially allocated regions.  Thus, in addition,
+      count the elements in reservedSpaceRecord.
+    */
+    auto it_free = freeSpaceRecord.begin();
+    while (it_free != freeSpaceRecord.end()) {
+        reservedEntries += it_free->first;
+        ++it_free;
+    }
+    reservedEntries += (reservedSpaceRecord * totalRegSpace);
+
+    if (reservedEntries == 0)
+        _cout = "VRF is empty\n";
+    else {
+        _cout = "VRF reserves " + std::to_string(reservedEntries) + " VGPRs\n";
+    }
+    return _cout;
+}
+
+// Return the manager to a pristine state in which the whole pool of
+// "regsPerSimd" registers is free and nothing is reserved.
+void
+DynPoolManager::resetRegion(const int & regsPerSimd)
+{
+    // Capacity bookkeeping: everything is available again.
+    totalRegSpace = regsPerSimd;
+    _totRegSpaceAvailable = regsPerSimd;
+    reservedSpaceRecord = 0;
+
+    // Replace all free-space records with one chunk that starts at index 0
+    // and spans the entire pool.
+    freeSpaceRecord.clear();
+    freeSpaceRecord.emplace_back(0, regsPerSimd);
+}
+
+// Check whether "numRegions" regions, each able to hold "size" elements
+// (rounded up by minAllocatedElements), can be carved out of the current
+// free chunks.
+//
+// A region has to be contiguous, so comparing the request against the
+// total number of free registers is not enough: the free space may be
+// split over chunks that are each too small, in which case
+// allocateRegion() would find no chunk to use. Instead, count how many
+// regions fit into every individual free chunk.
+//
+// Returns true when at least numRegions regions fit (trivially true for
+// numRegions == 0), false otherwise.
+bool
+DynPoolManager::canAllocate(uint32_t numRegions, uint32_t size)
+{
+    // minAllocatedElements() rejects size == 0, so actualSize is never 0
+    // and the division below is safe.
+    uint32_t actualSize = minAllocatedElements(size);
+    DPRINTF(GPUVRF,"Can Allocate %d\n",actualSize);
+
+    uint32_t regionsThatFit = 0;
+    for (const auto &chunk : freeSpaceRecord) {
+        if (regionsThatFit >= numRegions) {
+            break;
+        }
+        // chunk.second is the number of free registers in this chunk.
+        regionsThatFit += static_cast<uint32_t>(chunk.second) / actualSize;
+    }
+
+    return regionsThatFit >= numRegions;
+}
+
+uint32_t
+DynPoolManager::allocateRegion(const uint32_t size,
+                                    uint32_t *reservedPoolSize)
+{
+    uint32_t startIdx = (unsigned)-1;
+    uint32_t actualSize = minAllocatedElements(size);
+    auto it = freeSpaceRecord.begin();
+    while (it != freeSpaceRecord.end()) {
+        if (it->second >= actualSize) {
+            // assign the next block starting from here
+            startIdx = it->first;
+            _regionSize = actualSize;
+            *reservedPoolSize = actualSize;
+            _totRegSpaceAvailable -= actualSize;
+
+            // This case sees if this chunk size is exactly equal to
+            // the size of the requested chunk. If yes, then this can't
+            // contribute to future requests and hence, should be removed
+            if (it->second == actualSize) {
+                it = freeSpaceRecord.erase(it);
+                // once entire freeSpaceRecord allocated, increment
+                // reservedSpaceRecord count
+                ++reservedSpaceRecord;
+            } else {
+                it->first += actualSize;
+                it->second -= actualSize;
+            }
+            break;
+        }
+        it++;
+    }
+    DPRINTF(GPUVRF,"totRegSpace %d allocating Register at %d and"
+                " size %d\n",_totRegSpaceAvailable,startIdx,actualSize);
+    return startIdx;
+}
+
+// Give the region [firstIdx, lastIdx) back to the pool. lastIdx is
+// exclusive, so lastIdx - firstIdx is the number of registers released.
+//
+// The released region is merged with any free chunk that ends exactly
+// where it starts or starts exactly where it ends. Without this merge the
+// free space would stay split into the pieces in which it was released,
+// and a later request for a larger contiguous region could not be served
+// even though enough neighbouring registers are free.
+void
+DynPoolManager::freeRegion(uint32_t firstIdx,
+                                uint32_t lastIdx)
+{
+    // lastIdx-firstIdx should give the size of free space
+    DPRINTF(GPUVRF,"freeing Region at %d %d, size %d\n",
+                firstIdx,lastIdx,lastIdx-firstIdx);
+
+    // Current dynamic register allocation does not handle wraparound
+    assert(firstIdx < lastIdx);
+    _totRegSpaceAvailable += lastIdx-firstIdx;
+
+    // Start index and size of the chunk that will be recorded as free; it
+    // grows as neighbouring free chunks are absorbed.
+    int chunkStart = firstIdx;
+    int chunkSize = lastIdx - firstIdx;
+
+    auto it = freeSpaceRecord.begin();
+    while (it != freeSpaceRecord.end()) {
+        if (it->first + it->second == chunkStart) {
+            // Free chunk directly in front of the released region.
+            chunkStart = it->first;
+            chunkSize += it->second;
+            it = freeSpaceRecord.erase(it);
+        } else if (chunkStart + chunkSize == it->first) {
+            // Free chunk directly behind the released region.
+            chunkSize += it->second;
+            it = freeSpaceRecord.erase(it);
+        } else {
+            ++it;
+        }
+    }
+    freeSpaceRecord.push_back(std::make_pair(chunkStart, chunkSize));
+
+    // remove corresponding entry from reservedSpaceRecord too
+    // NOTE(review): allocateRegion() increments reservedSpaceRecord only
+    // when a free chunk is consumed entirely, while this decrement happens
+    // on every release -- confirm the counter is meant to work this way.
+    --reservedSpaceRecord;
+}
+
+// Number of elements covered by "region", whose first/second members are
+// treated as inclusive start and end indices. When the start index lies
+// beyond the end index the region is taken to wrap around the end of the
+// pool, and poolSize() is added to compensate.
+uint32_t
+DynPoolManager::regionSize(std::pair<uint32_t, uint32_t> &region)
+{
+    if (region.first > region.second) {
+        // Wrapped region: runs from first to the end of the pool and then
+        // from the beginning of the pool to second.
+        return region.second + poolSize() - region.first + 1;
+    }
+
+    return region.second - region.first + 1;
+}
diff --git a/src/gpu-compute/dyn_pool_manager.hh b/src/gpu-compute/dyn_pool_manager.hh

new file mode 100644 (file)

index 0000000..dc8ffec
--- /dev/null
+++ b/src/gpu-compute/dyn_pool_manager.hh
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2020 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef __DYN_POOL_MANAGER_HH__
+#define __DYN_POOL_MANAGER_HH__
+
+#include <cassert>
+#include <cstdint>
+#include <list>
+#include <string>
+#include <utility>
+
+#include "gpu-compute/pool_manager.hh"
+#include "params/DynPoolManager.hh"
+
+// Dynamic Pool Manager: allows multiple WGs on the same pool
+class DynPoolManager : public PoolManager
+{
+  public:
+    DynPoolManager(const PoolManagerParams &p)
+        : PoolManager(p), _regionSize(0), _nxtFreeIdx(0)
+    {
+        _totRegSpaceAvailable = p.pool_size;
+    }
+
+    uint32_t allocateRegion(const uint32_t size, uint32_t *reservedPoolSize) override;
+    bool canAllocate(uint32_t numRegions, uint32_t size) override;
+    void freeRegion(uint32_t firstIdx, uint32_t lastIdx) override;
+    uint32_t minAllocatedElements(uint32_t size);
+    std::string printRegion() override;
+    uint32_t regionSize(std::pair<uint32_t,uint32_t> &region) override;
+    void resetRegion(const int & regsPerSimd) override;
+
+  private:
+    // actual size of a region (normalized to the minimum size that can
+    // be reserved)
+    uint32_t _regionSize;
+    // next index to allocate a region
+    int _nxtFreeIdx;
+    // total registers available - across chunks
+    uint32_t _totRegSpaceAvailable;
+    // regIndex and freeSpace record
+    std::list<std::pair<int,int>> freeSpaceRecord;
+    int reservedSpaceRecord;
+    // total registers to be allocated -- treat as a const
+    int totalRegSpace;
+};
+
+#endif // __DYN_POOL_MANAGER_HH__
diff --git a/src/gpu-compute/pool_manager.hh b/src/gpu-compute/pool_manager.hh

index 0f102c2d833e15beea99b8bd64288b7704d903b2..2de8fd2ea12fa66b315a92a3f9535a6fcd152b09 100644 (file)
--- a/src/gpu-compute/pool_manager.hh
+++ b/src/gpu-compute/pool_manager.hh
@@ -57,6 +57,14 @@ class PoolManager : public SimObject
  
      virtual void freeRegion(uint32_t firstIdx, uint32_t lastIdx) = 0;
      uint32_t poolSize() { return _poolSize; }
+    // Reset the entire register pool so that regsPerSimd registers are
+    // available again.
+    // The rest of this API offers no way to do that, yet the pool has to
+    // be reset once all WGs on the Compute Unit have finished, before WGs
+    // from another kernel are launched.
+    // The default implementation does nothing; pool managers that need
+    // the reset override it.
+    // NOTE(review): per the original author, TsungTai Yeh added a virtual
+    // method that does the very same, although at a different place.
+    virtual void resetRegion(const int & regsPerSimd) {}; // do nothing
  
    private:
      // minimum size that can be reserved per allocation
diff --git a/src/gpu-compute/shader.cc b/src/gpu-compute/shader.cc

index 012b9870cd3dcc5b9dc34cabe5c50f383ef68552..9ae3fd7ce1155e4b20d302f363e4457ed2e93ed7 100644 (file)
--- a/src/gpu-compute/shader.cc
+++ b/src/gpu-compute/shader.cc
@@ -207,6 +207,9 @@ Shader::prepareInvalidate(HSAQueueEntry *task) {
          _dispatcher.updateInvCounter(kernId, +1);
          // all necessary INV flags are all set now, call cu to execute
          cuList[i_cu]->doInvalidate(req, task->dispatchId());
+
+        // I don't like this. This is intrusive coding.
+        cuList[i_cu]->resetRegisterPool();
      }
  }
  
diff --git a/src/gpu-compute/static_register_manager_policy.cc b/src/gpu-compute/static_register_manager_policy.cc

index 85f530bfcc16a1c7dbd59bf047ecb1b584f54230..f1bc1e6f16f5e2007cd49f92f1d356515df1b70f 100644 (file)
--- a/src/gpu-compute/static_register_manager_policy.cc
+++ b/src/gpu-compute/static_register_manager_policy.cc
@@ -152,13 +152,13 @@ StaticRegisterManagerPolicy::freeRegisters(Wavefront *w)
               w->simdId,
               w->computeUnit->scalarRegsReserved[w->simdId]);
  
-    int endIndex = (w->startVgprIndex + w->reservedVectorRegs - 1) %
-        w->computeUnit->vrf[w->simdId]->numRegs();
+    // Current dynamic register allocation does not handle wraparound
+    int endIndex = w->startVgprIndex + w->reservedVectorRegs;
  
      w->computeUnit->registerManager->vrfPoolMgrs[w->simdId]->
          freeRegion(w->startVgprIndex, endIndex);
  
-    // mark/pre-mark all registers as not busy
+    // mark/pre-mark all registers are not busy
      for (int i = 0; i < w->reservedVectorRegs; i++) {
          uint32_t physVgprIdx = mapVgpr(w, i);
          w->computeUnit->vrf[w->simdId]->markReg(physVgprIdx, false);
@@ -167,12 +167,11 @@ StaticRegisterManagerPolicy::freeRegisters(Wavefront *w)
      w->reservedVectorRegs = 0;
      w->startVgprIndex = 0;
  
-    endIndex = (w->startSgprIndex + w->reservedScalarRegs - 1) %
-        w->computeUnit->srf[w->simdId]->numRegs();
+    endIndex = w->startSgprIndex + w->reservedScalarRegs;
      w->computeUnit->registerManager->srfPoolMgrs[w->simdId]->
          freeRegion(w->startSgprIndex, endIndex);
  
-    // mark/pre-mark all registers as not busy
+    // mark/pre-mark all registers are not busy
      for (int i = 0; i < w->reservedScalarRegs; i++) {
          uint32_t physSgprIdx = mapSgpr(w, i);
          w->computeUnit->srf[w->simdId]->markReg(physSgprIdx, false);
author	gauravjain14 <gjain6@wisc.edu>
	Sat, 19 Oct 2019 23:51:41 +0000 (18:51 -0500)
committer	Matt Sinclair <mattdsinclair@gmail.com>
	Thu, 14 Jan 2021 17:04:27 +0000 (17:04 +0000)
configs/example/apu_se.py		patch \| blob \| history
src/gpu-compute/GPU.py		patch \| blob \| history
src/gpu-compute/SConscript		patch \| blob \| history
src/gpu-compute/compute_unit.cc		patch \| blob \| history
src/gpu-compute/compute_unit.hh		patch \| blob \| history
src/gpu-compute/dyn_pool_manager.cc	[new file with mode: 0644]	patch \| blob
src/gpu-compute/dyn_pool_manager.hh	[new file with mode: 0644]	patch \| blob
src/gpu-compute/pool_manager.hh		patch \| blob \| history
src/gpu-compute/shader.cc		patch \| blob \| history
src/gpu-compute/static_register_manager_policy.cc		patch \| blob \| history