gpu-compute: Support for dynamic register alloc
author gauravjain14 <gjain6@wisc.edu>
Sat, 19 Oct 2019 23:51:41 +0000 (18:51 -0500)
committer Matt Sinclair <mattdsinclair@gmail.com>
Thu, 14 Jan 2021 17:04:27 +0000 (17:04 +0000)
SimplePoolManager doesn't allow mapping of two WGs
simultaneously on the same Compute Unit (provided
the previous WG has been mapped to all the SIMDs)
even if there is sufficient VRF and SRF space
available.

DynPoolManager takes care of that by dynamically
allocating and deallocating register file space
for wavefronts.

Change-Id: I2255c68d4b421615d7b231edc05d3ebb27cbd66c
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/32034
Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
Tested-by: kokoro <noreply+kokoro@google.com>
Reviewed-by: Jason Lowe-Power <power.jg@gmail.com>
Reviewed-by: Alexandru Duțu <alexandru.dutu@amd.com>
configs/example/apu_se.py
src/gpu-compute/GPU.py
src/gpu-compute/SConscript
src/gpu-compute/compute_unit.cc
src/gpu-compute/compute_unit.hh
src/gpu-compute/dyn_pool_manager.cc [new file with mode: 0644]
src/gpu-compute/dyn_pool_manager.hh [new file with mode: 0644]
src/gpu-compute/pool_manager.hh
src/gpu-compute/shader.cc
src/gpu-compute/static_register_manager_policy.cc

index 14f716317d72dea64d4efbb71790bb59d7516af9..0bcf99bc4436eae7127b41cc52e89311c42831b3 100644 (file)
@@ -182,6 +182,8 @@ parser.add_option('--fast-forward-pseudo-op', action='store_true',
                   ' m5_switchcpu pseudo-ops will toggle back and forth')
 parser.add_option("--num-hw-queues", type="int", default=10,
                   help="number of hw queues in packet processor")
+parser.add_option("--reg-alloc-policy",type="string", default="simple",
+                  help="register allocation policy (simple/dynamic)")
 
 Ruby.define_options(parser)
 
@@ -295,18 +297,28 @@ for i in range(n_cu):
         for k in range(shader.n_wf):
             wavefronts.append(Wavefront(simdId = j, wf_slot_id = k,
                                         wf_size = options.wf_size))
-        vrf_pool_mgrs.append(SimplePoolManager(pool_size = \
+
+        if options.reg_alloc_policy == "simple":
+            vrf_pool_mgrs.append(SimplePoolManager(pool_size = \
                                                options.vreg_file_size,
                                                min_alloc = \
                                                options.vreg_min_alloc))
-
-        vrfs.append(VectorRegisterFile(simd_id=j, wf_size=options.wf_size,
-                                       num_regs=options.vreg_file_size))
-
-        srf_pool_mgrs.append(SimplePoolManager(pool_size = \
+            srf_pool_mgrs.append(SimplePoolManager(pool_size = \
+                                               options.sreg_file_size,
+                                               min_alloc = \
+                                               options.vreg_min_alloc))
+        elif options.reg_alloc_policy == "dynamic":
+            vrf_pool_mgrs.append(DynPoolManager(pool_size = \
+                                               options.vreg_file_size,
+                                               min_alloc = \
+                                               options.vreg_min_alloc))
+            srf_pool_mgrs.append(DynPoolManager(pool_size = \
                                                options.sreg_file_size,
                                                min_alloc = \
                                                options.vreg_min_alloc))
+
+        vrfs.append(VectorRegisterFile(simd_id=j, wf_size=options.wf_size,
+                                       num_regs=options.vreg_file_size))
         srfs.append(ScalarRegisterFile(simd_id=j, wf_size=options.wf_size,
                                        num_regs=options.sreg_file_size))
 
index b82ad184ff958e8e46c27548ca3a3855a4aa0f95..d2959ac6b84b2f033731732882f25615e21fe143 100644 (file)
@@ -28,8 +28,6 @@
 # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 # POSSIBILITY OF SUCH DAMAGE.
-#
-# Authors: Steve Reinhardt
 
 from m5.defines import buildEnv
 from m5.params import *
@@ -67,6 +65,12 @@ class SimplePoolManager(PoolManager):
     cxx_class = 'SimplePoolManager'
     cxx_header = "gpu-compute/simple_pool_manager.hh"
 
+## Pool manager that allows multiple workgroups on one CU at the same time
+class DynPoolManager(PoolManager):
+    type = 'DynPoolManager'
+    cxx_class = 'DynPoolManager'
+    cxx_header = "gpu-compute/dyn_pool_manager.hh"
+
 class RegisterFile(SimObject):
     type = 'RegisterFile'
     cxx_class = 'RegisterFile'
index 416b9e9242fb38476cec9f36673f00ff01a888ec..e41e387c14a88e3f3927c162b0189a77d5009bd3 100644 (file)
@@ -65,6 +65,7 @@ Source('schedule_stage.cc')
 Source('scheduler.cc')
 Source('scoreboard_check_stage.cc')
 Source('shader.cc')
+Source('dyn_pool_manager.cc')
 Source('simple_pool_manager.cc')
 Source('static_register_manager_policy.cc')
 Source('tlb_coalescer.cc')
index 1da5a45a027a4cb878f646a43dde7f287d1c7111..d460861e221a8fff3e6739e1707fcf76b25d3645 100644 (file)
@@ -400,6 +400,19 @@ ComputeUnit::doFlush(GPUDynInstPtr gpuDynInst) {
     injectGlobalMemFence(gpuDynInst, true);
 }
 
+// Ask the vector and scalar register pool managers of every SIMD unit in
+// this CU to reset themselves, passing each one the per-SIMD register
+// count it manages.
+void
+ComputeUnit::resetRegisterPool()
+{
+    for (int simd = 0; simd < numVectorALUs; ++simd) {
+        registerManager->vrfPoolMgrs[simd]->resetRegion(numVecRegsPerSimd);
+        registerManager->srfPoolMgrs[simd]->resetRegion(
+            numScalarRegsPerSimd);
+    }
+}
+
 void
 ComputeUnit::dispWorkgroup(HSAQueueEntry *task, int num_wfs_in_wg)
 {
index 3ca678e0aaf972d47a9594e3593cb8ba756034af..ecb6d06d8075be37f3ac27b9bad8167b1885da2b 100644 (file)
@@ -415,6 +415,8 @@ class ComputeUnit : public ClockedObject
     int cacheLineSize() const { return _cacheLineSize; }
     int getCacheLineBits() const { return cacheLineBits; }
 
+    void resetRegisterPool();
+
   private:
     WFBarrier&
     barrierSlot(int bar_id)
diff --git a/src/gpu-compute/dyn_pool_manager.cc b/src/gpu-compute/dyn_pool_manager.cc
new file mode 100644 (file)
index 0000000..19b7cac
--- /dev/null
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2020 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "base/logging.hh"
+#include "base/trace.hh"
+#include "debug/GPUVRF.hh"
+#include "gpu-compute/dyn_pool_manager.hh"
+
+// Round a request for "size" elements up to the next multiple of the
+// minimum allocation granularity; the result is the number of elements the
+// manager actually reserves.  Requests of zero elements or of more than the
+// whole pool are fatal.
+uint32_t
+DynPoolManager::minAllocatedElements(uint32_t size)
+{
+    fatal_if(size <= 0 || size > poolSize(), "Illegal VGPR region size=%d\n",
+             size);
+
+    const uint32_t remainder = size % minAllocation();
+    if (remainder > 0) {
+        // pad the request up to the next allocation boundary
+        return (minAllocation() - remainder) + size;
+    }
+    return size;
+}
+
+std::string
+DynPoolManager::printRegion()
+{
+    std::string _cout;
+    uint32_t reservedEntries = 0;
+
+    /*
+      Iterate over all elements in freeSpaceRecord, checking first element
+      of each pair to see how much space in it has been allocated already.
+      This only counts the partially allocated regions.  Thus, in addition,
+      count the elements in reservedSpaceRecord.
+    */
+    auto it_free = freeSpaceRecord.begin();
+    while (it_free != freeSpaceRecord.end()) {
+        reservedEntries += it_free->first;
+        ++it_free;
+    }
+    reservedEntries += (reservedSpaceRecord * totalRegSpace);
+
+    if (reservedEntries == 0)
+        _cout = "VRF is empty\n";
+    else {
+        _cout = "VRF reserves " + std::to_string(reservedEntries) + " VGPRs\n";
+    }
+    return _cout;
+}
+
+// Return the pool to its initial state: nothing reserved and a single free
+// chunk that starts at index 0 and spans all "regsPerSimd" registers.
+void
+DynPoolManager::resetRegion(const int & regsPerSimd)
+{
+    // forget every free-chunk record and start over with one chunk that
+    // covers the whole register space
+    freeSpaceRecord.clear();
+    freeSpaceRecord.emplace_back(0, regsPerSimd);
+
+    // nothing is reserved any more, everything is available again
+    reservedSpaceRecord = 0;
+    totalRegSpace = regsPerSimd;
+    _totRegSpaceAvailable = regsPerSimd;
+}
+
+// Check whether "numRegions" regions of "size" elements each (rounded up to
+// the allocation granularity) can be carved out of the pool right now.
+bool
+DynPoolManager::canAllocate(uint32_t numRegions, uint32_t size)
+{
+    uint32_t actualSize = minAllocatedElements(size);
+    DPRINTF(GPUVRF,"Can Allocate %d\n",actualSize);
+
+    // allocateRegion() takes each region from a single free chunk, so the
+    // total amount of free space cannot answer the question when the pool
+    // is fragmented.  Count how many whole regions fit into each chunk
+    // instead.
+    uint32_t numAvailRegions = 0;
+    for (const auto &chunk : freeSpaceRecord) {
+        // chunk.second is the number of free registers in this chunk
+        numAvailRegions += chunk.second / actualSize;
+    }
+    return numAvailRegions >= numRegions;
+}
+
+uint32_t
+DynPoolManager::allocateRegion(const uint32_t size,
+                                    uint32_t *reservedPoolSize)
+{
+    uint32_t startIdx = (unsigned)-1;
+    uint32_t actualSize = minAllocatedElements(size);
+    auto it = freeSpaceRecord.begin();
+    while (it != freeSpaceRecord.end()) {
+        if (it->second >= actualSize) {
+            // assign the next block starting from here
+            startIdx = it->first;
+            _regionSize = actualSize;
+            *reservedPoolSize = actualSize;
+            _totRegSpaceAvailable -= actualSize;
+
+            // This case sees if this chunk size is exactly equal to
+            // the size of the requested chunk. If yes, then this can't
+            // contribute to future requests and hence, should be removed
+            if (it->second == actualSize) {
+                it = freeSpaceRecord.erase(it);
+                // once entire freeSpaceRecord allocated, increment
+                // reservedSpaceRecord count
+                ++reservedSpaceRecord;
+            } else {
+                it->first += actualSize;
+                it->second -= actualSize;
+            }
+            break;
+        }
+        it++;
+    }
+    DPRINTF(GPUVRF,"totRegSpace %d allocating Register at %d and"
+                " size %d\n",_totRegSpaceAvailable,startIdx,actualSize);
+    return startIdx;
+}
+
+// Give the registers in [firstIdx, lastIdx) back to the pool.
+//
+// The released range is merged with every free chunk it touches, so that
+// neighbouring free space becomes one contiguous chunk again.  Without the
+// merge the free list only ever splits into smaller pieces, and requests
+// larger than any single piece fail even though enough contiguous
+// registers are free.
+void
+DynPoolManager::freeRegion(uint32_t firstIdx,
+                                uint32_t lastIdx)
+{
+    // lastIdx-firstIdx should give the size of free space
+    DPRINTF(GPUVRF,"freeing Region at %d %d, size %d\n",
+                firstIdx,lastIdx,lastIdx-firstIdx);
+
+    // Current dynamic register allocation does not handle wraparound
+    assert(firstIdx < lastIdx);
+    _totRegSpaceAvailable += lastIdx-firstIdx;
+
+    // the chunk to record: (start index, number of free registers)
+    int start = static_cast<int>(firstIdx);
+    int length = static_cast<int>(lastIdx - firstIdx);
+
+    // absorb every free chunk that is directly adjacent to the range
+    auto it = freeSpaceRecord.begin();
+    while (it != freeSpaceRecord.end()) {
+        if (it->first + it->second == start) {
+            // chunk ends exactly where the released range begins
+            start = it->first;
+            length += it->second;
+            it = freeSpaceRecord.erase(it);
+        } else if (start + length == it->first) {
+            // chunk begins exactly where the released range ends
+            length += it->second;
+            it = freeSpaceRecord.erase(it);
+        } else {
+            ++it;
+        }
+    }
+
+    // insert the merged chunk in front of the first chunk that starts at a
+    // higher index, which keeps the record ordered by start index
+    auto pos = freeSpaceRecord.begin();
+    while (pos != freeSpaceRecord.end() && pos->first < start)
+        ++pos;
+    freeSpaceRecord.insert(pos, std::make_pair(start, length));
+
+    // remove corresponding entry from reservedSpaceRecord too
+    // NOTE(review): the counter is only incremented when an allocation uses
+    // up a whole chunk but is decremented on every free -- confirm whether
+    // that asymmetry is intended.
+    --reservedSpaceRecord;
+}
+
+// Number of elements covered by "region", taken as an inclusive
+// (first, last) index pair.  A region whose first index is larger than its
+// last index is treated as wrapping around the end of the pool.
+uint32_t
+DynPoolManager::regionSize(std::pair<uint32_t, uint32_t> &region)
+{
+    const uint32_t first = region.first;
+    const uint32_t last = region.second;
+
+    if (first > last) {
+        // wrapped region: the pool size makes up for the wrap
+        return last + poolSize() - first + 1;
+    }
+    return last - first + 1;
+}
diff --git a/src/gpu-compute/dyn_pool_manager.hh b/src/gpu-compute/dyn_pool_manager.hh
new file mode 100644 (file)
index 0000000..dc8ffec
--- /dev/null
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2020 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef __DYN_POOL_MANAGER_HH__
+#define __DYN_POOL_MANAGER_HH__
+
+#include <cassert>
+#include <cstdint>
+#include <list>
+#include <string>
+#include <utility>
+
+#include "gpu-compute/pool_manager.hh"
+#include "params/DynPoolManager.hh"
+
+// Dynamic Pool Manager: allows multiple WGs on the same pool
+class DynPoolManager : public PoolManager
+{
+  public:
+    DynPoolManager(const PoolManagerParams &p)
+        : PoolManager(p), _regionSize(0), _nxtFreeIdx(0)
+    {
+        _totRegSpaceAvailable = p.pool_size;
+    }
+
+    uint32_t allocateRegion(const uint32_t size, uint32_t *reservedPoolSize) override;
+    bool canAllocate(uint32_t numRegions, uint32_t size) override;
+    void freeRegion(uint32_t firstIdx, uint32_t lastIdx) override;
+    uint32_t minAllocatedElements(uint32_t size);
+    std::string printRegion() override;
+    uint32_t regionSize(std::pair<uint32_t,uint32_t> &region) override;
+    void resetRegion(const int & regsPerSimd) override;
+
+  private:
+    // actual size of a region (normalized to the minimum size that can
+    // be reserved)
+    uint32_t _regionSize;
+    // next index to allocate a region
+    int _nxtFreeIdx;
+    // total registers available - across chunks
+    uint32_t _totRegSpaceAvailable;
+    // regIndex and freeSpace record
+    std::list<std::pair<int,int>> freeSpaceRecord;
+    int reservedSpaceRecord;
+    // total registers to be allocated -- treat as a const
+    int totalRegSpace;
+};
+
+#endif // __DYN_POOL_MANAGER_HH__
index 0f102c2d833e15beea99b8bd64288b7704d903b2..2de8fd2ea12fa66b315a92a3f9535a6fcd152b09 100644 (file)
@@ -57,6 +57,14 @@ class PoolManager : public SimObject
 
     virtual void freeRegion(uint32_t firstIdx, uint32_t lastIdx) = 0;
     uint32_t poolSize() { return _poolSize; }
+    // Reset the entire register pool so that it manages "regsPerSimd"
+    // registers again.  The rest of this interface offers no way to do
+    // that, yet the pool has to be reset once all WGs on the Compute
+    // Unit have finished, before WGs from another kernel are launched.
+    // Pool managers that keep allocation state override this hook; the
+    // default implementation below does nothing.
+    // (TsungTai Yeh added a virtual method doing the same elsewhere.)
+    virtual void resetRegion(const int & regsPerSimd) {}; // do nothing
 
   private:
     // minimum size that can be reserved per allocation
index 012b9870cd3dcc5b9dc34cabe5c50f383ef68552..9ae3fd7ce1155e4b20d302f363e4457ed2e93ed7 100644 (file)
@@ -207,6 +207,9 @@ Shader::prepareInvalidate(HSAQueueEntry *task) {
         _dispatcher.updateInvCounter(kernId, +1);
         // all necessary INV flags are all set now, call cu to execute
         cuList[i_cu]->doInvalidate(req, task->dispatchId());
+
+        // I don't like this. This is intrusive coding.
+        cuList[i_cu]->resetRegisterPool();
     }
 }
 
index 85f530bfcc16a1c7dbd59bf047ecb1b584f54230..f1bc1e6f16f5e2007cd49f92f1d356515df1b70f 100644 (file)
@@ -152,13 +152,13 @@ StaticRegisterManagerPolicy::freeRegisters(Wavefront *w)
              w->simdId,
              w->computeUnit->scalarRegsReserved[w->simdId]);
 
-    int endIndex = (w->startVgprIndex + w->reservedVectorRegs - 1) %
-        w->computeUnit->vrf[w->simdId]->numRegs();
+    // Current dynamic register allocation does not handle wraparound
+    int endIndex = w->startVgprIndex + w->reservedVectorRegs;
 
     w->computeUnit->registerManager->vrfPoolMgrs[w->simdId]->
         freeRegion(w->startVgprIndex, endIndex);
 
-    // mark/pre-mark all registers as not busy
+    // mark/pre-mark all registers are not busy
     for (int i = 0; i < w->reservedVectorRegs; i++) {
         uint32_t physVgprIdx = mapVgpr(w, i);
         w->computeUnit->vrf[w->simdId]->markReg(physVgprIdx, false);
@@ -167,12 +167,11 @@ StaticRegisterManagerPolicy::freeRegisters(Wavefront *w)
     w->reservedVectorRegs = 0;
     w->startVgprIndex = 0;
 
-    endIndex = (w->startSgprIndex + w->reservedScalarRegs - 1) %
-        w->computeUnit->srf[w->simdId]->numRegs();
+    endIndex = w->startSgprIndex + w->reservedScalarRegs;
     w->computeUnit->registerManager->srfPoolMgrs[w->simdId]->
         freeRegion(w->startSgprIndex, endIndex);
 
-    // mark/pre-mark all registers as not busy
+    // mark/pre-mark all registers are not busy
     for (int i = 0; i < w->reservedScalarRegs; i++) {
         uint32_t physSgprIdx = mapSgpr(w, i);
         w->computeUnit->srf[w->simdId]->markReg(physSgprIdx, false);