configs,tests: Add tokens to GPU VIPER tester
author     Kyle Roarty <kyleroarty1716@gmail.com>
           Fri, 25 Sep 2020 02:50:58 +0000 (21:50 -0500)
committer  Matthew Poremba <matthew.poremba@amd.com>
           Wed, 4 Nov 2020 21:09:26 +0000 (21:09 +0000)
This patch integrates tokens into the VIPER tester by adding a
GMTokenPort to the tester, checking that enough tokens are available
before any load, store, or atomic is issued, and having the tester
acquire those tokens when the request is issued.
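
A minimal, self-contained sketch of the discipline this adds to each tester
thread is shown below. The function and member names (isNextActionReady,
issueNextAction, numLanes, pending*Count) mirror the gpu_thread.cc hunks
further down; the code itself is an illustrative toy, not gem5 source.

    #include <cassert>
    #include <iostream>

    // Toy per-CU token pool standing in for the GMTokenPort/TokenManager pair.
    static int tokenPool = 128;
    static bool haveTokens(int n)    { return tokenPool >= n; }
    static void acquireTokens(int n) { assert(tokenPool >= n); tokenPool -= n; }

    // Mirrors the new check in GpuThread::isNextActionReady(): an atomic is
    // only ready once nothing is pending *and* the pool covers every lane.
    bool
    atomicReady(int pendingLdStCount, int pendingFenceCount,
                int pendingAtomicCount, int numLanes)
    {
        return pendingLdStCount == 0 && pendingFenceCount == 0 &&
               pendingAtomicCount == 0 && haveTokens(numLanes);
    }

    // Mirrors GpuThread::issueNextAction(): tokens are taken right before the
    // per-lane operations go out, for atomics, loads and stores alike.
    void
    issueAtomicOps(int numLanes)
    {
        acquireTokens(numLanes);
        std::cout << "issued " << numLanes << "-lane atomic, "
                  << tokenPool << " tokens left\n";
    }

    int main()
    {
        const int numLanes = 64;          // one wavefront's worth of lanes
        if (atomicReady(0, 0, 0, numLanes))
            issueAtomicOps(numLanes);
        if (atomicReady(0, 0, 0, numLanes))
            issueAtomicOps(numLanes);
        // Pool exhausted: the wavefront is back-pressured until the connected
        // coalescer hands tokens back through the token port.
        std::cout << std::boolalpha << atomicReady(0, 0, 0, numLanes) << '\n';
        return 0;
    }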

Change-Id: Id317d703e4765dd5fa7de0d16f5eb595aab7096c
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/35135
Maintainer: Matthew Poremba <matthew.poremba@amd.com>
Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
Tested-by: kokoro <noreply+kokoro@google.com>
src/cpu/testers/gpu_ruby_test/ProtocolTester.py
src/cpu/testers/gpu_ruby_test/gpu_thread.cc
src/cpu/testers/gpu_ruby_test/gpu_thread.hh
src/cpu/testers/gpu_ruby_test/protocol_tester.cc
src/cpu/testers/gpu_ruby_test/protocol_tester.hh

index e6874abbb02251901cfdefaf07f35e630e0ccdb4..ed0e0a88ee1392996166ce2a1e08f3169641fef6 100644 (file)
@@ -41,6 +41,7 @@ class ProtocolTester(ClockedObject):
     cu_vector_ports = VectorRequestPort("Vector ports for GPUs")
     cu_sqc_ports = VectorRequestPort("SQC ports for GPUs")
     cu_scalar_ports = VectorRequestPort("Scalar ports for GPUs")
+    cu_token_ports = VectorRequestPort("Token ports for GPU")
 
     cus_per_sqc = Param.Int(4, "Number of CUs per SQC")
     cus_per_scalar = Param.Int(4, "Number of CUs per scalar cache")
@@ -48,6 +49,11 @@ class ProtocolTester(ClockedObject):
     wavefronts_per_cu = Param.Int(1, "Number of wavefronts per CU")
     workitems_per_wavefront = Param.Int(64, "Number of workitems per wf")
 
+    max_cu_tokens = Param.Int(4, "Maximum number of tokens, i.e., the number"
+                                 " of instructions that can be uncoalesced"
+                                 " before back-pressure occurs from the"
+                                 " coalescer.")
+
     cpu_threads = VectorParam.CpuThread("All cpus")
     wavefronts = VectorParam.GpuWavefront("All wavefronts")
 
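On the C++ side (see the protocol_tester.cc hunk further down), each CU token
port gets its own TokenManager seeded with max_cu_tokens via
"new TokenManager(numCuTokens)". The self-contained toy below illustrates how
that one number turns into back-pressure; the class only mimics the
haveTokens/acquireTokens/recvTokens calls used by the tester, the sizes are
made up, and this is not the gem5 TokenManager.

    #include <cassert>
    #include <iostream>
    #include <vector>

    // Toy counting pool mimicking the TokenManager interface the tester uses.
    class ToyTokenManager
    {
      public:
        explicit ToyTokenManager(int init) : avail(init) {}
        bool haveTokens(int n) const { return avail >= n; }
        void acquireTokens(int n)    { assert(haveTokens(n)); avail -= n; }
        void recvTokens(int n)       { avail += n; } // coalescer drained work
      private:
        int avail;
    };

    int main()
    {
        // Illustrative sizes only (not the shipped defaults): one pool per CU,
        // each seeded with max_cu_tokens, as the tester's constructor does.
        const int numCus = 2;
        const int maxCuTokens = 128;
        std::vector<ToyTokenManager> perCuPool(numCus,
                                               ToyTokenManager(maxCuTokens));

        // A wavefront on CU 0 keeps issuing 64-lane actions until the pool
        // can no longer cover one -- the back-pressure point the parameter
        // description refers to.
        const int cost = 64;
        int issued = 0;
        while (perCuPool[0].haveTokens(cost)) {
            perCuPool[0].acquireTokens(cost);
            ++issued;
        }
        std::cout << "CU 0 issued " << issued << " actions before stalling\n";

        // Tokens flow back as the coalescer retires work, releasing the stall.
        perCuPool[0].recvTokens(cost);
        std::cout << "after a drain, ready again? "
                  << std::boolalpha << perCuPool[0].haveTokens(cost) << '\n';
        return 0;
    }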
index 7bf939b85b6f33dad7d5c3d6d6e1d69d93b7528c..fbf5d0d73eb9068edd896e8bc62265c30ba241ed 100644 (file)
@@ -125,11 +125,13 @@ GpuThread::scheduleDeadlockCheckEvent()
 void
 GpuThread::attachGpuThreadToPorts(ProtocolTester *_tester,
                             ProtocolTester::SeqPort *_port,
+                            ProtocolTester::GMTokenPort *_tokenPort,
                             ProtocolTester::SeqPort *_scalarPort,
                             ProtocolTester::SeqPort *_sqcPort)
 {
     tester = _tester;
     port = _port;
+    tokenPort = _tokenPort;
     scalarPort = _scalarPort;
     sqcPort = _sqcPort;
 
@@ -163,7 +165,8 @@ GpuThread::isNextActionReady()
                 // to complete
                 if (pendingLdStCount == 0 &&
                     pendingFenceCount == 0 &&
-                    pendingAtomicCount == 0) {
+                    pendingAtomicCount == 0 &&
+                    tokenPort->haveTokens(numLanes)) {
                     return true;
                 }
 
@@ -198,7 +201,8 @@ GpuThread::isNextActionReady()
                 assert(pendingAtomicCount == 0);
 
                 // can't issue if there is a pending fence
-                if (pendingFenceCount > 0) {
+                if (pendingFenceCount > 0 ||
+                    !tokenPort->haveTokens(numLanes)) {
                     return false;
                 }
 
@@ -241,6 +245,7 @@ GpuThread::issueNextAction()
 {
     switch(curAction->getType()) {
         case Episode::Action::Type::ATOMIC:
+            tokenPort->acquireTokens(numLanes);
             issueAtomicOps();
             break;
         case Episode::Action::Type::ACQUIRE:
@@ -250,9 +255,11 @@ GpuThread::issueNextAction()
             issueReleaseOp();
             break;
         case Episode::Action::Type::LOAD:
+            tokenPort->acquireTokens(numLanes);
             issueLoadOps();
             break;
         case Episode::Action::Type::STORE:
+            tokenPort->acquireTokens(numLanes);
             issueStoreOps();
             break;
         default:
index 9e4569b7a31a6bf812689595dd735093d8ab9405..00a69bec613db01c5b8f0aff57f3f055f82c2298 100644 (file)
@@ -42,6 +42,7 @@
 #include "cpu/testers/gpu_ruby_test/episode.hh"
 #include "cpu/testers/gpu_ruby_test/protocol_tester.hh"
 #include "gpu-compute/gpu_dyn_inst.hh"
+#include "mem/token_port.hh"
 #include "sim/clocked_object.hh"
 
 class GpuThread : public ClockedObject
@@ -61,6 +62,7 @@ class GpuThread : public ClockedObject
 
     void attachGpuThreadToPorts(ProtocolTester *_tester,
                              ProtocolTester::SeqPort *_port,
+                             ProtocolTester::GMTokenPort *_tokenPort = nullptr,
                              ProtocolTester::SeqPort *_sqcPort = nullptr,
                              ProtocolTester::SeqPort *_scalarPort = nullptr);
 
@@ -136,6 +138,7 @@ class GpuThread : public ClockedObject
     AddressManager *addrManager;
 
     ProtocolTester::SeqPort *port;       // main data port (GPU-vector data)
+    ProtocolTester::GMTokenPort *tokenPort;
     ProtocolTester::SeqPort *scalarPort; // nullptr for CPU
     ProtocolTester::SeqPort *sqcPort;   // nullptr for CPU
 
index 98eda4987b0897026088ed5222f5fc81269b3340..c4baa208a49e276f61e961c8239ecdda3c2f3b36 100644 (file)
@@ -53,10 +53,12 @@ ProtocolTester::ProtocolTester(const Params &p)
         numVectorPorts(p.port_cu_vector_ports_connection_count),
         numSqcPorts(p.port_cu_sqc_ports_connection_count),
         numScalarPorts(p.port_cu_scalar_ports_connection_count),
+        numTokenPorts(p.port_cu_token_ports_connection_count),
         numCusPerSqc(p.cus_per_sqc),
         numCusPerScalar(p.cus_per_scalar),
         numWfsPerCu(p.wavefronts_per_cu),
         numWisPerWf(p.workitems_per_wavefront),
+        numCuTokens(p.max_cu_tokens),
         numAtomicLocs(p.num_atomic_locations),
         numNormalLocsPerAtomic(p.num_normal_locs_per_atomic),
         episodeLength(p.episode_length),
@@ -107,6 +109,14 @@ ProtocolTester::ProtocolTester(const Params &p)
         idx++;
     }
 
+    for (int i = 0; i < numTokenPorts; ++i) {
+        cuTokenPorts.push_back(new GMTokenPort(csprintf("%s-cuTokenPort%d",
+                                                        name(), i),
+                                               this, i));
+        cuTokenManagers.push_back(new TokenManager(numCuTokens));
+        cuTokenPorts[i]->setTokenManager(cuTokenManagers[i]);
+    }
+
     // create an address manager
     addrManager = new AddressManager(numAtomicLocs,
                                        numNormalLocsPerAtomic);
@@ -194,6 +204,7 @@ ProtocolTester::init()
             wfId = cu_id * numWfsPerCu + i;
             wfs[wfId]->attachGpuThreadToPorts(this,
                            static_cast<SeqPort*>(cuVectorPorts[vectorPortId]),
+                           cuTokenPorts[vectorPortId],
                            static_cast<SeqPort*>(cuSqcPorts[sqcPortId]),
                            static_cast<SeqPort*>(cuScalarPorts[scalarPortId]));
             wfs[wfId]->scheduleWakeup();
@@ -206,7 +217,8 @@ Port&
 ProtocolTester::getPort(const std::string &if_name, PortID idx)
 {
     if (if_name != "cpu_ports" && if_name != "cu_vector_ports" &&
-        if_name != "cu_sqc_ports" && if_name != "cu_scalar_ports") {
+        if_name != "cu_sqc_ports" && if_name != "cu_scalar_ports" &&
+        if_name != "cu_token_ports") {
         // pass along to super class
         return ClockedObject::getPort(if_name, idx);
     } else {
@@ -222,6 +234,10 @@ ProtocolTester::getPort(const std::string &if_name, PortID idx)
             if (idx > numSqcPorts)
                 panic("ProtocolTester: unknown cu sqc port %d\n", idx);
             return *cuSqcPorts[idx];
+        } else if (if_name == "cu_token_ports") {
+            if (idx > numTokenPorts)
+                panic("ProtocolTester: unknown cu token port %d\n", idx);
+            return *cuTokenPorts[idx];
         } else {
             assert(if_name == "cu_scalar_ports");
             if (idx > numScalarPorts)
index c1f2997f7b3959ba0e14c02540ab0e72045b294b..6109e5addee988928d8e87c4690296c770bb8d52 100644 (file)
@@ -58,6 +58,7 @@
 #include "cpu/testers/gpu_ruby_test/address_manager.hh"
 #include "mem/packet.hh"
 #include "mem/ruby/system/RubyPort.hh"
+#include "mem/token_port.hh"
 #include "params/ProtocolTester.hh"
 
 class GpuThread;
@@ -81,6 +82,20 @@ class ProtocolTester : public ClockedObject
             { panic("%s does not expect a retry\n", name()); }
     };
 
+    class GMTokenPort : public TokenRequestPort
+    {
+        public:
+            GMTokenPort(const std::string& name, ProtocolTester *_tester,
+                        PortID id = InvalidPortID)
+                : TokenRequestPort(name, _tester, id)
+            {}
+            ~GMTokenPort() {}
+
+        protected:
+            bool recvTimingResp(PacketPtr) { return false; }
+            void recvReqRetry() {}
+    };
+
     struct SenderState : public Packet::SenderState
     {
         GpuThread* th;
@@ -131,10 +146,12 @@ class ProtocolTester : public ClockedObject
     int numVectorPorts;
     int numSqcPorts;
     int numScalarPorts;
+    int numTokenPorts;
     int numCusPerSqc;
     int numCusPerScalar;
     int numWfsPerCu;
     int numWisPerWf;
+    int numCuTokens;
     // parameters controlling the address range that the tester can access
     int numAtomicLocs;
     int numNormalLocsPerAtomic;
@@ -150,6 +167,8 @@ class ProtocolTester : public ClockedObject
     std::vector<RequestPort*> cuVectorPorts; // ports to GPU vector cache
     std::vector<RequestPort*> cuSqcPorts;    // ports to GPU inst cache
     std::vector<RequestPort*> cuScalarPorts; // ports to GPU scalar cache
+    std::vector<TokenManager*> cuTokenManagers;
+    std::vector<GMTokenPort*> cuTokenPorts;
     // all CPU and GPU threads
     std::vector<CpuThread*> cpuThreads;
     std::vector<GpuWavefront*> wfs;
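
A note on the new GMTokenPort above: recvTimingResp() always returns false and
recvReqRetry() is empty because no data packets ever travel over this port;
loads, stores, and atomics still use the SeqPorts, and the token port exists
only so each thread can query and consume the per-CU token pool. The sketch
below models that arrangement. It is a simplified, self-contained toy; the
real behavior lives in gem5's mem/token_port.hh, and the delegation of
haveTokens()/acquireTokens() to the TokenManager bound via setTokenManager()
is an assumption drawn from how the tester calls them, not a quote of that
file.

    #include <cassert>
    #include <iostream>

    // Minimal stand-in for the TokenManager the tester news up per CU.
    struct ToyTokenManager
    {
        explicit ToyTokenManager(int n) : avail(n) {}
        bool haveTokens(int n) const { return avail >= n; }
        void acquireTokens(int n)    { assert(haveTokens(n)); avail -= n; }
        int avail;
    };

    // Model of the tester's GMTokenPort: the packet-related hooks are inert
    // because the port never sees data traffic; everything useful goes
    // through the manager bound with setTokenManager().
    class ToyGMTokenPort
    {
      public:
        void setTokenManager(ToyTokenManager *mgr) { tokenManager = mgr; }

        // Assumed to forward to the manager, mirroring how GpuThread uses them.
        bool haveTokens(int n)    { return tokenManager->haveTokens(n); }
        void acquireTokens(int n) { tokenManager->acquireTokens(n); }

        // Counterparts of the stubs in the diff above.
        bool recvTimingResp()     { return false; } // no responses expected
        void recvReqRetry()       {}                // nothing to retry

      private:
        ToyTokenManager *tokenManager = nullptr;
    };

    int main()
    {
        // Mirrors the per-CU wiring in ProtocolTester's constructor.
        ToyTokenManager manager(/* max_cu_tokens */ 64);
        ToyGMTokenPort tokenPort;
        tokenPort.setTokenManager(&manager);

        std::cout << std::boolalpha
                  << tokenPort.haveTokens(64) << '\n'; // true: may issue
        tokenPort.acquireTokens(64);
        std::cout << tokenPort.haveTokens(64) << '\n'; // false: back-pressured
        return 0;
    }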