From dfc64880fe7450cf277d555ba5a8d0ece8e2013a Mon Sep 17 00:00:00 2001 From: Kyle Roarty Date: Thu, 24 Sep 2020 21:50:58 -0500 Subject: [PATCH] configs,tests: Add tokens to GPU VIPER tester This patch integrates tokens into the VIPER tester by adding a GMTokenPort to the tester, having the tester acquire tokens for requests that use tokens, and checking for available tokens before issuing any requests. Change-Id: Id317d703e4765dd5fa7de0d16f5eb595aab7096c Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/35135 Maintainer: Matthew Poremba Maintainer: Matt Sinclair Reviewed-by: Matt Sinclair Tested-by: kokoro --- .../testers/gpu_ruby_test/ProtocolTester.py | 6 ++++++ src/cpu/testers/gpu_ruby_test/gpu_thread.cc | 11 +++++++++-- src/cpu/testers/gpu_ruby_test/gpu_thread.hh | 3 +++ .../testers/gpu_ruby_test/protocol_tester.cc | 18 +++++++++++++++++- .../testers/gpu_ruby_test/protocol_tester.hh | 19 +++++++++++++++++++ 5 files changed, 54 insertions(+), 3 deletions(-) diff --git a/src/cpu/testers/gpu_ruby_test/ProtocolTester.py b/src/cpu/testers/gpu_ruby_test/ProtocolTester.py index e6874abbb..ed0e0a88e 100644 --- a/src/cpu/testers/gpu_ruby_test/ProtocolTester.py +++ b/src/cpu/testers/gpu_ruby_test/ProtocolTester.py @@ -41,6 +41,7 @@ class ProtocolTester(ClockedObject): cu_vector_ports = VectorRequestPort("Vector ports for GPUs") cu_sqc_ports = VectorRequestPort("SQC ports for GPUs") cu_scalar_ports = VectorRequestPort("Scalar ports for GPUs") + cu_token_ports = VectorRequestPort("Token ports for GPU") cus_per_sqc = Param.Int(4, "Number of CUs per SQC") cus_per_scalar = Param.Int(4, "Number of CUs per scalar cache") @@ -48,6 +49,11 @@ class ProtocolTester(ClockedObject): wavefronts_per_cu = Param.Int(1, "Number of wavefronts per CU") workitems_per_wavefront = Param.Int(64, "Number of workitems per wf") + max_cu_tokens = Param.Int(4, "Maximum number of tokens, i.e., the number" + " of instructions that can be uncoalesced" + " before back-pressure occurs from the" + " coalescer.") + cpu_threads = VectorParam.CpuThread("All cpus") wavefronts = VectorParam.GpuWavefront("All wavefronts") diff --git a/src/cpu/testers/gpu_ruby_test/gpu_thread.cc b/src/cpu/testers/gpu_ruby_test/gpu_thread.cc index 7bf939b85..fbf5d0d73 100644 --- a/src/cpu/testers/gpu_ruby_test/gpu_thread.cc +++ b/src/cpu/testers/gpu_ruby_test/gpu_thread.cc @@ -125,11 +125,13 @@ GpuThread::scheduleDeadlockCheckEvent() void GpuThread::attachGpuThreadToPorts(ProtocolTester *_tester, ProtocolTester::SeqPort *_port, + ProtocolTester::GMTokenPort *_tokenPort, ProtocolTester::SeqPort *_scalarPort, ProtocolTester::SeqPort *_sqcPort) { tester = _tester; port = _port; + tokenPort = _tokenPort; scalarPort = _scalarPort; sqcPort = _sqcPort; @@ -163,7 +165,8 @@ GpuThread::isNextActionReady() // to complete if (pendingLdStCount == 0 && pendingFenceCount == 0 && - pendingAtomicCount == 0) { + pendingAtomicCount == 0 && + tokenPort->haveTokens(numLanes)) { return true; } @@ -198,7 +201,8 @@ GpuThread::isNextActionReady() assert(pendingAtomicCount == 0); // can't issue if there is a pending fence - if (pendingFenceCount > 0) { + if (pendingFenceCount > 0 || + !tokenPort->haveTokens(numLanes)) { return false; } @@ -241,6 +245,7 @@ GpuThread::issueNextAction() { switch(curAction->getType()) { case Episode::Action::Type::ATOMIC: + tokenPort->acquireTokens(numLanes); issueAtomicOps(); break; case Episode::Action::Type::ACQUIRE: @@ -250,9 +255,11 @@ GpuThread::issueNextAction() issueReleaseOp(); break; case Episode::Action::Type::LOAD: + tokenPort->acquireTokens(numLanes); issueLoadOps(); break; case Episode::Action::Type::STORE: + tokenPort->acquireTokens(numLanes); issueStoreOps(); break; default: diff --git a/src/cpu/testers/gpu_ruby_test/gpu_thread.hh b/src/cpu/testers/gpu_ruby_test/gpu_thread.hh index 9e4569b7a..00a69bec6 100644 --- a/src/cpu/testers/gpu_ruby_test/gpu_thread.hh +++ b/src/cpu/testers/gpu_ruby_test/gpu_thread.hh @@ -42,6 +42,7 @@ #include "cpu/testers/gpu_ruby_test/episode.hh" #include "cpu/testers/gpu_ruby_test/protocol_tester.hh" #include "gpu-compute/gpu_dyn_inst.hh" +#include "mem/token_port.hh" #include "sim/clocked_object.hh" class GpuThread : public ClockedObject @@ -61,6 +62,7 @@ class GpuThread : public ClockedObject void attachGpuThreadToPorts(ProtocolTester *_tester, ProtocolTester::SeqPort *_port, + ProtocolTester::GMTokenPort *_tokenPort = nullptr, ProtocolTester::SeqPort *_sqcPort = nullptr, ProtocolTester::SeqPort *_scalarPort = nullptr); @@ -136,6 +138,7 @@ class GpuThread : public ClockedObject AddressManager *addrManager; ProtocolTester::SeqPort *port; // main data port (GPU-vector data) + ProtocolTester::GMTokenPort *tokenPort; ProtocolTester::SeqPort *scalarPort; // nullptr for CPU ProtocolTester::SeqPort *sqcPort; // nullptr for CPU diff --git a/src/cpu/testers/gpu_ruby_test/protocol_tester.cc b/src/cpu/testers/gpu_ruby_test/protocol_tester.cc index 98eda4987..c4baa208a 100644 --- a/src/cpu/testers/gpu_ruby_test/protocol_tester.cc +++ b/src/cpu/testers/gpu_ruby_test/protocol_tester.cc @@ -53,10 +53,12 @@ ProtocolTester::ProtocolTester(const Params &p) numVectorPorts(p.port_cu_vector_ports_connection_count), numSqcPorts(p.port_cu_sqc_ports_connection_count), numScalarPorts(p.port_cu_scalar_ports_connection_count), + numTokenPorts(p.port_cu_token_ports_connection_count), numCusPerSqc(p.cus_per_sqc), numCusPerScalar(p.cus_per_scalar), numWfsPerCu(p.wavefronts_per_cu), numWisPerWf(p.workitems_per_wavefront), + numCuTokens(p.max_cu_tokens), numAtomicLocs(p.num_atomic_locations), numNormalLocsPerAtomic(p.num_normal_locs_per_atomic), episodeLength(p.episode_length), @@ -107,6 +109,14 @@ ProtocolTester::ProtocolTester(const Params &p) idx++; } + for (int i = 0; i < numTokenPorts; ++i) { + cuTokenPorts.push_back(new GMTokenPort(csprintf("%s-cuTokenPort%d", + name(), i), + this, i)); + cuTokenManagers.push_back(new TokenManager(numCuTokens)); + cuTokenPorts[i]->setTokenManager(cuTokenManagers[i]); + } + // create an address manager addrManager = new AddressManager(numAtomicLocs, numNormalLocsPerAtomic); @@ -194,6 +204,7 @@ ProtocolTester::init() wfId = cu_id * numWfsPerCu + i; wfs[wfId]->attachGpuThreadToPorts(this, static_cast(cuVectorPorts[vectorPortId]), + cuTokenPorts[vectorPortId], static_cast(cuSqcPorts[sqcPortId]), static_cast(cuScalarPorts[scalarPortId])); wfs[wfId]->scheduleWakeup(); @@ -206,7 +217,8 @@ Port& ProtocolTester::getPort(const std::string &if_name, PortID idx) { if (if_name != "cpu_ports" && if_name != "cu_vector_ports" && - if_name != "cu_sqc_ports" && if_name != "cu_scalar_ports") { + if_name != "cu_sqc_ports" && if_name != "cu_scalar_ports" && + if_name != "cu_token_ports") { // pass along to super class return ClockedObject::getPort(if_name, idx); } else { @@ -222,6 +234,10 @@ ProtocolTester::getPort(const std::string &if_name, PortID idx) if (idx > numSqcPorts) panic("ProtocolTester: unknown cu sqc port %d\n", idx); return *cuSqcPorts[idx]; + } else if (if_name == "cu_token_ports") { + if (idx > numTokenPorts) + panic("ProtocolTester: unknown cu token port %d\n", idx); + return *cuTokenPorts[idx]; } else { assert(if_name == "cu_scalar_ports"); if (idx > numScalarPorts) diff --git a/src/cpu/testers/gpu_ruby_test/protocol_tester.hh b/src/cpu/testers/gpu_ruby_test/protocol_tester.hh index c1f2997f7..6109e5add 100644 --- a/src/cpu/testers/gpu_ruby_test/protocol_tester.hh +++ b/src/cpu/testers/gpu_ruby_test/protocol_tester.hh @@ -58,6 +58,7 @@ #include "cpu/testers/gpu_ruby_test/address_manager.hh" #include "mem/packet.hh" #include "mem/ruby/system/RubyPort.hh" +#include "mem/token_port.hh" #include "params/ProtocolTester.hh" class GpuThread; @@ -81,6 +82,20 @@ class ProtocolTester : public ClockedObject { panic("%s does not expect a retry\n", name()); } }; + class GMTokenPort : public TokenRequestPort + { + public: + GMTokenPort(const std::string& name, ProtocolTester *_tester, + PortID id = InvalidPortID) + : TokenRequestPort(name, _tester, id) + {} + ~GMTokenPort() {} + + protected: + bool recvTimingResp(PacketPtr) { return false; } + void recvReqRetry() {} + }; + struct SenderState : public Packet::SenderState { GpuThread* th; @@ -131,10 +146,12 @@ class ProtocolTester : public ClockedObject int numVectorPorts; int numSqcPorts; int numScalarPorts; + int numTokenPorts; int numCusPerSqc; int numCusPerScalar; int numWfsPerCu; int numWisPerWf; + int numCuTokens; // parameters controlling the address range that the tester can access int numAtomicLocs; int numNormalLocsPerAtomic; @@ -150,6 +167,8 @@ class ProtocolTester : public ClockedObject std::vector cuVectorPorts; // ports to GPU vector cache std::vector cuSqcPorts; // ports to GPU inst cache std::vector cuScalarPorts; // ports to GPU scalar cache + std::vector cuTokenManagers; + std::vector cuTokenPorts; // all CPU and GPU threads std::vector cpuThreads; std::vector wfs; -- 2.30.2