cu_vector_ports = VectorRequestPort("Vector ports for GPUs")
cu_sqc_ports = VectorRequestPort("SQC ports for GPUs")
cu_scalar_ports = VectorRequestPort("Scalar ports for GPUs")
+ cu_token_ports = VectorRequestPort("Token ports for GPU")
cus_per_sqc = Param.Int(4, "Number of CUs per SQC")
cus_per_scalar = Param.Int(4, "Number of CUs per scalar cache")
wavefronts_per_cu = Param.Int(1, "Number of wavefronts per CU")
workitems_per_wavefront = Param.Int(64, "Number of workitems per wf")
+ max_cu_tokens = Param.Int(4, "Maximum number of tokens, i.e., the number"
+ " of instructions that can be uncoalesced"
+ " before back-pressure occurs from the"
+ " coalescer.")
+
cpu_threads = VectorParam.CpuThread("All cpus")
wavefronts = VectorParam.GpuWavefront("All wavefronts")
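# Sketch, not part of this patch: a config script is expected to pair each
# token port with the coalescer behind the matching vector port, roughly:
#     tester.cu_vector_ports = ruby_port.in_ports
#     tester.cu_token_ports = ruby_port.gmTokenPort
# The ruby_port, in_ports, and gmTokenPort names are assumptions about the
# Ruby GPU coalescer objects, not something defined by this change.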
void
GpuThread::attachGpuThreadToPorts(ProtocolTester *_tester,
ProtocolTester::SeqPort *_port,
+ ProtocolTester::GMTokenPort *_tokenPort,
ProtocolTester::SeqPort *_sqcPort,
ProtocolTester::SeqPort *_scalarPort)
{
tester = _tester;
port = _port;
+ tokenPort = _tokenPort;
scalarPort = _scalarPort;
sqcPort = _sqcPort;
// an atomic action must wait for all previous requests
// to complete
if (pendingLdStCount == 0 &&
pendingFenceCount == 0 &&
- pendingAtomicCount == 0) {
+ pendingAtomicCount == 0 &&
+ tokenPort->haveTokens(numLanes)) {
return true;
}
assert(pendingAtomicCount == 0);
- // can't issue if there is a pending fence
+ // can't issue if there is a pending fence or not enough tokens
- if (pendingFenceCount > 0) {
+ if (pendingFenceCount > 0 ||
+ !tokenPort->haveTokens(numLanes)) {
return false;
}
{
switch(curAction->getType()) {
case Episode::Action::Type::ATOMIC:
+ tokenPort->acquireTokens(numLanes);
issueAtomicOps();
break;
case Episode::Action::Type::ACQUIRE:
issueAcquireOp();
break;
case Episode::Action::Type::RELEASE:
issueReleaseOp();
break;
case Episode::Action::Type::LOAD:
+ tokenPort->acquireTokens(numLanes);
issueLoadOps();
break;
case Episode::Action::Type::STORE:
+ tokenPort->acquireTokens(numLanes);
issueStoreOps();
break;
default:
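// Sketch, not part of this change: the peer of each tester token port is a
// token response port on the coalescer side. The tester acquires numLanes
// tokens (one per lane) before each vector memory action above; the
// coalescer is expected to hand them back once the corresponding requests
// are handled, roughly via
//     gmTokenPort.sendTokens(num_lanes);
// so the tokens return to the TokenManager that ProtocolTester creates below.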
#include "cpu/testers/gpu_ruby_test/episode.hh"
#include "cpu/testers/gpu_ruby_test/protocol_tester.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
+#include "mem/token_port.hh"
#include "sim/clocked_object.hh"
class GpuThread : public ClockedObject
void attachGpuThreadToPorts(ProtocolTester *_tester,
ProtocolTester::SeqPort *_port,
+ ProtocolTester::GMTokenPort *_tokenPort = nullptr,
ProtocolTester::SeqPort *_sqcPort = nullptr,
ProtocolTester::SeqPort *_scalarPort = nullptr);
AddressManager *addrManager;
ProtocolTester::SeqPort *port; // main data port (GPU-vector data)
+ ProtocolTester::GMTokenPort *tokenPort;
ProtocolTester::SeqPort *scalarPort; // nullptr for CPU
ProtocolTester::SeqPort *sqcPort; // nullptr for CPU
numVectorPorts(p.port_cu_vector_ports_connection_count),
numSqcPorts(p.port_cu_sqc_ports_connection_count),
numScalarPorts(p.port_cu_scalar_ports_connection_count),
+ numTokenPorts(p.port_cu_token_ports_connection_count),
numCusPerSqc(p.cus_per_sqc),
numCusPerScalar(p.cus_per_scalar),
numWfsPerCu(p.wavefronts_per_cu),
numWisPerWf(p.workitems_per_wavefront),
+ numCuTokens(p.max_cu_tokens),
numAtomicLocs(p.num_atomic_locations),
numNormalLocsPerAtomic(p.num_normal_locs_per_atomic),
episodeLength(p.episode_length),
idx++;
}
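+ // create one token port per connected cu_token_port; each is backed by its
+ // own TokenManager seeded with max_cu_tokens tokens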
+ for (int i = 0; i < numTokenPorts; ++i) {
+ cuTokenPorts.push_back(new GMTokenPort(csprintf("%s-cuTokenPort%d",
+ name(), i),
+ this, i));
+ cuTokenManagers.push_back(new TokenManager(numCuTokens));
+ cuTokenPorts[i]->setTokenManager(cuTokenManagers[i]);
+ }
+
// create an address manager
addrManager = new AddressManager(numAtomicLocs,
numNormalLocsPerAtomic);
wfId = cu_id * numWfsPerCu + i;
wfs[wfId]->attachGpuThreadToPorts(this,
static_cast<SeqPort*>(cuVectorPorts[vectorPortId]),
+ cuTokenPorts[vectorPortId],
static_cast<SeqPort*>(cuSqcPorts[sqcPortId]),
static_cast<SeqPort*>(cuScalarPorts[scalarPortId]));
wfs[wfId]->scheduleWakeup();
ProtocolTester::getPort(const std::string &if_name, PortID idx)
{
if (if_name != "cpu_ports" && if_name != "cu_vector_ports" &&
- if_name != "cu_sqc_ports" && if_name != "cu_scalar_ports") {
+ if_name != "cu_sqc_ports" && if_name != "cu_scalar_ports" &&
+ if_name != "cu_token_ports") {
// pass along to super class
return ClockedObject::getPort(if_name, idx);
} else {
if (idx > numSqcPorts)
panic("ProtocolTester: unknown cu sqc port %d\n", idx);
return *cuSqcPorts[idx];
+ } else if (if_name == "cu_token_ports") {
+ if (idx > numTokenPorts)
+ panic("ProtocolTester: unknown cu token port %d\n", idx);
+ return *cuTokenPorts[idx];
} else {
assert(if_name == "cu_scalar_ports");
if (idx > numScalarPorts)
#include "cpu/testers/gpu_ruby_test/address_manager.hh"
#include "mem/packet.hh"
#include "mem/ruby/system/RubyPort.hh"
+#include "mem/token_port.hh"
#include "params/ProtocolTester.hh"
class GpuThread;
{ panic("%s does not expect a retry\n", name()); }
};
+ class GMTokenPort : public TokenRequestPort
+ {
+ public:
+ GMTokenPort(const std::string& name, ProtocolTester *_tester,
+ PortID id = InvalidPortID)
+ : TokenRequestPort(name, _tester, id)
+ {}
+ ~GMTokenPort() {}
+
+ protected:
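+ // Tokens travel through the TokenManager rather than as packets, so
+ // this port never expects a timing response or a retry.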
+ bool recvTimingResp(PacketPtr) { return false; }
+ void recvReqRetry() {}
+ };
+
struct SenderState : public Packet::SenderState
{
GpuThread* th;
int numVectorPorts;
int numSqcPorts;
int numScalarPorts;
+ int numTokenPorts;
int numCusPerSqc;
int numCusPerScalar;
int numWfsPerCu;
int numWisPerWf;
+ int numCuTokens;
// parameters controlling the address range that the tester can access
int numAtomicLocs;
int numNormalLocsPerAtomic;
std::vector<RequestPort*> cuVectorPorts; // ports to GPU vector cache
std::vector<RequestPort*> cuSqcPorts; // ports to GPU inst cache
std::vector<RequestPort*> cuScalarPorts; // ports to GPU scalar cache
+ std::vector<TokenManager*> cuTokenManagers;
+ std::vector<GMTokenPort*> cuTokenPorts;
// all CPU and GPU threads
std::vector<CpuThread*> cpuThreads;
std::vector<GpuWavefront*> wfs;