action(w_sendResponseWBAck, "w", desc="send WB Ack") {
peek(responseFromNB_in, ResponseMsg) {
- enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
- out_msg.addr := address;
- out_msg.Type := CoherenceResponseType:TDSysWBAck;
- out_msg.Destination.clear();
- out_msg.Destination.add(in_msg.WTRequestor);
- out_msg.Sender := machineID;
- out_msg.MessageSize := MessageSizeType:Writeback_Control;
- }
+ enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:TDSysWBAck;
+ out_msg.Destination.clear();
+ out_msg.Destination.add(in_msg.WTRequestor);
+ out_msg.Sender := machineID;
+ out_msg.MessageSize := MessageSizeType:Writeback_Control;
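+ // carry the write's instruction sequence number back so the requesting TCP can match this ack to its pending write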
+ out_msg.instSeqNum := in_msg.instSeqNum;
+ }
}
}
out_msg.Destination.add(in_msg.Requestor);
out_msg.Sender := machineID;
out_msg.MessageSize := MessageSizeType:Writeback_Control;
+ out_msg.instSeqNum := in_msg.instSeqNum;
}
}
}
out_msg.Dirty := true;
out_msg.DataBlk := in_msg.DataBlk;
out_msg.writeMask.orMask(in_msg.writeMask);
+ out_msg.instSeqNum := in_msg.instSeqNum;
}
}
}
state_declaration(State, desc="TCP Cache States", default="TCP_State_I") {
I, AccessPermission:Invalid, desc="Invalid";
V, AccessPermission:Read_Only, desc="Valid";
- W, AccessPermission:Read_Write, desc="Written";
- M, AccessPermission:Read_Write, desc="Written and Valid";
- L, AccessPermission:Read_Write, desc="Local access is modifable";
A, AccessPermission:Invalid, desc="Waiting on Atomic";
}
Load, desc="Load";
Store, desc="Store to L1 (L1 is dirty)";
StoreThrough, desc="Store directly to L2(L1 is clean)";
- StoreLocal, desc="Store to L1 but L1 is clean";
Atomic, desc="Atomic";
Flush, desc="Flush if dirty(wbL1 for Store Release)";
Evict, desc="Evict if clean(invL1 for Load Acquire)";
if (in_msg.Type == CoherenceResponseType:TDSysResp) {
// disable L1 cache
if (disableL1) {
- trigger(Event:Bypass, in_msg.addr, cache_entry, tbe);
+ trigger(Event:Bypass, in_msg.addr, cache_entry, tbe);
} else {
if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.addr)) {
trigger(Event:TCC_Ack, in_msg.addr, cache_entry, tbe);
DPRINTF(RubySlicc, "%s\n", in_msg);
if (in_msg.Type == RubyRequestType:LD) {
trigger(Event:Load, in_msg.LineAddress, cache_entry, tbe);
- } else if (in_msg.Type == RubyRequestType:ATOMIC) {
+ } else if (in_msg.Type == RubyRequestType:ATOMIC ||
+ in_msg.Type == RubyRequestType:ATOMIC_RETURN ||
+ in_msg.Type == RubyRequestType:ATOMIC_NO_RETURN) {
trigger(Event:Atomic, in_msg.LineAddress, cache_entry, tbe);
} else if (in_msg.Type == RubyRequestType:ST) {
if(disableL1) {
trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe);
} else {
if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.LineAddress)) {
- if (WB) {
- trigger(Event:Store, in_msg.LineAddress, cache_entry, tbe);
- } else {
- trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe);
- }
+ trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe);
} else {
Addr victim := L1cache.cacheProbe(in_msg.LineAddress);
trigger(Event:Repl, victim, getCacheEntry(victim), TBEs.lookup(victim));
trigger(Event:Evict, in_msg.LineAddress, cache_entry, tbe);
} else {
error("Unexpected Request Message from VIC");
- if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.LineAddress)) {
- if (WB) {
- trigger(Event:Store, in_msg.LineAddress, cache_entry, tbe);
- } else {
- trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe);
- }
- } else {
- Addr victim := L1cache.cacheProbe(in_msg.LineAddress);
- trigger(Event:Repl, victim, getCacheEntry(victim), TBEs.lookup(victim));
- }
}
}
}
out_msg.Type := CoherenceRequestType:WriteThrough;
out_msg.InitialRequestTime := curCycle();
out_msg.Shared := false;
+
+ // forward inst sequence number to lower TCC
+ peek(mandatoryQueue_in, RubyRequest) {
+ out_msg.instSeqNum := in_msg.instSeqNum;
+ }
}
}
}
}
+ action(ad_atomicDone, "ad", desc="atomic done") {
+ assert(is_valid(cache_entry));
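+ // hand the atomic result data held in this cache line back to the coalescer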
+ coalescer.atomicCallback(address, MachineType:L1Cache, cache_entry.DataBlk);
+ }
+
action(s_storeDone, "s", desc="local store done") {
assert(is_valid(cache_entry));
DPRINTF(RubySlicc, "Sequencer does not define invCallback!\n");
assert(false);
} else {
- coalescer.invCallback(address);
- }
- }
-
- action(wb_wbDone, "wb", desc="local wb done") {
- if (inFlush == true) {
- Fcnt := Fcnt + 1;
- if (Fcnt > WTcnt) {
- if (use_seq_not_coal) {
- DPRINTF(RubySlicc, "Sequencer does not define wbCallback!\n");
- assert(false);
- } else {
- coalescer.wbCallback(address);
- }
- Fcnt := Fcnt - 1;
- }
- if (WTcnt == 0 && Fcnt == 0) {
- inFlush := false;
- APPEND_TRANSITION_COMMENT(" inFlush is false");
- }
+ coalescer.invTCPCallback(address);
}
}
action(wd_wtDone, "wd", desc="writethrough done") {
- WTcnt := WTcnt - 1;
- if (inFlush == true) {
- Fcnt := Fcnt -1;
+ if (use_seq_not_coal) {
+ DPRINTF(RubySlicc, "Sequencer does not define writeCompleteCallback!\n");
+ assert(false);
+ } else {
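+ // pass the completed write's instruction sequence number to the coalescer so it can release the matching write-complete response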
+ peek(responseToTCP_in, ResponseMsg) {
+ coalescer.writeCompleteCallback(address, in_msg.instSeqNum);
+ }
}
- assert(WTcnt >= 0);
- APPEND_TRANSITION_COMMENT("write-- = ");
- APPEND_TRANSITION_COMMENT(WTcnt);
}
action(dw_dirtyWrite, "dw", desc="update write mask"){
// Stalling transitions do NOT check the tag array...and if they do,
// they can cause a resource stall deadlock!
- transition({A}, {Load, Store, Atomic, StoreThrough}) { //TagArrayRead} {
+ transition({A}, {Load, Atomic, StoreThrough}) { //TagArrayRead} {
z_stall;
}
- transition({M, V, L}, Load) {TagArrayRead, DataArrayRead} {
- l_loadDone;
- mru_updateMRU;
- p_popMandatoryQueue;
- }
-
transition(I, Load) {TagArrayRead} {
n_issueRdBlk;
p_popMandatoryQueue;
}
- transition({V, I}, Atomic, A) {TagArrayRead, TagArrayWrite} {
- t_allocateTBE;
+ transition(V, Load) {TagArrayRead, DataArrayRead} {
+ l_loadDone;
mru_updateMRU;
- at_atomicThrough;
p_popMandatoryQueue;
}
- transition({M, W}, Atomic, A) {TagArrayRead, TagArrayWrite} {
- wt_writeThrough;
+ transition({V, I}, Atomic, A) {TagArrayRead, TagArrayWrite} {
t_allocateTBE;
- at_atomicThrough;
- ic_invCache;
- }
-
- transition(W, Load, I) {TagArrayRead, DataArrayRead} {
- wt_writeThrough;
- norl_issueRdBlkOrloadDone;
- p_popMandatoryQueue;
- }
-
- transition({I}, StoreLocal, L) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
- a_allocate;
- dw_dirtyWrite;
- s_storeDone;
- p_popMandatoryQueue;
- }
-
- transition({L, V}, StoreLocal, L) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
- dw_dirtyWrite;
mru_updateMRU;
- s_storeDone;
- p_popMandatoryQueue;
- }
-
- transition(I, Store, W) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
- a_allocate;
- dw_dirtyWrite;
- s_storeDone;
- p_popMandatoryQueue;
- }
-
- transition(V, Store, M) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
- dw_dirtyWrite;
- mru_updateMRU;
- s_storeDone;
- p_popMandatoryQueue;
- }
-
- transition({M, W}, Store) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
- dw_dirtyWrite;
- mru_updateMRU;
- s_storeDone;
+ at_atomicThrough;
p_popMandatoryQueue;
}
- //M,W should not see storeThrough
transition(I, StoreThrough) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
a_allocate;
dw_dirtyWrite;
p_popMandatoryQueue;
}
- transition({V,L}, StoreThrough, I) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
+ transition(V, StoreThrough, I) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
dw_dirtyWrite;
s_storeDone;
wt_writeThrough;
d_deallocateTBE;
a_allocate;
w_writeCache;
- s_storeDone;
+ ad_atomicDone;
pr_popResponseQueue;
ic_invCache;
}
pr_popResponseQueue;
}
- transition({W, M}, TCC_Ack, M) {TagArrayRead, TagArrayWrite, DataArrayRead, DataArrayWrite} {
- w_writeCache;
- l_loadDone;
- pr_popResponseQueue;
- }
-
transition({I, V}, Repl, I) {TagArrayRead, TagArrayWrite} {
ic_invCache;
}
ic_invCache;
}
- transition({W, M}, Repl, I) {TagArrayRead, TagArrayWrite, DataArrayRead} {
- wt_writeThrough;
- ic_invCache;
- }
-
- transition(L, Repl, I) {TagArrayRead, TagArrayWrite, DataArrayRead} {
- wt_writeThrough;
- ic_invCache;
- }
-
- transition({W, M}, Flush, I) {TagArrayRead, TagArrayWrite, DataArrayRead} {
+ transition({V, I, A}, Flush) {TagArrayFlash} {
sf_setFlush;
- wt_writeThrough;
- ic_invCache;
- p_popMandatoryQueue;
- }
-
- transition({V, I, A, L},Flush) {TagArrayFlash} {
- sf_setFlush;
- wb_wbDone;
p_popMandatoryQueue;
}
ic_invCache;
}
- transition({W, M}, Evict, W) {TagArrayFlash} {
- inv_invDone;
- p_popMandatoryQueue;
- }
-
- transition({A, L}, Evict) {TagArrayFlash} {
+ transition(A, Evict) {TagArrayFlash} {
inv_invDone;
p_popMandatoryQueue;
}
// TCC_AckWB only snoops TBE
- transition({V, I, A, M, W, L}, TCC_AckWB) {
+ transition({V, I, A}, TCC_AckWB) {
wd_wtDone;
- wb_wbDone;
pr_popResponseQueue;
}
}
Cycles, Cycles, Cycles);
void writeCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles, bool);
- void invCallback(Addr);
- void wbCallback(Addr);
+ void atomicCallback(Addr, MachineType, DataBlock);
+ void invTCPCallback(Addr);
+ void writeCompleteCallback(Addr, uint64_t);
void evictionCallback(Addr);
}
out_msg.InitialRequestTime := in_msg.InitialRequestTime;
out_msg.ForwardRequestTime := curCycle();
out_msg.ProbeRequestStartTime := curCycle();
+ out_msg.instSeqNum := in_msg.instSeqNum;
}
}
}
WriteMask writeMask, desc="Write Through Data";
MachineID WTRequestor, desc="Node who initiated the write through";
int wfid, default="0", desc="wavefront id";
+ uint64_t instSeqNum, desc="instruction sequence number";
bool NoWriteConflict, default="true", desc="write collided with CAB entry";
int ProgramCounter, desc="PC that accesses to this block";
MessageSizeType MessageSize, desc="size category of the message";
int Phase, desc="Synchronization Phase";
int wfid, desc="wavefront id for Release";
+ uint64_t instSeqNum, desc="instruction sequence number";
MachineID Requestor, desc="Node who initiated the request";
bool functionalRead(Packet *pkt) {
bool NoAckNeeded, default="false", desc="For short circuting acks";
bool isValid, default="false", desc="Is acked block valid";
int wfid, default="0", desc="wavefront id";
+ uint64_t instSeqNum, desc="instruction sequence number";
int Phase, desc="Synchronization Phase";
int ProgramCounter, desc="PC that issues this request";
Addr addr, desc="Address";
FifoType Type, desc="WriteThrough/WriteFlush";
int wfid, default="0",desc="wavefront id";
+ uint64_t instSeqNum, desc="instruction sequence number";
MachineID Requestor, desc="Flush Requestor";
MachineID oRequestor, desc="original Flush Requestor";
WriteMask writeMask, desc="Writethrough mask";
DataBlock WTData, desc="Writethrough data block";
int wfid, desc="Writethrough wavefront";
+ uint64_t instSeqNum, desc="Instruction sequence number";
PacketPtr pkt, desc="Packet associated with this request";
}
WriteMask m_writeMask;
DataBlock m_WTData;
int m_wfid;
+ uint64_t m_instSeqNum;
RubyRequest(Tick curTime, uint64_t _paddr, uint8_t* _data, int _len,
uint64_t _pc, RubyRequestType _type, RubyAccessMode _access_mode,
RubyAccessMode _access_mode, PacketPtr _pkt, PrefetchBit _pb,
unsigned _proc_id, unsigned _core_id,
int _wm_size, std::vector<bool> & _wm_mask,
- DataBlock & _Data)
+ DataBlock & _Data,
+ uint64_t _instSeqNum = 0)
: Message(curTime),
m_PhysicalAddress(_paddr),
m_Type(_type),
m_contextId(_core_id),
m_writeMask(_wm_size,_wm_mask),
m_WTData(_Data),
- m_wfid(_proc_id)
+ m_wfid(_proc_id),
+ m_instSeqNum(_instSeqNum)
{
m_LineAddress = makeLineAddress(m_PhysicalAddress);
}
unsigned _proc_id, unsigned _core_id,
int _wm_size, std::vector<bool> & _wm_mask,
DataBlock & _Data,
- std::vector< std::pair<int,AtomicOpFunctor*> > _atomicOps)
+ std::vector< std::pair<int,AtomicOpFunctor*> > _atomicOps,
+ uint64_t _instSeqNum = 0)
: Message(curTime),
m_PhysicalAddress(_paddr),
m_Type(_type),
m_contextId(_core_id),
m_writeMask(_wm_size,_wm_mask,_atomicOps),
m_WTData(_Data),
- m_wfid(_proc_id)
+ m_wfid(_proc_id),
+ m_instSeqNum(_instSeqNum)
{
m_LineAddress = makeLineAddress(m_PhysicalAddress);
}
}
}
-
-
m_outstanding_count--;
assert(m_outstanding_count >= 0);
assert(pkt->req->hasInstSeqNum());
if (pkt->cmd == MemCmd::MemSyncReq) {
- // issue mem_sync requests immedidately to the cache system without
- // going though uncoalescedTable like normal LD/ST/Atomic requests
- issueMemSyncRequest(pkt);
- } else {
- // otherwise, this must be either read or write command
- assert(pkt->isRead() || pkt->isWrite());
-
- // the pkt is temporarily stored in the uncoalesced table until
- // it's picked for coalescing process later in this cycle or in a
- // future cycle
- uncoalescedTable.insertPacket(pkt);
- DPRINTF(GPUCoalescer, "Put pkt with addr 0x%X to uncoalescedTable\n",
- pkt->getAddr());
-
- // we schedule an issue event here to process the uncoalesced table
- // and try to issue Ruby request to cache system
- if (!issueEvent.scheduled()) {
- schedule(issueEvent, curTick());
- }
+ // let the child coalescer handle MemSyncReq because its handling is
+ // cache-coherence-protocol specific
+ return RequestStatus_Issued;
+ }
+ // otherwise, this must be either a read or a write command
+ assert(pkt->isRead() || pkt->isWrite());
+
+ // the pkt is temporarily stored in the uncoalesced table until
+ // it's picked for the coalescing process later in this cycle or in a
+ // future cycle
+ uncoalescedTable.insertPacket(pkt);
+ DPRINTF(GPUCoalescer, "Put pkt with addr 0x%X to uncoalescedTable\n",
+ pkt->getAddr());
+
+ // we schedule an issue event here to process the uncoalesced table
+ // and try to issue Ruby request to cache system
+ if (!issueEvent.scheduled()) {
+ schedule(issueEvent, curTick());
}
// we always return RequestStatus_Issued in this coalescer
return RequestStatus_Issued;
}
-/**
- * TODO: Figure out what do with this code. This code may go away
- * and/or be merged into the VIPER coalescer once the VIPER
- * protocol is re-integrated with GCN3 codes.
- */
-/*
-void
-GPUCoalescer::issueRequest(CoalescedRequest* crequest)
-{
- PacketPtr pkt = crequest->getFirstPkt();
-
- int proc_id = -1;
- if (pkt != NULL && pkt->req->hasContextId()) {
- proc_id = pkt->req->contextId();
- }
-
- // If valid, copy the pc to the ruby request
- Addr pc = 0;
- if (pkt->req->hasPC()) {
- pc = pkt->req->getPC();
- }
-
- // At the moment setting scopes only counts
- // for GPU spill space accesses
- // which is pkt->req->isStack()
- // this scope is REPLACE since it
- // does not need to be flushed at the end
- // of a kernel Private and local may need
- // to be visible at the end of the kernel
- HSASegment accessSegment = reqSegmentToHSASegment(pkt->req);
- HSAScope accessScope = reqScopeToHSAScope(pkt->req);
-
- Addr line_addr = makeLineAddress(pkt->getAddr());
-
- // Creating WriteMask that records written bytes
- // and atomic operations. This enables partial writes
- // and partial reads of those writes
- DataBlock dataBlock;
- dataBlock.clear();
- uint32_t blockSize = RubySystem::getBlockSizeBytes();
- std::vector<bool> accessMask(blockSize,false);
- std::vector< std::pair<int,AtomicOpFunctor*> > atomicOps;
- uint32_t tableSize = crequest->getPackets().size();
- for (int i = 0; i < tableSize; i++) {
- PacketPtr tmpPkt = crequest->getPackets()[i];
- uint32_t tmpOffset = (tmpPkt->getAddr()) - line_addr;
- uint32_t tmpSize = tmpPkt->getSize();
- if (tmpPkt->isAtomicOp()) {
- std::pair<int,AtomicOpFunctor *> tmpAtomicOp(tmpOffset,
- tmpPkt->getAtomicOp());
- atomicOps.push_back(tmpAtomicOp);
- } else if (tmpPkt->isWrite()) {
- dataBlock.setData(tmpPkt->getPtr<uint8_t>(),
- tmpOffset, tmpSize);
- }
- for (int j = 0; j < tmpSize; j++) {
- accessMask[tmpOffset + j] = true;
- }
- }
- std::shared_ptr<RubyRequest> msg;
- if (pkt->isAtomicOp()) {
- msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
- pkt->getPtr<uint8_t>(),
- pkt->getSize(), pc, crequest->getRubyType(),
- RubyAccessMode_Supervisor, pkt,
- PrefetchBit_No, proc_id, 100,
- blockSize, accessMask,
- dataBlock, atomicOps,
- accessScope, accessSegment);
- } else {
- msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
- pkt->getPtr<uint8_t>(),
- pkt->getSize(), pc, crequest->getRubyType(),
- RubyAccessMode_Supervisor, pkt,
- PrefetchBit_No, proc_id, 100,
- blockSize, accessMask,
- dataBlock,
- accessScope, accessSegment);
- }
- DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %s\n",
- curTick(), m_version, "Coal", "Begin", "", "",
- printAddress(msg->getPhysicalAddress()),
- RubyRequestType_to_string(crequest->getRubyType()));
-
- fatal_if(crequest->getRubyType() == RubyRequestType_IFETCH,
- "there should not be any I-Fetch requests in the GPU Coalescer");
-
- Tick latency = cyclesToTicks(
- m_controller->mandatoryQueueLatency(crequest->getRubyType()));
- assert(latency > 0);
-
- if (!deadlockCheckEvent.scheduled()) {
- schedule(deadlockCheckEvent,
- m_deadlock_threshold * clockPeriod() +
- curTick());
- }
-
- assert(m_mandatory_q_ptr);
- m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency);
-}*/
-
template <class KEY, class VALUE>
std::ostream &
operator<<(ostream &out, const std::unordered_map<KEY, VALUE> &map)
assert(port != NULL);
pkt->senderState = ss->predecessor;
- delete ss;
+
+ if (pkt->cmd != MemCmd::WriteReq) {
+ // for WriteReq, we keep the original senderState until
+ // writeCompleteCallback
+ delete ss;
+ }
+
port->hitCallback(pkt);
trySendRetries();
}
Cycles firstResponseTime,
bool isRegion);
- void atomicCallback(Addr address,
- MachineType mach,
- const DataBlock& data);
+ /* atomics need their own callback because the data
+ might be const coming from SLICC */
+ virtual void atomicCallback(Addr address,
+ MachineType mach,
+ const DataBlock& data);
RequestStatus makeRequest(PacketPtr pkt) override;
int outstandingCount() const override { return m_outstanding_count; }
// since the two following issue functions are protocol-specific,
// they must be implemented in a derived coalescer
virtual void issueRequest(CoalescedRequest* crequest) = 0;
- virtual void issueMemSyncRequest(PacketPtr pkt) = 0;
+// virtual void issueMemSyncRequest(PacketPtr pkt) = 0;
void kernelCallback(int wavefront_id);
RubySystem::getBlockSizeBytes());
}
+ // Save the port in the sender state object to be used later to
+ // route the response
+ pkt->pushSenderState(new SenderState(this));
+
// Submit the ruby request
RequestStatus requestStatus = ruby_port->makeRequest(pkt);
// Otherwise, we need to tell the port to retry at a later point
// and return false.
if (requestStatus == RequestStatus_Issued) {
- // Save the port in the sender state object to be used later to
- // route the response
- pkt->pushSenderState(new SenderState(this));
-
- DPRINTF(RubyPort, "Request %s address %#x issued\n", pkt->cmdString(),
+ DPRINTF(RubyPort, "Request %s 0x%x issued\n", pkt->cmdString(),
pkt->getAddr());
return true;
}
- if (pkt->cmd != MemCmd::MemFenceReq) {
+ // pop off sender state as this request failed to issue
+ SenderState *ss = safe_cast<SenderState *>(pkt->popSenderState());
+ delete ss;
+
+ if (pkt->cmd != MemCmd::MemSyncReq) {
DPRINTF(RubyPort,
"Request %s for address %#x did not issue because %s\n",
pkt->cmdString(), pkt->getAddr(),
}
// turn packet around to go back to requester if response expected
- if (needsResponse) {
+ if (needsResponse || pkt->isResponse()) {
DPRINTF(RubyPort, "Sending packet back over port\n");
// Send a response in the same cycle. There is no need to delay the
// response because the response latency is already incurred in the
#include "cpu/testers/rubytest/RubyTester.hh"
#include "debug/GPUCoalescer.hh"
#include "debug/MemoryAccess.hh"
+#include "debug/ProtocolTrace.hh"
#include "mem/packet.hh"
#include "mem/ruby/common/SubBlock.hh"
#include "mem/ruby/network/MessageBuffer.hh"
}
VIPERCoalescer::VIPERCoalescer(const Params *p)
- : GPUCoalescer(p)
+ : GPUCoalescer(p),
+ m_cache_inv_pkt(nullptr),
+ m_num_pending_invs(0)
{
- m_max_wb_per_cycle=p->max_wb_per_cycle;
- m_max_inv_per_cycle=p->max_inv_per_cycle;
- m_outstanding_inv = 0;
- m_outstanding_wb = 0;
}
VIPERCoalescer::~VIPERCoalescer()
{
}
-void
-VIPERCoalescer::issueRequest(CoalescedRequest* crequest)
-{
-}
-
-void
-VIPERCoalescer::issueMemSyncRequest(PacketPtr pkt)
-{
-}
-
// Places an uncoalesced packet in uncoalescedTable. If the packet is a
// special type (MemFence, scoping, etc), it is issued immediately.
RequestStatus
VIPERCoalescer::makeRequest(PacketPtr pkt)
{
- if (m_outstanding_wb | m_outstanding_inv) {
- DPRINTF(GPUCoalescer,
- "There are %d Writebacks and %d Invalidatons\n",
- m_outstanding_wb, m_outstanding_inv);
- }
- // Are we in the middle of a release
- if ((m_outstanding_wb) > 0) {
- if (pkt->req->isKernel()) {
- // Everythign is fine
- // Barriers and Kernel End scan coalesce
- // If it is a Kerenl Begin flush the cache
- if (pkt->req->isAcquire() && (m_outstanding_inv == 0)) {
- invL1();
- }
-
- if (pkt->req->isRelease()) {
- insertKernel(pkt->req->contextId(), pkt);
- }
-
- return RequestStatus_Issued;
- }
- } else if (pkt->req->isKernel() && pkt->req->isRelease()) {
- // Flush Dirty Data on Kernel End
- // isKernel + isRelease
- insertKernel(pkt->req->contextId(), pkt);
- wbL1();
- if (m_outstanding_wb == 0) {
- for (auto it = kernelEndList.begin(); it != kernelEndList.end(); it++) {
- newKernelEnds.push_back(it->first);
- }
- completeIssue();
- }
- return RequestStatus_Issued;
+ // VIPER only supports the following memory request types:
+ //    MemSyncReq & Acquire: TCP cache invalidation
+ //    ReadReq             : cache read
+ //    WriteReq            : cache write
+ //    AtomicOp            : cache atomic
+ //
+ // VIPER does not expect MemSyncReq & Release since, in GCN3, the compute
+ // unit does not issue an equivalent type of memory request.
+ // TODO: future patches should rename Acquire and Release
+ assert((pkt->cmd == MemCmd::MemSyncReq && pkt->req->isAcquire()) ||
+ pkt->cmd == MemCmd::ReadReq ||
+ pkt->cmd == MemCmd::WriteReq ||
+ pkt->isAtomicOp());
+
+ if (pkt->req->isAcquire() && m_cache_inv_pkt) {
+ // In the VIPER protocol, the coalescer cannot handle two or
+ // more cache invalidation requests at a time, so cache invalidation
+ // requests must be serialized to ensure that all stale data in
+ // the TCP are invalidated correctly. If there's already a pending
+ // cache invalidation request, this request must be retried later.
+ return RequestStatus_Aliased;
}
GPUCoalescer::makeRequest(pkt);
- if (pkt->req->isKernel() && pkt->req->isAcquire()) {
- // Invalidate clean Data on Kernel Begin
- // isKernel + isAcquire
- invL1();
- } else if (pkt->req->isAcquire() && pkt->req->isRelease()) {
- // Deschedule the AtomicAcqRel and
- // Flush and Invalidate the L1 cache
- invwbL1();
- if (m_outstanding_wb > 0 && issueEvent.scheduled()) {
- DPRINTF(GPUCoalescer, "issueEvent Descheduled\n");
- deschedule(issueEvent);
- }
- } else if (pkt->req->isRelease()) {
- // Deschedule the StoreRel and
- // Flush the L1 cache
- wbL1();
- if (m_outstanding_wb > 0 && issueEvent.scheduled()) {
- DPRINTF(GPUCoalescer, "issueEvent Descheduled\n");
- deschedule(issueEvent);
- }
- } else if (pkt->req->isAcquire()) {
- // LoadAcq or AtomicAcq
- // Invalidate the L1 cache
- invL1();
- }
- // Request was successful
- if (m_outstanding_wb == 0) {
- if (!issueEvent.scheduled()) {
- DPRINTF(GPUCoalescer, "issueEvent Rescheduled\n");
- schedule(issueEvent, curTick());
- }
+ if (pkt->req->isAcquire()) {
+ // In the VIPER protocol, a compute unit sends a MemSyncReq with the
+ // Acquire flag to invalidate the TCP. Upon receiving a request of this
+ // type, VIPERCoalescer starts a cache walk to invalidate all valid entries
+ // in the TCP. The request is completed once all entries are invalidated.
+ assert(!m_cache_inv_pkt);
+ m_cache_inv_pkt = pkt;
+ invTCP();
}
+
return RequestStatus_Issued;
}
void
-VIPERCoalescer::wbCallback(Addr addr)
+VIPERCoalescer::issueRequest(CoalescedRequest* crequest)
{
- m_outstanding_wb--;
- // if L1 Flush Complete
- // attemnpt to schedule issueEvent
- assert(((int) m_outstanding_wb) >= 0);
- if (m_outstanding_wb == 0) {
- for (auto it = kernelEndList.begin(); it != kernelEndList.end(); it++) {
- newKernelEnds.push_back(it->first);
+ PacketPtr pkt = crequest->getFirstPkt();
+
+ int proc_id = -1;
+ if (pkt != NULL && pkt->req->hasContextId()) {
+ proc_id = pkt->req->contextId();
+ }
+
+ // If valid, copy the pc to the ruby request
+ Addr pc = 0;
+ if (pkt->req->hasPC()) {
+ pc = pkt->req->getPC();
+ }
+
+ Addr line_addr = makeLineAddress(pkt->getAddr());
+
+ // Creating WriteMask that records written bytes
+ // and atomic operations. This enables partial writes
+ // and partial reads of those writes
+ DataBlock dataBlock;
+ dataBlock.clear();
+ uint32_t blockSize = RubySystem::getBlockSizeBytes();
+ std::vector<bool> accessMask(blockSize,false);
+ std::vector< std::pair<int,AtomicOpFunctor*> > atomicOps;
+ uint32_t tableSize = crequest->getPackets().size();
+ for (int i = 0; i < tableSize; i++) {
+ PacketPtr tmpPkt = crequest->getPackets()[i];
+ uint32_t tmpOffset = (tmpPkt->getAddr()) - line_addr;
+ uint32_t tmpSize = tmpPkt->getSize();
+ if (tmpPkt->isAtomicOp()) {
+ std::pair<int,AtomicOpFunctor *> tmpAtomicOp(tmpOffset,
+ tmpPkt->getAtomicOp());
+ atomicOps.push_back(tmpAtomicOp);
+ } else if (tmpPkt->isWrite()) {
+ dataBlock.setData(tmpPkt->getPtr<uint8_t>(),
+ tmpOffset, tmpSize);
+ }
+ for (int j = 0; j < tmpSize; j++) {
+ accessMask[tmpOffset + j] = true;
}
- completeIssue();
}
- trySendRetries();
+ std::shared_ptr<RubyRequest> msg;
+ if (pkt->isAtomicOp()) {
+ msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
+ pkt->getPtr<uint8_t>(),
+ pkt->getSize(), pc, crequest->getRubyType(),
+ RubyAccessMode_Supervisor, pkt,
+ PrefetchBit_No, proc_id, 100,
+ blockSize, accessMask,
+ dataBlock, atomicOps, crequest->getSeqNum());
+ } else {
+ msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
+ pkt->getPtr<uint8_t>(),
+ pkt->getSize(), pc, crequest->getRubyType(),
+ RubyAccessMode_Supervisor, pkt,
+ PrefetchBit_No, proc_id, 100,
+ blockSize, accessMask,
+ dataBlock, crequest->getSeqNum());
+ }
+
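+ // writes also receive a writeCompleteCallback from the cache system, so
+ // build the write-complete response packets for this coalesced request up front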
+ if (pkt->cmd == MemCmd::WriteReq) {
+ makeWriteCompletePkts(crequest);
+ }
+
+ DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %s\n",
+ curTick(), m_version, "Coal", "Begin", "", "",
+ printAddress(msg->getPhysicalAddress()),
+ RubyRequestType_to_string(crequest->getRubyType()));
+
+ fatal_if(crequest->getRubyType() == RubyRequestType_IFETCH,
+ "there should not be any I-Fetch requests in the GPU Coalescer");
+
+ if (!deadlockCheckEvent.scheduled()) {
+ schedule(deadlockCheckEvent,
+ m_deadlock_threshold * clockPeriod() +
+ curTick());
+ }
+
+ assert(m_mandatory_q_ptr);
+ Tick latency = cyclesToTicks(
+ m_controller->mandatoryQueueLatency(crequest->getRubyType()));
+ m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency);
}
void
-VIPERCoalescer::invCallback(Addr addr)
+VIPERCoalescer::makeWriteCompletePkts(CoalescedRequest* crequest)
{
- m_outstanding_inv--;
- // if L1 Flush Complete
- // attemnpt to schedule issueEvent
- // This probably won't happen, since
- // we dont wait on cache invalidations
- if (m_outstanding_wb == 0) {
- for (auto it = kernelEndList.begin(); it != kernelEndList.end(); it++) {
- newKernelEnds.push_back(it->first);
- }
- completeIssue();
+ // In the VIPER protocol, for each write request, downstream caches
+ // return two responses: writeCallback and writeCompleteCallback.
+ // We need to prepare a writeCompletePkt for each write request so
+ // that when writeCompleteCallback is called, we can respond to the
+ // requesting wavefront right away.
+ // writeCompletePkt inherits the request and senderState of the original
+ // write request packet so that we can find the original requestor
+ // later. This assumes that the request and senderState are not deleted
+ // before writeCompleteCallback is called.
+
+ auto key = crequest->getSeqNum();
+ std::vector<PacketPtr>& req_pkts = crequest->getPackets();
+
+ for (auto pkt : req_pkts) {
+ DPRINTF(GPUCoalescer, "makeWriteCompletePkts: instSeqNum %d\n",
+ key);
+ assert(pkt->cmd == MemCmd::WriteReq);
+
+ PacketPtr writeCompletePkt = new Packet(pkt->req,
+ MemCmd::WriteCompleteResp);
+ writeCompletePkt->setAddr(pkt->getAddr());
+ writeCompletePkt->senderState = pkt->senderState;
+ m_writeCompletePktMap[key].push_back(writeCompletePkt);
}
- trySendRetries();
}
-/**
- * Invalidate L1 cache (Acquire)
- */
void
-VIPERCoalescer::invL1()
+VIPERCoalescer::writeCompleteCallback(Addr addr, uint64_t instSeqNum)
{
- int size = m_dataCache_ptr->getNumBlocks();
- DPRINTF(GPUCoalescer,
- "There are %d Invalidations outstanding before Cache Walk\n",
- m_outstanding_inv);
- // Walk the cache
- for (int i = 0; i < size; i++) {
- Addr addr = m_dataCache_ptr->getAddressAtIdx(i);
- // Evict Read-only data
- RubyRequestType request_type = RubyRequestType_REPLACEMENT;
- std::shared_ptr<RubyRequest> msg = std::make_shared<RubyRequest>(
- clockEdge(), addr, (uint8_t*) 0, 0, 0,
- request_type, RubyAccessMode_Supervisor,
- nullptr);
- assert(m_mandatory_q_ptr != NULL);
- Tick latency = cyclesToTicks(
- m_controller->mandatoryQueueLatency(request_type));
- assert(latency > 0);
- m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency);
- m_outstanding_inv++;
+ DPRINTF(GPUCoalescer, "writeCompleteCallback: instSeqNum %d addr 0x%x\n",
+ instSeqNum, addr);
+
+ auto key = instSeqNum;
+ assert(m_writeCompletePktMap.count(key) == 1 &&
+ !m_writeCompletePktMap[key].empty());
+
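+ // respond with every pending write-complete packet for this instruction
+ // that maps to the completed line address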
+ for (auto writeCompletePkt : m_writeCompletePktMap[key]) {
+ if (makeLineAddress(writeCompletePkt->getAddr()) == addr) {
+ RubyPort::SenderState *ss =
+ safe_cast<RubyPort::SenderState *>
+ (writeCompletePkt->senderState);
+ MemSlavePort *port = ss->port;
+ assert(port != NULL);
+
+ writeCompletePkt->senderState = ss->predecessor;
+ delete ss;
+ port->hitCallback(writeCompletePkt);
+ }
}
- DPRINTF(GPUCoalescer,
- "There are %d Invalidatons outstanding after Cache Walk\n",
- m_outstanding_inv);
+
+ trySendRetries();
+
+ if (m_writeCompletePktMap[key].empty())
+ m_writeCompletePktMap.erase(key);
}
-/**
- * Writeback L1 cache (Release)
- */
void
-VIPERCoalescer::wbL1()
+VIPERCoalescer::invTCPCallback(Addr addr)
{
- int size = m_dataCache_ptr->getNumBlocks();
- DPRINTF(GPUCoalescer,
- "There are %d Writebacks outstanding before Cache Walk\n",
- m_outstanding_wb);
- // Walk the cache
- for (int i = 0; i < size; i++) {
- Addr addr = m_dataCache_ptr->getAddressAtIdx(i);
- // Write dirty data back
- RubyRequestType request_type = RubyRequestType_FLUSH;
- std::shared_ptr<RubyRequest> msg = std::make_shared<RubyRequest>(
- clockEdge(), addr, (uint8_t*) 0, 0, 0,
- request_type, RubyAccessMode_Supervisor,
- nullptr);
- assert(m_mandatory_q_ptr != NULL);
- Tick latency = cyclesToTicks(
- m_controller->mandatoryQueueLatency(request_type));
- assert(latency > 0);
- m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency);
- m_outstanding_wb++;
+ assert(m_cache_inv_pkt && m_num_pending_invs > 0);
+
+ m_num_pending_invs--;
+
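+ // once the cache walk has invalidated every line, complete the original
+ // MemSyncReq and allow a new invalidation request to be accepted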
+ if (m_num_pending_invs == 0) {
+ std::vector<PacketPtr> pkt_list { m_cache_inv_pkt };
+ completeHitCallback(pkt_list);
+ m_cache_inv_pkt = nullptr;
}
- DPRINTF(GPUCoalescer,
- "There are %d Writebacks outstanding after Cache Walk\n",
- m_outstanding_wb);
}
/**
- * Invalidate and Writeback L1 cache (Acquire&Release)
+ * Invalidate TCP (Acquire)
*/
void
-VIPERCoalescer::invwbL1()
+VIPERCoalescer::invTCP()
{
int size = m_dataCache_ptr->getNumBlocks();
+ DPRINTF(GPUCoalescer,
+ "There are %d Invalidations outstanding before Cache Walk\n",
+ m_num_pending_invs);
// Walk the cache
for (int i = 0; i < size; i++) {
Addr addr = m_dataCache_ptr->getAddressAtIdx(i);
clockEdge(), addr, (uint8_t*) 0, 0, 0,
request_type, RubyAccessMode_Supervisor,
nullptr);
+ DPRINTF(GPUCoalescer, "Evicting addr 0x%x\n", addr);
assert(m_mandatory_q_ptr != NULL);
Tick latency = cyclesToTicks(
- m_controller->mandatoryQueueLatency(request_type));
- assert(latency > 0);
+ m_controller->mandatoryQueueLatency(request_type));
m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency);
- m_outstanding_inv++;
- }
- // Walk the cache
- for (int i = 0; i< size; i++) {
- Addr addr = m_dataCache_ptr->getAddressAtIdx(i);
- // Write dirty data back
- RubyRequestType request_type = RubyRequestType_FLUSH;
- std::shared_ptr<RubyRequest> msg = std::make_shared<RubyRequest>(
- clockEdge(), addr, (uint8_t*) 0, 0, 0,
- request_type, RubyAccessMode_Supervisor,
- nullptr);
- assert(m_mandatory_q_ptr != NULL);
- Tick latency = cyclesToTicks(
- m_controller->mandatoryQueueLatency(request_type));
- assert(latency > 0);
- m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency);
- m_outstanding_wb++;
+ m_num_pending_invs++;
}
+ DPRINTF(GPUCoalescer,
+ "There are %d Invalidatons outstanding after Cache Walk\n",
+ m_num_pending_invs);
}
typedef VIPERCoalescerParams Params;
VIPERCoalescer(const Params *);
~VIPERCoalescer();
-
- void issueMemSyncRequest(PacketPtr pkt) override;
- void issueRequest(CoalescedRequest* crequest) override;
- void wbCallback(Addr address);
- void invCallback(Addr address);
+ void writeCompleteCallback(Addr address, uint64_t instSeqNum);
+ void invTCPCallback(Addr address);
RequestStatus makeRequest(PacketPtr pkt) override;
+ void issueRequest(CoalescedRequest* crequest) override;
+
private:
- void invL1();
- void wbL1();
- void invwbL1();
- uint64_t m_outstanding_inv;
- uint64_t m_outstanding_wb;
- uint64_t m_max_inv_per_cycle;
- uint64_t m_max_wb_per_cycle;
+ void invTCP();
+
+ // make write-complete response packets from original write request packets
+ void makeWriteCompletePkts(CoalescedRequest* crequest);
+
+ // current cache invalidation packet
+ // nullptr if there is no active cache invalidation request
+ PacketPtr m_cache_inv_pkt;
+
+ // number of remaining cache lines to be invalidated in TCP
+ int m_num_pending_invs;
+
+ // a map from instruction sequence number to the corresponding pending
+ // write-complete response packets. Each write-complete response
+ // corresponds to a pending store request that is waiting for
+ // writeCompleteCallback. We may have multiple pending store requests per
+ // wavefront at a time. Each time writeCompleteCallback is called, the
+ // entry with the corresponding seqNum is popped off the map and returned
+ // to the compute unit.
+ std::unordered_map<uint64_t, std::vector<PacketPtr>> m_writeCompletePktMap;
};
#endif //__MEM_RUBY_SYSTEM_VIPERCOALESCER_HH__