From: Tuan Ta Date: Fri, 4 May 2018 16:14:13 +0000 (-0400) Subject: mem-ruby: GCN3 and VIPER integration X-Git-Tag: v20.1.0.0~566 X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=18ebe62598e09771f58d555faaed6e3572cc4d14;p=gem5.git mem-ruby: GCN3 and VIPER integration This patch modifies the Coalescer and VIPER protocol to support memory synchronization requests and write-completion responses that are required by upcoming GCN3 implementation. VIPER protocol is simplified to be a solely write-through protocol. Change-Id: Iccfa3d749a0301172a1cc567c59609bb548dace6 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29913 Reviewed-by: Anthony Gutierrez Reviewed-by: Jason Lowe-Power Reviewed-by: Bradford Beckmann Maintainer: Anthony Gutierrez Maintainer: Bradford Beckmann Tested-by: kokoro --- diff --git a/src/mem/ruby/protocol/GPU_VIPER-TCC.sm b/src/mem/ruby/protocol/GPU_VIPER-TCC.sm index f8da4abf1..5f05a605b 100644 --- a/src/mem/ruby/protocol/GPU_VIPER-TCC.sm +++ b/src/mem/ruby/protocol/GPU_VIPER-TCC.sm @@ -392,14 +392,15 @@ machine(MachineType:TCC, "TCC Cache") action(w_sendResponseWBAck, "w", desc="send WB Ack") { peek(responseFromNB_in, ResponseMsg) { - enqueue(responseToCore_out, ResponseMsg, l2_response_latency) { - out_msg.addr := address; - out_msg.Type := CoherenceResponseType:TDSysWBAck; - out_msg.Destination.clear(); - out_msg.Destination.add(in_msg.WTRequestor); - out_msg.Sender := machineID; - out_msg.MessageSize := MessageSizeType:Writeback_Control; - } + enqueue(responseToCore_out, ResponseMsg, l2_response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysWBAck; + out_msg.Destination.clear(); + out_msg.Destination.add(in_msg.WTRequestor); + out_msg.Sender := machineID; + out_msg.MessageSize := MessageSizeType:Writeback_Control; + out_msg.instSeqNum := in_msg.instSeqNum; + } } } @@ -412,6 +413,7 @@ machine(MachineType:TCC, "TCC Cache") out_msg.Destination.add(in_msg.Requestor); out_msg.Sender := machineID; out_msg.MessageSize := MessageSizeType:Writeback_Control; + out_msg.instSeqNum := in_msg.instSeqNum; } } } @@ -486,6 +488,7 @@ machine(MachineType:TCC, "TCC Cache") out_msg.Dirty := true; out_msg.DataBlk := in_msg.DataBlk; out_msg.writeMask.orMask(in_msg.writeMask); + out_msg.instSeqNum := in_msg.instSeqNum; } } } diff --git a/src/mem/ruby/protocol/GPU_VIPER-TCP.sm b/src/mem/ruby/protocol/GPU_VIPER-TCP.sm index 4047dc689..3f6179159 100644 --- a/src/mem/ruby/protocol/GPU_VIPER-TCP.sm +++ b/src/mem/ruby/protocol/GPU_VIPER-TCP.sm @@ -56,9 +56,6 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") state_declaration(State, desc="TCP Cache States", default="TCP_State_I") { I, AccessPermission:Invalid, desc="Invalid"; V, AccessPermission:Read_Only, desc="Valid"; - W, AccessPermission:Read_Write, desc="Written"; - M, AccessPermission:Read_Write, desc="Written and Valid"; - L, AccessPermission:Read_Write, desc="Local access is modifable"; A, AccessPermission:Invalid, desc="Waiting on Atomic"; } @@ -67,7 +64,6 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") Load, desc="Load"; Store, desc="Store to L1 (L1 is dirty)"; StoreThrough, desc="Store directly to L2(L1 is clean)"; - StoreLocal, desc="Store to L1 but L1 is clean"; Atomic, desc="Atomic"; Flush, desc="Flush if dirty(wbL1 for Store Release)"; Evict, desc="Evict if clean(invL1 for Load Acquire)"; @@ -264,7 +260,7 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") if (in_msg.Type == CoherenceResponseType:TDSysResp) { // disable L1 cache if (disableL1) { - 
trigger(Event:Bypass, in_msg.addr, cache_entry, tbe); + trigger(Event:Bypass, in_msg.addr, cache_entry, tbe); } else { if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.addr)) { trigger(Event:TCC_Ack, in_msg.addr, cache_entry, tbe); @@ -291,18 +287,16 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") DPRINTF(RubySlicc, "%s\n", in_msg); if (in_msg.Type == RubyRequestType:LD) { trigger(Event:Load, in_msg.LineAddress, cache_entry, tbe); - } else if (in_msg.Type == RubyRequestType:ATOMIC) { + } else if (in_msg.Type == RubyRequestType:ATOMIC || + in_msg.Type == RubyRequestType:ATOMIC_RETURN || + in_msg.Type == RubyRequestType:ATOMIC_NO_RETURN) { trigger(Event:Atomic, in_msg.LineAddress, cache_entry, tbe); } else if (in_msg.Type == RubyRequestType:ST) { if(disableL1) { trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe); } else { if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.LineAddress)) { - if (WB) { - trigger(Event:Store, in_msg.LineAddress, cache_entry, tbe); - } else { - trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe); - } + trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe); } else { Addr victim := L1cache.cacheProbe(in_msg.LineAddress); trigger(Event:Repl, victim, getCacheEntry(victim), TBEs.lookup(victim)); @@ -314,16 +308,6 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") trigger(Event:Evict, in_msg.LineAddress, cache_entry, tbe); } else { error("Unexpected Request Message from VIC"); - if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.LineAddress)) { - if (WB) { - trigger(Event:Store, in_msg.LineAddress, cache_entry, tbe); - } else { - trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe); - } - } else { - Addr victim := L1cache.cacheProbe(in_msg.LineAddress); - trigger(Event:Repl, victim, getCacheEntry(victim), TBEs.lookup(victim)); - } } } } @@ -415,6 +399,11 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") out_msg.Type := CoherenceRequestType:WriteThrough; out_msg.InitialRequestTime := curCycle(); out_msg.Shared := false; + + // forward inst sequence number to lower TCC + peek(mandatoryQueue_in, RubyRequest) { + out_msg.instSeqNum := in_msg.instSeqNum; + } } } @@ -475,6 +464,11 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") } } + action(ad_atomicDone, "ad", desc="atomic done") { + assert(is_valid(cache_entry)); + coalescer.atomicCallback(address, MachineType:L1Cache, cache_entry.DataBlk); + } + action(s_storeDone, "s", desc="local store done") { assert(is_valid(cache_entry)); @@ -491,37 +485,19 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") DPRINTF(RubySlicc, "Sequencer does not define invCallback!\n"); assert(false); } else { - coalescer.invCallback(address); - } - } - - action(wb_wbDone, "wb", desc="local wb done") { - if (inFlush == true) { - Fcnt := Fcnt + 1; - if (Fcnt > WTcnt) { - if (use_seq_not_coal) { - DPRINTF(RubySlicc, "Sequencer does not define wbCallback!\n"); - assert(false); - } else { - coalescer.wbCallback(address); - } - Fcnt := Fcnt - 1; - } - if (WTcnt == 0 && Fcnt == 0) { - inFlush := false; - APPEND_TRANSITION_COMMENT(" inFlush is false"); - } + coalescer.invTCPCallback(address); } } action(wd_wtDone, "wd", desc="writethrough done") { - WTcnt := WTcnt - 1; - if (inFlush == true) { - Fcnt := Fcnt -1; + if (use_seq_not_coal) { + DPRINTF(RubySlicc, "Sequencer does not define writeCompleteCallback!\n"); + assert(false); + } else { + peek(responseToTCP_in, ResponseMsg) { + coalescer.writeCompleteCallback(address, in_msg.instSeqNum); + } } - assert(WTcnt >= 
0); - APPEND_TRANSITION_COMMENT("write-- = "); - APPEND_TRANSITION_COMMENT(WTcnt); } action(dw_dirtyWrite, "dw", desc="update write mask"){ @@ -562,77 +538,28 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") // Stalling transitions do NOT check the tag array...and if they do, // they can cause a resource stall deadlock! - transition({A}, {Load, Store, Atomic, StoreThrough}) { //TagArrayRead} { + transition({A}, {Load, Atomic, StoreThrough}) { //TagArrayRead} { z_stall; } - transition({M, V, L}, Load) {TagArrayRead, DataArrayRead} { - l_loadDone; - mru_updateMRU; - p_popMandatoryQueue; - } - transition(I, Load) {TagArrayRead} { n_issueRdBlk; p_popMandatoryQueue; } - transition({V, I}, Atomic, A) {TagArrayRead, TagArrayWrite} { - t_allocateTBE; + transition(V, Load) {TagArrayRead, DataArrayRead} { + l_loadDone; mru_updateMRU; - at_atomicThrough; p_popMandatoryQueue; } - transition({M, W}, Atomic, A) {TagArrayRead, TagArrayWrite} { - wt_writeThrough; + transition({V, I}, Atomic, A) {TagArrayRead, TagArrayWrite} { t_allocateTBE; - at_atomicThrough; - ic_invCache; - } - - transition(W, Load, I) {TagArrayRead, DataArrayRead} { - wt_writeThrough; - norl_issueRdBlkOrloadDone; - p_popMandatoryQueue; - } - - transition({I}, StoreLocal, L) {TagArrayRead, TagArrayWrite, DataArrayWrite} { - a_allocate; - dw_dirtyWrite; - s_storeDone; - p_popMandatoryQueue; - } - - transition({L, V}, StoreLocal, L) {TagArrayRead, TagArrayWrite, DataArrayWrite} { - dw_dirtyWrite; mru_updateMRU; - s_storeDone; - p_popMandatoryQueue; - } - - transition(I, Store, W) {TagArrayRead, TagArrayWrite, DataArrayWrite} { - a_allocate; - dw_dirtyWrite; - s_storeDone; - p_popMandatoryQueue; - } - - transition(V, Store, M) {TagArrayRead, TagArrayWrite, DataArrayWrite} { - dw_dirtyWrite; - mru_updateMRU; - s_storeDone; - p_popMandatoryQueue; - } - - transition({M, W}, Store) {TagArrayRead, TagArrayWrite, DataArrayWrite} { - dw_dirtyWrite; - mru_updateMRU; - s_storeDone; + at_atomicThrough; p_popMandatoryQueue; } - //M,W should not see storeThrough transition(I, StoreThrough) {TagArrayRead, TagArrayWrite, DataArrayWrite} { a_allocate; dw_dirtyWrite; @@ -642,7 +569,7 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") p_popMandatoryQueue; } - transition({V,L}, StoreThrough, I) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + transition(V, StoreThrough, I) {TagArrayRead, TagArrayWrite, DataArrayWrite} { dw_dirtyWrite; s_storeDone; wt_writeThrough; @@ -672,7 +599,7 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") d_deallocateTBE; a_allocate; w_writeCache; - s_storeDone; + ad_atomicDone; pr_popResponseQueue; ic_invCache; } @@ -683,12 +610,6 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") pr_popResponseQueue; } - transition({W, M}, TCC_Ack, M) {TagArrayRead, TagArrayWrite, DataArrayRead, DataArrayWrite} { - w_writeCache; - l_loadDone; - pr_popResponseQueue; - } - transition({I, V}, Repl, I) {TagArrayRead, TagArrayWrite} { ic_invCache; } @@ -697,26 +618,8 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") ic_invCache; } - transition({W, M}, Repl, I) {TagArrayRead, TagArrayWrite, DataArrayRead} { - wt_writeThrough; - ic_invCache; - } - - transition(L, Repl, I) {TagArrayRead, TagArrayWrite, DataArrayRead} { - wt_writeThrough; - ic_invCache; - } - - transition({W, M}, Flush, I) {TagArrayRead, TagArrayWrite, DataArrayRead} { + transition({V, I, A},Flush) {TagArrayFlash} { sf_setFlush; - wt_writeThrough; - ic_invCache; - p_popMandatoryQueue; - } - - transition({V, I, A, L},Flush) {TagArrayFlash} { - sf_setFlush; - 
wb_wbDone; p_popMandatoryQueue; } @@ -726,20 +629,14 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") ic_invCache; } - transition({W, M}, Evict, W) {TagArrayFlash} { - inv_invDone; - p_popMandatoryQueue; - } - - transition({A, L}, Evict) {TagArrayFlash} { + transition(A, Evict) {TagArrayFlash} { inv_invDone; p_popMandatoryQueue; } // TCC_AckWB only snoops TBE - transition({V, I, A, M, W, L}, TCC_AckWB) { + transition({V, I, A}, TCC_AckWB) { wd_wtDone; - wb_wbDone; pr_popResponseQueue; } } diff --git a/src/mem/ruby/protocol/GPU_VIPER-msg.sm b/src/mem/ruby/protocol/GPU_VIPER-msg.sm index 124ebbeda..60117482a 100644 --- a/src/mem/ruby/protocol/GPU_VIPER-msg.sm +++ b/src/mem/ruby/protocol/GPU_VIPER-msg.sm @@ -62,7 +62,8 @@ structure (VIPERCoalescer, external = "yes") { Cycles, Cycles, Cycles); void writeCallback(Addr, MachineType, DataBlock, Cycles, Cycles, Cycles, bool); - void invCallback(Addr); - void wbCallback(Addr); + void atomicCallback(Addr, MachineType, DataBlock); + void invTCPCallback(Addr); + void writeCompleteCallback(Addr, uint64_t); void evictionCallback(Addr); } diff --git a/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm b/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm index 6f788ce5f..efbffbd8c 100644 --- a/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm +++ b/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm @@ -514,6 +514,7 @@ machine(MachineType:Directory, "AMD Baseline protocol") out_msg.InitialRequestTime := in_msg.InitialRequestTime; out_msg.ForwardRequestTime := curCycle(); out_msg.ProbeRequestStartTime := curCycle(); + out_msg.instSeqNum := in_msg.instSeqNum; } } } diff --git a/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm b/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm index f4f50cb32..f0705192d 100644 --- a/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm +++ b/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm @@ -136,6 +136,7 @@ structure(CPURequestMsg, desc="...", interface="Message") { WriteMask writeMask, desc="Write Through Data"; MachineID WTRequestor, desc="Node who initiated the write through"; int wfid, default="0", desc="wavefront id"; + uint64_t instSeqNum, desc="instruction sequence number"; bool NoWriteConflict, default="true", desc="write collided with CAB entry"; int ProgramCounter, desc="PC that accesses to this block"; @@ -188,6 +189,7 @@ structure(TDProbeRequestMsg, desc="...", interface="Message") { MessageSizeType MessageSize, desc="size category of the message"; int Phase, desc="Synchronization Phase"; int wfid, desc="wavefront id for Release"; + uint64_t instSeqNum, desc="instruction sequence number"; MachineID Requestor, desc="Node who initiated the request"; bool functionalRead(Packet *pkt) { @@ -242,6 +244,7 @@ structure(ResponseMsg, desc="...", interface="Message") { bool NoAckNeeded, default="false", desc="For short circuting acks"; bool isValid, default="false", desc="Is acked block valid"; int wfid, default="0", desc="wavefront id"; + uint64_t instSeqNum, desc="instruction sequence number"; int Phase, desc="Synchronization Phase"; int ProgramCounter, desc="PC that issues this request"; @@ -343,6 +346,7 @@ structure(FifoMsg, desc="...", interface="Message") { Addr addr, desc="Address"; FifoType Type, desc="WriteThrough/WriteFlush"; int wfid, default="0",desc="wavefront id"; + uint64_t instSeqNum, desc="instruction sequence number"; MachineID Requestor, desc="Flush Requestor"; MachineID oRequestor, desc="original Flush Requestor"; diff --git a/src/mem/ruby/protocol/RubySlicc_Types.sm b/src/mem/ruby/protocol/RubySlicc_Types.sm index 76c45b9b0..71716f9fe 100644 --- 
a/src/mem/ruby/protocol/RubySlicc_Types.sm +++ b/src/mem/ruby/protocol/RubySlicc_Types.sm @@ -150,6 +150,7 @@ structure(RubyRequest, desc="...", interface="Message", external="yes") { WriteMask writeMask, desc="Writethrough mask"; DataBlock WTData, desc="Writethrough data block"; int wfid, desc="Writethrough wavefront"; + uint64_t instSeqNum, desc="Instruction sequence number"; PacketPtr pkt, desc="Packet associated with this request"; } diff --git a/src/mem/ruby/slicc_interface/RubyRequest.hh b/src/mem/ruby/slicc_interface/RubyRequest.hh index 29bedfa51..b3e239686 100644 --- a/src/mem/ruby/slicc_interface/RubyRequest.hh +++ b/src/mem/ruby/slicc_interface/RubyRequest.hh @@ -56,6 +56,7 @@ class RubyRequest : public Message WriteMask m_writeMask; DataBlock m_WTData; int m_wfid; + uint64_t m_instSeqNum; RubyRequest(Tick curTime, uint64_t _paddr, uint8_t* _data, int _len, uint64_t _pc, RubyRequestType _type, RubyAccessMode _access_mode, @@ -80,7 +81,8 @@ class RubyRequest : public Message RubyAccessMode _access_mode, PacketPtr _pkt, PrefetchBit _pb, unsigned _proc_id, unsigned _core_id, int _wm_size, std::vector & _wm_mask, - DataBlock & _Data) + DataBlock & _Data, + uint64_t _instSeqNum = 0) : Message(curTime), m_PhysicalAddress(_paddr), m_Type(_type), @@ -93,7 +95,8 @@ class RubyRequest : public Message m_contextId(_core_id), m_writeMask(_wm_size,_wm_mask), m_WTData(_Data), - m_wfid(_proc_id) + m_wfid(_proc_id), + m_instSeqNum(_instSeqNum) { m_LineAddress = makeLineAddress(m_PhysicalAddress); } @@ -104,7 +107,8 @@ class RubyRequest : public Message unsigned _proc_id, unsigned _core_id, int _wm_size, std::vector & _wm_mask, DataBlock & _Data, - std::vector< std::pair > _atomicOps) + std::vector< std::pair > _atomicOps, + uint64_t _instSeqNum = 0) : Message(curTime), m_PhysicalAddress(_paddr), m_Type(_type), @@ -117,7 +121,8 @@ class RubyRequest : public Message m_contextId(_core_id), m_writeMask(_wm_size,_wm_mask,_atomicOps), m_WTData(_Data), - m_wfid(_proc_id) + m_wfid(_proc_id), + m_instSeqNum(_instSeqNum) { m_LineAddress = makeLineAddress(m_PhysicalAddress); } diff --git a/src/mem/ruby/system/GPUCoalescer.cc b/src/mem/ruby/system/GPUCoalescer.cc index 1eecb82ad..d9793fafd 100644 --- a/src/mem/ruby/system/GPUCoalescer.cc +++ b/src/mem/ruby/system/GPUCoalescer.cc @@ -506,8 +506,6 @@ GPUCoalescer::hitCallback(CoalescedRequest* crequest, } } - - m_outstanding_count--; assert(m_outstanding_count >= 0); @@ -555,25 +553,24 @@ GPUCoalescer::makeRequest(PacketPtr pkt) assert(pkt->req->hasInstSeqNum()); if (pkt->cmd == MemCmd::MemSyncReq) { - // issue mem_sync requests immedidately to the cache system without - // going though uncoalescedTable like normal LD/ST/Atomic requests - issueMemSyncRequest(pkt); - } else { - // otherwise, this must be either read or write command - assert(pkt->isRead() || pkt->isWrite()); - - // the pkt is temporarily stored in the uncoalesced table until - // it's picked for coalescing process later in this cycle or in a - // future cycle - uncoalescedTable.insertPacket(pkt); - DPRINTF(GPUCoalescer, "Put pkt with addr 0x%X to uncoalescedTable\n", - pkt->getAddr()); - - // we schedule an issue event here to process the uncoalesced table - // and try to issue Ruby request to cache system - if (!issueEvent.scheduled()) { - schedule(issueEvent, curTick()); - } + // let the child coalescer handle MemSyncReq because this is + // cache coherence protocol specific + return RequestStatus_Issued; + } + // otherwise, this must be either read or write command + assert(pkt->isRead() || 
pkt->isWrite()); + + // the pkt is temporarily stored in the uncoalesced table until + // it's picked for coalescing process later in this cycle or in a + // future cycle + uncoalescedTable.insertPacket(pkt); + DPRINTF(GPUCoalescer, "Put pkt with addr 0x%X to uncoalescedTable\n", + pkt->getAddr()); + + // we schedule an issue event here to process the uncoalesced table + // and try to issue Ruby request to cache system + if (!issueEvent.scheduled()) { + schedule(issueEvent, curTick()); } // we always return RequestStatus_Issued in this coalescer @@ -582,107 +579,6 @@ GPUCoalescer::makeRequest(PacketPtr pkt) return RequestStatus_Issued; } -/** - * TODO: Figure out what do with this code. This code may go away - * and/or be merged into the VIPER coalescer once the VIPER - * protocol is re-integrated with GCN3 codes. - */ -/* -void -GPUCoalescer::issueRequest(CoalescedRequest* crequest) -{ - PacketPtr pkt = crequest->getFirstPkt(); - - int proc_id = -1; - if (pkt != NULL && pkt->req->hasContextId()) { - proc_id = pkt->req->contextId(); - } - - // If valid, copy the pc to the ruby request - Addr pc = 0; - if (pkt->req->hasPC()) { - pc = pkt->req->getPC(); - } - - // At the moment setting scopes only counts - // for GPU spill space accesses - // which is pkt->req->isStack() - // this scope is REPLACE since it - // does not need to be flushed at the end - // of a kernel Private and local may need - // to be visible at the end of the kernel - HSASegment accessSegment = reqSegmentToHSASegment(pkt->req); - HSAScope accessScope = reqScopeToHSAScope(pkt->req); - - Addr line_addr = makeLineAddress(pkt->getAddr()); - - // Creating WriteMask that records written bytes - // and atomic operations. This enables partial writes - // and partial reads of those writes - DataBlock dataBlock; - dataBlock.clear(); - uint32_t blockSize = RubySystem::getBlockSizeBytes(); - std::vector accessMask(blockSize,false); - std::vector< std::pair > atomicOps; - uint32_t tableSize = crequest->getPackets().size(); - for (int i = 0; i < tableSize; i++) { - PacketPtr tmpPkt = crequest->getPackets()[i]; - uint32_t tmpOffset = (tmpPkt->getAddr()) - line_addr; - uint32_t tmpSize = tmpPkt->getSize(); - if (tmpPkt->isAtomicOp()) { - std::pair tmpAtomicOp(tmpOffset, - tmpPkt->getAtomicOp()); - atomicOps.push_back(tmpAtomicOp); - } else if (tmpPkt->isWrite()) { - dataBlock.setData(tmpPkt->getPtr(), - tmpOffset, tmpSize); - } - for (int j = 0; j < tmpSize; j++) { - accessMask[tmpOffset + j] = true; - } - } - std::shared_ptr msg; - if (pkt->isAtomicOp()) { - msg = std::make_shared(clockEdge(), pkt->getAddr(), - pkt->getPtr(), - pkt->getSize(), pc, crequest->getRubyType(), - RubyAccessMode_Supervisor, pkt, - PrefetchBit_No, proc_id, 100, - blockSize, accessMask, - dataBlock, atomicOps, - accessScope, accessSegment); - } else { - msg = std::make_shared(clockEdge(), pkt->getAddr(), - pkt->getPtr(), - pkt->getSize(), pc, crequest->getRubyType(), - RubyAccessMode_Supervisor, pkt, - PrefetchBit_No, proc_id, 100, - blockSize, accessMask, - dataBlock, - accessScope, accessSegment); - } - DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %s\n", - curTick(), m_version, "Coal", "Begin", "", "", - printAddress(msg->getPhysicalAddress()), - RubyRequestType_to_string(crequest->getRubyType())); - - fatal_if(crequest->getRubyType() == RubyRequestType_IFETCH, - "there should not be any I-Fetch requests in the GPU Coalescer"); - - Tick latency = cyclesToTicks( - m_controller->mandatoryQueueLatency(crequest->getRubyType())); - assert(latency > 0); - 
- if (!deadlockCheckEvent.scheduled()) { - schedule(deadlockCheckEvent, - m_deadlock_threshold * clockPeriod() + - curTick()); - } - - assert(m_mandatory_q_ptr); - m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency); -}*/ - template std::ostream & operator<<(ostream &out, const std::unordered_map &map) @@ -890,7 +786,13 @@ GPUCoalescer::completeHitCallback(std::vector & mylist) assert(port != NULL); pkt->senderState = ss->predecessor; - delete ss; + + if (pkt->cmd != MemCmd::WriteReq) { + // for WriteReq, we keep the original senderState until + // writeCompleteCallback + delete ss; + } + port->hitCallback(pkt); trySendRetries(); } diff --git a/src/mem/ruby/system/GPUCoalescer.hh b/src/mem/ruby/system/GPUCoalescer.hh index 789ca308f..74236cb36 100644 --- a/src/mem/ruby/system/GPUCoalescer.hh +++ b/src/mem/ruby/system/GPUCoalescer.hh @@ -294,9 +294,11 @@ class GPUCoalescer : public RubyPort Cycles firstResponseTime, bool isRegion); - void atomicCallback(Addr address, - MachineType mach, - const DataBlock& data); + /* atomics need their own callback because the data + might be const coming from SLICC */ + virtual void atomicCallback(Addr address, + MachineType mach, + const DataBlock& data); RequestStatus makeRequest(PacketPtr pkt) override; int outstandingCount() const override { return m_outstanding_count; } @@ -365,7 +367,7 @@ class GPUCoalescer : public RubyPort // since the two following issue functions are protocol-specific, // they must be implemented in a derived coalescer virtual void issueRequest(CoalescedRequest* crequest) = 0; - virtual void issueMemSyncRequest(PacketPtr pkt) = 0; +// virtual void issueMemSyncRequest(PacketPtr pkt) = 0; void kernelCallback(int wavefront_id); diff --git a/src/mem/ruby/system/RubyPort.cc b/src/mem/ruby/system/RubyPort.cc index 7632bbb4e..83aaa1a50 100644 --- a/src/mem/ruby/system/RubyPort.cc +++ b/src/mem/ruby/system/RubyPort.cc @@ -272,6 +272,10 @@ RubyPort::MemSlavePort::recvTimingReq(PacketPtr pkt) RubySystem::getBlockSizeBytes()); } + // Save the port in the sender state object to be used later to + // route the response + pkt->pushSenderState(new SenderState(this)); + // Submit the ruby request RequestStatus requestStatus = ruby_port->makeRequest(pkt); @@ -279,16 +283,16 @@ RubyPort::MemSlavePort::recvTimingReq(PacketPtr pkt) // Otherwise, we need to tell the port to retry at a later point // and return false. if (requestStatus == RequestStatus_Issued) { - // Save the port in the sender state object to be used later to - // route the response - pkt->pushSenderState(new SenderState(this)); - - DPRINTF(RubyPort, "Request %s address %#x issued\n", pkt->cmdString(), + DPRINTF(RubyPort, "Request %s 0x%x issued\n", pkt->cmdString(), pkt->getAddr()); return true; } - if (pkt->cmd != MemCmd::MemFenceReq) { + // pop off sender state as this request failed to issue + SenderState *ss = safe_cast(pkt->popSenderState()); + delete ss; + + if (pkt->cmd != MemCmd::MemSyncReq) { DPRINTF(RubyPort, "Request %s for address %#x did not issue because %s\n", pkt->cmdString(), pkt->getAddr(), @@ -558,7 +562,7 @@ RubyPort::MemSlavePort::hitCallback(PacketPtr pkt) } // turn packet around to go back to requester if response expected - if (needsResponse) { + if (needsResponse || pkt->isResponse()) { DPRINTF(RubyPort, "Sending packet back over port\n"); // Send a response in the same cycle. 
There is no need to delay the // response because the response latency is already incurred in the diff --git a/src/mem/ruby/system/VIPERCoalescer.cc b/src/mem/ruby/system/VIPERCoalescer.cc index cdef2b1f3..eafce6da7 100644 --- a/src/mem/ruby/system/VIPERCoalescer.cc +++ b/src/mem/ruby/system/VIPERCoalescer.cc @@ -44,6 +44,7 @@ #include "cpu/testers/rubytest/RubyTester.hh" #include "debug/GPUCoalescer.hh" #include "debug/MemoryAccess.hh" +#include "debug/ProtocolTrace.hh" #include "mem/packet.hh" #include "mem/ruby/common/SubBlock.hh" #include "mem/ruby/network/MessageBuffer.hh" @@ -64,207 +65,228 @@ VIPERCoalescerParams::create() } VIPERCoalescer::VIPERCoalescer(const Params *p) - : GPUCoalescer(p) + : GPUCoalescer(p), + m_cache_inv_pkt(nullptr), + m_num_pending_invs(0) { - m_max_wb_per_cycle=p->max_wb_per_cycle; - m_max_inv_per_cycle=p->max_inv_per_cycle; - m_outstanding_inv = 0; - m_outstanding_wb = 0; } VIPERCoalescer::~VIPERCoalescer() { } -void -VIPERCoalescer::issueRequest(CoalescedRequest* crequest) -{ -} - -void -VIPERCoalescer::issueMemSyncRequest(PacketPtr pkt) -{ -} - // Places an uncoalesced packet in uncoalescedTable. If the packet is a // special type (MemFence, scoping, etc), it is issued immediately. RequestStatus VIPERCoalescer::makeRequest(PacketPtr pkt) { - if (m_outstanding_wb | m_outstanding_inv) { - DPRINTF(GPUCoalescer, - "There are %d Writebacks and %d Invalidatons\n", - m_outstanding_wb, m_outstanding_inv); - } - // Are we in the middle of a release - if ((m_outstanding_wb) > 0) { - if (pkt->req->isKernel()) { - // Everythign is fine - // Barriers and Kernel End scan coalesce - // If it is a Kerenl Begin flush the cache - if (pkt->req->isAcquire() && (m_outstanding_inv == 0)) { - invL1(); - } - - if (pkt->req->isRelease()) { - insertKernel(pkt->req->contextId(), pkt); - } - - return RequestStatus_Issued; - } - } else if (pkt->req->isKernel() && pkt->req->isRelease()) { - // Flush Dirty Data on Kernel End - // isKernel + isRelease - insertKernel(pkt->req->contextId(), pkt); - wbL1(); - if (m_outstanding_wb == 0) { - for (auto it = kernelEndList.begin(); it != kernelEndList.end(); it++) { - newKernelEnds.push_back(it->first); - } - completeIssue(); - } - return RequestStatus_Issued; + // VIPER only supports following memory request types + // MemSyncReq & Acquire: TCP cache invalidation + // ReadReq : cache read + // WriteReq : cache write + // AtomicOp : cache atomic + // + // VIPER does not expect MemSyncReq & Release since in GCN3, compute unit + // does not specify an equivalent type of memory request. + // TODO: future patches should rename Acquire and Release + assert((pkt->cmd == MemCmd::MemSyncReq && pkt->req->isAcquire()) || + pkt->cmd == MemCmd::ReadReq || + pkt->cmd == MemCmd::WriteReq || + pkt->isAtomicOp()); + + if (pkt->req->isAcquire() && m_cache_inv_pkt) { + // In VIPER protocol, the coalescer is not able to handle two or + // more cache invalidation requests at a time. Cache invalidation + // requests must be serialized to ensure that all stale data in + // TCP are invalidated correctly. 
If there's already a pending + // cache invalidation request, we must retry this request later + return RequestStatus_Aliased; } GPUCoalescer::makeRequest(pkt); - if (pkt->req->isKernel() && pkt->req->isAcquire()) { - // Invalidate clean Data on Kernel Begin - // isKernel + isAcquire - invL1(); - } else if (pkt->req->isAcquire() && pkt->req->isRelease()) { - // Deschedule the AtomicAcqRel and - // Flush and Invalidate the L1 cache - invwbL1(); - if (m_outstanding_wb > 0 && issueEvent.scheduled()) { - DPRINTF(GPUCoalescer, "issueEvent Descheduled\n"); - deschedule(issueEvent); - } - } else if (pkt->req->isRelease()) { - // Deschedule the StoreRel and - // Flush the L1 cache - wbL1(); - if (m_outstanding_wb > 0 && issueEvent.scheduled()) { - DPRINTF(GPUCoalescer, "issueEvent Descheduled\n"); - deschedule(issueEvent); - } - } else if (pkt->req->isAcquire()) { - // LoadAcq or AtomicAcq - // Invalidate the L1 cache - invL1(); - } - // Request was successful - if (m_outstanding_wb == 0) { - if (!issueEvent.scheduled()) { - DPRINTF(GPUCoalescer, "issueEvent Rescheduled\n"); - schedule(issueEvent, curTick()); - } + if (pkt->req->isAcquire()) { + // In VIPER protocol, a compute unit sends a MemSyncReq with Acquire + // flag to invalidate TCP. Upon receiving a request of this type, + // VIPERCoalescer starts a cache walk to invalidate all valid entries + // in TCP. The request is completed once all entries are invalidated. + assert(!m_cache_inv_pkt); + m_cache_inv_pkt = pkt; + invTCP(); } + return RequestStatus_Issued; } void -VIPERCoalescer::wbCallback(Addr addr) +VIPERCoalescer::issueRequest(CoalescedRequest* crequest) { - m_outstanding_wb--; - // if L1 Flush Complete - // attemnpt to schedule issueEvent - assert(((int) m_outstanding_wb) >= 0); - if (m_outstanding_wb == 0) { - for (auto it = kernelEndList.begin(); it != kernelEndList.end(); it++) { - newKernelEnds.push_back(it->first); + PacketPtr pkt = crequest->getFirstPkt(); + + int proc_id = -1; + if (pkt != NULL && pkt->req->hasContextId()) { + proc_id = pkt->req->contextId(); + } + + // If valid, copy the pc to the ruby request + Addr pc = 0; + if (pkt->req->hasPC()) { + pc = pkt->req->getPC(); + } + + Addr line_addr = makeLineAddress(pkt->getAddr()); + + // Creating WriteMask that records written bytes + // and atomic operations. 
This enables partial writes + // and partial reads of those writes + DataBlock dataBlock; + dataBlock.clear(); + uint32_t blockSize = RubySystem::getBlockSizeBytes(); + std::vector accessMask(blockSize,false); + std::vector< std::pair > atomicOps; + uint32_t tableSize = crequest->getPackets().size(); + for (int i = 0; i < tableSize; i++) { + PacketPtr tmpPkt = crequest->getPackets()[i]; + uint32_t tmpOffset = (tmpPkt->getAddr()) - line_addr; + uint32_t tmpSize = tmpPkt->getSize(); + if (tmpPkt->isAtomicOp()) { + std::pair tmpAtomicOp(tmpOffset, + tmpPkt->getAtomicOp()); + atomicOps.push_back(tmpAtomicOp); + } else if (tmpPkt->isWrite()) { + dataBlock.setData(tmpPkt->getPtr(), + tmpOffset, tmpSize); + } + for (int j = 0; j < tmpSize; j++) { + accessMask[tmpOffset + j] = true; } - completeIssue(); } - trySendRetries(); + std::shared_ptr msg; + if (pkt->isAtomicOp()) { + msg = std::make_shared(clockEdge(), pkt->getAddr(), + pkt->getPtr(), + pkt->getSize(), pc, crequest->getRubyType(), + RubyAccessMode_Supervisor, pkt, + PrefetchBit_No, proc_id, 100, + blockSize, accessMask, + dataBlock, atomicOps, crequest->getSeqNum()); + } else { + msg = std::make_shared(clockEdge(), pkt->getAddr(), + pkt->getPtr(), + pkt->getSize(), pc, crequest->getRubyType(), + RubyAccessMode_Supervisor, pkt, + PrefetchBit_No, proc_id, 100, + blockSize, accessMask, + dataBlock, crequest->getSeqNum()); + } + + if (pkt->cmd == MemCmd::WriteReq) { + makeWriteCompletePkts(crequest); + } + + DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %s\n", + curTick(), m_version, "Coal", "Begin", "", "", + printAddress(msg->getPhysicalAddress()), + RubyRequestType_to_string(crequest->getRubyType())); + + fatal_if(crequest->getRubyType() == RubyRequestType_IFETCH, + "there should not be any I-Fetch requests in the GPU Coalescer"); + + if (!deadlockCheckEvent.scheduled()) { + schedule(deadlockCheckEvent, + m_deadlock_threshold * clockPeriod() + + curTick()); + } + + assert(m_mandatory_q_ptr); + Tick latency = cyclesToTicks( + m_controller->mandatoryQueueLatency(crequest->getRubyType())); + m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency); } void -VIPERCoalescer::invCallback(Addr addr) +VIPERCoalescer::makeWriteCompletePkts(CoalescedRequest* crequest) { - m_outstanding_inv--; - // if L1 Flush Complete - // attemnpt to schedule issueEvent - // This probably won't happen, since - // we dont wait on cache invalidations - if (m_outstanding_wb == 0) { - for (auto it = kernelEndList.begin(); it != kernelEndList.end(); it++) { - newKernelEnds.push_back(it->first); - } - completeIssue(); + // In VIPER protocol, for each write request, down-stream caches + // return two responses: writeCallback and writeCompleteCallback. + // We need to prepare a writeCompletePkt for each write request so + // that when writeCompleteCallback is called, we can respond + // requesting wavefront right away. + // writeCompletePkt inherits request and senderState of the original + // write request packet so that we can find the original requestor + // later. This assumes that request and senderState are not deleted + // before writeCompleteCallback is called. 
+ + auto key = crequest->getSeqNum(); + std::vector& req_pkts = crequest->getPackets(); + + for (auto pkt : req_pkts) { + DPRINTF(GPUCoalescer, "makeWriteCompletePkts: instSeqNum %d\n", + key); + assert(pkt->cmd == MemCmd::WriteReq); + + PacketPtr writeCompletePkt = new Packet(pkt->req, + MemCmd::WriteCompleteResp); + writeCompletePkt->setAddr(pkt->getAddr()); + writeCompletePkt->senderState = pkt->senderState; + m_writeCompletePktMap[key].push_back(writeCompletePkt); } - trySendRetries(); } -/** - * Invalidate L1 cache (Acquire) - */ void -VIPERCoalescer::invL1() +VIPERCoalescer::writeCompleteCallback(Addr addr, uint64_t instSeqNum) { - int size = m_dataCache_ptr->getNumBlocks(); - DPRINTF(GPUCoalescer, - "There are %d Invalidations outstanding before Cache Walk\n", - m_outstanding_inv); - // Walk the cache - for (int i = 0; i < size; i++) { - Addr addr = m_dataCache_ptr->getAddressAtIdx(i); - // Evict Read-only data - RubyRequestType request_type = RubyRequestType_REPLACEMENT; - std::shared_ptr msg = std::make_shared( - clockEdge(), addr, (uint8_t*) 0, 0, 0, - request_type, RubyAccessMode_Supervisor, - nullptr); - assert(m_mandatory_q_ptr != NULL); - Tick latency = cyclesToTicks( - m_controller->mandatoryQueueLatency(request_type)); - assert(latency > 0); - m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency); - m_outstanding_inv++; + DPRINTF(GPUCoalescer, "writeCompleteCallback: instSeqNum %d addr 0x%x\n", + instSeqNum, addr); + + auto key = instSeqNum; + assert(m_writeCompletePktMap.count(key) == 1 && + !m_writeCompletePktMap[key].empty()); + + for (auto writeCompletePkt : m_writeCompletePktMap[key]) { + if (makeLineAddress(writeCompletePkt->getAddr()) == addr) { + RubyPort::SenderState *ss = + safe_cast + (writeCompletePkt->senderState); + MemSlavePort *port = ss->port; + assert(port != NULL); + + writeCompletePkt->senderState = ss->predecessor; + delete ss; + port->hitCallback(writeCompletePkt); + } } - DPRINTF(GPUCoalescer, - "There are %d Invalidatons outstanding after Cache Walk\n", - m_outstanding_inv); + + trySendRetries(); + + if (m_writeCompletePktMap[key].empty()) + m_writeCompletePktMap.erase(key); } -/** - * Writeback L1 cache (Release) - */ void -VIPERCoalescer::wbL1() +VIPERCoalescer::invTCPCallback(Addr addr) { - int size = m_dataCache_ptr->getNumBlocks(); - DPRINTF(GPUCoalescer, - "There are %d Writebacks outstanding before Cache Walk\n", - m_outstanding_wb); - // Walk the cache - for (int i = 0; i < size; i++) { - Addr addr = m_dataCache_ptr->getAddressAtIdx(i); - // Write dirty data back - RubyRequestType request_type = RubyRequestType_FLUSH; - std::shared_ptr msg = std::make_shared( - clockEdge(), addr, (uint8_t*) 0, 0, 0, - request_type, RubyAccessMode_Supervisor, - nullptr); - assert(m_mandatory_q_ptr != NULL); - Tick latency = cyclesToTicks( - m_controller->mandatoryQueueLatency(request_type)); - assert(latency > 0); - m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency); - m_outstanding_wb++; + assert(m_cache_inv_pkt && m_num_pending_invs > 0); + + m_num_pending_invs--; + + if (m_num_pending_invs == 0) { + std::vector pkt_list { m_cache_inv_pkt }; + completeHitCallback(pkt_list); + m_cache_inv_pkt = nullptr; } - DPRINTF(GPUCoalescer, - "There are %d Writebacks outstanding after Cache Walk\n", - m_outstanding_wb); } /** - * Invalidate and Writeback L1 cache (Acquire&Release) + * Invalidate TCP (Acquire) */ void -VIPERCoalescer::invwbL1() +VIPERCoalescer::invTCP() { int size = m_dataCache_ptr->getNumBlocks(); + DPRINTF(GPUCoalescer, + "There are %d Invalidations 
outstanding before Cache Walk\n", + m_num_pending_invs); // Walk the cache for (int i = 0; i < size; i++) { Addr addr = m_dataCache_ptr->getAddressAtIdx(i); @@ -274,27 +296,14 @@ VIPERCoalescer::invwbL1() clockEdge(), addr, (uint8_t*) 0, 0, 0, request_type, RubyAccessMode_Supervisor, nullptr); + DPRINTF(GPUCoalescer, "Evicting addr 0x%x\n", addr); assert(m_mandatory_q_ptr != NULL); Tick latency = cyclesToTicks( - m_controller->mandatoryQueueLatency(request_type)); - assert(latency > 0); + m_controller->mandatoryQueueLatency(request_type)); m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency); - m_outstanding_inv++; - } - // Walk the cache - for (int i = 0; i< size; i++) { - Addr addr = m_dataCache_ptr->getAddressAtIdx(i); - // Write dirty data back - RubyRequestType request_type = RubyRequestType_FLUSH; - std::shared_ptr msg = std::make_shared( - clockEdge(), addr, (uint8_t*) 0, 0, 0, - request_type, RubyAccessMode_Supervisor, - nullptr); - assert(m_mandatory_q_ptr != NULL); - Tick latency = cyclesToTicks( - m_controller->mandatoryQueueLatency(request_type)); - assert(latency > 0); - m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency); - m_outstanding_wb++; + m_num_pending_invs++; } + DPRINTF(GPUCoalescer, + "There are %d Invalidatons outstanding after Cache Walk\n", + m_num_pending_invs); } diff --git a/src/mem/ruby/system/VIPERCoalescer.hh b/src/mem/ruby/system/VIPERCoalescer.hh index 659c9fd34..2f68c10bc 100644 --- a/src/mem/ruby/system/VIPERCoalescer.hh +++ b/src/mem/ruby/system/VIPERCoalescer.hh @@ -57,19 +57,31 @@ class VIPERCoalescer : public GPUCoalescer typedef VIPERCoalescerParams Params; VIPERCoalescer(const Params *); ~VIPERCoalescer(); - - void issueMemSyncRequest(PacketPtr pkt) override; - void issueRequest(CoalescedRequest* crequest) override; - void wbCallback(Addr address); - void invCallback(Addr address); + void writeCompleteCallback(Addr address, uint64_t instSeqNum); + void invTCPCallback(Addr address); RequestStatus makeRequest(PacketPtr pkt) override; + void issueRequest(CoalescedRequest* crequest) override; + private: - void invL1(); - void wbL1(); - void invwbL1(); - uint64_t m_outstanding_inv; - uint64_t m_outstanding_wb; - uint64_t m_max_inv_per_cycle; - uint64_t m_max_wb_per_cycle; + void invTCP(); + + // make write-complete response packets from original write request packets + void makeWriteCompletePkts(CoalescedRequest* crequest); + + // current cache invalidation packet + // nullptr if there is no active cache invalidation request + PacketPtr m_cache_inv_pkt; + + // number of remaining cache lines to be invalidated in TCP + int m_num_pending_invs; + + // a map of instruction sequence number and corresponding pending + // write-complete response packets. Each write-complete response + // corresponds to a pending store request that is waiting for + // writeCompleteCallback. We may have multiple pending store requests per + // wavefront at a time. Each time writeCompleteCallback is called, an entry + // with a corresponding seqNum is popped off from map and returned to + // compute unit. + std::unordered_map> m_writeCompletePktMap; }; #endif //__MEM_RUBY_SYSTEM_VIPERCOALESCER_HH__
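
For readers tracing the new write-completion path in this patch (TCP action wd_wtDone -> VIPERCoalescer::writeCompleteCallback), the standalone C++ sketch below models the bookkeeping the patch introduces: pending write-complete responses are keyed by the store instruction's sequence number and handed back only when the protocol acknowledges the write. This is a simplified illustration and not gem5 code; Addr, WritePkt, WriteCompleteTracker, and the prints are stand-ins for the real Packet/SenderState machinery behind m_writeCompletePktMap.

// Standalone sketch (not gem5 code): pending write-complete responses
// keyed by instruction sequence number, in the spirit of
// VIPERCoalescer::makeWriteCompletePkts() / writeCompleteCallback().
#include <cassert>
#include <cstdint>
#include <iostream>
#include <unordered_map>
#include <vector>

using Addr = uint64_t;

// Minimal stand-in for a write-complete response packet.
struct WritePkt {
    Addr lineAddr;      // cache-line address of the original store
    uint64_t seqNum;    // instruction sequence number of the store
};

class WriteCompleteTracker {
  public:
    // Called when a coalesced WriteReq is issued: record one pending
    // write-complete entry per packet, keyed by the store's seqNum.
    void recordPendingWrite(uint64_t seqNum, Addr lineAddr) {
        pending_[seqNum].push_back({lineAddr, seqNum});
    }

    // Called when the protocol delivers a write-complete response
    // (TCC_AckWB -> wd_wtDone -> writeCompleteCallback in the patch).
    // Pops every pending entry for this (seqNum, lineAddr) pair.
    void writeComplete(uint64_t seqNum, Addr lineAddr) {
        auto it = pending_.find(seqNum);
        assert(it != pending_.end() && !it->second.empty());

        auto &vec = it->second;
        for (auto p = vec.begin(); p != vec.end();) {
            if (p->lineAddr == lineAddr) {
                std::cout << "store seqNum " << p->seqNum
                          << " complete at 0x" << std::hex
                          << p->lineAddr << std::dec << "\n";
                p = vec.erase(p);
            } else {
                ++p;
            }
        }
        if (vec.empty())
            pending_.erase(it);   // all writes of this instruction done
    }

    bool hasPending(uint64_t seqNum) const {
        return pending_.count(seqNum) != 0;
    }

  private:
    // instSeqNum -> pending write-complete entries, mirroring
    // m_writeCompletePktMap in VIPERCoalescer.hh.
    std::unordered_map<uint64_t, std::vector<WritePkt>> pending_;
};

int main() {
    WriteCompleteTracker tracker;

    // Two coalesced stores from the same instruction (seqNum 42)
    // touching two different cache lines.
    tracker.recordPendingWrite(42, 0x1000);
    tracker.recordPendingWrite(42, 0x1040);

    tracker.writeComplete(42, 0x1000);
    assert(tracker.hasPending(42));      // second line still outstanding

    tracker.writeComplete(42, 0x1040);
    assert(!tracker.hasPending(42));     // instruction fully retired
    return 0;
}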
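
A simpler piece of accounting backs the new Acquire handling: VIPERCoalescer::invTCP() walks every TCP block index, enqueues one REPLACEMENT request per block, and counts them in m_num_pending_invs; invTCPCallback() decrements that count and completes the saved MemSyncReq packet once it reaches zero, while makeRequest() returns RequestStatus_Aliased to serialize any second Acquire that arrives mid-walk. The sketch below is a hypothetical, self-contained model of that counter logic only; the class and method names are made up for illustration.

// Standalone sketch (not gem5 code): cache-invalidation accounting for
// an Acquire MemSyncReq, modeled on invTCP()/invTCPCallback().
#include <cassert>
#include <iostream>

class InvTracker {
  public:
    // Start a cache walk: one pending invalidation per cache block.
    void startInvalidate(int numBlocks) {
        assert(!inFlight_ && pending_ == 0);
        inFlight_ = true;
        pending_ = numBlocks;
    }

    // One block acknowledged by the protocol (invTCPCallback).
    void invAck() {
        assert(inFlight_ && pending_ > 0);
        if (--pending_ == 0) {
            inFlight_ = false;
            std::cout << "all lines invalidated; sync request done\n";
        }
    }

    // A second Acquire must be rejected (RequestStatus_Aliased in the
    // patch) while a walk is still in flight.
    bool busy() const { return inFlight_; }

  private:
    bool inFlight_ = false;
    int pending_ = 0;
};

int main() {
    InvTracker t;
    t.startInvalidate(3);          // e.g. a 3-block TCP
    assert(t.busy());
    t.invAck(); t.invAck(); t.invAck();
    assert(!t.busy());
    return 0;
}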