From 855748030032dc09a054a204ec93f16c91ee1577 Mon Sep 17 00:00:00 2001 From: Brad Beckmann Date: Fri, 20 Aug 2010 11:46:14 -0700 Subject: [PATCH] ruby: Added merge GETS optimization to hammer Added an optimization that merges multiple pending GETS requests into a single request to the owner node. --- src/mem/protocol/MOESI_hammer-cache.sm | 75 +++++++++++++- src/mem/protocol/MOESI_hammer-dir.sm | 132 +++++++++++++++++++++++-- src/mem/protocol/MOESI_hammer-msg.sm | 4 + 3 files changed, 203 insertions(+), 8 deletions(-) diff --git a/src/mem/protocol/MOESI_hammer-cache.sm b/src/mem/protocol/MOESI_hammer-cache.sm index ae74e284f..667b4ffcb 100644 --- a/src/mem/protocol/MOESI_hammer-cache.sm +++ b/src/mem/protocol/MOESI_hammer-cache.sm @@ -95,6 +95,7 @@ machine(L1Cache, "AMD Hammer-like protocol") // Requests Other_GETX, desc="A GetX from another processor"; Other_GETS, desc="A GetS from another processor"; + Merged_GETS, desc="A Merged GetS from another processor"; Other_GETS_No_Mig, desc="A GetS from another processor"; Invalidate, desc="Invalidate block"; @@ -136,6 +137,7 @@ machine(L1Cache, "AMD Hammer-like protocol") int NumPendingMsgs, desc="Number of acks/data messages that this processor is waiting for"; bool Sharers, desc="On a GetS, did we find any other sharers in the system"; MachineID LastResponder, desc="last machine to send a response for this request"; + MachineID CurOwner, desc="current owner of the block, used for UnblockS responses"; Time InitialRequestTime, default="0", desc="time the initial requests was sent from the L1Cache"; Time ForwardRequestTime, default="0", desc="time the dir forwarded the request"; Time FirstResponseTime, default="0", desc="the time the first response was received"; @@ -286,6 +288,8 @@ machine(L1Cache, "AMD Hammer-like protocol") peek(forwardToCache_in, RequestMsg, block_on="Address") { if (in_msg.Type == CoherenceRequestType:GETX) { trigger(Event:Other_GETX, in_msg.Address); + } else if (in_msg.Type == 
CoherenceRequestType:MERGED_GETS) { + trigger(Event:Merged_GETS, in_msg.Address); } else if (in_msg.Type == CoherenceRequestType:GETS) { if (isCacheTagPresent(in_msg.Address)) { if (getCacheEntry(in_msg.Address).AtomicAccessed && no_mig_atomic) { @@ -518,6 +522,24 @@ machine(L1Cache, "AMD Hammer-like protocol") } } + action(em_sendDataSharedMultiple, "em", desc="Send data from cache to all requestors") { + peek(forwardToCache_in, RequestMsg) { + enqueue(responseNetwork_out, ResponseMsg, latency=cache_response_latency) { + out_msg.Address := address; + out_msg.Type := CoherenceResponseType:DATA_SHARED; + out_msg.Sender := machineID; + out_msg.Destination := in_msg.MergedRequestors; + out_msg.DataBlk := getCacheEntry(address).DataBlk; + DEBUG_EXPR(out_msg.DataBlk); + out_msg.Dirty := getCacheEntry(address).Dirty; + out_msg.Acks := machineCount(MachineType:L1Cache); + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := in_msg.ForwardRequestTime; + } + } + } + action(f_sendAck, "f", desc="Send ack from cache to requestor") { peek(forwardToCache_in, RequestMsg) { enqueue(responseNetwork_out, ResponseMsg, latency=cache_response_latency) { @@ -575,6 +597,7 @@ machine(L1Cache, "AMD Hammer-like protocol") out_msg.Address := address; out_msg.Type := CoherenceResponseType:UNBLOCKS; out_msg.Sender := machineID; + out_msg.CurOwner := TBEs[address].CurOwner; out_msg.Destination.add(map_Address_to_Directory(address)); out_msg.MessageSize := MessageSizeType:Unblock_Control; } @@ -690,6 +713,11 @@ machine(L1Cache, "AMD Hammer-like protocol") } } } + action(uo_updateCurrentOwner, "uo", desc="When moving SS state, update current owner.") { + peek(responseToCache_in, ResponseMsg) { + TBEs[address].CurOwner := in_msg.Sender; + } + } action(n_popResponseQueue, "n", desc="Pop response queue") { responseToCache_in.dequeue(); @@ -745,6 +773,24 @@ machine(L1Cache, "AMD Hammer-like protocol") } } + 
action(qm_sendDataFromTBEToCache, "qm", desc="Send data from TBE to cache, multiple sharers") { + peek(forwardToCache_in, RequestMsg) { + enqueue(responseNetwork_out, ResponseMsg, latency=cache_response_latency) { + out_msg.Address := address; + out_msg.Type := CoherenceResponseType:DATA; + out_msg.Sender := machineID; + out_msg.Destination := in_msg.MergedRequestors; + DEBUG_EXPR(out_msg.Destination); + out_msg.DataBlk := TBEs[address].DataBlk; + out_msg.Dirty := TBEs[address].Dirty; + out_msg.Acks := machineCount(MachineType:L1Cache); + out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := in_msg.ForwardRequestTime; + } + } + } + action(qq_sendDataFromTBEToMemory, "\q", desc="Send data from TBE to memory") { enqueue(unblockNetwork_out, ResponseMsg, latency=cache_response_latency) { out_msg.Address := address; @@ -899,7 +945,7 @@ machine(L1Cache, "AMD Hammer-like protocol") zz_recycleMandatoryQueue; } - transition({IT, ST, OT, MT, MMT}, {Other_GETX, Other_GETS, Other_GETS_No_Mig, Invalidate}) { + transition({IT, ST, OT, MT, MMT}, {Other_GETX, Other_GETS, Merged_GETS, Other_GETS_No_Mig, Invalidate}) { // stall } @@ -1111,6 +1157,11 @@ machine(L1Cache, "AMD Hammer-like protocol") l_popForwardQueue; } + transition(O, Merged_GETS) { + em_sendDataSharedMultiple; + l_popForwardQueue; + } + // Transitions from Modified transition(MM, {Load, Ifetch}) { h_load_hit; @@ -1143,6 +1194,11 @@ machine(L1Cache, "AMD Hammer-like protocol") l_popForwardQueue; } + transition(MM, Merged_GETS, O) { + em_sendDataSharedMultiple; + l_popForwardQueue; + } + // Transitions from Dirty Exclusive transition(M, {Load, Ifetch}) { h_load_hit; @@ -1170,6 +1226,11 @@ machine(L1Cache, "AMD Hammer-like protocol") l_popForwardQueue; } + transition(M, Merged_GETS, O) { + em_sendDataSharedMultiple; + l_popForwardQueue; + } + // Transitions from IM transition(IM, {Other_GETX, Other_GETS, Other_GETS_No_Mig, 
Invalidate}) { @@ -1249,6 +1310,11 @@ machine(L1Cache, "AMD Hammer-like protocol") l_popForwardQueue; } + transition(OM, Merged_GETS) { + em_sendDataSharedMultiple; + l_popForwardQueue; + } + transition(OM, Ack) { m_decrementNumberOfMessages; o_checkForCompletion; @@ -1287,6 +1353,7 @@ machine(L1Cache, "AMD Hammer-like protocol") m_decrementNumberOfMessages; o_checkForCompletion; hx_external_load_hit; + uo_updateCurrentOwner; n_popResponseQueue; } @@ -1304,6 +1371,7 @@ machine(L1Cache, "AMD Hammer-like protocol") m_decrementNumberOfMessages; o_checkForCompletion; hx_external_load_hit; + uo_updateCurrentOwner; n_popResponseQueue; } @@ -1385,6 +1453,11 @@ machine(L1Cache, "AMD Hammer-like protocol") l_popForwardQueue; } + transition({OI, MI}, Merged_GETS, OI) { + qm_sendDataFromTBEToCache; + l_popForwardQueue; + } + transition(MI, Writeback_Ack, I) { t_sendExclusiveDataFromTBEToMemory; s_deallocateTBE; diff --git a/src/mem/protocol/MOESI_hammer-dir.sm b/src/mem/protocol/MOESI_hammer-dir.sm index 806719916..9f7d08f9d 100644 --- a/src/mem/protocol/MOESI_hammer-dir.sm +++ b/src/mem/protocol/MOESI_hammer-dir.sm @@ -69,6 +69,9 @@ machine(Directory, "AMD Hammer-like protocol") NO_R, desc="Was Not Owner or Sharer, replacing probe filter entry"; NO_B, "NO^B", desc="Not Owner, Blocked"; + NO_B_X, "NO^BX", desc="Not Owner, Blocked, next queued request GETX"; + NO_B_S, "NO^BS", desc="Not Owner, Blocked, next queued request GETS"; + NO_B_S_W, "NO^BSW", desc="Not Owner, Blocked, forwarded merged GETS, waiting for responses"; O_B, "O^B", desc="Owner, Blocked"; NO_B_W, desc="Not Owner, Blocked, waiting for Dram"; O_B_W, desc="Owner, Blocked, waiting for Dram"; @@ -121,6 +124,7 @@ machine(Directory, "AMD Hammer-like protocol") All_acks_and_shared_data, desc="Received shared data and message acks"; All_acks_and_owner_data, desc="Received shared data and message acks"; All_acks_and_data_no_sharers, desc="Received all acks and no other processor has a shared copy"; + All_Unblocks, 
desc="Received all unblocks for a merged gets request"; } // TYPES @@ -148,6 +152,7 @@ machine(Directory, "AMD Hammer-like protocol") DataBlock DataBlk, desc="The current view of system memory"; int Len, desc="..."; MachineID DmaRequestor, desc="DMA requestor"; + NetDest GetSRequestors, desc="GETS merged requestors"; int NumPendingMsgs, desc="Number of pending acks/messages"; bool CacheDirty, default="false", desc="Indicates whether a cache has responded with dirty data"; bool Sharers, default="false", desc="Indicates whether a cache has indicated it is currently a sharer"; @@ -243,6 +248,8 @@ machine(Directory, "AMD Hammer-like protocol") trigger(Event:All_acks_and_shared_data, in_msg.Address); } else if (in_msg.Type == TriggerType:ALL_ACKS_NO_SHARERS) { trigger(Event:All_acks_and_data_no_sharers, in_msg.Address); + } else if (in_msg.Type == TriggerType:ALL_UNBLOCKS) { + trigger(Event:All_Unblocks, in_msg.Address); } else { error("Unexpected message"); } @@ -487,6 +494,20 @@ machine(Directory, "AMD Hammer-like protocol") } } + action(mu_decrementNumberOfUnblocks, "mu", desc="Decrement the number of messages for which we're waiting") { + peek(unblockNetwork_in, ResponseMsg) { + assert(in_msg.Type == CoherenceResponseType:UNBLOCKS); + DEBUG_EXPR(TBEs[address].NumPendingMsgs); + // + // Only UnblockS messages reach this point (asserted above). Each one + // accounts for a single outstanding message, so the pending count is + // decremented by exactly 1. 
+ // + TBEs[address].NumPendingMsgs := TBEs[address].NumPendingMsgs - 1; + DEBUG_EXPR(TBEs[address].NumPendingMsgs); + } + } + action(n_popResponseQueue, "n", desc="Pop response queue") { responseToDir_in.dequeue(); } @@ -508,6 +529,19 @@ machine(Directory, "AMD Hammer-like protocol") } } + action(os_checkForMergedGetSCompletion, "os", desc="Check for merged GETS completion") { + if (TBEs[address].NumPendingMsgs == 0) { + enqueue(triggerQueue_out, TriggerMsg) { + out_msg.Address := address; + out_msg.Type := TriggerType:ALL_UNBLOCKS; + } + } + } + + action(sp_setPendingMsgsToMergedSharers, "sp", desc="Set pending messages to waiting sharers") { + TBEs[address].NumPendingMsgs := TBEs[address].GetSRequestors.count(); + } + action(spa_setPendingAcksToZeroIfPF, "spa", desc="if probe filter, no need to wait for acks") { if (probe_filter_enabled) { TBEs[address].NumPendingMsgs := 0; @@ -598,6 +632,12 @@ machine(Directory, "AMD Hammer-like protocol") } } + action(rs_recordGetSRequestor, "rs", desc="Record GETS requestor in TBE") { + peek(requestQueue_in, RequestMsg) { + TBEs[address].GetSRequestors.add(in_msg.Requestor); + } + } + action(r_setSharerBit, "r", desc="We saw other sharers") { TBEs[address].Sharers := true; } @@ -694,6 +734,29 @@ machine(Directory, "AMD Hammer-like protocol") } } + action(fr_forwardMergeReadRequestsToOwner, "frr", desc="Forward coalesced read request to owner") { + assert(machineCount(MachineType:L1Cache) > 1); + // + // Fixme! The unblock network should not stall on the forward network. Add a trigger queue to + // decouple the two. 
+ // + peek(unblockNetwork_in, ResponseMsg) { + enqueue(forwardNetwork_out, RequestMsg, latency=memory_controller_latency) { + out_msg.Address := address; + out_msg.Type := CoherenceRequestType:MERGED_GETS; + out_msg.MergedRequestors := TBEs[address].GetSRequestors; + if (in_msg.Type == CoherenceResponseType:UNBLOCKS) { + out_msg.Destination.add(in_msg.CurOwner); + } else { + out_msg.Destination.add(in_msg.Sender); + } + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.InitialRequestTime := zero_time(); + out_msg.ForwardRequestTime := get_time(); + } + } + } + action(fc_forwardRequestConditionalOwner, "fc", desc="Forward request to one or more nodes") { assert(machineCount(MachineType:L1Cache) > 1); if (probe_filter_enabled) { @@ -1058,31 +1121,81 @@ machine(Directory, "AMD Hammer-like protocol") } // Blocked transient states - transition({NO_B, O_B, NO_DR_B_W, NO_DW_B_W, NO_B_W, NO_DR_B_D, - NO_DR_B, O_DR_B, O_B_W, O_DR_B_W, NO_DW_W, + transition({NO_B_X, O_B, NO_DR_B_W, NO_DW_B_W, NO_B_W, NO_DR_B_D, + NO_DR_B, O_DR_B, O_B_W, O_DR_B_W, NO_DW_W, NO_B_S_W, NO_W, O_W, WB, WB_E_W, WB_O_W, O_R, S_R, NO_R}, {GETS, GETX, PUT, Pf_Replacement}) { z_stallAndWaitRequest; } - transition({NO_B, O_B, NO_DR_B_W, NO_DW_B_W, NO_B_W, NO_DR_B_D, - NO_DR_B, O_DR_B, O_B_W, O_DR_B_W, NO_DW_W, + transition(NO_B, GETX, NO_B_X) { + z_stallAndWaitRequest; + } + + transition(NO_B, {PUT, Pf_Replacement}) { + z_stallAndWaitRequest; + } + + transition(NO_B_S, {GETX, PUT, Pf_Replacement}) { + z_stallAndWaitRequest; + } + + transition({NO_B, NO_B_X, NO_B_S, O_B, NO_DR_B_W, NO_DW_B_W, NO_B_W, NO_DR_B_D, + NO_DR_B, O_DR_B, O_B_W, O_DR_B_W, NO_DW_W, NO_B_S_W, NO_W, O_W, WB, WB_E_W, WB_O_W, O_R, S_R, NO_R}, {DMA_READ, DMA_WRITE}) { zd_stallAndWaitDMARequest; } - transition(NO_B, UnblockS, NX) { + // merge GETS into one response + transition(NO_B, GETS, NO_B_S) { + v_allocateTBE; + rs_recordGetSRequestor; + i_popIncomingRequestQueue; + } + + transition(NO_B_S, GETS) { + 
rs_recordGetSRequestor; + i_popIncomingRequestQueue; + } + + // unblock responses + transition({NO_B, NO_B_X}, UnblockS, NX) { k_wakeUpDependents; j_popIncomingUnblockQueue; } - transition(NO_B, UnblockM, NO) { + transition({NO_B, NO_B_X}, UnblockM, NO) { uo_updateOwnerIfPf; k_wakeUpDependents; j_popIncomingUnblockQueue; } + transition(NO_B_S, UnblockS, NO_B_S_W) { + fr_forwardMergeReadRequestsToOwner; + sp_setPendingMsgsToMergedSharers; + j_popIncomingUnblockQueue; + } + + transition(NO_B_S, UnblockM, NO_B_S_W) { + uo_updateOwnerIfPf; + fr_forwardMergeReadRequestsToOwner; + sp_setPendingMsgsToMergedSharers; + j_popIncomingUnblockQueue; + } + + transition(NO_B_S_W, UnblockS) { + mu_decrementNumberOfUnblocks; + os_checkForMergedGetSCompletion; + j_popIncomingUnblockQueue; + } + + transition(NO_B_S_W, All_Unblocks, NX) { + w_deallocateTBE; + k_wakeUpDependents; + g_popTriggerQueue; + } + transition(O_B, UnblockS, O) { k_wakeUpDependents; j_popIncomingUnblockQueue; @@ -1315,7 +1428,12 @@ machine(Directory, "AMD Hammer-like protocol") l_popMemQueue; } - transition(NO_B_W, {UnblockM, UnblockS}, NO_W) { + transition(NO_B_W, UnblockM, NO_W) { + uo_updateOwnerIfPf; + j_popIncomingUnblockQueue; + } + + transition(NO_B_W, UnblockS, NO_W) { j_popIncomingUnblockQueue; } diff --git a/src/mem/protocol/MOESI_hammer-msg.sm b/src/mem/protocol/MOESI_hammer-msg.sm index 05a52b881..c90c8a53c 100644 --- a/src/mem/protocol/MOESI_hammer-msg.sm +++ b/src/mem/protocol/MOESI_hammer-msg.sm @@ -33,6 +33,7 @@ enumeration(CoherenceRequestType, desc="...") { GETX, desc="Get eXclusive"; GETS, desc="Get Shared"; + MERGED_GETS, desc="Merged Get Shared, sent to the owner for multiple requestors"; PUT, desc="Put Ownership"; WB_ACK, desc="Writeback ack"; WB_NACK, desc="Writeback neg. 
ack"; @@ -62,6 +63,7 @@ enumeration(TriggerType, desc="...") { ALL_ACKS, desc="See corresponding event"; ALL_ACKS_OWNER_EXISTS,desc="See corresponding event"; ALL_ACKS_NO_SHARERS, desc="See corresponding event"; + ALL_UNBLOCKS, desc="all unblockS received"; } // TriggerMsg @@ -75,6 +77,7 @@ structure(RequestMsg, desc="...", interface="NetworkMessage") { Address Address, desc="Physical address for this request"; CoherenceRequestType Type, desc="Type of request (GetS, GetX, PutX, etc)"; MachineID Requestor, desc="Node who initiated the request"; + NetDest MergedRequestors, desc="Merge set of read requestors"; NetDest Destination, desc="Multicast destination mask"; MessageSizeType MessageSize, desc="size category of the message"; bool DirectedProbe, default="false", desc="probe filter directed probe"; @@ -87,6 +90,7 @@ structure(ResponseMsg, desc="...", interface="NetworkMessage") { Address Address, desc="Physical address for this request"; CoherenceResponseType Type, desc="Type of response (Ack, Data, etc)"; MachineID Sender, desc="Node who sent the data"; + MachineID CurOwner, desc="current owner of the block, used for UnblockS responses"; NetDest Destination, desc="Node to whom the data is sent"; DataBlock DataBlk, desc="data for the cache line"; bool Dirty, desc="Is the data dirty (different than memory)?"; -- 2.30.2