From f57053473ad369d5baf4a83d17913e5af393a8a8 Mon Sep 17 00:00:00 2001 From: Brad Beckmann Date: Fri, 20 Aug 2010 11:46:14 -0700 Subject: [PATCH] MOESI_hammer: break down miss latency stalled cycles This patch tracks the number of cycles a transaction is delayed at different points of the request-forward-response loop. --- src/mem/protocol/MOESI_hammer-cache.sm | 47 +++++++++++- src/mem/protocol/MOESI_hammer-dir.sm | 8 +++ src/mem/protocol/MOESI_hammer-msg.sm | 4 ++ src/mem/protocol/RubySlicc_Types.sm | 2 + src/mem/ruby/profiler/Profiler.cc | 99 ++++++++++++++++++++++++++ src/mem/ruby/profiler/Profiler.hh | 24 +++++++ src/mem/ruby/system/Sequencer.cc | 49 ++++++++++++- src/mem/ruby/system/Sequencer.hh | 19 ++++- 8 files changed, 245 insertions(+), 7 deletions(-) diff --git a/src/mem/protocol/MOESI_hammer-cache.sm b/src/mem/protocol/MOESI_hammer-cache.sm index 06ce69624..ae74e284f 100644 --- a/src/mem/protocol/MOESI_hammer-cache.sm +++ b/src/mem/protocol/MOESI_hammer-cache.sm @@ -136,6 +136,9 @@ machine(L1Cache, "AMD Hammer-like protocol") int NumPendingMsgs, desc="Number of acks/data messages that this processor is waiting for"; bool Sharers, desc="On a GetS, did we find any other sharers in the system"; MachineID LastResponder, desc="last machine to send a response for this request"; + Time InitialRequestTime, default="0", desc="time the initial requests was sent from the L1Cache"; + Time ForwardRequestTime, default="0", desc="time the dir forwarded the request"; + Time FirstResponseTime, default="0", desc="the time the first response was received"; } external_type(TBETable) { @@ -424,6 +427,7 @@ machine(L1Cache, "AMD Hammer-like protocol") out_msg.Requestor := machineID; out_msg.Destination.add(map_Address_to_Directory(address)); out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.InitialRequestTime := get_time(); TBEs[address].NumPendingMsgs := machineCount(MachineType:L1Cache); // One from each other cache (n-1) plus the memory (+1) } } @@ -435,6 +439,7 @@ machine(L1Cache, "AMD Hammer-like protocol") out_msg.Requestor := machineID; out_msg.Destination.add(map_Address_to_Directory(address)); out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.InitialRequestTime := get_time(); TBEs[address].NumPendingMsgs := machineCount(MachineType:L1Cache); // One from each other cache (n-1) plus the memory (+1) } } @@ -454,6 +459,8 @@ machine(L1Cache, "AMD Hammer-like protocol") out_msg.Acks := 2; } out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := in_msg.ForwardRequestTime; } } } @@ -483,6 +490,8 @@ machine(L1Cache, "AMD Hammer-like protocol") out_msg.Acks := 2; } out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := in_msg.ForwardRequestTime; } } } @@ -503,6 +512,8 @@ machine(L1Cache, "AMD Hammer-like protocol") out_msg.Acks := 2; } out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := in_msg.ForwardRequestTime; } } } @@ -517,6 +528,8 @@ machine(L1Cache, "AMD Hammer-like protocol") out_msg.Acks := 1; assert(in_msg.DirectedProbe == false); out_msg.MessageSize := MessageSizeType:Response_Control; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := in_msg.ForwardRequestTime; } } } @@ -531,6 +544,8 @@ machine(L1Cache, "AMD Hammer-like protocol") out_msg.Acks := 1; assert(in_msg.DirectedProbe == false); out_msg.MessageSize := MessageSizeType:Response_Control; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := in_msg.ForwardRequestTime; } } } @@ -580,7 +595,10 @@ machine(L1Cache, "AMD Hammer-like protocol") sequencer.readCallback(address, getNondirectHitMachType(in_msg.Address, in_msg.Sender), - getCacheEntry(address).DataBlk); + getCacheEntry(address).DataBlk, + TBEs[address].InitialRequestTime, + TBEs[address].ForwardRequestTime, + TBEs[address].FirstResponseTime); } } @@ -605,7 +623,10 @@ machine(L1Cache, "AMD Hammer-like protocol") sequencer.writeCallback(address, getNondirectHitMachType(address, in_msg.Sender), - getCacheEntry(address).DataBlk); + getCacheEntry(address).DataBlk, + TBEs[address].InitialRequestTime, + TBEs[address].ForwardRequestTime, + TBEs[address].FirstResponseTime); } getCacheEntry(address).Dirty := true; @@ -617,7 +638,10 @@ machine(L1Cache, "AMD Hammer-like protocol") sequencer.writeCallback(address, getNondirectHitMachType(address, TBEs[address].LastResponder), - getCacheEntry(address).DataBlk); + getCacheEntry(address).DataBlk, + TBEs[address].InitialRequestTime, + TBEs[address].ForwardRequestTime, + TBEs[address].FirstResponseTime); getCacheEntry(address).Dirty := true; } @@ -649,6 +673,21 @@ machine(L1Cache, "AMD Hammer-like protocol") TBEs[address].NumPendingMsgs := TBEs[address].NumPendingMsgs - in_msg.Acks; DEBUG_EXPR(TBEs[address].NumPendingMsgs); TBEs[address].LastResponder := in_msg.Sender; + if (TBEs[address].InitialRequestTime != zero_time() && in_msg.InitialRequestTime != zero_time()) { + assert(TBEs[address].InitialRequestTime == in_msg.InitialRequestTime); + } + if (in_msg.InitialRequestTime != zero_time()) { + TBEs[address].InitialRequestTime := in_msg.InitialRequestTime; + } + if (TBEs[address].ForwardRequestTime != zero_time() && in_msg.ForwardRequestTime != zero_time()) { + assert(TBEs[address].ForwardRequestTime == in_msg.ForwardRequestTime); + } + if (in_msg.ForwardRequestTime != zero_time()) { + TBEs[address].ForwardRequestTime := in_msg.ForwardRequestTime; + } + if (TBEs[address].FirstResponseTime == zero_time()) { + TBEs[address].FirstResponseTime := get_time(); + } } } @@ -700,6 +739,8 @@ machine(L1Cache, "AMD Hammer-like protocol") out_msg.Acks := 2; } out_msg.MessageSize := MessageSizeType:Response_Data; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := in_msg.ForwardRequestTime; } } } diff --git a/src/mem/protocol/MOESI_hammer-dir.sm b/src/mem/protocol/MOESI_hammer-dir.sm index ae282ba3a..d4b36cded 100644 --- a/src/mem/protocol/MOESI_hammer-dir.sm +++ b/src/mem/protocol/MOESI_hammer-dir.sm @@ -645,6 +645,8 @@ machine(Directory, "AMD Hammer-like protocol") out_msg.Destination.broadcast(MachineType:L1Cache); // Send to all L1 caches out_msg.Destination.remove(in_msg.Requestor); // Don't include the original requestor out_msg.MessageSize := MessageSizeType:Broadcast_Control; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := get_time(); } } } @@ -685,6 +687,8 @@ machine(Directory, "AMD Hammer-like protocol") out_msg.Destination.broadcast(MachineType:L1Cache); // Send to all L1 caches out_msg.Destination.remove(in_msg.Requestor); // Don't include the original requestor out_msg.MessageSize := MessageSizeType:Broadcast_Control; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := get_time(); } } } @@ -701,6 +705,8 @@ machine(Directory, "AMD Hammer-like protocol") out_msg.Destination.add(getPfEntry(address).Owner); out_msg.MessageSize := MessageSizeType:Request_Control; out_msg.DirectedProbe := true; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := get_time(); } } } else { @@ -712,6 +718,8 @@ machine(Directory, "AMD Hammer-like protocol") out_msg.Destination.broadcast(MachineType:L1Cache); // Send to all L1 caches out_msg.Destination.remove(in_msg.Requestor); // Don't include the original requestor out_msg.MessageSize := MessageSizeType:Broadcast_Control; + out_msg.InitialRequestTime := in_msg.InitialRequestTime; + out_msg.ForwardRequestTime := get_time(); } } } diff --git a/src/mem/protocol/MOESI_hammer-msg.sm b/src/mem/protocol/MOESI_hammer-msg.sm index f414d599d..05a52b881 100644 --- a/src/mem/protocol/MOESI_hammer-msg.sm +++ b/src/mem/protocol/MOESI_hammer-msg.sm @@ -78,6 +78,8 @@ structure(RequestMsg, desc="...", interface="NetworkMessage") { NetDest Destination, desc="Multicast destination mask"; MessageSizeType MessageSize, desc="size category of the message"; bool DirectedProbe, default="false", desc="probe filter directed probe"; + Time InitialRequestTime, default="0", desc="time the initial requests was sent from the L1Cache"; + Time ForwardRequestTime, default="0", desc="time the dir forwarded the request"; } // ResponseMsg (and also unblock requests) @@ -90,6 +92,8 @@ structure(ResponseMsg, desc="...", interface="NetworkMessage") { bool Dirty, desc="Is the data dirty (different than memory)?"; int Acks, desc="How many messages this counts as"; MessageSizeType MessageSize, desc="size category of the message"; + Time InitialRequestTime, default="0", desc="time the initial requests was sent from the L1Cache"; + Time ForwardRequestTime, default="0", desc="time the dir forwarded the request"; } enumeration(DMARequestType, desc="...", default="DMARequestType_NULL") { diff --git a/src/mem/protocol/RubySlicc_Types.sm b/src/mem/protocol/RubySlicc_Types.sm index 8dcdabeb8..9d3cd6634 100644 --- a/src/mem/protocol/RubySlicc_Types.sm +++ b/src/mem/protocol/RubySlicc_Types.sm @@ -101,8 +101,10 @@ external_type(NetDest, non_obj="yes") { external_type(Sequencer) { void readCallback(Address, DataBlock); void readCallback(Address, GenericMachineType, DataBlock); + void readCallback(Address, GenericMachineType, DataBlock, Time, Time, Time); void writeCallback(Address, DataBlock); void writeCallback(Address, GenericMachineType, DataBlock); + void writeCallback(Address, GenericMachineType, DataBlock, Time, Time, Time); void checkCoherence(Address); void profileNack(Address, int, int, uint64); } diff --git a/src/mem/ruby/profiler/Profiler.cc b/src/mem/ruby/profiler/Profiler.cc index 753fdd230..a42d919f7 100644 --- a/src/mem/ruby/profiler/Profiler.cc +++ b/src/mem/ruby/profiler/Profiler.cc @@ -286,6 +286,35 @@ Profiler::printStats(ostream& out, bool short_stats) } } + out << "miss_latency_wCC_issue_to_initial_request: " + << m_wCCIssueToInitialRequestHistogram << endl; + out << "miss_latency_wCC_initial_forward_request: " + << m_wCCInitialRequestToForwardRequestHistogram << endl; + out << "miss_latency_wCC_forward_to_first_response: " + << m_wCCForwardRequestToFirstResponseHistogram << endl; + out << "miss_latency_wCC_first_response_to_completion: " + << m_wCCFirstResponseToCompleteHistogram << endl; + out << "imcomplete_wCC_Times: " << m_wCCIncompleteTimes << endl; + out << "miss_latency_dir_issue_to_initial_request: " + << m_dirIssueToInitialRequestHistogram << endl; + out << "miss_latency_dir_initial_forward_request: " + << m_dirInitialRequestToForwardRequestHistogram << endl; + out << "miss_latency_dir_forward_to_first_response: " + << m_dirForwardRequestToFirstResponseHistogram << endl; + out << "miss_latency_dir_first_response_to_completion: " + << m_dirFirstResponseToCompleteHistogram << endl; + out << "imcomplete_dir_Times: " << m_dirIncompleteTimes << endl; + + for (int i = 0; i < m_missMachLatencyHistograms.size(); i++) { + for (int j = 0; j < m_missMachLatencyHistograms[i].size(); j++) { + if (m_missMachLatencyHistograms[i][j].size() > 0) { + out << "miss_latency_" << RubyRequestType(i) + << "_" << GenericMachineType(j) << ": " + << m_missMachLatencyHistograms[i][j] << endl; + } + } + } + out << endl; out << "All Non-Zero Cycle SW Prefetch Requests" << endl; @@ -454,7 +483,24 @@ Profiler::clearStats() for (int i = 0; i < m_machLatencyHistograms.size(); i++) { m_machLatencyHistograms[i].clear(200); } + m_missMachLatencyHistograms.resize(RubyRequestType_NUM); + for (int i = 0; i < m_missLatencyHistograms.size(); i++) { + m_missMachLatencyHistograms[i].resize(GenericMachineType_NUM+1); + for (int j = 0; j < m_missMachLatencyHistograms[i].size(); j++) { + m_missMachLatencyHistograms[i][j].clear(200); + } + } m_allMissLatencyHistogram.clear(200); + m_wCCIssueToInitialRequestHistogram.clear(200); + m_wCCInitialRequestToForwardRequestHistogram.clear(200); + m_wCCForwardRequestToFirstResponseHistogram.clear(200); + m_wCCFirstResponseToCompleteHistogram.clear(200); + m_wCCIncompleteTimes = 0; + m_dirIssueToInitialRequestHistogram.clear(200); + m_dirInitialRequestToForwardRequestHistogram.clear(200); + m_dirForwardRequestToFirstResponseHistogram.clear(200); + m_dirFirstResponseToCompleteHistogram.clear(200); + m_dirIncompleteTimes = 0; m_SWPrefetchLatencyHistograms.resize(CacheRequestType_NUM); for (int i = 0; i < m_SWPrefetchLatencyHistograms.size(); i++) { @@ -581,6 +627,59 @@ Profiler::missLatency(Time cycles, m_allMissLatencyHistogram.add(cycles); m_missLatencyHistograms[type].add(cycles); m_machLatencyHistograms[respondingMach].add(cycles); + m_missMachLatencyHistograms[type][respondingMach].add(cycles); +} + +void +Profiler::missLatencyWcc(Time issuedTime, + Time initialRequestTime, + Time forwardRequestTime, + Time firstResponseTime, + Time completionTime) +{ + if ((issuedTime <= initialRequestTime) && + (initialRequestTime <= forwardRequestTime) && + (forwardRequestTime <= firstResponseTime) && + (firstResponseTime <= completionTime)) { + m_wCCIssueToInitialRequestHistogram.add(initialRequestTime - issuedTime); + + m_wCCInitialRequestToForwardRequestHistogram.add(forwardRequestTime - + initialRequestTime); + + m_wCCForwardRequestToFirstResponseHistogram.add(firstResponseTime - + forwardRequestTime); + + m_wCCFirstResponseToCompleteHistogram.add(completionTime - + firstResponseTime); + } else { + m_wCCIncompleteTimes++; + } +} + +void +Profiler::missLatencyDir(Time issuedTime, + Time initialRequestTime, + Time forwardRequestTime, + Time firstResponseTime, + Time completionTime) +{ + if ((issuedTime <= initialRequestTime) && + (initialRequestTime <= forwardRequestTime) && + (forwardRequestTime <= firstResponseTime) && + (firstResponseTime <= completionTime)) { + m_dirIssueToInitialRequestHistogram.add(initialRequestTime - issuedTime); + + m_dirInitialRequestToForwardRequestHistogram.add(forwardRequestTime - + initialRequestTime); + + m_dirForwardRequestToFirstResponseHistogram.add(firstResponseTime - + forwardRequestTime); + + m_dirFirstResponseToCompleteHistogram.add(completionTime - + firstResponseTime); + } else { + m_dirIncompleteTimes++; + } } // non-zero cycle prefetch request diff --git a/src/mem/ruby/profiler/Profiler.hh b/src/mem/ruby/profiler/Profiler.hh index de9834f05..3a11f0596 100644 --- a/src/mem/ruby/profiler/Profiler.hh +++ b/src/mem/ruby/profiler/Profiler.hh @@ -138,6 +138,18 @@ class Profiler : public SimObject, public Consumer RubyRequestType type, const GenericMachineType respondingMach); + void missLatencyWcc(Time issuedTime, + Time initialRequestTime, + Time forwardRequestTime, + Time firstResponseTime, + Time completionTime); + + void missLatencyDir(Time issuedTime, + Time initialRequestTime, + Time forwardRequestTime, + Time firstResponseTime, + Time completionTime); + void swPrefetchLatency(Time t, CacheRequestType type, const GenericMachineType respondingMach); @@ -200,6 +212,18 @@ class Profiler : public SimObject, public Consumer std::vector m_missLatencyHistograms; std::vector m_machLatencyHistograms; + std::vector< std::vector > m_missMachLatencyHistograms; + Histogram m_wCCIssueToInitialRequestHistogram; + Histogram m_wCCInitialRequestToForwardRequestHistogram; + Histogram m_wCCForwardRequestToFirstResponseHistogram; + Histogram m_wCCFirstResponseToCompleteHistogram; + int64 m_wCCIncompleteTimes; + Histogram m_dirIssueToInitialRequestHistogram; + Histogram m_dirInitialRequestToForwardRequestHistogram; + Histogram m_dirForwardRequestToFirstResponseHistogram; + Histogram m_dirFirstResponseToCompleteHistogram; + int64 m_dirIncompleteTimes; + Histogram m_allMissLatencyHistogram; Histogram m_allSWPrefetchLatencyHistogram; diff --git a/src/mem/ruby/system/Sequencer.cc b/src/mem/ruby/system/Sequencer.cc index 175bb6721..74b6355e8 100644 --- a/src/mem/ruby/system/Sequencer.cc +++ b/src/mem/ruby/system/Sequencer.cc @@ -356,6 +356,17 @@ void Sequencer::writeCallback(const Address& address, GenericMachineType mach, DataBlock& data) +{ + writeCallback(address, mach, data, 0, 0, 0); +} + +void +Sequencer::writeCallback(const Address& address, + GenericMachineType mach, + DataBlock& data, + Time initialRequestTime, + Time forwardRequestTime, + Time firstResponseTime) { assert(address == line_address(address)); assert(m_writeRequestTable.count(line_address(address))); @@ -385,7 +396,8 @@ Sequencer::writeCallback(const Address& address, m_controller->unblock(address); } - hitCallback(request, mach, data, success); + hitCallback(request, mach, data, success, + initialRequestTime, forwardRequestTime, firstResponseTime); } void @@ -398,6 +410,17 @@ void Sequencer::readCallback(const Address& address, GenericMachineType mach, DataBlock& data) +{ + readCallback(address, mach, data, 0, 0, 0); +} + +void +Sequencer::readCallback(const Address& address, + GenericMachineType mach, + DataBlock& data, + Time initialRequestTime, + Time forwardRequestTime, + Time firstResponseTime) { assert(address == line_address(address)); assert(m_readRequestTable.count(line_address(address))); @@ -413,14 +436,18 @@ Sequencer::readCallback(const Address& address, (request->ruby_request.type == RubyRequestType_RMW_Read) || (request->ruby_request.type == RubyRequestType_IFETCH)); - hitCallback(request, mach, data, true); + hitCallback(request, mach, data, true, + initialRequestTime, forwardRequestTime, firstResponseTime); } void Sequencer::hitCallback(SequencerRequest* srequest, GenericMachineType mach, DataBlock& data, - bool success) + bool success, + Time initialRequestTime, + Time forwardRequestTime, + Time firstResponseTime) { const RubyRequest & ruby_request = srequest->ruby_request; Address request_address(ruby_request.paddr); @@ -445,6 +472,22 @@ Sequencer::hitCallback(SequencerRequest* srequest, if (miss_latency != 0) { g_system_ptr->getProfiler()->missLatency(miss_latency, type, mach); + if (mach == GenericMachineType_L1Cache_wCC) { + g_system_ptr->getProfiler()->missLatencyWcc(issued_time, + initialRequestTime, + forwardRequestTime, + firstResponseTime, + g_eventQueue_ptr->getTime()); + } + + if (mach == GenericMachineType_Directory) { + g_system_ptr->getProfiler()->missLatencyDir(issued_time, + initialRequestTime, + forwardRequestTime, + firstResponseTime, + g_eventQueue_ptr->getTime()); + } + if (Debug::getProtocolTrace()) { if (success) { g_system_ptr->getProfiler()-> diff --git a/src/mem/ruby/system/Sequencer.hh b/src/mem/ruby/system/Sequencer.hh index fecfc9a1a..4ab85dac8 100644 --- a/src/mem/ruby/system/Sequencer.hh +++ b/src/mem/ruby/system/Sequencer.hh @@ -80,12 +80,26 @@ class Sequencer : public RubyPort, public Consumer GenericMachineType mach, DataBlock& data); + void writeCallback(const Address& address, + GenericMachineType mach, + DataBlock& data, + Time initialRequestTime, + Time forwardRequestTime, + Time firstResponseTime); + void readCallback(const Address& address, DataBlock& data); void readCallback(const Address& address, GenericMachineType mach, DataBlock& data); + void readCallback(const Address& address, + GenericMachineType mach, + DataBlock& data, + Time initialRequestTime, + Time forwardRequestTime, + Time firstResponseTime); + RequestStatus makeRequest(const RubyRequest & request); RequestStatus getRequestStatus(const RubyRequest& request); bool empty() const; @@ -106,7 +120,10 @@ class Sequencer : public RubyPort, public Consumer void hitCallback(SequencerRequest* request, GenericMachineType mach, DataBlock& data, - bool success); + bool success, + Time initialRequestTime, + Time forwardRequestTime, + Time firstResponseTime); bool insertRequest(SequencerRequest* request); -- 2.30.2