MOESI_hammer: break down miss latency stalled cycles
authorBrad Beckmann <Brad.Beckmann@amd.com>
Fri, 20 Aug 2010 18:46:14 +0000 (11:46 -0700)
committerBrad Beckmann <Brad.Beckmann@amd.com>
Fri, 20 Aug 2010 18:46:14 +0000 (11:46 -0700)
This patch tracks the number of cycles a transaction is delayed at different
points of the request-forward-response loop.

src/mem/protocol/MOESI_hammer-cache.sm
src/mem/protocol/MOESI_hammer-dir.sm
src/mem/protocol/MOESI_hammer-msg.sm
src/mem/protocol/RubySlicc_Types.sm
src/mem/ruby/profiler/Profiler.cc
src/mem/ruby/profiler/Profiler.hh
src/mem/ruby/system/Sequencer.cc
src/mem/ruby/system/Sequencer.hh

index 06ce696248825e0ac940938e13e541345bdbd908..ae74e284f745f601bcf4f014c2e7b2fa9388f9d3 100644 (file)
@@ -136,6 +136,9 @@ machine(L1Cache, "AMD Hammer-like protocol")
     int NumPendingMsgs,      desc="Number of acks/data messages that this processor is waiting for";
     bool Sharers,            desc="On a GetS, did we find any other sharers in the system";
     MachineID LastResponder, desc="last machine to send a response for this request";
+    Time InitialRequestTime, default="0", desc="time the initial requests was sent from the L1Cache";
+    Time ForwardRequestTime, default="0", desc="time the dir forwarded the request";
+    Time FirstResponseTime, default="0", desc="the time the first response was received";
   }
 
   external_type(TBETable) {
@@ -424,6 +427,7 @@ machine(L1Cache, "AMD Hammer-like protocol")
       out_msg.Requestor := machineID;
       out_msg.Destination.add(map_Address_to_Directory(address));
       out_msg.MessageSize := MessageSizeType:Request_Control;
+      out_msg.InitialRequestTime := get_time();
       TBEs[address].NumPendingMsgs := machineCount(MachineType:L1Cache); // One from each other cache (n-1) plus the memory (+1)
     }
   }
@@ -435,6 +439,7 @@ machine(L1Cache, "AMD Hammer-like protocol")
       out_msg.Requestor := machineID;
       out_msg.Destination.add(map_Address_to_Directory(address));
       out_msg.MessageSize := MessageSizeType:Request_Control;
+      out_msg.InitialRequestTime := get_time();
       TBEs[address].NumPendingMsgs := machineCount(MachineType:L1Cache); // One from each other cache (n-1) plus the memory (+1)
     }
   }
@@ -454,6 +459,8 @@ machine(L1Cache, "AMD Hammer-like protocol")
           out_msg.Acks := 2;
         }
         out_msg.MessageSize := MessageSizeType:Response_Data;
+        out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+        out_msg.ForwardRequestTime := in_msg.ForwardRequestTime;
       }
     }
   }
@@ -483,6 +490,8 @@ machine(L1Cache, "AMD Hammer-like protocol")
           out_msg.Acks := 2;
         }
         out_msg.MessageSize := MessageSizeType:Response_Data;
+        out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+        out_msg.ForwardRequestTime := in_msg.ForwardRequestTime;
       }
     }
   }
@@ -503,6 +512,8 @@ machine(L1Cache, "AMD Hammer-like protocol")
           out_msg.Acks := 2;
         }
         out_msg.MessageSize := MessageSizeType:Response_Data;
+        out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+        out_msg.ForwardRequestTime := in_msg.ForwardRequestTime;
       }
     }
   }
@@ -517,6 +528,8 @@ machine(L1Cache, "AMD Hammer-like protocol")
         out_msg.Acks := 1;
         assert(in_msg.DirectedProbe == false);
         out_msg.MessageSize := MessageSizeType:Response_Control;
+        out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+        out_msg.ForwardRequestTime := in_msg.ForwardRequestTime;
       }
     }
   }
@@ -531,6 +544,8 @@ machine(L1Cache, "AMD Hammer-like protocol")
         out_msg.Acks := 1;
         assert(in_msg.DirectedProbe == false);
         out_msg.MessageSize := MessageSizeType:Response_Control;
+        out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+        out_msg.ForwardRequestTime := in_msg.ForwardRequestTime;
       }
     }
   }
@@ -580,7 +595,10 @@ machine(L1Cache, "AMD Hammer-like protocol")
 
       sequencer.readCallback(address, 
                              getNondirectHitMachType(in_msg.Address, in_msg.Sender),
-                             getCacheEntry(address).DataBlk);
+                             getCacheEntry(address).DataBlk,
+                             TBEs[address].InitialRequestTime,
+                             TBEs[address].ForwardRequestTime,
+                             TBEs[address].FirstResponseTime);
 
     }
   }
@@ -605,7 +623,10 @@ machine(L1Cache, "AMD Hammer-like protocol")
 
       sequencer.writeCallback(address, 
                               getNondirectHitMachType(address, in_msg.Sender),
-                              getCacheEntry(address).DataBlk);
+                              getCacheEntry(address).DataBlk,
+                              TBEs[address].InitialRequestTime,
+                              TBEs[address].ForwardRequestTime,
+                              TBEs[address].FirstResponseTime);
 
     }
     getCacheEntry(address).Dirty := true;
@@ -617,7 +638,10 @@ machine(L1Cache, "AMD Hammer-like protocol")
     sequencer.writeCallback(address, 
                             getNondirectHitMachType(address, 
                                                     TBEs[address].LastResponder),
-                            getCacheEntry(address).DataBlk);
+                            getCacheEntry(address).DataBlk,
+                            TBEs[address].InitialRequestTime,
+                            TBEs[address].ForwardRequestTime,
+                            TBEs[address].FirstResponseTime);
 
     getCacheEntry(address).Dirty := true;
   }
@@ -649,6 +673,21 @@ machine(L1Cache, "AMD Hammer-like protocol")
       TBEs[address].NumPendingMsgs := TBEs[address].NumPendingMsgs - in_msg.Acks;
       DEBUG_EXPR(TBEs[address].NumPendingMsgs);
       TBEs[address].LastResponder := in_msg.Sender;
+      if (TBEs[address].InitialRequestTime != zero_time() && in_msg.InitialRequestTime != zero_time()) {
+        assert(TBEs[address].InitialRequestTime == in_msg.InitialRequestTime);
+      }
+      if (in_msg.InitialRequestTime != zero_time()) {
+        TBEs[address].InitialRequestTime := in_msg.InitialRequestTime;
+      }
+      if (TBEs[address].ForwardRequestTime != zero_time() && in_msg.ForwardRequestTime != zero_time()) {
+        assert(TBEs[address].ForwardRequestTime == in_msg.ForwardRequestTime);
+      }
+      if (in_msg.ForwardRequestTime != zero_time()) {
+        TBEs[address].ForwardRequestTime := in_msg.ForwardRequestTime;
+      }
+      if (TBEs[address].FirstResponseTime == zero_time()) {
+        TBEs[address].FirstResponseTime := get_time();
+      }
     }
   }
 
@@ -700,6 +739,8 @@ machine(L1Cache, "AMD Hammer-like protocol")
           out_msg.Acks := 2;
         }
         out_msg.MessageSize := MessageSizeType:Response_Data;
+        out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+        out_msg.ForwardRequestTime := in_msg.ForwardRequestTime;
       }
     }
   }
index ae282ba3a24fe3f4ea9f80e592854fc068ea519e..d4b36cded736e139c693d552349a9a1e558944aa 100644 (file)
@@ -645,6 +645,8 @@ machine(Directory, "AMD Hammer-like protocol")
           out_msg.Destination.broadcast(MachineType:L1Cache); // Send to all L1 caches
           out_msg.Destination.remove(in_msg.Requestor); // Don't include the original requestor
           out_msg.MessageSize := MessageSizeType:Broadcast_Control;
+          out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+          out_msg.ForwardRequestTime := get_time();
         }
       }
     }
@@ -685,6 +687,8 @@ machine(Directory, "AMD Hammer-like protocol")
           out_msg.Destination.broadcast(MachineType:L1Cache); // Send to all L1 caches
           out_msg.Destination.remove(in_msg.Requestor); // Don't include the original requestor
           out_msg.MessageSize := MessageSizeType:Broadcast_Control;
+          out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+          out_msg.ForwardRequestTime := get_time();
         }
       }
     }
@@ -701,6 +705,8 @@ machine(Directory, "AMD Hammer-like protocol")
           out_msg.Destination.add(getPfEntry(address).Owner);
           out_msg.MessageSize := MessageSizeType:Request_Control;
           out_msg.DirectedProbe := true;
+          out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+          out_msg.ForwardRequestTime := get_time();
         }
       }      
     } else {
@@ -712,6 +718,8 @@ machine(Directory, "AMD Hammer-like protocol")
           out_msg.Destination.broadcast(MachineType:L1Cache); // Send to all L1 caches
           out_msg.Destination.remove(in_msg.Requestor); // Don't include the original requestor
           out_msg.MessageSize := MessageSizeType:Broadcast_Control;
+          out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+          out_msg.ForwardRequestTime := get_time();
         }
       }
     }
index f414d599df61f933c6277a4542ab17c77a587f95..05a52b881cc572a65943abb74f888bfd99e26ab4 100644 (file)
@@ -78,6 +78,8 @@ structure(RequestMsg, desc="...", interface="NetworkMessage") {
   NetDest Destination,             desc="Multicast destination mask";
   MessageSizeType MessageSize, desc="size category of the message";
   bool DirectedProbe, default="false", desc="probe filter directed probe";
+  Time InitialRequestTime, default="0", desc="time the initial requests was sent from the L1Cache";
+  Time ForwardRequestTime, default="0", desc="time the dir forwarded the request";
 }
 
 // ResponseMsg (and also unblock requests)
@@ -90,6 +92,8 @@ structure(ResponseMsg, desc="...", interface="NetworkMessage") {
   bool Dirty,                  desc="Is the data dirty (different than memory)?";
   int Acks,                    desc="How many messages this counts as";
   MessageSizeType MessageSize, desc="size category of the message";
+  Time InitialRequestTime, default="0", desc="time the initial requests was sent from the L1Cache";
+  Time ForwardRequestTime, default="0", desc="time the dir forwarded the request";
 }
 
 enumeration(DMARequestType, desc="...", default="DMARequestType_NULL") {
index 8dcdabeb8c2cfa3b86e22d9ae7944c23b1b74803..9d3cd66342bf664a651182ee97b4e574c0a25dde 100644 (file)
@@ -101,8 +101,10 @@ external_type(NetDest, non_obj="yes") {
 external_type(Sequencer) {
   void readCallback(Address, DataBlock);
   void readCallback(Address, GenericMachineType, DataBlock);
+  void readCallback(Address, GenericMachineType, DataBlock, Time, Time, Time);
   void writeCallback(Address, DataBlock);
   void writeCallback(Address, GenericMachineType, DataBlock);
+  void writeCallback(Address, GenericMachineType, DataBlock, Time, Time, Time);
   void checkCoherence(Address);
   void profileNack(Address, int, int, uint64);
 }
index 753fdd23051d23d8be428128504776ad9bd2bcc8..a42d919f76dfd62e15123ce19a08e133cc71f4c6 100644 (file)
@@ -286,6 +286,35 @@ Profiler::printStats(ostream& out, bool short_stats)
             }
         }
 
+        out << "miss_latency_wCC_issue_to_initial_request: " 
+            << m_wCCIssueToInitialRequestHistogram << endl;
+        out << "miss_latency_wCC_initial_forward_request: " 
+            << m_wCCInitialRequestToForwardRequestHistogram << endl;
+        out << "miss_latency_wCC_forward_to_first_response: " 
+            << m_wCCForwardRequestToFirstResponseHistogram << endl;
+        out << "miss_latency_wCC_first_response_to_completion: " 
+            << m_wCCFirstResponseToCompleteHistogram << endl;
+        out << "imcomplete_wCC_Times: " << m_wCCIncompleteTimes << endl;
+        out << "miss_latency_dir_issue_to_initial_request: " 
+            << m_dirIssueToInitialRequestHistogram << endl;
+        out << "miss_latency_dir_initial_forward_request: " 
+            << m_dirInitialRequestToForwardRequestHistogram << endl;
+        out << "miss_latency_dir_forward_to_first_response: " 
+            << m_dirForwardRequestToFirstResponseHistogram << endl;
+        out << "miss_latency_dir_first_response_to_completion: " 
+            << m_dirFirstResponseToCompleteHistogram << endl;
+        out << "imcomplete_dir_Times: " << m_dirIncompleteTimes << endl;
+
+        for (int i = 0; i < m_missMachLatencyHistograms.size(); i++) {
+            for (int j = 0; j < m_missMachLatencyHistograms[i].size(); j++) {
+                if (m_missMachLatencyHistograms[i][j].size() > 0) {
+                    out << "miss_latency_" << RubyRequestType(i) 
+                        << "_" << GenericMachineType(j) << ": "
+                        << m_missMachLatencyHistograms[i][j] << endl;
+                }
+            }
+        }
+
         out << endl;
 
         out << "All Non-Zero Cycle SW Prefetch Requests" << endl;
@@ -454,7 +483,24 @@ Profiler::clearStats()
     for (int i = 0; i < m_machLatencyHistograms.size(); i++) {
         m_machLatencyHistograms[i].clear(200);
     }
+    m_missMachLatencyHistograms.resize(RubyRequestType_NUM);
+    for (int i = 0; i < m_missLatencyHistograms.size(); i++) {
+        m_missMachLatencyHistograms[i].resize(GenericMachineType_NUM+1);
+        for (int j = 0; j < m_missMachLatencyHistograms[i].size(); j++) {
+            m_missMachLatencyHistograms[i][j].clear(200);
+        }
+    }
     m_allMissLatencyHistogram.clear(200);
+    m_wCCIssueToInitialRequestHistogram.clear(200);
+    m_wCCInitialRequestToForwardRequestHistogram.clear(200);
+    m_wCCForwardRequestToFirstResponseHistogram.clear(200);
+    m_wCCFirstResponseToCompleteHistogram.clear(200);
+    m_wCCIncompleteTimes = 0;
+    m_dirIssueToInitialRequestHistogram.clear(200);
+    m_dirInitialRequestToForwardRequestHistogram.clear(200);
+    m_dirForwardRequestToFirstResponseHistogram.clear(200);
+    m_dirFirstResponseToCompleteHistogram.clear(200);
+    m_dirIncompleteTimes = 0;
 
     m_SWPrefetchLatencyHistograms.resize(CacheRequestType_NUM);
     for (int i = 0; i < m_SWPrefetchLatencyHistograms.size(); i++) {
@@ -581,6 +627,59 @@ Profiler::missLatency(Time cycles,
     m_allMissLatencyHistogram.add(cycles);
     m_missLatencyHistograms[type].add(cycles);
     m_machLatencyHistograms[respondingMach].add(cycles);
+    m_missMachLatencyHistograms[type][respondingMach].add(cycles);
+}
+
+void
+Profiler::missLatencyWcc(Time issuedTime,
+                         Time initialRequestTime,
+                         Time forwardRequestTime,
+                         Time firstResponseTime,
+                         Time completionTime)
+{
+    if ((issuedTime <= initialRequestTime) &&
+        (initialRequestTime <= forwardRequestTime) &&
+        (forwardRequestTime <= firstResponseTime) &&
+        (firstResponseTime <= completionTime)) {
+        m_wCCIssueToInitialRequestHistogram.add(initialRequestTime - issuedTime);
+        
+        m_wCCInitialRequestToForwardRequestHistogram.add(forwardRequestTime - 
+                                                         initialRequestTime);
+        
+        m_wCCForwardRequestToFirstResponseHistogram.add(firstResponseTime - 
+                                                        forwardRequestTime);
+        
+        m_wCCFirstResponseToCompleteHistogram.add(completionTime - 
+                                                  firstResponseTime);
+    } else {
+        m_wCCIncompleteTimes++;
+    }
+}
+
+void
+Profiler::missLatencyDir(Time issuedTime,
+                         Time initialRequestTime,
+                         Time forwardRequestTime,
+                         Time firstResponseTime,
+                         Time completionTime)
+{
+    if ((issuedTime <= initialRequestTime) &&
+        (initialRequestTime <= forwardRequestTime) &&
+        (forwardRequestTime <= firstResponseTime) &&
+        (firstResponseTime <= completionTime)) {
+        m_dirIssueToInitialRequestHistogram.add(initialRequestTime - issuedTime);
+        
+        m_dirInitialRequestToForwardRequestHistogram.add(forwardRequestTime - 
+                                                         initialRequestTime);
+        
+        m_dirForwardRequestToFirstResponseHistogram.add(firstResponseTime - 
+                                                        forwardRequestTime);
+        
+        m_dirFirstResponseToCompleteHistogram.add(completionTime - 
+                                                  firstResponseTime);
+    } else {
+        m_dirIncompleteTimes++;
+    }
 }
 
 // non-zero cycle prefetch request
index de9834f0523b9f8d1610a4ea666c401eefe176a5..3a11f059605d0d79ff81640584648725b122b99f 100644 (file)
@@ -138,6 +138,18 @@ class Profiler : public SimObject, public Consumer
                      RubyRequestType type,
                      const GenericMachineType respondingMach);
 
+    void missLatencyWcc(Time issuedTime,
+                        Time initialRequestTime,
+                        Time forwardRequestTime,
+                        Time firstResponseTime,
+                        Time completionTime);
+    
+    void missLatencyDir(Time issuedTime,
+                        Time initialRequestTime,
+                        Time forwardRequestTime,
+                        Time firstResponseTime,
+                        Time completionTime);
+    
     void swPrefetchLatency(Time t, 
                            CacheRequestType type,
                            const GenericMachineType respondingMach);
@@ -200,6 +212,18 @@ class Profiler : public SimObject, public Consumer
 
     std::vector<Histogram> m_missLatencyHistograms;
     std::vector<Histogram> m_machLatencyHistograms;
+    std::vector< std::vector<Histogram> > m_missMachLatencyHistograms;
+    Histogram m_wCCIssueToInitialRequestHistogram;
+    Histogram m_wCCInitialRequestToForwardRequestHistogram;
+    Histogram m_wCCForwardRequestToFirstResponseHistogram;
+    Histogram m_wCCFirstResponseToCompleteHistogram;
+    int64 m_wCCIncompleteTimes;
+    Histogram m_dirIssueToInitialRequestHistogram;
+    Histogram m_dirInitialRequestToForwardRequestHistogram;
+    Histogram m_dirForwardRequestToFirstResponseHistogram;
+    Histogram m_dirFirstResponseToCompleteHistogram;
+    int64 m_dirIncompleteTimes;
+
     Histogram m_allMissLatencyHistogram;
 
     Histogram m_allSWPrefetchLatencyHistogram;
index 175bb6721fa98e5a6030208d8cf9ce095739598c..74b6355e8c93eb9aa84ee56c7981dfda0deb9ee0 100644 (file)
@@ -356,6 +356,17 @@ void
 Sequencer::writeCallback(const Address& address,
                          GenericMachineType mach, 
                          DataBlock& data)
+{
+    writeCallback(address, mach, data, 0, 0, 0);
+}
+
+void
+Sequencer::writeCallback(const Address& address,
+                         GenericMachineType mach, 
+                         DataBlock& data,
+                         Time initialRequestTime,
+                         Time forwardRequestTime,
+                         Time firstResponseTime)
 {
     assert(address == line_address(address));
     assert(m_writeRequestTable.count(line_address(address)));
@@ -385,7 +396,8 @@ Sequencer::writeCallback(const Address& address,
         m_controller->unblock(address);
     }
 
-    hitCallback(request, mach, data, success);
+    hitCallback(request, mach, data, success, 
+                initialRequestTime, forwardRequestTime, firstResponseTime);
 }
 
 void
@@ -398,6 +410,17 @@ void
 Sequencer::readCallback(const Address& address,
                         GenericMachineType mach,
                         DataBlock& data)
+{
+    readCallback(address, mach, data, 0, 0, 0);
+}
+
+void
+Sequencer::readCallback(const Address& address,
+                        GenericMachineType mach,
+                        DataBlock& data,
+                        Time initialRequestTime,
+                        Time forwardRequestTime,
+                        Time firstResponseTime)
 {
     assert(address == line_address(address));
     assert(m_readRequestTable.count(line_address(address)));
@@ -413,14 +436,18 @@ Sequencer::readCallback(const Address& address,
            (request->ruby_request.type == RubyRequestType_RMW_Read) ||
            (request->ruby_request.type == RubyRequestType_IFETCH));
 
-    hitCallback(request, mach, data, true);
+    hitCallback(request, mach, data, true, 
+                initialRequestTime, forwardRequestTime, firstResponseTime);
 }
 
 void
 Sequencer::hitCallback(SequencerRequest* srequest,
                        GenericMachineType mach,
                        DataBlock& data,
-                       bool success)
+                       bool success,
+                       Time initialRequestTime,
+                       Time forwardRequestTime,
+                       Time firstResponseTime)
 {
     const RubyRequest & ruby_request = srequest->ruby_request;
     Address request_address(ruby_request.paddr);
@@ -445,6 +472,22 @@ Sequencer::hitCallback(SequencerRequest* srequest,
     if (miss_latency != 0) {
         g_system_ptr->getProfiler()->missLatency(miss_latency, type, mach);
 
+        if (mach == GenericMachineType_L1Cache_wCC) {
+            g_system_ptr->getProfiler()->missLatencyWcc(issued_time,
+                                                   initialRequestTime,
+                                                   forwardRequestTime,
+                                                   firstResponseTime,
+                                                   g_eventQueue_ptr->getTime());
+        }
+
+        if (mach == GenericMachineType_Directory) {
+            g_system_ptr->getProfiler()->missLatencyDir(issued_time,
+                                                   initialRequestTime,
+                                                   forwardRequestTime,
+                                                   firstResponseTime,
+                                                   g_eventQueue_ptr->getTime());
+        }
+
         if (Debug::getProtocolTrace()) {
             if (success) {
                 g_system_ptr->getProfiler()->
index fecfc9a1a39f47284cda8a3d3e33c4cfa443c1c0..4ab85dac8078cf34917dd92758c60bad9b99a2db 100644 (file)
@@ -80,12 +80,26 @@ class Sequencer : public RubyPort, public Consumer
                        GenericMachineType mach, 
                        DataBlock& data);
 
+    void writeCallback(const Address& address, 
+                       GenericMachineType mach, 
+                       DataBlock& data,
+                       Time initialRequestTime,
+                       Time forwardRequestTime,
+                       Time firstResponseTime);
+
     void readCallback(const Address& address, DataBlock& data);
 
     void readCallback(const Address& address, 
                       GenericMachineType mach, 
                       DataBlock& data);
 
+    void readCallback(const Address& address, 
+                      GenericMachineType mach, 
+                      DataBlock& data,
+                      Time initialRequestTime,
+                      Time forwardRequestTime,
+                      Time firstResponseTime);
+
     RequestStatus makeRequest(const RubyRequest & request);
     RequestStatus getRequestStatus(const RubyRequest& request);
     bool empty() const;
@@ -106,7 +120,10 @@ class Sequencer : public RubyPort, public Consumer
     void hitCallback(SequencerRequest* request, 
                      GenericMachineType mach,
                      DataBlock& data,
-                     bool success);
+                     bool success,
+                     Time initialRequestTime,
+                     Time forwardRequestTime,
+                     Time firstResponseTime);
 
     bool insertRequest(SequencerRequest* request);