mem-ruby: Fixing token port responses in GPUCoalescer

author Matthew Poremba <matthew.poremba@amd.com>

Fri, 25 Sep 2020 20:50:31 +0000 (15:50 -0500)

committer Matthew Poremba <matthew.poremba@amd.com>

Wed, 30 Sep 2020 20:19:36 +0000 (20:19 +0000)
author Matthew Poremba <matthew.poremba@amd.com>
Fri, 25 Sep 2020 20:50:31 +0000 (15:50 -0500)
committer Matthew Poremba <matthew.poremba@amd.com>
Wed, 30 Sep 2020 20:19:36 +0000 (20:19 +0000)
diff --git a/src/mem/ruby/system/GPUCoalescer.cc b/src/mem/ruby/system/GPUCoalescer.cc

index e70293269dc0123e6e84e54d6d9c0ab994abae1c..310ba72fc77703fe3aac73bbcb60f0459a2af93e 100644 (file)
--- a/src/mem/ruby/system/GPUCoalescer.cc
+++ b/src/mem/ruby/system/GPUCoalescer.cc
@@ -77,6 +77,26 @@ UncoalescedTable::packetAvailable()
      return !instMap.empty();
  }
  
+void
+UncoalescedTable::initPacketsRemaining(InstSeqNum seqNum, int count)
+{
+    if (!instPktsRemaining.count(seqNum)) {
+        instPktsRemaining[seqNum] = count;
+    }
+}
+
+int
+UncoalescedTable::getPacketsRemaining(InstSeqNum seqNum)
+{
+    return instPktsRemaining[seqNum];
+}
+
+void
+UncoalescedTable::setPacketsRemaining(InstSeqNum seqNum, int count)
+{
+    instPktsRemaining[seqNum] = count;
+}
+
  PerInstPackets*
  UncoalescedTable::getInstPackets(int offset)
  {
@@ -94,9 +114,20 @@ void
  UncoalescedTable::updateResources()
  {
      for (auto iter = instMap.begin(); iter != instMap.end(); ) {
-        if (iter->second.empty()) {
-            DPRINTF(GPUCoalescer, "Returning token seqNum %d\n", iter->first);
+        InstSeqNum seq_num = iter->first;
+        DPRINTF(GPUCoalescer, "%s checking remaining pkts for %d\n",
+                coalescer->name().c_str(), seq_num);
+        assert(instPktsRemaining.count(seq_num));
+
+        if (instPktsRemaining[seq_num] == 0) {
+            assert(iter->second.empty());
+
+            // Remove from both maps
              instMap.erase(iter++);
+            instPktsRemaining.erase(seq_num);
+
+            // Release the token
+            DPRINTF(GPUCoalescer, "Returning token seqNum %d\n", seq_num);
              coalescer->getGMTokenPort().sendTokens(1);
          } else {
              ++iter;
@@ -555,16 +586,23 @@ GPUCoalescer::makeRequest(PacketPtr pkt)
          // otherwise, this must be either read or write command
          assert(pkt->isRead() || pkt->isWrite());
  
+        InstSeqNum seq_num = pkt->req->getReqInstSeqNum();
+        int num_packets = getDynInst(pkt)->exec_mask.count();
+
          // the pkt is temporarily stored in the uncoalesced table until
          // it's picked for coalescing process later in this cycle or in a
-        // future cycle
+        // future cycle. Packets remaining is set to the number of excepted
+        // requests from the instruction based on its exec_mask.
          uncoalescedTable.insertPacket(pkt);
+        uncoalescedTable.initPacketsRemaining(seq_num, num_packets);
          DPRINTF(GPUCoalescer, "Put pkt with addr 0x%X to uncoalescedTable\n",
                  pkt->getAddr());
  
          // we schedule an issue event here to process the uncoalesced table
          // and try to issue Ruby request to cache system
          if (!issueEvent.scheduled()) {
+            DPRINTF(GPUCoalescer, "Scheduled issueEvent for seqNum %d\n",
+                    seq_num);
              schedule(issueEvent, curTick());
          }
      }
@@ -595,6 +633,18 @@ GPUCoalescer::print(ostream& out) const
          << "]";
  }
  
+GPUDynInstPtr
+GPUCoalescer::getDynInst(PacketPtr pkt) const
+{
+    RubyPort::SenderState* ss =
+            safe_cast<RubyPort::SenderState*>(pkt->senderState);
+
+    ComputeUnit::DataPort::SenderState* cu_state =
+        safe_cast<ComputeUnit::DataPort::SenderState*>
+            (ss->predecessor);
+
+    return cu_state->_gpuDynInst;
+}
  
  bool
  GPUCoalescer::coalescePacket(PacketPtr pkt)
@@ -674,10 +724,7 @@ GPUCoalescer::coalescePacket(PacketPtr pkt)
                  // CU will use that instruction to decrement wait counters
                  // in the issuing wavefront.
                  // For Ruby tester, gpuDynInst == nullptr
-                ComputeUnit::DataPort::SenderState* cu_state =
-                    safe_cast<ComputeUnit::DataPort::SenderState*>
-                        (ss->predecessor);
-                gpuDynInst = cu_state->_gpuDynInst;
+                gpuDynInst = getDynInst(pkt);
              }
  
              PendingWriteInst& inst = pendingWriteInsts[seqNum];
@@ -698,21 +745,45 @@ GPUCoalescer::completeIssue()
      // Iterate over the maximum number of instructions we can coalesce
      // per cycle (coalescingWindow).
      for (int instIdx = 0; instIdx < coalescingWindow; ++instIdx) {
-        PerInstPackets *pktList =
+        PerInstPackets *pkt_list =
              uncoalescedTable.getInstPackets(instIdx);
  
          // getInstPackets will return nullptr if no instruction
          // exists at the current offset.
-        if (!pktList) {
+        if (!pkt_list) {
              break;
+        } else if (pkt_list->empty()) {
+            // Found something, but it has not been cleaned up by update
+            // resources yet. See if there is anything else to coalesce.
+            // Assume we can't check anymore if the coalescing window is 1.
+            continue;
          } else {
+            // All packets in the list have the same seqNum, use first.
+            InstSeqNum seq_num = pkt_list->front()->req->getReqInstSeqNum();
+
+            // The difference in list size before and after tells us the
+            // number of packets which were coalesced.
+            size_t pkt_list_size = pkt_list->size();
+
              // Since we have a pointer to the list of packets in the inst,
              // erase them from the list if coalescing is successful and
              // leave them in the list otherwise. This aggressively attempts
              // to coalesce as many packets as possible from the current inst.
-            pktList->remove_if(
+            pkt_list->remove_if(
                  [&](PacketPtr pkt) { return coalescePacket(pkt); }
              );
+
+            assert(pkt_list_size >= pkt_list->size());
+            size_t pkt_list_diff = pkt_list_size - pkt_list->size();
+
+            int num_remaining = uncoalescedTable.getPacketsRemaining(seq_num);
+            num_remaining -= pkt_list_diff;
+            assert(num_remaining >= 0);
+
+            uncoalescedTable.setPacketsRemaining(seq_num, num_remaining);
+            DPRINTF(GPUCoalescer,
+                    "Coalesced %d pkts for seqNum %d, %d remaining\n",
+                    pkt_list_diff, seq_num, num_remaining);
          }
      }
  
diff --git a/src/mem/ruby/system/GPUCoalescer.hh b/src/mem/ruby/system/GPUCoalescer.hh

index 3b1b7af2b144a97e9b1093e6375209cd02e9f353..2684d51bdf047592ed4f0fd1fa03cab0a402e8b1 100644 (file)
--- a/src/mem/ruby/system/GPUCoalescer.hh
+++ b/src/mem/ruby/system/GPUCoalescer.hh
@@ -70,12 +70,18 @@ class UncoalescedTable
      bool packetAvailable();
      void printRequestTable(std::stringstream& ss);
  
+    // Modify packets remaining map. Init sets value iff the seqNum has not
+    // yet been seen before. get/set act as a regular getter/setter.
+    void initPacketsRemaining(InstSeqNum seqNum, int count);
+    int getPacketsRemaining(InstSeqNum seqNum);
+    void setPacketsRemaining(InstSeqNum seqNum, int count);
+
      // Returns a pointer to the list of packets corresponding to an
      // instruction in the instruction map or nullptr if there are no
      // instructions at the offset.
      PerInstPackets* getInstPackets(int offset);
      void updateResources();
-    bool areRequestsDone(const uint64_t instSeqNum);
+    bool areRequestsDone(const InstSeqNum instSeqNum);
  
      // Check if a packet hasn't been removed from instMap in too long.
      // Panics if a deadlock is detected and returns nothing otherwise.
@@ -88,7 +94,9 @@ class UncoalescedTable
      // which need responses. This data structure assumes the sequence number
      // is monotonically increasing (which is true for CU class) in order to
      // issue packets in age order.
-    std::map<uint64_t, PerInstPackets> instMap;
+    std::map<InstSeqNum, PerInstPackets> instMap;
+
+    std::map<InstSeqNum, int> instPktsRemaining;
  };
  
  class CoalescedRequest
@@ -389,6 +397,8 @@ class GPUCoalescer : public RubyPort
  
      virtual RubyRequestType getRequestType(PacketPtr pkt);
  
+    GPUDynInstPtr getDynInst(PacketPtr pkt) const;
+
      // Attempt to remove a packet from the uncoalescedTable and coalesce
      // with a previous request from the same instruction. If there is no
      // previous instruction and the max number of outstanding requests has
author	Matthew Poremba <matthew.poremba@amd.com>
	Fri, 25 Sep 2020 20:50:31 +0000 (15:50 -0500)
committer	Matthew Poremba <matthew.poremba@amd.com>
	Wed, 30 Sep 2020 20:19:36 +0000 (20:19 +0000)
src/mem/ruby/system/GPUCoalescer.cc		patch \| blob \| history
src/mem/ruby/system/GPUCoalescer.hh		patch \| blob \| history