gpu-compute,mem-ruby: Properly create/handle WriteCompletePkts
author Kyle Roarty <kyleroarty1716@gmail.com>
Fri, 28 Aug 2020 22:42:10 +0000 (17:42 -0500)
committer Kyle Roarty <kyleroarty1716@gmail.com>
Thu, 15 Oct 2020 17:52:51 +0000 (17:52 +0000)
Writes flow through the following sequence of packets:
WriteReq -> WriteResp -> WriteCompleteResp

These packets share some variables, in particular senderState and a
status vector.

One issue was that the WriteResp packet decremented the status vector
that the WriteCompleteResp packets rely on to determine when to handle
the global memory response. This could lead to multiple
WriteCompleteResp packets attempting to handle the same global memory
response.
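
Schematically, the old behavior looked roughly like the sketch below.
This is not gem5 code: the single pendingLanes counter and the handler
names are simplified stand-ins for the per-instruction status-vector
bookkeeping and GlobalMemPipeline::handleResponse.

    #include <cstdio>

    struct Inst { int pendingLanes; };

    void handleGlobalMemResponse(Inst *) { std::puts("handled"); }

    // Both response types decremented the shared counter, so it could
    // reach zero while WriteCompleteResp packets were still in flight:
    void onWriteResp(Inst *inst)         { --inst->pendingLanes; }  // bug
    void onWriteCompleteResp(Inst *inst) {
        --inst->pendingLanes;
        if (inst->pendingLanes <= 0)
            handleGlobalMemResponse(inst);  // can fire more than once
    }

    int main() {
        Inst i{2};                // two lanes outstanding
        onWriteResp(&i);          // counter drops to 1 too early
        onWriteCompleteResp(&i);  // reaches 0 -> handled
        onWriteCompleteResp(&i);  // reaches -1 -> handled again
        return 0;
    }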

Because of that, the WriteCompleteResp packets need to handle the
status vector themselves. This patch moves WriteCompleteResp packet
handling back into ComputeUnit::DataPort::processMemRespEvent from
ComputeUnit::DataPort::recvTimingResp, which also removes some
redundant code.

This patch has the WriteResp packet return without doing any status
vector handling and without deleting the senderState; deleting it at
that point had previously caused a segfault.

Another issue was that WriteCompleteResp packets weren't being issued
for each active lane, because the coalesced request was being issued
too early. To fix that, we must ensure every active lane puts its
request into the applicable coalesced request before that coalesced
request is issued. This patch therefore moves the issuing of
CoalescedRequests from GPUCoalescer::coalescePacket to
GPUCoalescer::completeIssue.

That change adds a new variable, a map from instruction sequence
number to coalesced requests, to store the CoalescedRequests created
in the calls to coalescePacket.
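
In simplified form, the deferred-issue flow looks like the runnable
toy model below. The signatures are assumptions made for illustration
(the real methods take PacketPtrs and live on GPUCoalescer), and
operator[] stands in for the count()/insert() logic in the actual
change.

    #include <cstdint>
    #include <cstdio>
    #include <deque>
    #include <unordered_map>

    struct CoalescedRequest { int id; };

    // Toy stand-in for GPUCoalescer::issueRequest.
    void issueRequest(CoalescedRequest *creq)
    {
        std::printf("issue %d\n", creq->id);
    }

    // Map from instruction sequence number to the requests created
    // for that instruction.
    std::unordered_map<uint64_t, std::deque<CoalescedRequest*>>
        coalescedReqs;

    // coalescePacket() now only records the new request...
    void coalescePacket(uint64_t seqNum, CoalescedRequest *creq)
    {
        coalescedReqs[seqNum].push_back(creq);
    }

    // ...and completeIssue() issues everything recorded for the
    // instruction once all of its lanes have been coalesced.
    void completeIssue(uint64_t seq_num)
    {
        if (coalescedReqs.count(seq_num)) {
            for (auto *creq : coalescedReqs.at(seq_num))
                issueRequest(creq);
            coalescedReqs.erase(seq_num);
        }
    }

    int main()
    {
        CoalescedRequest a{0}, b{1};
        coalescePacket(7, &a);   // lane 0's request
        coalescePacket(7, &b);   // lane 1's request
        completeIssue(7);        // both issued together
        return 0;
    }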

Additionally, the WriteCompleteResp packet was attempting to access
physical memory in hitCallback despite carrying no data, which caused
a crash. This could be resolved either by not allowing
WriteCompleteResp packets to access memory or by copying the data from
the WriteReq packet. This patch denies WriteCompleteResp packets
memory access in hitCallback.

Finally, VIPERCoalescer::writeCompleteCallback kept a map holding the
WriteComplete packets, but packets were never removed from it. This
patch removes the packets whose line address matches the address
passed in to the function.
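
The removal uses the standard C++ erase-remove idiom, with
hitCallback invoked as a side effect of the removal predicate. A
self-contained illustration of the pattern, with plain ints standing
in for the packet deque and printf for the callback:

    #include <algorithm>
    #include <cstdio>
    #include <deque>

    int main()
    {
        std::deque<int> pkts = {1, 2, 3, 2};
        // remove_if shifts the kept elements to the front and returns
        // the new logical end; erase() then drops the leftovers.
        pkts.erase(std::remove_if(pkts.begin(), pkts.end(),
                                  [](int v) {
                                      if (v == 2) {
                                          std::printf("callback(%d)\n", v);
                                          return true;   // remove it
                                      }
                                      return false;      // keep it
                                  }),
                   pkts.end());
        // pkts now holds {1, 3}
        return 0;
    }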

Change-Id: I9a064a0def2bf6c513f5295596c56b1b652b0ca4
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/33656
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
Reviewed-by: Anthony Gutierrez <anthony.gutierrez@amd.com>
Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
Maintainer: Anthony Gutierrez <anthony.gutierrez@amd.com>
Tested-by: kokoro <noreply+kokoro@google.com>
src/gpu-compute/compute_unit.cc
src/mem/ruby/system/GPUCoalescer.cc
src/mem/ruby/system/GPUCoalescer.hh
src/mem/ruby/system/RubyPort.cc
src/mem/ruby/system/VIPERCoalescer.cc

index d15c4328bd197207afc676ff69b3abacf8b0748e..c39dec8430e05de795ccce7cb8044976789df7f3 100644 (file)
@@ -862,33 +862,6 @@ ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt)
 
         delete pkt->senderState;
         delete pkt;
-        return true;
-    } else if (pkt->cmd == MemCmd::WriteCompleteResp) {
-        // this is for writeComplete callback
-        // we simply get decrement write-related wait counters
-        assert(gpuDynInst);
-        M5_VAR_USED Wavefront *w =
-            computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId];
-        assert(w);
-        DPRINTF(GPUExec, "WriteCompleteResp: WF[%d][%d] WV%d %s decrementing "
-                        "outstanding reqs %d => %d\n", gpuDynInst->simdId,
-                        gpuDynInst->wfSlotId, gpuDynInst->wfDynId,
-                        gpuDynInst->disassemble(), w->outstandingReqs,
-                        w->outstandingReqs - 1);
-        if (gpuDynInst->allLanesZero()) {
-            // ask gm pipe to decrement request counters, instead of directly
-            // performing here, to avoid asynchronous counter update and
-            // instruction retirement (which may hurt waincnt effects)
-            computeUnit->globalMemoryPipe.handleResponse(gpuDynInst);
-
-            DPRINTF(GPUMem, "CU%d: WF[%d][%d]: write totally complete\n",
-                            computeUnit->cu_id, gpuDynInst->simdId,
-                            gpuDynInst->wfSlotId);
-        }
-
-        delete pkt->senderState;
-        delete pkt;
-
         return true;
     }
 
@@ -1319,10 +1292,16 @@ ComputeUnit::DataPort::processMemRespEvent(PacketPtr pkt)
 
     Addr paddr = pkt->req->getPaddr();
 
-    // mem sync resp and write-complete callback must be handled already in
+    // mem sync resp callback must be handled already in
     // DataPort::recvTimingResp
     assert(pkt->cmd != MemCmd::MemSyncResp);
-    assert(pkt->cmd != MemCmd::WriteCompleteResp);
+
+    // The status vector and global memory response for WriteResp packets get
+    // handled by the WriteCompleteResp packets.
+    if (pkt->cmd == MemCmd::WriteResp) {
+        delete pkt;
+        return;
+    }
 
     // this is for read, write and atomic
     int index = gpuDynInst->memStatusVector[paddr].back();
@@ -1356,17 +1335,13 @@ ComputeUnit::DataPort::processMemRespEvent(PacketPtr pkt)
 
         gpuDynInst->memStatusVector.clear();
 
-        // note: only handle read response here; for write, the response
-        // is separately handled when writeComplete callback is received
-        if (pkt->isRead()) {
-            gpuDynInst->
-                profileRoundTripTime(curTick(), InstMemoryHop::GMEnqueue);
-            compute_unit->globalMemoryPipe.handleResponse(gpuDynInst);
+        gpuDynInst->
+            profileRoundTripTime(curTick(), InstMemoryHop::GMEnqueue);
+        compute_unit->globalMemoryPipe.handleResponse(gpuDynInst);
 
-            DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n",
-                    compute_unit->cu_id, gpuDynInst->simdId,
-                    gpuDynInst->wfSlotId);
-        }
+        DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n",
+                compute_unit->cu_id, gpuDynInst->simdId,
+                gpuDynInst->wfSlotId);
     } else {
         if (pkt->isRead()) {
             if (!compute_unit->headTailMap.count(gpuDynInst)) {
index d9df1d89313a734ce50ed4d4194a0a9a9b06f0c0..3f73568396d2ca250fa69f3ed7b52933aa06dcdb 100644 (file)
@@ -682,10 +682,11 @@ GPUCoalescer::coalescePacket(PacketPtr pkt)
             // create a new coalecsed request and issue it immediately.
             auto reqList = std::deque<CoalescedRequest*> { creq };
             coalescedTable.insert(std::make_pair(line_addr, reqList));
-
-            DPRINTF(GPUCoalescer, "Issued req type %s seqNum %d\n",
-                    RubyRequestType_to_string(creq->getRubyType()), seqNum);
-            issueRequest(creq);
+            if (!coalescedReqs.count(seqNum)) {
+                coalescedReqs.insert(std::make_pair(seqNum, reqList));
+            } else {
+                coalescedReqs.at(seqNum).push_back(creq);
+            }
         } else {
             // The request is for a line address that is already outstanding
             // but for a different instruction. Add it as a new request to be
@@ -773,6 +774,17 @@ GPUCoalescer::completeIssue()
                 [&](PacketPtr pkt) { return coalescePacket(pkt); }
             );
 
+            if (coalescedReqs.count(seq_num)) {
+                auto& creqs = coalescedReqs.at(seq_num);
+                for (auto creq : creqs) {
+                    DPRINTF(GPUCoalescer, "Issued req type %s seqNum %d\n",
+                            RubyRequestType_to_string(creq->getRubyType()),
+                                                      seq_num);
+                    issueRequest(creq);
+                }
+                coalescedReqs.erase(seq_num);
+            }
+
             assert(pkt_list_size >= pkt_list->size());
             size_t pkt_list_diff = pkt_list_size - pkt_list->size();
 
index 086cc6da3ba34ccaf890d689b4c3a73b2801f6f4..709b491a831943d4f92427e107914ef1b1b38429 100644 (file)
@@ -430,6 +430,10 @@ class GPUCoalescer : public RubyPort
     // (typically the number of blocks in TCP). If there are duplicates of
     // an address, the are serviced in age order.
     std::map<Addr, std::deque<CoalescedRequest*>> coalescedTable;
+    // Map of instruction sequence number to coalesced requests that get
+    // created in coalescePacket, used in completeIssue to send the fully
+    // coalesced request
+    std::unordered_map<uint64_t, std::deque<CoalescedRequest*>> coalescedReqs;
 
     // a map btw an instruction sequence number and PendingWriteInst
     // this is used to do a final call back for each write when it is
index 9a6434ab50a367f09748800edcda281c9d57615b..b47aaefca46a48c88eb36ba68de6896cc1e3ed82 100644 (file)
@@ -546,7 +546,8 @@ RubyPort::MemResponsePort::hitCallback(PacketPtr pkt)
     }
 
     // Flush, acquire, release requests don't access physical memory
-    if (pkt->isFlush() || pkt->cmd == MemCmd::MemSyncReq) {
+    if (pkt->isFlush() || pkt->cmd == MemCmd::MemSyncReq
+        || pkt->cmd == MemCmd::WriteCompleteResp) {
         accessPhysMem = false;
     }
 
index 6589a7d764e2e1d410a5a6e83bb8a6880cbb3ce0..f0873a42a43884f60f99d52d3ea5bed13f6a7367 100644 (file)
@@ -238,19 +238,28 @@ VIPERCoalescer::writeCompleteCallback(Addr addr, uint64_t instSeqNum)
     assert(m_writeCompletePktMap.count(key) == 1 &&
            !m_writeCompletePktMap[key].empty());
 
-    for (auto writeCompletePkt : m_writeCompletePktMap[key]) {
-        if (makeLineAddress(writeCompletePkt->getAddr()) == addr) {
-            RubyPort::SenderState *ss =
-                safe_cast<RubyPort::SenderState *>
-                    (writeCompletePkt->senderState);
-            MemResponsePort *port = ss->port;
-            assert(port != NULL);
-
-            writeCompletePkt->senderState = ss->predecessor;
-            delete ss;
-            port->hitCallback(writeCompletePkt);
-        }
-    }
+    m_writeCompletePktMap[key].erase(
+        std::remove_if(
+            m_writeCompletePktMap[key].begin(),
+            m_writeCompletePktMap[key].end(),
+            [addr](PacketPtr writeCompletePkt) -> bool {
+                if (makeLineAddress(writeCompletePkt->getAddr()) == addr) {
+                    RubyPort::SenderState *ss =
+                        safe_cast<RubyPort::SenderState *>
+                            (writeCompletePkt->senderState);
+                    MemResponsePort *port = ss->port;
+                    assert(port != NULL);
+
+                    writeCompletePkt->senderState = ss->predecessor;
+                    delete ss;
+                    port->hitCallback(writeCompletePkt);
+                    return true;
+                }
+                return false;
+            }
+        ),
+        m_writeCompletePktMap[key].end()
+    );
 
     trySendRetries();