From b20cc7e6d890aba7feadf27782a773262e6486e6 Mon Sep 17 00:00:00 2001
From: Kyle Roarty
Date: Fri, 28 Aug 2020 17:42:10 -0500
Subject: [PATCH] gpu-compute,mem-ruby: Properly create/handle WriteCompletePkts

There is a flow of packets as follows:

    WriteReq -> WriteResp -> WriteCompleteResp

These packets share some variables, in particular senderState and a
status vector.

One issue was that the WriteResp packet decremented the status vector,
which the WriteCompleteResp packets use to determine when to handle the
global memory response. This could lead to multiple WriteCompleteResp
packets attempting to handle the global memory response. Because the
WriteCompleteResp packets need to own the status vector handling, this
patch moves WriteCompleteResp packet handling back into
ComputeUnit::DataPort::processMemRespEvent from
ComputeUnit::DataPort::recvTimingResp, which also removes some
redundant code. The WriteResp packet now returns without doing any
status vector handling and without deleting the senderState, which had
previously caused a segfault.

Another issue was that WriteCompleteResp packets weren't being issued
for each active lane, because the coalesced request was being issued
too early. To fix that, every active lane must put its request into the
applicable coalesced request before the coalesced request is issued.
Therefore, this patch moves the issuing of CoalescedRequests from
GPUCoalescer::coalescePacket to GPUCoalescer::completeIssue. That
change requires a new variable to store the CoalescedRequests created
in the calls to coalescePacket: a map from instruction sequence number
to coalesced requests.

Additionally, the WriteCompleteResp packet was attempting to access
physical memory in hitCallback while not having any data, which caused
a crash. This can be resolved either by not allowing WriteCompleteResp
packets to access memory or by copying the data from the WriteReq
packet. This patch denies WriteCompleteResp packets memory access in
hitCallback.

Finally, VIPERCoalescer::writeCompleteCallback held the WriteComplete
packets in a map, but packets were never removed from it. This patch
removes the packets whose line address matches the address passed in to
the function.
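For illustration only (not part of the patch itself), the deferred-issue
bookkeeping can be sketched as a small standalone C++ model. The names
below (FakeRequest, the free functions coalesce and completeIssue) are
hypothetical and only mirror the shape of the gem5 change:

    // Standalone sketch: per-lane requests are buffered by instruction
    // sequence number while packets are coalesced, and only issued once
    // the whole instruction has been processed.
    #include <cstdint>
    #include <deque>
    #include <iostream>
    #include <unordered_map>

    struct FakeRequest { uint64_t seqNum; uint64_t lineAddr; };

    // Map of instruction sequence number to the requests created while
    // coalescing that instruction's per-lane packets.
    std::unordered_map<uint64_t, std::deque<FakeRequest>> coalescedReqs;

    // Called per lane packet: record the request instead of issuing it.
    void coalesce(uint64_t seqNum, uint64_t lineAddr)
    {
        coalescedReqs[seqNum].push_back({seqNum, lineAddr});
    }

    // Called once per instruction, after every active lane has been seen:
    // issue everything that was recorded and drop the bookkeeping entry.
    void completeIssue(uint64_t seqNum)
    {
        auto it = coalescedReqs.find(seqNum);
        if (it == coalescedReqs.end())
            return;
        for (const auto &req : it->second)
            std::cout << "issue seqNum " << req.seqNum << " line 0x"
                      << std::hex << req.lineAddr << std::dec << "\n";
        coalescedReqs.erase(it);
    }

    int main()
    {
        coalesce(42, 0x1000);  // lane 0
        coalesce(42, 0x1040);  // lane 1 touches a different cache line
        completeIssue(42);     // both requests are issued together
        return 0;
    }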
Change-Id: I9a064a0def2bf6c513f5295596c56b1b652b0ca4
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/33656
Reviewed-by: Matt Sinclair
Reviewed-by: Anthony Gutierrez
Maintainer: Matt Sinclair
Maintainer: Anthony Gutierrez
Tested-by: kokoro
---
 src/gpu-compute/compute_unit.cc       | 53 +++++++--------------------
 src/mem/ruby/system/GPUCoalescer.cc   | 20 ++++++++--
 src/mem/ruby/system/GPUCoalescer.hh   |  4 ++
 src/mem/ruby/system/RubyPort.cc       |  3 +-
 src/mem/ruby/system/VIPERCoalescer.cc | 35 +++++++++++-------
 5 files changed, 58 insertions(+), 57 deletions(-)

diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc
index d15c4328b..c39dec843 100644
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -862,33 +862,6 @@ ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt)
         delete pkt->senderState;
         delete pkt;
 
-        return true;
-    } else if (pkt->cmd == MemCmd::WriteCompleteResp) {
-        // this is for writeComplete callback
-        // we simply get decrement write-related wait counters
-        assert(gpuDynInst);
-        M5_VAR_USED Wavefront *w =
-            computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId];
-        assert(w);
-        DPRINTF(GPUExec, "WriteCompleteResp: WF[%d][%d] WV%d %s decrementing "
-                "outstanding reqs %d => %d\n", gpuDynInst->simdId,
-                gpuDynInst->wfSlotId, gpuDynInst->wfDynId,
-                gpuDynInst->disassemble(), w->outstandingReqs,
-                w->outstandingReqs - 1);
-        if (gpuDynInst->allLanesZero()) {
-            // ask gm pipe to decrement request counters, instead of directly
-            // performing here, to avoid asynchronous counter update and
-            // instruction retirement (which may hurt waincnt effects)
-            computeUnit->globalMemoryPipe.handleResponse(gpuDynInst);
-
-            DPRINTF(GPUMem, "CU%d: WF[%d][%d]: write totally complete\n",
-                    computeUnit->cu_id, gpuDynInst->simdId,
-                    gpuDynInst->wfSlotId);
-        }
-
-        delete pkt->senderState;
-        delete pkt;
-
         return true;
     }
 
@@ -1319,10 +1292,16 @@ ComputeUnit::DataPort::processMemRespEvent(PacketPtr pkt)
 
     Addr paddr = pkt->req->getPaddr();
 
-    // mem sync resp and write-complete callback must be handled already in
+    // mem sync resp callback must be handled already in
     // DataPort::recvTimingResp
     assert(pkt->cmd != MemCmd::MemSyncResp);
-    assert(pkt->cmd != MemCmd::WriteCompleteResp);
+
+    // The status vector and global memory response for WriteResp packets get
+    // handled by the WriteCompleteResp packets.
+    if (pkt->cmd == MemCmd::WriteResp) {
+        delete pkt;
+        return;
+    }
 
     // this is for read, write and atomic
     int index = gpuDynInst->memStatusVector[paddr].back();
@@ -1356,17 +1335,13 @@ ComputeUnit::DataPort::processMemRespEvent(PacketPtr pkt)
         gpuDynInst->memStatusVector.clear();
 
-        // note: only handle read response here; for write, the response
-        // is separately handled when writeComplete callback is received
-        if (pkt->isRead()) {
-            gpuDynInst->
-                profileRoundTripTime(curTick(), InstMemoryHop::GMEnqueue);
-            compute_unit->globalMemoryPipe.handleResponse(gpuDynInst);
+        gpuDynInst->
+            profileRoundTripTime(curTick(), InstMemoryHop::GMEnqueue);
+        compute_unit->globalMemoryPipe.handleResponse(gpuDynInst);
 
-            DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n",
-                    compute_unit->cu_id, gpuDynInst->simdId,
-                    gpuDynInst->wfSlotId);
-        }
+        DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n",
+                compute_unit->cu_id, gpuDynInst->simdId,
+                gpuDynInst->wfSlotId);
     } else {
         if (pkt->isRead()) {
             if (!compute_unit->headTailMap.count(gpuDynInst)) {
diff --git a/src/mem/ruby/system/GPUCoalescer.cc b/src/mem/ruby/system/GPUCoalescer.cc
index d9df1d893..3f7356839 100644
--- a/src/mem/ruby/system/GPUCoalescer.cc
+++ b/src/mem/ruby/system/GPUCoalescer.cc
@@ -682,10 +682,11 @@ GPUCoalescer::coalescePacket(PacketPtr pkt)
         // create a new coalecsed request and issue it immediately.
         auto reqList = std::deque<CoalescedRequest*> { creq };
         coalescedTable.insert(std::make_pair(line_addr, reqList));
-
-        DPRINTF(GPUCoalescer, "Issued req type %s seqNum %d\n",
-                RubyRequestType_to_string(creq->getRubyType()), seqNum);
-        issueRequest(creq);
+        if (!coalescedReqs.count(seqNum)) {
+            coalescedReqs.insert(std::make_pair(seqNum, reqList));
+        } else {
+            coalescedReqs.at(seqNum).push_back(creq);
+        }
     } else {
         // The request is for a line address that is already outstanding
         // but for a different instruction. Add it as a new request to be
@@ -773,6 +774,17 @@ GPUCoalescer::completeIssue()
                       [&](PacketPtr pkt) { return coalescePacket(pkt); }
         );
 
+        if (coalescedReqs.count(seq_num)) {
+            auto& creqs = coalescedReqs.at(seq_num);
+            for (auto creq : creqs) {
+                DPRINTF(GPUCoalescer, "Issued req type %s seqNum %d\n",
+                        RubyRequestType_to_string(creq->getRubyType()),
+                        seq_num);
+                issueRequest(creq);
+            }
+            coalescedReqs.erase(seq_num);
+        }
+
         assert(pkt_list_size >= pkt_list->size());
         size_t pkt_list_diff = pkt_list_size - pkt_list->size();
 
diff --git a/src/mem/ruby/system/GPUCoalescer.hh b/src/mem/ruby/system/GPUCoalescer.hh
index 086cc6da3..709b491a8 100644
--- a/src/mem/ruby/system/GPUCoalescer.hh
+++ b/src/mem/ruby/system/GPUCoalescer.hh
@@ -430,6 +430,10 @@ class GPUCoalescer : public RubyPort
     // (typically the number of blocks in TCP). If there are duplicates of
     // an address, the are serviced in age order.
     std::map<Addr, std::deque<CoalescedRequest*>> coalescedTable;
+    // Map of instruction sequence number to coalesced requests that get
+    // created in coalescePacket, used in completeIssue to send the fully
+    // coalesced request
+    std::unordered_map<uint64_t, std::deque<CoalescedRequest*>> coalescedReqs;
 
     // a map btw an instruction sequence number and PendingWriteInst
     // this is used to do a final call back for each write when it is
diff --git a/src/mem/ruby/system/RubyPort.cc b/src/mem/ruby/system/RubyPort.cc
index 9a6434ab5..b47aaefca 100644
--- a/src/mem/ruby/system/RubyPort.cc
+++ b/src/mem/ruby/system/RubyPort.cc
@@ -546,7 +546,8 @@ RubyPort::MemResponsePort::hitCallback(PacketPtr pkt)
     }
 
     // Flush, acquire, release requests don't access physical memory
-    if (pkt->isFlush() || pkt->cmd == MemCmd::MemSyncReq) {
+    if (pkt->isFlush() || pkt->cmd == MemCmd::MemSyncReq
+        || pkt->cmd == MemCmd::WriteCompleteResp) {
         accessPhysMem = false;
     }
 
diff --git a/src/mem/ruby/system/VIPERCoalescer.cc b/src/mem/ruby/system/VIPERCoalescer.cc
index 6589a7d76..f0873a42a 100644
--- a/src/mem/ruby/system/VIPERCoalescer.cc
+++ b/src/mem/ruby/system/VIPERCoalescer.cc
@@ -238,19 +238,28 @@ VIPERCoalescer::writeCompleteCallback(Addr addr, uint64_t instSeqNum)
     assert(m_writeCompletePktMap.count(key) == 1 &&
            !m_writeCompletePktMap[key].empty());
 
-    for (auto writeCompletePkt : m_writeCompletePktMap[key]) {
-        if (makeLineAddress(writeCompletePkt->getAddr()) == addr) {
-            RubyPort::SenderState *ss =
-                safe_cast<RubyPort::SenderState *>
-                    (writeCompletePkt->senderState);
-            MemResponsePort *port = ss->port;
-            assert(port != NULL);
-
-            writeCompletePkt->senderState = ss->predecessor;
-            delete ss;
-            port->hitCallback(writeCompletePkt);
-        }
-    }
+    m_writeCompletePktMap[key].erase(
+        std::remove_if(
+            m_writeCompletePktMap[key].begin(),
+            m_writeCompletePktMap[key].end(),
+            [addr](PacketPtr writeCompletePkt) -> bool {
+                if (makeLineAddress(writeCompletePkt->getAddr()) == addr) {
+                    RubyPort::SenderState *ss =
+                        safe_cast<RubyPort::SenderState *>
+                            (writeCompletePkt->senderState);
+                    MemResponsePort *port = ss->port;
+                    assert(port != NULL);
+
+                    writeCompletePkt->senderState = ss->predecessor;
+                    delete ss;
+                    port->hitCallback(writeCompletePkt);
+                    return true;
+                }
+                return false;
+            }
+        ),
+        m_writeCompletePktMap[key].end()
+    );
 
     trySendRetries();

-- 
2.30.2