From: Matthew Poremba Date: Fri, 25 Sep 2020 20:50:31 +0000 (-0500) Subject: mem-ruby: Fixing token port responses in GPUCoalescer X-Git-Tag: develop-gem5-snapshot~694 X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=dcf242d838010402936f3d6580eabc9a6e297c7d;p=gem5.git mem-ruby: Fixing token port responses in GPUCoalescer The is a bug in the GPUCoalescer which occurs in the following situation: 1) An instruction crosses a page boundary causing multiple TLB requests to be sent. 2) The TLB responses arrive at different times, causing the vector memory requests to be sent at different times. 3) The first vector memory request completes before the second vector memory request arrives at the coalescer. This caused the coalescer to consider the instruction sequence number done and return its token. Then the second request would arrive and complete sending back another token. Eventually this increases the token count beyond the maximum tripping an assert. This change keeps track of the number of per-lane requests which are expected to be sent in the vector memory request by looking at the exec mask of the instruction. The token is not returned until the expected number of per-lane requests have been coalesced. This fixes "#7" in the list of issues in JIRA-300. There are also style fixes for local variables in code nearby the changes in this CL. Change-Id: I152fd9397920ad82ba6079112908387e71ff3cce JIRA: https://gem5.atlassian.net/browse/GEM5-300 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/35176 Reviewed-by: Matt Sinclair Reviewed-by: Kyle Roarty Maintainer: Matt Sinclair Tested-by: kokoro --- diff --git a/src/mem/ruby/system/GPUCoalescer.cc b/src/mem/ruby/system/GPUCoalescer.cc index e70293269..310ba72fc 100644 --- a/src/mem/ruby/system/GPUCoalescer.cc +++ b/src/mem/ruby/system/GPUCoalescer.cc @@ -77,6 +77,26 @@ UncoalescedTable::packetAvailable() return !instMap.empty(); } +void +UncoalescedTable::initPacketsRemaining(InstSeqNum seqNum, int count) +{ + if (!instPktsRemaining.count(seqNum)) { + instPktsRemaining[seqNum] = count; + } +} + +int +UncoalescedTable::getPacketsRemaining(InstSeqNum seqNum) +{ + return instPktsRemaining[seqNum]; +} + +void +UncoalescedTable::setPacketsRemaining(InstSeqNum seqNum, int count) +{ + instPktsRemaining[seqNum] = count; +} + PerInstPackets* UncoalescedTable::getInstPackets(int offset) { @@ -94,9 +114,20 @@ void UncoalescedTable::updateResources() { for (auto iter = instMap.begin(); iter != instMap.end(); ) { - if (iter->second.empty()) { - DPRINTF(GPUCoalescer, "Returning token seqNum %d\n", iter->first); + InstSeqNum seq_num = iter->first; + DPRINTF(GPUCoalescer, "%s checking remaining pkts for %d\n", + coalescer->name().c_str(), seq_num); + assert(instPktsRemaining.count(seq_num)); + + if (instPktsRemaining[seq_num] == 0) { + assert(iter->second.empty()); + + // Remove from both maps instMap.erase(iter++); + instPktsRemaining.erase(seq_num); + + // Release the token + DPRINTF(GPUCoalescer, "Returning token seqNum %d\n", seq_num); coalescer->getGMTokenPort().sendTokens(1); } else { ++iter; @@ -555,16 +586,23 @@ GPUCoalescer::makeRequest(PacketPtr pkt) // otherwise, this must be either read or write command assert(pkt->isRead() || pkt->isWrite()); + InstSeqNum seq_num = pkt->req->getReqInstSeqNum(); + int num_packets = getDynInst(pkt)->exec_mask.count(); + // the pkt is temporarily stored in the uncoalesced table until // it's picked for coalescing process later in this cycle or in a - // future cycle + // future cycle. Packets remaining is set to the number of excepted + // requests from the instruction based on its exec_mask. uncoalescedTable.insertPacket(pkt); + uncoalescedTable.initPacketsRemaining(seq_num, num_packets); DPRINTF(GPUCoalescer, "Put pkt with addr 0x%X to uncoalescedTable\n", pkt->getAddr()); // we schedule an issue event here to process the uncoalesced table // and try to issue Ruby request to cache system if (!issueEvent.scheduled()) { + DPRINTF(GPUCoalescer, "Scheduled issueEvent for seqNum %d\n", + seq_num); schedule(issueEvent, curTick()); } } @@ -595,6 +633,18 @@ GPUCoalescer::print(ostream& out) const << "]"; } +GPUDynInstPtr +GPUCoalescer::getDynInst(PacketPtr pkt) const +{ + RubyPort::SenderState* ss = + safe_cast(pkt->senderState); + + ComputeUnit::DataPort::SenderState* cu_state = + safe_cast + (ss->predecessor); + + return cu_state->_gpuDynInst; +} bool GPUCoalescer::coalescePacket(PacketPtr pkt) @@ -674,10 +724,7 @@ GPUCoalescer::coalescePacket(PacketPtr pkt) // CU will use that instruction to decrement wait counters // in the issuing wavefront. // For Ruby tester, gpuDynInst == nullptr - ComputeUnit::DataPort::SenderState* cu_state = - safe_cast - (ss->predecessor); - gpuDynInst = cu_state->_gpuDynInst; + gpuDynInst = getDynInst(pkt); } PendingWriteInst& inst = pendingWriteInsts[seqNum]; @@ -698,21 +745,45 @@ GPUCoalescer::completeIssue() // Iterate over the maximum number of instructions we can coalesce // per cycle (coalescingWindow). for (int instIdx = 0; instIdx < coalescingWindow; ++instIdx) { - PerInstPackets *pktList = + PerInstPackets *pkt_list = uncoalescedTable.getInstPackets(instIdx); // getInstPackets will return nullptr if no instruction // exists at the current offset. - if (!pktList) { + if (!pkt_list) { break; + } else if (pkt_list->empty()) { + // Found something, but it has not been cleaned up by update + // resources yet. See if there is anything else to coalesce. + // Assume we can't check anymore if the coalescing window is 1. + continue; } else { + // All packets in the list have the same seqNum, use first. + InstSeqNum seq_num = pkt_list->front()->req->getReqInstSeqNum(); + + // The difference in list size before and after tells us the + // number of packets which were coalesced. + size_t pkt_list_size = pkt_list->size(); + // Since we have a pointer to the list of packets in the inst, // erase them from the list if coalescing is successful and // leave them in the list otherwise. This aggressively attempts // to coalesce as many packets as possible from the current inst. - pktList->remove_if( + pkt_list->remove_if( [&](PacketPtr pkt) { return coalescePacket(pkt); } ); + + assert(pkt_list_size >= pkt_list->size()); + size_t pkt_list_diff = pkt_list_size - pkt_list->size(); + + int num_remaining = uncoalescedTable.getPacketsRemaining(seq_num); + num_remaining -= pkt_list_diff; + assert(num_remaining >= 0); + + uncoalescedTable.setPacketsRemaining(seq_num, num_remaining); + DPRINTF(GPUCoalescer, + "Coalesced %d pkts for seqNum %d, %d remaining\n", + pkt_list_diff, seq_num, num_remaining); } } diff --git a/src/mem/ruby/system/GPUCoalescer.hh b/src/mem/ruby/system/GPUCoalescer.hh index 3b1b7af2b..2684d51bd 100644 --- a/src/mem/ruby/system/GPUCoalescer.hh +++ b/src/mem/ruby/system/GPUCoalescer.hh @@ -70,12 +70,18 @@ class UncoalescedTable bool packetAvailable(); void printRequestTable(std::stringstream& ss); + // Modify packets remaining map. Init sets value iff the seqNum has not + // yet been seen before. get/set act as a regular getter/setter. + void initPacketsRemaining(InstSeqNum seqNum, int count); + int getPacketsRemaining(InstSeqNum seqNum); + void setPacketsRemaining(InstSeqNum seqNum, int count); + // Returns a pointer to the list of packets corresponding to an // instruction in the instruction map or nullptr if there are no // instructions at the offset. PerInstPackets* getInstPackets(int offset); void updateResources(); - bool areRequestsDone(const uint64_t instSeqNum); + bool areRequestsDone(const InstSeqNum instSeqNum); // Check if a packet hasn't been removed from instMap in too long. // Panics if a deadlock is detected and returns nothing otherwise. @@ -88,7 +94,9 @@ class UncoalescedTable // which need responses. This data structure assumes the sequence number // is monotonically increasing (which is true for CU class) in order to // issue packets in age order. - std::map instMap; + std::map instMap; + + std::map instPktsRemaining; }; class CoalescedRequest @@ -389,6 +397,8 @@ class GPUCoalescer : public RubyPort virtual RubyRequestType getRequestType(PacketPtr pkt); + GPUDynInstPtr getDynInst(PacketPtr pkt) const; + // Attempt to remove a packet from the uncoalescedTable and coalesce // with a previous request from the same instruction. If there is no // previous instruction and the max number of outstanding requests has