delete pkt->senderState;
delete pkt;
- return true;
- } else if (pkt->cmd == MemCmd::WriteCompleteResp) {
- // this is for writeComplete callback
- // we simply get decrement write-related wait counters
- assert(gpuDynInst);
- M5_VAR_USED Wavefront *w =
- computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId];
- assert(w);
- DPRINTF(GPUExec, "WriteCompleteResp: WF[%d][%d] WV%d %s decrementing "
- "outstanding reqs %d => %d\n", gpuDynInst->simdId,
- gpuDynInst->wfSlotId, gpuDynInst->wfDynId,
- gpuDynInst->disassemble(), w->outstandingReqs,
- w->outstandingReqs - 1);
- if (gpuDynInst->allLanesZero()) {
- // ask gm pipe to decrement request counters, instead of directly
- // performing here, to avoid asynchronous counter update and
- // instruction retirement (which may hurt waincnt effects)
- computeUnit->globalMemoryPipe.handleResponse(gpuDynInst);
-
- DPRINTF(GPUMem, "CU%d: WF[%d][%d]: write totally complete\n",
- computeUnit->cu_id, gpuDynInst->simdId,
- gpuDynInst->wfSlotId);
- }
-
- delete pkt->senderState;
- delete pkt;
-
return true;
}
Addr paddr = pkt->req->getPaddr();
- // mem sync resp and write-complete callback must be handled already in
+ // mem sync resp callback must be handled already in
// DataPort::recvTimingResp
assert(pkt->cmd != MemCmd::MemSyncResp);
- assert(pkt->cmd != MemCmd::WriteCompleteResp);
+
+ // The status vector and global memory response for WriteResp packets
+ // are handled when the corresponding WriteCompleteResp packets arrive.
+ if (pkt->cmd == MemCmd::WriteResp) {
+ delete pkt;
+ return;
+ }
- // this is for read, write and atomic
+ // this is for read, atomic, and write-complete responses
int index = gpuDynInst->memStatusVector[paddr].back();
gpuDynInst->memStatusVector.clear();
- // note: only handle read response here; for write, the response
- // is separately handled when writeComplete callback is received
- if (pkt->isRead()) {
- gpuDynInst->
- profileRoundTripTime(curTick(), InstMemoryHop::GMEnqueue);
- compute_unit->globalMemoryPipe.handleResponse(gpuDynInst);
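+ // reads, atomics, and write-completes all hand their response to the
+ // global memory pipeline, which decrements the wavefront's outstanding
+ // request counters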
+ gpuDynInst->
+ profileRoundTripTime(curTick(), InstMemoryHop::GMEnqueue);
+ compute_unit->globalMemoryPipe.handleResponse(gpuDynInst);
- DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n",
- compute_unit->cu_id, gpuDynInst->simdId,
- gpuDynInst->wfSlotId);
- }
+ DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n",
+ compute_unit->cu_id, gpuDynInst->simdId,
+ gpuDynInst->wfSlotId);
} else {
if (pkt->isRead()) {
if (!compute_unit->headTailMap.count(gpuDynInst)) {
- // create a new coalecsed request and issue it immediately.
+ // create a new coalesced request and save it to be issued in
+ // completeIssue()
auto reqList = std::deque<CoalescedRequest*> { creq };
coalescedTable.insert(std::make_pair(line_addr, reqList));
-
- DPRINTF(GPUCoalescer, "Issued req type %s seqNum %d\n",
- RubyRequestType_to_string(creq->getRubyType()), seqNum);
- issueRequest(creq);
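+ // instead of issuing immediately, save the request under this
+ // instruction's seqNum; completeIssue() issues it once every packet
+ // from the instruction has been coalesced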
+ if (!coalescedReqs.count(seqNum)) {
+ coalescedReqs.insert(std::make_pair(seqNum, reqList));
+ } else {
+ coalescedReqs.at(seqNum).push_back(creq);
+ }
} else {
// The request is for a line address that is already outstanding
// but for a different instruction. Add it as a new request to be
[&](PacketPtr pkt) { return coalescePacket(pkt); }
);
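+ // issue any requests that coalescePacket created for this instruction,
+ // now that all of its packets have been processed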
+ if (coalescedReqs.count(seq_num)) {
+ auto& creqs = coalescedReqs.at(seq_num);
+ for (auto creq : creqs) {
+ DPRINTF(GPUCoalescer, "Issued req type %s seqNum %d\n",
+ RubyRequestType_to_string(creq->getRubyType()),
+ seq_num);
+ issueRequest(creq);
+ }
+ coalescedReqs.erase(seq_num);
+ }
+
assert(pkt_list_size >= pkt_list->size());
size_t pkt_list_diff = pkt_list_size - pkt_list->size();
// (typically the number of blocks in TCP). If there are duplicates of
- // an address, the are serviced in age order.
+ // an address, they are serviced in age order.
std::map<Addr, std::deque<CoalescedRequest*>> coalescedTable;
+ // Map of instruction sequence numbers to the coalesced requests created
+ // for them in coalescePacket; completeIssue uses this map to issue the
+ // fully coalesced requests for an instruction
+ std::unordered_map<uint64_t, std::deque<CoalescedRequest*>> coalescedReqs;
- // a map btw an instruction sequence number and PendingWriteInst
- // this is used to do a final call back for each write when it is
+ // a map between an instruction sequence number and PendingWriteInst;
+ // this is used to do a final callback for each write when it is
}
- // Flush, acquire, release requests don't access physical memory
+ // Flush, acquire, release, and write-complete requests don't access
+ // physical memory
- if (pkt->isFlush() || pkt->cmd == MemCmd::MemSyncReq) {
+ if (pkt->isFlush() || pkt->cmd == MemCmd::MemSyncReq
+ || pkt->cmd == MemCmd::WriteCompleteResp) {
accessPhysMem = false;
}
assert(m_writeCompletePktMap.count(key) == 1 &&
!m_writeCompletePktMap[key].empty());
- for (auto writeCompletePkt : m_writeCompletePktMap[key]) {
- if (makeLineAddress(writeCompletePkt->getAddr()) == addr) {
- RubyPort::SenderState *ss =
- safe_cast<RubyPort::SenderState *>
- (writeCompletePkt->senderState);
- MemResponsePort *port = ss->port;
- assert(port != NULL);
-
- writeCompletePkt->senderState = ss->predecessor;
- delete ss;
- port->hitCallback(writeCompletePkt);
- }
- }
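+ // service and remove every write-complete packet that maps to this
+ // line address; packets for other lines under the same key remain
+ // pending (erase-remove idiom)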
+ m_writeCompletePktMap[key].erase(
+ std::remove_if(
+ m_writeCompletePktMap[key].begin(),
+ m_writeCompletePktMap[key].end(),
+ [addr](PacketPtr writeCompletePkt) -> bool {
+ if (makeLineAddress(writeCompletePkt->getAddr()) == addr) {
+ RubyPort::SenderState *ss =
+ safe_cast<RubyPort::SenderState *>
+ (writeCompletePkt->senderState);
+ MemResponsePort *port = ss->port;
+ assert(port != NULL);
+
+ writeCompletePkt->senderState = ss->predecessor;
+ delete ss;
+ port->hitCallback(writeCompletePkt);
+ return true;
+ }
+ return false;
+ }
+ ),
+ m_writeCompletePktMap[key].end()
+ );
trySendRetries();