/*
 * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "base/logging.hh"
#include "base/str.hh"
#include "config/the_isa.hh"

#if THE_ISA == X86_ISA
#include "arch/x86/insts/microldstop.hh"
#endif // X86_ISA

#include "mem/ruby/system/GPUCoalescer.hh"

#include "cpu/testers/rubytest/RubyTester.hh"
#include "debug/GPUCoalescer.hh"
#include "debug/MemoryAccess.hh"
#include "debug/ProtocolTrace.hh"
#include "debug/RubyPort.hh"
#include "debug/RubyStats.hh"
#include "gpu-compute/shader.hh"
#include "mem/packet.hh"
#include "mem/ruby/common/DataBlock.hh"
#include "mem/ruby/common/SubBlock.hh"
#include "mem/ruby/network/MessageBuffer.hh"
#include "mem/ruby/profiler/Profiler.hh"
#include "mem/ruby/slicc_interface/AbstractController.hh"
#include "mem/ruby/slicc_interface/RubyRequest.hh"
#include "mem/ruby/structures/CacheMemory.hh"
#include "mem/ruby/system/RubySystem.hh"
#include "params/RubyGPUCoalescer.hh"

using namespace std;
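
/**
 * The UncoalescedTable buffers incoming memory packets, grouped by the
 * issuing instruction's sequence number, until the coalescing pass in
 * completeIssue() picks them up. Each map entry corresponds to one
 * in-flight instruction and holds the list of its not-yet-coalesced
 * packets.
 */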

UncoalescedTable::UncoalescedTable(GPUCoalescer *gc)
    : coalescer(gc)
{
}

void
UncoalescedTable::insertPacket(PacketPtr pkt)
{
    uint64_t seqNum = pkt->req->getReqInstSeqNum();

    instMap[seqNum].push_back(pkt);
    DPRINTF(GPUCoalescer, "Adding 0x%X seqNum %d to map. (map %d vec %d)\n",
            pkt->getAddr(), seqNum, instMap.size(), instMap[seqNum].size());
}

bool
UncoalescedTable::packetAvailable()
{
    return !instMap.empty();
}
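
/**
 * Return a pointer to the packet list of the instruction at position
 * 'offset' in instMap (0 is the oldest in-flight instruction), or nullptr
 * if fewer than offset + 1 instructions are pending. Callers may erase
 * packets from the returned list in place as they are coalesced.
 */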

PerInstPackets*
UncoalescedTable::getInstPackets(int offset)
{
    if (offset >= instMap.size()) {
        return nullptr;
    }

    auto instMapIter = instMap.begin();
    std::advance(instMapIter, offset);

    return &(instMapIter->second);
}
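
/**
 * Erase instructions whose packet lists have been fully drained by the
 * coalescer, and return one GM token to the issuing compute unit for each
 * completed entry so a new memory instruction may be sent.
 */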

void
UncoalescedTable::updateResources()
{
    for (auto iter = instMap.begin(); iter != instMap.end(); ) {
        if (iter->second.empty()) {
            DPRINTF(GPUCoalescer, "Returning token seqNum %d\n", iter->first);
            instMap.erase(iter++);
            coalescer->getGMTokenPort().sendTokens(1);
        } else {
            ++iter;
        }
    }
}

bool
UncoalescedTable::areRequestsDone(const uint64_t instSeqNum) {
    // iterate the instructions held in UncoalescedTable to see whether there
    // are more requests to issue; if yes, not yet done; otherwise, done
    for (auto& inst : instMap) {
        DPRINTF(GPUCoalescer, "instSeqNum= %d, pending packets=%d\n",
                inst.first, inst.second.size());
        if (inst.first == instSeqNum) { return false; }
    }

    return true;
}

void
UncoalescedTable::printRequestTable(std::stringstream& ss)
{
    ss << "Listing pending packets from " << instMap.size()
       << " instructions";

    for (auto& inst : instMap) {
        ss << "\tAddr: " << printAddress(inst.first) << " with "
           << inst.second.size() << " pending packets" << std::endl;
    }
}
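
/**
 * Panic if any pending packet has been sitting in the table for longer
 * than the given threshold (in ticks), dumping the pending-packet table
 * so the stalled access can be identified.
 */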

void
UncoalescedTable::checkDeadlock(Tick threshold)
{
    Tick current_time = curTick();

    for (auto &it : instMap) {
        for (auto &pkt : it.second) {
            if (current_time - pkt->req->time() > threshold) {
                std::stringstream ss;
                printRequestTable(ss);

                panic("Possible Deadlock detected. Aborting!\n"
                      "version: %d request.paddr: 0x%x uncoalescedTable: %d "
                      "current time: %u issue_time: %d difference: %d\n"
                      "Request Tables:\n\n%s", coalescer->getId(),
                      pkt->getAddr(), instMap.size(), current_time,
                      pkt->req->time(), current_time - pkt->req->time(),
                      ss.str());
            }
        }
    }
}

GPUCoalescer::GPUCoalescer(const Params *p)
    : RubyPort(p),
      issueEvent([this]{ completeIssue(); }, "Issue coalesced request",
                 false, Event::Progress_Event_Pri),
      uncoalescedTable(this),
      deadlockCheckEvent([this]{ wakeup(); }, "GPUCoalescer deadlock check"),
      gmTokenPort(name() + ".gmTokenPort", this)
{
    m_store_waiting_on_load_cycles = 0;
    m_store_waiting_on_store_cycles = 0;
    m_load_waiting_on_store_cycles = 0;
    m_load_waiting_on_load_cycles = 0;

    m_outstanding_count = 0;

    coalescingWindow = p->max_coalesces_per_cycle;

    m_max_outstanding_requests = 0;
    m_instCache_ptr = nullptr;
    m_dataCache_ptr = nullptr;

    m_instCache_ptr = p->icache;
    m_dataCache_ptr = p->dcache;
    m_max_outstanding_requests = p->max_outstanding_requests;
    m_deadlock_threshold = p->deadlock_threshold;

    assert(m_max_outstanding_requests > 0);
    assert(m_deadlock_threshold > 0);
    assert(m_instCache_ptr);
    assert(m_dataCache_ptr);

    m_runningGarnetStandalone = p->garnet_standalone;
}

GPUCoalescer::~GPUCoalescer()
{
}

Port &
GPUCoalescer::getPort(const std::string &if_name, PortID idx)
{
    if (if_name == "gmTokenPort") {
        return gmTokenPort;
    }

    // delegate to RubyPort otherwise
    return RubyPort::getPort(if_name, idx);
}
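
/**
 * Deadlock check invoked by deadlockCheckEvent: scan both the coalesced
 * and uncoalesced tables for requests that have been outstanding longer
 * than m_deadlock_threshold, then re-arm the event while any requests
 * remain outstanding.
 */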

void
GPUCoalescer::wakeup()
{
    Cycles current_time = curCycle();
    for (auto& requestList : coalescedTable) {
        for (auto& req : requestList.second) {
            if (current_time - req->getIssueTime() > m_deadlock_threshold) {
                std::stringstream ss;
                printRequestTable(ss);
                warn("GPUCoalescer %d Possible deadlock detected!\n%s\n",
                     m_version, ss.str());
                panic("Aborting due to deadlock!\n");
            }
        }
    }

    Tick tick_threshold = cyclesToTicks(m_deadlock_threshold);
    uncoalescedTable.checkDeadlock(tick_threshold);

    if (m_outstanding_count > 0) {
        schedule(deadlockCheckEvent,
                 m_deadlock_threshold * clockPeriod() +
                 curTick());
    }
}

void
GPUCoalescer::printRequestTable(std::stringstream& ss)
{
    ss << "Printing out " << coalescedTable.size()
       << " outstanding requests in the coalesced table\n";

    for (auto& requestList : coalescedTable) {
        for (auto& request : requestList.second) {
            ss << "\tAddr: " << printAddress(requestList.first) << "\n"
               << "\tInstruction sequence number: "
               << request->getSeqNum() << "\n"
               << "\t\tType: "
               << RubyRequestType_to_string(request->getRubyType()) << "\n"
               << "\t\tNumber of associated packets: "
               << request->getPackets().size() << "\n"
               << "\t\tIssue time: "
               << request->getIssueTime() * clockPeriod() << "\n"
               << "\t\tDifference from current tick: "
               << (curCycle() - request->getIssueTime()) * clockPeriod();
        }
    }

    // print out packets waiting to be issued in uncoalesced table
    uncoalescedTable.printRequestTable(ss);
}

void
GPUCoalescer::resetStats()
{
    m_latencyHist.reset();
    m_missLatencyHist.reset();
    for (int i = 0; i < RubyRequestType_NUM; i++) {
        m_typeLatencyHist[i]->reset();
        m_missTypeLatencyHist[i]->reset();
        for (int j = 0; j < MachineType_NUM; j++) {
            m_missTypeMachLatencyHist[i][j]->reset();
        }
    }

    for (int i = 0; i < MachineType_NUM; i++) {
        m_missMachLatencyHist[i]->reset();

        m_IssueToInitialDelayHist[i]->reset();
        m_InitialToForwardDelayHist[i]->reset();
        m_ForwardToFirstResponseDelayHist[i]->reset();
        m_FirstResponseToCompletionDelayHist[i]->reset();
    }
}

void
GPUCoalescer::printProgress(ostream& out) const
{
}

// sets the kernelEndList
void
GPUCoalescer::insertKernel(int wavefront_id, PacketPtr pkt)
{
    // Don't know if this will happen or is possible, but I just want to be
    // careful and not have it become a simulator hang in the future
    DPRINTF(GPUCoalescer, "inserting wf: %d to kernelEndlist\n", wavefront_id);
    assert(kernelEndList.count(wavefront_id) == 0);

    kernelEndList[wavefront_id] = pkt;
    DPRINTF(GPUCoalescer, "kernelEndList->size() = %d\n",
            kernelEndList.size());
}
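
/**
 * The writeCallback overloads below form a default-argument chain: the
 * short forms fill in MachineType_NULL, zero timing Cycles, and
 * isRegion = false before forwarding to the full version, which responds
 * to the oldest coalesced request for the line and then issues the next
 * queued request, if any.
 */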

void
GPUCoalescer::writeCallback(Addr address, DataBlock& data)
{
    writeCallback(address, MachineType_NULL, data);
}

void
GPUCoalescer::writeCallback(Addr address,
                            MachineType mach,
                            DataBlock& data)
{
    writeCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
}

void
GPUCoalescer::writeCallback(Addr address,
                            MachineType mach,
                            DataBlock& data,
                            Cycles initialRequestTime,
                            Cycles forwardRequestTime,
                            Cycles firstResponseTime)
{
    writeCallback(address, mach, data,
                  initialRequestTime, forwardRequestTime, firstResponseTime,
                  false);
}

void
GPUCoalescer::writeCallback(Addr address,
                            MachineType mach,
                            DataBlock& data,
                            Cycles initialRequestTime,
                            Cycles forwardRequestTime,
                            Cycles firstResponseTime,
                            bool isRegion)
{
    assert(address == makeLineAddress(address));
    assert(coalescedTable.count(address));

    auto crequest = coalescedTable.at(address).front();

    hitCallback(crequest, mach, data, true, crequest->getIssueTime(),
                forwardRequestTime, firstResponseTime, isRegion);

    // remove this crequest from the coalescedTable
    delete crequest;
    coalescedTable.at(address).pop_front();

    if (coalescedTable.at(address).empty()) {
        coalescedTable.erase(address);
    } else {
        auto nextRequest = coalescedTable.at(address).front();
        issueRequest(nextRequest);
    }
}

void
GPUCoalescer::writeCompleteCallback(Addr address,
                                    uint64_t instSeqNum,
                                    MachineType mach)
{
    DPRINTF(GPUCoalescer, "writeCompleteCallback for address 0x%x"
            " instSeqNum = %d\n", address, instSeqNum);

    assert(pendingWriteInsts.count(instSeqNum) == 1);
    PendingWriteInst& inst = pendingWriteInsts[instSeqNum];

    // check the uncoalescedTable to see whether all requests for the inst
    // have been issued or not
    bool reqsAllIssued = uncoalescedTable.areRequestsDone(instSeqNum);
    DPRINTF(GPUCoalescer, "instSeqNum = %d, pendingStores=%d, "
            "reqsAllIssued=%d\n", instSeqNum,
            inst.getNumPendingStores()-1, reqsAllIssued);

    if (inst.receiveWriteCompleteAck() && reqsAllIssued) {
        // if the pending write instruction has received all write completion
        // callbacks for its issued Ruby requests, we can now respond to the
        // requesting CU in one response packet.
        inst.ackWriteCompletion(m_usingRubyTester);

        DPRINTF(GPUCoalescer, "write inst %d completed at coalescer\n",
                instSeqNum);
        pendingWriteInsts.erase(instSeqNum);
    }
}

void
GPUCoalescer::readCallback(Addr address, DataBlock& data)
{
    readCallback(address, MachineType_NULL, data);
}

void
GPUCoalescer::readCallback(Addr address,
                           MachineType mach,
                           DataBlock& data)
{
    readCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
}

void
GPUCoalescer::readCallback(Addr address,
                           MachineType mach,
                           DataBlock& data,
                           Cycles initialRequestTime,
                           Cycles forwardRequestTime,
                           Cycles firstResponseTime)
{
    readCallback(address, mach, data,
                 initialRequestTime, forwardRequestTime, firstResponseTime,
                 false);
}

void
GPUCoalescer::readCallback(Addr address,
                           MachineType mach,
                           DataBlock& data,
                           Cycles initialRequestTime,
                           Cycles forwardRequestTime,
                           Cycles firstResponseTime,
                           bool isRegion)
{
    assert(address == makeLineAddress(address));
    assert(coalescedTable.count(address));

    auto crequest = coalescedTable.at(address).front();
    fatal_if(crequest->getRubyType() != RubyRequestType_LD,
             "readCallback received non-read type response\n");

    // Iterate over the coalesced requests to respond to as many loads as
    // possible until another request type is seen. Models MSHR for TCP.
    while (crequest->getRubyType() == RubyRequestType_LD) {
        hitCallback(crequest, mach, data, true, crequest->getIssueTime(),
                    forwardRequestTime, firstResponseTime, isRegion);

        delete crequest;
        coalescedTable.at(address).pop_front();
        if (coalescedTable.at(address).empty()) {
            break;
        }

        crequest = coalescedTable.at(address).front();
    }

    if (coalescedTable.at(address).empty()) {
        coalescedTable.erase(address);
    } else {
        auto nextRequest = coalescedTable.at(address).front();
        issueRequest(nextRequest);
    }
}

void
GPUCoalescer::hitCallback(CoalescedRequest* crequest,
                          MachineType mach,
                          DataBlock& data,
                          bool success,
                          Cycles initialRequestTime,
                          Cycles forwardRequestTime,
                          Cycles firstResponseTime,
                          bool isRegion)
{
    PacketPtr pkt = crequest->getFirstPkt();
    Addr request_address = pkt->getAddr();
    Addr request_line_address M5_VAR_USED = makeLineAddress(request_address);

    RubyRequestType type = crequest->getRubyType();

    DPRINTF(GPUCoalescer, "Got hitCallback for 0x%X\n", request_line_address);

    recordMissLatency(crequest, mach,
                      initialRequestTime,
                      forwardRequestTime,
                      firstResponseTime,
                      success, isRegion);

    // update the data; this must be done for each request in the coalescer
    std::vector<PacketPtr> pktList = crequest->getPackets();
    DPRINTF(GPUCoalescer, "Responding to %d packets for addr 0x%X\n",
            pktList.size(), request_line_address);
    for (auto& pkt : pktList) {
        request_address = pkt->getAddr();
        if (pkt->getPtr<uint8_t>()) {
            if ((type == RubyRequestType_LD) ||
                (type == RubyRequestType_ATOMIC) ||
                (type == RubyRequestType_ATOMIC_RETURN) ||
                (type == RubyRequestType_IFETCH) ||
                (type == RubyRequestType_RMW_Read) ||
                (type == RubyRequestType_Locked_RMW_Read) ||
                (type == RubyRequestType_Load_Linked)) {
                pkt->setData(
                    data.getData(getOffset(request_address), pkt->getSize()));
            } else {
                data.setData(pkt->getPtr<uint8_t>(),
                             getOffset(request_address), pkt->getSize());
            }
        } else {
            DPRINTF(MemoryAccess,
                    "WARNING. Data not transferred from Ruby to M5 for type "
                    "%s\n",
                    RubyRequestType_to_string(type));
        }
    }

    m_outstanding_count--;
    assert(m_outstanding_count >= 0);

    completeHitCallback(pktList);
}

bool
GPUCoalescer::empty() const
{
    return coalescedTable.empty();
}

RubyRequestType
GPUCoalescer::getRequestType(PacketPtr pkt)
{
    RubyRequestType req_type = RubyRequestType_NULL;

    // These types are not supported or not used in GPU caches.
    assert(!pkt->req->isLLSC());
    assert(!pkt->req->isLockedRMW());
    assert(!pkt->req->isInstFetch());
    assert(!pkt->isFlush());

    if (pkt->req->isAtomicReturn()) {
        req_type = RubyRequestType_ATOMIC_RETURN;
    } else if (pkt->req->isAtomicNoReturn()) {
        req_type = RubyRequestType_ATOMIC_NO_RETURN;
    } else if (pkt->isRead()) {
        req_type = RubyRequestType_LD;
    } else if (pkt->isWrite()) {
        req_type = RubyRequestType_ST;
    } else {
        panic("Unsupported ruby packet type\n");
    }

    return req_type;
}

// Places an uncoalesced packet in uncoalescedTable. If the packet is a
// special type (MemFence, scoping, etc), it is issued immediately.
RequestStatus
GPUCoalescer::makeRequest(PacketPtr pkt)
{
    // all packets must have valid instruction sequence numbers
    assert(pkt->req->hasInstSeqNum());

    if (pkt->cmd == MemCmd::MemSyncReq) {
        // issue mem_sync requests immediately to the cache system without
        // going through uncoalescedTable like normal LD/ST/Atomic requests
        issueMemSyncRequest(pkt);
    } else {
        // otherwise, this must be either read or write command
        assert(pkt->isRead() || pkt->isWrite());

        // the pkt is temporarily stored in the uncoalesced table until
        // it's picked for coalescing process later in this cycle or in a
        // future cycle
        uncoalescedTable.insertPacket(pkt);
        DPRINTF(GPUCoalescer, "Put pkt with addr 0x%X to uncoalescedTable\n",
                pkt->getAddr());

        // we schedule an issue event here to process the uncoalesced table
        // and try to issue Ruby request to cache system
        if (!issueEvent.scheduled()) {
            schedule(issueEvent, curTick());
        }
    }

    // we always return RequestStatus_Issued in this coalescer
    // b/c the coalescer's resource was checked earlier and the coalescer is
    // queueing up aliased requests in its coalesced table
    return RequestStatus_Issued;
}

/**
 * TODO: Figure out what to do with this code. This code may go away
 *       and/or be merged into the VIPER coalescer once the VIPER
 *       protocol is re-integrated with GCN3 codes.
 */
void
GPUCoalescer::issueRequest(CoalescedRequest* crequest)
{
    PacketPtr pkt = crequest->getFirstPkt();

    int proc_id = -1;
    if (pkt != NULL && pkt->req->hasContextId()) {
        proc_id = pkt->req->contextId();
    }

    // If valid, copy the pc to the ruby request
    Addr pc = 0;
    if (pkt->req->hasPC()) {
        pc = pkt->req->getPC();
    }

    // At the moment setting scopes only counts
    // for GPU spill space accesses
    // which is pkt->req->isStack()
    // this scope is REPLACE since it
    // does not need to be flushed at the end
    // of a kernel Private and local may need
    // to be visible at the end of the kernel
    HSASegment accessSegment = reqSegmentToHSASegment(pkt->req);
    HSAScope accessScope = reqScopeToHSAScope(pkt->req);

    Addr line_addr = makeLineAddress(pkt->getAddr());

    // Creating WriteMask that records written bytes
    // and atomic operations. This enables partial writes
    // and partial reads of those writes
    DataBlock dataBlock;
    dataBlock.clear();
    uint32_t blockSize = RubySystem::getBlockSizeBytes();
    std::vector<bool> accessMask(blockSize, false);
    std::vector< std::pair<int, AtomicOpFunctor*> > atomicOps;
    uint32_t tableSize = crequest->getPackets().size();
    for (int i = 0; i < tableSize; i++) {
        PacketPtr tmpPkt = crequest->getPackets()[i];
        uint32_t tmpOffset = (tmpPkt->getAddr()) - line_addr;
        uint32_t tmpSize = tmpPkt->getSize();
        if (tmpPkt->isAtomicOp()) {
            std::pair<int, AtomicOpFunctor *> tmpAtomicOp(tmpOffset,
                                                        tmpPkt->getAtomicOp());
            atomicOps.push_back(tmpAtomicOp);
        } else if (tmpPkt->isWrite()) {
            dataBlock.setData(tmpPkt->getPtr<uint8_t>(),
                              tmpOffset, tmpSize);
        }
        for (int j = 0; j < tmpSize; j++) {
            accessMask[tmpOffset + j] = true;
        }
    }
    std::shared_ptr<RubyRequest> msg;
    if (pkt->isAtomicOp()) {
        msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
                              pkt->getPtr<uint8_t>(),
                              pkt->getSize(), pc, crequest->getRubyType(),
                              RubyAccessMode_Supervisor, pkt,
                              PrefetchBit_No, proc_id, 100,
                              blockSize, accessMask,
                              dataBlock, atomicOps,
                              accessScope, accessSegment);
    } else {
        msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
                              pkt->getPtr<uint8_t>(),
                              pkt->getSize(), pc, crequest->getRubyType(),
                              RubyAccessMode_Supervisor, pkt,
                              PrefetchBit_No, proc_id, 100,
                              blockSize, accessMask,
                              dataBlock,
                              accessScope, accessSegment);
    }

    DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %s\n",
             curTick(), m_version, "Coal", "Begin", "", "",
             printAddress(msg->getPhysicalAddress()),
             RubyRequestType_to_string(crequest->getRubyType()));

    fatal_if(crequest->getRubyType() == RubyRequestType_IFETCH,
             "there should not be any I-Fetch requests in the GPU Coalescer");

    Tick latency = cyclesToTicks(
        m_controller->mandatoryQueueLatency(crequest->getRubyType()));
    assert(latency > 0);

    if (!deadlockCheckEvent.scheduled()) {
        schedule(deadlockCheckEvent,
                 m_deadlock_threshold * clockPeriod() +
                 curTick());
    }

    assert(m_mandatory_q_ptr);
    m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency);
}

template <class KEY, class VALUE>
std::ostream &
operator<<(ostream &out, const std::unordered_map<KEY, VALUE> &map)
{
    out << "[";
    for (auto i = map.begin(); i != map.end(); ++i)
        out << " " << i->first << "=" << i->second;
    out << " ]";

    return out;
}

void
GPUCoalescer::print(ostream& out) const
{
    out << "[GPUCoalescer: " << m_version
        << ", outstanding requests: " << m_outstanding_count
        << "]";
}

bool
GPUCoalescer::coalescePacket(PacketPtr pkt)
{
    uint64_t seqNum = pkt->req->getReqInstSeqNum();
    Addr line_addr = makeLineAddress(pkt->getAddr());

    // If the packet has the same line address as a request already in the
    // coalescedTable and has the same sequence number, it can be coalesced.
    if (coalescedTable.count(line_addr)) {
        // Search for a previous coalesced request with the same seqNum.
        auto& creqQueue = coalescedTable.at(line_addr);
        auto citer = std::find_if(creqQueue.begin(), creqQueue.end(),
            [&](CoalescedRequest* c) { return c->getSeqNum() == seqNum; }
        );
        if (citer != creqQueue.end()) {
            (*citer)->insertPacket(pkt);
            return true;
        }
    }

    if (m_outstanding_count < m_max_outstanding_requests) {
        // This is an "aliased" or new request. Create a RubyRequest and
        // append it to the list of "targets" in the coalescing table.
        DPRINTF(GPUCoalescer, "Creating new or aliased request for 0x%X\n",
                line_addr);

        CoalescedRequest *creq = new CoalescedRequest(seqNum);
        creq->insertPacket(pkt);
        creq->setRubyType(getRequestType(pkt));
        creq->setIssueTime(curCycle());

        if (!coalescedTable.count(line_addr)) {
            // If there is no outstanding request for this line address,
            // create a new coalesced request and issue it immediately.
            auto reqList = std::deque<CoalescedRequest*> { creq };
            coalescedTable.insert(std::make_pair(line_addr, reqList));

            DPRINTF(GPUCoalescer, "Issued req type %s seqNum %d\n",
                    RubyRequestType_to_string(creq->getRubyType()), seqNum);
            issueRequest(creq);
        } else {
            // The request is for a line address that is already outstanding
            // but for a different instruction. Add it as a new request to be
            // issued when the current outstanding request is completed.
            coalescedTable.at(line_addr).push_back(creq);
            DPRINTF(GPUCoalescer, "found address 0x%X with new seqNum %d\n",
                    line_addr, seqNum);
        }

        // In both cases, requests are added to the coalescing table and will
        // be counted as outstanding requests.
        m_outstanding_count++;

        // We track all issued or to-be-issued Ruby requests associated with
        // write instructions. An instruction may have multiple Ruby
        // requests.
        if (pkt->cmd == MemCmd::WriteReq) {
            DPRINTF(GPUCoalescer, "adding write inst %d at line 0x%x to"
                    " the pending write instruction list\n", seqNum,
                    line_addr);

            RubyPort::SenderState* ss =
                    safe_cast<RubyPort::SenderState*>(pkt->senderState);

            // we need to save this port because it will be used to call
            // back the requesting CU when we receive write
            // complete callbacks for all issued Ruby requests of this
            // instruction.
            RubyPort::MemSlavePort* mem_slave_port = ss->port;

            GPUDynInstPtr gpuDynInst = nullptr;

            if (!m_usingRubyTester) {
                // If this coalescer is connected to a real CU, we need
                // to save the corresponding gpu dynamic instruction.
                // CU will use that instruction to decrement wait counters
                // in the issuing wavefront.
                // For Ruby tester, gpuDynInst == nullptr
                ComputeUnit::DataPort::SenderState* cu_state =
                    safe_cast<ComputeUnit::DataPort::SenderState*>
                        (ss->predecessor);
                gpuDynInst = cu_state->_gpuDynInst;
            }

            PendingWriteInst& inst = pendingWriteInsts[seqNum];
            inst.addPendingReq(mem_slave_port, gpuDynInst, m_usingRubyTester);
        }

        return true;
    }

    // The maximum number of outstanding requests has been reached.
    return false;
}
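
/**
 * Scheduled by makeRequest() and completeHitCallback(); tries to coalesce
 * packets from up to coalescingWindow instructions in a single cycle,
 * then recycles GM tokens for fully drained instructions and issues any
 * kernel-end releases recorded for this cycle.
 */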

void
GPUCoalescer::completeIssue()
{
    // Iterate over the maximum number of instructions we can coalesce
    // per cycle (coalescingWindow).
    for (int instIdx = 0; instIdx < coalescingWindow; ++instIdx) {
        PerInstPackets *pktList =
            uncoalescedTable.getInstPackets(instIdx);

        // getInstPackets will return nullptr if no instruction
        // exists at the current offset.
        if (!pktList) {
            break;
        } else {
            // Since we have a pointer to the list of packets in the inst,
            // erase them from the list if coalescing is successful and
            // leave them in the list otherwise. This aggressively attempts
            // to coalesce as many packets as possible from the current inst.
            pktList->remove_if(
                [&](PacketPtr pkt) { return coalescePacket(pkt); }
            );
        }
    }

    // Clean up any instructions in the uncoalesced table that have had
    // all of their packets coalesced and return a token for that column.
    uncoalescedTable.updateResources();

    // have Kernel End releases been issued this cycle
    int len = newKernelEnds.size();
    for (int i = 0; i < len; i++) {
        kernelCallback(newKernelEnds[i]);
    }
    newKernelEnds.clear();
}

void
GPUCoalescer::evictionCallback(Addr address)
{
    ruby_eviction_callback(address);
}

void
GPUCoalescer::kernelCallback(int wavefront_id)
{
    assert(kernelEndList.count(wavefront_id));

    ruby_hit_callback(kernelEndList[wavefront_id]);

    kernelEndList.erase(wavefront_id);
}
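
/**
 * Respond to a completed atomic memory operation. Only the request at the
 * head of the line's queue is expected to be an atomic here; the next
 * queued request for the line, if any, is issued afterwards.
 */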

void
GPUCoalescer::atomicCallback(Addr address,
                             MachineType mach,
                             const DataBlock& data)
{
    assert(address == makeLineAddress(address));
    assert(coalescedTable.count(address));

    auto crequest = coalescedTable.at(address).front();

    fatal_if((crequest->getRubyType() != RubyRequestType_ATOMIC &&
              crequest->getRubyType() != RubyRequestType_ATOMIC_RETURN &&
              crequest->getRubyType() != RubyRequestType_ATOMIC_NO_RETURN),
             "atomicCallback saw non-atomic type response\n");

    hitCallback(crequest, mach, (DataBlock&)data, true,
                crequest->getIssueTime(), Cycles(0), Cycles(0), false);

    delete crequest;
    coalescedTable.at(address).pop_front();

    if (coalescedTable.at(address).empty()) {
        coalescedTable.erase(address);
    } else {
        auto nextRequest = coalescedTable.at(address).front();
        issueRequest(nextRequest);
    }
}

void
GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist)
{
    for (auto& pkt : mylist) {
        RubyPort::SenderState *ss =
            safe_cast<RubyPort::SenderState *>(pkt->senderState);
        MemSlavePort *port = ss->port;
        assert(port != NULL);

        pkt->senderState = ss->predecessor;
        delete ss;
        port->hitCallback(pkt);
    }

    // We schedule an event in the same tick as hitCallback (similar to
    // makeRequest) rather than calling completeIssue directly to reduce
    // function calls to complete issue. This can only happen if the max
    // outstanding requests is less than the number of slots in the
    // uncoalesced table and makeRequest is not called again.
    if (uncoalescedTable.packetAvailable() && !issueEvent.scheduled()) {
        schedule(issueEvent, curTick());
    }

    testDrainComplete();
}

void
GPUCoalescer::recordMissLatency(CoalescedRequest* crequest,
                                MachineType mach,
                                Cycles initialRequestTime,
                                Cycles forwardRequestTime,
                                Cycles firstResponseTime,
                                bool success, bool isRegion)
{
}
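
/**
 * Latency histograms are allocated per RubyRequestType and per
 * MachineType and initialized with 10 buckets each; they are collated by
 * the Ruby Profiler rather than displayed directly by this coalescer.
 */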

void
GPUCoalescer::regStats()
{
    RubyPort::regStats();

    // These statistical variables are not for display.
    // The profiler will collate these across different
    // coalescers and display those collated statistics.
    m_outstandReqHist.init(10);
    m_latencyHist.init(10);
    m_missLatencyHist.init(10);

    for (int i = 0; i < RubyRequestType_NUM; i++) {
        m_typeLatencyHist.push_back(new Stats::Histogram());
        m_typeLatencyHist[i]->init(10);

        m_missTypeLatencyHist.push_back(new Stats::Histogram());
        m_missTypeLatencyHist[i]->init(10);
    }

    for (int i = 0; i < MachineType_NUM; i++) {
        m_missMachLatencyHist.push_back(new Stats::Histogram());
        m_missMachLatencyHist[i]->init(10);

        m_IssueToInitialDelayHist.push_back(new Stats::Histogram());
        m_IssueToInitialDelayHist[i]->init(10);

        m_InitialToForwardDelayHist.push_back(new Stats::Histogram());
        m_InitialToForwardDelayHist[i]->init(10);

        m_ForwardToFirstResponseDelayHist.push_back(new Stats::Histogram());
        m_ForwardToFirstResponseDelayHist[i]->init(10);

        m_FirstResponseToCompletionDelayHist.push_back(new Stats::Histogram());
        m_FirstResponseToCompletionDelayHist[i]->init(10);
    }

    for (int i = 0; i < RubyRequestType_NUM; i++) {
        m_missTypeMachLatencyHist.push_back(std::vector<Stats::Histogram *>());

        for (int j = 0; j < MachineType_NUM; j++) {
            m_missTypeMachLatencyHist[i].push_back(new Stats::Histogram());
            m_missTypeMachLatencyHist[i][j]->init(10);
        }
    }
}