/*
 * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
#include "base/logging.hh"
#include "base/str.hh"
#include "config/the_isa.hh"

#if THE_ISA == X86_ISA
#include "arch/x86/insts/microldstop.hh"
#endif

#include "mem/ruby/system/GPUCoalescer.hh"

#include "cpu/testers/rubytest/RubyTester.hh"
#include "debug/GPUCoalescer.hh"
#include "debug/MemoryAccess.hh"
#include "debug/ProtocolTrace.hh"
#include "debug/RubyPort.hh"
#include "debug/RubyStats.hh"
#include "gpu-compute/shader.hh"
#include "mem/packet.hh"
#include "mem/ruby/common/DataBlock.hh"
#include "mem/ruby/common/SubBlock.hh"
#include "mem/ruby/network/MessageBuffer.hh"
#include "mem/ruby/profiler/Profiler.hh"
#include "mem/ruby/slicc_interface/AbstractController.hh"
#include "mem/ruby/slicc_interface/RubyRequest.hh"
#include "mem/ruby/structures/CacheMemory.hh"
#include "mem/ruby/system/RubySystem.hh"
#include "params/RubyGPUCoalescer.hh"
64 UncoalescedTable::UncoalescedTable(GPUCoalescer
*gc
)
70 UncoalescedTable::insertPacket(PacketPtr pkt
)
72 uint64_t seqNum
= pkt
->req
->getReqInstSeqNum();
74 instMap
[seqNum
].push_back(pkt
);
75 DPRINTF(GPUCoalescer
, "Adding 0x%X seqNum %d to map. (map %d vec %d)\n",
76 pkt
->getAddr(), seqNum
, instMap
.size(), instMap
[seqNum
].size());
80 UncoalescedTable::packetAvailable()
82 return !instMap
.empty();
86 UncoalescedTable::getInstPackets(int offset
)
88 if (offset
>= instMap
.size()) {
92 auto instMapIter
= instMap
.begin();
93 std::advance(instMapIter
, offset
);
95 return &(instMapIter
->second
);
99 UncoalescedTable::updateResources()
101 for (auto iter
= instMap
.begin(); iter
!= instMap
.end(); ) {
102 if (iter
->second
.empty()) {
103 DPRINTF(GPUCoalescer
, "Returning token seqNum %d\n", iter
->first
);
104 instMap
.erase(iter
++);
105 coalescer
->getGMTokenPort().sendTokens(1);
113 UncoalescedTable::areRequestsDone(const uint64_t instSeqNum
) {
114 // iterate the instructions held in UncoalescedTable to see whether there
115 // are more requests to issue; if yes, not yet done; otherwise, done
116 for (auto& inst
: instMap
) {
117 DPRINTF(GPUCoalescer
, "instSeqNum= %d, pending packets=%d\n"
118 ,inst
.first
, inst
.second
.size());
119 if (inst
.first
== instSeqNum
) { return false; }
126 UncoalescedTable::printRequestTable(std::stringstream
& ss
)
128 ss
<< "Listing pending packets from " << instMap
.size() << " instructions";
130 for (auto& inst
: instMap
) {
131 ss
<< "\tAddr: " << printAddress(inst
.first
) << " with "
132 << inst
.second
.size() << " pending packets" << std::endl
;
137 UncoalescedTable::checkDeadlock(Tick threshold
)
139 Tick current_time
= curTick();
141 for (auto &it
: instMap
) {
142 for (auto &pkt
: it
.second
) {
143 if (current_time
- pkt
->req
->time() > threshold
) {
144 std::stringstream ss
;
145 printRequestTable(ss
);
147 panic("Possible Deadlock detected. Aborting!\n"
148 "version: %d request.paddr: 0x%x uncoalescedTable: %d "
149 "current time: %u issue_time: %d difference: %d\n"
150 "Request Tables:\n\n%s", coalescer
->getId(),
151 pkt
->getAddr(), instMap
.size(), current_time
,
152 pkt
->req
->time(), current_time
- pkt
->req
->time(),
159 GPUCoalescer::GPUCoalescer(const Params
*p
)
161 issueEvent([this]{ completeIssue(); }, "Issue coalesced request",
162 false, Event::Progress_Event_Pri
),
163 uncoalescedTable(this),
164 deadlockCheckEvent([this]{ wakeup(); }, "GPUCoalescer deadlock check"),
165 gmTokenPort(name() + ".gmTokenPort", this)
167 m_store_waiting_on_load_cycles
= 0;
168 m_store_waiting_on_store_cycles
= 0;
169 m_load_waiting_on_store_cycles
= 0;
170 m_load_waiting_on_load_cycles
= 0;
172 m_outstanding_count
= 0;
174 coalescingWindow
= p
->max_coalesces_per_cycle
;
176 m_max_outstanding_requests
= 0;
177 m_instCache_ptr
= nullptr;
178 m_dataCache_ptr
= nullptr;
180 m_instCache_ptr
= p
->icache
;
181 m_dataCache_ptr
= p
->dcache
;
182 m_max_outstanding_requests
= p
->max_outstanding_requests
;
183 m_deadlock_threshold
= p
->deadlock_threshold
;
185 assert(m_max_outstanding_requests
> 0);
186 assert(m_deadlock_threshold
> 0);
187 assert(m_instCache_ptr
);
188 assert(m_dataCache_ptr
);
190 m_runningGarnetStandalone
= p
->garnet_standalone
;
193 GPUCoalescer::~GPUCoalescer()
198 GPUCoalescer::getPort(const std::string
&if_name
, PortID idx
)
200 if (if_name
== "gmTokenPort") {
204 // delgate to RubyPort otherwise
205 return RubyPort::getPort(if_name
, idx
);
209 GPUCoalescer::wakeup()
211 Cycles current_time
= curCycle();
212 for (auto& requestList
: coalescedTable
) {
213 for (auto& req
: requestList
.second
) {
214 if (current_time
- req
->getIssueTime() > m_deadlock_threshold
) {
215 std::stringstream ss
;
216 printRequestTable(ss
);
217 warn("GPUCoalescer %d Possible deadlock detected!\n%s\n",
218 m_version
, ss
.str());
219 panic("Aborting due to deadlock!\n");
224 Tick tick_threshold
= cyclesToTicks(m_deadlock_threshold
);
225 uncoalescedTable
.checkDeadlock(tick_threshold
);
227 if (m_outstanding_count
> 0) {
228 schedule(deadlockCheckEvent
,
229 m_deadlock_threshold
* clockPeriod() +
235 GPUCoalescer::printRequestTable(std::stringstream
& ss
)
237 ss
<< "Printing out " << coalescedTable
.size()
238 << " outstanding requests in the coalesced table\n";
240 for (auto& requestList
: coalescedTable
) {
241 for (auto& request
: requestList
.second
) {
242 ss
<< "\tAddr: " << printAddress(requestList
.first
) << "\n"
243 << "\tInstruction sequence number: "
244 << request
->getSeqNum() << "\n"
246 << RubyRequestType_to_string(request
->getRubyType()) << "\n"
247 << "\t\tNumber of associated packets: "
248 << request
->getPackets().size() << "\n"
249 << "\t\tIssue time: "
250 << request
->getIssueTime() * clockPeriod() << "\n"
251 << "\t\tDifference from current tick: "
252 << (curCycle() - request
->getIssueTime()) * clockPeriod();
256 // print out packets waiting to be issued in uncoalesced table
257 uncoalescedTable
.printRequestTable(ss
);
261 GPUCoalescer::resetStats()
263 m_latencyHist
.reset();
264 m_missLatencyHist
.reset();
265 for (int i
= 0; i
< RubyRequestType_NUM
; i
++) {
266 m_typeLatencyHist
[i
]->reset();
267 m_missTypeLatencyHist
[i
]->reset();
268 for (int j
= 0; j
< MachineType_NUM
; j
++) {
269 m_missTypeMachLatencyHist
[i
][j
]->reset();
273 for (int i
= 0; i
< MachineType_NUM
; i
++) {
274 m_missMachLatencyHist
[i
]->reset();
276 m_IssueToInitialDelayHist
[i
]->reset();
277 m_InitialToForwardDelayHist
[i
]->reset();
278 m_ForwardToFirstResponseDelayHist
[i
]->reset();
279 m_FirstResponseToCompletionDelayHist
[i
]->reset();
284 GPUCoalescer::printProgress(ostream
& out
) const
288 // sets the kernelEndList
290 GPUCoalescer::insertKernel(int wavefront_id
, PacketPtr pkt
)
292 // Don't know if this will happen or is possible
293 // but I just want to be careful and not have it become
294 // simulator hang in the future
295 DPRINTF(GPUCoalescer
, "inserting wf: %d to kernelEndlist\n", wavefront_id
);
296 assert(kernelEndList
.count(wavefront_id
) == 0);
298 kernelEndList
[wavefront_id
] = pkt
;
299 DPRINTF(GPUCoalescer
, "kernelEndList->size() = %d\n",
300 kernelEndList
.size());
304 GPUCoalescer::writeCallback(Addr address
, DataBlock
& data
)
306 writeCallback(address
, MachineType_NULL
, data
);
310 GPUCoalescer::writeCallback(Addr address
,
314 writeCallback(address
, mach
, data
, Cycles(0), Cycles(0), Cycles(0));
318 GPUCoalescer::writeCallback(Addr address
,
321 Cycles initialRequestTime
,
322 Cycles forwardRequestTime
,
323 Cycles firstResponseTime
)
325 writeCallback(address
, mach
, data
,
326 initialRequestTime
, forwardRequestTime
, firstResponseTime
,
331 GPUCoalescer::writeCallback(Addr address
,
334 Cycles initialRequestTime
,
335 Cycles forwardRequestTime
,
336 Cycles firstResponseTime
,
339 assert(address
== makeLineAddress(address
));
340 assert(coalescedTable
.count(address
));
342 auto crequest
= coalescedTable
.at(address
).front();
344 hitCallback(crequest
, mach
, data
, true, crequest
->getIssueTime(),
345 forwardRequestTime
, firstResponseTime
, isRegion
);
347 // remove this crequest in coalescedTable
349 coalescedTable
.at(address
).pop_front();
351 if (coalescedTable
.at(address
).empty()) {
352 coalescedTable
.erase(address
);
354 auto nextRequest
= coalescedTable
.at(address
).front();
355 issueRequest(nextRequest
);
360 GPUCoalescer::writeCompleteCallback(Addr address
,
364 DPRINTF(GPUCoalescer
, "writeCompleteCallback for address 0x%x"
365 " instSeqNum = %d\n", address
, instSeqNum
);
367 assert(pendingWriteInsts
.count(instSeqNum
) == 1);
368 PendingWriteInst
& inst
= pendingWriteInsts
[instSeqNum
];
370 // check the uncoalescedTable to see whether all requests for the inst
371 // have been issued or not
372 bool reqsAllIssued
= uncoalescedTable
.areRequestsDone(instSeqNum
);
373 DPRINTF(GPUCoalescer
, "instSeqNum = %d, pendingStores=%d, "
374 "reqsAllIssued=%d\n", reqsAllIssued
,
375 inst
.getNumPendingStores()-1, reqsAllIssued
);
377 if (inst
.receiveWriteCompleteAck() && reqsAllIssued
) {
378 // if the pending write instruction has received all write completion
379 // callbacks for its issued Ruby requests, we can now start respond
380 // the requesting CU in one response packet.
381 inst
.ackWriteCompletion(m_usingRubyTester
);
383 DPRINTF(GPUCoalescer
, "write inst %d completed at coalescer\n",
385 pendingWriteInsts
.erase(instSeqNum
);
390 GPUCoalescer::readCallback(Addr address
, DataBlock
& data
)
392 readCallback(address
, MachineType_NULL
, data
);
396 GPUCoalescer::readCallback(Addr address
,
400 readCallback(address
, mach
, data
, Cycles(0), Cycles(0), Cycles(0));
404 GPUCoalescer::readCallback(Addr address
,
407 Cycles initialRequestTime
,
408 Cycles forwardRequestTime
,
409 Cycles firstResponseTime
)
412 readCallback(address
, mach
, data
,
413 initialRequestTime
, forwardRequestTime
, firstResponseTime
,
418 GPUCoalescer::readCallback(Addr address
,
421 Cycles initialRequestTime
,
422 Cycles forwardRequestTime
,
423 Cycles firstResponseTime
,
426 assert(address
== makeLineAddress(address
));
427 assert(coalescedTable
.count(address
));
429 auto crequest
= coalescedTable
.at(address
).front();
430 fatal_if(crequest
->getRubyType() != RubyRequestType_LD
,
431 "readCallback received non-read type response\n");
433 // Iterate over the coalesced requests to respond to as many loads as
434 // possible until another request type is seen. Models MSHR for TCP.
435 while (crequest
->getRubyType() == RubyRequestType_LD
) {
436 hitCallback(crequest
, mach
, data
, true, crequest
->getIssueTime(),
437 forwardRequestTime
, firstResponseTime
, isRegion
);
440 coalescedTable
.at(address
).pop_front();
441 if (coalescedTable
.at(address
).empty()) {
445 crequest
= coalescedTable
.at(address
).front();
448 if (coalescedTable
.at(address
).empty()) {
449 coalescedTable
.erase(address
);
451 auto nextRequest
= coalescedTable
.at(address
).front();
452 issueRequest(nextRequest
);
457 GPUCoalescer::hitCallback(CoalescedRequest
* crequest
,
461 Cycles initialRequestTime
,
462 Cycles forwardRequestTime
,
463 Cycles firstResponseTime
,
466 PacketPtr pkt
= crequest
->getFirstPkt();
467 Addr request_address
= pkt
->getAddr();
468 Addr request_line_address M5_VAR_USED
= makeLineAddress(request_address
);
470 RubyRequestType type
= crequest
->getRubyType();
472 DPRINTF(GPUCoalescer
, "Got hitCallback for 0x%X\n", request_line_address
);
474 recordMissLatency(crequest
, mach
,
481 // MUST AD DOING THIS FOR EACH REQUEST IN COALESCER
482 std::vector
<PacketPtr
> pktList
= crequest
->getPackets();
483 DPRINTF(GPUCoalescer
, "Responding to %d packets for addr 0x%X\n",
484 pktList
.size(), request_line_address
);
485 for (auto& pkt
: pktList
) {
486 request_address
= pkt
->getAddr();
487 if (pkt
->getPtr
<uint8_t>()) {
488 if ((type
== RubyRequestType_LD
) ||
489 (type
== RubyRequestType_ATOMIC
) ||
490 (type
== RubyRequestType_ATOMIC_RETURN
) ||
491 (type
== RubyRequestType_IFETCH
) ||
492 (type
== RubyRequestType_RMW_Read
) ||
493 (type
== RubyRequestType_Locked_RMW_Read
) ||
494 (type
== RubyRequestType_Load_Linked
)) {
496 data
.getData(getOffset(request_address
), pkt
->getSize()));
498 data
.setData(pkt
->getPtr
<uint8_t>(),
499 getOffset(request_address
), pkt
->getSize());
502 DPRINTF(MemoryAccess
,
503 "WARNING. Data not transfered from Ruby to M5 for type " \
505 RubyRequestType_to_string(type
));
509 m_outstanding_count
--;
510 assert(m_outstanding_count
>= 0);
512 completeHitCallback(pktList
);
516 GPUCoalescer::empty() const
518 return coalescedTable
.empty();
522 GPUCoalescer::getRequestType(PacketPtr pkt
)
524 RubyRequestType req_type
= RubyRequestType_NULL
;
526 // These types are not support or not used in GPU caches.
527 assert(!pkt
->req
->isLLSC());
528 assert(!pkt
->req
->isLockedRMW());
529 assert(!pkt
->req
->isInstFetch());
530 assert(!pkt
->isFlush());
532 if (pkt
->req
->isAtomicReturn()) {
533 req_type
= RubyRequestType_ATOMIC_RETURN
;
534 } else if (pkt
->req
->isAtomicNoReturn()) {
535 req_type
= RubyRequestType_ATOMIC_NO_RETURN
;
536 } else if (pkt
->isRead()) {
537 req_type
= RubyRequestType_LD
;
538 } else if (pkt
->isWrite()) {
539 req_type
= RubyRequestType_ST
;
541 panic("Unsupported ruby packet type\n");
547 // Places an uncoalesced packet in uncoalescedTable. If the packet is a
548 // special type (MemFence, scoping, etc), it is issued immediately.
550 GPUCoalescer::makeRequest(PacketPtr pkt
)
552 // all packets must have valid instruction sequence numbers
553 assert(pkt
->req
->hasInstSeqNum());
555 if (pkt
->cmd
== MemCmd::MemSyncReq
) {
556 // issue mem_sync requests immediately to the cache system without
557 // going through uncoalescedTable like normal LD/ST/Atomic requests
558 issueMemSyncRequest(pkt
);
560 // otherwise, this must be either read or write command
561 assert(pkt
->isRead() || pkt
->isWrite());
563 // the pkt is temporarily stored in the uncoalesced table until
564 // it's picked for coalescing process later in this cycle or in a
566 uncoalescedTable
.insertPacket(pkt
);
567 DPRINTF(GPUCoalescer
, "Put pkt with addr 0x%X to uncoalescedTable\n",
570 // we schedule an issue event here to process the uncoalesced table
571 // and try to issue Ruby request to cache system
572 if (!issueEvent
.scheduled()) {
573 schedule(issueEvent
, curTick());
577 // we always return RequestStatus_Issued in this coalescer
578 // b/c the coalescer's resouce was checked ealier and the coalescer is
579 // queueing up aliased requets in its coalesced table
580 return RequestStatus_Issued
;
583 template <class KEY
, class VALUE
>
585 operator<<(ostream
&out
, const std::unordered_map
<KEY
, VALUE
> &map
)
588 for (auto i
= map
.begin(); i
!= map
.end(); ++i
)
589 out
<< " " << i
->first
<< "=" << i
->second
;
596 GPUCoalescer::print(ostream
& out
) const
598 out
<< "[GPUCoalescer: " << m_version
599 << ", outstanding requests: " << m_outstanding_count
605 GPUCoalescer::coalescePacket(PacketPtr pkt
)
607 uint64_t seqNum
= pkt
->req
->getReqInstSeqNum();
608 Addr line_addr
= makeLineAddress(pkt
->getAddr());
610 // If the packet has the same line address as a request already in the
611 // coalescedTable and has the same sequence number, it can be coalesced.
612 if (coalescedTable
.count(line_addr
)) {
613 // Search for a previous coalesced request with the same seqNum.
614 auto& creqQueue
= coalescedTable
.at(line_addr
);
615 auto citer
= std::find_if(creqQueue
.begin(), creqQueue
.end(),
616 [&](CoalescedRequest
* c
) { return c
->getSeqNum() == seqNum
; }
618 if (citer
!= creqQueue
.end()) {
619 (*citer
)->insertPacket(pkt
);
624 if (m_outstanding_count
< m_max_outstanding_requests
) {
625 // This is an "aliased" or new request. Create a RubyRequest and
626 // append it to the list of "targets" in the coalescing table.
627 DPRINTF(GPUCoalescer
, "Creating new or aliased request for 0x%X\n",
630 CoalescedRequest
*creq
= new CoalescedRequest(seqNum
);
631 creq
->insertPacket(pkt
);
632 creq
->setRubyType(getRequestType(pkt
));
633 creq
->setIssueTime(curCycle());
635 if (!coalescedTable
.count(line_addr
)) {
636 // If there is no outstanding request for this line address,
637 // create a new coalecsed request and issue it immediately.
638 auto reqList
= std::deque
<CoalescedRequest
*> { creq
};
639 coalescedTable
.insert(std::make_pair(line_addr
, reqList
));
641 DPRINTF(GPUCoalescer
, "Issued req type %s seqNum %d\n",
642 RubyRequestType_to_string(creq
->getRubyType()), seqNum
);
645 // The request is for a line address that is already outstanding
646 // but for a different instruction. Add it as a new request to be
647 // issued when the current outstanding request is completed.
648 coalescedTable
.at(line_addr
).push_back(creq
);
649 DPRINTF(GPUCoalescer
, "found address 0x%X with new seqNum %d\n",
653 // In both cases, requests are added to the coalescing table and will
654 // be counted as outstanding requests.
655 m_outstanding_count
++;
657 // We track all issued or to-be-issued Ruby requests associated with
658 // write instructions. An instruction may have multiple Ruby
660 if (pkt
->cmd
== MemCmd::WriteReq
) {
661 DPRINTF(GPUCoalescer
, "adding write inst %d at line 0x%x to"
662 " the pending write instruction list\n", seqNum
,
665 RubyPort::SenderState
* ss
=
666 safe_cast
<RubyPort::SenderState
*>(pkt
->senderState
);
668 // we need to save this port because it will be used to call
669 // back the requesting CU when we receive write
670 // complete callbacks for all issued Ruby requests of this
672 RubyPort::MemResponsePort
* mem_response_port
= ss
->port
;
674 GPUDynInstPtr gpuDynInst
= nullptr;
676 if (!m_usingRubyTester
) {
677 // If this coalescer is connected to a real CU, we need
678 // to save the corresponding gpu dynamic instruction.
679 // CU will use that instruction to decrement wait counters
680 // in the issuing wavefront.
681 // For Ruby tester, gpuDynInst == nullptr
682 ComputeUnit::DataPort::SenderState
* cu_state
=
683 safe_cast
<ComputeUnit::DataPort::SenderState
*>
685 gpuDynInst
= cu_state
->_gpuDynInst
;
688 PendingWriteInst
& inst
= pendingWriteInsts
[seqNum
];
689 inst
.addPendingReq(mem_response_port
, gpuDynInst
,
696 // The maximum number of outstanding requests have been issued.
701 GPUCoalescer::completeIssue()
703 // Iterate over the maximum number of instructions we can coalesce
704 // per cycle (coalescingWindow).
705 for (int instIdx
= 0; instIdx
< coalescingWindow
; ++instIdx
) {
706 PerInstPackets
*pktList
=
707 uncoalescedTable
.getInstPackets(instIdx
);
709 // getInstPackets will return nullptr if no instruction
710 // exists at the current offset.
714 // Since we have a pointer to the list of packets in the inst,
715 // erase them from the list if coalescing is successful and
716 // leave them in the list otherwise. This aggressively attempts
717 // to coalesce as many packets as possible from the current inst.
719 [&](PacketPtr pkt
) { return coalescePacket(pkt
); }
724 // Clean up any instructions in the uncoalesced table that have had
725 // all of their packets coalesced and return a token for that column.
726 uncoalescedTable
.updateResources();
728 // have Kernel End releases been issued this cycle
729 int len
= newKernelEnds
.size();
730 for (int i
= 0; i
< len
; i
++) {
731 kernelCallback(newKernelEnds
[i
]);
733 newKernelEnds
.clear();
737 GPUCoalescer::evictionCallback(Addr address
)
739 ruby_eviction_callback(address
);
743 GPUCoalescer::kernelCallback(int wavefront_id
)
745 assert(kernelEndList
.count(wavefront_id
));
747 ruby_hit_callback(kernelEndList
[wavefront_id
]);
749 kernelEndList
.erase(wavefront_id
);
753 GPUCoalescer::atomicCallback(Addr address
,
755 const DataBlock
& data
)
757 assert(address
== makeLineAddress(address
));
758 assert(coalescedTable
.count(address
));
760 auto crequest
= coalescedTable
.at(address
).front();
762 fatal_if((crequest
->getRubyType() != RubyRequestType_ATOMIC
&&
763 crequest
->getRubyType() != RubyRequestType_ATOMIC_RETURN
&&
764 crequest
->getRubyType() != RubyRequestType_ATOMIC_NO_RETURN
),
765 "atomicCallback saw non-atomic type response\n");
767 hitCallback(crequest
, mach
, (DataBlock
&)data
, true,
768 crequest
->getIssueTime(), Cycles(0), Cycles(0), false);
771 coalescedTable
.at(address
).pop_front();
773 if (coalescedTable
.at(address
).empty()) {
774 coalescedTable
.erase(address
);
776 auto nextRequest
= coalescedTable
.at(address
).front();
777 issueRequest(nextRequest
);
782 GPUCoalescer::completeHitCallback(std::vector
<PacketPtr
> & mylist
)
784 for (auto& pkt
: mylist
) {
785 RubyPort::SenderState
*ss
=
786 safe_cast
<RubyPort::SenderState
*>(pkt
->senderState
);
787 MemResponsePort
*port
= ss
->port
;
788 assert(port
!= NULL
);
790 pkt
->senderState
= ss
->predecessor
;
792 if (pkt
->cmd
!= MemCmd::WriteReq
) {
793 // for WriteReq, we keep the original senderState until
794 // writeCompleteCallback
798 port
->hitCallback(pkt
);
802 // We schedule an event in the same tick as hitCallback (similar to
803 // makeRequest) rather than calling completeIssue directly to reduce
804 // function calls to complete issue. This can only happen if the max
805 // outstanding requests is less than the number of slots in the
806 // uncoalesced table and makeRequest is not called again.
807 if (uncoalescedTable
.packetAvailable() && !issueEvent
.scheduled()) {
808 schedule(issueEvent
, curTick());
815 GPUCoalescer::recordMissLatency(CoalescedRequest
* crequest
,
817 Cycles initialRequestTime
,
818 Cycles forwardRequestTime
,
819 Cycles firstResponseTime
,
820 bool success
, bool isRegion
)
825 GPUCoalescer::regStats()
827 RubyPort::regStats();
829 // These statistical variables are not for display.
830 // The profiler will collate these across different
831 // coalescers and display those collated statistics.
832 m_outstandReqHist
.init(10);
833 m_latencyHist
.init(10);
834 m_missLatencyHist
.init(10);
836 for (int i
= 0; i
< RubyRequestType_NUM
; i
++) {
837 m_typeLatencyHist
.push_back(new Stats::Histogram());
838 m_typeLatencyHist
[i
]->init(10);
840 m_missTypeLatencyHist
.push_back(new Stats::Histogram());
841 m_missTypeLatencyHist
[i
]->init(10);
844 for (int i
= 0; i
< MachineType_NUM
; i
++) {
845 m_missMachLatencyHist
.push_back(new Stats::Histogram());
846 m_missMachLatencyHist
[i
]->init(10);
848 m_IssueToInitialDelayHist
.push_back(new Stats::Histogram());
849 m_IssueToInitialDelayHist
[i
]->init(10);
851 m_InitialToForwardDelayHist
.push_back(new Stats::Histogram());
852 m_InitialToForwardDelayHist
[i
]->init(10);
854 m_ForwardToFirstResponseDelayHist
.push_back(new Stats::Histogram());
855 m_ForwardToFirstResponseDelayHist
[i
]->init(10);
857 m_FirstResponseToCompletionDelayHist
.push_back(new Stats::Histogram());
858 m_FirstResponseToCompletionDelayHist
[i
]->init(10);
861 for (int i
= 0; i
< RubyRequestType_NUM
; i
++) {
862 m_missTypeMachLatencyHist
.push_back(std::vector
<Stats::Histogram
*>());
864 for (int j
= 0; j
< MachineType_NUM
; j
++) {
865 m_missTypeMachLatencyHist
[i
].push_back(new Stats::Histogram());
866 m_missTypeMachLatencyHist
[i
][j
]->init(10);