/*
 * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
#include "base/logging.hh"
#include "base/str.hh"
#include "config/the_isa.hh"

#if THE_ISA == X86_ISA
#include "arch/x86/insts/microldstop.hh"
#endif

#include "mem/ruby/system/GPUCoalescer.hh"

#include "cpu/testers/rubytest/RubyTester.hh"
#include "debug/GPUCoalescer.hh"
#include "debug/MemoryAccess.hh"
#include "debug/ProtocolTrace.hh"
#include "debug/RubyPort.hh"
#include "debug/RubyStats.hh"
#include "gpu-compute/shader.hh"
#include "mem/packet.hh"
#include "mem/ruby/common/DataBlock.hh"
#include "mem/ruby/common/SubBlock.hh"
#include "mem/ruby/network/MessageBuffer.hh"
#include "mem/ruby/profiler/Profiler.hh"
#include "mem/ruby/slicc_interface/AbstractController.hh"
#include "mem/ruby/slicc_interface/RubyRequest.hh"
#include "mem/ruby/structures/CacheMemory.hh"
#include "mem/ruby/system/RubySystem.hh"
#include "params/RubyGPUCoalescer.hh"

using namespace std;
GPUCoalescer *
RubyGPUCoalescerParams::create()
{
    return new GPUCoalescer(this);
}
HSAScope
reqScopeToHSAScope(const RequestPtr &req)
{
    HSAScope accessScope = HSAScope_UNSPECIFIED;
    if (req->isScoped()) {
        if (req->isWavefrontScope()) {
            accessScope = HSAScope_WAVEFRONT;
        } else if (req->isWorkgroupScope()) {
            accessScope = HSAScope_WORKGROUP;
        } else if (req->isDeviceScope()) {
            accessScope = HSAScope_DEVICE;
        } else if (req->isSystemScope()) {
            accessScope = HSAScope_SYSTEM;
        } else {
            fatal("Bad scope type");
        }
    }
    return accessScope;
}
HSASegment
reqSegmentToHSASegment(const RequestPtr &req)
{
    HSASegment accessSegment = HSASegment_GLOBAL;

    if (req->isGlobalSegment()) {
        accessSegment = HSASegment_GLOBAL;
    } else if (req->isGroupSegment()) {
        accessSegment = HSASegment_GROUP;
    } else if (req->isPrivateSegment()) {
        accessSegment = HSASegment_PRIVATE;
    } else if (req->isKernargSegment()) {
        accessSegment = HSASegment_KERNARG;
    } else if (req->isReadonlySegment()) {
        accessSegment = HSASegment_READONLY;
    } else if (req->isSpillSegment()) {
        accessSegment = HSASegment_SPILL;
    } else if (req->isArgSegment()) {
        accessSegment = HSASegment_ARG;
    } else {
        fatal("Bad segment type");
    }

    return accessSegment;
}
UncoalescedTable::UncoalescedTable(GPUCoalescer *gc)
    : coalescer(gc)
{
}
void
UncoalescedTable::insertPacket(PacketPtr pkt)
{
    uint64_t seqNum = pkt->req->getReqInstSeqNum();

    instMap[seqNum].push_back(pkt);
    DPRINTF(GPUCoalescer, "Adding 0x%X seqNum %d to map. (map %d vec %d)\n",
            pkt->getAddr(), seqNum, instMap.size(), instMap[seqNum].size());
}
bool
UncoalescedTable::packetAvailable()
{
    return !instMap.empty();
}
PerInstPackets*
UncoalescedTable::getInstPackets(int offset)
{
    if (offset >= instMap.size()) {
        return nullptr;
    }

    auto instMapIter = instMap.begin();
    std::advance(instMapIter, offset);

    return &(instMapIter->second);
}
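// A note on offsets (summarizing the behavior above, and assuming instMap
// is an ordered map keyed by instruction sequence number): an offset of 0
// returns the packet list of the oldest in-flight instruction, 1 the next
// oldest, and so on. Callers walk increasing offsets until nullptr comes
// back.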
void
UncoalescedTable::updateResources()
{
    for (auto iter = instMap.begin(); iter != instMap.end(); ) {
        if (iter->second.empty()) {
            instMap.erase(iter++);
            coalescer->getGMTokenPort().sendTokens(1);
        } else {
            ++iter;
        }
    }
}
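// Each fully coalesced instruction returns one token through the
// GMTokenPort above. A token corresponds to one instruction slot in this
// table rather than to an individual packet, so the shader-side pipeline
// is throttled per instruction (this pairing is inferred from the token
// send above and the token port wiring in the GPUCoalescer constructor).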
void
UncoalescedTable::printRequestTable(std::stringstream& ss)
{
    ss << "UncoalescedTable contains " << instMap.size()
       << " address entries." << std::endl;
    for (auto& inst : instMap) {
        ss << "Addr 0x" << std::hex << inst.first << std::dec
           << " with " << inst.second.size() << " packets"
           << std::endl;
    }
}
void
UncoalescedTable::checkDeadlock(Tick threshold)
{
    Tick current_time = curTick();

    for (auto &it : instMap) {
        for (auto &pkt : it.second) {
            if (current_time - pkt->req->time() > threshold) {
                std::stringstream ss;
                printRequestTable(ss);

                panic("Possible Deadlock detected. Aborting!\n"
                      "version: %d request.paddr: 0x%x uncoalescedTable: %d "
                      "current time: %u issue_time: %d difference: %d\n"
                      "Request Tables:\n\n%s", coalescer->getId(),
                      pkt->getAddr(), instMap.size(), current_time,
                      pkt->req->time(), current_time - pkt->req->time(),
                      ss.str());
            }
        }
    }
}
GPUCoalescer::GPUCoalescer(const Params *p)
    : RubyPort(p),
      issueEvent([this]{ completeIssue(); }, "Issue coalesced request",
                 false, Event::Progress_Event_Pri),
      uncoalescedTable(this),
      deadlockCheckEvent([this]{ wakeup(); }, "GPUCoalescer deadlock check"),
      gmTokenPort(name() + ".gmTokenPort", this)
{
    m_store_waiting_on_load_cycles = 0;
    m_store_waiting_on_store_cycles = 0;
    m_load_waiting_on_store_cycles = 0;
    m_load_waiting_on_load_cycles = 0;

    m_outstanding_count = 0;

    coalescingWindow = p->max_coalesces_per_cycle;

    m_max_outstanding_requests = 0;
    m_instCache_ptr = nullptr;
    m_dataCache_ptr = nullptr;

    m_instCache_ptr = p->icache;
    m_dataCache_ptr = p->dcache;
    m_max_outstanding_requests = p->max_outstanding_requests;
    m_deadlock_threshold = p->deadlock_threshold;

    assert(m_max_outstanding_requests > 0);
    assert(m_deadlock_threshold > 0);
    assert(m_instCache_ptr);
    assert(m_dataCache_ptr);

    m_runningGarnetStandalone = p->garnet_standalone;
    assumingRfOCoherence = p->assume_rfo;
}
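// Note: assumingRfOCoherence (the assume_rfo parameter) decides whether
// makeRequest() may complete memory fences locally rather than deferring
// them to a child coalescer; see makeRequest() below.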
GPUCoalescer::~GPUCoalescer()
{
}
Port &
GPUCoalescer::getPort(const std::string &if_name, PortID idx)
{
    if (if_name == "gmTokenPort") {
        return gmTokenPort;
    }

    // delegate to RubyPort otherwise
    return RubyPort::getPort(if_name, idx);
}
void
GPUCoalescer::wakeup()
{
    Cycles current_time = curCycle();
    for (auto& requestList : coalescedTable) {
        for (auto& req : requestList.second) {
            if (current_time - req->getIssueTime() > m_deadlock_threshold) {
                std::stringstream ss;
                printRequestTable(ss);
                ss << "Outstanding requests: " << m_outstanding_count
                   << std::endl;

                panic("Possible Deadlock detected. Aborting!\n"
                      "version: %d request.paddr: 0x%x coalescedTable: %d "
                      "current time: %u issue_time: %d difference: %d\n"
                      "Request Tables:\n %s", m_version,
                      req->getFirstPkt()->getAddr(),
                      coalescedTable.size(), cyclesToTicks(current_time),
                      cyclesToTicks(req->getIssueTime()),
                      cyclesToTicks(current_time - req->getIssueTime()),
                      ss.str());
            }
        }
    }

    Tick tick_threshold = cyclesToTicks(m_deadlock_threshold);
    uncoalescedTable.checkDeadlock(tick_threshold);

    if (m_outstanding_count > 0) {
        schedule(deadlockCheckEvent,
                 m_deadlock_threshold * clockPeriod() +
                 curTick());
    }
}
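// wakeup() doubles as the deadlock watchdog: it panics if any coalesced
// request has been outstanding for more than m_deadlock_threshold cycles,
// runs the same check over the uncoalesced table, and re-arms itself one
// threshold later while requests remain outstanding.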
void
GPUCoalescer::printRequestTable(std::stringstream& ss)
{
    uncoalescedTable.printRequestTable(ss);

    ss << "CoalescedTable contains " << coalescedTable.size()
       << " address entries." << std::endl;
    for (auto& requestList : coalescedTable) {
        ss << "Addr 0x" << std::hex << requestList.first << std::dec
           << ": ";
        for (auto& request : requestList.second) {
            ss << RubyRequestType_to_string(request->getRubyType())
               << " pkts-" << request->getPackets().size()
               << " issued-" << request->getIssueTime() << " seqNum-"
               << request->getSeqNum() << "; ";
        }
        ss << std::endl;
    }
}
void
GPUCoalescer::resetStats()
{
    m_latencyHist.reset();
    m_missLatencyHist.reset();
    for (int i = 0; i < RubyRequestType_NUM; i++) {
        m_typeLatencyHist[i]->reset();
        m_missTypeLatencyHist[i]->reset();
        for (int j = 0; j < MachineType_NUM; j++) {
            m_missTypeMachLatencyHist[i][j]->reset();
        }
    }

    for (int i = 0; i < MachineType_NUM; i++) {
        m_missMachLatencyHist[i]->reset();

        m_IssueToInitialDelayHist[i]->reset();
        m_InitialToForwardDelayHist[i]->reset();
        m_ForwardToFirstResponseDelayHist[i]->reset();
        m_FirstResponseToCompletionDelayHist[i]->reset();
    }
}
void
GPUCoalescer::printProgress(ostream& out) const
{
}
// sets the kernelEndList
void
GPUCoalescer::insertKernel(int wavefront_id, PacketPtr pkt)
{
    // It is unclear whether a duplicate wavefront id can actually occur
    // here, but assert on it so a duplicate cannot silently hang the
    // simulator later.
    DPRINTF(GPUCoalescer, "inserting wf: %d to kernelEndlist\n", wavefront_id);
    assert(kernelEndList.count(wavefront_id) == 0);

    kernelEndList[wavefront_id] = pkt;
    DPRINTF(GPUCoalescer, "kernelEndList->size() = %d\n",
            kernelEndList.size());
}
void
GPUCoalescer::writeCallback(Addr address, DataBlock& data)
{
    writeCallback(address, MachineType_NULL, data);
}

void
GPUCoalescer::writeCallback(Addr address,
                            MachineType mach,
                            DataBlock& data)
{
    writeCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
}

void
GPUCoalescer::writeCallback(Addr address,
                            MachineType mach,
                            DataBlock& data,
                            Cycles initialRequestTime,
                            Cycles forwardRequestTime,
                            Cycles firstResponseTime)
{
    writeCallback(address, mach, data,
                  initialRequestTime, forwardRequestTime, firstResponseTime,
                  false);
}

void
GPUCoalescer::writeCallback(Addr address,
                            MachineType mach,
                            DataBlock& data,
                            Cycles initialRequestTime,
                            Cycles forwardRequestTime,
                            Cycles firstResponseTime,
                            bool isRegion)
{
    assert(address == makeLineAddress(address));
    assert(coalescedTable.count(address));

    auto crequest = coalescedTable.at(address).front();

    hitCallback(crequest, mach, data, true, crequest->getIssueTime(),
                forwardRequestTime, firstResponseTime, isRegion);

    delete crequest;
    coalescedTable.at(address).pop_front();

    if (coalescedTable.at(address).empty()) {
        coalescedTable.erase(address);
    } else {
        auto nextRequest = coalescedTable.at(address).front();
        issueRequest(nextRequest);
    }
}
void
GPUCoalescer::readCallback(Addr address, DataBlock& data)
{
    readCallback(address, MachineType_NULL, data);
}

void
GPUCoalescer::readCallback(Addr address,
                           MachineType mach,
                           DataBlock& data)
{
    readCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
}

void
GPUCoalescer::readCallback(Addr address,
                           MachineType mach,
                           DataBlock& data,
                           Cycles initialRequestTime,
                           Cycles forwardRequestTime,
                           Cycles firstResponseTime)
{
    readCallback(address, mach, data,
                 initialRequestTime, forwardRequestTime, firstResponseTime,
                 false);
}

void
GPUCoalescer::readCallback(Addr address,
                           MachineType mach,
                           DataBlock& data,
                           Cycles initialRequestTime,
                           Cycles forwardRequestTime,
                           Cycles firstResponseTime,
                           bool isRegion)
{
    assert(address == makeLineAddress(address));
    assert(coalescedTable.count(address));

    auto crequest = coalescedTable.at(address).front();
    fatal_if(crequest->getRubyType() != RubyRequestType_LD,
             "readCallback received non-read type response\n");

    // Iterate over the coalesced requests to respond to as many loads as
    // possible until another request type is seen. Models MSHR for TCP.
    while (crequest->getRubyType() == RubyRequestType_LD) {
        hitCallback(crequest, mach, data, true, crequest->getIssueTime(),
                    forwardRequestTime, firstResponseTime, isRegion);

        delete crequest;
        coalescedTable.at(address).pop_front();
        if (coalescedTable.at(address).empty()) {
            break;
        }

        crequest = coalescedTable.at(address).front();
    }

    if (coalescedTable.at(address).empty()) {
        coalescedTable.erase(address);
    } else {
        auto nextRequest = coalescedTable.at(address).front();
        issueRequest(nextRequest);
    }
}
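// Unlike writeCallback, which retires exactly one coalesced request per
// response, readCallback lets a single line fill satisfy every consecutive
// load at the head of the per-line queue before the next request (if any)
// is issued; per the comment above, this models an MSHR for the TCP.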
void
GPUCoalescer::hitCallback(CoalescedRequest* crequest,
                          MachineType mach,
                          DataBlock& data,
                          bool success,
                          Cycles initialRequestTime,
                          Cycles forwardRequestTime,
                          Cycles firstResponseTime,
                          bool isRegion)
{
    PacketPtr pkt = crequest->getFirstPkt();
    Addr request_address = pkt->getAddr();
    Addr request_line_address = makeLineAddress(request_address);

    RubyRequestType type = crequest->getRubyType();

    DPRINTF(GPUCoalescer, "Got hitCallback for 0x%X\n", request_line_address);

    recordMissLatency(crequest, mach,
                      initialRequestTime,
                      forwardRequestTime,
                      firstResponseTime,
                      success, isRegion);
    // update the data
    //
    // Note: this must be done for each request in the coalescer
    std::vector<PacketPtr> pktList = crequest->getPackets();
    DPRINTF(GPUCoalescer, "Responding to %d packets for addr 0x%X\n",
            pktList.size(), request_line_address);
    for (auto& pkt : pktList) {
        request_address = pkt->getAddr();
        if (pkt->getPtr<uint8_t>()) {
            if ((type == RubyRequestType_LD) ||
                (type == RubyRequestType_ATOMIC) ||
                (type == RubyRequestType_ATOMIC_RETURN) ||
                (type == RubyRequestType_IFETCH) ||
                (type == RubyRequestType_RMW_Read) ||
                (type == RubyRequestType_Locked_RMW_Read) ||
                (type == RubyRequestType_Load_Linked)) {
                pkt->setData(
                    data.getData(getOffset(request_address), pkt->getSize()));
            } else {
                data.setData(pkt->getPtr<uint8_t>(),
                             getOffset(request_address), pkt->getSize());
            }
        } else {
            DPRINTF(MemoryAccess,
                    "WARNING. Data not transferred from Ruby to M5 for type " \
                    "%s\n",
                    RubyRequestType_to_string(type));
        }
    }

    // If using the RubyTester, update the RubyTester sender state's
    // subBlock with the received data. The tester will later access
    // this state.
    // Note: RubyPort will access its sender state before the
    // RubyTester.
    if (m_usingRubyTester) {
        RubyPort::SenderState *requestSenderState =
            safe_cast<RubyPort::SenderState*>(pkt->senderState);
        RubyTester::SenderState* testerSenderState =
            safe_cast<RubyTester::SenderState*>
                (requestSenderState->predecessor);
        testerSenderState->subBlock.mergeFrom(data);
    }

    m_outstanding_count--;
    assert(m_outstanding_count >= 0);

    completeHitCallback(pktList);
}
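// Data movement summary for the loop above: for loads, atomics with
// return, and other read-like types, the line data is copied into each
// coalesced packet; for stores, the packet data is copied into the
// DataBlock. A packet without a data pointer only triggers the
// MemoryAccess warning.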
bool
GPUCoalescer::empty() const
{
    return coalescedTable.empty();
}
RubyRequestType
GPUCoalescer::getRequestType(PacketPtr pkt)
{
    RubyRequestType req_type = RubyRequestType_NULL;

    // These types are not supported or not used in GPU caches.
    assert(!pkt->req->isLLSC());
    assert(!pkt->req->isLockedRMW());
    assert(!pkt->req->isInstFetch());
    assert(!pkt->isFlush());

    if (pkt->req->isAtomicReturn()) {
        req_type = RubyRequestType_ATOMIC_RETURN;
    } else if (pkt->req->isAtomicNoReturn()) {
        req_type = RubyRequestType_ATOMIC_NO_RETURN;
    } else if (pkt->isRead()) {
        req_type = RubyRequestType_LD;
    } else if (pkt->isWrite()) {
        req_type = RubyRequestType_ST;
    } else {
        // Acquire and release packets will have been issued by
        // makeRequest, so we do not need to check for them here.
        panic("Unsupported ruby packet type\n");
    }

    return req_type;
}
// Places an uncoalesced packet in uncoalescedTable. If the packet is a
// special type (MemFence, scoping, etc), it is issued immediately.
RequestStatus
GPUCoalescer::makeRequest(PacketPtr pkt)
{
    // Check for GPU Barrier Kernel End or Kernel Begin.
    // Leave these to be handled by the child class.
    // Kernel End/Barrier = isFlush + isRelease
    // Kernel Begin = isFlush + isAcquire
    if (pkt->req->isKernel()) {
        if (pkt->req->isAcquire()) {
            // This is a Kernel Begin; leave handling to
            // virtual xCoalescer::makeRequest
            return RequestStatus_Issued;
        } else if (pkt->req->isRelease()) {
            // This is a Kernel End; leave handling to
            // virtual xCoalescer::makeRequest.
            // If we are here, a virtual version of this function
            // was not called, so we will also schedule the callback.
            int wf_id = 0;
            if (pkt->req->hasContextId()) {
                wf_id = pkt->req->contextId();
            }
            insertKernel(wf_id, pkt);
            newKernelEnds.push_back(wf_id);
            if (!issueEvent.scheduled()) {
                schedule(issueEvent, curTick());
            }
            return RequestStatus_Issued;
        }
    }

    if (!pkt->isLLSC() && !pkt->req->isLockedRMW() && !pkt->isAtomicOp() &&
        !pkt->isRead() && !pkt->isWrite() && !pkt->isFlush() &&
        (pkt->req->isRelease() || pkt->req->isAcquire())) {
        if (assumingRfOCoherence) {
            // If we reached here, this request must be a memFence
            // and the protocol implements RfO; the coalescer can
            // assume sequential consistency and schedule the callback.
            //
            // Currently the code implements fence callbacks
            // by reusing the mechanism for kernel completions.
            // This should be fixed.
            int wf_id = 0;
            if (pkt->req->hasContextId()) {
                wf_id = pkt->req->contextId();
            }
            insertKernel(wf_id, pkt);
            newKernelEnds.push_back(wf_id);
            if (!issueEvent.scheduled()) {
                schedule(issueEvent, curTick());
            }
            return RequestStatus_Issued;
        } else {
            // If not RfO, return issued here and let the child coalescer
            // handle the fence.
            return RequestStatus_Issued;
        }
    }

    uncoalescedTable.insertPacket(pkt);
    DPRINTF(GPUCoalescer, "UC insertPacket 0x%X\n", pkt->getAddr());

    if (!issueEvent.scheduled())
        schedule(issueEvent, curTick());

    // TODO: issue hardware prefetches here
    return RequestStatus_Issued;
}
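// Request lifecycle for ordinary loads, stores, and atomics:
//   makeRequest()    - packet parked in uncoalescedTable; issueEvent is
//                      scheduled for the current tick
//   completeIssue()  - examines up to coalescingWindow instructions and
//                      offers each of their packets to coalescePacket()
//   coalescePacket() - merges the packet into an existing
//                      CoalescedRequest, or creates a new one (issued
//                      immediately if it is the first for its line)
//   read/write/atomicCallback() - the Ruby response retires the request
//                      through hitCallback()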
void
GPUCoalescer::issueRequest(CoalescedRequest* crequest)
{
    PacketPtr pkt = crequest->getFirstPkt();

    int proc_id = -1;
    if (pkt != NULL && pkt->req->hasContextId()) {
        proc_id = pkt->req->contextId();
    }

    // If valid, copy the pc to the ruby request
    Addr pc = 0;
    if (pkt->req->hasPC()) {
        pc = pkt->req->getPC();
    }

    // At the moment, setting scopes only counts for GPU spill space
    // accesses (i.e., pkt->req->isStack()). The scope for these is
    // REPLACE, since spill space does not need to be flushed at the end
    // of a kernel; private and local may need to be visible at the end
    // of the kernel.
    HSASegment accessSegment = reqSegmentToHSASegment(pkt->req);
    HSAScope accessScope = reqScopeToHSAScope(pkt->req);

    Addr line_addr = makeLineAddress(pkt->getAddr());

    // Creating WriteMask that records written bytes
    // and atomic operations. This enables partial writes
    // and partial reads of those writes
    DataBlock dataBlock;
    dataBlock.clear();
    uint32_t blockSize = RubySystem::getBlockSizeBytes();
    std::vector<bool> accessMask(blockSize, false);
    std::vector<std::pair<int, AtomicOpFunctor*>> atomicOps;
    uint32_t tableSize = crequest->getPackets().size();
    for (int i = 0; i < tableSize; i++) {
        PacketPtr tmpPkt = crequest->getPackets()[i];
        uint32_t tmpOffset = (tmpPkt->getAddr()) - line_addr;
        uint32_t tmpSize = tmpPkt->getSize();
        if (tmpPkt->isAtomicOp()) {
            std::pair<int, AtomicOpFunctor*> tmpAtomicOp(tmpOffset,
                                                         tmpPkt->getAtomicOp());
            atomicOps.push_back(tmpAtomicOp);
        } else if (tmpPkt->isWrite()) {
            dataBlock.setData(tmpPkt->getPtr<uint8_t>(),
                              tmpOffset, tmpSize);
        }
        for (int j = 0; j < tmpSize; j++) {
            accessMask[tmpOffset + j] = true;
        }
    }

    std::shared_ptr<RubyRequest> msg;
    if (pkt->isAtomicOp()) {
        msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
                              pkt->getPtr<uint8_t>(),
                              pkt->getSize(), pc, crequest->getRubyType(),
                              RubyAccessMode_Supervisor, pkt,
                              PrefetchBit_No, proc_id, 100,
                              blockSize, accessMask,
                              dataBlock, atomicOps,
                              accessScope, accessSegment);
    } else {
        msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
                              pkt->getPtr<uint8_t>(),
                              pkt->getSize(), pc, crequest->getRubyType(),
                              RubyAccessMode_Supervisor, pkt,
                              PrefetchBit_No, proc_id, 100,
                              blockSize, accessMask,
                              dataBlock,
                              accessScope, accessSegment);
    }

    DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %s\n",
             curTick(), m_version, "Coal", "Begin", "", "",
             printAddress(msg->getPhysicalAddress()),
             RubyRequestType_to_string(crequest->getRubyType()));

    fatal_if(crequest->getRubyType() == RubyRequestType_IFETCH,
             "there should not be any I-Fetch requests in the GPU Coalescer");

    Tick latency = cyclesToTicks(
        m_controller->mandatoryQueueLatency(crequest->getRubyType()));
    assert(latency > 0);

    if (!deadlockCheckEvent.scheduled()) {
        schedule(deadlockCheckEvent,
                 m_deadlock_threshold * clockPeriod() +
                 curTick());
    }

    assert(m_mandatory_q_ptr);
    m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency);
}
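// Worked example of the write mask built above (assuming a 64 B line):
// two coalesced stores, 4 B at line offset 0 and 8 B at offset 16, set
// accessMask[0..3] and accessMask[16..23] to true and place the store
// data in dataBlock at those offsets, so the protocol can apply a partial
// write of just the touched bytes.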
template <class KEY, class VALUE>
std::ostream &
operator<<(ostream &out, const std::unordered_map<KEY, VALUE> &map)
{
    out << "[";
    for (auto i = map.begin(); i != map.end(); ++i)
        out << " " << i->first << "=" << i->second;
    out << " ]";

    return out;
}
void
GPUCoalescer::print(ostream& out) const
{
    out << "[GPUCoalescer: " << m_version
        << ", outstanding requests: " << m_outstanding_count
        << "]";
}
void
GPUCoalescer::recordRequestType(SequencerRequestType requestType) {
    DPRINTF(RubyStats, "Recorded statistic: %s\n",
            SequencerRequestType_to_string(requestType));
}
bool
GPUCoalescer::coalescePacket(PacketPtr pkt)
{
    uint64_t seqNum = pkt->req->getReqInstSeqNum();
    Addr line_addr = makeLineAddress(pkt->getAddr());

    // If the packet has the same line address as a request already in the
    // coalescedTable and has the same sequence number, it can be coalesced.
    if (coalescedTable.count(line_addr)) {
        // Search for a previous coalesced request with the same seqNum.
        auto& creqQueue = coalescedTable.at(line_addr);
        auto citer = std::find_if(creqQueue.begin(), creqQueue.end(),
            [&](CoalescedRequest* c) { return c->getSeqNum() == seqNum; }
        );
        if (citer != creqQueue.end()) {
            (*citer)->insertPacket(pkt);
            return true;
        }
    }

    if (m_outstanding_count < m_max_outstanding_requests) {
        // This is an "aliased" or new request. Create a RubyRequest and
        // append it to the list of "targets" in the coalescing table.
        DPRINTF(GPUCoalescer, "Creating new or aliased request for 0x%X\n",
                line_addr);

        CoalescedRequest *creq = new CoalescedRequest(seqNum);
        creq->insertPacket(pkt);
        creq->setRubyType(getRequestType(pkt));
        creq->setIssueTime(curCycle());

        if (!coalescedTable.count(line_addr)) {
            // If there is no outstanding request for this line address,
            // create a new coalesced request and issue it immediately.
            auto reqList = std::deque<CoalescedRequest*> { creq };
            coalescedTable.insert(std::make_pair(line_addr, reqList));

            DPRINTF(GPUCoalescer, "Issued req type %s seqNum %d\n",
                    RubyRequestType_to_string(creq->getRubyType()), seqNum);
            issueRequest(creq);
        } else {
            // The request is for a line address that is already outstanding
            // but for a different instruction. Add it as a new request to be
            // issued when the current outstanding request is completed.
            coalescedTable.at(line_addr).push_back(creq);
            DPRINTF(GPUCoalescer, "found address 0x%X with new seqNum %d\n",
                    line_addr, seqNum);
        }

        // In both cases, requests are added to the coalescing table and will
        // be counted as outstanding requests.
        m_outstanding_count++;

        return true;
    }

    // The maximum number of outstanding requests has been reached.
    return false;
}
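// coalescePacket() returns true when the packet was absorbed (coalesced
// into an existing request, aliased behind one, or issued as a new one)
// and false only when m_outstanding_count has hit the limit;
// completeIssue() relies on this through remove_if, leaving unabsorbed
// packets in the uncoalesced table for a later cycle.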
void
GPUCoalescer::completeIssue()
{
    // Iterate over the maximum number of instructions we can coalesce
    // per cycle (coalescingWindow).
    for (int instIdx = 0; instIdx < coalescingWindow; ++instIdx) {
        PerInstPackets *pktList =
            uncoalescedTable.getInstPackets(instIdx);

        // getInstPackets will return nullptr if no instruction
        // exists at the current offset.
        if (!pktList) {
            break;
        } else {
            // Since we have a pointer to the list of packets in the inst,
            // erase them from the list if coalescing is successful and
            // leave them in the list otherwise. This aggressively attempts
            // to coalesce as many packets as possible from the current inst.
            pktList->remove_if(
                [&](PacketPtr pkt) { return coalescePacket(pkt); }
            );
        }
    }

    // Clean up any instructions in the uncoalesced table that have had
    // all of their packets coalesced and return a token for that column.
    uncoalescedTable.updateResources();

    // have any Kernel End releases been issued this cycle?
    int len = newKernelEnds.size();
    for (int i = 0; i < len; i++) {
        kernelCallback(newKernelEnds[i]);
    }
    newKernelEnds.clear();
}
void
GPUCoalescer::evictionCallback(Addr address)
{
    ruby_eviction_callback(address);
}
void
GPUCoalescer::kernelCallback(int wavefront_id)
{
    assert(kernelEndList.count(wavefront_id));

    ruby_hit_callback(kernelEndList[wavefront_id]);

    kernelEndList.erase(wavefront_id);
}
void
GPUCoalescer::atomicCallback(Addr address,
                             MachineType mach,
                             const DataBlock& data)
{
    assert(address == makeLineAddress(address));
    assert(coalescedTable.count(address));

    auto crequest = coalescedTable.at(address).front();

    fatal_if((crequest->getRubyType() != RubyRequestType_ATOMIC &&
              crequest->getRubyType() != RubyRequestType_ATOMIC_RETURN &&
              crequest->getRubyType() != RubyRequestType_ATOMIC_NO_RETURN),
             "atomicCallback saw non-atomic type response\n");

    hitCallback(crequest, mach, (DataBlock&)data, true,
                crequest->getIssueTime(), Cycles(0), Cycles(0), false);

    delete crequest;
    coalescedTable.at(address).pop_front();

    if (coalescedTable.at(address).empty()) {
        coalescedTable.erase(address);
    } else {
        auto nextRequest = coalescedTable.at(address).front();
        issueRequest(nextRequest);
    }
}
void
GPUCoalescer::recordCPReadCallBack(MachineID myMachID, MachineID senderMachID)
{
    if (myMachID == senderMachID) {
        CP_TCPLdHits++;
    } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
        CP_TCPLdTransfers++;
    } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
        CP_TCCLdHits++;
    } else {
        CP_LdMiss++;
    }
}

void
GPUCoalescer::recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID)
{
    if (myMachID == senderMachID) {
        CP_TCPStHits++;
    } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
        CP_TCPStTransfers++;
    } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
        CP_TCCStHits++;
    } else {
        CP_StMiss++;
    }
}
void
GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist)
{
    for (auto& pkt : mylist) {
        RubyPort::SenderState *ss =
            safe_cast<RubyPort::SenderState *>(pkt->senderState);
        MemSlavePort *port = ss->port;
        assert(port != NULL);

        pkt->senderState = ss->predecessor;
        delete ss;
        port->hitCallback(pkt);
    }

    // We schedule an event in the same tick as hitCallback (similar to
    // makeRequest) rather than calling completeIssue directly to reduce
    // function calls to complete issue. This can only happen if the max
    // outstanding requests is less than the number of slots in the
    // uncoalesced table and makeRequest is not called again.
    if (uncoalescedTable.packetAvailable() && !issueEvent.scheduled()) {
        schedule(issueEvent, curTick());
    }

    testDrainComplete();
}
void
GPUCoalescer::recordMissLatency(CoalescedRequest* crequest,
                                MachineType mach,
                                Cycles initialRequestTime,
                                Cycles forwardRequestTime,
                                Cycles firstResponseTime,
                                bool success, bool isRegion)
{
    RubyRequestType type = crequest->getRubyType();
    Cycles issued_time = crequest->getIssueTime();
    Cycles completion_time = curCycle();
    assert(completion_time >= issued_time);
    Cycles total_lat = completion_time - issued_time;

    // cache stats (valid for RfO protocol only)
    if (mach == MachineType_TCP) {
        if (type == RubyRequestType_LD) {
            GPU_TCPLdHits++;
        } else {
            GPU_TCPStHits++;
        }
    } else if (mach == MachineType_L1Cache_wCC) {
        if (type == RubyRequestType_LD) {
            GPU_TCPLdTransfers++;
        } else {
            GPU_TCPStTransfers++;
        }
    } else if (mach == MachineType_TCC) {
        if (type == RubyRequestType_LD) {
            GPU_TCCLdHits++;
        } else {
            GPU_TCCStHits++;
        }
    } else {
        if (type == RubyRequestType_LD) {
            GPU_LdMiss++;
        } else {
            GPU_StMiss++;
        }
    }

    // Profile all access latency, even zero latency accesses
    m_latencyHist.sample(total_lat);
    m_typeLatencyHist[type]->sample(total_lat);

    // Profile the miss latency for all non-zero demand misses
    if (total_lat != Cycles(0)) {
        m_missLatencyHist.sample(total_lat);
        m_missTypeLatencyHist[type]->sample(total_lat);

        if (mach != MachineType_NUM) {
            m_missMachLatencyHist[mach]->sample(total_lat);
            m_missTypeMachLatencyHist[type][mach]->sample(total_lat);

            if ((issued_time <= initialRequestTime) &&
                (initialRequestTime <= forwardRequestTime) &&
                (forwardRequestTime <= firstResponseTime) &&
                (firstResponseTime <= completion_time)) {

                m_IssueToInitialDelayHist[mach]->sample(
                    initialRequestTime - issued_time);
                m_InitialToForwardDelayHist[mach]->sample(
                    forwardRequestTime - initialRequestTime);
                m_ForwardToFirstResponseDelayHist[mach]->sample(
                    firstResponseTime - forwardRequestTime);
                m_FirstResponseToCompletionDelayHist[mach]->sample(
                    completion_time - firstResponseTime);
            }
        }
    }

    DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %d cycles\n",
             curTick(), m_version, "Coal",
             success ? "Done" : "SC_Failed", "", "",
             printAddress(crequest->getFirstPkt()->getAddr()), total_lat);
}
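// The four delay histograms above decompose a miss latency into
//   issue -> initialRequest -> forwardRequest -> firstResponse -> completion
// and are only sampled when the timestamps are monotonically ordered,
// presumably because not every protocol fills in every intermediate
// timestamp.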
void
GPUCoalescer::regStats()
{
    RubyPort::regStats();

    // These statistical variables are not for display.
    // The profiler will collate these across different
    // coalescers and display those collated statistics.
    m_outstandReqHist.init(10);
    m_latencyHist.init(10);
    m_missLatencyHist.init(10);

    for (int i = 0; i < RubyRequestType_NUM; i++) {
        m_typeLatencyHist.push_back(new Stats::Histogram());
        m_typeLatencyHist[i]->init(10);

        m_missTypeLatencyHist.push_back(new Stats::Histogram());
        m_missTypeLatencyHist[i]->init(10);
    }

    for (int i = 0; i < MachineType_NUM; i++) {
        m_missMachLatencyHist.push_back(new Stats::Histogram());
        m_missMachLatencyHist[i]->init(10);

        m_IssueToInitialDelayHist.push_back(new Stats::Histogram());
        m_IssueToInitialDelayHist[i]->init(10);

        m_InitialToForwardDelayHist.push_back(new Stats::Histogram());
        m_InitialToForwardDelayHist[i]->init(10);

        m_ForwardToFirstResponseDelayHist.push_back(new Stats::Histogram());
        m_ForwardToFirstResponseDelayHist[i]->init(10);

        m_FirstResponseToCompletionDelayHist.push_back(new Stats::Histogram());
        m_FirstResponseToCompletionDelayHist[i]->init(10);
    }

    for (int i = 0; i < RubyRequestType_NUM; i++) {
        m_missTypeMachLatencyHist.push_back(std::vector<Stats::Histogram *>());

        for (int j = 0; j < MachineType_NUM; j++) {
            m_missTypeMachLatencyHist[i].push_back(new Stats::Histogram());
            m_missTypeMachLatencyHist[i][j]->init(10);
        }
    }

    // GPU cache stats
    GPU_TCPLdHits
        .name(name() + ".gpu_tcp_ld_hits")
        .desc("loads that hit in the TCP")
        ;
    GPU_TCPLdTransfers
        .name(name() + ".gpu_tcp_ld_transfers")
        .desc("TCP to TCP load transfers")
        ;
    GPU_TCCLdHits
        .name(name() + ".gpu_tcc_ld_hits")
        .desc("loads that hit in the TCC")
        ;
    GPU_LdMiss
        .name(name() + ".gpu_ld_misses")
        .desc("loads that miss in the GPU")
        ;
    GPU_TCPStHits
        .name(name() + ".gpu_tcp_st_hits")
        .desc("stores that hit in the TCP")
        ;
    GPU_TCPStTransfers
        .name(name() + ".gpu_tcp_st_transfers")
        .desc("TCP to TCP store transfers")
        ;
    GPU_TCCStHits
        .name(name() + ".gpu_tcc_st_hits")
        .desc("stores that hit in the TCC")
        ;
    GPU_StMiss
        .name(name() + ".gpu_st_misses")
        .desc("stores that miss in the GPU")
        ;

    // CP cache stats
    CP_TCPLdHits
        .name(name() + ".cp_tcp_ld_hits")
        .desc("loads that hit in the TCP")
        ;
    CP_TCPLdTransfers
        .name(name() + ".cp_tcp_ld_transfers")
        .desc("TCP to TCP load transfers")
        ;
    CP_TCCLdHits
        .name(name() + ".cp_tcc_ld_hits")
        .desc("loads that hit in the TCC")
        ;
    CP_LdMiss
        .name(name() + ".cp_ld_misses")
        .desc("loads that miss in the GPU")
        ;
    CP_TCPStHits
        .name(name() + ".cp_tcp_st_hits")
        .desc("stores that hit in the TCP")
        ;
    CP_TCPStTransfers
        .name(name() + ".cp_tcp_st_transfers")
        .desc("TCP to TCP store transfers")
        ;
    CP_TCCStHits
        .name(name() + ".cp_tcc_st_hits")
        .desc("stores that hit in the TCC")
        ;
    CP_StMiss
        .name(name() + ".cp_st_misses")
        .desc("stores that miss in the GPU")
        ;
}