src/mem/ruby/system/GPUCoalescer.cc
1 /*
2 * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * For use for simulation and test purposes only
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the copyright holder nor the names of its
18 * contributors may be used to endorse or promote products derived from this
19 * software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 */
33
34 #include "base/logging.hh"
35 #include "base/str.hh"
36 #include "config/the_isa.hh"
37
38 #if THE_ISA == X86_ISA
39 #include "arch/x86/insts/microldstop.hh"
40
41 #endif // X86_ISA
42 #include "mem/ruby/system/GPUCoalescer.hh"
43
44 #include "cpu/testers/rubytest/RubyTester.hh"
45 #include "debug/GPUCoalescer.hh"
46 #include "debug/MemoryAccess.hh"
47 #include "debug/ProtocolTrace.hh"
48 #include "debug/RubyPort.hh"
49 #include "debug/RubyStats.hh"
50 #include "gpu-compute/shader.hh"
51 #include "mem/packet.hh"
52 #include "mem/ruby/common/DataBlock.hh"
53 #include "mem/ruby/common/SubBlock.hh"
54 #include "mem/ruby/network/MessageBuffer.hh"
55 #include "mem/ruby/profiler/Profiler.hh"
56 #include "mem/ruby/slicc_interface/AbstractController.hh"
57 #include "mem/ruby/slicc_interface/RubyRequest.hh"
58 #include "mem/ruby/structures/CacheMemory.hh"
59 #include "mem/ruby/system/RubySystem.hh"
60 #include "params/RubyGPUCoalescer.hh"
61
62 using namespace std;
63
64 GPUCoalescer *
65 RubyGPUCoalescerParams::create()
66 {
67 return new GPUCoalescer(this);
68 }
69
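// Helpers that translate the scope and segment flags carried on a gem5
// Request into the HSAScope/HSASegment enums used by the Ruby GPU
// protocols.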
70 HSAScope
71 reqScopeToHSAScope(const RequestPtr &req)
72 {
73 HSAScope accessScope = HSAScope_UNSPECIFIED;
74 if (req->isScoped()) {
75 if (req->isWavefrontScope()) {
76 accessScope = HSAScope_WAVEFRONT;
77 } else if (req->isWorkgroupScope()) {
78 accessScope = HSAScope_WORKGROUP;
79 } else if (req->isDeviceScope()) {
80 accessScope = HSAScope_DEVICE;
81 } else if (req->isSystemScope()) {
82 accessScope = HSAScope_SYSTEM;
83 } else {
84 fatal("Bad scope type");
85 }
86 }
87 return accessScope;
88 }
89
90 HSASegment
91 reqSegmentToHSASegment(const RequestPtr &req)
92 {
93 HSASegment accessSegment = HSASegment_GLOBAL;
94
95 if (req->isGlobalSegment()) {
96 accessSegment = HSASegment_GLOBAL;
97 } else if (req->isGroupSegment()) {
98 accessSegment = HSASegment_GROUP;
99 } else if (req->isPrivateSegment()) {
100 accessSegment = HSASegment_PRIVATE;
101 } else if (req->isKernargSegment()) {
102 accessSegment = HSASegment_KERNARG;
103 } else if (req->isReadonlySegment()) {
104 accessSegment = HSASegment_READONLY;
105 } else if (req->isSpillSegment()) {
106 accessSegment = HSASegment_SPILL;
107 } else if (req->isArgSegment()) {
108 accessSegment = HSASegment_ARG;
109 } else {
110 fatal("Bad segment type");
111 }
112
113 return accessSegment;
114 }
115
116 UncoalescedTable::UncoalescedTable(GPUCoalescer *gc)
117 : coalescer(gc)
118 {
119 }
120
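// Packets are grouped by the dynamic sequence number of the memory
// instruction that generated them, so packets issued by the same
// instruction share one table entry.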
121 void
122 UncoalescedTable::insertPacket(PacketPtr pkt)
123 {
124 uint64_t seqNum = pkt->req->getReqInstSeqNum();
125
126 instMap[seqNum].push_back(pkt);
127 DPRINTF(GPUCoalescer, "Adding 0x%X seqNum %d to map. (map %d vec %d)\n",
128 pkt->getAddr(), seqNum, instMap.size(), instMap[seqNum].size());
129 }
130
131 bool
132 UncoalescedTable::packetAvailable()
133 {
134 return !instMap.empty();
135 }
136
137 PerInstPackets*
138 UncoalescedTable::getInstPackets(int offset)
139 {
140 if (offset >= instMap.size()) {
141 return nullptr;
142 }
143
144 auto instMapIter = instMap.begin();
145 std::advance(instMapIter, offset);
146
147 return &(instMapIter->second);
148 }
149
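// Retire table entries whose packets have all been coalesced and return
// one token through the GM token port for each retired instruction.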
150 void
151 UncoalescedTable::updateResources()
152 {
153 for (auto iter = instMap.begin(); iter != instMap.end(); ) {
154 if (iter->second.empty()) {
155 instMap.erase(iter++);
156 coalescer->getGMTokenPort().sendTokens(1);
157 } else {
158 ++iter;
159 }
160 }
161 }
162
163 void
164 UncoalescedTable::printRequestTable(std::stringstream& ss)
165 {
166 ss << "UncoalescedTable contains " << instMap.size()
167 << " instruction entries." << std::endl;
168 for (auto& inst : instMap) {
169 ss << "Instruction seqNum " << inst.first
170 << " with " << inst.second.size() << " packets"
171 << std::endl;
172 }
173 }
174
175 void
176 UncoalescedTable::checkDeadlock(Tick threshold)
177 {
178 Tick current_time = curTick();
179
180 for (auto &it : instMap) {
181 for (auto &pkt : it.second) {
182 if (current_time - pkt->req->time() > threshold) {
183 std::stringstream ss;
184 printRequestTable(ss);
185
186 panic("Possible Deadlock detected. Aborting!\n"
187 "version: %d request.paddr: 0x%x uncoalescedTable: %d "
188 "current time: %u issue_time: %d difference: %d\n"
189 "Request Tables:\n\n%s", coalescer->getId(),
190 pkt->getAddr(), instMap.size(), current_time,
191 pkt->req->time(), current_time - pkt->req->time(),
192 ss.str());
193 }
194 }
195 }
196 }
197
198 GPUCoalescer::GPUCoalescer(const Params *p)
199 : RubyPort(p),
200 issueEvent([this]{ completeIssue(); }, "Issue coalesced request",
201 false, Event::Progress_Event_Pri),
202 uncoalescedTable(this),
203 deadlockCheckEvent([this]{ wakeup(); }, "GPUCoalescer deadlock check"),
204 gmTokenPort(name() + ".gmTokenPort", this)
205 {
206 m_store_waiting_on_load_cycles = 0;
207 m_store_waiting_on_store_cycles = 0;
208 m_load_waiting_on_store_cycles = 0;
209 m_load_waiting_on_load_cycles = 0;
210
211 m_outstanding_count = 0;
212
213 coalescingWindow = p->max_coalesces_per_cycle;
214
215 m_max_outstanding_requests = 0;
216 m_instCache_ptr = nullptr;
217 m_dataCache_ptr = nullptr;
218
219 m_instCache_ptr = p->icache;
220 m_dataCache_ptr = p->dcache;
221 m_max_outstanding_requests = p->max_outstanding_requests;
222 m_deadlock_threshold = p->deadlock_threshold;
223
224 assert(m_max_outstanding_requests > 0);
225 assert(m_deadlock_threshold > 0);
226 assert(m_instCache_ptr);
227 assert(m_dataCache_ptr);
228
229 m_runningGarnetStandalone = p->garnet_standalone;
230 assumingRfOCoherence = p->assume_rfo;
231 }
232
233 GPUCoalescer::~GPUCoalescer()
234 {
235 }
236
237 Port &
238 GPUCoalescer::getPort(const std::string &if_name, PortID idx)
239 {
240 if (if_name == "gmTokenPort") {
241 return gmTokenPort;
242 }
243
244 // delegate to RubyPort otherwise
245 return RubyPort::getPort(if_name, idx);
246 }
247
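// Deadlock check: runs periodically while requests are outstanding and
// panics if any coalesced or uncoalesced request has been waiting longer
// than the configured deadlock threshold.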
248 void
249 GPUCoalescer::wakeup()
250 {
251 Cycles current_time = curCycle();
252 for (auto& requestList : coalescedTable) {
253 for (auto& req : requestList.second) {
254 if (current_time - req->getIssueTime() > m_deadlock_threshold) {
255 std::stringstream ss;
256 printRequestTable(ss);
257 ss << "Outstanding requests: " << m_outstanding_count
258 << std::endl;
259
260 panic("Possible Deadlock detected. Aborting!\n"
261 "version: %d request.paddr: 0x%x coalescedTable: %d "
262 "current time: %u issue_time: %d difference: %d\n"
263 "Request Tables:\n %s", m_version,
264 req->getFirstPkt()->getAddr(),
265 coalescedTable.size(), cyclesToTicks(current_time),
266 cyclesToTicks(req->getIssueTime()),
267 cyclesToTicks(current_time - req->getIssueTime()),
268 ss.str());
269 }
270 }
271 }
272
273 Tick tick_threshold = cyclesToTicks(m_deadlock_threshold);
274 uncoalescedTable.checkDeadlock(tick_threshold);
275
276 if (m_outstanding_count > 0) {
277 schedule(deadlockCheckEvent,
278 m_deadlock_threshold * clockPeriod() +
279 curTick());
280 }
281 }
282
283 void
284 GPUCoalescer::printRequestTable(std::stringstream& ss)
285 {
286 uncoalescedTable.printRequestTable(ss);
287
288 ss << "CoalescedTable contains " << coalescedTable.size()
289 << " address entries." << std::endl;
290 for (auto& requestList : coalescedTable) {
291 ss << "Addr 0x" << std::hex << requestList.first << std::dec
292 << ": type-";
293 for (auto& request : requestList.second) {
294 ss << RubyRequestType_to_string(request->getRubyType())
295 << " pkts-" << request->getPackets().size()
296 << " issued-" << request->getIssueTime() << " seqNum-"
297 << request->getSeqNum() << "; ";
298 }
299 ss << std::endl;
300 }
301 }
302
303 void
304 GPUCoalescer::resetStats()
305 {
306 m_latencyHist.reset();
307 m_missLatencyHist.reset();
308 for (int i = 0; i < RubyRequestType_NUM; i++) {
309 m_typeLatencyHist[i]->reset();
310 m_missTypeLatencyHist[i]->reset();
311 for (int j = 0; j < MachineType_NUM; j++) {
312 m_missTypeMachLatencyHist[i][j]->reset();
313 }
314 }
315
316 for (int i = 0; i < MachineType_NUM; i++) {
317 m_missMachLatencyHist[i]->reset();
318
319 m_IssueToInitialDelayHist[i]->reset();
320 m_InitialToForwardDelayHist[i]->reset();
321 m_ForwardToFirstResponseDelayHist[i]->reset();
322 m_FirstResponseToCompletionDelayHist[i]->reset();
323 }
324 }
325
326 void
327 GPUCoalescer::printProgress(ostream& out) const
328 {
329 }
330
331 // sets the kernelEndList
332 void
333 GPUCoalescer::insertKernel(int wavefront_id, PacketPtr pkt)
334 {
335 // It is not clear whether this can actually happen, but be careful
336 // here so that a duplicate wavefront id does not silently turn into
337 // a simulator hang in the future.
338 DPRINTF(GPUCoalescer, "inserting wf: %d into kernelEndList\n", wavefront_id);
339 assert(kernelEndList.count(wavefront_id) == 0);
340
341 kernelEndList[wavefront_id] = pkt;
342 DPRINTF(GPUCoalescer, "kernelEndList->size() = %d\n",
343 kernelEndList.size());
344 }
345
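// The shorter writeCallback (and readCallback) overloads below simply
// forward to the full version, filling in a NULL machine type, zero
// timing values, and isRegion = false.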
346 void
347 GPUCoalescer::writeCallback(Addr address, DataBlock& data)
348 {
349 writeCallback(address, MachineType_NULL, data);
350 }
351
352 void
353 GPUCoalescer::writeCallback(Addr address,
354 MachineType mach,
355 DataBlock& data)
356 {
357 writeCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
358 }
359
360 void
361 GPUCoalescer::writeCallback(Addr address,
362 MachineType mach,
363 DataBlock& data,
364 Cycles initialRequestTime,
365 Cycles forwardRequestTime,
366 Cycles firstResponseTime)
367 {
368 writeCallback(address, mach, data,
369 initialRequestTime, forwardRequestTime, firstResponseTime,
370 false);
371 }
372
373 void
374 GPUCoalescer::writeCallback(Addr address,
375 MachineType mach,
376 DataBlock& data,
377 Cycles initialRequestTime,
378 Cycles forwardRequestTime,
379 Cycles firstResponseTime,
380 bool isRegion)
381 {
382 assert(address == makeLineAddress(address));
383 assert(coalescedTable.count(address));
384
385 auto crequest = coalescedTable.at(address).front();
386
387 hitCallback(crequest, mach, data, true, crequest->getIssueTime(),
388 forwardRequestTime, firstResponseTime, isRegion);
389
390 delete crequest;
391 coalescedTable.at(address).pop_front();
392
393 if (coalescedTable.at(address).empty()) {
394 coalescedTable.erase(address);
395 } else {
396 auto nextRequest = coalescedTable.at(address).front();
397 issueRequest(nextRequest);
398 }
399 }
400
401 void
402 GPUCoalescer::readCallback(Addr address, DataBlock& data)
403 {
404 readCallback(address, MachineType_NULL, data);
405 }
406
407 void
408 GPUCoalescer::readCallback(Addr address,
409 MachineType mach,
410 DataBlock& data)
411 {
412 readCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
413 }
414
415 void
416 GPUCoalescer::readCallback(Addr address,
417 MachineType mach,
418 DataBlock& data,
419 Cycles initialRequestTime,
420 Cycles forwardRequestTime,
421 Cycles firstResponseTime)
422 {
423
424 readCallback(address, mach, data,
425 initialRequestTime, forwardRequestTime, firstResponseTime,
426 false);
427 }
428
429 void
430 GPUCoalescer::readCallback(Addr address,
431 MachineType mach,
432 DataBlock& data,
433 Cycles initialRequestTime,
434 Cycles forwardRequestTime,
435 Cycles firstResponseTime,
436 bool isRegion)
437 {
438 assert(address == makeLineAddress(address));
439 assert(coalescedTable.count(address));
440
441 auto crequest = coalescedTable.at(address).front();
442 fatal_if(crequest->getRubyType() != RubyRequestType_LD,
443 "readCallback received non-read type response\n");
444
445 // Iterate over the coalesced requests to respond to as many loads as
446 // possible until another request type is seen. Models MSHR for TCP.
447 while (crequest->getRubyType() == RubyRequestType_LD) {
448 hitCallback(crequest, mach, data, true, crequest->getIssueTime(),
449 forwardRequestTime, firstResponseTime, isRegion);
450
451 delete crequest;
452 coalescedTable.at(address).pop_front();
453 if (coalescedTable.at(address).empty()) {
454 break;
455 }
456
457 crequest = coalescedTable.at(address).front();
458 }
459
460 if (coalescedTable.at(address).empty()) {
461 coalescedTable.erase(address);
462 } else {
463 auto nextRequest = coalescedTable.at(address).front();
464 issueRequest(nextRequest);
465 }
466 }
467
468 void
469 GPUCoalescer::hitCallback(CoalescedRequest* crequest,
470 MachineType mach,
471 DataBlock& data,
472 bool success,
473 Cycles initialRequestTime,
474 Cycles forwardRequestTime,
475 Cycles firstResponseTime,
476 bool isRegion)
477 {
478 PacketPtr pkt = crequest->getFirstPkt();
479 Addr request_address = pkt->getAddr();
480 Addr request_line_address = makeLineAddress(request_address);
481
482 RubyRequestType type = crequest->getRubyType();
483
484 DPRINTF(GPUCoalescer, "Got hitCallback for 0x%X\n", request_line_address);
485
486 recordMissLatency(crequest, mach,
487 initialRequestTime,
488 forwardRequestTime,
489 firstResponseTime,
490 success, isRegion);
491 // Update the data for every packet that was coalesced into this
492 // request; each packet may read or write a different portion of
493 // the cache line.
494 std::vector<PacketPtr> pktList = crequest->getPackets();
495 DPRINTF(GPUCoalescer, "Responding to %d packets for addr 0x%X\n",
496 pktList.size(), request_line_address);
497 for (auto& pkt : pktList) {
498 request_address = pkt->getAddr();
499 if (pkt->getPtr<uint8_t>()) {
500 if ((type == RubyRequestType_LD) ||
501 (type == RubyRequestType_ATOMIC) ||
502 (type == RubyRequestType_ATOMIC_RETURN) ||
503 (type == RubyRequestType_IFETCH) ||
504 (type == RubyRequestType_RMW_Read) ||
505 (type == RubyRequestType_Locked_RMW_Read) ||
506 (type == RubyRequestType_Load_Linked)) {
507 pkt->setData(
508 data.getData(getOffset(request_address), pkt->getSize()));
509 } else {
510 data.setData(pkt->getPtr<uint8_t>(),
511 getOffset(request_address), pkt->getSize());
512 }
513 } else {
514 DPRINTF(MemoryAccess,
515 "WARNING. Data not transferred from Ruby to M5 for type " \
516 "%s\n",
517 RubyRequestType_to_string(type));
518 }
519
520 // If using the RubyTester, update the RubyTester sender state's
521 // subBlock with the received data. The tester will later access
522 // this state.
523 // Note: RubyPort will access its sender state before the
524 // RubyTester.
525 if (m_usingRubyTester) {
526 RubyPort::SenderState *requestSenderState =
527 safe_cast<RubyPort::SenderState*>(pkt->senderState);
528 RubyTester::SenderState* testerSenderState =
529 safe_cast<RubyTester::SenderState*>
530 (requestSenderState->predecessor);
531 testerSenderState->subBlock.mergeFrom(data);
532 }
533 }
534
535
536
537 m_outstanding_count--;
538 assert(m_outstanding_count >= 0);
539
540 completeHitCallback(pktList);
541 }
542
543 bool
544 GPUCoalescer::empty() const
545 {
546 return coalescedTable.empty();
547 }
548
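// Map a gem5 packet onto the RubyRequestType understood by the GPU
// protocol (atomics, loads, and stores only).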
549 RubyRequestType
550 GPUCoalescer::getRequestType(PacketPtr pkt)
551 {
552 RubyRequestType req_type = RubyRequestType_NULL;
553
554 // These types are not supported or are not used in GPU caches.
555 assert(!pkt->req->isLLSC());
556 assert(!pkt->req->isLockedRMW());
557 assert(!pkt->req->isInstFetch());
558 assert(!pkt->isFlush());
559
560 if (pkt->req->isAtomicReturn()) {
561 req_type = RubyRequestType_ATOMIC_RETURN;
562 } else if (pkt->req->isAtomicNoReturn()) {
563 req_type = RubyRequestType_ATOMIC_NO_RETURN;
564 } else if (pkt->isRead()) {
565 req_type = RubyRequestType_LD;
566 } else if (pkt->isWrite()) {
567 req_type = RubyRequestType_ST;
568 } else {
569 // Acquire and release packets will have been issued by
570 // makeRequest, so we do not need to check for them here.
571 panic("Unsupported ruby packet type\n");
572 }
573
574 return req_type;
575 }
576
577 // Places an uncoalesced packet in uncoalescedTable. If the packet is a
578 // special type (MemFence, scoping, etc), it is issued immediately.
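// All other packets are buffered in the uncoalescedTable and coalesced
// by completeIssue, which is scheduled in the same tick.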
579 RequestStatus
580 GPUCoalescer::makeRequest(PacketPtr pkt)
581 {
582 // Check for GPU Barrier Kernel End or Kernel Begin
583 // Leave these to be handled by the child class
584 // Kernel End/Barrier = isFlush + isRelease
585 // Kernel Begin = isFlush + isAcquire
586 if (pkt->req->isKernel()) {
587 if (pkt->req->isAcquire()) {
588 // This is a kernel begin; leave handling to the virtual
589 // xCoalescer::makeRequest.
590 return RequestStatus_Issued;
591 } else if (pkt->req->isRelease()) {
592 // This is a kernel end; leave handling to the virtual
593 // xCoalescer::makeRequest.
594 // If we are here then no derived class overrode this
595 // function, so we will also schedule the callback
596 // from here.
597 int wf_id = 0;
598 if (pkt->req->hasContextId()) {
599 wf_id = pkt->req->contextId();
600 }
601 insertKernel(wf_id, pkt);
602 newKernelEnds.push_back(wf_id);
603 if (!issueEvent.scheduled()) {
604 schedule(issueEvent, curTick());
605 }
606 return RequestStatus_Issued;
607 }
608 }
609
610 if (!pkt->isLLSC() && !pkt->req->isLockedRMW() && !pkt->isAtomicOp() &&
611 !pkt->isRead() && !pkt->isWrite() && !pkt->isFlush() &&
612 (pkt->req->isRelease() || pkt->req->isAcquire())) {
613 if (assumingRfOCoherence) {
614 // If we reached here, this request must be a memFence.
615 // Because the protocol implements RfO, the coalescer can
616 // assume sequential consistency and schedule the callback
617 // immediately.
618 // Currently the code implements fence callbacks
619 // by reusing the mechanism for kernel completions.
620 // This should be fixed.
621 int wf_id = 0;
622 if (pkt->req->hasContextId()) {
623 wf_id = pkt->req->contextId();
624 }
625 insertKernel(wf_id, pkt);
626 newKernelEnds.push_back(wf_id);
627 if (!issueEvent.scheduled()) {
628 schedule(issueEvent, curTick());
629 }
630 return RequestStatus_Issued;
631 } else {
632 // If not RfO, return issued here and let the child coalescer
633 // take care of it.
634 return RequestStatus_Issued;
635 }
636 }
637
638 uncoalescedTable.insertPacket(pkt);
639 DPRINTF(GPUCoalescer, "UC insertPacket 0x%X\n", pkt->getAddr());
640
641 if (!issueEvent.scheduled())
642 schedule(issueEvent, curTick());
643 // TODO: issue hardware prefetches here
644 return RequestStatus_Issued;
645 }
646
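// Build a single RubyRequest that covers every packet coalesced into
// crequest: a per-byte access mask marks the touched bytes, write data
// and atomic operations are gathered into a DataBlock, and the message
// is enqueued on the mandatory queue.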
647 void
648 GPUCoalescer::issueRequest(CoalescedRequest* crequest)
649 {
650 PacketPtr pkt = crequest->getFirstPkt();
651
652 int proc_id = -1;
653 if (pkt != NULL && pkt->req->hasContextId()) {
654 proc_id = pkt->req->contextId();
655 }
656
657 // If valid, copy the pc to the ruby request
658 Addr pc = 0;
659 if (pkt->req->hasPC()) {
660 pc = pkt->req->getPC();
661 }
662
663 // At the moment, setting scopes only matters for GPU spill space
664 // accesses, which is pkt->req->isStack().
665 // The scope for spill accesses is REPLACE since spill data does not
666 // need to be flushed at the end of a kernel.
667 // Private and local data, on the other hand, may need to be visible
668 // at the end of the kernel.
670 HSASegment accessSegment = reqSegmentToHSASegment(pkt->req);
671 HSAScope accessScope = reqScopeToHSAScope(pkt->req);
672
673 Addr line_addr = makeLineAddress(pkt->getAddr());
674
675 // Create a WriteMask that records written bytes and atomic
676 // operations. This enables partial writes and partial reads of
677 // those writes.
678 DataBlock dataBlock;
679 dataBlock.clear();
680 uint32_t blockSize = RubySystem::getBlockSizeBytes();
681 std::vector<bool> accessMask(blockSize,false);
682 std::vector< std::pair<int,AtomicOpFunctor*> > atomicOps;
683 uint32_t tableSize = crequest->getPackets().size();
684 for (int i = 0; i < tableSize; i++) {
685 PacketPtr tmpPkt = crequest->getPackets()[i];
686 uint32_t tmpOffset = (tmpPkt->getAddr()) - line_addr;
687 uint32_t tmpSize = tmpPkt->getSize();
688 if (tmpPkt->isAtomicOp()) {
689 std::pair<int,AtomicOpFunctor *> tmpAtomicOp(tmpOffset,
690 tmpPkt->getAtomicOp());
691 atomicOps.push_back(tmpAtomicOp);
692 } else if (tmpPkt->isWrite()) {
693 dataBlock.setData(tmpPkt->getPtr<uint8_t>(),
694 tmpOffset, tmpSize);
695 }
696 for (int j = 0; j < tmpSize; j++) {
697 accessMask[tmpOffset + j] = true;
698 }
699 }
700 std::shared_ptr<RubyRequest> msg;
701 if (pkt->isAtomicOp()) {
702 msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
703 pkt->getPtr<uint8_t>(),
704 pkt->getSize(), pc, crequest->getRubyType(),
705 RubyAccessMode_Supervisor, pkt,
706 PrefetchBit_No, proc_id, 100,
707 blockSize, accessMask,
708 dataBlock, atomicOps,
709 accessScope, accessSegment);
710 } else {
711 msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
712 pkt->getPtr<uint8_t>(),
713 pkt->getSize(), pc, crequest->getRubyType(),
714 RubyAccessMode_Supervisor, pkt,
715 PrefetchBit_No, proc_id, 100,
716 blockSize, accessMask,
717 dataBlock,
718 accessScope, accessSegment);
719 }
720 DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %s\n",
721 curTick(), m_version, "Coal", "Begin", "", "",
722 printAddress(msg->getPhysicalAddress()),
723 RubyRequestType_to_string(crequest->getRubyType()));
724
725 fatal_if(crequest->getRubyType() == RubyRequestType_IFETCH,
726 "there should not be any I-Fetch requests in the GPU Coalescer");
727
728 Tick latency = cyclesToTicks(
729 m_controller->mandatoryQueueLatency(crequest->getRubyType()));
730 assert(latency > 0);
731
732 if (!deadlockCheckEvent.scheduled()) {
733 schedule(deadlockCheckEvent,
734 m_deadlock_threshold * clockPeriod() +
735 curTick());
736 }
737
738 assert(m_mandatory_q_ptr);
739 m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency);
740 }
741
742 template <class KEY, class VALUE>
743 std::ostream &
744 operator<<(ostream &out, const std::unordered_map<KEY, VALUE> &map)
745 {
746 out << "[";
747 for (auto i = map.begin(); i != map.end(); ++i)
748 out << " " << i->first << "=" << i->second;
749 out << " ]";
750
751 return out;
752 }
753
754 void
755 GPUCoalescer::print(ostream& out) const
756 {
757 out << "[GPUCoalescer: " << m_version
758 << ", outstanding requests: " << m_outstanding_count
759 << "]";
760 }
761
762
763 void
764 GPUCoalescer::recordRequestType(SequencerRequestType requestType) {
765 DPRINTF(RubyStats, "Recorded statistic: %s\n",
766 SequencerRequestType_to_string(requestType));
767 }
768
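// Try to coalesce one uncoalesced packet. The packet joins an existing
// CoalescedRequest only if it targets the same cache line and carries
// the same instruction seqNum; otherwise a new CoalescedRequest is
// created, provided the outstanding-request limit has not been reached.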
769 bool
770 GPUCoalescer::coalescePacket(PacketPtr pkt)
771 {
772 uint64_t seqNum = pkt->req->getReqInstSeqNum();
773 Addr line_addr = makeLineAddress(pkt->getAddr());
774
775 // If the packet has the same line address as a request already in the
776 // coalescedTable and has the same sequence number, it can be coalesced.
777 if (coalescedTable.count(line_addr)) {
778 // Search for a previous coalesced request with the same seqNum.
779 auto& creqQueue = coalescedTable.at(line_addr);
780 auto citer = std::find_if(creqQueue.begin(), creqQueue.end(),
781 [&](CoalescedRequest* c) { return c->getSeqNum() == seqNum; }
782 );
783 if (citer != creqQueue.end()) {
784 (*citer)->insertPacket(pkt);
785 return true;
786 }
787 }
788
789 if (m_outstanding_count < m_max_outstanding_requests) {
790 // This is an "aliased" or new request. Create a CoalescedRequest
791 // and append it to the list of "targets" in the coalescing table.
792 DPRINTF(GPUCoalescer, "Creating new or aliased request for 0x%X\n",
793 line_addr);
794
795 CoalescedRequest *creq = new CoalescedRequest(seqNum);
796 creq->insertPacket(pkt);
797 creq->setRubyType(getRequestType(pkt));
798 creq->setIssueTime(curCycle());
799
800 if (!coalescedTable.count(line_addr)) {
801 // If there is no outstanding request for this line address,
802 // create a new coalesced request and issue it immediately.
803 auto reqList = std::deque<CoalescedRequest*> { creq };
804 coalescedTable.insert(std::make_pair(line_addr, reqList));
805
806 DPRINTF(GPUCoalescer, "Issued req type %s seqNum %d\n",
807 RubyRequestType_to_string(creq->getRubyType()), seqNum);
808 issueRequest(creq);
809 } else {
810 // The request is for a line address that is already outstanding
811 // but for a different instruction. Add it as a new request to be
812 // issued when the current outstanding request is completed.
813 coalescedTable.at(line_addr).push_back(creq);
814 DPRINTF(GPUCoalescer, "found address 0x%X with new seqNum %d\n",
815 line_addr, seqNum);
816 }
817
818 // In both cases, requests are added to the coalescing table and will
819 // be counted as outstanding requests.
820 m_outstanding_count++;
821
822 return true;
823 }
824
825 // The maximum number of outstanding requests have been issued.
826 return false;
827 }
828
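// Runs from issueEvent: coalesces packets for up to coalescingWindow
// instructions this cycle, retires fully coalesced instructions from
// the uncoalesced table, and fires any pending kernel-end callbacks.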
829 void
830 GPUCoalescer::completeIssue()
831 {
832 // Iterate over the maximum number of instructions we can coalesce
833 // per cycle (coalescingWindow).
834 for (int instIdx = 0; instIdx < coalescingWindow; ++instIdx) {
835 PerInstPackets *pktList =
836 uncoalescedTable.getInstPackets(instIdx);
837
838 // getInstPackets will return nullptr if no instruction
839 // exists at the current offset.
840 if (!pktList) {
841 break;
842 } else {
843 // Since we have a pointer to the list of packets in the inst,
844 // erase them from the list if coalescing is successful and
845 // leave them in the list otherwise. This aggressively attempts
846 // to coalesce as many packets as possible from the current inst.
847 pktList->remove_if(
848 [&](PacketPtr pkt) { return coalescePacket(pkt); }
849 );
850 }
851 }
852
853 // Clean up any instructions in the uncoalesced table that have had
854 // all of their packets coalesced, and return a GM token for each one.
855 uncoalescedTable.updateResources();
856
857 // Have any kernel-end releases been issued this cycle?
858 int len = newKernelEnds.size();
859 for (int i = 0; i < len; i++) {
860 kernelCallback(newKernelEnds[i]);
861 }
862 newKernelEnds.clear();
863 }
864
865 void
866 GPUCoalescer::evictionCallback(Addr address)
867 {
868 ruby_eviction_callback(address);
869 }
870
871 void
872 GPUCoalescer::kernelCallback(int wavefront_id)
873 {
874 assert(kernelEndList.count(wavefront_id));
875
876 ruby_hit_callback(kernelEndList[wavefront_id]);
877
878 kernelEndList.erase(wavefront_id);
879 }
880
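// Completion path for atomic requests; analogous to writeCallback but
// verifies that the request at the head of the queue is an atomic.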
881 void
882 GPUCoalescer::atomicCallback(Addr address,
883 MachineType mach,
884 const DataBlock& data)
885 {
886 assert(address == makeLineAddress(address));
887 assert(coalescedTable.count(address));
888
889 auto crequest = coalescedTable.at(address).front();
890
891 fatal_if((crequest->getRubyType() != RubyRequestType_ATOMIC &&
892 crequest->getRubyType() != RubyRequestType_ATOMIC_RETURN &&
893 crequest->getRubyType() != RubyRequestType_ATOMIC_NO_RETURN),
894 "atomicCallback saw non-atomic type response\n");
895
896 hitCallback(crequest, mach, (DataBlock&)data, true,
897 crequest->getIssueTime(), Cycles(0), Cycles(0), false);
898
899 delete crequest;
900 coalescedTable.at(address).pop_front();
901
902 if (coalescedTable.at(address).empty()) {
903 coalescedTable.erase(address);
904 } else {
905 auto nextRequest = coalescedTable.at(address).front();
906 issueRequest(nextRequest);
907 }
908 }
909
910 void
911 GPUCoalescer::recordCPReadCallBack(MachineID myMachID, MachineID senderMachID)
912 {
913 if (myMachID == senderMachID) {
914 CP_TCPLdHits++;
915 } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
916 CP_TCPLdTransfers++;
917 } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
918 CP_TCCLdHits++;
919 } else {
920 CP_LdMiss++;
921 }
922 }
923
924 void
925 GPUCoalescer::recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID)
926 {
927 if (myMachID == senderMachID) {
928 CP_TCPStHits++;
929 } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
930 CP_TCPStTransfers++;
931 } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
932 CP_TCCStHits++;
933 } else {
934 CP_StMiss++;
935 }
936 }
937
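// Return each packet to the slave port it arrived on, restoring the
// original sender state, and reschedule the issue event if packets are
// still waiting in the uncoalesced table.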
938 void
939 GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist)
940 {
941 for (auto& pkt : mylist) {
942 RubyPort::SenderState *ss =
943 safe_cast<RubyPort::SenderState *>(pkt->senderState);
944 MemSlavePort *port = ss->port;
945 assert(port != NULL);
946
947 pkt->senderState = ss->predecessor;
948 delete ss;
949 port->hitCallback(pkt);
950 trySendRetries();
951 }
952
953 // We schedule an event in the same tick as hitCallback (similar to
954 // makeRequest) rather than calling completeIssue directly in order
955 // to reduce the number of calls to completeIssue. This can only
956 // happen if the max outstanding requests is less than the number of
957 // slots in the uncoalesced table and makeRequest is not called again.
958 if (uncoalescedTable.packetAvailable() && !issueEvent.scheduled()) {
959 schedule(issueEvent, curTick());
960 }
961
962 testDrainComplete();
963 }
964
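// Record which machine serviced the request and profile the total
// latency, splitting it into the standard Ruby issue/forward/response
// segments when those timestamps are available.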
965 void
966 GPUCoalescer::recordMissLatency(CoalescedRequest* crequest,
967 MachineType mach,
968 Cycles initialRequestTime,
969 Cycles forwardRequestTime,
970 Cycles firstResponseTime,
971 bool success, bool isRegion)
972 {
973 RubyRequestType type = crequest->getRubyType();
974 Cycles issued_time = crequest->getIssueTime();
975 Cycles completion_time = curCycle();
976 assert(completion_time >= issued_time);
977 Cycles total_lat = completion_time - issued_time;
978
979 // cache stats (valid for RfO protocol only)
980 if (mach == MachineType_TCP) {
981 if (type == RubyRequestType_LD) {
982 GPU_TCPLdHits++;
983 } else {
984 GPU_TCPStHits++;
985 }
986 } else if (mach == MachineType_L1Cache_wCC) {
987 if (type == RubyRequestType_LD) {
988 GPU_TCPLdTransfers++;
989 } else {
990 GPU_TCPStTransfers++;
991 }
992 } else if (mach == MachineType_TCC) {
993 if (type == RubyRequestType_LD) {
994 GPU_TCCLdHits++;
995 } else {
996 GPU_TCCStHits++;
997 }
998 } else {
999 if (type == RubyRequestType_LD) {
1000 GPU_LdMiss++;
1001 } else {
1002 GPU_StMiss++;
1003 }
1004 }
1005
1006 // Profile all access latency, even zero latency accesses
1007 m_latencyHist.sample(total_lat);
1008 m_typeLatencyHist[type]->sample(total_lat);
1009
1010 // Profile the miss latency for all non-zero demand misses
1011 if (total_lat != Cycles(0)) {
1012 m_missLatencyHist.sample(total_lat);
1013 m_missTypeLatencyHist[type]->sample(total_lat);
1014
1015 if (mach != MachineType_NUM) {
1016 m_missMachLatencyHist[mach]->sample(total_lat);
1017 m_missTypeMachLatencyHist[type][mach]->sample(total_lat);
1018
1019 if ((issued_time <= initialRequestTime) &&
1020 (initialRequestTime <= forwardRequestTime) &&
1021 (forwardRequestTime <= firstResponseTime) &&
1022 (firstResponseTime <= completion_time)) {
1023
1024 m_IssueToInitialDelayHist[mach]->sample(
1025 initialRequestTime - issued_time);
1026 m_InitialToForwardDelayHist[mach]->sample(
1027 forwardRequestTime - initialRequestTime);
1028 m_ForwardToFirstResponseDelayHist[mach]->sample(
1029 firstResponseTime - forwardRequestTime);
1030 m_FirstResponseToCompletionDelayHist[mach]->sample(
1031 completion_time - firstResponseTime);
1032 }
1033 }
1034
1035 }
1036
1037 DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %d cycles\n",
1038 curTick(), m_version, "Coal",
1039 success ? "Done" : "SC_Failed", "", "",
1040 printAddress(crequest->getFirstPkt()->getAddr()), total_lat);
1041 }
1042
1043 void
1044 GPUCoalescer::regStats()
1045 {
1046 RubyPort::regStats();
1047
1048 // These statistical variables are not for display.
1049 // The profiler will collate these across different
1050 // coalescers and display those collated statistics.
1051 m_outstandReqHist.init(10);
1052 m_latencyHist.init(10);
1053 m_missLatencyHist.init(10);
1054
1055 for (int i = 0; i < RubyRequestType_NUM; i++) {
1056 m_typeLatencyHist.push_back(new Stats::Histogram());
1057 m_typeLatencyHist[i]->init(10);
1058
1059 m_missTypeLatencyHist.push_back(new Stats::Histogram());
1060 m_missTypeLatencyHist[i]->init(10);
1061 }
1062
1063 for (int i = 0; i < MachineType_NUM; i++) {
1064 m_missMachLatencyHist.push_back(new Stats::Histogram());
1065 m_missMachLatencyHist[i]->init(10);
1066
1067 m_IssueToInitialDelayHist.push_back(new Stats::Histogram());
1068 m_IssueToInitialDelayHist[i]->init(10);
1069
1070 m_InitialToForwardDelayHist.push_back(new Stats::Histogram());
1071 m_InitialToForwardDelayHist[i]->init(10);
1072
1073 m_ForwardToFirstResponseDelayHist.push_back(new Stats::Histogram());
1074 m_ForwardToFirstResponseDelayHist[i]->init(10);
1075
1076 m_FirstResponseToCompletionDelayHist.push_back(new Stats::Histogram());
1077 m_FirstResponseToCompletionDelayHist[i]->init(10);
1078 }
1079
1080 for (int i = 0; i < RubyRequestType_NUM; i++) {
1081 m_missTypeMachLatencyHist.push_back(std::vector<Stats::Histogram *>());
1082
1083 for (int j = 0; j < MachineType_NUM; j++) {
1084 m_missTypeMachLatencyHist[i].push_back(new Stats::Histogram());
1085 m_missTypeMachLatencyHist[i][j]->init(10);
1086 }
1087 }
1088
1089 // GPU cache stats
1090 GPU_TCPLdHits
1091 .name(name() + ".gpu_tcp_ld_hits")
1092 .desc("loads that hit in the TCP")
1093 ;
1094 GPU_TCPLdTransfers
1095 .name(name() + ".gpu_tcp_ld_transfers")
1096 .desc("TCP to TCP load transfers")
1097 ;
1098 GPU_TCCLdHits
1099 .name(name() + ".gpu_tcc_ld_hits")
1100 .desc("loads that hit in the TCC")
1101 ;
1102 GPU_LdMiss
1103 .name(name() + ".gpu_ld_misses")
1104 .desc("loads that miss in the GPU")
1105 ;
1106
1107 GPU_TCPStHits
1108 .name(name() + ".gpu_tcp_st_hits")
1109 .desc("stores that hit in the TCP")
1110 ;
1111 GPU_TCPStTransfers
1112 .name(name() + ".gpu_tcp_st_transfers")
1113 .desc("TCP to TCP store transfers")
1114 ;
1115 GPU_TCCStHits
1116 .name(name() + ".gpu_tcc_st_hits")
1117 .desc("stores that hit in the TCC")
1118 ;
1119 GPU_StMiss
1120 .name(name() + ".gpu_st_misses")
1121 .desc("stores that miss in the GPU")
1122 ;
1123
1124 // CP cache stats
1125 CP_TCPLdHits
1126 .name(name() + ".cp_tcp_ld_hits")
1127 .desc("loads that hit in the TCP")
1128 ;
1129 CP_TCPLdTransfers
1130 .name(name() + ".cp_tcp_ld_transfers")
1131 .desc("TCP to TCP load transfers")
1132 ;
1133 CP_TCCLdHits
1134 .name(name() + ".cp_tcc_ld_hits")
1135 .desc("loads that hit in the TCC")
1136 ;
1137 CP_LdMiss
1138 .name(name() + ".cp_ld_misses")
1139 .desc("loads that miss in the GPU")
1140 ;
1141
1142 CP_TCPStHits
1143 .name(name() + ".cp_tcp_st_hits")
1144 .desc("stores that hit in the TCP")
1145 ;
1146 CP_TCPStTransfers
1147 .name(name() + ".cp_tcp_st_transfers")
1148 .desc("TCP to TCP store transfers")
1149 ;
1150 CP_TCCStHits
1151 .name(name() + ".cp_tcc_st_hits")
1152 .desc("stores that hit in the TCC")
1153 ;
1154 CP_StMiss
1155 .name(name() + ".cp_st_misses")
1156 .desc("stores that miss in the GPU")
1157 ;
1158 }