gpu-compute, mem-ruby, configs: Add GCN3 ISA support to GPU model
[gem5.git] / src / mem / ruby / system / GPUCoalescer.cc
1 /*
2 * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * For use for simulation and test purposes only
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the copyright holder nor the names of its
18 * contributors may be used to endorse or promote products derived from this
19 * software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 */
33
34 #include "base/logging.hh"
35 #include "base/str.hh"
36 #include "config/the_isa.hh"
37
38 #if THE_ISA == X86_ISA
39 #include "arch/x86/insts/microldstop.hh"
40
41 #endif // X86_ISA
42 #include "mem/ruby/system/GPUCoalescer.hh"
43
44 #include "cpu/testers/rubytest/RubyTester.hh"
45 #include "debug/GPUCoalescer.hh"
46 #include "debug/MemoryAccess.hh"
47 #include "debug/ProtocolTrace.hh"
48 #include "debug/RubyPort.hh"
49 #include "debug/RubyStats.hh"
50 #include "gpu-compute/shader.hh"
51 #include "mem/packet.hh"
52 #include "mem/ruby/common/DataBlock.hh"
53 #include "mem/ruby/common/SubBlock.hh"
54 #include "mem/ruby/network/MessageBuffer.hh"
55 #include "mem/ruby/profiler/Profiler.hh"
56 #include "mem/ruby/slicc_interface/AbstractController.hh"
57 #include "mem/ruby/slicc_interface/RubyRequest.hh"
58 #include "mem/ruby/structures/CacheMemory.hh"
59 #include "mem/ruby/system/RubySystem.hh"
60 #include "params/RubyGPUCoalescer.hh"
61
62 using namespace std;
63
64 UncoalescedTable::UncoalescedTable(GPUCoalescer *gc)
65 : coalescer(gc)
66 {
67 }
68
69 void
70 UncoalescedTable::insertPacket(PacketPtr pkt)
71 {
72 uint64_t seqNum = pkt->req->getReqInstSeqNum();
73
74 instMap[seqNum].push_back(pkt);
75 DPRINTF(GPUCoalescer, "Adding 0x%X seqNum %d to map. (map %d vec %d)\n",
76 pkt->getAddr(), seqNum, instMap.size(), instMap[seqNum].size());
77 }
78
79 bool
80 UncoalescedTable::packetAvailable()
81 {
82 return !instMap.empty();
83 }
84
85 PerInstPackets*
86 UncoalescedTable::getInstPackets(int offset)
87 {
88 if (offset >= instMap.size()) {
89 return nullptr;
90 }
91
92 auto instMapIter = instMap.begin();
93 std::advance(instMapIter, offset);
94
95 return &(instMapIter->second);
96 }
97
98 void
99 UncoalescedTable::updateResources()
100 {
101 for (auto iter = instMap.begin(); iter != instMap.end(); ) {
102 if (iter->second.empty()) {
103 DPRINTF(GPUCoalescer, "Returning token seqNum %d\n", iter->first);
104 instMap.erase(iter++);
105 coalescer->getGMTokenPort().sendTokens(1);
106 } else {
107 ++iter;
108 }
109 }
110 }
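
// Editor's note (illustrative sketch, not part of the original file): the
// uncoalesced table maps an instruction's sequence number to the list of
// per-lane packets that still have to be coalesced, roughly:
//
//     std::map<uint64_t, PerInstPackets> instMap;   // seqNum -> packets
//
//     // insertPacket():     instMap[seqNum].push_back(pkt);
//     // updateResources():  once instMap[seqNum] is empty, erase the
//     //                     entry and return one token on the GM token
//     //                     port so the CU may send another instruction.
//
// The exact container type lives in GPUCoalescer.hh; the assumption here is
// only that one token is held per in-flight instruction.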
111
112 bool
113 UncoalescedTable::areRequestsDone(const uint64_t instSeqNum) {
114 // iterate the instructions held in UncoalescedTable to see whether there
115 // are more requests to issue; if yes, not yet done; otherwise, done
116 for (auto& inst : instMap) {
117 DPRINTF(GPUCoalescer, "instSeqNum= %d, pending packets=%d\n",
118 inst.first, inst.second.size());
119 if (inst.first == instSeqNum) { return false; }
120 }
121
122 return true;
123 }
124
125 void
126 UncoalescedTable::printRequestTable(std::stringstream& ss)
127 {
128 ss << "Listing pending packets from " << instMap.size() << " instructions";
129
130 for (auto& inst : instMap) {
131 ss << "\tAddr: " << printAddress(inst.first) << " with "
132 << inst.second.size() << " pending packets" << std::endl;
133 }
134 }
135
136 void
137 UncoalescedTable::checkDeadlock(Tick threshold)
138 {
139 Tick current_time = curTick();
140
141 for (auto &it : instMap) {
142 for (auto &pkt : it.second) {
143 if (current_time - pkt->req->time() > threshold) {
144 std::stringstream ss;
145 printRequestTable(ss);
146
147 panic("Possible Deadlock detected. Aborting!\n"
148 "version: %d request.paddr: 0x%x uncoalescedTable: %d "
149 "current time: %u issue_time: %d difference: %d\n"
150 "Request Tables:\n\n%s", coalescer->getId(),
151 pkt->getAddr(), instMap.size(), current_time,
152 pkt->req->time(), current_time - pkt->req->time(),
153 ss.str());
154 }
155 }
156 }
157 }
158
159 GPUCoalescer::GPUCoalescer(const Params *p)
160 : RubyPort(p),
161 issueEvent([this]{ completeIssue(); }, "Issue coalesced request",
162 false, Event::Progress_Event_Pri),
163 uncoalescedTable(this),
164 deadlockCheckEvent([this]{ wakeup(); }, "GPUCoalescer deadlock check"),
165 gmTokenPort(name() + ".gmTokenPort", this)
166 {
167 m_store_waiting_on_load_cycles = 0;
168 m_store_waiting_on_store_cycles = 0;
169 m_load_waiting_on_store_cycles = 0;
170 m_load_waiting_on_load_cycles = 0;
171
172 m_outstanding_count = 0;
173
174 coalescingWindow = p->max_coalesces_per_cycle;
175
176 m_max_outstanding_requests = 0;
177 m_instCache_ptr = nullptr;
178 m_dataCache_ptr = nullptr;
179
180 m_instCache_ptr = p->icache;
181 m_dataCache_ptr = p->dcache;
182 m_max_outstanding_requests = p->max_outstanding_requests;
183 m_deadlock_threshold = p->deadlock_threshold;
184
185 assert(m_max_outstanding_requests > 0);
186 assert(m_deadlock_threshold > 0);
187 assert(m_instCache_ptr);
188 assert(m_dataCache_ptr);
189
190 m_runningGarnetStandalone = p->garnet_standalone;
191 }
192
193 GPUCoalescer::~GPUCoalescer()
194 {
195 }
196
197 Port &
198 GPUCoalescer::getPort(const std::string &if_name, PortID idx)
199 {
200 if (if_name == "gmTokenPort") {
201 return gmTokenPort;
202 }
203
204 // delegate to RubyPort otherwise
205 return RubyPort::getPort(if_name, idx);
206 }
207
208 void
209 GPUCoalescer::wakeup()
210 {
211 Cycles current_time = curCycle();
212 for (auto& requestList : coalescedTable) {
213 for (auto& req : requestList.second) {
214 if (current_time - req->getIssueTime() > m_deadlock_threshold) {
215 std::stringstream ss;
216 printRequestTable(ss);
217 warn("GPUCoalescer %d Possible deadlock detected!\n%s\n",
218 m_version, ss.str());
219 panic("Aborting due to deadlock!\n");
220 }
221 }
222 }
223
224 Tick tick_threshold = cyclesToTicks(m_deadlock_threshold);
225 uncoalescedTable.checkDeadlock(tick_threshold);
226
227 if (m_outstanding_count > 0) {
228 schedule(deadlockCheckEvent,
229 m_deadlock_threshold * clockPeriod() +
230 curTick());
231 }
232 }
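
// Editor's note (sketch, not in the original source): the deadlock check
// mixes two time units. Coalesced requests record their issue time in
// Cycles and are compared directly against m_deadlock_threshold, while
// uncoalesced packets carry a Tick timestamp, so the threshold is first
// converted:
//
//     Tick tick_threshold = cyclesToTicks(m_deadlock_threshold);
//     // equivalent to m_deadlock_threshold * clockPeriod()
//
// As long as any request is outstanding, the event re-schedules itself one
// threshold into the future.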
233
234 void
235 GPUCoalescer::printRequestTable(std::stringstream& ss)
236 {
237 ss << "Printing out " << coalescedTable.size()
238 << " outstanding requests in the coalesced table\n";
239
240 for (auto& requestList : coalescedTable) {
241 for (auto& request : requestList.second) {
242 ss << "\tAddr: " << printAddress(requestList.first) << "\n"
243 << "\tInstruction sequence number: "
244 << request->getSeqNum() << "\n"
245 << "\t\tType: "
246 << RubyRequestType_to_string(request->getRubyType()) << "\n"
247 << "\t\tNumber of associated packets: "
248 << request->getPackets().size() << "\n"
249 << "\t\tIssue time: "
250 << request->getIssueTime() * clockPeriod() << "\n"
251 << "\t\tDifference from current tick: "
252 << (curCycle() - request->getIssueTime()) * clockPeriod();
253 }
254 }
255
256 // print out packets waiting to be issued in uncoalesced table
257 uncoalescedTable.printRequestTable(ss);
258 }
259
260 void
261 GPUCoalescer::resetStats()
262 {
263 m_latencyHist.reset();
264 m_missLatencyHist.reset();
265 for (int i = 0; i < RubyRequestType_NUM; i++) {
266 m_typeLatencyHist[i]->reset();
267 m_missTypeLatencyHist[i]->reset();
268 for (int j = 0; j < MachineType_NUM; j++) {
269 m_missTypeMachLatencyHist[i][j]->reset();
270 }
271 }
272
273 for (int i = 0; i < MachineType_NUM; i++) {
274 m_missMachLatencyHist[i]->reset();
275
276 m_IssueToInitialDelayHist[i]->reset();
277 m_InitialToForwardDelayHist[i]->reset();
278 m_ForwardToFirstResponseDelayHist[i]->reset();
279 m_FirstResponseToCompletionDelayHist[i]->reset();
280 }
281 }
282
283 void
284 GPUCoalescer::printProgress(ostream& out) const
285 {
286 }
287
288 // sets the kernelEndList
289 void
290 GPUCoalescer::insertKernel(int wavefront_id, PacketPtr pkt)
291 {
292 // This situation should not arise in practice, but be careful here so
293 // that a duplicate wavefront id cannot silently turn into a simulator
294 // hang later on.
295 DPRINTF(GPUCoalescer, "inserting wf: %d to kernelEndlist\n", wavefront_id);
296 assert(kernelEndList.count(wavefront_id) == 0);
297
298 kernelEndList[wavefront_id] = pkt;
299 DPRINTF(GPUCoalescer, "kernelEndList->size() = %d\n",
300 kernelEndList.size());
301 }
302
303 void
304 GPUCoalescer::writeCallback(Addr address, DataBlock& data)
305 {
306 writeCallback(address, MachineType_NULL, data);
307 }
308
309 void
310 GPUCoalescer::writeCallback(Addr address,
311 MachineType mach,
312 DataBlock& data)
313 {
314 writeCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
315 }
316
317 void
318 GPUCoalescer::writeCallback(Addr address,
319 MachineType mach,
320 DataBlock& data,
321 Cycles initialRequestTime,
322 Cycles forwardRequestTime,
323 Cycles firstResponseTime)
324 {
325 writeCallback(address, mach, data,
326 initialRequestTime, forwardRequestTime, firstResponseTime,
327 false);
328 }
329
330 void
331 GPUCoalescer::writeCallback(Addr address,
332 MachineType mach,
333 DataBlock& data,
334 Cycles initialRequestTime,
335 Cycles forwardRequestTime,
336 Cycles firstResponseTime,
337 bool isRegion)
338 {
339 assert(address == makeLineAddress(address));
340 assert(coalescedTable.count(address));
341
342 auto crequest = coalescedTable.at(address).front();
343
344 hitCallback(crequest, mach, data, true, crequest->getIssueTime(),
345 forwardRequestTime, firstResponseTime, isRegion);
346
347 // remove this crequest in coalescedTable
348 delete crequest;
349 coalescedTable.at(address).pop_front();
350
351 if (coalescedTable.at(address).empty()) {
352 coalescedTable.erase(address);
353 } else {
354 auto nextRequest = coalescedTable.at(address).front();
355 issueRequest(nextRequest);
356 }
357 }
358
359 void
360 GPUCoalescer::writeCompleteCallback(Addr address,
361 uint64_t instSeqNum,
362 MachineType mach)
363 {
364 DPRINTF(GPUCoalescer, "writeCompleteCallback for address 0x%x"
365 " instSeqNum = %d\n", address, instSeqNum);
366
367 assert(pendingWriteInsts.count(instSeqNum) == 1);
368 PendingWriteInst& inst = pendingWriteInsts[instSeqNum];
369
370 // check the uncoalescedTable to see whether all requests for the inst
371 // have been issued or not
372 bool reqsAllIssued = uncoalescedTable.areRequestsDone(instSeqNum);
373 DPRINTF(GPUCoalescer, "instSeqNum = %d, pendingStores=%d, "
374 "reqsAllIssued=%d\n", reqsAllIssued,
375 inst.getNumPendingStores()-1, reqsAllIssued);
376
377 if (inst.receiveWriteCompleteAck() && reqsAllIssued) {
378 // once the pending write instruction has received write completion
379 // callbacks for all of its issued Ruby requests, we can respond to
380 // the requesting CU with a single response packet.
381 inst.ackWriteCompletion(m_usingRubyTester);
382
383 DPRINTF(GPUCoalescer, "write inst %d completed at coalescer\n",
384 instSeqNum);
385 pendingWriteInsts.erase(instSeqNum);
386 }
387 }
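
// Editor's note (descriptive comment added for clarity): a single store
// instruction may be split into several Ruby requests. pendingWriteInsts
// tracks how many write-complete acks are still expected per seqNum, and
// ackWriteCompletion() is only sent back to the CU once every ack has
// arrived *and* the uncoalesced table holds no more packets for that
// instruction (areRequestsDone() above).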
388
389 void
390 GPUCoalescer::readCallback(Addr address, DataBlock& data)
391 {
392 readCallback(address, MachineType_NULL, data);
393 }
394
395 void
396 GPUCoalescer::readCallback(Addr address,
397 MachineType mach,
398 DataBlock& data)
399 {
400 readCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
401 }
402
403 void
404 GPUCoalescer::readCallback(Addr address,
405 MachineType mach,
406 DataBlock& data,
407 Cycles initialRequestTime,
408 Cycles forwardRequestTime,
409 Cycles firstResponseTime)
410 {
411
412 readCallback(address, mach, data,
413 initialRequestTime, forwardRequestTime, firstResponseTime,
414 false);
415 }
416
417 void
418 GPUCoalescer::readCallback(Addr address,
419 MachineType mach,
420 DataBlock& data,
421 Cycles initialRequestTime,
422 Cycles forwardRequestTime,
423 Cycles firstResponseTime,
424 bool isRegion)
425 {
426 assert(address == makeLineAddress(address));
427 assert(coalescedTable.count(address));
428
429 auto crequest = coalescedTable.at(address).front();
430 fatal_if(crequest->getRubyType() != RubyRequestType_LD,
431 "readCallback received non-read type response\n");
432
433 // Iterate over the coalesced requests to respond to as many loads as
434 // possible until another request type is seen. Models MSHR for TCP.
435 while (crequest->getRubyType() == RubyRequestType_LD) {
436 hitCallback(crequest, mach, data, true, crequest->getIssueTime(),
437 forwardRequestTime, firstResponseTime, isRegion);
438
439 delete crequest;
440 coalescedTable.at(address).pop_front();
441 if (coalescedTable.at(address).empty()) {
442 break;
443 }
444
445 crequest = coalescedTable.at(address).front();
446 }
447
448 if (coalescedTable.at(address).empty()) {
449 coalescedTable.erase(address);
450 } else {
451 auto nextRequest = coalescedTable.at(address).front();
452 issueRequest(nextRequest);
453 }
454 }
455
456 void
457 GPUCoalescer::hitCallback(CoalescedRequest* crequest,
458 MachineType mach,
459 DataBlock& data,
460 bool success,
461 Cycles initialRequestTime,
462 Cycles forwardRequestTime,
463 Cycles firstResponseTime,
464 bool isRegion)
465 {
466 PacketPtr pkt = crequest->getFirstPkt();
467 Addr request_address = pkt->getAddr();
468 Addr request_line_address M5_VAR_USED = makeLineAddress(request_address);
469
470 RubyRequestType type = crequest->getRubyType();
471
472 DPRINTF(GPUCoalescer, "Got hitCallback for 0x%X\n", request_line_address);
473
474 recordMissLatency(crequest, mach,
475 initialRequestTime,
476 forwardRequestTime,
477 firstResponseTime,
478 success, isRegion);
479 // update the data
480 //
481 // this must be done for every packet coalesced into this request
482 std::vector<PacketPtr> pktList = crequest->getPackets();
483 DPRINTF(GPUCoalescer, "Responding to %d packets for addr 0x%X\n",
484 pktList.size(), request_line_address);
485 for (auto& pkt : pktList) {
486 request_address = pkt->getAddr();
487 if (pkt->getPtr<uint8_t>()) {
488 if ((type == RubyRequestType_LD) ||
489 (type == RubyRequestType_ATOMIC) ||
490 (type == RubyRequestType_ATOMIC_RETURN) ||
491 (type == RubyRequestType_IFETCH) ||
492 (type == RubyRequestType_RMW_Read) ||
493 (type == RubyRequestType_Locked_RMW_Read) ||
494 (type == RubyRequestType_Load_Linked)) {
495 pkt->setData(
496 data.getData(getOffset(request_address), pkt->getSize()));
497 } else {
498 data.setData(pkt->getPtr<uint8_t>(),
499 getOffset(request_address), pkt->getSize());
500 }
501 } else {
502 DPRINTF(MemoryAccess,
503 "WARNING. Data not transfered from Ruby to M5 for type " \
504 "%s\n",
505 RubyRequestType_to_string(type));
506 }
507 }
508
509
510
511 m_outstanding_count--;
512 assert(m_outstanding_count >= 0);
513
514 completeHitCallback(pktList);
515 }
516
517 bool
518 GPUCoalescer::empty() const
519 {
520 return coalescedTable.empty();
521 }
522
523 RubyRequestType
524 GPUCoalescer::getRequestType(PacketPtr pkt)
525 {
526 RubyRequestType req_type = RubyRequestType_NULL;
527
528 // These types are either not supported or not used in GPU caches.
529 assert(!pkt->req->isLLSC());
530 assert(!pkt->req->isLockedRMW());
531 assert(!pkt->req->isInstFetch());
532 assert(!pkt->isFlush());
533
534 if (pkt->req->isAtomicReturn()) {
535 req_type = RubyRequestType_ATOMIC_RETURN;
536 } else if (pkt->req->isAtomicNoReturn()) {
537 req_type = RubyRequestType_ATOMIC_NO_RETURN;
538 } else if (pkt->isRead()) {
539 req_type = RubyRequestType_LD;
540 } else if (pkt->isWrite()) {
541 req_type = RubyRequestType_ST;
542 } else {
543 panic("Unsupported ruby packet type\n");
544 }
545
546 return req_type;
547 }
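
// Editor's summary (informational comment only): the mapping applied above
// is:
//
//     pkt->req->isAtomicReturn()    -> RubyRequestType_ATOMIC_RETURN
//     pkt->req->isAtomicNoReturn()  -> RubyRequestType_ATOMIC_NO_RETURN
//     pkt->isRead()                 -> RubyRequestType_LD
//     pkt->isWrite()                -> RubyRequestType_ST
//
// LL/SC, locked-RMW, instruction-fetch, and flush packets are rejected by
// the asserts because the GPU caches do not send them down this path.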
548
549 // Places an uncoalesced packet in uncoalescedTable. If the packet is a
550 // special type (MemFence, scoping, etc), it is issued immediately.
551 RequestStatus
552 GPUCoalescer::makeRequest(PacketPtr pkt)
553 {
554 // all packets must have valid instruction sequence numbers
555 assert(pkt->req->hasInstSeqNum());
556
557 if (pkt->cmd == MemCmd::MemSyncReq) {
558 // issue mem_sync requests immediately to the cache system without
559 // going through uncoalescedTable like normal LD/ST/Atomic requests
560 issueMemSyncRequest(pkt);
561 } else {
562 // otherwise, this must be either read or write command
563 assert(pkt->isRead() || pkt->isWrite());
564
565 // the pkt is temporarily stored in the uncoalesced table until
566 // it's picked for coalescing process later in this cycle or in a
567 // future cycle
568 uncoalescedTable.insertPacket(pkt);
569 DPRINTF(GPUCoalescer, "Put pkt with addr 0x%X to uncoalescedTable\n",
570 pkt->getAddr());
571
572 // we schedule an issue event here to process the uncoalesced table
573 // and try to issue Ruby request to cache system
574 if (!issueEvent.scheduled()) {
575 schedule(issueEvent, curTick());
576 }
577 }
578
579 // we always return RequestStatus_Issued in this coalescer because the
580 // coalescer's resources were checked earlier and the coalescer queues
581 // up aliased requests in its coalesced table
582 return RequestStatus_Issued;
583 }
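
// Editor's sketch (hypothetical caller, not part of this file): makeRequest
// relies on every packet carrying the issuing instruction's sequence
// number. A compute-unit port would do something along these lines before
// handing the packet to the coalescer:
//
//     // names such as gpuDynInst and seqNum() are illustrative only
//     RequestPtr req = std::make_shared<Request>(/* addr, size, flags,
//                                                   requestor id ... */);
//     req->setReqInstSeqNum(gpuDynInst->seqNum());
//     PacketPtr pkt = Packet::createWrite(req);
//     coalescer->makeRequest(pkt);    // always RequestStatus_Issued here
//
// The real packet construction lives in gpu-compute; this is only meant to
// show why the assert(pkt->req->hasInstSeqNum()) above holds.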
584
585 /**
586 * TODO: Figure out what to do with this code. This code may go away
587 * and/or be merged into the VIPER coalescer once the VIPER
588 * protocol is re-integrated with the GCN3 code.
589 */
590 /*
591 void
592 GPUCoalescer::issueRequest(CoalescedRequest* crequest)
593 {
594 PacketPtr pkt = crequest->getFirstPkt();
595
596 int proc_id = -1;
597 if (pkt != NULL && pkt->req->hasContextId()) {
598 proc_id = pkt->req->contextId();
599 }
600
601 // If valid, copy the pc to the ruby request
602 Addr pc = 0;
603 if (pkt->req->hasPC()) {
604 pc = pkt->req->getPC();
605 }
606
607 // At the moment setting scopes only counts
608 // for GPU spill space accesses,
609 // i.e., pkt->req->isStack().
610 // This scope is REPLACE since it
611 // does not need to be flushed at the end
612 // of a kernel. Private and local may need
613 // to be visible at the end of the kernel.
614 HSASegment accessSegment = reqSegmentToHSASegment(pkt->req);
615 HSAScope accessScope = reqScopeToHSAScope(pkt->req);
616
617 Addr line_addr = makeLineAddress(pkt->getAddr());
618
619 // Creating WriteMask that records written bytes
620 // and atomic operations. This enables partial writes
621 // and partial reads of those writes
622 DataBlock dataBlock;
623 dataBlock.clear();
624 uint32_t blockSize = RubySystem::getBlockSizeBytes();
625 std::vector<bool> accessMask(blockSize,false);
626 std::vector< std::pair<int,AtomicOpFunctor*> > atomicOps;
627 uint32_t tableSize = crequest->getPackets().size();
628 for (int i = 0; i < tableSize; i++) {
629 PacketPtr tmpPkt = crequest->getPackets()[i];
630 uint32_t tmpOffset = (tmpPkt->getAddr()) - line_addr;
631 uint32_t tmpSize = tmpPkt->getSize();
632 if (tmpPkt->isAtomicOp()) {
633 std::pair<int,AtomicOpFunctor *> tmpAtomicOp(tmpOffset,
634 tmpPkt->getAtomicOp());
635 atomicOps.push_back(tmpAtomicOp);
636 } else if (tmpPkt->isWrite()) {
637 dataBlock.setData(tmpPkt->getPtr<uint8_t>(),
638 tmpOffset, tmpSize);
639 }
640 for (int j = 0; j < tmpSize; j++) {
641 accessMask[tmpOffset + j] = true;
642 }
643 }
644 std::shared_ptr<RubyRequest> msg;
645 if (pkt->isAtomicOp()) {
646 msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
647 pkt->getPtr<uint8_t>(),
648 pkt->getSize(), pc, crequest->getRubyType(),
649 RubyAccessMode_Supervisor, pkt,
650 PrefetchBit_No, proc_id, 100,
651 blockSize, accessMask,
652 dataBlock, atomicOps,
653 accessScope, accessSegment);
654 } else {
655 msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
656 pkt->getPtr<uint8_t>(),
657 pkt->getSize(), pc, crequest->getRubyType(),
658 RubyAccessMode_Supervisor, pkt,
659 PrefetchBit_No, proc_id, 100,
660 blockSize, accessMask,
661 dataBlock,
662 accessScope, accessSegment);
663 }
664 DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %s\n",
665 curTick(), m_version, "Coal", "Begin", "", "",
666 printAddress(msg->getPhysicalAddress()),
667 RubyRequestType_to_string(crequest->getRubyType()));
668
669 fatal_if(crequest->getRubyType() == RubyRequestType_IFETCH,
670 "there should not be any I-Fetch requests in the GPU Coalescer");
671
672 Tick latency = cyclesToTicks(
673 m_controller->mandatoryQueueLatency(crequest->getRubyType()));
674 assert(latency > 0);
675
676 if (!deadlockCheckEvent.scheduled()) {
677 schedule(deadlockCheckEvent,
678 m_deadlock_threshold * clockPeriod() +
679 curTick());
680 }
681
682 assert(m_mandatory_q_ptr);
683 m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency);
684 }*/
685
686 template <class KEY, class VALUE>
687 std::ostream &
688 operator<<(ostream &out, const std::unordered_map<KEY, VALUE> &map)
689 {
690 out << "[";
691 for (auto i = map.begin(); i != map.end(); ++i)
692 out << " " << i->first << "=" << i->second;
693 out << " ]";
694
695 return out;
696 }
697
698 void
699 GPUCoalescer::print(ostream& out) const
700 {
701 out << "[GPUCoalescer: " << m_version
702 << ", outstanding requests: " << m_outstanding_count
703 << "]";
704 }
705
706
707 bool
708 GPUCoalescer::coalescePacket(PacketPtr pkt)
709 {
710 uint64_t seqNum = pkt->req->getReqInstSeqNum();
711 Addr line_addr = makeLineAddress(pkt->getAddr());
712
713 // If the packet has the same line address as a request already in the
714 // coalescedTable and has the same sequence number, it can be coalesced.
715 if (coalescedTable.count(line_addr)) {
716 // Search for a previous coalesced request with the same seqNum.
717 auto& creqQueue = coalescedTable.at(line_addr);
718 auto citer = std::find_if(creqQueue.begin(), creqQueue.end(),
719 [&](CoalescedRequest* c) { return c->getSeqNum() == seqNum; }
720 );
721 if (citer != creqQueue.end()) {
722 (*citer)->insertPacket(pkt);
723 return true;
724 }
725 }
726
727 if (m_outstanding_count < m_max_outstanding_requests) {
728 // This is an "aliased" or new request. Create a RubyRequest and
729 // append it to the list of "targets" in the coalescing table.
730 DPRINTF(GPUCoalescer, "Creating new or aliased request for 0x%X\n",
731 line_addr);
732
733 CoalescedRequest *creq = new CoalescedRequest(seqNum);
734 creq->insertPacket(pkt);
735 creq->setRubyType(getRequestType(pkt));
736 creq->setIssueTime(curCycle());
737
738 if (!coalescedTable.count(line_addr)) {
739 // If there is no outstanding request for this line address,
740 // create a new coalesced request and issue it immediately.
741 auto reqList = std::deque<CoalescedRequest*> { creq };
742 coalescedTable.insert(std::make_pair(line_addr, reqList));
743
744 DPRINTF(GPUCoalescer, "Issued req type %s seqNum %d\n",
745 RubyRequestType_to_string(creq->getRubyType()), seqNum);
746 issueRequest(creq);
747 } else {
748 // The request is for a line address that is already outstanding
749 // but for a different instruction. Add it as a new request to be
750 // issued when the current outstanding request is completed.
751 coalescedTable.at(line_addr).push_back(creq);
752 DPRINTF(GPUCoalescer, "found address 0x%X with new seqNum %d\n",
753 line_addr, seqNum);
754 }
755
756 // In both cases, requests are added to the coalescing table and will
757 // be counted as outstanding requests.
758 m_outstanding_count++;
759
760 // We track all issued or to-be-issued Ruby requests associated with
761 // write instructions. An instruction may have multiple Ruby
762 // requests.
763 if (pkt->cmd == MemCmd::WriteReq) {
764 DPRINTF(GPUCoalescer, "adding write inst %d at line 0x%x to"
765 " the pending write instruction list\n", seqNum,
766 line_addr);
767
768 RubyPort::SenderState* ss =
769 safe_cast<RubyPort::SenderState*>(pkt->senderState);
770
771 // we need to save this port because it will be used to call
772 // back the requesting CU when we receive write
773 // complete callbacks for all issued Ruby requests of this
774 // instruction.
775 RubyPort::MemSlavePort* mem_slave_port = ss->port;
776
777 GPUDynInstPtr gpuDynInst = nullptr;
778
779 if (!m_usingRubyTester) {
780 // If this coalescer is connected to a real CU, we need
781 // to save the corresponding gpu dynamic instruction.
782 // CU will use that instruction to decrement wait counters
783 // in the issuing wavefront.
784 // For Ruby tester, gpuDynInst == nullptr
785 ComputeUnit::DataPort::SenderState* cu_state =
786 safe_cast<ComputeUnit::DataPort::SenderState*>
787 (ss->predecessor);
788 gpuDynInst = cu_state->_gpuDynInst;
789 }
790
791 PendingWriteInst& inst = pendingWriteInsts[seqNum];
792 inst.addPendingReq(mem_slave_port, gpuDynInst, m_usingRubyTester);
793 }
794
795 return true;
796 }
797
798 // The maximum number of outstanding requests have been issued.
799 return false;
800 }
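
// Editor's note (illustrative, not from the original file): the coalesced
// table is keyed by cache-line address and holds, per line, a FIFO of
// CoalescedRequests, one per instruction touching that line, roughly:
//
//     std::map<Addr, std::deque<CoalescedRequest*>> coalescedTable;
//
//     // same line, same seqNum -> packet joins the existing request
//     // same line, new  seqNum -> new request queued behind the head
//     // new line               -> new request created and issued now
//
// Only the request at the head of each per-line deque is in flight; the
// next one is issued from the read/write/atomic callbacks once the head
// completes and is popped.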
801
802 void
803 GPUCoalescer::completeIssue()
804 {
805 // Iterate over the maximum number of instructions we can coalesce
806 // per cycle (coalescingWindow).
807 for (int instIdx = 0; instIdx < coalescingWindow; ++instIdx) {
808 PerInstPackets *pktList =
809 uncoalescedTable.getInstPackets(instIdx);
810
811 // getInstPackets will return nullptr if no instruction
812 // exists at the current offset.
813 if (!pktList) {
814 break;
815 } else {
816 // Since we have a pointer to the list of packets in the inst,
817 // erase them from the list if coalescing is successful and
818 // leave them in the list otherwise. This aggressively attempts
819 // to coalesce as many packets as possible from the current inst.
820 pktList->remove_if(
821 [&](PacketPtr pkt) { return coalescePacket(pkt); }
822 );
823 }
824 }
825
826 // Clean up any instructions in the uncoalesced table that have had
827 // all of their packets coalesced and return a token for that column.
828 uncoalescedTable.updateResources();
829
830 // check whether any kernel-end releases were issued this cycle
831 int len = newKernelEnds.size();
832 for (int i = 0; i < len; i++) {
833 kernelCallback(newKernelEnds[i]);
834 }
835 newKernelEnds.clear();
836 }
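
// Editor's note (descriptive comment added for clarity): completeIssue
// examines at most coalescingWindow (max_coalesces_per_cycle) instructions
// from the uncoalesced table per cycle. Using remove_if() with
// coalescePacket() as the predicate both attempts the coalesce and erases
// the packet from the per-instruction list on success; packets that fail
// (e.g. when m_outstanding_count has reached m_max_outstanding_requests)
// simply stay behind and are retried on a later issue event.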
837
838 void
839 GPUCoalescer::evictionCallback(Addr address)
840 {
841 ruby_eviction_callback(address);
842 }
843
844 void
845 GPUCoalescer::kernelCallback(int wavefront_id)
846 {
847 assert(kernelEndList.count(wavefront_id));
848
849 ruby_hit_callback(kernelEndList[wavefront_id]);
850
851 kernelEndList.erase(wavefront_id);
852 }
853
854 void
855 GPUCoalescer::atomicCallback(Addr address,
856 MachineType mach,
857 const DataBlock& data)
858 {
859 assert(address == makeLineAddress(address));
860 assert(coalescedTable.count(address));
861
862 auto crequest = coalescedTable.at(address).front();
863
864 fatal_if((crequest->getRubyType() != RubyRequestType_ATOMIC &&
865 crequest->getRubyType() != RubyRequestType_ATOMIC_RETURN &&
866 crequest->getRubyType() != RubyRequestType_ATOMIC_NO_RETURN),
867 "atomicCallback saw non-atomic type response\n");
868
869 hitCallback(crequest, mach, (DataBlock&)data, true,
870 crequest->getIssueTime(), Cycles(0), Cycles(0), false);
871
872 delete crequest;
873 coalescedTable.at(address).pop_front();
874
875 if (coalescedTable.at(address).empty()) {
876 coalescedTable.erase(address);
877 } else {
878 auto nextRequest = coalescedTable.at(address).front();
879 issueRequest(nextRequest);
880 }
881 }
882
883 void
884 GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist)
885 {
886 for (auto& pkt : mylist) {
887 RubyPort::SenderState *ss =
888 safe_cast<RubyPort::SenderState *>(pkt->senderState);
889 MemSlavePort *port = ss->port;
890 assert(port != NULL);
891
892 pkt->senderState = ss->predecessor;
893 delete ss;
894 port->hitCallback(pkt);
895 trySendRetries();
896 }
897
898 // We schedule an event in the same tick as hitCallback (similar to
899 // makeRequest) rather than calling completeIssue directly to reduce
900 // function calls to complete issue. This can only happen if the max
901 // outstanding requests is less than the number of slots in the
902 // uncoalesced table and makeRequest is not called again.
903 if (uncoalescedTable.packetAvailable() && !issueEvent.scheduled()) {
904 schedule(issueEvent, curTick());
905 }
906
907 testDrainComplete();
908 }
909
910 void
911 GPUCoalescer::recordMissLatency(CoalescedRequest* crequest,
912 MachineType mach,
913 Cycles initialRequestTime,
914 Cycles forwardRequestTime,
915 Cycles firstResponseTime,
916 bool success, bool isRegion)
917 {
918 }
919
920 void
921 GPUCoalescer::regStats()
922 {
923 RubyPort::regStats();
924
925 // These statistical variables are not for display.
926 // The profiler will collate these across different
927 // coalescers and display those collated statistics.
928 m_outstandReqHist.init(10);
929 m_latencyHist.init(10);
930 m_missLatencyHist.init(10);
931
932 for (int i = 0; i < RubyRequestType_NUM; i++) {
933 m_typeLatencyHist.push_back(new Stats::Histogram());
934 m_typeLatencyHist[i]->init(10);
935
936 m_missTypeLatencyHist.push_back(new Stats::Histogram());
937 m_missTypeLatencyHist[i]->init(10);
938 }
939
940 for (int i = 0; i < MachineType_NUM; i++) {
941 m_missMachLatencyHist.push_back(new Stats::Histogram());
942 m_missMachLatencyHist[i]->init(10);
943
944 m_IssueToInitialDelayHist.push_back(new Stats::Histogram());
945 m_IssueToInitialDelayHist[i]->init(10);
946
947 m_InitialToForwardDelayHist.push_back(new Stats::Histogram());
948 m_InitialToForwardDelayHist[i]->init(10);
949
950 m_ForwardToFirstResponseDelayHist.push_back(new Stats::Histogram());
951 m_ForwardToFirstResponseDelayHist[i]->init(10);
952
953 m_FirstResponseToCompletionDelayHist.push_back(new Stats::Histogram());
954 m_FirstResponseToCompletionDelayHist[i]->init(10);
955 }
956
957 for (int i = 0; i < RubyRequestType_NUM; i++) {
958 m_missTypeMachLatencyHist.push_back(std::vector<Stats::Histogram *>());
959
960 for (int j = 0; j < MachineType_NUM; j++) {
961 m_missTypeMachLatencyHist[i].push_back(new Stats::Histogram());
962 m_missTypeMachLatencyHist[i][j]->init(10);
963 }
964 }
965 }