src/mem/ruby/system/GPUCoalescer.cc
1 /*
2 * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * For use for simulation and test purposes only
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the copyright holder nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 *
33 * Author: Sooraj Puthoor
34 */
35
36 #include "base/misc.hh"
37 #include "base/str.hh"
38 #include "config/the_isa.hh"
39
40 #if THE_ISA == X86_ISA
41 #include "arch/x86/insts/microldstop.hh"
42
43 #endif // X86_ISA
44 #include "mem/ruby/system/GPUCoalescer.hh"
45
46 #include "cpu/testers/rubytest/RubyTester.hh"
47 #include "debug/GPUCoalescer.hh"
48 #include "debug/MemoryAccess.hh"
49 #include "debug/ProtocolTrace.hh"
50 #include "debug/RubyPort.hh"
51 #include "debug/RubyStats.hh"
52 #include "gpu-compute/shader.hh"
53 #include "mem/packet.hh"
54 #include "mem/ruby/common/DataBlock.hh"
55 #include "mem/ruby/common/SubBlock.hh"
56 #include "mem/ruby/network/MessageBuffer.hh"
57 #include "mem/ruby/profiler/Profiler.hh"
58 #include "mem/ruby/slicc_interface/AbstractController.hh"
59 #include "mem/ruby/slicc_interface/RubyRequest.hh"
60 #include "mem/ruby/structures/CacheMemory.hh"
61 #include "mem/ruby/system/RubySystem.hh"
62 #include "params/RubyGPUCoalescer.hh"
63
64 using namespace std;
65
66 GPUCoalescer *
67 RubyGPUCoalescerParams::create()
68 {
69 return new GPUCoalescer(this);
70 }
71
72 HSAScope
73 reqScopeToHSAScope(Request* req)
74 {
75 HSAScope accessScope = HSAScope_UNSPECIFIED;
76 if (req->isScoped()) {
77 if (req->isWavefrontScope()) {
78 accessScope = HSAScope_WAVEFRONT;
79 } else if (req->isWorkgroupScope()) {
80 accessScope = HSAScope_WORKGROUP;
81 } else if (req->isDeviceScope()) {
82 accessScope = HSAScope_DEVICE;
83 } else if (req->isSystemScope()) {
84 accessScope = HSAScope_SYSTEM;
85 } else {
86 fatal("Bad scope type");
87 }
88 }
89 return accessScope;
90 }
91
92 HSASegment
93 reqSegmentToHSASegment(Request* req)
94 {
95 HSASegment accessSegment = HSASegment_GLOBAL;
96
97 if (req->isGlobalSegment()) {
98 accessSegment = HSASegment_GLOBAL;
99 } else if (req->isGroupSegment()) {
100 accessSegment = HSASegment_GROUP;
101 } else if (req->isPrivateSegment()) {
102 accessSegment = HSASegment_PRIVATE;
103 } else if (req->isKernargSegment()) {
104 accessSegment = HSASegment_KERNARG;
105 } else if (req->isReadonlySegment()) {
106 accessSegment = HSASegment_READONLY;
107 } else if (req->isSpillSegment()) {
108 accessSegment = HSASegment_SPILL;
109 } else if (req->isArgSegment()) {
110 accessSegment = HSASegment_ARG;
111 } else {
112 fatal("Bad segment type");
113 }
114
115 return accessSegment;
116 }
117
118 GPUCoalescer::GPUCoalescer(const Params *p)
119 : RubyPort(p), issueEvent(this), deadlockCheckEvent(this)
120 {
121 m_store_waiting_on_load_cycles = 0;
122 m_store_waiting_on_store_cycles = 0;
123 m_load_waiting_on_store_cycles = 0;
124 m_load_waiting_on_load_cycles = 0;
125
126 m_outstanding_count = 0;
127
128 m_max_outstanding_requests = 0;
129 m_deadlock_threshold = 0;
130 m_instCache_ptr = nullptr;
131 m_dataCache_ptr = nullptr;
132
133 m_instCache_ptr = p->icache;
134 m_dataCache_ptr = p->dcache;
135 m_max_outstanding_requests = p->max_outstanding_requests;
136 m_deadlock_threshold = p->deadlock_threshold;
137
138 assert(m_max_outstanding_requests > 0);
139 assert(m_deadlock_threshold > 0);
140 assert(m_instCache_ptr);
141 assert(m_dataCache_ptr);
142
143 m_data_cache_hit_latency = p->dcache_hit_latency;
144
145 m_usingNetworkTester = p->using_network_tester;
146 assumingRfOCoherence = p->assume_rfo;
147 }
148
149 GPUCoalescer::~GPUCoalescer()
150 {
151 }
152
153 void
154 GPUCoalescer::wakeup()
155 {
156 // Check for deadlock of any of the requests
157 Cycles current_time = curCycle();
158
159 // Check across all outstanding requests
160 int total_outstanding = 0;
161
162 RequestTable::iterator read = m_readRequestTable.begin();
163 RequestTable::iterator read_end = m_readRequestTable.end();
164 for (; read != read_end; ++read) {
165 GPUCoalescerRequest* request = read->second;
166 if (current_time - request->issue_time < m_deadlock_threshold)
167 continue;
168
169 panic("Possible Deadlock detected. Aborting!\n"
170 "version: %d request.paddr: 0x%x m_readRequestTable: %d "
171 "current time: %u issue_time: %d difference: %d\n", m_version,
172 request->pkt->getAddr(), m_readRequestTable.size(),
173 current_time * clockPeriod(), request->issue_time * clockPeriod(),
174 (current_time - request->issue_time)*clockPeriod());
175 }
176
177 RequestTable::iterator write = m_writeRequestTable.begin();
178 RequestTable::iterator write_end = m_writeRequestTable.end();
179 for (; write != write_end; ++write) {
180 GPUCoalescerRequest* request = write->second;
181 if (current_time - request->issue_time < m_deadlock_threshold)
182 continue;
183
184 panic("Possible Deadlock detected. Aborting!\n"
185 "version: %d request.paddr: 0x%x m_writeRequestTable: %d "
186 "current time: %u issue_time: %d difference: %d\n", m_version,
187 request->pkt->getAddr(), m_writeRequestTable.size(),
188 current_time * clockPeriod(), request->issue_time * clockPeriod(),
189 (current_time - request->issue_time) * clockPeriod());
190 }
191
192 total_outstanding += m_writeRequestTable.size();
193 total_outstanding += m_readRequestTable.size();
194
195 assert(m_outstanding_count == total_outstanding);
196
197 if (m_outstanding_count > 0) {
198 // If there are still outstanding requests, keep checking
199 schedule(deadlockCheckEvent,
200 m_deadlock_threshold * clockPeriod() +
201 curTick());
202 }
203 }
204
205 void
206 GPUCoalescer::resetStats()
207 {
208 m_latencyHist.reset();
209 m_missLatencyHist.reset();
210 for (int i = 0; i < RubyRequestType_NUM; i++) {
211 m_typeLatencyHist[i]->reset();
212 m_missTypeLatencyHist[i]->reset();
213 for (int j = 0; j < MachineType_NUM; j++) {
214 m_missTypeMachLatencyHist[i][j]->reset();
215 }
216 }
217
218 for (int i = 0; i < MachineType_NUM; i++) {
219 m_missMachLatencyHist[i]->reset();
220
221 m_IssueToInitialDelayHist[i]->reset();
222 m_InitialToForwardDelayHist[i]->reset();
223 m_ForwardToFirstResponseDelayHist[i]->reset();
224 m_FirstResponseToCompletionDelayHist[i]->reset();
225 }
226 }
227
228 void
229 GPUCoalescer::printProgress(ostream& out) const
230 {
231 }
232
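// Check whether a new request to this cache line can be accepted right
// now: the mandatory queue must have space, the controller must not have
// the line blocked, and there must be no outstanding read/write request
// to the same line (requests are not coalesced across cycles).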
233 RequestStatus
234 GPUCoalescer::getRequestStatus(PacketPtr pkt, RubyRequestType request_type)
235 {
236 Addr line_addr = makeLineAddress(pkt->getAddr());
237
238 if (!m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge())) {
239 return RequestStatus_BufferFull;
240 }
241
242 if (m_controller->isBlocked(line_addr) &&
243 request_type != RubyRequestType_Locked_RMW_Write) {
244 return RequestStatus_Aliased;
245 }
246
247 if ((request_type == RubyRequestType_ST) ||
248 (request_type == RubyRequestType_ATOMIC) ||
249 (request_type == RubyRequestType_ATOMIC_RETURN) ||
250 (request_type == RubyRequestType_ATOMIC_NO_RETURN) ||
251 (request_type == RubyRequestType_RMW_Read) ||
252 (request_type == RubyRequestType_RMW_Write) ||
253 (request_type == RubyRequestType_Load_Linked) ||
254 (request_type == RubyRequestType_Store_Conditional) ||
255 (request_type == RubyRequestType_Locked_RMW_Read) ||
256 (request_type == RubyRequestType_Locked_RMW_Write) ||
257 (request_type == RubyRequestType_FLUSH)) {
258
259 // Check if there is any outstanding read request for the same
260 // cache line.
261 if (m_readRequestTable.count(line_addr) > 0) {
262 m_store_waiting_on_load_cycles++;
263 return RequestStatus_Aliased;
264 }
265
266 if (m_writeRequestTable.count(line_addr) > 0) {
267 // There is an outstanding write request for the cache line
268 m_store_waiting_on_store_cycles++;
269 return RequestStatus_Aliased;
270 }
271 } else {
272 // Check if there is any outstanding write request for the same
273 // cache line.
274 if (m_writeRequestTable.count(line_addr) > 0) {
275 m_load_waiting_on_store_cycles++;
276 return RequestStatus_Aliased;
277 }
278
279 if (m_readRequestTable.count(line_addr) > 0) {
280 // There is an outstanding read request for the cache line
281 m_load_waiting_on_load_cycles++;
282 return RequestStatus_Aliased;
283 }
284 }
285
286 return RequestStatus_Ready;
287
288 }
289
290
291
292 // sets the kernelEndList
293 void
294 GPUCoalescer::insertKernel(int wavefront_id, PacketPtr pkt)
295 {
296 // It is unclear whether this can happen, but be careful here
297 // so that a duplicate entry cannot hang the simulator
298 // in the future.
299 DPRINTF(GPUCoalescer, "inserting wf: %d to kernelEndlist\n", wavefront_id);
300 assert(kernelEndList.count(wavefront_id) == 0);
301
302 kernelEndList[wavefront_id] = pkt;
303 DPRINTF(GPUCoalescer, "kernelEndList->size() = %d\n",
304 kernelEndList.size());
305 }
306
307
308 // Insert the request on the correct request table. Return true if
309 // the entry was already present.
310 bool
311 GPUCoalescer::insertRequest(PacketPtr pkt, RubyRequestType request_type)
312 {
313 assert(getRequestStatus(pkt, request_type) == RequestStatus_Ready ||
314 pkt->req->isLockedRMW() ||
315 !m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge()));
316
317 int total_outstanding M5_VAR_USED =
318 m_writeRequestTable.size() + m_readRequestTable.size();
319
320 assert(m_outstanding_count == total_outstanding);
321
322 // See if we should schedule a deadlock check
323 if (deadlockCheckEvent.scheduled() == false) {
324 schedule(deadlockCheckEvent, m_deadlock_threshold + curTick());
325 }
326
327 Addr line_addr = makeLineAddress(pkt->getAddr());
328 if ((request_type == RubyRequestType_ST) ||
329 (request_type == RubyRequestType_ATOMIC) ||
330 (request_type == RubyRequestType_ATOMIC_RETURN) ||
331 (request_type == RubyRequestType_ATOMIC_NO_RETURN) ||
332 (request_type == RubyRequestType_RMW_Read) ||
333 (request_type == RubyRequestType_RMW_Write) ||
334 (request_type == RubyRequestType_Load_Linked) ||
335 (request_type == RubyRequestType_Store_Conditional) ||
336 (request_type == RubyRequestType_Locked_RMW_Read) ||
337 (request_type == RubyRequestType_Locked_RMW_Write) ||
338 (request_type == RubyRequestType_FLUSH)) {
339
340 pair<RequestTable::iterator, bool> r =
341 m_writeRequestTable.insert(RequestTable::value_type(line_addr,
342 (GPUCoalescerRequest*) NULL));
343 if (r.second) {
344 RequestTable::iterator i = r.first;
345 i->second = new GPUCoalescerRequest(pkt, request_type,
346 curCycle());
347 DPRINTF(GPUCoalescer,
348 "Inserting write request for paddr %#x for type %d\n",
349 pkt->req->getPaddr(), i->second->m_type);
350 m_outstanding_count++;
351 } else {
352 return true;
353 }
354 } else {
355 pair<RequestTable::iterator, bool> r =
356 m_readRequestTable.insert(RequestTable::value_type(line_addr,
357 (GPUCoalescerRequest*) NULL));
358
359 if (r.second) {
360 RequestTable::iterator i = r.first;
361 i->second = new GPUCoalescerRequest(pkt, request_type,
362 curCycle());
363 DPRINTF(GPUCoalescer,
364 "Inserting read request for paddr %#x for type %d\n",
365 pkt->req->getPaddr(), i->second->m_type);
366 m_outstanding_count++;
367 } else {
368 return true;
369 }
370 }
371
372 m_outstandReqHist.sample(m_outstanding_count);
373
374 total_outstanding = m_writeRequestTable.size() + m_readRequestTable.size();
375 assert(m_outstanding_count == total_outstanding);
376
377 return false;
378 }
379
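// Decrement the outstanding-request counter after an entry has been
// erased from one of the request tables.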
380 void
381 GPUCoalescer::markRemoved()
382 {
383 m_outstanding_count--;
384 assert(m_outstanding_count ==
385 m_writeRequestTable.size() + m_readRequestTable.size());
386 }
387
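// Remove an outstanding request from the appropriate request table
// (write table for stores/RMWs/LLSC, read table otherwise) and update
// the outstanding count.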
388 void
389 GPUCoalescer::removeRequest(GPUCoalescerRequest* srequest)
390 {
391 assert(m_outstanding_count ==
392 m_writeRequestTable.size() + m_readRequestTable.size());
393
394 Addr line_addr = makeLineAddress(srequest->pkt->getAddr());
395 if ((srequest->m_type == RubyRequestType_ST) ||
396 (srequest->m_type == RubyRequestType_RMW_Read) ||
397 (srequest->m_type == RubyRequestType_RMW_Write) ||
398 (srequest->m_type == RubyRequestType_Load_Linked) ||
399 (srequest->m_type == RubyRequestType_Store_Conditional) ||
400 (srequest->m_type == RubyRequestType_Locked_RMW_Read) ||
401 (srequest->m_type == RubyRequestType_Locked_RMW_Write)) {
402 m_writeRequestTable.erase(line_addr);
403 } else {
404 m_readRequestTable.erase(line_addr);
405 }
406
407 markRemoved();
408 }
409
410 bool
411 GPUCoalescer::handleLlsc(Addr address, GPUCoalescerRequest* request)
412 {
413 //
414 // The success flag indicates whether the LLSC operation was successful.
415 // LL ops will always succeed, but SC may fail if the cache line is no
416 // longer locked.
417 //
418 bool success = true;
419 if (request->m_type == RubyRequestType_Store_Conditional) {
420 if (!m_dataCache_ptr->isLocked(address, m_version)) {
421 //
422 // For failed SC requests, indicate the failure to the cpu by
423 // setting the extra data to zero.
424 //
425 request->pkt->req->setExtraData(0);
426 success = false;
427 } else {
428 //
429 // For successful SC requests, indicate the success to the cpu by
430 // setting the extra data to one.
431 //
432 request->pkt->req->setExtraData(1);
433 }
434 //
435 // Independent of success, all SC operations must clear the lock
436 //
437 m_dataCache_ptr->clearLocked(address);
438 } else if (request->m_type == RubyRequestType_Load_Linked) {
439 //
440 // Note: To fully follow Alpha LLSC semantics, should the LL clear any
441 // previously locked cache lines?
442 //
443 m_dataCache_ptr->setLocked(address, m_version);
444 } else if ((m_dataCache_ptr->isTagPresent(address)) &&
445 (m_dataCache_ptr->isLocked(address, m_version))) {
446 //
447 // Normal writes should clear the locked address
448 //
449 m_dataCache_ptr->clearLocked(address);
450 }
451 return success;
452 }
453
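// The shorter writeCallback overloads forward to the full version below,
// filling in MachineType_NULL, zero timestamps, and isRegion = false.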
454 void
455 GPUCoalescer::writeCallback(Addr address, DataBlock& data)
456 {
457 writeCallback(address, MachineType_NULL, data);
458 }
459
460 void
461 GPUCoalescer::writeCallback(Addr address,
462 MachineType mach,
463 DataBlock& data)
464 {
465 writeCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
466 }
467
468 void
469 GPUCoalescer::writeCallback(Addr address,
470 MachineType mach,
471 DataBlock& data,
472 Cycles initialRequestTime,
473 Cycles forwardRequestTime,
474 Cycles firstResponseTime)
475 {
476 writeCallback(address, mach, data,
477 initialRequestTime, forwardRequestTime, firstResponseTime,
478 false);
479 }
480
481 void
482 GPUCoalescer::writeCallback(Addr address,
483 MachineType mach,
484 DataBlock& data,
485 Cycles initialRequestTime,
486 Cycles forwardRequestTime,
487 Cycles firstResponseTime,
488 bool isRegion)
489 {
490 assert(address == makeLineAddress(address));
491
492 DPRINTF(GPUCoalescer, "write callback for address %#x\n", address);
493 assert(m_writeRequestTable.count(makeLineAddress(address)));
494
495 RequestTable::iterator i = m_writeRequestTable.find(address);
496 assert(i != m_writeRequestTable.end());
497 GPUCoalescerRequest* request = i->second;
498
499 m_writeRequestTable.erase(i);
500 markRemoved();
501
502 assert((request->m_type == RubyRequestType_ST) ||
503 (request->m_type == RubyRequestType_ATOMIC) ||
504 (request->m_type == RubyRequestType_ATOMIC_RETURN) ||
505 (request->m_type == RubyRequestType_ATOMIC_NO_RETURN) ||
506 (request->m_type == RubyRequestType_RMW_Read) ||
507 (request->m_type == RubyRequestType_RMW_Write) ||
508 (request->m_type == RubyRequestType_Load_Linked) ||
509 (request->m_type == RubyRequestType_Store_Conditional) ||
510 (request->m_type == RubyRequestType_Locked_RMW_Read) ||
511 (request->m_type == RubyRequestType_Locked_RMW_Write) ||
512 (request->m_type == RubyRequestType_FLUSH));
513
514
515 //
516 // For Alpha, properly handle LL, SC, and write requests with respect to
517 // locked cache blocks.
518 //
519 // Not valid for the Network_test protocol
520 //
521 bool success = true;
522 if (!m_usingNetworkTester)
523 success = handleLlsc(address, request);
524
525 if (request->m_type == RubyRequestType_Locked_RMW_Read) {
526 m_controller->blockOnQueue(address, m_mandatory_q_ptr);
527 } else if (request->m_type == RubyRequestType_Locked_RMW_Write) {
528 m_controller->unblock(address);
529 }
530
531 hitCallback(request, mach, data, success,
532 request->issue_time, forwardRequestTime, firstResponseTime,
533 isRegion);
534 }
535
536 void
537 GPUCoalescer::readCallback(Addr address, DataBlock& data)
538 {
539 readCallback(address, MachineType_NULL, data);
540 }
541
542 void
543 GPUCoalescer::readCallback(Addr address,
544 MachineType mach,
545 DataBlock& data)
546 {
547 readCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
548 }
549
550 void
551 GPUCoalescer::readCallback(Addr address,
552 MachineType mach,
553 DataBlock& data,
554 Cycles initialRequestTime,
555 Cycles forwardRequestTime,
556 Cycles firstResponseTime)
557 {
558
559 readCallback(address, mach, data,
560 initialRequestTime, forwardRequestTime, firstResponseTime,
561 false);
562 }
563
564 void
565 GPUCoalescer::readCallback(Addr address,
566 MachineType mach,
567 DataBlock& data,
568 Cycles initialRequestTime,
569 Cycles forwardRequestTime,
570 Cycles firstResponseTime,
571 bool isRegion)
572 {
573 assert(address == makeLineAddress(address));
574 assert(m_readRequestTable.count(makeLineAddress(address)));
575
576 DPRINTF(GPUCoalescer, "read callback for address %#x\n", address);
577 RequestTable::iterator i = m_readRequestTable.find(address);
578 assert(i != m_readRequestTable.end());
579 GPUCoalescerRequest* request = i->second;
580
581 m_readRequestTable.erase(i);
582 markRemoved();
583
584 assert((request->m_type == RubyRequestType_LD) ||
585 (request->m_type == RubyRequestType_IFETCH));
586
587 hitCallback(request, mach, data, true,
588 request->issue_time, forwardRequestTime, firstResponseTime,
589 isRegion);
590 }
591
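// Common completion path for reads and writes: touch the cache block as
// MRU, record latency statistics, copy data between the Ruby DataBlock
// and every coalesced packet for this cache line, and hand the packets
// back to the ports via completeHitCallback().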
592 void
593 GPUCoalescer::hitCallback(GPUCoalescerRequest* srequest,
594 MachineType mach,
595 DataBlock& data,
596 bool success,
597 Cycles initialRequestTime,
598 Cycles forwardRequestTime,
599 Cycles firstResponseTime,
600 bool isRegion)
601 {
602 PacketPtr pkt = srequest->pkt;
603 Addr request_address = pkt->getAddr();
604 Addr request_line_address = makeLineAddress(request_address);
605
606 RubyRequestType type = srequest->m_type;
607
608 // Set this cache entry to the most recently used
609 if (type == RubyRequestType_IFETCH) {
610 if (m_instCache_ptr->isTagPresent(request_line_address))
611 m_instCache_ptr->setMRU(request_line_address);
612 } else {
613 if (m_dataCache_ptr->isTagPresent(request_line_address))
614 m_dataCache_ptr->setMRU(request_line_address);
615 }
616
617 recordMissLatency(srequest, mach,
618 initialRequestTime,
619 forwardRequestTime,
620 firstResponseTime,
621 success, isRegion);
622 // update the data
623 //
624 // NOTE: this must be done for each request in the coalescer
625 int len = reqCoalescer[request_line_address].size();
626 std::vector<PacketPtr> mylist;
627 for (int i = 0; i < len; ++i) {
628 PacketPtr pkt = reqCoalescer[request_line_address][i].first;
629 assert(type ==
630 reqCoalescer[request_line_address][i].second[PrimaryType]);
631 request_address = pkt->getAddr();
632 request_line_address = makeLineAddress(pkt->getAddr());
633 if (pkt->getPtr<uint8_t>()) {
634 if ((type == RubyRequestType_LD) ||
635 (type == RubyRequestType_ATOMIC) ||
636 (type == RubyRequestType_ATOMIC_RETURN) ||
637 (type == RubyRequestType_IFETCH) ||
638 (type == RubyRequestType_RMW_Read) ||
639 (type == RubyRequestType_Locked_RMW_Read) ||
640 (type == RubyRequestType_Load_Linked)) {
641 memcpy(pkt->getPtr<uint8_t>(),
642 data.getData(getOffset(request_address),
643 pkt->getSize()),
644 pkt->getSize());
645 } else {
646 data.setData(pkt->getPtr<uint8_t>(),
647 getOffset(request_address), pkt->getSize());
648 }
649 } else {
650 DPRINTF(MemoryAccess,
651 "WARNING. Data not transfered from Ruby to M5 for type " \
652 "%s\n",
653 RubyRequestType_to_string(type));
654 }
655
656 // If using the RubyTester, update the RubyTester sender state's
657 // subBlock with the received data. The tester will later access
658 // this state.
659 // Note: RubyPort will access its sender state before the
660 // RubyTester.
661 if (m_usingRubyTester) {
662 RubyPort::SenderState *requestSenderState =
663 safe_cast<RubyPort::SenderState*>(pkt->senderState);
664 RubyTester::SenderState* testerSenderState =
665 safe_cast<RubyTester::SenderState*>(requestSenderState->predecessor);
666 testerSenderState->subBlock.mergeFrom(data);
667 }
668
669 mylist.push_back(pkt);
670 }
671 delete srequest;
672 reqCoalescer.erase(request_line_address);
673 assert(!reqCoalescer.count(request_line_address));
674
675
676
677 completeHitCallback(mylist, len);
678 }
679
680 bool
681 GPUCoalescer::empty() const
682 {
683 return m_writeRequestTable.empty() && m_readRequestTable.empty();
684 }
685
686 // Analyzes the packet to see if this request can be coalesced.
687 // If request can be coalesced, this request is added to the reqCoalescer table
688 // and makeRequest returns RequestStatus_Issued;
689 // If this is the first request to a cacheline, request is added to both
690 // newRequests queue and to the reqCoalescer table; makeRequest
691 // returns RequestStatus_Issued.
692 // If there is a pending request to this cacheline and this request
693 // can't be coalesced, RequestStatus_Aliased is returned and
694 // the packet needs to be reissued.
695 RequestStatus
696 GPUCoalescer::makeRequest(PacketPtr pkt)
697 {
698 // Check for GPU Barrier Kernel End or Kernel Begin
699 // Leave these to be handled by the child class
700 // Kernel End/Barrier = isFlush + isRelease
701 // Kernel Begin = isFlush + isAcquire
702 if (pkt->req->isKernel()) {
703 if (pkt->req->isAcquire()) {
704 // This is a Kernel Begin; leave handling to
705 // virtual xCoalescer::makeRequest
706 return RequestStatus_Issued;
707 } else if (pkt->req->isRelease()) {
708 // This is a Kernel End; leave handling to
709 // virtual xCoalescer::makeRequest
710 // If we are here then we didn't call
711 // a virtual version of this function
712 // so we will also schedule the callback
713 int wf_id = 0;
714 if (pkt->req->hasContextId()) {
715 wf_id = pkt->req->contextId();
716 }
717 insertKernel(wf_id, pkt);
718 newKernelEnds.push_back(wf_id);
719 if (!issueEvent.scheduled()) {
720 schedule(issueEvent, curTick());
721 }
722 return RequestStatus_Issued;
723 }
724 }
725
726 // If number of outstanding requests greater than the max allowed,
727 // return RequestStatus_BufferFull. This logic can be extended to
728 // support proper backpressure.
729 if (m_outstanding_count >= m_max_outstanding_requests) {
730 return RequestStatus_BufferFull;
731 }
732
733 RubyRequestType primary_type = RubyRequestType_NULL;
734 RubyRequestType secondary_type = RubyRequestType_NULL;
735
736 if (pkt->isLLSC()) {
737 //
738 // Alpha LL/SC instructions need to be handled carefully by the cache
739 // coherence protocol to ensure they follow the proper semantics. In
740 // particular, by identifying the operations as atomic, the protocol
741 // should understand that migratory sharing optimizations should not
742 // be performed (i.e. a load between the LL and SC should not steal
743 // away exclusive permission).
744 //
745 if (pkt->isWrite()) {
746 primary_type = RubyRequestType_Store_Conditional;
747 } else {
748 assert(pkt->isRead());
749 primary_type = RubyRequestType_Load_Linked;
750 }
751 secondary_type = RubyRequestType_ATOMIC;
752 } else if (pkt->req->isLockedRMW()) {
753 //
754 // x86 locked instructions are translated to store cache coherence
755 // requests because these requests should always be treated as read
756 // exclusive operations and should leverage any migratory sharing
757 // optimization built into the protocol.
758 //
759 if (pkt->isWrite()) {
760 primary_type = RubyRequestType_Locked_RMW_Write;
761 } else {
762 assert(pkt->isRead());
763 primary_type = RubyRequestType_Locked_RMW_Read;
764 }
765 secondary_type = RubyRequestType_ST;
766 } else if (pkt->isAtomicOp()) {
767 //
768 // GPU Atomic Operation
769 //
770 primary_type = RubyRequestType_ATOMIC;
771 secondary_type = RubyRequestType_ATOMIC;
772 } else {
773 if (pkt->isRead()) {
774 if (pkt->req->isInstFetch()) {
775 primary_type = secondary_type = RubyRequestType_IFETCH;
776 } else {
777 #if THE_ISA == X86_ISA
778 uint32_t flags = pkt->req->getFlags();
779 bool storeCheck = flags &
780 (TheISA::StoreCheck << TheISA::FlagShift);
781 #else
782 bool storeCheck = false;
783 #endif // X86_ISA
784 if (storeCheck) {
785 primary_type = RubyRequestType_RMW_Read;
786 secondary_type = RubyRequestType_ST;
787 } else {
788 primary_type = secondary_type = RubyRequestType_LD;
789 }
790 }
791 } else if (pkt->isWrite()) {
792 //
793 // Note: M5 packets do not differentiate ST from RMW_Write
794 //
795 primary_type = secondary_type = RubyRequestType_ST;
796 } else if (pkt->isFlush()) {
797 primary_type = secondary_type = RubyRequestType_FLUSH;
798 } else if (pkt->req->isRelease() || pkt->req->isAcquire()) {
799 if (assumingRfOCoherence) {
800 // If we reached here, this request must be a memFence
801 // and the protocol implements RfO, so the coalescer can
802 // assume sequential consistency and schedule the callback
803 // immediately.
804 // Currently the code implements fence callbacks
805 // by reusing the mechanism for kernel completions.
806 // This should be fixed.
807 int wf_id = 0;
808 if (pkt->req->hasContextId()) {
809 wf_id = pkt->req->contextId();
810 }
811 insertKernel(wf_id, pkt);
812 newKernelEnds.push_back(wf_id);
813 if (!issueEvent.scheduled()) {
814 schedule(issueEvent, curTick());
815 }
816 return RequestStatus_Issued;
817 } else {
818 // If not RfO, return issued here and let the child coalescer
819 // take care of it.
820 return RequestStatus_Issued;
821 }
822 } else {
823 panic("Unsupported ruby packet type\n");
824 }
825 }
826
827 // Check if there is any pending request to this cache line from
828 // previous cycles.
829 // If there is a pending request, return aliased. Since coalescing
830 // across time is not permitted, aliased requests are not coalesced.
831 // If a request for this address has already been issued, we must block
832 RequestStatus status = getRequestStatus(pkt, primary_type);
833 if (status != RequestStatus_Ready)
834 return status;
835
836 Addr line_addr = makeLineAddress(pkt->getAddr());
837
838 // Check if this request can be coalesced with previous
839 // requests from this cycle.
840 if (!reqCoalescer.count(line_addr)) {
841 // This is the first access to this cache line.
842 // A new request to the memory subsystem has to be
843 // made in the next cycle for this cache line, so
844 // add this line addr to the "newRequests" queue
845 newRequests.push_back(line_addr);
846
847 // There was a request to this cache line in this cycle,
848 // let us see if we can coalesce this request with the previous
849 // requests from this cycle
850 } else if (primary_type !=
851 reqCoalescer[line_addr][0].second[PrimaryType]) {
852 // can't coalesce loads, stores and atomics!
853 return RequestStatus_Aliased;
854 } else if (pkt->req->isLockedRMW() ||
855 reqCoalescer[line_addr][0].first->req->isLockedRMW()) {
856 // can't coalesce locked accesses, but can coalesce atomics!
857 return RequestStatus_Aliased;
858 } else if (pkt->req->hasContextId() && pkt->req->isRelease() &&
859 pkt->req->contextId() !=
860 reqCoalescer[line_addr][0].first->req->contextId()) {
861 // can't coalesce releases from different wavefronts
862 return RequestStatus_Aliased;
863 }
864
865 // in addition to the packet, we need to save both request types
866 reqCoalescer[line_addr].push_back(
867 RequestDesc(pkt, std::vector<RubyRequestType>()) );
868 reqCoalescer[line_addr].back().second.push_back(primary_type);
869 reqCoalescer[line_addr].back().second.push_back(secondary_type);
870 if (!issueEvent.scheduled())
871 schedule(issueEvent, curTick());
872 // TODO: issue hardware prefetches here
873 return RequestStatus_Issued;
874 }
875
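// Build a single RubyRequest for all packets coalesced to this cache
// line: the access mask marks every byte touched, write data is gathered
// into a DataBlock, atomic ops are collected with their offsets, and the
// message is enqueued on the mandatory queue with the data cache hit
// latency.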
876 void
877 GPUCoalescer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type)
878 {
879
880 int proc_id = -1;
881 if (pkt != NULL && pkt->req->hasContextId()) {
882 proc_id = pkt->req->contextId();
883 }
884
885 // If valid, copy the pc to the ruby request
886 Addr pc = 0;
887 if (pkt->req->hasPC()) {
888 pc = pkt->req->getPC();
889 }
890
891 // At the moment, setting scopes only counts
892 // for GPU spill space accesses, i.e. when
893 // pkt->req->isStack() is true. This scope is
894 // REPLACE since it does not need to be flushed
895 // at the end of a kernel. Private and local
896 // segments may need to be visible at the end
897 // of the kernel.
898 HSASegment accessSegment = reqSegmentToHSASegment(pkt->req);
899 HSAScope accessScope = reqScopeToHSAScope(pkt->req);
900
901 Addr line_addr = makeLineAddress(pkt->getAddr());
902
903 // Create a WriteMask that records written bytes
904 // and atomic operations. This enables partial writes
905 // and partial reads of those writes
906 DataBlock dataBlock;
907 dataBlock.clear();
908 uint32_t blockSize = RubySystem::getBlockSizeBytes();
909 std::vector<bool> accessMask(blockSize,false);
910 std::vector< std::pair<int,AtomicOpFunctor*> > atomicOps;
911 uint32_t tableSize = reqCoalescer[line_addr].size();
912 for (int i = 0; i < tableSize; i++) {
913 PacketPtr tmpPkt = reqCoalescer[line_addr][i].first;
914 uint32_t tmpOffset = (tmpPkt->getAddr()) - line_addr;
915 uint32_t tmpSize = tmpPkt->getSize();
916 if (tmpPkt->isAtomicOp()) {
917 std::pair<int,AtomicOpFunctor *> tmpAtomicOp(tmpOffset,
918 tmpPkt->getAtomicOp());
919 atomicOps.push_back(tmpAtomicOp);
920 } else if (tmpPkt->isWrite()) {
921 dataBlock.setData(tmpPkt->getPtr<uint8_t>(),
922 tmpOffset, tmpSize);
923 }
924 for (int j = 0; j < tmpSize; j++) {
925 accessMask[tmpOffset + j] = true;
926 }
927 }
928 std::shared_ptr<RubyRequest> msg;
929 if (pkt->isAtomicOp()) {
930 msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
931 pkt->getPtr<uint8_t>(),
932 pkt->getSize(), pc, secondary_type,
933 RubyAccessMode_Supervisor, pkt,
934 PrefetchBit_No, proc_id, 100,
935 blockSize, accessMask,
936 dataBlock, atomicOps,
937 accessScope, accessSegment);
938 } else {
939 msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
940 pkt->getPtr<uint8_t>(),
941 pkt->getSize(), pc, secondary_type,
942 RubyAccessMode_Supervisor, pkt,
943 PrefetchBit_No, proc_id, 100,
944 blockSize, accessMask,
945 dataBlock,
946 accessScope, accessSegment);
947 }
948 DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %s\n",
949 curTick(), m_version, "Coal", "Begin", "", "",
950 printAddress(msg->getPhysicalAddress()),
951 RubyRequestType_to_string(secondary_type));
952
953 fatal_if(secondary_type == RubyRequestType_IFETCH,
954 "there should not be any I-Fetch requests in the GPU Coalescer");
955
956 // Send the message to the cache controller
957 fatal_if(m_data_cache_hit_latency == 0,
958 "should not have a latency of zero");
959
960 assert(m_mandatory_q_ptr);
961 m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency);
962 }
963
964 template <class KEY, class VALUE>
965 std::ostream &
966 operator<<(ostream &out, const std::unordered_map<KEY, VALUE> &map)
967 {
968 out << "[";
969 for (auto i = map.begin(); i != map.end(); ++i)
970 out << " " << i->first << "=" << i->second;
971 out << " ]";
972
973 return out;
974 }
975
976 void
977 GPUCoalescer::print(ostream& out) const
978 {
979 out << "[GPUCoalescer: " << m_version
980 << ", outstanding requests: " << m_outstanding_count
981 << ", read request table: " << m_readRequestTable
982 << ", write request table: " << m_writeRequestTable
983 << "]";
984 }
985
986 // This can be called from setState whenever coherence permissions are
987 // upgraded; when invoked, coherence violations will be checked for the
988 // given block.
989 void
990 GPUCoalescer::checkCoherence(Addr addr)
991 {
992 #ifdef CHECK_COHERENCE
993 m_ruby_system->checkGlobalCoherenceInvariant(addr);
994 #endif
995 }
996
997 void
998 GPUCoalescer::recordRequestType(SequencerRequestType requestType) {
999 DPRINTF(RubyStats, "Recorded statistic: %s\n",
1000 SequencerRequestType_to_string(requestType));
1001 }
1002
1003 GPUCoalescer::IssueEvent::IssueEvent(GPUCoalescer* _seq)
1004 : Event(Progress_Event_Pri), seq(_seq)
1005 {
1006 }
1007
1008
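// Issue one Ruby request per cache line that received its first access
// this cycle, then fire any pending kernel-end callbacks.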
1009 void
1010 GPUCoalescer::completeIssue()
1011 {
1012 // newRequests has the cacheline addresses of all the
1013 // requests which need to be issued to the memory subsystem
1014 // in this cycle
1015 int len = newRequests.size();
1016 DPRINTF(GPUCoalescer, "Completing issue for %d new requests.\n", len);
1017 for (int i = 0; i < len; ++i) {
1018 // Get the requests from reqCoalescer table. Get only the
1019 // first request for each cacheline, the remaining requests
1020 // can be coalesced with the first request. So, only
1021 // one request is issued per cacheline.
1022 RequestDesc info = reqCoalescer[newRequests[i]][0];
1023 PacketPtr pkt = info.first;
1024 DPRINTF(GPUCoalescer, "Completing for newReq %d: paddr %#x\n",
1025 i, pkt->req->getPaddr());
1026 // Insert this request to the read/writeRequestTables. These tables
1027 // are used to track aliased requests in makeRequest subroutine
1028 bool found = insertRequest(pkt, info.second[PrimaryType]);
1029
1030 if (found) {
1031 panic("GPUCoalescer::makeRequest should never be called if the "
1032 "request is already outstanding\n");
1033 }
1034
1035 // Issue request to ruby subsystem
1036 issueRequest(pkt, info.second[SecondaryType]);
1037 }
1038 newRequests.clear();
1039
1040 // Check whether any Kernel End releases were issued this cycle
1041 len = newKernelEnds.size();
1042 for (int i = 0; i < len; i++) {
1043 kernelCallback(newKernelEnds[i]);
1044 }
1045 newKernelEnds.clear();
1046 }
1047
1048 void
1049 GPUCoalescer::IssueEvent::process()
1050 {
1051 seq->completeIssue();
1052 }
1053
1054 const char *
1055 GPUCoalescer::IssueEvent::description() const
1056 {
1057 return "Issue coalesced request";
1058 }
1059
1060 void
1061 GPUCoalescer::evictionCallback(Addr address)
1062 {
1063 ruby_eviction_callback(address);
1064 }
1065
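// Complete the kernel-end packet recorded for this wavefront and drop it
// from kernelEndList.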
1066 void
1067 GPUCoalescer::kernelCallback(int wavefront_id)
1068 {
1069 assert(kernelEndList.count(wavefront_id));
1070
1071 ruby_hit_callback(kernelEndList[wavefront_id]);
1072
1073 kernelEndList.erase(wavefront_id);
1074 }
1075
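// Completion path for atomics: the request lives in the write table, no
// MRU update is done (atomics complete in memory), and returning atomics
// copy the pre-op data back into each coalesced packet.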
1076 void
1077 GPUCoalescer::atomicCallback(Addr address,
1078 MachineType mach,
1079 const DataBlock& data)
1080 {
1081 assert(address == makeLineAddress(address));
1082
1083 DPRINTF(GPUCoalescer, "atomic callback for address %#x\n", address);
1084 assert(m_writeRequestTable.count(makeLineAddress(address)));
1085
1086 RequestTable::iterator i = m_writeRequestTable.find(address);
1087 assert(i != m_writeRequestTable.end());
1088 GPUCoalescerRequest* srequest = i->second;
1089
1090 m_writeRequestTable.erase(i);
1091 markRemoved();
1092
1093 assert((srequest->m_type == RubyRequestType_ATOMIC) ||
1094 (srequest->m_type == RubyRequestType_ATOMIC_RETURN) ||
1095 (srequest->m_type == RubyRequestType_ATOMIC_NO_RETURN));
1096
1097
1098 // Atomics don't write to cache, so there is no MRU update...
1099
1100 recordMissLatency(srequest, mach,
1101 srequest->issue_time, Cycles(0), Cycles(0), true, false);
1102
1103 PacketPtr pkt = srequest->pkt;
1104 Addr request_address = pkt->getAddr();
1105 Addr request_line_address = makeLineAddress(pkt->getAddr());
1106
1107 int len = reqCoalescer[request_line_address].size();
1108 std::vector<PacketPtr> mylist;
1109 for (int i = 0; i < len; ++i) {
1110 PacketPtr pkt = reqCoalescer[request_line_address][i].first;
1111 assert(srequest->m_type ==
1112 reqCoalescer[request_line_address][i].second[PrimaryType]);
1113 request_address = (pkt->getAddr());
1114 request_line_address = makeLineAddress(request_address);
1115 if (pkt->getPtr<uint8_t>() &&
1116 srequest->m_type != RubyRequestType_ATOMIC_NO_RETURN) {
1117 /* atomics are done in memory, and return the data *before* the atomic op... */
1118 memcpy(pkt->getPtr<uint8_t>(),
1119 data.getData(getOffset(request_address),
1120 pkt->getSize()),
1121 pkt->getSize());
1122 } else {
1123 DPRINTF(MemoryAccess,
1124 "WARNING. Data not transfered from Ruby to M5 for type " \
1125 "%s\n",
1126 RubyRequestType_to_string(srequest->m_type));
1127 }
1128
1129 // If using the RubyTester, update the RubyTester sender state's
1130 // subBlock with the received data. The tester will later access
1131 // this state.
1132 // Note: RubyPort will access its sender state before the
1133 // RubyTester.
1134 if (m_usingRubyTester) {
1135 RubyPort::SenderState *requestSenderState =
1136 safe_cast<RubyPort::SenderState*>(pkt->senderState);
1137 RubyTester::SenderState* testerSenderState =
1138 safe_cast<RubyTester::SenderState*>(requestSenderState->predecessor);
1139 testerSenderState->subBlock.mergeFrom(data);
1140 }
1141
1142 mylist.push_back(pkt);
1143 }
1144 delete srequest;
1145 reqCoalescer.erase(request_line_address);
1146 assert(!reqCoalescer.count(request_line_address));
1147
1148 completeHitCallback(mylist, len);
1149 }
1150
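// Classify a CP load completion by responder: local TCP hit, remote TCP
// transfer, TCC hit, or miss.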
1151 void
1152 GPUCoalescer::recordCPReadCallBack(MachineID myMachID, MachineID senderMachID)
1153 {
1154 if (myMachID == senderMachID) {
1155 CP_TCPLdHits++;
1156 } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
1157 CP_TCPLdTransfers++;
1158 } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
1159 CP_TCCLdHits++;
1160 } else {
1161 CP_LdMiss++;
1162 }
1163 }
1164
1165 void
1166 GPUCoalescer::recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID)
1167 {
1168 if (myMachID == senderMachID) {
1169 CP_TCPStHits++;
1170 } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
1171 CP_TCPStTransfers++;
1172 } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
1173 CP_TCCStHits++;
1174 } else {
1175 CP_StMiss++;
1176 }
1177 }
1178
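// Restore each packet's original sender state and return it to the slave
// port that issued it, retrying any ports that were blocked.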
1179 void
1180 GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist, int len)
1181 {
1182 for (int i = 0; i < len; ++i) {
1183 RubyPort::SenderState *ss =
1184 safe_cast<RubyPort::SenderState *>(mylist[i]->senderState);
1185 MemSlavePort *port = ss->port;
1186 assert(port != NULL);
1187
1188 mylist[i]->senderState = ss->predecessor;
1189 delete ss;
1190 port->hitCallback(mylist[i]);
1191 trySendRetries();
1192 }
1193
1194 testDrainComplete();
1195 }
1196
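// Look up the packet associated with an outstanding read to the given
// address; the entry must exist.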
1197 PacketPtr
1198 GPUCoalescer::mapAddrToPkt(Addr address)
1199 {
1200 RequestTable::iterator i = m_readRequestTable.find(address);
1201 assert(i != m_readRequestTable.end());
1202 GPUCoalescerRequest* request = i->second;
1203 return request->pkt;
1204 }
1205
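// Record per-request latency statistics: total latency histograms by
// request type, per-machine miss latencies, the four request/response
// delay components when the timestamps are monotonic, and the RfO-only
// hit/miss counters keyed on the responding machine type.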
1206 void
1207 GPUCoalescer::recordMissLatency(GPUCoalescerRequest* srequest,
1208 MachineType mach,
1209 Cycles initialRequestTime,
1210 Cycles forwardRequestTime,
1211 Cycles firstResponseTime,
1212 bool success, bool isRegion)
1213 {
1214 RubyRequestType type = srequest->m_type;
1215 Cycles issued_time = srequest->issue_time;
1216 Cycles completion_time = curCycle();
1217 assert(completion_time >= issued_time);
1218 Cycles total_lat = completion_time - issued_time;
1219
1220 // cache stats (valid for RfO protocol only)
1221 if (mach == MachineType_TCP) {
1222 if (type == RubyRequestType_LD) {
1223 GPU_TCPLdHits++;
1224 } else {
1225 GPU_TCPStHits++;
1226 }
1227 } else if (mach == MachineType_L1Cache_wCC) {
1228 if (type == RubyRequestType_LD) {
1229 GPU_TCPLdTransfers++;
1230 } else {
1231 GPU_TCPStTransfers++;
1232 }
1233 } else if (mach == MachineType_TCC) {
1234 if (type == RubyRequestType_LD) {
1235 GPU_TCCLdHits++;
1236 } else {
1237 GPU_TCCStHits++;
1238 }
1239 } else {
1240 if (type == RubyRequestType_LD) {
1241 GPU_LdMiss++;
1242 } else {
1243 GPU_StMiss++;
1244 }
1245 }
1246
1247 // Profile all access latency, even zero latency accesses
1248 m_latencyHist.sample(total_lat);
1249 m_typeLatencyHist[type]->sample(total_lat);
1250
1251 // Profile the miss latency for all non-zero demand misses
1252 if (total_lat != Cycles(0)) {
1253 m_missLatencyHist.sample(total_lat);
1254 m_missTypeLatencyHist[type]->sample(total_lat);
1255
1256 if (mach != MachineType_NUM) {
1257 m_missMachLatencyHist[mach]->sample(total_lat);
1258 m_missTypeMachLatencyHist[type][mach]->sample(total_lat);
1259
1260 if ((issued_time <= initialRequestTime) &&
1261 (initialRequestTime <= forwardRequestTime) &&
1262 (forwardRequestTime <= firstResponseTime) &&
1263 (firstResponseTime <= completion_time)) {
1264
1265 m_IssueToInitialDelayHist[mach]->sample(
1266 initialRequestTime - issued_time);
1267 m_InitialToForwardDelayHist[mach]->sample(
1268 forwardRequestTime - initialRequestTime);
1269 m_ForwardToFirstResponseDelayHist[mach]->sample(
1270 firstResponseTime - forwardRequestTime);
1271 m_FirstResponseToCompletionDelayHist[mach]->sample(
1272 completion_time - firstResponseTime);
1273 }
1274 }
1275
1276 }
1277
1278 DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %d cycles\n",
1279 curTick(), m_version, "Coal",
1280 success ? "Done" : "SC_Failed", "", "",
1281 printAddress(srequest->pkt->getAddr()), total_lat);
1282 }
1283
1284 void
1285 GPUCoalescer::regStats()
1286 {
1287 // These statistical variables are not for display.
1288 // The profiler will collate these across different
1289 // coalescers and display those collated statistics.
1290 m_outstandReqHist.init(10);
1291 m_latencyHist.init(10);
1292 m_missLatencyHist.init(10);
1293
1294 for (int i = 0; i < RubyRequestType_NUM; i++) {
1295 m_typeLatencyHist.push_back(new Stats::Histogram());
1296 m_typeLatencyHist[i]->init(10);
1297
1298 m_missTypeLatencyHist.push_back(new Stats::Histogram());
1299 m_missTypeLatencyHist[i]->init(10);
1300 }
1301
1302 for (int i = 0; i < MachineType_NUM; i++) {
1303 m_missMachLatencyHist.push_back(new Stats::Histogram());
1304 m_missMachLatencyHist[i]->init(10);
1305
1306 m_IssueToInitialDelayHist.push_back(new Stats::Histogram());
1307 m_IssueToInitialDelayHist[i]->init(10);
1308
1309 m_InitialToForwardDelayHist.push_back(new Stats::Histogram());
1310 m_InitialToForwardDelayHist[i]->init(10);
1311
1312 m_ForwardToFirstResponseDelayHist.push_back(new Stats::Histogram());
1313 m_ForwardToFirstResponseDelayHist[i]->init(10);
1314
1315 m_FirstResponseToCompletionDelayHist.push_back(new Stats::Histogram());
1316 m_FirstResponseToCompletionDelayHist[i]->init(10);
1317 }
1318
1319 for (int i = 0; i < RubyRequestType_NUM; i++) {
1320 m_missTypeMachLatencyHist.push_back(std::vector<Stats::Histogram *>());
1321
1322 for (int j = 0; j < MachineType_NUM; j++) {
1323 m_missTypeMachLatencyHist[i].push_back(new Stats::Histogram());
1324 m_missTypeMachLatencyHist[i][j]->init(10);
1325 }
1326 }
1327
1328 // GPU cache stats
1329 GPU_TCPLdHits
1330 .name(name() + ".gpu_tcp_ld_hits")
1331 .desc("loads that hit in the TCP")
1332 ;
1333 GPU_TCPLdTransfers
1334 .name(name() + ".gpu_tcp_ld_transfers")
1335 .desc("TCP to TCP load transfers")
1336 ;
1337 GPU_TCCLdHits
1338 .name(name() + ".gpu_tcc_ld_hits")
1339 .desc("loads that hit in the TCC")
1340 ;
1341 GPU_LdMiss
1342 .name(name() + ".gpu_ld_misses")
1343 .desc("loads that miss in the GPU")
1344 ;
1345
1346 GPU_TCPStHits
1347 .name(name() + ".gpu_tcp_st_hits")
1348 .desc("stores that hit in the TCP")
1349 ;
1350 GPU_TCPStTransfers
1351 .name(name() + ".gpu_tcp_st_transfers")
1352 .desc("TCP to TCP store transfers")
1353 ;
1354 GPU_TCCStHits
1355 .name(name() + ".gpu_tcc_st_hits")
1356 .desc("stores that hit in the TCC")
1357 ;
1358 GPU_StMiss
1359 .name(name() + ".gpu_st_misses")
1360 .desc("stores that miss in the GPU")
1361 ;
1362
1363 // CP cache stats
1364 CP_TCPLdHits
1365 .name(name() + ".cp_tcp_ld_hits")
1366 .desc("loads that hit in the TCP")
1367 ;
1368 CP_TCPLdTransfers
1369 .name(name() + ".cp_tcp_ld_transfers")
1370 .desc("TCP to TCP load transfers")
1371 ;
1372 CP_TCCLdHits
1373 .name(name() + ".cp_tcc_ld_hits")
1374 .desc("loads that hit in the TCC")
1375 ;
1376 CP_LdMiss
1377 .name(name() + ".cp_ld_misses")
1378 .desc("loads that miss in the GPU")
1379 ;
1380
1381 CP_TCPStHits
1382 .name(name() + ".cp_tcp_st_hits")
1383 .desc("stores that hit in the TCP")
1384 ;
1385 CP_TCPStTransfers
1386 .name(name() + ".cp_tcp_st_transfers")
1387 .desc("TCP to TCP store transfers")
1388 ;
1389 CP_TCCStHits
1390 .name(name() + ".cp_tcc_st_hits")
1391 .desc("stores that hit in the TCC")
1392 ;
1393 CP_StMiss
1394 .name(name() + ".cp_st_misses")
1395 .desc("stores that miss in the GPU")
1396 ;
1397 }