/*
 * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Sooraj Puthoor
 */

#include "base/misc.hh"
#include "base/str.hh"
#include "config/the_isa.hh"

#if THE_ISA == X86_ISA
#include "arch/x86/insts/microldstop.hh"

#endif // X86_ISA

#include "mem/ruby/system/GPUCoalescer.hh"

#include "cpu/testers/rubytest/RubyTester.hh"
#include "debug/GPUCoalescer.hh"
#include "debug/MemoryAccess.hh"
#include "debug/ProtocolTrace.hh"
#include "debug/RubyPort.hh"
#include "debug/RubyStats.hh"
#include "gpu-compute/shader.hh"
#include "mem/packet.hh"
#include "mem/ruby/common/DataBlock.hh"
#include "mem/ruby/common/SubBlock.hh"
#include "mem/ruby/network/MessageBuffer.hh"
#include "mem/ruby/profiler/Profiler.hh"
#include "mem/ruby/slicc_interface/AbstractController.hh"
#include "mem/ruby/slicc_interface/RubyRequest.hh"
#include "mem/ruby/structures/CacheMemory.hh"
#include "mem/ruby/system/RubySystem.hh"
#include "params/RubyGPUCoalescer.hh"

using namespace std;

GPUCoalescer *
RubyGPUCoalescerParams::create()
{
    return new GPUCoalescer(this);
}

HSAScope
reqScopeToHSAScope(Request* req)
{
    HSAScope accessScope = HSAScope_UNSPECIFIED;
    if (req->isScoped()) {
        if (req->isWavefrontScope()) {
            accessScope = HSAScope_WAVEFRONT;
        } else if (req->isWorkgroupScope()) {
            accessScope = HSAScope_WORKGROUP;
        } else if (req->isDeviceScope()) {
            accessScope = HSAScope_DEVICE;
        } else if (req->isSystemScope()) {
            accessScope = HSAScope_SYSTEM;
        } else {
            fatal("Bad scope type");
        }
    }
    return accessScope;
}

HSASegment
reqSegmentToHSASegment(Request* req)
{
    HSASegment accessSegment = HSASegment_GLOBAL;

    if (req->isGlobalSegment()) {
        accessSegment = HSASegment_GLOBAL;
    } else if (req->isGroupSegment()) {
        accessSegment = HSASegment_GROUP;
    } else if (req->isPrivateSegment()) {
        accessSegment = HSASegment_PRIVATE;
    } else if (req->isKernargSegment()) {
        accessSegment = HSASegment_KERNARG;
    } else if (req->isReadonlySegment()) {
        accessSegment = HSASegment_READONLY;
    } else if (req->isSpillSegment()) {
        accessSegment = HSASegment_SPILL;
    } else if (req->isArgSegment()) {
        accessSegment = HSASegment_ARG;
    } else {
        fatal("Bad segment type");
    }

    return accessSegment;
}

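// Usage sketch (illustrative, with a hypothetical scoped request): a store
// whose Request answers isWorkgroupScope() and isGroupSegment() maps to
// (HSAScope_WORKGROUP, HSASegment_GROUP) through the two helpers above:
//
//     HSAScope scope = reqScopeToHSAScope(pkt->req);
//     HSASegment segment = reqSegmentToHSASegment(pkt->req);
//
// An unscoped request falls through to HSAScope_UNSPECIFIED, while the
// segment helper fatals on an unrecognized segment.
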
GPUCoalescer::GPUCoalescer(const Params *p)
    : RubyPort(p), issueEvent(this), deadlockCheckEvent(this)
{
    m_store_waiting_on_load_cycles = 0;
    m_store_waiting_on_store_cycles = 0;
    m_load_waiting_on_store_cycles = 0;
    m_load_waiting_on_load_cycles = 0;

    m_outstanding_count = 0;

    m_max_outstanding_requests = 0;
    m_deadlock_threshold = 0;
    m_instCache_ptr = nullptr;
    m_dataCache_ptr = nullptr;

    m_instCache_ptr = p->icache;
    m_dataCache_ptr = p->dcache;
    m_max_outstanding_requests = p->max_outstanding_requests;
    m_deadlock_threshold = p->deadlock_threshold;

    assert(m_max_outstanding_requests > 0);
    assert(m_deadlock_threshold > 0);
    assert(m_instCache_ptr);
    assert(m_dataCache_ptr);

    m_data_cache_hit_latency = p->dcache_hit_latency;

    m_usingNetworkTester = p->using_network_tester;
    assumingRfOCoherence = p->assume_rfo;
}

GPUCoalescer::~GPUCoalescer()
{
}

void
GPUCoalescer::wakeup()
{
    // Check for deadlock of any of the requests
    Cycles current_time = curCycle();

    // Check across all outstanding requests
    int total_outstanding = 0;

    RequestTable::iterator read = m_readRequestTable.begin();
    RequestTable::iterator read_end = m_readRequestTable.end();
    for (; read != read_end; ++read) {
        GPUCoalescerRequest* request = read->second;
        if (current_time - request->issue_time < m_deadlock_threshold)
            continue;

        panic("Possible Deadlock detected. Aborting!\n"
              "version: %d request.paddr: 0x%x m_readRequestTable: %d "
              "current time: %u issue_time: %d difference: %d\n", m_version,
              request->pkt->getAddr(), m_readRequestTable.size(),
              current_time * clockPeriod(), request->issue_time * clockPeriod(),
              (current_time - request->issue_time) * clockPeriod());
    }

    RequestTable::iterator write = m_writeRequestTable.begin();
    RequestTable::iterator write_end = m_writeRequestTable.end();
    for (; write != write_end; ++write) {
        GPUCoalescerRequest* request = write->second;
        if (current_time - request->issue_time < m_deadlock_threshold)
            continue;

        panic("Possible Deadlock detected. Aborting!\n"
              "version: %d request.paddr: 0x%x m_writeRequestTable: %d "
              "current time: %u issue_time: %d difference: %d\n", m_version,
              request->pkt->getAddr(), m_writeRequestTable.size(),
              current_time * clockPeriod(), request->issue_time * clockPeriod(),
              (current_time - request->issue_time) * clockPeriod());
    }

    total_outstanding += m_writeRequestTable.size();
    total_outstanding += m_readRequestTable.size();

    assert(m_outstanding_count == total_outstanding);

    if (m_outstanding_count > 0) {
        // If there are still outstanding requests, keep checking
        schedule(deadlockCheckEvent,
                 m_deadlock_threshold * clockPeriod() +
                 curTick());
    }
}

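// Timing sketch (hypothetical numbers): with m_deadlock_threshold = 50000
// cycles, a request issued at cycle 1000 trips the panic above only if
// wakeup() finds it still tabled at cycle 51000 or later; otherwise the
// deadlock check is simply rescheduled another m_deadlock_threshold cycles
// (scaled by clockPeriod()) into the future.
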
void
GPUCoalescer::resetStats()
{
    m_latencyHist.reset();
    m_missLatencyHist.reset();
    for (int i = 0; i < RubyRequestType_NUM; i++) {
        m_typeLatencyHist[i]->reset();
        m_missTypeLatencyHist[i]->reset();
        for (int j = 0; j < MachineType_NUM; j++) {
            m_missTypeMachLatencyHist[i][j]->reset();
        }
    }

    for (int i = 0; i < MachineType_NUM; i++) {
        m_missMachLatencyHist[i]->reset();

        m_IssueToInitialDelayHist[i]->reset();
        m_InitialToForwardDelayHist[i]->reset();
        m_ForwardToFirstResponseDelayHist[i]->reset();
        m_FirstResponseToCompletionDelayHist[i]->reset();
    }
}

void
GPUCoalescer::printProgress(ostream& out) const
{
}

RequestStatus
GPUCoalescer::getRequestStatus(PacketPtr pkt, RubyRequestType request_type)
{
    Addr line_addr = makeLineAddress(pkt->getAddr());

    if (!m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge())) {
        return RequestStatus_BufferFull;
    }

    if (m_controller->isBlocked(line_addr) &&
        request_type != RubyRequestType_Locked_RMW_Write) {
        return RequestStatus_Aliased;
    }

    if ((request_type == RubyRequestType_ST) ||
        (request_type == RubyRequestType_ATOMIC) ||
        (request_type == RubyRequestType_ATOMIC_RETURN) ||
        (request_type == RubyRequestType_ATOMIC_NO_RETURN) ||
        (request_type == RubyRequestType_RMW_Read) ||
        (request_type == RubyRequestType_RMW_Write) ||
        (request_type == RubyRequestType_Load_Linked) ||
        (request_type == RubyRequestType_Store_Conditional) ||
        (request_type == RubyRequestType_Locked_RMW_Read) ||
        (request_type == RubyRequestType_Locked_RMW_Write) ||
        (request_type == RubyRequestType_FLUSH)) {

        // Check if there is any outstanding read request for the same
        // cache line.
        if (m_readRequestTable.count(line_addr) > 0) {
            m_store_waiting_on_load_cycles++;
            return RequestStatus_Aliased;
        }

        if (m_writeRequestTable.count(line_addr) > 0) {
            // There is an outstanding write request for the cache line
            m_store_waiting_on_store_cycles++;
            return RequestStatus_Aliased;
        }
    } else {
        // Check if there is any outstanding write request for the same
        // cache line.
        if (m_writeRequestTable.count(line_addr) > 0) {
            m_load_waiting_on_store_cycles++;
            return RequestStatus_Aliased;
        }

        if (m_readRequestTable.count(line_addr) > 0) {
            // There is an outstanding read request for the cache line
            m_load_waiting_on_load_cycles++;
            return RequestStatus_Aliased;
        }
    }

    return RequestStatus_Ready;
}

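// Decision sketch (hypothetical addresses): with a read to line 0x1000
// outstanding in m_readRequestTable, an incoming ST to the same line returns
// RequestStatus_Aliased and bumps m_store_waiting_on_load_cycles, whereas a
// LD to untouched line 0x1040 with mandatory-queue slots free returns
// RequestStatus_Ready.
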
// sets the kernelEndList
void
GPUCoalescer::insertKernel(int wavefront_id, PacketPtr pkt)
{
    // It is unclear whether this can actually happen, but be careful
    // so that it does not become a simulator hang in the future.
    DPRINTF(GPUCoalescer, "inserting wf: %d to kernelEndlist\n", wavefront_id);
    assert(kernelEndList.count(wavefront_id) == 0);

    kernelEndList[wavefront_id] = pkt;
    DPRINTF(GPUCoalescer, "kernelEndList->size() = %d\n",
            kernelEndList.size());
}

// Insert the request on the correct request table.  Return true if
// the entry was already present.
bool
GPUCoalescer::insertRequest(PacketPtr pkt, RubyRequestType request_type)
{
    assert(getRequestStatus(pkt, request_type) == RequestStatus_Ready ||
           pkt->req->isLockedRMW() ||
           !m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge()));

    int total_outstanding M5_VAR_USED =
        m_writeRequestTable.size() + m_readRequestTable.size();

    assert(m_outstanding_count == total_outstanding);

    // See if we should schedule a deadlock check
    if (!deadlockCheckEvent.scheduled()) {
        schedule(deadlockCheckEvent, m_deadlock_threshold + curTick());
    }

    Addr line_addr = makeLineAddress(pkt->getAddr());
    if ((request_type == RubyRequestType_ST) ||
        (request_type == RubyRequestType_ATOMIC) ||
        (request_type == RubyRequestType_ATOMIC_RETURN) ||
        (request_type == RubyRequestType_ATOMIC_NO_RETURN) ||
        (request_type == RubyRequestType_RMW_Read) ||
        (request_type == RubyRequestType_RMW_Write) ||
        (request_type == RubyRequestType_Load_Linked) ||
        (request_type == RubyRequestType_Store_Conditional) ||
        (request_type == RubyRequestType_Locked_RMW_Read) ||
        (request_type == RubyRequestType_Locked_RMW_Write) ||
        (request_type == RubyRequestType_FLUSH)) {

        pair<RequestTable::iterator, bool> r =
            m_writeRequestTable.insert(RequestTable::value_type(line_addr,
                                       (GPUCoalescerRequest*) NULL));
        if (r.second) {
            RequestTable::iterator i = r.first;
            i->second = new GPUCoalescerRequest(pkt, request_type,
                                                curCycle());
            DPRINTF(GPUCoalescer,
                    "Inserting write request for paddr %#x for type %d\n",
                    pkt->req->getPaddr(), i->second->m_type);
            m_outstanding_count++;
        } else {
            return true;
        }
    } else {
        pair<RequestTable::iterator, bool> r =
            m_readRequestTable.insert(RequestTable::value_type(line_addr,
                                      (GPUCoalescerRequest*) NULL));
        if (r.second) {
            RequestTable::iterator i = r.first;
            i->second = new GPUCoalescerRequest(pkt, request_type,
                                                curCycle());
            DPRINTF(GPUCoalescer,
                    "Inserting read request for paddr %#x for type %d\n",
                    pkt->req->getPaddr(), i->second->m_type);
            m_outstanding_count++;
        } else {
            return true;
        }
    }

    m_outstandReqHist.sample(m_outstanding_count);

    total_outstanding = m_writeRequestTable.size() + m_readRequestTable.size();
    assert(m_outstanding_count == total_outstanding);

    return false;
}

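// Note on the insert-with-NULL idiom above: the table insert only succeeds
// when the line address is absent, so r.second distinguishes a fresh entry
// (allocate a GPUCoalescerRequest and bump m_outstanding_count) from an
// aliased line that is already outstanding (return true to the caller).
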
void
GPUCoalescer::markRemoved()
{
    m_outstanding_count--;
    assert(m_outstanding_count ==
           m_writeRequestTable.size() + m_readRequestTable.size());
}

void
GPUCoalescer::removeRequest(GPUCoalescerRequest* srequest)
{
    assert(m_outstanding_count ==
           m_writeRequestTable.size() + m_readRequestTable.size());

    Addr line_addr = makeLineAddress(srequest->pkt->getAddr());
    if ((srequest->m_type == RubyRequestType_ST) ||
        (srequest->m_type == RubyRequestType_RMW_Read) ||
        (srequest->m_type == RubyRequestType_RMW_Write) ||
        (srequest->m_type == RubyRequestType_Load_Linked) ||
        (srequest->m_type == RubyRequestType_Store_Conditional) ||
        (srequest->m_type == RubyRequestType_Locked_RMW_Read) ||
        (srequest->m_type == RubyRequestType_Locked_RMW_Write)) {
        m_writeRequestTable.erase(line_addr);
    } else {
        m_readRequestTable.erase(line_addr);
    }

    markRemoved();
}

bool
GPUCoalescer::handleLlsc(Addr address, GPUCoalescerRequest* request)
{
    // The success flag indicates whether the LLSC operation was successful.
    // LL ops will always succeed, but SC may fail if the cache line is no
    // longer locked.
    bool success = true;
    if (request->m_type == RubyRequestType_Store_Conditional) {
        if (!m_dataCache_ptr->isLocked(address, m_version)) {
            // For failed SC requests, indicate the failure to the cpu by
            // setting the extra data to zero.
            request->pkt->req->setExtraData(0);
            success = false;
        } else {
            // For successful SC requests, indicate the success to the cpu by
            // setting the extra data to one.
            request->pkt->req->setExtraData(1);
        }
        // Independent of success, all SC operations must clear the lock
        m_dataCache_ptr->clearLocked(address);
    } else if (request->m_type == RubyRequestType_Load_Linked) {
        // Note: To fully follow Alpha LLSC semantics, should the LL clear any
        // previously locked cache lines?
        m_dataCache_ptr->setLocked(address, m_version);
    } else if ((m_dataCache_ptr->isTagPresent(address)) &&
               (m_dataCache_ptr->isLocked(address, m_version))) {
        // Normal writes should clear the locked address
        m_dataCache_ptr->clearLocked(address);
    }
    return success;
}

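// LL/SC sketch (hypothetical sequence): a Load_Linked to address A locks
// (A, m_version); a later Store_Conditional to A succeeds (extra data = 1)
// only while that lock is still held. Any intervening normal write to A
// clears the lock, so the Store_Conditional would then fail (extra data = 0).
// Either way, the Store_Conditional clears the lock bit.
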
void
GPUCoalescer::writeCallback(Addr address, DataBlock& data)
{
    writeCallback(address, MachineType_NULL, data);
}

void
GPUCoalescer::writeCallback(Addr address,
                            MachineType mach,
                            DataBlock& data)
{
    writeCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
}

void
GPUCoalescer::writeCallback(Addr address,
                            MachineType mach,
                            DataBlock& data,
                            Cycles initialRequestTime,
                            Cycles forwardRequestTime,
                            Cycles firstResponseTime)
{
    writeCallback(address, mach, data,
                  initialRequestTime, forwardRequestTime, firstResponseTime,
                  false);
}

void
GPUCoalescer::writeCallback(Addr address,
                            MachineType mach,
                            DataBlock& data,
                            Cycles initialRequestTime,
                            Cycles forwardRequestTime,
                            Cycles firstResponseTime,
                            bool isRegion)
{
    assert(address == makeLineAddress(address));

    DPRINTF(GPUCoalescer, "write callback for address %#x\n", address);
    assert(m_writeRequestTable.count(makeLineAddress(address)));

    RequestTable::iterator i = m_writeRequestTable.find(address);
    assert(i != m_writeRequestTable.end());
    GPUCoalescerRequest* request = i->second;

    m_writeRequestTable.erase(i);
    markRemoved();

    assert((request->m_type == RubyRequestType_ST) ||
           (request->m_type == RubyRequestType_ATOMIC) ||
           (request->m_type == RubyRequestType_ATOMIC_RETURN) ||
           (request->m_type == RubyRequestType_ATOMIC_NO_RETURN) ||
           (request->m_type == RubyRequestType_RMW_Read) ||
           (request->m_type == RubyRequestType_RMW_Write) ||
           (request->m_type == RubyRequestType_Load_Linked) ||
           (request->m_type == RubyRequestType_Store_Conditional) ||
           (request->m_type == RubyRequestType_Locked_RMW_Read) ||
           (request->m_type == RubyRequestType_Locked_RMW_Write) ||
           (request->m_type == RubyRequestType_FLUSH));

    // For Alpha, properly handle LL, SC, and write requests with respect to
    // locked cache blocks.
    //
    // Not valid for Network_test protocol
    bool success = true;
    if (!m_usingNetworkTester)
        success = handleLlsc(address, request);

    if (request->m_type == RubyRequestType_Locked_RMW_Read) {
        m_controller->blockOnQueue(address, m_mandatory_q_ptr);
    } else if (request->m_type == RubyRequestType_Locked_RMW_Write) {
        m_controller->unblock(address);
    }

    hitCallback(request, mach, data, success,
                request->issue_time, forwardRequestTime, firstResponseTime,
                isRegion);
}

void
GPUCoalescer::readCallback(Addr address, DataBlock& data)
{
    readCallback(address, MachineType_NULL, data);
}

void
GPUCoalescer::readCallback(Addr address,
                           MachineType mach,
                           DataBlock& data)
{
    readCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
}

void
GPUCoalescer::readCallback(Addr address,
                           MachineType mach,
                           DataBlock& data,
                           Cycles initialRequestTime,
                           Cycles forwardRequestTime,
                           Cycles firstResponseTime)
{
    readCallback(address, mach, data,
                 initialRequestTime, forwardRequestTime, firstResponseTime,
                 false);
}

void
GPUCoalescer::readCallback(Addr address,
                           MachineType mach,
                           DataBlock& data,
                           Cycles initialRequestTime,
                           Cycles forwardRequestTime,
                           Cycles firstResponseTime,
                           bool isRegion)
{
    assert(address == makeLineAddress(address));
    assert(m_readRequestTable.count(makeLineAddress(address)));

    DPRINTF(GPUCoalescer, "read callback for address %#x\n", address);
    RequestTable::iterator i = m_readRequestTable.find(address);
    assert(i != m_readRequestTable.end());
    GPUCoalescerRequest* request = i->second;

    m_readRequestTable.erase(i);
    markRemoved();

    assert((request->m_type == RubyRequestType_LD) ||
           (request->m_type == RubyRequestType_IFETCH));

    hitCallback(request, mach, data, true,
                request->issue_time, forwardRequestTime, firstResponseTime,
                isRegion);
}

void
GPUCoalescer::hitCallback(GPUCoalescerRequest* srequest,
                          MachineType mach,
                          DataBlock& data,
                          bool success,
                          Cycles initialRequestTime,
                          Cycles forwardRequestTime,
                          Cycles firstResponseTime,
                          bool isRegion)
{
    PacketPtr pkt = srequest->pkt;
    Addr request_address = pkt->getAddr();
    Addr request_line_address = makeLineAddress(request_address);

    RubyRequestType type = srequest->m_type;

    // Set this cache entry to the most recently used
    if (type == RubyRequestType_IFETCH) {
        if (m_instCache_ptr->isTagPresent(request_line_address))
            m_instCache_ptr->setMRU(request_line_address);
    } else {
        if (m_dataCache_ptr->isTagPresent(request_line_address))
            m_dataCache_ptr->setMRU(request_line_address);
    }

    recordMissLatency(srequest, mach,
                      initialRequestTime,
                      forwardRequestTime,
                      firstResponseTime,
                      success, isRegion);
    // update the data
    //
    // MUST DO THIS FOR EACH REQUEST IN THE COALESCER
    int len = reqCoalescer[request_line_address].size();
    std::vector<PacketPtr> mylist;
    for (int i = 0; i < len; ++i) {
        PacketPtr pkt = reqCoalescer[request_line_address][i].first;
        assert(type ==
               reqCoalescer[request_line_address][i].second[PrimaryType]);
        request_address = pkt->getAddr();
        request_line_address = makeLineAddress(pkt->getAddr());
        if (pkt->getPtr<uint8_t>()) {
            if ((type == RubyRequestType_LD) ||
                (type == RubyRequestType_ATOMIC) ||
                (type == RubyRequestType_ATOMIC_RETURN) ||
                (type == RubyRequestType_IFETCH) ||
                (type == RubyRequestType_RMW_Read) ||
                (type == RubyRequestType_Locked_RMW_Read) ||
                (type == RubyRequestType_Load_Linked)) {
                memcpy(pkt->getPtr<uint8_t>(),
                       data.getData(getOffset(request_address),
                                    pkt->getSize()),
                       pkt->getSize());
            } else {
                data.setData(pkt->getPtr<uint8_t>(),
                             getOffset(request_address), pkt->getSize());
            }
        } else {
            DPRINTF(MemoryAccess,
                    "WARNING.  Data not transferred from Ruby to M5 for type " \
                    "%s\n",
                    RubyRequestType_to_string(type));
        }

        // If using the RubyTester, update the RubyTester sender state's
        // subBlock with the received data.  The tester will later access
        // this state.
        // Note: RubyPort will access its sender state before the
        // RubyTester.
        if (m_usingRubyTester) {
            RubyPort::SenderState* requestSenderState =
                safe_cast<RubyPort::SenderState*>(pkt->senderState);
            RubyTester::SenderState* testerSenderState =
                safe_cast<RubyTester::SenderState*>
                    (requestSenderState->predecessor);
            testerSenderState->subBlock.mergeFrom(data);
        }

        mylist.push_back(pkt);
    }
    delete srequest;
    reqCoalescer.erase(request_line_address);
    assert(!reqCoalescer.count(request_line_address));

    completeHitCallback(mylist, len);
}

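// Data-movement sketch: for load-like types (LD, IFETCH, ATOMIC*, RMW_Read,
// Locked_RMW_Read, Load_Linked) the loop above copies bytes out of the Ruby
// DataBlock into each coalesced packet; for store-like types it copies
// packet bytes into the block. Offsets are line-relative, e.g. a 4-byte
// packet at hypothetical address 0x1008 on line 0x1000 touches block bytes
// [8, 12).
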
bool
GPUCoalescer::empty() const
{
    return m_writeRequestTable.empty() && m_readRequestTable.empty();
}

// Analyzes the packet to see if this request can be coalesced.
// If request can be coalesced, this request is added to the reqCoalescer table
// and makeRequest returns RequestStatus_Issued;
// If this is the first request to a cacheline, request is added to both
// newRequests queue and to the reqCoalescer table; makeRequest
// returns RequestStatus_Issued.
// If there is a pending request to this cacheline and this request
// can't be coalesced, RequestStatus_Aliased is returned and
// the packet needs to be reissued.
RequestStatus
GPUCoalescer::makeRequest(PacketPtr pkt)
{
    // Check for GPU Barrier Kernel End or Kernel Begin
    // Leave these to be handled by the child class
    // Kernel End/Barrier = isFlush + isRelease
    // Kernel Begin = isFlush + isAcquire
    if (pkt->req->isKernel()) {
        if (pkt->req->isAcquire()) {
            // This is a Kernel Begin; leave handling to
            // virtual xCoalescer::makeRequest
            return RequestStatus_Issued;
        } else if (pkt->req->isRelease()) {
            // This is a Kernel End; leave handling to
            // virtual xCoalescer::makeRequest.
            // If we are here then we didn't call
            // a virtual version of this function,
            // so we will also schedule the callback.
            int wf_id = 0;
            if (pkt->req->hasContextId()) {
                wf_id = pkt->req->contextId();
            }
            insertKernel(wf_id, pkt);
            newKernelEnds.push_back(wf_id);
            if (!issueEvent.scheduled()) {
                schedule(issueEvent, curTick());
            }
            return RequestStatus_Issued;
        }
    }

    // If number of outstanding requests greater than the max allowed,
    // return RequestStatus_BufferFull. This logic can be extended to
    // support proper backpressure.
    if (m_outstanding_count >= m_max_outstanding_requests) {
        return RequestStatus_BufferFull;
    }

    RubyRequestType primary_type = RubyRequestType_NULL;
    RubyRequestType secondary_type = RubyRequestType_NULL;

    if (pkt->isLLSC()) {
        // Alpha LL/SC instructions need to be handled carefully by the cache
        // coherence protocol to ensure they follow the proper semantics. In
        // particular, by identifying the operations as atomic, the protocol
        // should understand that migratory sharing optimizations should not
        // be performed (i.e. a load between the LL and SC should not steal
        // away exclusive permission).
        if (pkt->isWrite()) {
            primary_type = RubyRequestType_Store_Conditional;
        } else {
            assert(pkt->isRead());
            primary_type = RubyRequestType_Load_Linked;
        }
        secondary_type = RubyRequestType_ATOMIC;
    } else if (pkt->req->isLockedRMW()) {
        // x86 locked instructions are translated to store cache coherence
        // requests because these requests should always be treated as read
        // exclusive operations and should leverage any migratory sharing
        // optimization built into the protocol.
        if (pkt->isWrite()) {
            primary_type = RubyRequestType_Locked_RMW_Write;
        } else {
            assert(pkt->isRead());
            primary_type = RubyRequestType_Locked_RMW_Read;
        }
        secondary_type = RubyRequestType_ST;
    } else if (pkt->isAtomicOp()) {
        // GPU Atomic Operation
        primary_type = RubyRequestType_ATOMIC;
        secondary_type = RubyRequestType_ATOMIC;
    } else if (pkt->isRead()) {
        if (pkt->req->isInstFetch()) {
            primary_type = secondary_type = RubyRequestType_IFETCH;
        } else {
#if THE_ISA == X86_ISA
            uint32_t flags = pkt->req->getFlags();
            bool storeCheck = flags &
                              (TheISA::StoreCheck << TheISA::FlagShift);
#else
            bool storeCheck = false;
#endif // X86_ISA
            if (storeCheck) {
                primary_type = RubyRequestType_RMW_Read;
                secondary_type = RubyRequestType_ST;
            } else {
                primary_type = secondary_type = RubyRequestType_LD;
            }
        }
    } else if (pkt->isWrite()) {
        // Note: M5 packets do not differentiate ST from RMW_Write
        primary_type = secondary_type = RubyRequestType_ST;
    } else if (pkt->isFlush()) {
        primary_type = secondary_type = RubyRequestType_FLUSH;
    } else if (pkt->req->isRelease() || pkt->req->isAcquire()) {
        if (assumingRfOCoherence) {
            // If we reached here, this request must be a memFence
            // and the protocol implements RfO, so the coalescer can
            // assume sequential consistency and schedule the callback.
            //
            // Currently the code implements fence callbacks
            // by reusing the mechanism for kernel completions.
            // This should be fixed.
            int wf_id = 0;
            if (pkt->req->hasContextId()) {
                wf_id = pkt->req->contextId();
            }
            insertKernel(wf_id, pkt);
            newKernelEnds.push_back(wf_id);
            if (!issueEvent.scheduled()) {
                schedule(issueEvent, curTick());
            }
            return RequestStatus_Issued;
        } else {
            // If not RfO, return issued here and let the child coalescer
            // take care of it.
            return RequestStatus_Issued;
        }
    } else {
        panic("Unsupported ruby packet type\n");
    }

    // Check if there is any pending request to this cache line from
    // previous cycles.
    // If there is a pending request, return aliased. Since coalescing
    // across time is not permitted, aliased requests are not coalesced.
    // If a request for this address has already been issued, we must block.
    RequestStatus status = getRequestStatus(pkt, primary_type);
    if (status != RequestStatus_Ready)
        return status;

    Addr line_addr = makeLineAddress(pkt->getAddr());

    // Check if this request can be coalesced with previous
    // requests from this cycle.
    if (!reqCoalescer.count(line_addr)) {
        // This is the first access to this cache line.
        // A new request to the memory subsystem has to be
        // made in the next cycle for this cache line, so
        // add this line addr to the "newRequests" queue
        newRequests.push_back(line_addr);

    // There was a request to this cache line in this cycle,
    // let us see if we can coalesce this request with the previous
    // requests from this cycle
    } else if (primary_type !=
               reqCoalescer[line_addr][0].second[PrimaryType]) {
        // can't coalesce loads, stores and atomics!
        return RequestStatus_Aliased;
    } else if (pkt->req->isLockedRMW() ||
               reqCoalescer[line_addr][0].first->req->isLockedRMW()) {
        // can't coalesce locked accesses, but can coalesce atomics!
        return RequestStatus_Aliased;
    } else if (pkt->req->hasContextId() && pkt->req->isRelease() &&
               pkt->req->contextId() !=
               reqCoalescer[line_addr][0].first->req->contextId()) {
        // can't coalesce releases from different wavefronts
        return RequestStatus_Aliased;
    }

    // in addition to the packet, we need to save both request types
    reqCoalescer[line_addr].push_back(
            RequestDesc(pkt, std::vector<RubyRequestType>()) );
    reqCoalescer[line_addr].back().second.push_back(primary_type);
    reqCoalescer[line_addr].back().second.push_back(secondary_type);
    if (!issueEvent.scheduled())
        schedule(issueEvent, curTick());
    // TODO: issue hardware prefetches here
    return RequestStatus_Issued;
}

void
GPUCoalescer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type)
{
    int proc_id = -1;
    if (pkt != NULL && pkt->req->hasContextId()) {
        proc_id = pkt->req->contextId();
    }

    // If valid, copy the pc to the ruby request
    Addr pc = 0;
    if (pkt->req->hasPC()) {
        pc = pkt->req->getPC();
    }

    // At the moment setting scopes only counts
    // for GPU spill space accesses,
    // which is pkt->req->isStack().
    // This scope is REPLACE since it
    // does not need to be flushed at the end
    // of a kernel; private and local may need
    // to be visible at the end of the kernel.
    HSASegment accessSegment = reqSegmentToHSASegment(pkt->req);
    HSAScope accessScope = reqScopeToHSAScope(pkt->req);

    Addr line_addr = makeLineAddress(pkt->getAddr());

    // Creating WriteMask that records written bytes
    // and atomic operations. This enables partial writes
    // and partial reads of those writes
    DataBlock dataBlock;
    dataBlock.clear();
    uint32_t blockSize = RubySystem::getBlockSizeBytes();
    std::vector<bool> accessMask(blockSize, false);
    std::vector< std::pair<int, AtomicOpFunctor*> > atomicOps;
    uint32_t tableSize = reqCoalescer[line_addr].size();
    for (int i = 0; i < tableSize; i++) {
        PacketPtr tmpPkt = reqCoalescer[line_addr][i].first;
        uint32_t tmpOffset = (tmpPkt->getAddr()) - line_addr;
        uint32_t tmpSize = tmpPkt->getSize();
        if (tmpPkt->isAtomicOp()) {
            std::pair<int, AtomicOpFunctor*> tmpAtomicOp(tmpOffset,
                                                         tmpPkt->getAtomicOp());
            atomicOps.push_back(tmpAtomicOp);
        } else if (tmpPkt->isWrite()) {
            dataBlock.setData(tmpPkt->getPtr<uint8_t>(),
                              tmpOffset, tmpSize);
        }
        for (int j = 0; j < tmpSize; j++) {
            accessMask[tmpOffset + j] = true;
        }
    }

    std::shared_ptr<RubyRequest> msg;
    if (pkt->isAtomicOp()) {
        msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
                              pkt->getPtr<uint8_t>(),
                              pkt->getSize(), pc, secondary_type,
                              RubyAccessMode_Supervisor, pkt,
                              PrefetchBit_No, proc_id, 100,
                              blockSize, accessMask,
                              dataBlock, atomicOps,
                              accessScope, accessSegment);
    } else {
        msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
                              pkt->getPtr<uint8_t>(),
                              pkt->getSize(), pc, secondary_type,
                              RubyAccessMode_Supervisor, pkt,
                              PrefetchBit_No, proc_id, 100,
                              blockSize, accessMask,
                              dataBlock,
                              accessScope, accessSegment);
    }

    DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %s\n",
             curTick(), m_version, "Coal", "Begin", "", "",
             printAddress(msg->getPhysicalAddress()),
             RubyRequestType_to_string(secondary_type));

    fatal_if(secondary_type == RubyRequestType_IFETCH,
             "there should not be any I-Fetch requests in the GPU Coalescer");

    // Send the message to the cache controller
    fatal_if(m_data_cache_hit_latency == 0,
             "should not have a latency of zero");

    assert(m_mandatory_q_ptr);
    m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency);
}

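// WriteMask sketch (hypothetical coalesced packets): a 4-byte store at line
// offset 0 and an 8-byte store at offset 16 set accessMask[0..3] and
// accessMask[16..23] and copy their payloads into dataBlock, while an atomic
// at offset 32 contributes (32, AtomicOpFunctor*) to atomicOps; the protocol
// then only interprets the masked bytes of the line.
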
template <class KEY, class VALUE>
std::ostream &
operator<<(ostream &out, const std::unordered_map<KEY, VALUE> &map)
{
    out << "[";
    for (auto i = map.begin(); i != map.end(); ++i)
        out << " " << i->first << "=" << i->second;
    out << " ]";

    return out;
}

void
GPUCoalescer::print(ostream& out) const
{
    out << "[GPUCoalescer: " << m_version
        << ", outstanding requests: " << m_outstanding_count
        << ", read request table: " << m_readRequestTable
        << ", write request table: " << m_writeRequestTable
        << "]";
}

// This can be called from setState whenever coherence permissions are
// upgraded; when invoked, coherence violations will be checked for the
// given block.
void
GPUCoalescer::checkCoherence(Addr addr)
{
#ifdef CHECK_COHERENCE
    m_ruby_system->checkGlobalCoherenceInvariant(addr);
#endif
}

void
GPUCoalescer::recordRequestType(SequencerRequestType requestType)
{
    DPRINTF(RubyStats, "Recorded statistic: %s\n",
            SequencerRequestType_to_string(requestType));
}

GPUCoalescer::IssueEvent::IssueEvent(GPUCoalescer* _seq)
    : Event(Progress_Event_Pri), seq(_seq)
{
}

void
GPUCoalescer::completeIssue()
{
    // newRequests has the cacheline addresses of all the
    // requests which need to be issued to the memory subsystem
    // in this cycle
    int len = newRequests.size();
    DPRINTF(GPUCoalescer, "Completing issue for %d new requests.\n", len);
    for (int i = 0; i < len; ++i) {
        // Get the requests from reqCoalescer table. Get only the
        // first request for each cacheline, the remaining requests
        // can be coalesced with the first request. So, only
        // one request is issued per cacheline.
        RequestDesc info = reqCoalescer[newRequests[i]][0];
        PacketPtr pkt = info.first;
        DPRINTF(GPUCoalescer, "Completing for newReq %d: paddr %#x\n",
                i, pkt->req->getPaddr());
        // Insert this request to the read/writeRequestTables. These tables
        // are used to track aliased requests in makeRequest subroutine
        bool found = insertRequest(pkt, info.second[PrimaryType]);

        if (found) {
            panic("GPUCoalescer::makeRequest should never be called if the "
                  "request is already outstanding\n");
        }

        // Issue request to ruby subsystem
        issueRequest(pkt, info.second[SecondaryType]);
    }
    newRequests.clear();

    // have Kernel End releases been issued this cycle
    len = newKernelEnds.size();
    for (int i = 0; i < len; i++) {
        kernelCallback(newKernelEnds[i]);
    }
    newKernelEnds.clear();
}

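// Issue sketch: if a cycle coalesced accesses to hypothetical lines 0x1000
// and 0x1040, completeIssue() walks newRequests, moves the first packet of
// each line into the read/write request tables, issues exactly one Ruby
// request per line, and finally drains any kernel-end callbacks queued in
// newKernelEnds.
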
void
GPUCoalescer::IssueEvent::process()
{
    seq->completeIssue();
}

const char *
GPUCoalescer::IssueEvent::description() const
{
    return "Issue coalesced request";
}

void
GPUCoalescer::evictionCallback(Addr address)
{
    ruby_eviction_callback(address);
}

void
GPUCoalescer::kernelCallback(int wavefront_id)
{
    assert(kernelEndList.count(wavefront_id));

    ruby_hit_callback(kernelEndList[wavefront_id]);

    kernelEndList.erase(wavefront_id);
}

void
GPUCoalescer::atomicCallback(Addr address,
                             MachineType mach,
                             const DataBlock& data)
{
    assert(address == makeLineAddress(address));

    DPRINTF(GPUCoalescer, "atomic callback for address %#x\n", address);
    assert(m_writeRequestTable.count(makeLineAddress(address)));

    RequestTable::iterator i = m_writeRequestTable.find(address);
    assert(i != m_writeRequestTable.end());
    GPUCoalescerRequest* srequest = i->second;

    m_writeRequestTable.erase(i);
    markRemoved();

    assert((srequest->m_type == RubyRequestType_ATOMIC) ||
           (srequest->m_type == RubyRequestType_ATOMIC_RETURN) ||
           (srequest->m_type == RubyRequestType_ATOMIC_NO_RETURN));

    // Atomics don't write to cache, so there is no MRU update...

    recordMissLatency(srequest, mach,
                      srequest->issue_time, Cycles(0), Cycles(0), true, false);

    PacketPtr pkt = srequest->pkt;
    Addr request_address = pkt->getAddr();
    Addr request_line_address = makeLineAddress(pkt->getAddr());

    int len = reqCoalescer[request_line_address].size();
    std::vector<PacketPtr> mylist;
    for (int i = 0; i < len; ++i) {
        PacketPtr pkt = reqCoalescer[request_line_address][i].first;
        assert(srequest->m_type ==
               reqCoalescer[request_line_address][i].second[PrimaryType]);
        request_address = (pkt->getAddr());
        request_line_address = makeLineAddress(request_address);
        if (pkt->getPtr<uint8_t>() &&
            srequest->m_type != RubyRequestType_ATOMIC_NO_RETURN) {
            /* atomics are done in memory, and return the data *before* the
               atomic op... */
            memcpy(pkt->getPtr<uint8_t>(),
                   data.getData(getOffset(request_address),
                                pkt->getSize()),
                   pkt->getSize());
        } else {
            DPRINTF(MemoryAccess,
                    "WARNING.  Data not transferred from Ruby to M5 for type " \
                    "%s\n",
                    RubyRequestType_to_string(srequest->m_type));
        }

        // If using the RubyTester, update the RubyTester sender state's
        // subBlock with the received data.  The tester will later access
        // this state.
        // Note: RubyPort will access its sender state before the
        // RubyTester.
        if (m_usingRubyTester) {
            RubyPort::SenderState* requestSenderState =
                safe_cast<RubyPort::SenderState*>(pkt->senderState);
            RubyTester::SenderState* testerSenderState =
                safe_cast<RubyTester::SenderState*>
                    (requestSenderState->predecessor);
            testerSenderState->subBlock.mergeFrom(data);
        }

        mylist.push_back(pkt);
    }
    delete srequest;
    reqCoalescer.erase(request_line_address);
    assert(!reqCoalescer.count(request_line_address));

    completeHitCallback(mylist, len);
}

void
GPUCoalescer::recordCPReadCallBack(MachineID myMachID, MachineID senderMachID)
{
    if (myMachID == senderMachID) {
        CP_TCPLdHits++;
    } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
        CP_TCPLdTransfers++;
    } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
        CP_TCCLdHits++;
    } else {
        CP_LdMiss++;
    }
}

void
GPUCoalescer::recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID)
{
    if (myMachID == senderMachID) {
        CP_TCPStHits++;
    } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
        CP_TCPStTransfers++;
    } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
        CP_TCCStHits++;
    } else {
        CP_StMiss++;
    }
}

void
GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist, int len)
{
    for (int i = 0; i < len; ++i) {
        RubyPort::SenderState *ss =
            safe_cast<RubyPort::SenderState *>(mylist[i]->senderState);
        MemSlavePort *port = ss->port;
        assert(port != NULL);

        mylist[i]->senderState = ss->predecessor;
        delete ss;
        port->hitCallback(mylist[i]);
        trySendRetries();
    }

    testDrainComplete();
}

PacketPtr
GPUCoalescer::mapAddrToPkt(Addr address)
{
    RequestTable::iterator i = m_readRequestTable.find(address);
    assert(i != m_readRequestTable.end());
    GPUCoalescerRequest* request = i->second;
    return request->pkt;
}

void
GPUCoalescer::recordMissLatency(GPUCoalescerRequest* srequest,
                                MachineType mach,
                                Cycles initialRequestTime,
                                Cycles forwardRequestTime,
                                Cycles firstResponseTime,
                                bool success, bool isRegion)
{
    RubyRequestType type = srequest->m_type;
    Cycles issued_time = srequest->issue_time;
    Cycles completion_time = curCycle();
    assert(completion_time >= issued_time);
    Cycles total_lat = completion_time - issued_time;

    // cache stats (valid for RfO protocol only)
    if (mach == MachineType_TCP) {
        if (type == RubyRequestType_LD) {
            GPU_TCPLdHits++;
        } else {
            GPU_TCPStHits++;
        }
    } else if (mach == MachineType_L1Cache_wCC) {
        if (type == RubyRequestType_LD) {
            GPU_TCPLdTransfers++;
        } else {
            GPU_TCPStTransfers++;
        }
    } else if (mach == MachineType_TCC) {
        if (type == RubyRequestType_LD) {
            GPU_TCCLdHits++;
        } else {
            GPU_TCCStHits++;
        }
    } else {
        if (type == RubyRequestType_LD) {
            GPU_LdMiss++;
        } else {
            GPU_StMiss++;
        }
    }

    // Profile all access latency, even zero latency accesses
    m_latencyHist.sample(total_lat);
    m_typeLatencyHist[type]->sample(total_lat);

    // Profile the miss latency for all non-zero demand misses
    if (total_lat != Cycles(0)) {
        m_missLatencyHist.sample(total_lat);
        m_missTypeLatencyHist[type]->sample(total_lat);

        if (mach != MachineType_NUM) {
            m_missMachLatencyHist[mach]->sample(total_lat);
            m_missTypeMachLatencyHist[type][mach]->sample(total_lat);

            if ((issued_time <= initialRequestTime) &&
                (initialRequestTime <= forwardRequestTime) &&
                (forwardRequestTime <= firstResponseTime) &&
                (firstResponseTime <= completion_time)) {

                m_IssueToInitialDelayHist[mach]->sample(
                    initialRequestTime - issued_time);
                m_InitialToForwardDelayHist[mach]->sample(
                    forwardRequestTime - initialRequestTime);
                m_ForwardToFirstResponseDelayHist[mach]->sample(
                    firstResponseTime - forwardRequestTime);
                m_FirstResponseToCompletionDelayHist[mach]->sample(
                    completion_time - firstResponseTime);
            }
        }
    }

    DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %d cycles\n",
             curTick(), m_version, "Coal",
             success ? "Done" : "SC_Failed", "", "",
             printAddress(srequest->pkt->getAddr()), total_lat);
}

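// Latency sketch (hypothetical cycles): issued_time = 100, initialRequestTime
// = 110, forwardRequestTime = 130, firstResponseTime = 160 and completion at
// 170 give total_lat = 70, decomposed into the four per-machine delay
// histograms as 10 + 20 + 30 + 10. Out-of-order timestamps skip the
// decomposition but still sample total_lat.
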
void
GPUCoalescer::regStats()
{
    // These statistical variables are not for display.
    // The profiler will collate these across different
    // coalescers and display those collated statistics.
    m_outstandReqHist.init(10);
    m_latencyHist.init(10);
    m_missLatencyHist.init(10);

    for (int i = 0; i < RubyRequestType_NUM; i++) {
        m_typeLatencyHist.push_back(new Stats::Histogram());
        m_typeLatencyHist[i]->init(10);

        m_missTypeLatencyHist.push_back(new Stats::Histogram());
        m_missTypeLatencyHist[i]->init(10);
    }

    for (int i = 0; i < MachineType_NUM; i++) {
        m_missMachLatencyHist.push_back(new Stats::Histogram());
        m_missMachLatencyHist[i]->init(10);

        m_IssueToInitialDelayHist.push_back(new Stats::Histogram());
        m_IssueToInitialDelayHist[i]->init(10);

        m_InitialToForwardDelayHist.push_back(new Stats::Histogram());
        m_InitialToForwardDelayHist[i]->init(10);

        m_ForwardToFirstResponseDelayHist.push_back(new Stats::Histogram());
        m_ForwardToFirstResponseDelayHist[i]->init(10);

        m_FirstResponseToCompletionDelayHist.push_back(new Stats::Histogram());
        m_FirstResponseToCompletionDelayHist[i]->init(10);
    }

    for (int i = 0; i < RubyRequestType_NUM; i++) {
        m_missTypeMachLatencyHist.push_back(std::vector<Stats::Histogram *>());

        for (int j = 0; j < MachineType_NUM; j++) {
            m_missTypeMachLatencyHist[i].push_back(new Stats::Histogram());
            m_missTypeMachLatencyHist[i][j]->init(10);
        }
    }

    // GPU cache stats
    GPU_TCPLdHits
        .name(name() + ".gpu_tcp_ld_hits")
        .desc("loads that hit in the TCP")
        ;
    GPU_TCPLdTransfers
        .name(name() + ".gpu_tcp_ld_transfers")
        .desc("TCP to TCP load transfers")
        ;
    GPU_TCCLdHits
        .name(name() + ".gpu_tcc_ld_hits")
        .desc("loads that hit in the TCC")
        ;
    GPU_LdMiss
        .name(name() + ".gpu_ld_misses")
        .desc("loads that miss in the GPU")
        ;

    GPU_TCPStHits
        .name(name() + ".gpu_tcp_st_hits")
        .desc("stores that hit in the TCP")
        ;
    GPU_TCPStTransfers
        .name(name() + ".gpu_tcp_st_transfers")
        .desc("TCP to TCP store transfers")
        ;
    GPU_TCCStHits
        .name(name() + ".gpu_tcc_st_hits")
        .desc("stores that hit in the TCC")
        ;
    GPU_StMiss
        .name(name() + ".gpu_st_misses")
        .desc("stores that miss in the GPU")
        ;

    // CP cache stats
    CP_TCPLdHits
        .name(name() + ".cp_tcp_ld_hits")
        .desc("loads that hit in the TCP")
        ;
    CP_TCPLdTransfers
        .name(name() + ".cp_tcp_ld_transfers")
        .desc("TCP to TCP load transfers")
        ;
    CP_TCCLdHits
        .name(name() + ".cp_tcc_ld_hits")
        .desc("loads that hit in the TCC")
        ;
    CP_LdMiss
        .name(name() + ".cp_ld_misses")
        .desc("loads that miss in the GPU")
        ;

    CP_TCPStHits
        .name(name() + ".cp_tcp_st_hits")
        .desc("stores that hit in the TCP")
        ;
    CP_TCPStTransfers
        .name(name() + ".cp_tcp_st_transfers")
        .desc("TCP to TCP store transfers")
        ;
    CP_TCCStHits
        .name(name() + ".cp_tcc_st_hits")
        .desc("stores that hit in the TCC")
        ;
    CP_StMiss
        .name(name() + ".cp_st_misses")
        .desc("stores that miss in the GPU")
        ;
}