/*
 * Copyright (c) 2014-2017 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
34 #include "gpu-compute/fetch_unit.hh"
36 #include "debug/GPUFetch.hh"
37 #include "debug/GPUPort.hh"
38 #include "debug/GPUTLB.hh"
39 #include "gpu-compute/compute_unit.hh"
40 #include "gpu-compute/gpu_dyn_inst.hh"
41 #include "gpu-compute/gpu_static_inst.hh"
42 #include "gpu-compute/shader.hh"
43 #include "gpu-compute/wavefront.hh"
44 #include "mem/ruby/system/RubySystem.hh"
// Definition of the static member declared in fetch_unit.hh.
uint32_t FetchUnit::globalFetchUnitID;
48 FetchUnit::FetchUnit(const ComputeUnitParams
*p
, ComputeUnit
&cu
)
49 : timingSim(true), computeUnit(cu
), fetchScheduler(p
),
50 waveList(nullptr), fetchDepth(p
->fetch_depth
)
54 FetchUnit::~FetchUnit()
57 fetchStatusQueue
.clear();
63 timingSim
= computeUnit
.shader
->timingSim
;
65 fetchStatusQueue
.resize(computeUnit
.shader
->n_wf
);
66 fetchBuf
.resize(computeUnit
.shader
->n_wf
, FetchBufDesc());
68 for (int i
= 0; i
< computeUnit
.shader
->n_wf
; ++i
) {
69 Wavefront
*wf
= waveList
->at(i
);
70 assert(wf
->wfSlotId
== i
);
71 fetchStatusQueue
[i
] = std::make_pair(wf
, false);
72 fetchBuf
[i
].allocateBuf(fetchDepth
, computeUnit
.cacheLineSize(), wf
);
73 fetchBuf
[i
].decoder(&decoder
);
76 fetchScheduler
.bindList(&fetchQueue
);
83 * now we check if any of the fetch buffers have
84 * buffered instruction data that can be decoded
85 * and sent to its wavefront's instruction buffer.
86 * then we check if any of the fetch buffer entries
87 * can be released. we only check if we can
90 for (auto &fetch_buf
: fetchBuf
) {
91 if (!fetch_buf
.hasFreeSpace()) {
92 fetch_buf
.checkWaveReleaseBuf();
94 if (fetch_buf
.hasFetchDataToProcess()) {
95 fetch_buf
.decodeInsts();
99 // re-evaluate waves which are marked as not ready for fetch
100 for (int j
= 0; j
< computeUnit
.shader
->n_wf
; ++j
) {
101 // Following code assumes 64-bit opertaion and all insts are
102 // represented by 64-bit pointers to inst objects.
103 Wavefront
*curWave
= fetchStatusQueue
[j
].first
;
106 // The wavefront has to be active, the IB occupancy has to be
107 // 4 or less instructions and it can not have any branches to
108 // prevent speculative instruction fetches
109 if (!fetchStatusQueue
[j
].second
) {
110 if ((curWave
->getStatus() == Wavefront::S_RUNNING
||
111 curWave
->getStatus() == Wavefront::S_WAITCNT
) &&
112 fetchBuf
[j
].hasFreeSpace() &&
113 !curWave
->stopFetch() &&
114 !curWave
->pendingFetch
) {
115 fetchQueue
.push_back(curWave
);
116 fetchStatusQueue
[j
].second
= true;
121 // Fetch only if there is some wave ready to be fetched
122 // An empty fetchQueue will cause the schedular to panic
123 if (fetchQueue
.size()) {
124 Wavefront
*waveToBeFetched
= fetchScheduler
.chooseWave();
125 waveToBeFetched
->pendingFetch
= true;
126 fetchStatusQueue
[waveToBeFetched
->wfSlotId
].second
= false;
127 initiateFetch(waveToBeFetched
);
132 FetchUnit::initiateFetch(Wavefront
*wavefront
)
134 assert(fetchBuf
.at(wavefront
->wfSlotId
).hasFreeSpace());
137 * calculate the virtual address to fetch from the SQC. the fetch
138 * buffer holds a configurable number of cache lines. we start
139 * fetching at the address of the cache line immediately following
140 * the buffered line(s).
142 Addr vaddr
= fetchBuf
.at(wavefront
->wfSlotId
).nextFetchAddr();
144 // this should already be aligned to a cache line
145 assert(vaddr
== makeLineAddress(vaddr
,
146 computeUnit
.getCacheLineBits()));
148 // shouldn't be fetching a line that is already buffered
149 assert(!fetchBuf
.at(wavefront
->wfSlotId
).pcBuffered(vaddr
));
151 fetchBuf
.at(wavefront
->wfSlotId
).reserveBuf(vaddr
);
153 DPRINTF(GPUFetch
, "CU%d: WF[%d][%d]: Id%d: Initiate fetch "
154 "from pc: %d %#x\n", computeUnit
.cu_id
, wavefront
->simdId
,
155 wavefront
->wfSlotId
, wavefront
->wfDynId
, wavefront
->pc(), vaddr
);
157 DPRINTF(GPUTLB
, "CU%d: WF[%d][%d]: Initiating fetch translation: %#x\n",
158 computeUnit
.cu_id
, wavefront
->simdId
, wavefront
->wfSlotId
, vaddr
);
160 // set up virtual request
161 RequestPtr req
= std::make_shared
<Request
>(
162 vaddr
, computeUnit
.cacheLineSize(), Request::INST_FETCH
,
163 computeUnit
.masterId(), 0, 0, nullptr);
165 PacketPtr pkt
= new Packet(req
, MemCmd::ReadReq
);
168 // SenderState needed on Return
169 pkt
->senderState
= new ComputeUnit::ITLBPort::SenderState(wavefront
);
171 // Sender State needed by TLB hierarchy
173 new TheISA::GpuTLB::TranslationState(BaseTLB::Execute
,
174 computeUnit
.shader
->gpuTc
,
175 false, pkt
->senderState
);
177 if (computeUnit
.sqcTLBPort
.isStalled()) {
178 assert(computeUnit
.sqcTLBPort
.retries
.size() > 0);
180 DPRINTF(GPUTLB
, "Failed to send TLB req for FETCH addr %#x\n",
183 computeUnit
.sqcTLBPort
.retries
.push_back(pkt
);
184 } else if (!computeUnit
.sqcTLBPort
.sendTimingReq(pkt
)) {
185 // Stall the data port;
186 // No more packet is issued till
187 // ruby indicates resources are freed by
188 // a recvReqRetry() call back on this port.
189 computeUnit
.sqcTLBPort
.stallPort();
191 DPRINTF(GPUTLB
, "Failed to send TLB req for FETCH addr %#x\n",
194 computeUnit
.sqcTLBPort
.retries
.push_back(pkt
);
196 DPRINTF(GPUTLB
, "sent FETCH translation request for %#x\n", vaddr
);
200 new TheISA::GpuTLB::TranslationState(BaseTLB::Execute
,
201 computeUnit
.shader
->gpuTc
);
203 computeUnit
.sqcTLBPort
.sendFunctional(pkt
);
205 TheISA::GpuTLB::TranslationState
*sender_state
=
206 safe_cast
<TheISA::GpuTLB::TranslationState
*>(pkt
->senderState
);
208 delete sender_state
->tlbEntry
;
210 // fetch the instructions from the SQC when we operate in
211 // functional mode only
212 fetch(pkt
, wavefront
);
217 FetchUnit::fetch(PacketPtr pkt
, Wavefront
*wavefront
)
219 assert(pkt
->req
->hasPaddr());
220 assert(pkt
->req
->hasSize());
222 DPRINTF(GPUFetch
, "CU%d: WF[%d][%d]: Fetch Access: %#x\n",
223 computeUnit
.cu_id
, wavefront
->simdId
, wavefront
->wfSlotId
,
224 pkt
->req
->getPaddr());
227 * this is necessary because the GPU TLB receives packets instead of
228 * requests. when the translation is complete, all relevent fields in
229 * the request will be populated, but not in the packet. here we create
230 * the new packet so we can set the size, addr, and proper flags.
232 PacketPtr oldPkt
= pkt
;
233 pkt
= new Packet(oldPkt
->req
, oldPkt
->cmd
);
237 * if we have not reserved an entry in the fetch buffer,
238 * stop fetching. this can happen due to a branch instruction
239 * flushing the fetch buffer while an ITLB or I-cache request is still
240 * pending, in the same cycle another instruction is trying to fetch.
242 if (!fetchBuf
.at(wavefront
->wfSlotId
).isReserved(pkt
->req
->getVaddr())) {
247 * we should have reserved an entry in the fetch buffer
248 * for this cache line. here we get the pointer to the
249 * entry used to buffer this request's line data.
251 pkt
->dataStatic(fetchBuf
.at(wavefront
->wfSlotId
)
252 .reservedBuf(pkt
->req
->getVaddr()));
254 // New SenderState for the memory access
255 pkt
->senderState
= new ComputeUnit::SQCPort::SenderState(wavefront
);
258 // translation is done. Send the appropriate timing memory request.
260 if (!computeUnit
.sqcPort
.sendTimingReq(pkt
)) {
261 computeUnit
.sqcPort
.retries
.push_back(std::make_pair(pkt
,
264 DPRINTF(GPUPort
, "CU%d: WF[%d][%d]: Fetch addr %#x failed!\n",
265 computeUnit
.cu_id
, wavefront
->simdId
, wavefront
->wfSlotId
,
266 pkt
->req
->getPaddr());
268 DPRINTF(GPUPort
, "CU%d: WF[%d][%d]: Fetch addr %#x sent!\n",
269 computeUnit
.cu_id
, wavefront
->simdId
, wavefront
->wfSlotId
,
270 pkt
->req
->getPaddr());
273 computeUnit
.sqcPort
.sendFunctional(pkt
);
274 processFetchReturn(pkt
);
279 FetchUnit::processFetchReturn(PacketPtr pkt
)
281 ComputeUnit::SQCPort::SenderState
*sender_state
=
282 safe_cast
<ComputeUnit::SQCPort::SenderState
*>(pkt
->senderState
);
284 Wavefront
*wavefront
= sender_state
->wavefront
;
286 DPRINTF(GPUFetch
, "CU%d: WF[%d][%d]: Fetch addr %#x returned "
287 "%d bytes!\n", computeUnit
.cu_id
, wavefront
->simdId
,
288 wavefront
->wfSlotId
, pkt
->req
->getPaddr(), pkt
->req
->getSize());
290 if (wavefront
->dropFetch
) {
291 assert(wavefront
->instructionBuffer
.empty());
292 assert(!fetchBuf
.at(wavefront
->wfSlotId
).hasFetchDataToProcess());
293 wavefront
->dropFetch
= false;
295 fetchBuf
.at(wavefront
->wfSlotId
).fetchDone(pkt
->req
->getVaddr());
298 wavefront
->pendingFetch
= false;
300 delete pkt
->senderState
;
305 FetchUnit::flushBuf(int wfSlotId
)
307 fetchBuf
.at(wfSlotId
).flushBuf();
311 FetchUnit::bindWaveList(std::vector
<Wavefront
*> *wave_list
)
313 waveList
= wave_list
;
318 FetchUnit::FetchBufDesc::allocateBuf(int fetch_depth
, int cache_line_size
,
322 fetchDepth
= fetch_depth
;
323 maxIbSize
= wavefront
->maxIbSize
;
324 cacheLineSize
= cache_line_size
;
325 maxFbSize
= cacheLineSize
* fetchDepth
;
327 // Calculate the number of bits to address a cache line
328 panic_if(!isPowerOf2(cacheLineSize
),
329 "Cache line size should be a power of two.");
330 cacheLineBits
= floorLog2(cacheLineSize
);
332 bufStart
= new uint8_t[maxFbSize
];
334 bufEnd
= bufStart
+ maxFbSize
;
336 for (int i
= 0; i
< fetchDepth
; ++i
) {
337 freeList
.emplace_back(readPtr
+ i
* cacheLineSize
);
342 FetchUnit::FetchBufDesc::flushBuf()
344 restartFromBranch
= true;
346 * free list may have some entries
347 * so we clear it here to avoid duplicates
354 for (int i
= 0; i
< fetchDepth
; ++i
) {
355 freeList
.push_back(bufStart
+ i
* cacheLineSize
);
358 DPRINTF(GPUFetch
, "WF[%d][%d]: Id%d Fetch dropped, flushing fetch "
359 "buffer\n", wavefront
->simdId
, wavefront
->wfSlotId
,
364 FetchUnit::FetchBufDesc::nextFetchAddr()
368 if (bufferedAndReservedLines()) {
369 Addr last_line_fetched
= 0;
370 if (!reservedLines()) {
372 * get the PC of the most recently fetched cache line,
373 * then return the address of the next line.
375 last_line_fetched
= bufferedPCs
.rbegin()->first
;
377 last_line_fetched
= reservedPCs
.rbegin()->first
;
380 next_line
= last_line_fetched
+ cacheLineSize
;
383 * should not be trying to fetch a line that has already
386 assert(bufferedPCs
.find(next_line
) == bufferedPCs
.end());
387 assert(reservedPCs
.find(next_line
) == reservedPCs
.end());
390 * we do not have any buffered cache lines yet, so we
391 * assume this is the initial fetch, or the first fetch
392 * after a branch, and get the PC directly from the WF.
393 * in the case of a branch, we may not start at the
394 * beginning of a cache line, so we adjust the readPtr by
395 * the current PC's offset from the start of the line.
397 next_line
= makeLineAddress(wavefront
->pc(), cacheLineBits
);
401 * if we are here we have no buffered lines. in the case we flushed
402 * the buffer due to a branch, we may need to start fetching from
403 * some offset from the start of the fetch buffer, so we adjust for
406 if (restartFromBranch
) {
407 restartFromBranch
= false;
409 = wavefront
->pc() - makeLineAddress(wavefront
->pc(),
411 readPtr
+= byte_offset
;
419 FetchUnit::FetchBufDesc::reserveBuf(Addr vaddr
)
421 // we should have free buffer space, and the line
422 // at vaddr should not already be cached.
423 assert(hasFreeSpace());
424 assert(bufferedPCs
.find(vaddr
) == bufferedPCs
.end());
425 assert(reservedPCs
.find(vaddr
) == reservedPCs
.end());
426 assert(bufferedAndReservedLines() < fetchDepth
);
428 DPRINTF(GPUFetch
, "WF[%d][%d]: Id%d reserved fetch buffer entry "
429 "for PC = %#x\n", wavefront
->simdId
, wavefront
->wfSlotId
,
430 wavefront
->wfDynId
, vaddr
);
433 * we reserve buffer space, by moving it out of the
434 * free list, however we do not mark the buffered
435 * line as valid until the fetch unit for this buffer
436 * has receieved the response from the memory system.
438 uint8_t *inst_buf
= freeList
.front();
439 reservedPCs
.emplace(vaddr
, inst_buf
);
440 freeList
.pop_front();
444 FetchUnit::FetchBufDesc::fetchDone(Addr vaddr
)
446 assert(bufferedPCs
.find(vaddr
) == bufferedPCs
.end());
447 DPRINTF(GPUFetch
, "WF[%d][%d]: Id%d done fetching for addr %#x\n",
448 wavefront
->simdId
, wavefront
->wfSlotId
,
449 wavefront
->wfDynId
, vaddr
);
452 * this address should have an entry reserved in the
453 * fetch buffer already, however it should be invalid
454 * until the fetch completes.
456 auto reserved_pc
= reservedPCs
.find(vaddr
);
457 assert(reserved_pc
!= reservedPCs
.end());
458 bufferedPCs
.emplace(vaddr
, reserved_pc
->second
);
460 if (readPtr
== bufEnd
) {
464 reserved_pc
->second
= nullptr;
465 reservedPCs
.erase(reserved_pc
);
469 FetchUnit::FetchBufDesc::hasFetchDataToProcess() const
471 return fetchBytesRemaining() >= sizeof(TheGpuISA::RawMachInst
);
475 FetchUnit::FetchBufDesc::checkWaveReleaseBuf()
477 Addr cur_wave_pc
= roundDown(wavefront
->pc(),
478 wavefront
->computeUnit
->cacheLineSize());
479 if (reservedPCs
.find(cur_wave_pc
) != reservedPCs
.end()) {
480 DPRINTF(GPUFetch
, "WF[%d][%d]: Id%d current wave PC(%#x) still "
481 "being fetched.\n", wavefront
->simdId
, wavefront
->wfSlotId
,
482 wavefront
->wfDynId
, cur_wave_pc
);
484 // should be reserved, but not buffered yet
485 assert(bufferedPCs
.find(cur_wave_pc
) == bufferedPCs
.end());
490 auto current_buffered_pc
= bufferedPCs
.find(cur_wave_pc
);
491 auto oldest_buffered_pc
= bufferedPCs
.begin();
493 DPRINTF(GPUFetch
, "WF[%d][%d]: Id%d checking if PC block addr = %#x"
494 "(PC = %#x) can be released.\n", wavefront
->simdId
,
495 wavefront
->wfSlotId
, wavefront
->wfDynId
, cur_wave_pc
,
500 for (const auto &buf_pc
: bufferedPCs
) {
501 DPRINTF(GPUFetch
, "PC[%d] = %#x\n", idx
, buf_pc
.first
);
506 // if we haven't buffered data for this PC, we shouldn't
507 // be fetching from it.
508 assert(current_buffered_pc
!= bufferedPCs
.end());
511 * we're using a std::map so the addresses are sorted. if this
512 * PC is not the oldest one in the map, we must be fetching from
513 * a newer block, and we can release the oldest PC's fetch buffer
514 * entry back to the free list.
516 if (current_buffered_pc
!= oldest_buffered_pc
) {
517 DPRINTF(GPUFetch
, "WF[%d][%d]: Id%d done fetching for PC = %#x, "
518 "removing it from the fetch buffer.\n", wavefront
->simdId
,
519 wavefront
->wfSlotId
, wavefront
->wfDynId
,
520 oldest_buffered_pc
->first
);
522 freeList
.emplace_back(oldest_buffered_pc
->second
);
523 oldest_buffered_pc
->second
= nullptr;
524 bufferedPCs
.erase(oldest_buffered_pc
);
525 DPRINTF(GPUFetch
, "WF[%d][%d]: Id%d has %d lines buffered.\n",
526 wavefront
->simdId
, wavefront
->wfSlotId
, wavefront
->wfDynId
,
532 FetchUnit::FetchBufDesc::decodeInsts()
540 while (wavefront
->instructionBuffer
.size() < maxIbSize
541 && hasFetchDataToProcess()) {
545 TheGpuISA::MachInst mach_inst
546 = reinterpret_cast<TheGpuISA::MachInst
>(readPtr
);
547 GPUStaticInst
*gpu_static_inst
= _decoder
->decode(mach_inst
);
548 readPtr
+= gpu_static_inst
->instSize();
550 assert(readPtr
<= bufEnd
);
552 GPUDynInstPtr gpu_dyn_inst
553 = std::make_shared
<GPUDynInst
>(wavefront
->computeUnit
,
554 wavefront
, gpu_static_inst
,
555 wavefront
->computeUnit
->
557 wavefront
->instructionBuffer
.push_back(gpu_dyn_inst
);
559 DPRINTF(GPUFetch
, "WF[%d][%d]: Id%ld decoded %s (%d bytes). "
560 "%d bytes remain.\n", wavefront
->simdId
,
561 wavefront
->wfSlotId
, wavefront
->wfDynId
,
562 gpu_static_inst
->disassemble(),
563 gpu_static_inst
->instSize(),
564 fetchBytesRemaining());
570 FetchUnit::FetchBufDesc::decodeSplitInst()
572 TheGpuISA::RawMachInst split_inst
= 0;
573 int dword_size
= sizeof(uint32_t);
574 int num_dwords
= sizeof(TheGpuISA::RawMachInst
) / dword_size
;
576 for (int i
= 0; i
< num_dwords
; ++i
) {
577 ((uint32_t*)(&split_inst
))[i
] = *reinterpret_cast<uint32_t*>(readPtr
);
578 if (readPtr
+ dword_size
>= bufEnd
) {
583 assert(readPtr
== bufStart
);
585 TheGpuISA::MachInst mach_inst
586 = reinterpret_cast<TheGpuISA::MachInst
>(&split_inst
);
587 GPUStaticInst
*gpu_static_inst
= _decoder
->decode(mach_inst
);
588 readPtr
+= (gpu_static_inst
->instSize() - dword_size
);
589 assert(readPtr
< bufEnd
);
591 GPUDynInstPtr gpu_dyn_inst
592 = std::make_shared
<GPUDynInst
>(wavefront
->computeUnit
,
593 wavefront
, gpu_static_inst
,
594 wavefront
->computeUnit
->
596 wavefront
->instructionBuffer
.push_back(gpu_dyn_inst
);
598 DPRINTF(GPUFetch
, "WF[%d][%d]: Id%d decoded split inst %s (%#x) "
599 "(%d bytes). %d bytes remain in %d buffered lines.\n",
600 wavefront
->simdId
, wavefront
->wfSlotId
, wavefront
->wfDynId
,
601 gpu_static_inst
->disassemble(), split_inst
,
602 gpu_static_inst
->instSize(), fetchBytesRemaining(),
607 FetchUnit::FetchBufDesc::splitDecode() const
610 * if a read of a raw instruction would go beyond the end
611 * of the fetch buffer, then we must perform a split decode.
613 bool is_split
= (readPtr
+ sizeof(TheGpuISA::RawMachInst
)) > bufEnd
;
619 FetchUnit::FetchBufDesc::fetchBytesRemaining() const
621 int bytes_remaining
= 0;
623 if (bufferedLines() && readPtr
!= bufEnd
) {
624 auto last_buf_pc
= bufferedPCs
.rbegin();
625 uint8_t *end_ptr
= last_buf_pc
->second
+ cacheLineSize
;
626 int byte_diff
= end_ptr
- readPtr
;
628 if (end_ptr
> readPtr
) {
629 bytes_remaining
= byte_diff
;
630 } else if (end_ptr
< readPtr
) {
631 bytes_remaining
= bufferedBytes() + byte_diff
;
635 assert(bytes_remaining
<= bufferedBytes());
636 return bytes_remaining
;