2 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
5 * For use for simulation and test purposes only
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
17 * 3. Neither the name of the copyright holder nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
33 * Author: Brad Beckmann, Sooraj Puthoor
36 #include "gpu-compute/fetch_unit.hh"
38 #include "debug/GPUFetch.hh"
39 #include "debug/GPUPort.hh"
40 #include "debug/GPUTLB.hh"
41 #include "gpu-compute/compute_unit.hh"
42 #include "gpu-compute/gpu_dyn_inst.hh"
43 #include "gpu-compute/gpu_static_inst.hh"
44 #include "gpu-compute/shader.hh"
45 #include "gpu-compute/wavefront.hh"
46 #include "mem/ruby/system/RubySystem.hh"
// Out-of-class definition of the FetchUnit::globalFetchUnitID data member.
// No explicit initializer is given here.
48 uint32_t FetchUnit::globalFetchUnitID
;
// Constructor. Receives the ComputeUnitParams pointer and passes it on to the
// fetchScheduler member in the member-initializer list.
// NOTE(review): the initializer list ends with a trailing comma in this
// extract, so further initializers and the constructor body are not visible
// here - confirm against the full file.
50 FetchUnit::FetchUnit(const ComputeUnitParams
* params
) :
53 fetchScheduler(params
),
// Destructor. Empties fetchStatusQueue.
// NOTE(review): braces and possibly other statements are missing from this
// extract (the embedded numbering skips lines).
58 FetchUnit::~FetchUnit()
61 fetchStatusQueue
.clear();
// Per-compute-unit initialisation of the fetch unit.
//
// cu: the ComputeUnit passed in by the caller.
// NOTE(review): no use of cu is visible in this extract, while computeUnit is
// dereferenced below; presumably cu is stored into computeUnit on a line that
// is missing here - confirm against the full file.
65 FetchUnit::init(ComputeUnit
*cu
)
// Mirror the shader's timingSim setting into this unit.
68 timingSim
= computeUnit
->shader
->timingSim
;
// One status entry per wavefront slot (shader->n_wf entries).
70 fetchStatusQueue
.resize(computeUnit
->shader
->n_wf
);
// Pair every wavefront from waveList with an initial flag of false.
// waveList is dereferenced here, so it must already have been set
// (bindWaveList() is the function in this file that assigns it).
72 for (int j
= 0; j
< computeUnit
->shader
->n_wf
; ++j
) {
73 fetchStatusQueue
[j
] = std::make_pair(waveList
->at(j
), false);
// Give the scheduler a pointer to fetchQueue, the list exec-time code pushes
// fetch candidates onto.
76 fetchScheduler
.bindList(&fetchQueue
);
// NOTE(review): the header of the function that owns this body is not visible
// in this extract (the embedded numbering jumps from 76 to 82); presumably it
// is FetchUnit::exec() - confirm against the full file.
//
// The visible body does two things:
//   1. scans every wavefront slot and queues the waves that qualify for a
//      fetch onto fetchQueue;
//   2. if fetchQueue is non-empty, asks fetchScheduler for one wave and
//      starts its fetch via initiateFetch().
82 // re-evaluate waves which are marked as not ready for fetch
83 for (int j
= 0; j
< computeUnit
->shader
->n_wf
; ++j
) {
84 // Following code assumes 64-bit operation and all insts are
85 // represented by 64-bit pointers to inst objects.
86 Wavefront
*curWave
= fetchStatusQueue
[j
].first
;
89 // The wavefront has to be active, the IB occupancy has to be
90 // 4 or less instructions and it can not have any branches to
91 // prevent speculative instruction fetches
// Only slots whose flag is still false are examined; a slot's flag is set to
// true once its wave has been pushed onto fetchQueue.
92 if (!fetchStatusQueue
[j
].second
) {
// A wave that already has pendingFetch set is also skipped.
93 if (curWave
->status
== Wavefront::S_RUNNING
&&
94 curWave
->instructionBuffer
.size() <= 4 &&
95 !curWave
->instructionBufferHasBranch() &&
96 !curWave
->pendingFetch
) {
97 fetchQueue
.push_back(curWave
);
98 fetchStatusQueue
[j
].second
= true;
103 // Fetch only if there is some wave ready to be fetched
104 // An empty fetchQueue will cause the scheduler to panic
105 if (fetchQueue
.size()) {
106 Wavefront
*waveToBeFetched
= fetchScheduler
.chooseWave();
// Mark the fetch as in flight and reset the slot's flag (indexed by the
// wave's wfSlotId) so the scan above will consider the slot again.
107 waveToBeFetched
->pendingFetch
= true;
108 fetchStatusQueue
[waveToBeFetched
->wfSlotId
].second
= false;
109 initiateFetch(waveToBeFetched
);
// Starts an instruction fetch for the given wavefront: computes the virtual
// fetch address, builds an INST_FETCH read request and packet for it, and
// hands the packet to the compute unit's sqcTLBPort for translation.
//
// wavefront: the wave to fetch for; its pc(), instructionBuffer and base_ptr
//            determine the fetch address.
//
// NOTE(review): lines are missing from this extract (the embedded numbering
// has gaps). In particular, the condition that selects between the timing
// path (sendTimingReq / retries) and the functional path (sendFunctional
// followed by fetch()) is not visible here - confirm against the full file.
114 FetchUnit::initiateFetch(Wavefront
*wavefront
)
116 // calculate the virtual address to fetch from the SQC
// First form an instruction index: the current pc plus the number of entries
// already held in the instruction buffer.
117 Addr vaddr
= wavefront
->pc() + wavefront
->instructionBuffer
.size();
// Then scale the index by sizeof(GPUStaticInst *) and offset it from the
// wavefront's base_ptr to get the address.
118 vaddr
= wavefront
->base_ptr
+ vaddr
* sizeof(GPUStaticInst
*);
120 DPRINTF(GPUTLB
, "CU%d: WF[%d][%d]: Initiating fetch translation: %#x\n",
121 computeUnit
->cu_id
, wavefront
->simdId
, wavefront
->wfSlotId
, vaddr
);
123 // Since this is an instruction prefetch, if you're split then just finish
124 // out the current line.
125 unsigned block_size
= RubySystem::getBlockSizeBytes();
126 // check for split accesses
// split_addr is (vaddr + block_size - 1) passed through roundDown() with
// block_size; assuming roundDown(x, a) rounds x down to a multiple of a
// (TODO confirm), it equals vaddr when vaddr is block-aligned and is the
// start of the next block otherwise.
127 Addr split_addr
= roundDown(vaddr
+ block_size
- 1, block_size
);
128 unsigned size
= block_size
;
130 if (split_addr
> vaddr
) {
131 // misaligned access, just grab the rest of the line
132 size
= split_addr
- vaddr
;
135 // set up virtual request
136 Request
*req
= new Request(0, vaddr
, size
, Request::INST_FETCH
,
137 computeUnit
->masterId(), 0, 0, 0);
139 PacketPtr pkt
= new Packet(req
, MemCmd::ReadReq
);
140 // This fetchBlock is kind of faux right now - because the translations so
141 // far don't actually return Data
143 pkt
->dataStatic(&fetchBlock
);
146 // SenderState needed on Return
147 pkt
->senderState
= new ComputeUnit::ITLBPort::SenderState(wavefront
);
149 // Sender State needed by TLB hierarchy
// A TranslationState is allocated here and given the packet's current
// senderState as its last constructor argument.
// NOTE(review): the statement that stores this new object is not visible in
// this extract.
151 new TheISA::GpuTLB::TranslationState(BaseTLB::Execute
,
152 computeUnit
->shader
->gpuTc
,
153 false, pkt
->senderState
);
// If the TLB port is already stalled, do not try to send: queue the packet on
// the port's retries list, which is asserted to be non-empty already.
155 if (computeUnit
->sqcTLBPort
->isStalled()) {
156 assert(computeUnit
->sqcTLBPort
->retries
.size() > 0);
158 DPRINTF(GPUTLB
, "Failed to send TLB req for FETCH addr %#x\n",
161 computeUnit
->sqcTLBPort
->retries
.push_back(pkt
);
162 } else if (!computeUnit
->sqcTLBPort
->sendTimingReq(pkt
)) {
163 // Stall the data port;
164 // No more packet is issued till
165 // ruby indicates resources are freed by
166 // a recvReqRetry() call back on this port.
167 computeUnit
->sqcTLBPort
->stallPort();
169 DPRINTF(GPUTLB
, "Failed to send TLB req for FETCH addr %#x\n",
// The rejected packet is kept on the retries list as well.
172 computeUnit
->sqcTLBPort
->retries
.push_back(pkt
);
174 DPRINTF(GPUTLB
, "sent FETCH translation request for %#x\n", vaddr
);
// Functional path: a TranslationState is allocated with only the mode and
// thread context, then the packet is sent through sendFunctional().
// NOTE(review): the statement that stores this new object is not visible in
// this extract.
178 new TheISA::GpuTLB::TranslationState(BaseTLB::Execute
,
179 computeUnit
->shader
->gpuTc
);
181 computeUnit
->sqcTLBPort
->sendFunctional(pkt
);
// After the functional send, the packet's senderState is read back as a
// TranslationState and the tlbEntry it carries is deleted.
183 TheISA::GpuTLB::TranslationState
*sender_state
=
184 safe_cast
<TheISA::GpuTLB::TranslationState
*>(pkt
->senderState
);
186 delete sender_state
->tlbEntry
;
188 // fetch the instructions from the SQC when we operate in
189 // functional mode only
190 fetch(pkt
, wavefront
);
// Issues the instruction read to the SQC once the request carries a physical
// address.
//
// pkt:       packet whose request must already have a physical address and a
//            size (both are asserted below).
// wavefront: the wave the fetch belongs to; it is recorded in the new
//            packet's SenderState.
//
// NOTE(review): lines are missing from this extract (the embedded numbering
// has gaps). The condition that selects between the timing send
// (sendTimingReq) and the functional send (sendFunctional followed by
// processFetchReturn()) is not visible, nor is whatever happens to oldPkt
// after the new packet is created - confirm against the full file.
195 FetchUnit::fetch(PacketPtr pkt
, Wavefront
*wavefront
)
197 assert(pkt
->req
->hasPaddr());
198 assert(pkt
->req
->hasSize());
200 DPRINTF(GPUFetch
, "CU%d: WF[%d][%d]: Fetch Access: %#x\n",
201 computeUnit
->cu_id
, wavefront
->simdId
, wavefront
->wfSlotId
,
202 pkt
->req
->getPaddr());
204 // this is necessary because the GPU TLB receives packets instead of
205 // requests. when the translation is complete, all relevant fields in the
206 // request will be populated, but not in the packet. here we create the
207 // new packet so we can set the size, addr, and proper flags.
208 PacketPtr oldPkt
= pkt
;
209 pkt
= new Packet(oldPkt
->req
, oldPkt
->cmd
);
// Allocate the buffer that will receive the fetched instructions: one
// RawMachInst per sizeof(RawMachInst) bytes of the request.
// NOTE(review): the division truncates, so if the request size is not a
// multiple of sizeof(RawMachInst) the buffer is smaller than the request
// size - confirm that cannot happen.
212 TheGpuISA::RawMachInst
*data
=
213 new TheGpuISA::RawMachInst
[pkt
->req
->getSize() /
214 sizeof(TheGpuISA::RawMachInst
)];
// Attach the buffer to the packet through dataDynamic().
// NOTE(review): presumably this transfers ownership of the array to the
// packet - confirm against the Packet API.
216 pkt
->dataDynamic
<TheGpuISA::RawMachInst
>(data
);
218 // New SenderState for the memory access
219 pkt
->senderState
= new ComputeUnit::SQCPort::SenderState(wavefront
);
222 // translation is done. Send the appropriate timing memory request.
// If the port rejects the packet it is queued on sqcPort->retries as the
// first element of a pair.
// NOTE(review): the second element of that pair is not visible in this
// extract.
224 if (!computeUnit
->sqcPort
->sendTimingReq(pkt
)) {
225 computeUnit
->sqcPort
->retries
.push_back(std::make_pair(pkt
,
228 DPRINTF(GPUPort
, "CU%d: WF[%d][%d]: Fetch addr %#x failed!\n",
229 computeUnit
->cu_id
, wavefront
->simdId
, wavefront
->wfSlotId
,
230 pkt
->req
->getPaddr());
232 DPRINTF(GPUPort
, "CU%d: WF[%d][%d]: Fetch addr %#x sent!\n",
233 computeUnit
->cu_id
, wavefront
->simdId
, wavefront
->wfSlotId
,
234 pkt
->req
->getPaddr());
// Functional send: the response is handled immediately afterwards by calling
// processFetchReturn() on the same packet.
237 computeUnit
->sqcPort
->sendFunctional(pkt
);
238 processFetchReturn(pkt
);
// Handles a returned instruction-fetch packet: decodes the fetched
// RawMachInst words, appends the resulting instructions to the owning
// wavefront's instruction buffer, and clears the wave's pendingFetch flag.
//
// pkt: the returned packet; its senderState must be a
//      ComputeUnit::SQCPort::SenderState, which identifies the wavefront.
//
// NOTE(review): lines are missing from this extract (the embedded numbering
// has gaps). Whether the decode loop sits in an else branch of the dropFetch
// check, and what is freed besides pkt->senderState, is not visible here -
// confirm against the full file.
243 FetchUnit::processFetchReturn(PacketPtr pkt
)
245 ComputeUnit::SQCPort::SenderState
*sender_state
=
246 safe_cast
<ComputeUnit::SQCPort::SenderState
*>(pkt
->senderState
);
248 Wavefront
*wavefront
= sender_state
->wavefront
;
250 DPRINTF(GPUFetch
, "CU%d: WF[%d][%d]: Fetch addr %#x returned "
251 "%d bytes, %d instructions!\n", computeUnit
->cu_id
,
252 wavefront
->simdId
, wavefront
->wfSlotId
, pkt
->req
->getPaddr(),
253 pkt
->req
->getSize(), pkt
->req
->getSize() /
254 sizeof(TheGpuISA::RawMachInst
));
// When the wave has dropFetch set, the instruction buffer is asserted to be
// empty and the flag is cleared.
256 if (wavefront
->dropFetch
) {
257 assert(wavefront
->instructionBuffer
.empty());
258 wavefront
->dropFetch
= false;
// View the packet's data bytes as an array of RawMachInst.
260 TheGpuISA::RawMachInst
*inst_index_ptr
=
261 (TheGpuISA::RawMachInst
*)pkt
->getPtr
<uint8_t>();
// The instruction buffer is asserted to hold at most 4 entries before new
// ones are appended.
263 assert(wavefront
->instructionBuffer
.size() <= 4);
// One iteration per whole RawMachInst in the request.
// NOTE(review): i is a signed int compared against an unsigned quotient
// (the divisor is a sizeof expression).
265 for (int i
= 0; i
< pkt
->req
->getSize() /
266 sizeof(TheGpuISA::RawMachInst
); ++i
) {
267 GPUStaticInst
*inst_ptr
= decoder
.decode(inst_index_ptr
[i
]);
270 DPRINTF(GPUFetch
, "CU%d: WF[%d][%d]: added %s\n",
271 computeUnit
->cu_id
, wavefront
->simdId
,
272 wavefront
->wfSlotId
, inst_ptr
->disassemble());
// Wrap the decoded static instruction in a GPUDynInst, passing the value
// returned by the compute unit's getAndIncSeqNum() as its last argument.
274 GPUDynInstPtr gpuDynInst
=
275 std::make_shared
<GPUDynInst
>(computeUnit
, wavefront
, inst_ptr
,
276 computeUnit
->getAndIncSeqNum());
278 wavefront
->instructionBuffer
.push_back(gpuDynInst
);
// Clear pendingFetch, the flag the wave-selection code in this file checks
// before queueing a wave for another fetch.
282 wavefront
->pendingFetch
= false;
284 delete pkt
->senderState
;
// Stores the caller's wavefront-list pointer in waveList. Only the pointer is
// kept; the vector itself is not copied.
//
// wave_list: pointer to the vector of Wavefront pointers; init() later reads
//            entries from it through waveList->at(j).
290 FetchUnit::bindWaveList(std::vector
<Wavefront
*> *wave_list
)
292 waveList
= wave_list
;