arch-gcn3, gpu-compute: Implement out-of-range accesses
[gem5.git] / src / gpu-compute / global_memory_pipeline.cc
/*
 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#define __STDC_FORMAT_MACROS
#include <cinttypes>

#include "debug/GPUCoalescer.hh"
#include "debug/GPUMem.hh"
#include "debug/GPUReg.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/global_memory_pipeline.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"

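// The pipeline's limits come from the compute unit's parameters:
// gmQueueSize bounds the number of in-flight loads and stores, and
// maxWaveRequests caps the vector memory requests a single wavefront
// may have outstanding at once.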
GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams* p) :
    computeUnit(nullptr), gmQueueSize(p->global_mem_queue_size),
    maxWaveRequests(p->max_wave_requests), inflightStores(0),
    inflightLoads(0)
{
}

void
GlobalMemPipeline::init(ComputeUnit *cu)
{
    computeUnit = cu;
    globalMemSize = computeUnit->shader->globalMemSize;
    _name = computeUnit->name() + ".GlobalMemPipeline";
}

bool
GlobalMemPipeline::coalescerReady(GPUDynInstPtr mp) const
{
    // We require one token from the coalescer's uncoalesced table to
    // proceed
    int token_count = 1;

    // Make sure the vector port has tokens. There is a single pool
    // of tokens so only one port in the vector port needs to be checked.
    // Lane 0 is chosen arbitrarily.
    DPRINTF(GPUCoalescer, "Checking for %d tokens\n", token_count);
    if (!mp->computeUnit()->getTokenManager()->haveTokens(token_count)) {
        DPRINTF(GPUCoalescer, "Stalling inst because coalescer is busy!\n");
        return false;
    }

    return true;
}

void
GlobalMemPipeline::acqCoalescerToken(GPUDynInstPtr mp)
{
    // We require one token from the coalescer's uncoalesced table to
    // proceed
    int token_count = 1;

    DPRINTF(GPUCoalescer, "Acquiring %d token(s)\n", token_count);
    assert(mp->computeUnit()->getTokenManager()->haveTokens(token_count));
    mp->computeUnit()->getTokenManager()->acquireTokens(token_count);
}

bool
GlobalMemPipeline::outstandingReqsCheck(GPUDynInstPtr mp) const
{
    // Ensure we haven't exceeded the maximum number of vmem requests
    // for this wavefront
    if ((mp->wavefront()->outstandingReqsRdGm
         + mp->wavefront()->outstandingReqsWrGm) >= maxWaveRequests) {
        return false;
    }

    return true;
}

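// exec() advances the pipeline by one step: it first tries to complete at
// most one returned memory operation (writing any load/atomic return data
// back to the VRF), then tries to issue at most one queued instruction from
// gmIssuedRequests to the memory system.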
void
GlobalMemPipeline::exec()
{
    // apply any returned global memory operations
    GPUDynInstPtr m = getNextReadyResp();

    bool accessVrf = true;
    Wavefront *w = nullptr;

    // check the VRF to see if the operands of a load (or load component
    // of an atomic) are accessible
    if (m && (m->isLoad() || m->isAtomicRet())) {
        w = m->wavefront();

        accessVrf = w->computeUnit->vrf[w->simdId]->
            canScheduleWriteOperandsFromLoad(w, m);
    }

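    // A response completes only when its modeled latency has elapsed, the
    // bus from global memory to the VRF is free, the VRF can accept the
    // register writes, and (unless returns may co-issue) the vector global
    // memory unit is available.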
    if (m && m->latency.rdy() && computeUnit->glbMemToVrfBus.rdy() &&
        accessVrf && (computeUnit->shader->coissue_return ||
        computeUnit->vectorGlobalMemUnit.rdy())) {

        w = m->wavefront();

        DPRINTF(GPUMem, "CU%d: WF[%d][%d]: Completing global mem instr %s\n",
                m->cu_id, m->simdId, m->wfSlotId, m->disassemble());
        m->completeAcc(m);

        if (m->isLoad() || m->isAtomicRet()) {
            w->computeUnit->vrf[w->simdId]->
                scheduleWriteOperandsFromLoad(w, m);
        }

        completeRequest(m);

        Tick accessTime = curTick() - m->getAccessTime();

        // Decrement outstanding requests count
        computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
        if (m->isStore() || m->isAtomic() || m->isMemSync()) {
            computeUnit->shader->sampleStore(accessTime);
            computeUnit->shader->ScheduleAdd(&w->outstandingReqsWrGm,
                                             m->time, -1);
        }

        if (m->isLoad() || m->isAtomic() || m->isMemSync()) {
            computeUnit->shader->sampleLoad(accessTime);
            computeUnit->shader->ScheduleAdd(&w->outstandingReqsRdGm,
                                             m->time, -1);
        }

        w->validateRequestCounters();

        // Generate stats for the round-trip time of vector memory insts
        // going all the way to memory, and stats for the individual cache
        // blocks generated by the instruction.
        m->profileRoundTripTime(curTick(), InstMemoryHop::Complete);
        computeUnit->shader->sampleInstRoundTrip(m->getRoundTripTime());
        computeUnit->shader->sampleLineRoundTrip(m->getLineAddressTime());

        // Mark the write bus busy for the appropriate amount of time
        computeUnit->glbMemToVrfBus.set(m->time);
        if (!computeUnit->shader->coissue_return)
            w->computeUnit->vectorGlobalMemUnit.set(m->time);
    }

    // Issue the next queued global memory instruction: execute the
    // instruction's access and issue its memory packets to the DTLB.
    if (!gmIssuedRequests.empty()) {
        GPUDynInstPtr mp = gmIssuedRequests.front();
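        // Throttle issue if the in-flight queue for this access type is
        // already full; otherwise claim a queue entry before issuing.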
        if (mp->isLoad() || mp->isAtomic()) {
            if (inflightLoads >= gmQueueSize) {
                return;
            } else {
                ++inflightLoads;
            }
        } else if (mp->isStore()) {
            if (inflightStores >= gmQueueSize) {
                return;
            } else {
                ++inflightStores;
            }
        }

        DPRINTF(GPUCoalescer, "initiateAcc for %s seqNum %d\n",
                mp->disassemble(), mp->seqNum());
        mp->initiateAcc(mp);

        if ((mp->isMemSync() && !mp->isEndOfKernel()) || !mp->isMemSync()) {
            /**
             * If we are not in out-of-order data delivery mode
             * then we keep the responses sorted in program order.
             * In order to do so we must reserve an entry in the
             * resp buffer before we issue the request to the mem
             * system. Mem fence requests will not be stored here
             * because once they are issued from the GM pipeline,
             * they do not send any response back to it.
             */
            gmOrderedRespBuffer.insert(std::make_pair(mp->seqNum(),
                std::make_pair(mp, false)));
        }

        if (!mp->isMemSync() && !mp->isEndOfKernel() && mp->allLanesZero()) {
            /**
             * Memory access instructions that do not generate any memory
             * requests (such as out-of-bounds buffer accesses where all
             * lanes are out of bounds) will not trigger a callback to
             * complete the request, so we need to mark it as completed as
             * soon as it is issued. Note that this will still insert an
             * entry in the ordered return FIFO so that waitcnt is still
             * resolved correctly.
             */
            handleResponse(mp);
            computeUnit->getTokenManager()->recvTokens(1);
        }

        gmIssuedRequests.pop();

        DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping mem_op\n",
                computeUnit->cu_id, mp->simdId, mp->wfSlotId);
    }
}

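// Return the oldest in-flight memory instruction, but only if its response
// has already arrived (i.e., it has been marked done by handleResponse());
// otherwise return nullptr so that completion stays in program order.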
GPUDynInstPtr
GlobalMemPipeline::getNextReadyResp()
{
    if (!gmOrderedRespBuffer.empty()) {
        auto mem_req = gmOrderedRespBuffer.begin();

        if (mem_req->second.second) {
            return mem_req->second.first;
        }
    }

    return nullptr;
}

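// Retire a completed instruction: release its slot in the in-flight
// load/store accounting and remove it from the ordered response buffer.
// Only the oldest, already-done entry may be retired.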
void
GlobalMemPipeline::completeRequest(GPUDynInstPtr gpuDynInst)
{
    if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
        assert(inflightLoads > 0);
        --inflightLoads;
    } else if (gpuDynInst->isStore()) {
        assert(inflightStores > 0);
        --inflightStores;
    }

    // we should only pop the oldest request, and it
    // should be marked as done if we are here
    assert(gmOrderedRespBuffer.begin()->first == gpuDynInst->seqNum());
    assert(gmOrderedRespBuffer.begin()->second.first == gpuDynInst);
    assert(gmOrderedRespBuffer.begin()->second.second);
    // remove this instruction from the buffer by its
    // unique seq ID
    gmOrderedRespBuffer.erase(gpuDynInst->seqNum());
}

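// Enqueue a new instruction for issue to the memory system. The access
// time is stamped here so the round-trip latency can be sampled when the
// instruction completes.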
void
GlobalMemPipeline::issueRequest(GPUDynInstPtr gpuDynInst)
{
    gpuDynInst->setAccessTime(curTick());
    gpuDynInst->profileRoundTripTime(curTick(), InstMemoryHop::Initiate);
    gmIssuedRequests.push(gpuDynInst);
}

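// Mark the ordered-buffer entry for this instruction as done so that
// getNextReadyResp() can hand it back for completion. exec() also calls
// this directly for accesses that generate no memory requests.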
void
GlobalMemPipeline::handleResponse(GPUDynInstPtr gpuDynInst)
{
    auto mem_req = gmOrderedRespBuffer.find(gpuDynInst->seqNum());
    // if we are getting a response for this mem request,
    // then it ought to already be in the ordered response
    // buffer
    assert(mem_req != gmOrderedRespBuffer.end());
    mem_req->second.second = true;
}

void
GlobalMemPipeline::regStats()
{
    loadVrfBankConflictCycles
        .name(name() + ".load_vrf_bank_conflict_cycles")
        .desc("total number of cycles GM data are delayed before updating "
              "the VRF")
        ;
}