src/gpu-compute/global_memory_pipeline.cc
/*
 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#define __STDC_FORMAT_MACROS
#include <cinttypes>

#include "debug/GPUCoalescer.hh"
#include "debug/GPUMem.hh"
#include "debug/GPUReg.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/global_memory_pipeline.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"

GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams* p,
                                     ComputeUnit &cu)
    : computeUnit(cu), _name(cu.name() + ".GlobalMemPipeline"),
      gmQueueSize(p->global_mem_queue_size),
      maxWaveRequests(p->max_wave_requests), inflightStores(0),
      inflightLoads(0)
{
}

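// Late initialization: cache the shader's global memory size, which is
// presumably not yet available when this pipeline is constructed.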
void
GlobalMemPipeline::init()
{
    globalMemSize = computeUnit.shader->globalMemSize;
}

bool
GlobalMemPipeline::coalescerReady(GPUDynInstPtr mp) const
{
    // We require one token from the coalescer's uncoalesced table to
    // proceed
    int token_count = 1;

    // Make sure the vector port has tokens. There is a single pool
    // of tokens so only one port in the vector port needs to be checked.
    // Lane 0 is chosen arbitrarily.
    DPRINTF(GPUCoalescer, "Checking for %d tokens\n", token_count);
    if (!mp->computeUnit()->getTokenManager()->haveTokens(token_count)) {
        DPRINTF(GPUCoalescer, "Stalling inst because coalescer is busy!\n");
        return false;
    }

    return true;
}

void
GlobalMemPipeline::acqCoalescerToken(GPUDynInstPtr mp)
{
    // We require one token from the coalescer's uncoalesced table to
    // proceed
    int token_count = 1;

    DPRINTF(GPUCoalescer, "Acquiring %d token(s)\n", token_count);
    assert(mp->computeUnit()->getTokenManager()->haveTokens(token_count));
    mp->computeUnit()->getTokenManager()->acquireTokens(token_count);
}

bool
GlobalMemPipeline::outstandingReqsCheck(GPUDynInstPtr mp) const
{
    // Ensure we haven't exceeded the maximum number of vmem requests
    // for this wavefront
    if ((mp->wavefront()->outstandingReqsRdGm
         + mp->wavefront()->outstandingReqsWrGm) >= maxWaveRequests) {
        return false;
    }

    return true;
}

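// Advance the pipeline by one step: first retire the oldest completed
// response (writing any load/atomic return data back to the VRF), then
// issue the next queued request to the memory system, subject to the
// inflight load/store queue limits.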
void
GlobalMemPipeline::exec()
{
    // apply any returned global memory operations
    GPUDynInstPtr m = getNextReadyResp();

    bool accessVrf = true;
    Wavefront *w = nullptr;

    // check the VRF to see if the operands of a load (or load component
    // of an atomic) are accessible
    if (m && (m->isLoad() || m->isAtomicRet())) {
        w = m->wavefront();

        accessVrf = w->computeUnit->vrf[w->simdId]->
            canScheduleWriteOperandsFromLoad(w, m);

    }

    if (m && m->latency.rdy() && computeUnit.glbMemToVrfBus.rdy() &&
        accessVrf && (computeUnit.shader->coissue_return ||
        computeUnit.vectorGlobalMemUnit.rdy())) {

        w = m->wavefront();

        DPRINTF(GPUMem, "CU%d: WF[%d][%d]: Completing global mem instr %s\n",
                m->cu_id, m->simdId, m->wfSlotId, m->disassemble());
        m->completeAcc(m);
        w->decVMemInstsIssued();

        if (m->isLoad() || m->isAtomicRet()) {
            w->computeUnit->vrf[w->simdId]->
                scheduleWriteOperandsFromLoad(w, m);
        }

        completeRequest(m);

        Tick accessTime = curTick() - m->getAccessTime();

        // Decrement outstanding requests count
        computeUnit.shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
        if (m->isStore() || m->isAtomic() || m->isMemSync()) {
            computeUnit.shader->sampleStore(accessTime);
            computeUnit.shader->ScheduleAdd(&w->outstandingReqsWrGm,
                                            m->time, -1);
        }

        if (m->isLoad() || m->isAtomic() || m->isMemSync()) {
            computeUnit.shader->sampleLoad(accessTime);
            computeUnit.shader->ScheduleAdd(&w->outstandingReqsRdGm,
                                            m->time, -1);
        }

        w->validateRequestCounters();

        // Generate stats for round-trip time for vector memory insts
        // going all the way to memory and stats for individual cache
        // blocks generated by the instruction.
        m->profileRoundTripTime(curTick(), InstMemoryHop::Complete);
        computeUnit.shader->sampleInstRoundTrip(m->getRoundTripTime());
        computeUnit.shader->sampleLineRoundTrip(m->getLineAddressTime());

        // Mark write bus busy for appropriate amount of time
        computeUnit.glbMemToVrfBus.set(m->time);
        if (!computeUnit.shader->coissue_return)
            w->computeUnit->vectorGlobalMemUnit.set(m->time);
    }

    // If the pipeline has a global memory instruction waiting to issue,
    // check queue occupancy and then issue its memory packets to the
    // DTLB / memory system.
    if (!gmIssuedRequests.empty()) {
        GPUDynInstPtr mp = gmIssuedRequests.front();
        if (mp->isLoad() || mp->isAtomic()) {
            if (inflightLoads >= gmQueueSize) {
                return;
            } else {
                ++inflightLoads;
            }
        } else if (mp->isStore()) {
            if (inflightStores >= gmQueueSize) {
                return;
            } else {
                ++inflightStores;
            }
        }

        DPRINTF(GPUCoalescer, "initiateAcc for %s seqNum %d\n",
                mp->disassemble(), mp->seqNum());
        mp->initiateAcc(mp);

        if (((mp->isMemSync() && !mp->isEndOfKernel()) || !mp->isMemSync())) {
            /**
             * if we are not in out-of-order data delivery mode
             * then we keep the responses sorted in program order.
             * in order to do so we must reserve an entry in the
             * resp buffer before we issue the request to the mem
             * system. mem fence requests will not be stored here
             * because once they are issued from the GM pipeline,
             * they do not send any response back to it.
             */
            gmOrderedRespBuffer.insert(std::make_pair(mp->seqNum(),
                std::make_pair(mp, false)));
        }

        if (!mp->isMemSync() && !mp->isEndOfKernel() && mp->allLanesZero()) {
            /**
             * Memory access instructions that do not generate any memory
             * requests (such as out-of-bounds buffer accesses where all lanes
             * are out of bounds) will not trigger a callback to complete the
             * request, so we need to mark it as completed as soon as it is
             * issued. Note that this will still insert an entry in the
             * ordered return FIFO such that waitcnt is still resolved
             * correctly.
             */
            handleResponse(mp);
            computeUnit.getTokenManager()->recvTokens(1);
        }

        gmIssuedRequests.pop();

        DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping 0 mem_op = \n",
                computeUnit.cu_id, mp->simdId, mp->wfSlotId);
    }
}

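// Return the oldest outstanding memory instruction, but only if its
// response has already arrived (its entry in gmOrderedRespBuffer is
// marked done); otherwise return nullptr. Because the buffer is ordered
// by sequence number, responses are consumed in program order.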
GPUDynInstPtr
GlobalMemPipeline::getNextReadyResp()
{
    if (!gmOrderedRespBuffer.empty()) {
        auto mem_req = gmOrderedRespBuffer.begin();

        if (mem_req->second.second) {
            return mem_req->second.first;
        }
    }

    return nullptr;
}

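// Retire a completed instruction: free its inflight load/store slot and
// erase its entry from the head of the ordered response buffer.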
void
GlobalMemPipeline::completeRequest(GPUDynInstPtr gpuDynInst)
{
    if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
        assert(inflightLoads > 0);
        --inflightLoads;
    } else if (gpuDynInst->isStore()) {
        assert(inflightStores > 0);
        --inflightStores;
    }

    // we should only pop the oldest request, and it
    // should be marked as done if we are here
    assert(gmOrderedRespBuffer.begin()->first == gpuDynInst->seqNum());
    assert(gmOrderedRespBuffer.begin()->second.first == gpuDynInst);
    assert(gmOrderedRespBuffer.begin()->second.second);
    // remove this instruction from the buffer by its
    // unique seq ID
    gmOrderedRespBuffer.erase(gpuDynInst->seqNum());
}

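// Timestamp the instruction and enqueue it; exec() later pops it from
// gmIssuedRequests and initiates the actual memory access.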
void
GlobalMemPipeline::issueRequest(GPUDynInstPtr gpuDynInst)
{
    gpuDynInst->setAccessTime(curTick());
    gpuDynInst->profileRoundTripTime(curTick(), InstMemoryHop::Initiate);
    gmIssuedRequests.push(gpuDynInst);
}

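// Mark the instruction's entry in the ordered response buffer as done so
// that exec() can retire it once it reaches the head of the buffer.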
void
GlobalMemPipeline::handleResponse(GPUDynInstPtr gpuDynInst)
{
    auto mem_req = gmOrderedRespBuffer.find(gpuDynInst->seqNum());
    // if we are getting a response for this mem request,
    // then it ought to already be in the ordered response
    // buffer
    assert(mem_req != gmOrderedRespBuffer.end());
    mem_req->second.second = true;
}

void
GlobalMemPipeline::regStats()
{
    loadVrfBankConflictCycles
        .name(name() + ".load_vrf_bank_conflict_cycles")
        .desc("total number of cycles GM data are delayed before updating "
              "the VRF")
        ;
}