gpu-compute: Dropping fetches when no entry is reserved in the buffer
[gem5.git] / src / gpu-compute / lds_state.cc
/*
 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "gpu-compute/lds_state.hh"

#include <array>
#include <cstdio>
#include <cstdlib>

#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/shader.hh"

/**
 * the default constructor that works with SWIG
 */
LdsState::LdsState(const Params *params) :
    ClockedObject(params),
    tickEvent(this),
    cuPort(name() + ".port", this),
    maximumSize(params->size),
    range(params->range),
    bankConflictPenalty(params->bankConflictPenalty),
    banks(params->banks)
{
    fatal_if(params->banks <= 0,
             "Number of LDS banks should be a positive number");
    fatal_if((params->banks & (params->banks - 1)) != 0,
             "Number of LDS banks should be a power of 2");
    fatal_if(params->size <= 0,
             "cannot allocate an LDS with a size less than 1");
    fatal_if(params->size % 2,
             "the LDS size should be an even number");
}

/**
 * Needed by the SWIG compiler
 */
LdsState *
LdsStateParams::create()
{
    return new LdsState(this);
}

/**
 * set the parent and name based on the parent
 */
void
LdsState::setParent(ComputeUnit *x_parent)
{
    // the parent must be valid and should only be set once
    fatal_if(!x_parent, "x_parent should not be nullptr");
    fatal_if(x_parent == parent,
             "should not be setting the parent twice");

    parent = x_parent;
    _name = x_parent->name() + ".LdsState";
}

/**
 * recover the GPU dynamic instruction from the packet's sender state chain,
 * then count the bank conflicts it causes
 */
unsigned
LdsState::countBankConflicts(PacketPtr packet, unsigned *bankAccesses)
{
    Packet::SenderState *baseSenderState = packet->senderState;
    while (baseSenderState->predecessor) {
        baseSenderState = baseSenderState->predecessor;
    }
    const ComputeUnit::LDSPort::SenderState *senderState =
        dynamic_cast<ComputeUnit::LDSPort::SenderState *>(baseSenderState);

    fatal_if(!senderState,
             "did not get the right sort of sender state");

    GPUDynInstPtr gpuDynInst = senderState->getMemInst();

    return countBankConflicts(gpuDynInst, bankAccesses);
}

// Count the total number of bank conflicts for the local memory packet
unsigned
LdsState::countBankConflicts(GPUDynInstPtr gpuDynInst,
                             unsigned *numBankAccesses)
{
    int bank_conflicts = 0;
    std::vector<int> bank;
    // the number of LDS banks being touched by the memory instruction
    int numBanks = std::min(parent->wfSize(), banks);
    // if the wavefront size is larger than the number of LDS banks, we
    // need to iterate over all work items to calculate the total
    // number of bank conflicts
    int groups = (parent->wfSize() > numBanks) ?
        (parent->wfSize() / numBanks) : 1;
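    // for example, a 64-lane wavefront accessing 32 LDS banks would be
    // processed as two groups of 32 lanes (illustrative sizes, not defaults)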
    for (int i = 0; i < groups; i++) {
        // Address Array holding all the work item addresses of an instruction
        std::vector<Addr> addr_array;
        addr_array.resize(numBanks, 0);
        bank.clear();
        bank.resize(banks, 0);
        int max_bank = 0;

        // populate the address array for all active work items
        for (int j = 0; j < numBanks; j++) {
            if (gpuDynInst->exec_mask[(i*numBanks)+j]) {
                addr_array[j] = gpuDynInst->addr[(i*numBanks)+j];
            } else {
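                // inactive lanes get a sentinel address (max Addr) so they
                // are ignored by the conflict calculation below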
                addr_array[j] = std::numeric_limits<Addr>::max();
            }
        }

        if (gpuDynInst->isLoad() || gpuDynInst->isStore()) {
            // mask identical addresses
            for (int j = 0; j < numBanks; ++j) {
                for (int j0 = 0; j0 < j; j0++) {
                    if (addr_array[j] != std::numeric_limits<Addr>::max()
                            && addr_array[j] == addr_array[j0]) {
                        addr_array[j] = std::numeric_limits<Addr>::max();
                    }
                }
            }
        }
        // calculate bank conflicts
        for (int j = 0; j < numBanks; ++j) {
            if (addr_array[j] != std::numeric_limits<Addr>::max()) {
                int bankId = addr_array[j] % banks;
                bank[bankId]++;
                max_bank = std::max(max_bank, bank[bankId]);
                // Count the number of LDS banks accessed.
                // Since we have masked identical addresses all remaining
                // accesses will need to be serialized if they access
                // the same bank (bank conflict).
                (*numBankAccesses)++;
            }
        }
        bank_conflicts += max_bank;
    }
    panic_if(bank_conflicts > parent->wfSize(),
             "bank conflicts should not exceed the number of work items "
             "per instruction");
    return bank_conflicts;
}

/**
 * receive the packet from the CU
 */
bool
LdsState::CuSidePort::recvTimingReq(PacketPtr packet)
{
    return ownerLds->processPacket(packet);
}

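/**
 * extract the GPU dynamic instruction carried in the packet's sender state
 */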
GPUDynInstPtr
LdsState::getDynInstr(PacketPtr packet)
{
    ComputeUnit::LDSPort::SenderState *ss =
        dynamic_cast<ComputeUnit::LDSPort::SenderState *>(
            packet->senderState);
    return ss->getMemInst();
}

/**
 * process an incoming packet, add it to the return queue
 */
bool
LdsState::processPacket(PacketPtr packet)
{
    unsigned bankAccesses = 0;
    // the number of conflicts this packet will have when accessing the LDS
    unsigned bankConflicts = countBankConflicts(packet, &bankAccesses);
    // count the total number of physical LDS banks accessed
    parent->ldsBankAccesses += bankAccesses;
    // record the bank conflict distribution; a bankConflicts value of 1
    // means at most one access per bank, i.e., no conflicts, hence the -1
    parent->ldsBankConflictDist.sample(bankConflicts-1);

    GPUDynInstPtr dynInst = getDynInstr(packet);
    // account for the LDS bank conflict overhead
    int busLength = (dynInst->isLoad()) ? parent->loadBusLength() :
        (dynInst->isStore()) ? parent->storeBusLength() :
        parent->loadBusLength();
    // delay for accessing the LDS
    Tick processingTime =
        parent->cyclesToTicks(Cycles(bankConflicts * bankConflictPenalty)) +
        parent->cyclesToTicks(Cycles(busLength));
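    // as a sketch of this cost model: assuming a 2-cycle bankConflictPenalty
    // and a 4-cycle bus length, a packet with 3 bank conflicts would incur
    // 3 * 2 + 4 = 10 cycles of latency (illustrative values, not defaults)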
    // the return time is either (last packet in queue + delay) or
    // (now + delay), as determined by earliestReturnTime()
    Tick doneAt = earliestReturnTime() + processingTime;
    // then store it for processing
    return returnQueuePush(std::make_pair(doneAt, packet));
}

/**
 * add this to the queue of packets to be returned
 */
bool
LdsState::returnQueuePush(std::pair<Tick, PacketPtr> thePair)
{
    // TODO add time limits (e.g. one packet per cycle) and queue size limits
    // and implement flow control
    returnQueue.push(thePair);

    // if no wakeup is scheduled yet, process the queue now
    if (!tickEvent.scheduled()) {
        process();
    }

    return true;
}

/**
 * receive a packet in functional mode
 */
void
LdsState::CuSidePort::recvFunctional(PacketPtr pkt)
{
    fatal("not implemented");
}

/**
 * receive a retry for a response
 */
void
LdsState::CuSidePort::recvRespRetry()
{
    // TODO verify that this is the right way to do this
    assert(ownerLds->isRetryResp());
    ownerLds->setRetryResp(false);
    ownerLds->process();
}

/**
 * receive a retry
 */
void
LdsState::CuSidePort::recvRetry()
{
    fatal("not implemented");
}

/**
 * look for packets to return at this time
 */
bool
LdsState::process()
{
    Tick now = clockEdge();

    // send back completed packets
    while (!returnQueue.empty() && returnQueue.front().first <= now) {
        PacketPtr packet = returnQueue.front().second;

        ComputeUnit::LDSPort::SenderState *ss =
            dynamic_cast<ComputeUnit::LDSPort::SenderState *>(
                packet->senderState);

        GPUDynInstPtr gpuDynInst = ss->getMemInst();

        gpuDynInst->initiateAcc(gpuDynInst);

        packet->makeTimingResponse();

        returnQueue.pop();

        bool success = cuPort.sendTimingResp(packet);

        if (!success) {
            retryResp = true;
            panic("have not handled timing responses being NACK'd when sent "
                  "back");
        }
    }

    // determine the next wakeup time
    if (!returnQueue.empty()) {
        Tick next = returnQueue.front().first;

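        // a wakeup may already be pending for a later packet; pull it
        // forward if the head of the queue is due sooner, otherwise
        // schedule a fresh wakeup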
        if (tickEvent.scheduled()) {
            if (next < tickEvent.when()) {
                tickEvent.deschedule();
                tickEvent.schedule(next);
            }
        } else {
            tickEvent.schedule(next);
        }
    }

    return true;
}

/**
 * wake up at this time and perform specified actions
 */
void
LdsState::TickEvent::process()
{
    ldsState->process();
}