ccf758b47379391995e6d97a913f6c3318f9c9d4
[gem5.git] / src / gpu-compute / lds_state.hh
1 /*
2 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * For use for simulation and test purposes only
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the copyright holder nor the names of its
18 * contributors may be used to endorse or promote products derived from this
19 * software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 *
33 * Authors: John Kalamatianos,
34 * Joe Gross
35 */
36
37 #ifndef __LDS_STATE_HH__
38 #define __LDS_STATE_HH__
39
#include <array>
#include <cstdint>
#include <cstring>
#include <queue>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#include "enums/MemType.hh"
#include "gpu-compute/misc.hh"
#include "mem/mem_object.hh"
#include "mem/port.hh"
#include "params/LdsState.hh"
52
53 class ComputeUnit;
54
55 /**
56 * this represents a slice of the overall LDS, intended to be associated with an
57 * individual workgroup
58 */
59 class LdsChunk
60 {
61 public:
62 LdsChunk(const uint32_t x_size):
63 chunk(x_size)
64 {
65 }
66
67 LdsChunk() {}
68
69 /**
70 * a read operation
71 */
72 template<class T>
73 T
74 read(const uint32_t index)
75 {
76 fatal_if(!chunk.size(), "cannot read from an LDS chunk of size 0");
77 fatal_if(index >= chunk.size(), "out-of-bounds access to an LDS chunk");
78 T *p0 = (T *) (&(chunk.at(index)));
79 return *p0;
80 }
81
82 /**
83 * a write operation
84 */
85 template<class T>
86 void
87 write(const uint32_t index, const T value)
88 {
89 fatal_if(!chunk.size(), "cannot write to an LDS chunk of size 0");
90 fatal_if(index >= chunk.size(), "out-of-bounds access to an LDS chunk");
91 T *p0 = (T *) (&(chunk.at(index)));
92 *p0 = value;
93 }
94
95 /**
96 * get the size of this chunk
97 */
98 std::vector<uint8_t>::size_type
99 size() const
100 {
101 return chunk.size();
102 }
103
104 protected:
105 // the actual data store for this slice of the LDS
106 std::vector<uint8_t> chunk;
107 };
108
109 // Local Data Share (LDS) State per Wavefront (contents of the LDS region
110 // allocated to the WorkGroup of this Wavefront)
111 class LdsState: public MemObject
112 {
113 protected:
114
115 /**
116 * an event to allow event-driven execution
117 */
118 class TickEvent: public Event
119 {
120 protected:
121
122 LdsState *ldsState = nullptr;
123
124 Tick nextTick = 0;
125
126 public:
127
128 TickEvent(LdsState *_ldsState) :
129 ldsState(_ldsState)
130 {
131 }
132
133 virtual void
134 process();
135
136 void
137 schedule(Tick when)
138 {
139 mainEventQueue[0]->schedule(this, when);
140 }
141
142 void
143 deschedule()
144 {
145 mainEventQueue[0]->deschedule(this);
146 }
147 };
148
149 /**
150 * CuSidePort is the LDS Port closer to the CU side
151 */
152 class CuSidePort: public SlavePort
153 {
154 public:
155 CuSidePort(const std::string &_name, LdsState *_ownerLds) :
156 SlavePort(_name, _ownerLds), ownerLds(_ownerLds)
157 {
158 }
159
160 protected:
161 LdsState *ownerLds;
162
163 virtual bool
164 recvTimingReq(PacketPtr pkt);
165
166 virtual Tick
167 recvAtomic(PacketPtr pkt)
168 {
169 return 0;
170 }
171
172 virtual void
173 recvFunctional(PacketPtr pkt);
174
175 virtual void
176 recvRangeChange()
177 {
178 }
179
180 virtual void
181 recvRetry();
182
183 virtual void
184 recvRespRetry();
185
186 virtual AddrRangeList
187 getAddrRanges() const
188 {
189 AddrRangeList ranges;
190 ranges.push_back(ownerLds->getAddrRange());
191 return ranges;
192 }
193
194 template<typename T>
195 void
196 loadData(PacketPtr packet);
197
198 template<typename T>
199 void
200 storeData(PacketPtr packet);
201
202 template<typename T>
203 void
204 atomicOperation(PacketPtr packet);
205 };
206
207 protected:
208
209 // the lds reference counter
210 // The key is the workgroup ID and dispatch ID
211 // The value is the number of wavefronts that reference this LDS, as
212 // wavefronts are launched, the counter goes up for that workgroup and when
213 // they return it decreases, once it reaches 0 then this chunk of the LDS is
214 // returned to the available pool. However,it is deallocated on the 1->0
215 // transition, not whenever the counter is 0 as it always starts with 0 when
216 // the workgroup asks for space
217 std::unordered_map<uint32_t,
218 std::unordered_map<uint32_t, int32_t>> refCounter;
219
220 // the map that allows workgroups to access their own chunk of the LDS
221 std::unordered_map<uint32_t,
222 std::unordered_map<uint32_t, LdsChunk>> chunkMap;
223
224 // an event to allow the LDS to wake up at a specified time
225 TickEvent tickEvent;
226
227 // the queue of packets that are going back to the CU after a
228 // read/write/atomic op
229 // TODO need to make this have a maximum size to create flow control
230 std::queue<std::pair<Tick, PacketPtr>> returnQueue;
231
232 // whether or not there are pending responses
233 bool retryResp = false;
234
235 bool
236 process();
237
238 GPUDynInstPtr
239 getDynInstr(PacketPtr packet);
240
241 bool
242 processPacket(PacketPtr packet);
243
244 unsigned
245 countBankConflicts(PacketPtr packet, unsigned *bankAccesses);
246
247 unsigned
248 countBankConflicts(GPUDynInstPtr gpuDynInst,
249 unsigned *numBankAccesses);
250
251 public:
252 typedef LdsStateParams Params;
253
254 LdsState(const Params *params);
255
256 // prevent copy construction
257 LdsState(const LdsState&) = delete;
258
259 ~LdsState()
260 {
261 parent = nullptr;
262 }
263
264 const Params *
265 params() const
266 {
267 return dynamic_cast<const Params *>(_params);
268 }
269
270 bool
271 isRetryResp() const
272 {
273 return retryResp;
274 }
275
276 void
277 setRetryResp(const bool value)
278 {
279 retryResp = value;
280 }
281
282 // prevent assignment
283 LdsState &
284 operator=(const LdsState &) = delete;
285
286 /**
287 * use the dynamic wave id to create or just increase the reference count
288 */
289 int
290 increaseRefCounter(const uint32_t dispatchId, const uint32_t wgId)
291 {
292 int refCount = getRefCounter(dispatchId, wgId);
293 fatal_if(refCount < 0,
294 "reference count should not be below zero");
295 return ++refCounter[dispatchId][wgId];
296 }
297
298 /**
299 * decrease the reference count after making sure it is in the list
300 * give back this chunk if the ref counter has reached 0
301 */
302 int
303 decreaseRefCounter(const uint32_t dispatchId, const uint32_t wgId)
304 {
305 int refCount = getRefCounter(dispatchId, wgId);
306
307 fatal_if(refCount <= 0,
308 "reference count should not be below zero or at zero to"
309 "decrement");
310
311 refCounter[dispatchId][wgId]--;
312
313 if (refCounter[dispatchId][wgId] == 0) {
314 releaseSpace(dispatchId, wgId);
315 return 0;
316 } else {
317 return refCounter[dispatchId][wgId];
318 }
319 }
320
321 /**
322 * return the current reference count for this workgroup id
323 */
324 int
325 getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const
326 {
327 auto dispatchIter = chunkMap.find(dispatchId);
328 fatal_if(dispatchIter == chunkMap.end(),
329 "could not locate this dispatch id [%d]", dispatchId);
330
331 auto workgroup = dispatchIter->second.find(wgId);
332 fatal_if(workgroup == dispatchIter->second.end(),
333 "could not find this workgroup id within this dispatch id"
334 " did[%d] wgid[%d]", dispatchId, wgId);
335
336 auto refCountIter = refCounter.find(dispatchId);
337 if (refCountIter == refCounter.end()) {
338 fatal("could not locate this dispatch id [%d]", dispatchId);
339 } else {
340 auto workgroup = refCountIter->second.find(wgId);
341 if (workgroup == refCountIter->second.end()) {
342 fatal("could not find this workgroup id within this dispatch id"
343 " did[%d] wgid[%d]", dispatchId, wgId);
344 } else {
345 return refCounter.at(dispatchId).at(wgId);
346 }
347 }
348
349 fatal("should not reach this point");
350 return 0;
351 }
352
353 /**
354 * assign a parent and request this amount of space be set aside
355 * for this wgid
356 */
357 LdsChunk *
358 reserveSpace(const uint32_t dispatchId, const uint32_t wgId,
359 const uint32_t size)
360 {
361 if (chunkMap.find(dispatchId) != chunkMap.end()) {
362 fatal_if(
363 chunkMap[dispatchId].find(wgId) != chunkMap[dispatchId].end(),
364 "duplicate workgroup ID asking for space in the LDS "
365 "did[%d] wgid[%d]", dispatchId, wgId);
366 }
367
368 fatal_if(bytesAllocated + size > maximumSize,
369 "request would ask for more space than is available");
370
371 bytesAllocated += size;
372
373 chunkMap[dispatchId].emplace(wgId, LdsChunk(size));
374 // make an entry for this workgroup
375 refCounter[dispatchId][wgId] = 0;
376
377 return &chunkMap[dispatchId][wgId];
378 }
379
380 bool
381 returnQueuePush(std::pair<Tick, PacketPtr> thePair);
382
383 Tick
384 earliestReturnTime() const
385 {
386 // TODO set to max(lastCommand+1, curTick())
387 return returnQueue.empty() ? curTick() : returnQueue.back().first;
388 }
389
390 void
391 setParent(ComputeUnit *x_parent);
392
393 // accessors
394 ComputeUnit *
395 getParent() const
396 {
397 return parent;
398 }
399
400 std::string
401 getName()
402 {
403 return _name;
404 }
405
406 int
407 getBanks() const
408 {
409 return banks;
410 }
411
412 ComputeUnit *
413 getComputeUnit() const
414 {
415 return parent;
416 }
417
418 int
419 getBankConflictPenalty() const
420 {
421 return bankConflictPenalty;
422 }
423
424 /**
425 * get the allocated size for this workgroup
426 */
427 std::size_t
428 ldsSize(const uint32_t x_wgId)
429 {
430 return chunkMap[x_wgId].size();
431 }
432
433 AddrRange
434 getAddrRange() const
435 {
436 return range;
437 }
438
439 virtual BaseSlavePort &
440 getSlavePort(const std::string& if_name, PortID idx)
441 {
442 if (if_name == "cuPort") {
443 // TODO need to set name dynamically at this point?
444 return cuPort;
445 } else {
446 fatal("cannot resolve the port name " + if_name);
447 }
448 }
449
450 /**
451 * can this much space be reserved for a workgroup?
452 */
453 bool
454 canReserve(uint32_t x_size) const
455 {
456 return bytesAllocated + x_size <= maximumSize;
457 }
458
459 private:
460 /**
461 * give back the space
462 */
463 bool
464 releaseSpace(const uint32_t x_dispatchId, const uint32_t x_wgId)
465 {
466 auto dispatchIter = chunkMap.find(x_dispatchId);
467
468 if (dispatchIter == chunkMap.end()) {
469 fatal("dispatch id not found [%d]", x_dispatchId);
470 } else {
471 auto workgroupIter = dispatchIter->second.find(x_wgId);
472 if (workgroupIter == dispatchIter->second.end()) {
473 fatal("workgroup id [%d] not found in dispatch id [%d]",
474 x_wgId, x_dispatchId);
475 }
476 }
477
478 fatal_if(bytesAllocated < chunkMap[x_dispatchId][x_wgId].size(),
479 "releasing more space than was allocated");
480
481 bytesAllocated -= chunkMap[x_dispatchId][x_wgId].size();
482 chunkMap[x_dispatchId].erase(chunkMap[x_dispatchId].find(x_wgId));
483 return true;
484 }
485
486 // the port that connects this LDS to its owner CU
487 CuSidePort cuPort;
488
489 ComputeUnit* parent = nullptr;
490
491 std::string _name;
492
493 // the number of bytes currently reserved by all workgroups
494 int bytesAllocated = 0;
495
496 // the size of the LDS, the most bytes available
497 int maximumSize;
498
499 // Address range of this memory
500 AddrRange range;
501
502 // the penalty, in cycles, for each LDS bank conflict
503 int bankConflictPenalty = 0;
504
505 // the number of banks in the LDS underlying data store
506 int banks = 0;
507 };
508
509 #endif // __LDS_STATE_HH__