gpu-compute: update port terminology
[gem5.git] / src / gpu-compute / lds_state.hh
1 /*
2 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * For use for simulation and test purposes only
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the copyright holder nor the names of its
18 * contributors may be used to endorse or promote products derived from this
19 * software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 */
33
34 #ifndef __LDS_STATE_HH__
35 #define __LDS_STATE_HH__
36
#include <array>
#include <cstdint>
#include <cstring>
#include <queue>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#include "gpu-compute/misc.hh"
#include "mem/port.hh"
#include "params/LdsState.hh"
#include "sim/clocked_object.hh"
48
49 class ComputeUnit;
50
51 /**
52 * this represents a slice of the overall LDS, intended to be associated with
53 * an individual workgroup
54 */
55 class LdsChunk
56 {
57 public:
58 LdsChunk(const uint32_t x_size):
59 chunk(x_size)
60 {
61 }
62
63 LdsChunk() {}
64
65 /**
66 * a read operation
67 */
68 template<class T>
69 T
70 read(const uint32_t index)
71 {
72 /**
73 * For reads that are outside the bounds of the LDS
74 * chunk allocated to this WG we return 0.
75 */
76 if (index >= chunk.size()) {
77 return (T)0;
78 }
79
80 T *p0 = (T *) (&(chunk.at(index)));
81 return *p0;
82 }
83
84 /**
85 * a write operation
86 */
87 template<class T>
88 void
89 write(const uint32_t index, const T value)
90 {
91 /**
92 * Writes that are outside the bounds of the LDS
93 * chunk allocated to this WG are dropped.
94 */
95 if (index >= chunk.size()) {
96 return;
97 }
98
99 T *p0 = (T *) (&(chunk.at(index)));
100 *p0 = value;
101 }
102
103 /**
104 * get the size of this chunk
105 */
106 std::vector<uint8_t>::size_type
107 size() const
108 {
109 return chunk.size();
110 }
111
112 protected:
113 // the actual data store for this slice of the LDS
114 std::vector<uint8_t> chunk;
115 };
116
117 // Local Data Share (LDS) State per Wavefront (contents of the LDS region
118 // allocated to the WorkGroup of this Wavefront)
119 class LdsState: public ClockedObject
120 {
121 protected:
122
123 /**
124 * an event to allow event-driven execution
125 */
126 class TickEvent: public Event
127 {
128 protected:
129
130 LdsState *ldsState = nullptr;
131
132 Tick nextTick = 0;
133
134 public:
135
136 TickEvent(LdsState *_ldsState) :
137 ldsState(_ldsState)
138 {
139 }
140
141 virtual void
142 process();
143
144 void
145 schedule(Tick when)
146 {
147 mainEventQueue[0]->schedule(this, when);
148 }
149
150 void
151 deschedule()
152 {
153 mainEventQueue[0]->deschedule(this);
154 }
155 };
156
157 /**
158 * CuSidePort is the LDS Port closer to the CU side
159 */
160 class CuSidePort: public ResponsePort
161 {
162 public:
163 CuSidePort(const std::string &_name, LdsState *_ownerLds) :
164 ResponsePort(_name, _ownerLds), ownerLds(_ownerLds)
165 {
166 }
167
168 protected:
169 LdsState *ownerLds;
170
171 virtual bool
172 recvTimingReq(PacketPtr pkt);
173
174 virtual Tick
175 recvAtomic(PacketPtr pkt)
176 {
177 return 0;
178 }
179
180 virtual void
181 recvFunctional(PacketPtr pkt);
182
183 virtual void
184 recvRangeChange()
185 {
186 }
187
188 virtual void
189 recvRetry();
190
191 virtual void
192 recvRespRetry();
193
194 virtual AddrRangeList
195 getAddrRanges() const
196 {
197 AddrRangeList ranges;
198 ranges.push_back(ownerLds->getAddrRange());
199 return ranges;
200 }
201
202 template<typename T>
203 void
204 loadData(PacketPtr packet);
205
206 template<typename T>
207 void
208 storeData(PacketPtr packet);
209
210 template<typename T>
211 void
212 atomicOperation(PacketPtr packet);
213 };
214
215 protected:
216
217 /**
218 * the lds reference counter
219 * The key is the workgroup ID and dispatch ID
220 * The value is the number of wavefronts that reference this LDS, as
221 * wavefronts are launched, the counter goes up for that workgroup and when
222 * they return it decreases, once it reaches 0 then this chunk of the LDS
223 * is returned to the available pool. However,it is deallocated on the 1->0
224 * transition, not whenever the counter is 0 as it always starts with 0
225 * when the workgroup asks for space
226 */
227 std::unordered_map<uint32_t,
228 std::unordered_map<uint32_t, int32_t>> refCounter;
229
230 // the map that allows workgroups to access their own chunk of the LDS
231 std::unordered_map<uint32_t,
232 std::unordered_map<uint32_t, LdsChunk>> chunkMap;
233
234 // an event to allow the LDS to wake up at a specified time
235 TickEvent tickEvent;
236
237 // the queue of packets that are going back to the CU after a
238 // read/write/atomic op
239 // TODO need to make this have a maximum size to create flow control
240 std::queue<std::pair<Tick, PacketPtr>> returnQueue;
241
242 // whether or not there are pending responses
243 bool retryResp = false;
244
245 bool
246 process();
247
248 GPUDynInstPtr
249 getDynInstr(PacketPtr packet);
250
251 bool
252 processPacket(PacketPtr packet);
253
254 unsigned
255 countBankConflicts(PacketPtr packet, unsigned *bankAccesses);
256
257 unsigned
258 countBankConflicts(GPUDynInstPtr gpuDynInst,
259 unsigned *numBankAccesses);
260
261 public:
262 typedef LdsStateParams Params;
263
264 LdsState(const Params *params);
265
266 // prevent copy construction
267 LdsState(const LdsState&) = delete;
268
269 ~LdsState()
270 {
271 parent = nullptr;
272 }
273
274 const Params *
275 params() const
276 {
277 return dynamic_cast<const Params *>(_params);
278 }
279
280 bool
281 isRetryResp() const
282 {
283 return retryResp;
284 }
285
286 void
287 setRetryResp(const bool value)
288 {
289 retryResp = value;
290 }
291
292 // prevent assignment
293 LdsState &
294 operator=(const LdsState &) = delete;
295
296 /**
297 * use the dynamic wave id to create or just increase the reference count
298 */
299 int
300 increaseRefCounter(const uint32_t dispatchId, const uint32_t wgId)
301 {
302 int refCount = getRefCounter(dispatchId, wgId);
303 fatal_if(refCount < 0,
304 "reference count should not be below zero");
305 return ++refCounter[dispatchId][wgId];
306 }
307
308 /**
309 * decrease the reference count after making sure it is in the list
310 * give back this chunk if the ref counter has reached 0
311 */
312 int
313 decreaseRefCounter(const uint32_t dispatchId, const uint32_t wgId)
314 {
315 int refCount = getRefCounter(dispatchId, wgId);
316
317 fatal_if(refCount <= 0,
318 "reference count should not be below zero or at zero to"
319 "decrement");
320
321 refCounter[dispatchId][wgId]--;
322
323 if (refCounter[dispatchId][wgId] == 0) {
324 releaseSpace(dispatchId, wgId);
325 return 0;
326 } else {
327 return refCounter[dispatchId][wgId];
328 }
329 }
330
331 /**
332 * return the current reference count for this workgroup id
333 */
334 int
335 getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const
336 {
337 auto dispatchIter = chunkMap.find(dispatchId);
338 fatal_if(dispatchIter == chunkMap.end(),
339 "could not locate this dispatch id [%d]", dispatchId);
340
341 auto workgroup = dispatchIter->second.find(wgId);
342 fatal_if(workgroup == dispatchIter->second.end(),
343 "could not find this workgroup id within this dispatch id"
344 " did[%d] wgid[%d]", dispatchId, wgId);
345
346 auto refCountIter = refCounter.find(dispatchId);
347 if (refCountIter == refCounter.end()) {
348 fatal("could not locate this dispatch id [%d]", dispatchId);
349 } else {
350 auto workgroup = refCountIter->second.find(wgId);
351 if (workgroup == refCountIter->second.end()) {
352 fatal("could not find this workgroup id within this dispatch id"
353 " did[%d] wgid[%d]", dispatchId, wgId);
354 } else {
355 return refCounter.at(dispatchId).at(wgId);
356 }
357 }
358
359 fatal("should not reach this point");
360 return 0;
361 }
362
363 /**
364 * assign a parent and request this amount of space be set aside
365 * for this wgid
366 */
367 LdsChunk *
368 reserveSpace(const uint32_t dispatchId, const uint32_t wgId,
369 const uint32_t size)
370 {
371 if (chunkMap.find(dispatchId) != chunkMap.end()) {
372 panic_if(
373 chunkMap[dispatchId].find(wgId) != chunkMap[dispatchId].end(),
374 "duplicate workgroup ID asking for space in the LDS "
375 "did[%d] wgid[%d]", dispatchId, wgId);
376 }
377
378 if (bytesAllocated + size > maximumSize) {
379 return nullptr;
380 } else {
381 bytesAllocated += size;
382
383 auto value = chunkMap[dispatchId].emplace(wgId, LdsChunk(size));
384 panic_if(!value.second, "was unable to allocate a new chunkMap");
385
386 // make an entry for this workgroup
387 refCounter[dispatchId][wgId] = 0;
388
389 return &chunkMap[dispatchId][wgId];
390 }
391 }
392
393 /*
394 * return pointer to lds chunk for wgid
395 */
396 LdsChunk *
397 getLdsChunk(const uint32_t dispatchId, const uint32_t wgId)
398 {
399 fatal_if(chunkMap.find(dispatchId) == chunkMap.end(),
400 "fetch for unknown dispatch ID did[%d]", dispatchId);
401
402 fatal_if(chunkMap[dispatchId].find(wgId) == chunkMap[dispatchId].end(),
403 "fetch for unknown workgroup ID wgid[%d] in dispatch ID did[%d]",
404 wgId, dispatchId);
405
406 return &chunkMap[dispatchId][wgId];
407 }
408
409 bool
410 returnQueuePush(std::pair<Tick, PacketPtr> thePair);
411
412 Tick
413 earliestReturnTime() const
414 {
415 // TODO set to max(lastCommand+1, curTick())
416 return returnQueue.empty() ? curTick() : returnQueue.back().first;
417 }
418
419 void
420 setParent(ComputeUnit *x_parent);
421
422 // accessors
423 ComputeUnit *
424 getParent() const
425 {
426 return parent;
427 }
428
429 std::string
430 getName()
431 {
432 return _name;
433 }
434
435 int
436 getBanks() const
437 {
438 return banks;
439 }
440
441 ComputeUnit *
442 getComputeUnit() const
443 {
444 return parent;
445 }
446
447 int
448 getBankConflictPenalty() const
449 {
450 return bankConflictPenalty;
451 }
452
453 /**
454 * get the allocated size for this workgroup
455 */
456 std::size_t
457 ldsSize(const uint32_t x_wgId)
458 {
459 return chunkMap[x_wgId].size();
460 }
461
462 AddrRange
463 getAddrRange() const
464 {
465 return range;
466 }
467
468 Port &
469 getPort(const std::string &if_name, PortID idx)
470 {
471 if (if_name == "cuPort") {
472 // TODO need to set name dynamically at this point?
473 return cuPort;
474 } else {
475 fatal("cannot resolve the port name " + if_name);
476 }
477 }
478
479 /**
480 * can this much space be reserved for a workgroup?
481 */
482 bool
483 canReserve(uint32_t x_size) const
484 {
485 return bytesAllocated + x_size <= maximumSize;
486 }
487
488 private:
489 /**
490 * give back the space
491 */
492 bool
493 releaseSpace(const uint32_t x_dispatchId, const uint32_t x_wgId)
494 {
495 auto dispatchIter = chunkMap.find(x_dispatchId);
496
497 if (dispatchIter == chunkMap.end()) {
498 fatal("dispatch id not found [%d]", x_dispatchId);
499 } else {
500 auto workgroupIter = dispatchIter->second.find(x_wgId);
501 if (workgroupIter == dispatchIter->second.end()) {
502 fatal("workgroup id [%d] not found in dispatch id [%d]",
503 x_wgId, x_dispatchId);
504 }
505 }
506
507 fatal_if(bytesAllocated < chunkMap[x_dispatchId][x_wgId].size(),
508 "releasing more space than was allocated");
509
510 bytesAllocated -= chunkMap[x_dispatchId][x_wgId].size();
511 chunkMap[x_dispatchId].erase(chunkMap[x_dispatchId].find(x_wgId));
512 return true;
513 }
514
515 // the port that connects this LDS to its owner CU
516 CuSidePort cuPort;
517
518 ComputeUnit* parent = nullptr;
519
520 std::string _name;
521
522 // the number of bytes currently reserved by all workgroups
523 int bytesAllocated = 0;
524
525 // the size of the LDS, the most bytes available
526 int maximumSize;
527
528 // Address range of this memory
529 AddrRange range;
530
531 // the penalty, in cycles, for each LDS bank conflict
532 int bankConflictPenalty = 0;
533
534 // the number of banks in the LDS underlying data store
535 int banks = 0;
536 };
537
538 #endif // __LDS_STATE_HH__