gpu-compute: update port terminology
[gem5.git] / src / gpu-compute / lds_state.hh
1 /*
2 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * For use for simulation and test purposes only
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the copyright holder nor the names of its
18 * contributors may be used to endorse or promote products derived from this
19 * software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 */
33
34 #ifndef __LDS_STATE_HH__
35 #define __LDS_STATE_HH__
36
#include <array>
#include <cstdint>
#include <cstring>
#include <queue>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#include "gpu-compute/misc.hh"
#include "mem/port.hh"
#include "params/LdsState.hh"
#include "sim/clocked_object.hh"
48
49 class ComputeUnit;
50
51 /**
52 * this represents a slice of the overall LDS, intended to be associated with
53 * an individual workgroup
54 */
55 class LdsChunk
56 {
57 public:
58 LdsChunk(const uint32_t x_size):
59 chunk(x_size)
60 {
61 }
62
63 LdsChunk() {}
64
65 /**
66 * a read operation
67 */
68 template<class T>
69 T
70 read(const uint32_t index)
71 {
72 /**
73 * For reads that are outside the bounds of the LDS
74 * chunk allocated to this WG we return 0.
75 */
76 if (index >= chunk.size()) {
77 return (T)0;
78 }
79
80 T *p0 = (T *) (&(chunk.at(index)));
81 return *p0;
82 }
83
84 /**
85 * a write operation
86 */
87 template<class T>
88 void
89 write(const uint32_t index, const T value)
90 {
91 /**
92 * Writes that are outside the bounds of the LDS
93 * chunk allocated to this WG are dropped.
94 */
95 if (index >= chunk.size()) {
96 return;
97 }
98
99 T *p0 = (T *) (&(chunk.at(index)));
100 *p0 = value;
101 }
102
103 /**
104 * get the size of this chunk
105 */
106 std::vector<uint8_t>::size_type
107 size() const
108 {
109 return chunk.size();
110 }
111
112 protected:
113 // the actual data store for this slice of the LDS
114 std::vector<uint8_t> chunk;
115 };
116
117 // Local Data Share (LDS) State per Wavefront (contents of the LDS region
118 // allocated to the WorkGroup of this Wavefront)
119 class LdsState: public ClockedObject
120 {
121 protected:
122
123 /**
124 * an event to allow event-driven execution
125 */
126 class TickEvent: public Event
127 {
128 protected:
129
130 LdsState *ldsState = nullptr;
131
132 Tick nextTick = 0;
133
134 public:
135
136 TickEvent(LdsState *_ldsState) :
137 ldsState(_ldsState)
138 {
139 }
140
141 virtual void
142 process();
143
144 void
145 schedule(Tick when)
146 {
147 mainEventQueue[0]->schedule(this, when);
148 }
149
150 void
151 deschedule()
152 {
153 mainEventQueue[0]->deschedule(this);
154 }
155 };
156
157 /**
158 * CuSidePort is the LDS Port closer to the CU side
159 */
160 class CuSidePort: public ResponsePort
161 {
162 public:
163 CuSidePort(const std::string &_name, LdsState *_ownerLds) :
164 ResponsePort(_name, _ownerLds), ownerLds(_ownerLds)
165 {
166 }
167
168 protected:
169 LdsState *ownerLds;
170
171 virtual bool
172 recvTimingReq(PacketPtr pkt);
173
174 virtual Tick
175 recvAtomic(PacketPtr pkt)
176 {
177 return 0;
178 }
179
180 virtual void
181 recvFunctional(PacketPtr pkt);
182
183 virtual void
184 recvRangeChange()
185 {
186 }
187
188 virtual void
189 recvRetry();
190
191 virtual void
192 recvRespRetry();
193
194 virtual AddrRangeList
195 getAddrRanges() const
196 {
197 AddrRangeList ranges;
198 ranges.push_back(ownerLds->getAddrRange());
199 return ranges;
200 }
201
202 template<typename T>
203 void
204 loadData(PacketPtr packet);
205
206 template<typename T>
207 void
208 storeData(PacketPtr packet);
209
210 template<typename T>
211 void
212 atomicOperation(PacketPtr packet);
213 };
214
215 protected:
216
217 /**
218 * the lds reference counter
219 * The key is the workgroup ID and dispatch ID
220 * The value is the number of wavefronts that reference this LDS, as
221 * wavefronts are launched, the counter goes up for that workgroup and when
222 * they return it decreases, once it reaches 0 then this chunk of the LDS
223 * is returned to the available pool. However,it is deallocated on the 1->0
224 * transition, not whenever the counter is 0 as it always starts with 0
225 * when the workgroup asks for space
226 */
227 std::unordered_map<uint32_t,
228 std::unordered_map<uint32_t, int32_t>> refCounter;
229
230 // the map that allows workgroups to access their own chunk of the LDS
231 std::unordered_map<uint32_t,
232 std::unordered_map<uint32_t, LdsChunk>> chunkMap;
233
234 // an event to allow the LDS to wake up at a specified time
235 TickEvent tickEvent;
236
237 // the queue of packets that are going back to the CU after a
238 // read/write/atomic op
239 // TODO need to make this have a maximum size to create flow control
240 std::queue<std::pair<Tick, PacketPtr>> returnQueue;
241
242 // whether or not there are pending responses
243 bool retryResp = false;
244
245 bool
246 process();
247
248 GPUDynInstPtr
249 getDynInstr(PacketPtr packet);
250
251 bool
252 processPacket(PacketPtr packet);
253
254 unsigned
255 countBankConflicts(PacketPtr packet, unsigned *bankAccesses);
256
257 unsigned
258 countBankConflicts(GPUDynInstPtr gpuDynInst,
259 unsigned *numBankAccesses);
260
261 public:
262 typedef LdsStateParams Params;
263
264 LdsState(const Params *params);
265
266 // prevent copy construction
267 LdsState(const LdsState&) = delete;
268
269 ~LdsState()
270 {
271 parent = nullptr;
272 }
273
274 const Params *
275 params() const
276 {
277 return dynamic_cast<const Params *>(_params);
278 }
279
280 bool
281 isRetryResp() const
282 {
283 return retryResp;
284 }
285
286 void
287 setRetryResp(const bool value)
288 {
289 retryResp = value;
290 }
291
292 // prevent assignment
293 LdsState &
294 operator=(const LdsState &) = delete;
295
296 /**
297 * use the dynamic wave id to create or just increase the reference count
298 */
299 int
300 increaseRefCounter(const uint32_t dispatchId, const uint32_t wgId)
301 {
302 int refCount = getRefCounter(dispatchId, wgId);
303 fatal_if(refCount < 0,
304 "reference count should not be below zero");
305 return ++refCounter[dispatchId][wgId];
306 }
307
308 /**
309 * decrease the reference count after making sure it is in the list
310 * give back this chunk if the ref counter has reached 0
311 */
312 int
313 decreaseRefCounter(const uint32_t dispatchId, const uint32_t wgId)
314 {
315 int refCount = getRefCounter(dispatchId, wgId);
316
317 fatal_if(refCount <= 0,
318 "reference count should not be below zero or at zero to"
319 "decrement");
320
321 refCounter[dispatchId][wgId]--;
322
323 if (refCounter[dispatchId][wgId] == 0) {
324 releaseSpace(dispatchId, wgId);
325 return 0;
326 } else {
327 return refCounter[dispatchId][wgId];
328 }
329 }
330
331 /**
332 * return the current reference count for this workgroup id
333 */
334 int
335 getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const
336 {
337 auto dispatchIter = chunkMap.find(dispatchId);
338 fatal_if(dispatchIter == chunkMap.end(),
339 "could not locate this dispatch id [%d]", dispatchId);
340
341 auto workgroup = dispatchIter->second.find(wgId);
342 fatal_if(workgroup == dispatchIter->second.end(),
343 "could not find this workgroup id within this dispatch id"
344 " did[%d] wgid[%d]", dispatchId, wgId);
345
346 auto refCountIter = refCounter.find(dispatchId);
347 if (refCountIter == refCounter.end()) {
348 fatal("could not locate this dispatch id [%d]", dispatchId);
349 } else {
350 auto workgroup = refCountIter->second.find(wgId);
351 if (workgroup == refCountIter->second.end()) {
352 fatal("could not find this workgroup id within this dispatch id"
353 " did[%d] wgid[%d]", dispatchId, wgId);
354 } else {
355 return refCounter.at(dispatchId).at(wgId);
356 }
357 }
358
359 fatal("should not reach this point");
360 return 0;
361 }
362
363 /**
364 * assign a parent and request this amount of space be set aside
365 * for this wgid
366 */
367 LdsChunk *
368 reserveSpace(const uint32_t dispatchId, const uint32_t wgId,
369 const uint32_t size)
370 {
371 if (chunkMap.find(dispatchId) != chunkMap.end()) {
372 panic_if(
373 chunkMap[dispatchId].find(wgId) != chunkMap[dispatchId].end(),
374 "duplicate workgroup ID asking for space in the LDS "
375 "did[%d] wgid[%d]", dispatchId, wgId);
376 }
377
378 if (bytesAllocated + size > maximumSize) {
379 return nullptr;
380 } else {
381 bytesAllocated += size;
382
383 auto value = chunkMap[dispatchId].emplace(wgId, LdsChunk(size));
384 panic_if(!value.second, "was unable to allocate a new chunkMap");
385
386 // make an entry for this workgroup
387 refCounter[dispatchId][wgId] = 0;
388
389 return &chunkMap[dispatchId][wgId];
390 }
391 }
392
393 /*
394 * return pointer to lds chunk for wgid
395 */
396 LdsChunk *
397 getLdsChunk(const uint32_t dispatchId, const uint32_t wgId)
398 {
399 fatal_if(chunkMap.find(dispatchId) == chunkMap.end(),
400 "fetch for unknown dispatch ID did[%d]", dispatchId);
401
402 fatal_if(chunkMap[dispatchId].find(wgId) == chunkMap[dispatchId].end(),
403 "fetch for unknown workgroup ID wgid[%d] in dispatch ID did[%d]",
404 wgId, dispatchId);
405
406 return &chunkMap[dispatchId][wgId];
407 }
408
409 bool
410 returnQueuePush(std::pair<Tick, PacketPtr> thePair);
411
412 Tick
413 earliestReturnTime() const
414 {
415 // TODO set to max(lastCommand+1, curTick())
416 return returnQueue.empty() ? curTick() : returnQueue.back().first;
417 }
418
419 void
420 setParent(ComputeUnit *x_parent);
421
422 // accessors
423 ComputeUnit *
424 getParent() const
425 {
426 return parent;
427 }
428
429 std::string
430 getName()
431 {
432 return _name;
433 }
434
435 int
436 getBanks() const
437 {
438 return banks;
439 }
440
441 ComputeUnit *
442 getComputeUnit() const
443 {
444 return parent;
445 }
446
447 int
448 getBankConflictPenalty() const
449 {
450 return bankConflictPenalty;
451 }
452
453 /**
454 * get the allocated size for this workgroup
455 */
456 std::size_t
457 ldsSize(const uint32_t x_wgId)
458 {
459 return chunkMap[x_wgId].size();
460 }
461
462 AddrRange
463 getAddrRange() const
464 {
465 return range;
466 }
467
468 Port &
469 getPort(const std::string &if_name, PortID idx)
470 {
471 if (if_name == "cuPort") {
472 // TODO need to set name dynamically at this point?
473 return cuPort;
474 } else {
475 fatal("cannot resolve the port name " + if_name);
476 }
477 }
478
479 /**
480 * can this much space be reserved for a workgroup?
481 */
482 bool
483 canReserve(uint32_t x_size) const
484 {
485 return bytesAllocated + x_size <= maximumSize;
486 }
487
488 private:
489 /**
490 * give back the space
491 */
492 bool
493 releaseSpace(const uint32_t x_dispatchId, const uint32_t x_wgId)
494 {
495 auto dispatchIter = chunkMap.find(x_dispatchId);
496
497 if (dispatchIter == chunkMap.end()) {
498 fatal("dispatch id not found [%d]", x_dispatchId);
499 } else {
500 auto workgroupIter = dispatchIter->second.find(x_wgId);
501 if (workgroupIter == dispatchIter->second.end()) {
502 fatal("workgroup id [%d] not found in dispatch id [%d]",
503 x_wgId, x_dispatchId);
504 }
505 }
506
507 fatal_if(bytesAllocated < chunkMap[x_dispatchId][x_wgId].size(),
508 "releasing more space than was allocated");
509
510 bytesAllocated -= chunkMap[x_dispatchId][x_wgId].size();
511 chunkMap[x_dispatchId].erase(chunkMap[x_dispatchId].find(x_wgId));
512 return true;
513 }
514
515 // the port that connects this LDS to its owner CU
516 CuSidePort cuPort;
517
518 ComputeUnit* parent = nullptr;
519
520 std::string _name;
521
522 // the number of bytes currently reserved by all workgroups
523 int bytesAllocated = 0;
524
525 // the size of the LDS, the most bytes available
526 int maximumSize;
527
528 // Address range of this memory
529 AddrRange range;
530
531 // the penalty, in cycles, for each LDS bank conflict
532 int bankConflictPenalty = 0;
533
534 // the number of banks in the LDS underlying data store
535 int banks = 0;
536 };
537
538 #endif // __LDS_STATE_HH__