2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
5 * For use for simulation and test purposes only
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
17 * 3. Neither the name of the copyright holder nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
33 * Author: John Kalamatianos, Anthony Gutierrez
36 #ifndef __COMPUTE_UNIT_HH__
37 #define __COMPUTE_UNIT_HH__
41 #include <unordered_map>
44 #include "base/callback.hh"
45 #include "base/statistics.hh"
46 #include "base/types.hh"
47 #include "enums/PrefetchType.hh"
48 #include "gpu-compute/exec_stage.hh"
49 #include "gpu-compute/fetch_stage.hh"
50 #include "gpu-compute/global_memory_pipeline.hh"
51 #include "gpu-compute/local_memory_pipeline.hh"
52 #include "gpu-compute/qstruct.hh"
53 #include "gpu-compute/schedule_stage.hh"
54 #include "gpu-compute/scoreboard_check_stage.hh"
55 #include "mem/mem_object.hh"
56 #include "mem/port.hh"
// Upper bounds used for per-instruction bookkeeping.
// NOTE(review): meanings inferred from the names — confirm against the users
// of these constants in the .cc files.
// constexpr: these are compile-time constants; avoids a per-TU object that
// plain `static const` would create in a header.
static constexpr int MAX_REGS_FOR_NON_VEC_MEM_INST = 1;
static constexpr int MAX_WIDTH_FOR_MEM_INST = 32;
63 class VectorRegisterFile;
65 struct ComputeUnitParams;
73 // List of execution units
87 TLB_MISS_CACHE_MISS = 0,
93 class ComputeUnit : public MemObject
96 FetchStage fetchStage;
97 ScoreboardCheckStage scoreboardCheckStage;
98 ScheduleStage scheduleStage;
100 GlobalMemPipeline globalMemoryPipe;
101 LocalMemPipeline localMemoryPipe;
103 // Buffers used to communicate between various pipeline stages
105 // List of waves which are ready to be scheduled.
106 // Each execution resource has a ready list. readyList is
107 // used to communicate between scoreboardCheck stage and
109 // TODO: make enum to index readyList
110 std::vector<std::vector<Wavefront*>> readyList;
112 // Stores the status of waves. A READY implies the
113 // wave is ready to be scheduled this cycle and
114 // is already present in the readyList. waveStatusList is
115 // used to communicate between scoreboardCheck stage and
117 // TODO: convert std::pair to a class to increase readability
118 std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>> waveStatusList;
120 // List of waves which will be dispatched to
121 // each execution resource. A FILLED implies
122 // dispatch list is non-empty and
123 // execution unit has something to execute
124 // this cycle. Currently, the dispatch list of
125 // an execution resource can hold only one wave because
126 // an execution resource can execute only one wave in a cycle.
127 // dispatchList is used to communicate between schedule
129 // TODO: convert std::pair to a class to increase readability
130 std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> dispatchList;
132 int rrNextMemID; // used by RR WF exec policy to cycle through WF's
134 typedef ComputeUnitParams Params;
135 std::vector<std::vector<Wavefront*>> wfList;
138 // array of vector register files, one per SIMD
139 std::vector<VectorRegisterFile*> vrf;
140 // Number of vector ALU units (SIMDs) in CU
142 // number of pipe stages for bypassing data to next dependent single
143 // precision vector instruction inside the vector ALU pipeline
144 int spBypassPipeLength;
145 // number of pipe stages for bypassing data to next dependent double
146 // precision vector instruction inside the vector ALU pipeline
147 int dpBypassPipeLength;
148 // number of cycles per issue period
151 // Number of global and local memory execution resources in CU
154 // tracks the last cycle a vector instruction was executed on a SIMD
155 std::vector<uint64_t> lastExecCycle;
157 // true if we allow a separate TLB per lane
159 // if 0, TLB prefetching is off.
161 // if fixed-stride prefetching, this is the stride.
164 std::vector<Addr> lastVaddrCU;
165 std::vector<std::vector<Addr>> lastVaddrSimd;
166 std::vector<std::vector<std::vector<Addr>>> lastVaddrWF;
167 Enums::PrefetchType prefetchType;
168 EXEC_POLICY exec_policy;
173 bool localMemBarrier;
176 * for Counting page accesses
178 * cuExitCallback inherits from Callback. When you register a callback
179 * function as an exit callback, it will get added to an exit callback
180 * queue, such that on simulation exit, all callbacks in the callback
181 * queue will have their process() function called.
187 // vector of Vector ALU (MACC) pipelines
188 std::vector<WaitClass> aluPipe;
189 // minimum issue period per SIMD unit (in cycles)
190 std::vector<WaitClass> wfWait;
192 // Resource control for Vector Register File->Global Memory pipe buses
193 std::vector<WaitClass> vrfToGlobalMemPipeBus;
194 // Resource control for Vector Register File->Local Memory pipe buses
195 std::vector<WaitClass> vrfToLocalMemPipeBus;
198 // Resource control for global memory to VRF data/address bus
199 WaitClass glbMemToVrfBus;
200 // Resource control for local memory to VRF data/address bus
201 WaitClass locMemToVrfBus;
203 uint32_t vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes
204 uint32_t coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes
205 uint32_t numCyclesPerStoreTransfer; // number of cycles per vector store
206 uint32_t numCyclesPerLoadTransfer; // number of cycles per vector load
208 Tick req_tick_latency;
209 Tick resp_tick_latency;
211 // number of vector registers being reserved for each SIMD unit
212 std::vector<int> vectorRegsReserved;
213 // number of vector registers per SIMD unit
214 uint32_t numVecRegsPerSimd;
215 // Support for scheduling VGPR status update events
216 std::vector<std::pair<uint32_t, uint32_t> > regIdxVec;
217 std::vector<uint64_t> timestampVec;
218 std::vector<uint8_t> statusVec;
221 registerEvent(uint32_t simdId,
223 uint32_t operandSize,
226 regIdxVec.push_back(std::make_pair(simdId, regIdx));
227 timestampVec.push_back(when);
228 statusVec.push_back(newStatus);
229 if (operandSize > 4) {
230 regIdxVec.push_back(std::make_pair(simdId,
232 numVecRegsPerSimd)));
233 timestampVec.push_back(when);
234 statusVec.push_back(newStatus);
240 // this hash map will keep track of page divergence
241 // per memory instruction per wavefront. The hash map
242 // is cleared in GPUDynInst::updateStats() in gpu_dyn_inst.cc.
243 std::map<Addr, int> pagesTouched;
245 ComputeUnit(const Params *p);
247 int spBypassLength() { return spBypassPipeLength; };
248 int dpBypassLength() { return dpBypassPipeLength; };
249 int storeBusLength() { return numCyclesPerStoreTransfer; };
250 int loadBusLength() { return numCyclesPerLoadTransfer; };
251 int wfSize() const { return wavefrontSize; };
253 void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs);
255 void initiateFetch(Wavefront *wavefront);
256 void fetch(PacketPtr pkt, Wavefront *wavefront);
257 void fillKernelState(Wavefront *w, NDRange *ndr);
259 void startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
262 void StartWorkgroup(NDRange *ndr);
263 int ReadyWorkgroup(NDRange *ndr);
265 bool isVecAlu(int unitId) { return unitId >= SIMD0 && unitId <= SIMD3; }
266 bool isGlbMem(int unitId) { return unitId == GLBMEM_PIPE; }
267 bool isShrMem(int unitId) { return unitId == LDSMEM_PIPE; }
268 int GlbMemUnitId() { return GLBMEM_PIPE; }
269 int ShrMemUnitId() { return LDSMEM_PIPE; }
270 int nextGlbRdBus() { return (++nextGlbMemBus) % numGlbMemUnits; }
271 int nextLocRdBus() { return (++nextLocMemBus) % numLocMemUnits; }
272 /* This function cycles through all the wavefronts in all the phases to see
273 * if all of the wavefronts which should be associated with one barrier
274 * (denoted with _barrier_id), are all at the same barrier in the program
275 * (denoted by bcnt). When the number at the barrier matches bslots, then
278 int AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots);
279 bool cedeSIMD(int simdId, int wfSlotId);
281 template<typename c0, typename c1> void doSmReturn(GPUDynInstPtr gpuDynInst);
283 void sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt);
284 void sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt);
285 void injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
286 bool kernelLaunch=true,
287 RequestPtr req=nullptr);
288 void handleMemPacket(PacketPtr pkt, int memport_index);
289 bool processTimingPacket(PacketPtr pkt);
290 void processFetchReturn(PacketPtr pkt);
291 void updatePageDivergenceDist(Addr addr);
    // Master ID used to tag memory-system requests originating from this CU.
    MasterID masterId() { return _masterId; }
296 bool isSimdDone(uint32_t) const;
304 Stats::Scalar vALUInsts;
305 Stats::Formula vALUInstsPerWF;
306 Stats::Scalar sALUInsts;
307 Stats::Formula sALUInstsPerWF;
308 Stats::Scalar instCyclesVALU;
309 Stats::Scalar instCyclesSALU;
310 Stats::Scalar threadCyclesVALU;
311 Stats::Formula vALUUtilization;
312 Stats::Scalar ldsNoFlatInsts;
313 Stats::Formula ldsNoFlatInstsPerWF;
314 Stats::Scalar flatVMemInsts;
315 Stats::Formula flatVMemInstsPerWF;
316 Stats::Scalar flatLDSInsts;
317 Stats::Formula flatLDSInstsPerWF;
318 Stats::Scalar vectorMemWrites;
319 Stats::Formula vectorMemWritesPerWF;
320 Stats::Scalar vectorMemReads;
321 Stats::Formula vectorMemReadsPerWF;
322 Stats::Scalar scalarMemWrites;
323 Stats::Formula scalarMemWritesPerWF;
324 Stats::Scalar scalarMemReads;
325 Stats::Formula scalarMemReadsPerWF;
327 void updateInstStats(GPUDynInstPtr gpuDynInst);
329 // the following stats compute the avg. TLB accesslatency per
330 // uncoalesced request (only for data)
331 Stats::Scalar tlbRequests;
332 Stats::Scalar tlbCycles;
333 Stats::Formula tlbLatency;
334 // hitsPerTLBLevel[x] are the hits in Level x TLB. x = 0 is the page table.
335 Stats::Vector hitsPerTLBLevel;
337 Stats::Scalar ldsBankAccesses;
338 Stats::Distribution ldsBankConflictDist;
340 // over all memory instructions executed over all wavefronts
341 // how many touched 0-4 pages, 4-8, ..., 60-64 pages
342 Stats::Distribution pageDivergenceDist;
343 Stats::Scalar dynamicGMemInstrCnt;
344 Stats::Scalar dynamicLMemInstrCnt;
346 Stats::Scalar wgBlockedDueLdsAllocation;
347 // Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are active
348 // when the instruction is committed, this number is still incremented by 1
349 Stats::Scalar numInstrExecuted;
350 // Number of cycles among successive instruction executions across all
351 // wavefronts of the same CU
352 Stats::Distribution execRateDist;
353 // number of individual vector operations executed
354 Stats::Scalar numVecOpsExecuted;
355 // Total cycles that something is running on the GPU
356 Stats::Scalar totalCycles;
357 Stats::Formula vpc; // vector ops per cycle
358 Stats::Formula ipc; // vector instructions per cycle
359 Stats::Distribution controlFlowDivergenceDist;
360 Stats::Distribution activeLanesPerGMemInstrDist;
361 Stats::Distribution activeLanesPerLMemInstrDist;
362 // number of vector ALU instructions received
363 Stats::Formula numALUInstsExecuted;
364 // number of times a WG can not start due to lack of free VGPRs in SIMDs
365 Stats::Scalar numTimesWgBlockedDueVgprAlloc;
366 Stats::Scalar numCASOps;
367 Stats::Scalar numFailedCASOps;
368 Stats::Scalar completedWfs;
369 // flag per vector SIMD unit that is set when there is at least one
370 // WV that has a vector ALU instruction as the oldest in its
371 // Instruction Buffer: Defined in the Scoreboard stage, consumed
372 // by the Execute stage.
373 std::vector<bool> vectorAluInstAvail;
374 // number of available (oldest) LDS instructions that could have
375 // been issued to the LDS at a specific issue slot
377 // number of available Global memory instructions that could have
378 // been issued to TCP at a specific issue slot
391 getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const;
    // Cache line size in bytes, as configured for this CU.
    int cacheLineSize() const { return _cacheLineSize; }
396 sendToLds(GPUDynInstPtr gpuDynInst) __attribute__((warn_unused_result));
398 typedef std::unordered_map<Addr, std::pair<int, int>> pageDataStruct;
399 pageDataStruct pageAccesses;
401 class CUExitCallback : public Callback
404 ComputeUnit *computeUnit;
        // Virtual so deletion through a Callback* base pointer is safe.
        virtual ~CUExitCallback() { }
409 CUExitCallback(ComputeUnit *_cu)
418 CUExitCallback *cuExitCallback;
420 /** Data access Port **/
421 class DataPort : public MasterPort
424 DataPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
425 : MasterPort(_name, _cu), computeUnit(_cu),
430 struct SenderState : public Packet::SenderState
432 GPUDynInstPtr _gpuDynInst;
434 Packet::SenderState *saved;
            // Attach the issuing dynamic instruction, the lane/port index it
            // was sent from, and any previously attached sender state so they
            // travel with the packet through the memory system.
            SenderState(GPUDynInstPtr gpuDynInst, PortID _port_index,
                        Packet::SenderState *sender_state=nullptr)
                : _gpuDynInst(gpuDynInst),
                  port_index(_port_index),
                  saved(sender_state) { }
443 void processMemReqEvent(PacketPtr pkt);
444 EventFunctionWrapper *createMemReqEvent(PacketPtr pkt);
446 void processMemRespEvent(PacketPtr pkt);
447 EventFunctionWrapper *createMemRespEvent(PacketPtr pkt);
449 std::deque<std::pair<PacketPtr, GPUDynInstPtr>> retries;
452 ComputeUnit *computeUnit;
        // MasterPort hooks invoked by the connected slave port.
        virtual bool recvTimingResp(PacketPtr pkt);
        // Atomic and functional accesses are not modeled on this port.
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        // Invoked when a previously rejected timing request may be retried.
        virtual void recvReqRetry();
462 getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
470 // Instruction cache access port
471 class SQCPort : public MasterPort
474 SQCPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
475 : MasterPort(_name, _cu), computeUnit(_cu),
480 struct SenderState : public Packet::SenderState
482 Wavefront *wavefront;
483 Packet::SenderState *saved;
            // Attach the requesting wavefront (and any prior sender state)
            // to an instruction-fetch packet.
            SenderState(Wavefront *_wavefront, Packet::SenderState
                    *sender_state=nullptr)
                : wavefront(_wavefront), saved(sender_state) { }
490 std::deque<std::pair<PacketPtr, Wavefront*>> retries;
493 ComputeUnit *computeUnit;
        // MasterPort hooks for the instruction-cache (SQC) connection.
        virtual bool recvTimingResp(PacketPtr pkt);
        // Atomic and functional accesses are not modeled on this port.
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        // Invoked when a previously rejected timing request may be retried.
        virtual void recvReqRetry();
503 getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
510 /** Data TLB port **/
511 class DTLBPort : public MasterPort
514 DTLBPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
515 : MasterPort(_name, _cu), computeUnit(_cu),
516 index(_index), stalled(false)
519 bool isStalled() { return stalled; }
520 void stallPort() { stalled = true; }
521 void unstallPort() { stalled = false; }
524 * here we queue all the translation requests that were
525 * not successfully sent.
527 std::deque<PacketPtr> retries;
529 /** SenderState is information carried along with the packet
530 * throughout the TLB hierarchy
532 struct SenderState: public Packet::SenderState
534 // the memInst that this is associated with
535 GPUDynInstPtr _gpuDynInst;
537 // the lane in the memInst this is associated with, so we send
538 // the memory request down the right port
541 // constructor used for packets involved in timing accesses
            // Bind the memory instruction and the originating lane's port
            // index so the response can be routed back down the right port.
            SenderState(GPUDynInstPtr gpuDynInst, PortID port_index)
                : _gpuDynInst(gpuDynInst), portIndex(port_index) { }
548 ComputeUnit *computeUnit;
        // MasterPort hooks for the data-TLB connection.
        virtual bool recvTimingResp(PacketPtr pkt);
        // Atomic and functional accesses are not modeled on this port.
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        // Invoked when a previously rejected timing request may be retried.
        virtual void recvReqRetry();
559 class ITLBPort : public MasterPort
        // Construct the instruction-TLB port; the port starts unstalled.
        ITLBPort(const std::string &_name, ComputeUnit *_cu)
            : MasterPort(_name, _cu), computeUnit(_cu), stalled(false) { }
566 bool isStalled() { return stalled; }
567 void stallPort() { stalled = true; }
568 void unstallPort() { stalled = false; }
571 * here we queue all the translation requests that were
572 * not successfully sent.
574 std::deque<PacketPtr> retries;
576 /** SenderState is information carried along with the packet
577 * throughout the TLB hierarchy
579 struct SenderState: public Packet::SenderState
581 // The wavefront associated with this request
582 Wavefront *wavefront;
            // Record the wavefront associated with this translation request.
            SenderState(Wavefront *_wavefront) : wavefront(_wavefront) { }
588 ComputeUnit *computeUnit;
        // MasterPort hooks for the instruction-TLB connection.
        virtual bool recvTimingResp(PacketPtr pkt);
        // Atomic and functional accesses are not modeled on this port.
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        // Invoked when a previously rejected timing request may be retried.
        virtual void recvReqRetry();
599 * the port intended to communicate between the CU and its LDS
601 class LDSPort : public MasterPort
604 LDSPort(const std::string &_name, ComputeUnit *_cu, PortID _id)
605 : MasterPort(_name, _cu, _id), computeUnit(_cu)
        // Flow control for the CU->LDS connection: while stalled, packets are
        // queued in 'retries' rather than sent.
        bool isStalled() const { return stalled; }
        void stallPort() { stalled = true; }
        void unstallPort() { stalled = false; }
614 * here we queue all the requests that were
615 * not successfully sent.
617 std::queue<PacketPtr> retries;
620 * SenderState is information carried along with the packet, esp. the
623 class SenderState: public Packet::SenderState
626 // The actual read/write/atomic request that goes with this command
627 GPUDynInstPtr _gpuDynInst = nullptr;
630 SenderState(GPUDynInstPtr gpuDynInst):
631 _gpuDynInst(gpuDynInst)
643 sendTimingReq(PacketPtr pkt);
647 bool stalled = false; ///< whether or not it is stalled
649 ComputeUnit *computeUnit;
652 recvTimingResp(PacketPtr pkt);
655 recvAtomic(PacketPtr pkt) { return 0; }
658 recvFunctional(PacketPtr pkt)
671 /** The port to access the Local Data Store
672 * Can be connected to a LDS object
674 LDSPort *ldsPort = nullptr;
682 /** The memory port for SIMD data accesses.
683 * Can be connected to PhysMem for Ruby for timing simulations
685 std::vector<DataPort*> memPort;
686 // port to the TLB hierarchy (i.e., the L1 TLB)
687 std::vector<DTLBPort*> tlbPort;
688 // port to the SQC (i.e. the I-cache)
690 // port to the SQC TLB (there's a separate TLB for each I-cache)
691 ITLBPort *sqcTLBPort;
693 virtual BaseMasterPort&
694 getMasterPort(const std::string &if_name, PortID idx)
696 if (if_name == "memory_port") {
697 memPort[idx] = new DataPort(csprintf("%s-port%d", name(), idx),
699 return *memPort[idx];
700 } else if (if_name == "translation_port") {
701 tlbPort[idx] = new DTLBPort(csprintf("%s-port%d", name(), idx),
703 return *tlbPort[idx];
704 } else if (if_name == "sqc_port") {
705 sqcPort = new SQCPort(csprintf("%s-port%d", name(), idx),
708 } else if (if_name == "sqc_tlb_port") {
709 sqcTLBPort = new ITLBPort(csprintf("%s-port", name()), this);
711 } else if (if_name == "ldsPort") {
713 fatal("an LDS port was already allocated");
715 ldsPort = new LDSPort(csprintf("%s-port", name()), this, idx);
718 panic("incorrect port name");
        // Identify a wavefront by its SIMD unit id and wavefront slot id.
        waveIdentifier(int _simdId, int _wfSlotId)
            : simdId(_simdId), wfSlotId(_wfSlotId) { }
737 std::list<waveIdentifier> waveIDQueue;
739 std::map<unsigned, waveQueue> xactCasLoadMap;
    // Post-increment: return the current global sequence number, then bump it.
    uint64_t getAndIncSeqNum() { return globalSeqNum++; }
744 const int _cacheLineSize;
745 uint64_t globalSeqNum;
747 GPUStaticInst *kernelLaunchInst;
750 #endif // __COMPUTE_UNIT_HH__