src/gpu-compute/compute_unit.hh

   1 /*
   2  * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
   3  * All rights reserved.
   4  *
   5  * For use for simulation and test purposes only
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions are met:
   9  *
  10  * 1. Redistributions of source code must retain the above copyright notice,
  11  * this list of conditions and the following disclaimer.
  12  *
  13  * 2. Redistributions in binary form must reproduce the above copyright notice,
  14  * this list of conditions and the following disclaimer in the documentation
  15  * and/or other materials provided with the distribution.
  16  *
  17  * 3. Neither the name of the copyright holder nor the names of its
  18  * contributors may be used to endorse or promote products derived from this
  19  * software without specific prior written permission.
  20  *
  21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  22  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  24  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  25  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  26  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  27  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  28  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  29  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  30  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  31  * POSSIBILITY OF SUCH DAMAGE.
  32  *
  33  * Authors: John Kalamatianos,
  34  *          Anthony Gutierrez
  35  */
  36
  37 #ifndef __COMPUTE_UNIT_HH__
  38 #define __COMPUTE_UNIT_HH__
  39
#include <cstddef>
#include <deque>
#include <list>
#include <map>
#include <queue>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
  44
  45 #include "base/callback.hh"
  46 #include "base/statistics.hh"
  47 #include "base/types.hh"
  48 #include "enums/PrefetchType.hh"
  49 #include "gpu-compute/exec_stage.hh"
  50 #include "gpu-compute/fetch_stage.hh"
  51 #include "gpu-compute/global_memory_pipeline.hh"
  52 #include "gpu-compute/local_memory_pipeline.hh"
  53 #include "gpu-compute/qstruct.hh"
  54 #include "gpu-compute/schedule_stage.hh"
  55 #include "gpu-compute/scoreboard_check_stage.hh"
  56 #include "mem/mem_object.hh"
  57 #include "mem/port.hh"
  58
  59 static const int MAX_REGS_FOR_NON_VEC_MEM_INST = 1;
  60 static const int MAX_WIDTH_FOR_MEM_INST = 32;
  61
  62 class NDRange;
  63 class Shader;
  64 class VectorRegisterFile;
  65
  66 struct ComputeUnitParams;
  67
  68 enum EXEC_POLICY
  69 {
  70     OLDEST = 0,
  71     RR
  72 };
  73
  74 // List of execution units
  75 enum EXEC_UNIT
  76 {
  77     SIMD0 = 0,
  78     SIMD1,
  79     SIMD2,
  80     SIMD3,
  81     GLBMEM_PIPE,
  82     LDSMEM_PIPE,
  83     NUM_UNITS
  84 };
  85
  86 enum TLB_CACHE
  87 {
  88     TLB_MISS_CACHE_MISS = 0,
  89     TLB_MISS_CACHE_HIT,
  90     TLB_HIT_CACHE_MISS,
  91     TLB_HIT_CACHE_HIT
  92 };
  93
  94 class ComputeUnit : public MemObject
  95 {
  96   public:
  97     FetchStage fetchStage;
  98     ScoreboardCheckStage scoreboardCheckStage;
  99     ScheduleStage scheduleStage;
 100     ExecStage execStage;
 101     GlobalMemPipeline globalMemoryPipe;
 102     LocalMemPipeline localMemoryPipe;
 103
 104     // Buffers used to communicate between various pipeline stages
 105
 106     // List of waves which are ready to be scheduled.
 107     // Each execution resource has a ready list. readyList is
 108     // used to communicate between scoreboardCheck stage and
 109     // schedule stage
 110     // TODO: make enum to index readyList
 111     std::vector<std::vector<Wavefront*>> readyList;
 112
 113     // Stores the status of waves. A READY implies the
 114     // wave is ready to be scheduled this cycle and
 115     // is already present in the readyList. waveStatusList is
 116     // used to communicate between scoreboardCheck stage and
 117     // schedule stage
 118     // TODO: convert std::pair to a class to increase readability
 119     std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>> waveStatusList;
 120
 121     // List of waves which will be dispatched to
 122     // each execution resource. A FILLED implies
 123     // dispatch list is non-empty and
 124     // execution unit has something to execute
 125     // this cycle. Currently, the dispatch list of
 126     // an execution resource can hold only one wave because
 127     // an execution resource can execute only one wave in a cycle.
 128     // dispatchList is used to communicate between schedule
 129     // and exec stage
 130     // TODO: convert std::pair to a class to increase readability
 131     std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> dispatchList;
 132
 133     int rrNextMemID; // used by RR WF exec policy to cycle through WF's
 134     int rrNextALUWp;
 135     typedef ComputeUnitParams Params;
 136     std::vector<std::vector<Wavefront*>> wfList;
 137     int cu_id;
 138
 139     // array of vector register files, one per SIMD
 140     std::vector<VectorRegisterFile*> vrf;
 141     // Number of vector ALU units (SIMDs) in CU
 142     int numSIMDs;
 143     // number of pipe stages for bypassing data to next dependent single
 144     // precision vector instruction inside the vector ALU pipeline
 145     int spBypassPipeLength;
 146     // number of pipe stages for bypassing data to next dependent double
 147     // precision vector instruction inside the vector ALU pipeline
 148     int dpBypassPipeLength;
 149     // number of cycles per issue period
 150     int issuePeriod;
 151
 152     // Number of global and local memory execution resources in CU
 153     int numGlbMemUnits;
 154     int numLocMemUnits;
 155     // tracks the last cycle a vector instruction was executed on a SIMD
 156     std::vector<uint64_t> lastExecCycle;
 157
 158     // true if we allow a separate TLB per lane
 159     bool perLaneTLB;
 160     // if 0, TLB prefetching is off.
 161     int prefetchDepth;
 162     // if fixed-stride prefetching, this is the stride.
 163     int prefetchStride;
 164
 165     std::vector<Addr> lastVaddrCU;
 166     std::vector<std::vector<Addr>> lastVaddrSimd;
 167     std::vector<std::vector<std::vector<Addr>>> lastVaddrWF;
 168     Enums::PrefetchType prefetchType;
 169     EXEC_POLICY exec_policy;
 170
 171     bool xact_cas_mode;
 172     bool debugSegFault;
 173     bool functionalTLB;
 174     bool localMemBarrier;
 175
    /*
     * for counting page accesses
     *
     * CUExitCallback inherits from Callback. When you register a callback
     * function as an exit callback, it will get added to an exit callback
     * queue, such that on simulation exit, all callbacks in the callback
     * queue will have their process() function called.
     */
 184     bool countPages;
 185
 186     Shader *shader;
 187     uint32_t barrier_id;
 188     // vector of Vector ALU (MACC) pipelines
 189     std::vector<WaitClass> aluPipe;
 190     // minimum issue period per SIMD unit (in cycles)
 191     std::vector<WaitClass> wfWait;
 192
 193     // Resource control for Vector Register File->Global Memory pipe buses
 194     std::vector<WaitClass> vrfToGlobalMemPipeBus;
 195     // Resource control for Vector Register File->Local Memory pipe buses
 196     std::vector<WaitClass> vrfToLocalMemPipeBus;
 197     int nextGlbMemBus;
 198     int nextLocMemBus;
 199     // Resource control for global memory to VRF data/address bus
 200     WaitClass glbMemToVrfBus;
 201     // Resource control for local memory to VRF data/address bus
 202     WaitClass locMemToVrfBus;
 203
 204     uint32_t vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes
 205     uint32_t coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes
 206     uint32_t numCyclesPerStoreTransfer;  // number of cycles per vector store
 207     uint32_t numCyclesPerLoadTransfer;  // number of cycles per vector load
 208
 209     Tick req_tick_latency;
 210     Tick resp_tick_latency;
 211
 212     // number of vector registers being reserved for each SIMD unit
 213     std::vector<int> vectorRegsReserved;
 214     // number of vector registers per SIMD unit
 215     uint32_t numVecRegsPerSimd;
 216     // Support for scheduling VGPR status update events
 217     std::vector<std::pair<uint32_t, uint32_t> > regIdxVec;
 218     std::vector<uint64_t> timestampVec;
 219     std::vector<uint8_t>  statusVec;
 220
 221     void
 222     registerEvent(uint32_t simdId,
 223                   uint32_t regIdx,
 224                   uint32_t operandSize,
 225                   uint64_t when,
 226                   uint8_t newStatus) {
 227         regIdxVec.push_back(std::make_pair(simdId, regIdx));
 228         timestampVec.push_back(when);
 229         statusVec.push_back(newStatus);
 230         if (operandSize > 4) {
 231             regIdxVec.push_back(std::make_pair(simdId,
 232                                                ((regIdx + 1) %
 233                                                 numVecRegsPerSimd)));
 234             timestampVec.push_back(when);
 235             statusVec.push_back(newStatus);
 236         }
 237     }
 238
 239     void updateEvents();
 240
    // this map keeps track of page divergence
    // per memory instruction per wavefront. The map
    // is cleared in GPUDynInst::updateStats() in gpu_dyn_inst.cc.
 244     std::map<Addr, int> pagesTouched;
 245
 246     ComputeUnit(const Params *p);
 247     ~ComputeUnit();
 248     int spBypassLength() { return spBypassPipeLength; };
 249     int dpBypassLength() { return dpBypassPipeLength; };
 250     int storeBusLength() { return numCyclesPerStoreTransfer; };
 251     int loadBusLength() { return numCyclesPerLoadTransfer; };
 252     int wfSize() const { return wavefrontSize; };
 253
 254     void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs);
 255     void exec();
 256     void initiateFetch(Wavefront *wavefront);
 257     void fetch(PacketPtr pkt, Wavefront *wavefront);
 258     void fillKernelState(Wavefront *w, NDRange *ndr);
 259
 260     void startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
 261                         NDRange *ndr);
 262
 263     void StartWorkgroup(NDRange *ndr);
 264     int ReadyWorkgroup(NDRange *ndr);
 265
 266     bool isVecAlu(int unitId) { return unitId >= SIMD0 && unitId <= SIMD3; }
 267     bool isGlbMem(int unitId) { return unitId == GLBMEM_PIPE; }
 268     bool isShrMem(int unitId) { return unitId == LDSMEM_PIPE; }
 269     int GlbMemUnitId() { return GLBMEM_PIPE; }
 270     int ShrMemUnitId() { return LDSMEM_PIPE; }
 271     int nextGlbRdBus() { return (++nextGlbMemBus) % numGlbMemUnits; }
 272     int nextLocRdBus() { return (++nextLocMemBus) % numLocMemUnits; }
    /* This function cycles through all the wavefronts in all the phases to
     * check whether every wavefront associated with one barrier (denoted by
     * _barrier_id) is at the same barrier in the program (denoted by bcnt).
     * When the number of wavefronts at the barrier matches bslots, it
     * returns true.
     */
 279     int AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots);
 280     bool cedeSIMD(int simdId, int wfSlotId);
 281
 282     template<typename c0, typename c1> void doSmReturn(GPUDynInstPtr gpuDynInst);
 283     virtual void init();
 284     void sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt);
 285     void sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt);
 286     void injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
 287                               bool kernelLaunch=true,
 288                               RequestPtr req=nullptr);
 289     void handleMemPacket(PacketPtr pkt, int memport_index);
 290     bool processTimingPacket(PacketPtr pkt);
 291     void processFetchReturn(PacketPtr pkt);
 292     void updatePageDivergenceDist(Addr addr);
 293
 294     MasterID masterId() { return _masterId; }
 295
 296     bool isDone() const;
 297     bool isSimdDone(uint32_t) const;
 298
 299   protected:
 300     MasterID _masterId;
 301
 302     LdsState &lds;
 303
 304   public:
 305     Stats::Scalar vALUInsts;
 306     Stats::Formula vALUInstsPerWF;
 307     Stats::Scalar sALUInsts;
 308     Stats::Formula sALUInstsPerWF;
 309     Stats::Scalar instCyclesVALU;
 310     Stats::Scalar instCyclesSALU;
 311     Stats::Scalar threadCyclesVALU;
 312     Stats::Formula vALUUtilization;
 313     Stats::Scalar ldsNoFlatInsts;
 314     Stats::Formula ldsNoFlatInstsPerWF;
 315     Stats::Scalar flatVMemInsts;
 316     Stats::Formula flatVMemInstsPerWF;
 317     Stats::Scalar flatLDSInsts;
 318     Stats::Formula flatLDSInstsPerWF;
 319     Stats::Scalar vectorMemWrites;
 320     Stats::Formula vectorMemWritesPerWF;
 321     Stats::Scalar vectorMemReads;
 322     Stats::Formula vectorMemReadsPerWF;
 323     Stats::Scalar scalarMemWrites;
 324     Stats::Formula scalarMemWritesPerWF;
 325     Stats::Scalar scalarMemReads;
 326     Stats::Formula scalarMemReadsPerWF;
 327
 328     void updateInstStats(GPUDynInstPtr gpuDynInst);
 329
    // the following stats compute the avg. TLB access latency per
    // uncoalesced request (only for data)
 332     Stats::Scalar tlbRequests;
 333     Stats::Scalar tlbCycles;
 334     Stats::Formula tlbLatency;
 335     // hitsPerTLBLevel[x] are the hits in Level x TLB. x = 0 is the page table.
 336     Stats::Vector hitsPerTLBLevel;
 337
 338     Stats::Scalar ldsBankAccesses;
 339     Stats::Distribution ldsBankConflictDist;
 340
 341     // over all memory instructions executed over all wavefronts
 342     // how many touched 0-4 pages, 4-8, ..., 60-64 pages
 343     Stats::Distribution pageDivergenceDist;
 344     Stats::Scalar dynamicGMemInstrCnt;
 345     Stats::Scalar dynamicLMemInstrCnt;
 346
 347     Stats::Scalar wgBlockedDueLdsAllocation;
 348     // Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are active
 349     // when the instruction is committed, this number is still incremented by 1
 350     Stats::Scalar numInstrExecuted;
 351     // Number of cycles among successive instruction executions across all
 352     // wavefronts of the same CU
 353     Stats::Distribution execRateDist;
 354     // number of individual vector operations executed
 355     Stats::Scalar numVecOpsExecuted;
 356     // Total cycles that something is running on the GPU
 357     Stats::Scalar totalCycles;
 358     Stats::Formula vpc; // vector ops per cycle
 359     Stats::Formula ipc; // vector instructions per cycle
 360     Stats::Distribution controlFlowDivergenceDist;
 361     Stats::Distribution activeLanesPerGMemInstrDist;
 362     Stats::Distribution activeLanesPerLMemInstrDist;
 363     // number of vector ALU instructions received
 364     Stats::Formula numALUInstsExecuted;
 365     // number of times a WG can not start due to lack of free VGPRs in SIMDs
 366     Stats::Scalar numTimesWgBlockedDueVgprAlloc;
 367     Stats::Scalar numCASOps;
 368     Stats::Scalar numFailedCASOps;
 369     Stats::Scalar completedWfs;
    // flag per vector SIMD unit that is set when there is at least one
    // wavefront that has a vector ALU instruction as the oldest in its
    // Instruction Buffer: Defined in the Scoreboard stage, consumed
    // by the Execute stage.
 374     std::vector<bool> vectorAluInstAvail;
 375     // number of available (oldest) LDS instructions that could have
 376     // been issued to the LDS at a specific issue slot
 377     int shrMemInstAvail;
 378     // number of available Global memory instructions that could have
 379     // been issued to TCP at a specific issue slot
 380     int glbMemInstAvail;
 381
 382     void
 383     regStats();
 384
 385     LdsState &
 386     getLds() const
 387     {
 388         return lds;
 389     }
 390
 391     int32_t
 392     getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const;
 393
 394     int cacheLineSize() const { return _cacheLineSize; }
 395
 396     bool
 397     sendToLds(GPUDynInstPtr gpuDynInst) __attribute__((warn_unused_result));
 398
 399     typedef std::unordered_map<Addr, std::pair<int, int>> pageDataStruct;
 400     pageDataStruct pageAccesses;
 401
 402     class CUExitCallback : public Callback
 403     {
 404       private:
 405         ComputeUnit *computeUnit;
 406
 407       public:
 408         virtual ~CUExitCallback() { }
 409
 410         CUExitCallback(ComputeUnit *_cu)
 411         {
 412             computeUnit = _cu;
 413         }
 414
 415         virtual void
 416         process();
 417     };
 418
 419     CUExitCallback *cuExitCallback;
 420
 421     /** Data access Port **/
 422     class DataPort : public MasterPort
 423     {
 424       public:
 425         DataPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
 426             : MasterPort(_name, _cu), computeUnit(_cu),
 427               index(_index) { }
 428
 429         bool snoopRangeSent;
 430
 431         struct SenderState : public Packet::SenderState
 432         {
 433             GPUDynInstPtr _gpuDynInst;
 434             int port_index;
 435             Packet::SenderState *saved;
 436
 437             SenderState(GPUDynInstPtr gpuDynInst, PortID _port_index,
 438                         Packet::SenderState *sender_state=nullptr)
 439                 : _gpuDynInst(gpuDynInst),
 440                   port_index(_port_index),
 441                   saved(sender_state) { }
 442         };
 443
 444         void processMemReqEvent(PacketPtr pkt);
 445         EventFunctionWrapper *createMemReqEvent(PacketPtr pkt);
 446
 447         void processMemRespEvent(PacketPtr pkt);
 448         EventFunctionWrapper *createMemRespEvent(PacketPtr pkt);
 449
 450         std::deque<std::pair<PacketPtr, GPUDynInstPtr>> retries;
 451
 452       protected:
 453         ComputeUnit *computeUnit;
 454         int index;
 455
 456         virtual bool recvTimingResp(PacketPtr pkt);
 457         virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
 458         virtual void recvFunctional(PacketPtr pkt) { }
 459         virtual void recvRangeChange() { }
 460         virtual void recvReqRetry();
 461
 462         virtual void
 463         getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
 464         {
 465             resp.clear();
 466             snoop = true;
 467         }
 468
 469     };
 470
 471     // Instruction cache access port
 472     class SQCPort : public MasterPort
 473     {
 474       public:
 475         SQCPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
 476             : MasterPort(_name, _cu), computeUnit(_cu),
 477               index(_index) { }
 478
 479         bool snoopRangeSent;
 480
 481         struct SenderState : public Packet::SenderState
 482         {
 483             Wavefront *wavefront;
 484             Packet::SenderState *saved;
 485
 486             SenderState(Wavefront *_wavefront, Packet::SenderState
 487                     *sender_state=nullptr)
 488                 : wavefront(_wavefront), saved(sender_state) { }
 489         };
 490
 491         std::deque<std::pair<PacketPtr, Wavefront*>> retries;
 492
 493       protected:
 494         ComputeUnit *computeUnit;
 495         int index;
 496
 497         virtual bool recvTimingResp(PacketPtr pkt);
 498         virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
 499         virtual void recvFunctional(PacketPtr pkt) { }
 500         virtual void recvRangeChange() { }
 501         virtual void recvReqRetry();
 502
 503         virtual void
 504         getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
 505         {
 506             resp.clear();
 507             snoop = true;
 508         }
 509      };
 510
 511     /** Data TLB port **/
 512     class DTLBPort : public MasterPort
 513     {
 514       public:
 515         DTLBPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
 516             : MasterPort(_name, _cu), computeUnit(_cu),
 517               index(_index), stalled(false)
 518         { }
 519
 520         bool isStalled() { return stalled; }
 521         void stallPort() { stalled = true; }
 522         void unstallPort() { stalled = false; }
 523
 524         /**
 525          * here we queue all the translation requests that were
 526          * not successfully sent.
 527          */
 528         std::deque<PacketPtr> retries;
 529
 530         /** SenderState is information carried along with the packet
 531          * throughout the TLB hierarchy
 532          */
 533         struct SenderState: public Packet::SenderState
 534         {
 535             // the memInst that this is associated with
 536             GPUDynInstPtr _gpuDynInst;
 537
 538             // the lane in the memInst this is associated with, so we send
 539             // the memory request down the right port
 540             int portIndex;
 541
 542             // constructor used for packets involved in timing accesses
 543             SenderState(GPUDynInstPtr gpuDynInst, PortID port_index)
 544                 : _gpuDynInst(gpuDynInst), portIndex(port_index) { }
 545
 546         };
 547
 548       protected:
 549         ComputeUnit *computeUnit;
 550         int index;
 551         bool stalled;
 552
 553         virtual bool recvTimingResp(PacketPtr pkt);
 554         virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
 555         virtual void recvFunctional(PacketPtr pkt) { }
 556         virtual void recvRangeChange() { }
 557         virtual void recvReqRetry();
 558     };
 559
 560     class ITLBPort : public MasterPort
 561     {
 562       public:
 563         ITLBPort(const std::string &_name, ComputeUnit *_cu)
 564             : MasterPort(_name, _cu), computeUnit(_cu), stalled(false) { }
 565
 566
 567         bool isStalled() { return stalled; }
 568         void stallPort() { stalled = true; }
 569         void unstallPort() { stalled = false; }
 570
 571         /**
 572          * here we queue all the translation requests that were
 573          * not successfully sent.
 574          */
 575         std::deque<PacketPtr> retries;
 576
 577         /** SenderState is information carried along with the packet
 578          * throughout the TLB hierarchy
 579          */
 580         struct SenderState: public Packet::SenderState
 581         {
 582             // The wavefront associated with this request
 583             Wavefront *wavefront;
 584
 585             SenderState(Wavefront *_wavefront) : wavefront(_wavefront) { }
 586         };
 587
 588       protected:
 589         ComputeUnit *computeUnit;
 590         bool stalled;
 591
 592         virtual bool recvTimingResp(PacketPtr pkt);
 593         virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
 594         virtual void recvFunctional(PacketPtr pkt) { }
 595         virtual void recvRangeChange() { }
 596         virtual void recvReqRetry();
 597     };
 598
 599     /**
 600      * the port intended to communicate between the CU and its LDS
 601      */
 602     class LDSPort : public MasterPort
 603     {
 604       public:
 605         LDSPort(const std::string &_name, ComputeUnit *_cu, PortID _id)
 606         : MasterPort(_name, _cu, _id), computeUnit(_cu)
 607         {
 608         }
 609
 610         bool isStalled() const { return stalled; }
 611         void stallPort() { stalled = true; }
 612         void unstallPort() { stalled = false; }
 613
 614         /**
 615          * here we queue all the requests that were
 616          * not successfully sent.
 617          */
 618         std::queue<PacketPtr> retries;
 619
 620         /**
 621          *  SenderState is information carried along with the packet, esp. the
 622          *  GPUDynInstPtr
 623          */
 624         class SenderState: public Packet::SenderState
 625         {
 626           protected:
 627             // The actual read/write/atomic request that goes with this command
 628             GPUDynInstPtr _gpuDynInst = nullptr;
 629
 630           public:
 631             SenderState(GPUDynInstPtr gpuDynInst):
 632               _gpuDynInst(gpuDynInst)
 633             {
 634             }
 635
 636             GPUDynInstPtr
 637             getMemInst() const
 638             {
 639               return _gpuDynInst;
 640             }
 641         };
 642
 643         virtual bool
 644         sendTimingReq(PacketPtr pkt);
 645
 646       protected:
 647
 648         bool stalled = false; ///< whether or not it is stalled
 649
 650         ComputeUnit *computeUnit;
 651
 652         virtual bool
 653         recvTimingResp(PacketPtr pkt);
 654
 655         virtual Tick
 656         recvAtomic(PacketPtr pkt) { return 0; }
 657
 658         virtual void
 659         recvFunctional(PacketPtr pkt)
 660         {
 661         }
 662
 663         virtual void
 664         recvRangeChange()
 665         {
 666         }
 667
 668         virtual void
 669         recvReqRetry();
 670     };
 671
 672     /** The port to access the Local Data Store
 673      *  Can be connected to a LDS object
 674      */
 675     LDSPort *ldsPort = nullptr;
 676
 677     LDSPort *
 678     getLdsPort() const
 679     {
 680         return ldsPort;
 681     }
 682
 683     /** The memory port for SIMD data accesses.
 684      *  Can be connected to PhysMem for Ruby for timing simulations
 685      */
 686     std::vector<DataPort*> memPort;
 687     // port to the TLB hierarchy (i.e., the L1 TLB)
 688     std::vector<DTLBPort*> tlbPort;
 689     // port to the SQC (i.e. the I-cache)
 690     SQCPort *sqcPort;
 691     // port to the SQC TLB (there's a separate TLB for each I-cache)
 692     ITLBPort *sqcTLBPort;
 693
 694     Port &
 695     getPort(const std::string &if_name, PortID idx) override
 696     {
 697         if (if_name == "memory_port") {
 698             memPort[idx] = new DataPort(csprintf("%s-port%d", name(), idx),
 699                                         this, idx);
 700             return *memPort[idx];
 701         } else if (if_name == "translation_port") {
 702             tlbPort[idx] = new DTLBPort(csprintf("%s-port%d", name(), idx),
 703                                         this, idx);
 704             return *tlbPort[idx];
 705         } else if (if_name == "sqc_port") {
 706             sqcPort = new SQCPort(csprintf("%s-port%d", name(), idx),
 707                                   this, idx);
 708             return *sqcPort;
 709         } else if (if_name == "sqc_tlb_port") {
 710             sqcTLBPort = new ITLBPort(csprintf("%s-port", name()), this);
 711             return *sqcTLBPort;
 712         } else if (if_name == "ldsPort") {
 713             if (ldsPort) {
 714                 fatal("an LDS port was already allocated");
 715             }
 716             ldsPort = new LDSPort(csprintf("%s-port", name()), this, idx);
 717             return *ldsPort;
 718         } else {
 719             panic("incorrect port name");
 720         }
 721     }
 722
 723     // xact_cas_load()
 724     class waveIdentifier
 725     {
 726       public:
 727         waveIdentifier() { }
 728         waveIdentifier(int _simdId, int _wfSlotId)
 729           : simdId(_simdId), wfSlotId(_wfSlotId) { }
 730
 731         int simdId;
 732         int wfSlotId;
 733     };
 734
 735     class waveQueue
 736     {
 737       public:
 738         std::list<waveIdentifier> waveIDQueue;
 739     };
 740     std::map<unsigned, waveQueue> xactCasLoadMap;
 741
 742     uint64_t getAndIncSeqNum() { return globalSeqNum++; }
 743
 744   private:
 745     const int _cacheLineSize;
 746     uint64_t globalSeqNum;
 747     int wavefrontSize;
 748     GPUStaticInst *kernelLaunchInst;
 749 };
 750
 751 #endif // __COMPUTE_UNIT_HH__