gpu-compute: update port terminology
[gem5.git] / src / gpu-compute / compute_unit.hh
1 /*
2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * For use for simulation and test purposes only
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the copyright holder nor the names of its
18 * contributors may be used to endorse or promote products derived from this
19 * software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 */
33
34 #ifndef __COMPUTE_UNIT_HH__
35 #define __COMPUTE_UNIT_HH__
36
37 #include <deque>
38 #include <map>
39 #include <unordered_set>
40 #include <vector>
41
42 #include "base/callback.hh"
43 #include "base/statistics.hh"
44 #include "base/types.hh"
45 #include "config/the_gpu_isa.hh"
46 #include "enums/PrefetchType.hh"
47 #include "gpu-compute/comm.hh"
48 #include "gpu-compute/exec_stage.hh"
49 #include "gpu-compute/fetch_stage.hh"
50 #include "gpu-compute/global_memory_pipeline.hh"
51 #include "gpu-compute/hsa_queue_entry.hh"
52 #include "gpu-compute/local_memory_pipeline.hh"
53 #include "gpu-compute/register_manager.hh"
54 #include "gpu-compute/scalar_memory_pipeline.hh"
55 #include "gpu-compute/schedule_stage.hh"
56 #include "gpu-compute/scoreboard_check_stage.hh"
57 #include "mem/port.hh"
58 #include "mem/token_port.hh"
59 #include "sim/clocked_object.hh"
60
61 class HSAQueueEntry;
62 class LdsChunk;
63 class ScalarRegisterFile;
64 class Shader;
65 class VectorRegisterFile;
66
67 struct ComputeUnitParams;
68
69 enum EXEC_POLICY
70 {
71 OLDEST = 0,
72 RR
73 };
74
75 enum TLB_CACHE
76 {
77 TLB_MISS_CACHE_MISS = 0,
78 TLB_MISS_CACHE_HIT,
79 TLB_HIT_CACHE_MISS,
80 TLB_HIT_CACHE_HIT
81 };
82
83 /**
84 * WF barrier slots. This represents the barrier resource for
85 * WF-level barriers (i.e., barriers to sync WFs within a WG).
86 */
87 class WFBarrier
88 {
89 public:
90 WFBarrier() : _numAtBarrier(0), _maxBarrierCnt(0)
91 {
92 }
93
94 static const int InvalidID = -1;
95
96 int
97 numAtBarrier() const
98 {
99 return _numAtBarrier;
100 }
101
102 /**
103 * Number of WFs that have not yet reached the barrier.
104 */
105 int
106 numYetToReachBarrier() const
107 {
108 return _maxBarrierCnt - _numAtBarrier;
109 }
110
111 int
112 maxBarrierCnt() const
113 {
114 return _maxBarrierCnt;
115 }
116
117 /**
118 * Set the maximum barrier count (i.e., the number of WFs that are
119 * participating in the barrier).
120 */
121 void
122 setMaxBarrierCnt(int max_barrier_cnt)
123 {
124 _maxBarrierCnt = max_barrier_cnt;
125 }
126
127 /**
128 * Mark that a WF has reached the barrier.
129 */
130 void
131 incNumAtBarrier()
132 {
133 assert(_numAtBarrier < _maxBarrierCnt);
134 ++_numAtBarrier;
135 }
136
137 /**
138 * Have all WFs participating in this barrier reached the barrier?
139 * If so, then the barrier is satisfied and WFs may proceed past
140 * the barrier.
141 */
142 bool
143 allAtBarrier() const
144 {
145 return _numAtBarrier == _maxBarrierCnt;
146 }
147
148 /**
149 * Decrement the number of WFs that are participating in this barrier.
150 * This should be called when a WF exits.
151 */
152 void
153 decMaxBarrierCnt()
154 {
155 assert(_maxBarrierCnt > 0);
156 --_maxBarrierCnt;
157 }
158
159 /**
160 * Release this barrier resource so it can be used by other WGs. This
161 * is generally called when a WG has finished.
162 */
163 void
164 release()
165 {
166 _numAtBarrier = 0;
167 _maxBarrierCnt = 0;
168 }
169
170 /**
171 * Reset the barrier. This is typically done when a dynamic instance
172 * of a barrier has been satisfied.
173 */
174 void
175 reset()
176 {
177 _numAtBarrier = 0;
178 }
179
180 private:
181 /**
182 * The number of WFs in the WG that have reached the barrier. Once
183 * the number of WFs that reach a barrier matches the number of WFs
184 * in the WG, the barrier is satisfied.
185 */
186 int _numAtBarrier;
187
188 /**
189 * The maximum number of WFs that can reach this barrier. This is
190 * essentially the number of WFs in the WG, and a barrier is satisfied
191 * when the number of WFs that reach the barrier equal this value. If
192 * a WF exits early it must decrement this value so that it is no
193 * longer considered for this barrier.
194 */
195 int _maxBarrierCnt;
196 };
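// Illustrative sketch (not part of the original header): the WFBarrier
// lifecycle implied by the comments above. The counts and call sites are
// hypothetical.
//
//     WFBarrier bar;
//     bar.setMaxBarrierCnt(4);      // 4 WFs in the WG participate
//
//     bar.incNumAtBarrier();        // called as each WF reaches the barrier
//     if (bar.allAtBarrier()) {
//         bar.reset();              // dynamic barrier instance satisfied;
//     }                             // WFs may proceed past the barrier
//
//     bar.decMaxBarrierCnt();       // a participating WF exited early
//     bar.release();                // WG finished; slot free for other WGs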
197
198 class ComputeUnit : public ClockedObject
199 {
200 public:
201
202
203 // Execution resources
204 //
205 // The ordering of units is:
206 // Vector ALUs
207 // Scalar ALUs
208 // GM Pipe
209 // LM Pipe
210 // Scalar Mem Pipe
211 //
212 // Note: the ordering of units is important and the code assumes the
213 // above ordering. However, there may be more than one resource of
214 // each type (e.g., 4 VALUs or 2 SALUs)
215
216 int numVectorGlobalMemUnits;
217 // Resource control for global memory to VRF data/address bus
218 WaitClass glbMemToVrfBus;
219 // Resource control for Vector Register File->Global Memory pipe buses
220 WaitClass vrfToGlobalMemPipeBus;
221 // Resource control for Vector Global Memory execution unit
222 WaitClass vectorGlobalMemUnit;
223
224 int numVectorSharedMemUnits;
225 // Resource control for local memory to VRF data/address bus
226 WaitClass locMemToVrfBus;
227 // Resource control for Vector Register File->Local Memory pipe buses
228 WaitClass vrfToLocalMemPipeBus;
229 // Resource control for Vector Shared/Local Memory execution unit
230 WaitClass vectorSharedMemUnit;
231
232 int numScalarMemUnits;
233 // Resource control for scalar memory to SRF data/address bus
234 WaitClass scalarMemToSrfBus;
235 // Resource control for Scalar Register File->Scalar Memory pipe buses
236 WaitClass srfToScalarMemPipeBus;
237 // Resource control for Scalar Memory execution unit
238 WaitClass scalarMemUnit;
239
240 // vector ALU execution resources
241 int numVectorALUs;
242 std::vector<WaitClass> vectorALUs;
243
244 // scalar ALU execution resources
245 int numScalarALUs;
246 std::vector<WaitClass> scalarALUs;
247
248 // Return total number of execution units on this CU
249 int numExeUnits() const;
250 // index into readyList of the first memory unit
251 int firstMemUnit() const;
252 // index into readyList of the last memory unit
253 int lastMemUnit() const;
254 // index into scalarALUs vector of SALU used by the wavefront
255 int mapWaveToScalarAlu(Wavefront *w) const;
256 // index into readyList of SALU used by wavefront
257 int mapWaveToScalarAluGlobalIdx(Wavefront *w) const;
258 // index into readyList of Global Memory unit used by wavefront
259 int mapWaveToGlobalMem(Wavefront *w) const;
260 // index into readyList of Local Memory unit used by wavefront
261 int mapWaveToLocalMem(Wavefront *w) const;
262 // index into readyList of Scalar Memory unit used by wavefront
263 int mapWaveToScalarMem(Wavefront *w) const;
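// Illustrative sketch (not from this header): one way the flat execution
// unit index can follow the ordering documented above (VALUs, then SALUs,
// then the GM, LM, and scalar memory pipes). The real implementations
// live in compute_unit.cc.
//
//     int numExeUnits() const
//     {
//         return numVectorALUs + numScalarALUs + numVectorGlobalMemUnits +
//             numVectorSharedMemUnits + numScalarMemUnits;
//     }
//     int firstMemUnit() const { return numVectorALUs + numScalarALUs; }
//     int lastMemUnit() const  { return numExeUnits() - 1; }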
264
265 int vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes
266 int coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes
267 int numCyclesPerStoreTransfer; // number of cycles per vector store
268 int numCyclesPerLoadTransfer; // number of cycles per vector load
269
270 // track presence of dynamic instructions in the Schedule pipeline
271 // stage. This is used to check the readiness of the oldest,
272 // non-dispatched instruction of every WF in the Scoreboard stage.
273 std::unordered_set<uint64_t> pipeMap;
274
275 RegisterManager* registerManager;
276
277 FetchStage fetchStage;
278 ScoreboardCheckStage scoreboardCheckStage;
279 ScheduleStage scheduleStage;
280 ExecStage execStage;
281 GlobalMemPipeline globalMemoryPipe;
282 LocalMemPipeline localMemoryPipe;
283 ScalarMemPipeline scalarMemoryPipe;
284
285 EventFunctionWrapper tickEvent;
286
287 typedef ComputeUnitParams Params;
288 std::vector<std::vector<Wavefront*>> wfList;
289 int cu_id;
290
291 // array of vector register files, one per SIMD
292 std::vector<VectorRegisterFile*> vrf;
293 // array of scalar register files, one per SIMD
294 std::vector<ScalarRegisterFile*> srf;
295
296 // Width per VALU/SIMD unit: number of work items that can be executed
297 // on the vector ALU simultaneously in a SIMD unit
298 int simdWidth;
299 // number of pipe stages for bypassing data to next dependent single
300 // precision vector instruction inside the vector ALU pipeline
301 int spBypassPipeLength;
302 // number of pipe stages for bypassing data to next dependent double
303 // precision vector instruction inside the vector ALU pipeline
304 int dpBypassPipeLength;
305 // number of pipe stages for scalar ALU
306 int scalarPipeStages;
307 // number of pipe stages for operand collection & distribution network
308 int operandNetworkLength;
309 // number of cycles per instruction issue period
310 Cycles issuePeriod;
311
312 // VRF to GM Bus latency
313 Cycles vrf_gm_bus_latency;
314 // SRF to Scalar Mem Bus latency
315 Cycles srf_scm_bus_latency;
316 // VRF to LM Bus latency
317 Cycles vrf_lm_bus_latency;
318
319 // tracks the last cycle a vector instruction was executed on a SIMD
320 std::vector<uint64_t> lastExecCycle;
321
322 // Track the amount of interleaving between wavefronts on each SIMD.
323 // This stat is sampled using instExecPerSimd to compute the number of
324 // instructions that have been executed on a SIMD between a WF executing
325 // two successive instructions.
326 Stats::VectorDistribution instInterleave;
327
328 // tracks the number of dyn inst executed per SIMD
329 std::vector<uint64_t> instExecPerSimd;
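// Illustrative sketch (hypothetical; "lastInstExec" stands for a per-WF
// snapshot of instExecPerSimd taken at that WF's previous instruction):
// how the interleaving distribution could be sampled when a WF on SIMD
// "simdId" issues its next instruction.
//
//     instInterleave[simdId].sample(instExecPerSimd[simdId] - lastInstExec);
//     instExecPerSimd[simdId]++;
//     lastInstExec = instExecPerSimd[simdId];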
330
331 // true if we allow a separate TLB per lane
332 bool perLaneTLB;
333 // if 0, TLB prefetching is off.
334 int prefetchDepth;
335 // if fixed-stride prefetching, this is the stride.
336 int prefetchStride;
337
338 std::vector<Addr> lastVaddrCU;
339 std::vector<std::vector<Addr>> lastVaddrSimd;
340 std::vector<std::vector<std::vector<Addr>>> lastVaddrWF;
341 Enums::PrefetchType prefetchType;
342 EXEC_POLICY exec_policy;
343
344 bool debugSegFault;
345 // Idle CU timeout in ticks
346 Tick idleCUTimeout;
347 int idleWfs;
348 bool functionalTLB;
349 bool localMemBarrier;
350
351 /*
352 * for Counting page accesses
353 */
354 bool countPages;
355
356 Shader *shader;
357
358 Tick req_tick_latency;
359 Tick resp_tick_latency;
360
361 /**
362 * Number of WFs to schedule to each SIMD. This vector is populated
363 * by hasDispResources(), and consumed by the subsequent call to
364 * dispWorkgroup(), to schedule the specified number of WFs to the
365 * SIMD units. Entry I provides the number of WFs to schedule to SIMD I.
366 */
367 std::vector<int> numWfsToSched;
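// Illustrative sketch (dispatcher-side view, not from this file): the
// call order implied by the comment above. "task" and "num_wfs_in_wg"
// are hypothetical locals.
//
//     int num_wfs_in_wg = 0;
//     if (cu->hasDispResources(task, num_wfs_in_wg)) {
//         // hasDispResources() has populated numWfsToSched per SIMD
//         cu->dispWorkgroup(task, num_wfs_in_wg);
//     }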
368
369 // number of currently reserved vector registers per SIMD unit
370 std::vector<int> vectorRegsReserved;
371 // number of currently reserved scalar registers per SIMD unit
372 std::vector<int> scalarRegsReserved;
373 // number of vector registers per SIMD unit
374 int numVecRegsPerSimd;
375 // number of available scalar registers per SIMD unit
376 int numScalarRegsPerSimd;
377
378 // this hash map will keep track of page divergence
379 // per memory instruction per wavefront. The hash map
380 // is cleared in GPUDynInst::updateStats() in gpu_dyn_inst.cc.
381 std::map<Addr, int> pagesTouched;
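// Illustrative sketch (assuming page-granular rounding via
// TheISA::PageBytes; the real logic lives in compute_unit.cc and
// gpu_dyn_inst.cc): each lane's address bumps a per-page counter, and the
// number of distinct pages touched by the instruction is later sampled
// into pageDivergenceDist.
//
//     void updatePageDivergenceDist(Addr addr)
//     {
//         Addr virt_page_addr = roundDown(addr, TheISA::PageBytes);
//         pagesTouched[virt_page_addr]++;
//     }
//
//     // in GPUDynInst::updateStats():
//     //     pageDivergenceDist.sample(pagesTouched.size());
//     //     pagesTouched.clear();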
382
383 void insertInPipeMap(Wavefront *w);
384 void deleteFromPipeMap(Wavefront *w);
385
386 ComputeUnit(const Params *p);
387 ~ComputeUnit();
388
389 // Timing Functions
390 int oprNetPipeLength() const { return operandNetworkLength; }
391 int simdUnitWidth() const { return simdWidth; }
392 int spBypassLength() const { return spBypassPipeLength; }
393 int dpBypassLength() const { return dpBypassPipeLength; }
394 int scalarPipeLength() const { return scalarPipeStages; }
395 int storeBusLength() const { return numCyclesPerStoreTransfer; }
396 int loadBusLength() const { return numCyclesPerLoadTransfer; }
397 int wfSize() const { return wavefrontSize; }
398
399 void exec();
400 void initiateFetch(Wavefront *wavefront);
401 void fetch(PacketPtr pkt, Wavefront *wavefront);
402 void fillKernelState(Wavefront *w, HSAQueueEntry *task);
403
404 void startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
405 HSAQueueEntry *task, int bar_id,
406 bool fetchContext=false);
407
408 void doInvalidate(RequestPtr req, int kernId);
409 void doFlush(GPUDynInstPtr gpuDynInst);
410
411 void dispWorkgroup(HSAQueueEntry *task, int num_wfs_in_wg);
412 bool hasDispResources(HSAQueueEntry *task, int &num_wfs_in_wg);
413
414 int cacheLineSize() const { return _cacheLineSize; }
415 int getCacheLineBits() const { return cacheLineBits; }
416
417 private:
418 WFBarrier&
419 barrierSlot(int bar_id)
420 {
421 assert(bar_id > WFBarrier::InvalidID);
422 return wfBarrierSlots.at(bar_id);
423 }
424
425 int
426 getFreeBarrierId()
427 {
428 assert(freeBarrierIds.size());
429 auto free_bar_id = freeBarrierIds.begin();
430 int bar_id = *free_bar_id;
431 freeBarrierIds.erase(free_bar_id);
432 return bar_id;
433 }
434
435 public:
436 int numYetToReachBarrier(int bar_id);
437 bool allAtBarrier(int bar_id);
438 void incNumAtBarrier(int bar_id);
439 int numAtBarrier(int bar_id);
440 int maxBarrierCnt(int bar_id);
441 void resetBarrier(int bar_id);
442 void decMaxBarrierCnt(int bar_id);
443 void releaseBarrier(int bar_id);
444 void releaseWFsFromBarrier(int bar_id);
445 int numBarrierSlots() const { return _numBarrierSlots; }
446
447 template<typename c0, typename c1>
448 void doSmReturn(GPUDynInstPtr gpuDynInst);
449
450 virtual void init() override;
451 void sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt);
452 void sendScalarRequest(GPUDynInstPtr gpuDynInst, PacketPtr pkt);
453 void injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
454 bool kernelMemSync,
455 RequestPtr req=nullptr);
456 void handleMemPacket(PacketPtr pkt, int memport_index);
457 bool processTimingPacket(PacketPtr pkt);
458 void processFetchReturn(PacketPtr pkt);
459 void updatePageDivergenceDist(Addr addr);
460
461 MasterID masterId() { return _masterId; }
462
463 bool isDone() const;
464 bool isVectorAluIdle(uint32_t simdId) const;
465
466 protected:
467 MasterID _masterId;
468
469 LdsState &lds;
470
471 public:
472 Stats::Scalar vALUInsts;
473 Stats::Formula vALUInstsPerWF;
474 Stats::Scalar sALUInsts;
475 Stats::Formula sALUInstsPerWF;
476 Stats::Scalar instCyclesVALU;
477 Stats::Scalar instCyclesSALU;
478 Stats::Scalar threadCyclesVALU;
479 Stats::Formula vALUUtilization;
480 Stats::Scalar ldsNoFlatInsts;
481 Stats::Formula ldsNoFlatInstsPerWF;
482 Stats::Scalar flatVMemInsts;
483 Stats::Formula flatVMemInstsPerWF;
484 Stats::Scalar flatLDSInsts;
485 Stats::Formula flatLDSInstsPerWF;
486 Stats::Scalar vectorMemWrites;
487 Stats::Formula vectorMemWritesPerWF;
488 Stats::Scalar vectorMemReads;
489 Stats::Formula vectorMemReadsPerWF;
490 Stats::Scalar scalarMemWrites;
491 Stats::Formula scalarMemWritesPerWF;
492 Stats::Scalar scalarMemReads;
493 Stats::Formula scalarMemReadsPerWF;
494
495 Stats::Formula vectorMemReadsPerKiloInst;
496 Stats::Formula vectorMemWritesPerKiloInst;
497 Stats::Formula vectorMemInstsPerKiloInst;
498 Stats::Formula scalarMemReadsPerKiloInst;
499 Stats::Formula scalarMemWritesPerKiloInst;
500 Stats::Formula scalarMemInstsPerKiloInst;
501
502 // Cycles required to send register source (addr and data) from
503 // register files to memory pipeline, per SIMD.
504 Stats::Vector instCyclesVMemPerSimd;
505 Stats::Vector instCyclesScMemPerSimd;
506 Stats::Vector instCyclesLdsPerSimd;
507
508 Stats::Scalar globalReads;
509 Stats::Scalar globalWrites;
510 Stats::Formula globalMemInsts;
511 Stats::Scalar argReads;
512 Stats::Scalar argWrites;
513 Stats::Formula argMemInsts;
514 Stats::Scalar spillReads;
515 Stats::Scalar spillWrites;
516 Stats::Formula spillMemInsts;
517 Stats::Scalar groupReads;
518 Stats::Scalar groupWrites;
519 Stats::Formula groupMemInsts;
520 Stats::Scalar privReads;
521 Stats::Scalar privWrites;
522 Stats::Formula privMemInsts;
523 Stats::Scalar readonlyReads;
524 Stats::Scalar readonlyWrites;
525 Stats::Formula readonlyMemInsts;
526 Stats::Scalar kernargReads;
527 Stats::Scalar kernargWrites;
528 Stats::Formula kernargMemInsts;
529
530 int activeWaves;
531 Stats::Distribution waveLevelParallelism;
532
533 void updateInstStats(GPUDynInstPtr gpuDynInst);
534
535 // the following stats compute the avg. TLB access latency per
536 // uncoalesced request (only for data)
537 Stats::Scalar tlbRequests;
538 Stats::Scalar tlbCycles;
539 Stats::Formula tlbLatency;
540 // hitsPerTLBLevel[x] are the hits in Level x TLB. x = 0 is the page table.
541 Stats::Vector hitsPerTLBLevel;
542
543 Stats::Scalar ldsBankAccesses;
544 Stats::Distribution ldsBankConflictDist;
545
546 // over all memory instructions executed over all wavefronts
547 // how many touched 0-4 pages, 4-8, ..., 60-64 pages
548 Stats::Distribution pageDivergenceDist;
549 // count of non-flat global memory vector instructions executed
550 Stats::Scalar dynamicGMemInstrCnt;
551 // count of flat global memory vector instructions executed
552 Stats::Scalar dynamicFlatMemInstrCnt;
553 Stats::Scalar dynamicLMemInstrCnt;
554
555 Stats::Scalar wgBlockedDueBarrierAllocation;
556 Stats::Scalar wgBlockedDueLdsAllocation;
557 // Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are
558 // active when the instruction is committed, this number is still
559 // incremented by 1
560 Stats::Scalar numInstrExecuted;
561 // Number of cycles among successive instruction executions across all
562 // wavefronts of the same CU
563 Stats::Distribution execRateDist;
564 // number of individual vector operations executed
565 Stats::Scalar numVecOpsExecuted;
566 // number of individual f16 vector operations executed
567 Stats::Scalar numVecOpsExecutedF16;
568 // number of individual f32 vector operations executed
569 Stats::Scalar numVecOpsExecutedF32;
570 // number of individual f64 vector operations executed
571 Stats::Scalar numVecOpsExecutedF64;
572 // number of individual FMA 16,32,64 vector operations executed
573 Stats::Scalar numVecOpsExecutedFMA16;
574 Stats::Scalar numVecOpsExecutedFMA32;
575 Stats::Scalar numVecOpsExecutedFMA64;
576 // number of individual MAC 16,32,64 vector operations executed
577 Stats::Scalar numVecOpsExecutedMAC16;
578 Stats::Scalar numVecOpsExecutedMAC32;
579 Stats::Scalar numVecOpsExecutedMAC64;
580 // number of individual MAD 16,32,64 vector operations executed
581 Stats::Scalar numVecOpsExecutedMAD16;
582 Stats::Scalar numVecOpsExecutedMAD32;
583 Stats::Scalar numVecOpsExecutedMAD64;
584 // total number of two op FP vector operations executed
585 Stats::Scalar numVecOpsExecutedTwoOpFP;
586 // Total cycles that something is running on the GPU
587 Stats::Scalar totalCycles;
588 Stats::Formula vpc; // vector ops per cycle
589 Stats::Formula vpc_f16; // vector ops per cycle
590 Stats::Formula vpc_f32; // vector ops per cycle
591 Stats::Formula vpc_f64; // vector ops per cycle
592 Stats::Formula ipc; // vector instructions per cycle
593 Stats::Distribution controlFlowDivergenceDist;
594 Stats::Distribution activeLanesPerGMemInstrDist;
595 Stats::Distribution activeLanesPerLMemInstrDist;
596 // number of vector ALU instructions received
597 Stats::Formula numALUInstsExecuted;
598 // number of times a WG can not start due to lack of free VGPRs in SIMDs
599 Stats::Scalar numTimesWgBlockedDueVgprAlloc;
600 // number of times a WG can not start due to lack of free SGPRs in SIMDs
601 Stats::Scalar numTimesWgBlockedDueSgprAlloc;
602 Stats::Scalar numCASOps;
603 Stats::Scalar numFailedCASOps;
604 Stats::Scalar completedWfs;
605 Stats::Scalar completedWGs;
606
607 // distribution in latency difference between first and last cache block
608 // arrival ticks
609 Stats::Distribution headTailLatency;
610
611 void
612 regStats() override;
613
614 LdsState &
615 getLds() const
616 {
617 return lds;
618 }
619
620 int32_t
621 getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const;
622
623 bool
624 sendToLds(GPUDynInstPtr gpuDynInst) __attribute__((warn_unused_result));
625
626 typedef std::unordered_map<Addr, std::pair<int, int>> pageDataStruct;
627 pageDataStruct pageAccesses;
628
629 void exitCallback();
630
631 class GMTokenPort : public TokenMasterPort
632 {
633 public:
634 GMTokenPort(const std::string& name, SimObject *owner,
635 PortID id = InvalidPortID)
636 : TokenMasterPort(name, owner, id)
637 { }
638 ~GMTokenPort() { }
639
640 protected:
641 bool recvTimingResp(PacketPtr) { return false; }
642 void recvReqRetry() { }
643 };
644
645 // Manager for the number of tokens available to this compute unit to
646 // send global memory request packets to the coalescer. This is only used
647 // between the global memory pipe and the TCP coalescer.
648 TokenManager *memPortTokens;
649 GMTokenPort gmTokenPort;
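// Illustrative sketch (assuming the TokenManager interface from
// mem/token_port.hh, e.g. haveTokens()/acquireTokens(); the flow shown is
// a simplification): the global memory pipe checks for a token before
// issuing a request, and the coalescer returns tokens as it drains them.
//
//     if (memPortTokens->haveTokens(1)) {
//         memPortTokens->acquireTokens(1);
//         // ... issue the request packet toward the coalescer ...
//     }
//     // the coalescer later returns tokens through gmTokenPort, crediting
//     // them back to memPortTokens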
650
651 /** Data access Port **/
652 class DataPort : public RequestPort
653 {
654 public:
655 DataPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
656 : RequestPort(_name, _cu), computeUnit(_cu),
657 index(_index) { }
658
659 bool snoopRangeSent;
660
661 struct SenderState : public Packet::SenderState
662 {
663 GPUDynInstPtr _gpuDynInst;
664 int port_index;
665 Packet::SenderState *saved;
666
667 SenderState(GPUDynInstPtr gpuDynInst, PortID _port_index,
668 Packet::SenderState *sender_state=nullptr)
669 : _gpuDynInst(gpuDynInst),
670 port_index(_port_index),
671 saved(sender_state) { }
672 };
673
674 void processMemReqEvent(PacketPtr pkt);
675 EventFunctionWrapper *createMemReqEvent(PacketPtr pkt);
676
677 void processMemRespEvent(PacketPtr pkt);
678 EventFunctionWrapper *createMemRespEvent(PacketPtr pkt);
679
680 std::deque<std::pair<PacketPtr, GPUDynInstPtr>> retries;
681
682 protected:
683 ComputeUnit *computeUnit;
684 int index;
685
686 virtual bool recvTimingResp(PacketPtr pkt);
687 virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
688 virtual void recvFunctional(PacketPtr pkt) { }
689 virtual void recvRangeChange() { }
690 virtual void recvReqRetry();
691
692 virtual void
693 getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
694 {
695 resp.clear();
696 snoop = true;
697 }
698
699 };
700
701 // Scalar data cache access port
702 class ScalarDataPort : public RequestPort
703 {
704 public:
705 ScalarDataPort(const std::string &_name, ComputeUnit *_cu,
706 PortID _index)
707 : RequestPort(_name, _cu, _index), computeUnit(_cu), index(_index)
708 {
709 (void)index;
710 }
711
712 bool recvTimingResp(PacketPtr pkt) override;
713 void recvReqRetry() override;
714
715 struct SenderState : public Packet::SenderState
716 {
717 SenderState(GPUDynInstPtr gpuDynInst,
718 Packet::SenderState *sender_state=nullptr)
719 : _gpuDynInst(gpuDynInst), saved(sender_state)
720 {
721 }
722
723 GPUDynInstPtr _gpuDynInst;
724 Packet::SenderState *saved;
725 };
726
727 class MemReqEvent : public Event
728 {
729 private:
730 ScalarDataPort *scalarDataPort;
731 PacketPtr pkt;
732
733 public:
734 MemReqEvent(ScalarDataPort *_scalar_data_port, PacketPtr _pkt)
735 : Event(), scalarDataPort(_scalar_data_port), pkt(_pkt)
736 {
737 setFlags(Event::AutoDelete);
738 }
739
740 void process();
741 const char *description() const;
742 };
743
744 std::deque<PacketPtr> retries;
745
746 private:
747 ComputeUnit *computeUnit;
748 PortID index;
749 };
750
751 // Instruction cache access port
752 class SQCPort : public RequestPort
753 {
754 public:
755 SQCPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
756 : RequestPort(_name, _cu), computeUnit(_cu),
757 index(_index) { }
758
759 bool snoopRangeSent;
760
761 struct SenderState : public Packet::SenderState
762 {
763 Wavefront *wavefront;
764 Packet::SenderState *saved;
765 // kernel id to be used in handling I-Cache invalidate response
766 int kernId;
767
768 SenderState(Wavefront *_wavefront, Packet::SenderState
769 *sender_state=nullptr, int _kernId=-1)
770 : wavefront(_wavefront), saved(sender_state),
771 kernId(_kernId){ }
772 };
773
774 std::deque<std::pair<PacketPtr, Wavefront*>> retries;
775
776 protected:
777 ComputeUnit *computeUnit;
778 int index;
779
780 virtual bool recvTimingResp(PacketPtr pkt);
781 virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
782 virtual void recvFunctional(PacketPtr pkt) { }
783 virtual void recvRangeChange() { }
784 virtual void recvReqRetry();
785
786 virtual void
787 getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
788 {
789 resp.clear();
790 snoop = true;
791 }
792 };
793
794 /** Data TLB port **/
795 class DTLBPort : public RequestPort
796 {
797 public:
798 DTLBPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
799 : RequestPort(_name, _cu), computeUnit(_cu),
800 index(_index), stalled(false)
801 { }
802
803 bool isStalled() { return stalled; }
804 void stallPort() { stalled = true; }
805 void unstallPort() { stalled = false; }
806
807 /**
808 * here we queue all the translation requests that were
809 * not successfully sent.
810 */
811 std::deque<PacketPtr> retries;
812
813 /** SenderState is information carried along with the packet
814 * throughout the TLB hierarchy
815 */
816 struct SenderState: public Packet::SenderState
817 {
818 // the memInst that this is associated with
819 GPUDynInstPtr _gpuDynInst;
820
821 // the lane in the memInst this is associated with, so we send
822 // the memory request down the right port
823 int portIndex;
824
825 // constructor used for packets involved in timing accesses
826 SenderState(GPUDynInstPtr gpuDynInst, PortID port_index)
827 : _gpuDynInst(gpuDynInst), portIndex(port_index) { }
828
829 };
830
831 protected:
832 ComputeUnit *computeUnit;
833 int index;
834 bool stalled;
835
836 virtual bool recvTimingResp(PacketPtr pkt);
837 virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
838 virtual void recvFunctional(PacketPtr pkt) { }
839 virtual void recvRangeChange() { }
840 virtual void recvReqRetry();
841 };
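// Illustrative sketch (not from this file): the stall/retry protocol the
// D-TLB port members above support. "idx" and "pkt" are hypothetical.
//
//     if (!tlbPort[idx]->sendTimingReq(pkt)) {
//         tlbPort[idx]->stallPort();
//         tlbPort[idx]->retries.push_back(pkt);
//     }
//     // DTLBPort::recvReqRetry() later unstalls the port and replays the
//     // queued packets from the retries deque.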
842
843 class ScalarDTLBPort : public RequestPort
844 {
845 public:
846 ScalarDTLBPort(const std::string &_name, ComputeUnit *_cu)
847 : RequestPort(_name, _cu), computeUnit(_cu), stalled(false)
848 {
849 }
850
851 struct SenderState : public Packet::SenderState
852 {
853 SenderState(GPUDynInstPtr gpuDynInst) : _gpuDynInst(gpuDynInst) { }
854 GPUDynInstPtr _gpuDynInst;
855 };
856
857 bool recvTimingResp(PacketPtr pkt) override;
858 void recvReqRetry() override { assert(false); }
859
860 bool isStalled() const { return stalled; }
861 void stallPort() { stalled = true; }
862 void unstallPort() { stalled = false; }
863
864 std::deque<PacketPtr> retries;
865
866 private:
867 ComputeUnit *computeUnit;
868 bool stalled;
869 };
870
871 class ITLBPort : public RequestPort
872 {
873 public:
874 ITLBPort(const std::string &_name, ComputeUnit *_cu)
875 : RequestPort(_name, _cu), computeUnit(_cu), stalled(false) { }
876
877
878 bool isStalled() { return stalled; }
879 void stallPort() { stalled = true; }
880 void unstallPort() { stalled = false; }
881
882 /**
883 * here we queue all the translation requests that were
884 * not successfully sent.
885 */
886 std::deque<PacketPtr> retries;
887
888 /** SenderState is information carried along with the packet
889 * throughout the TLB hierarchy
890 */
891 struct SenderState: public Packet::SenderState
892 {
893 // The wavefront associated with this request
894 Wavefront *wavefront;
895
896 SenderState(Wavefront *_wavefront) : wavefront(_wavefront) { }
897 };
898
899 protected:
900 ComputeUnit *computeUnit;
901 bool stalled;
902
903 virtual bool recvTimingResp(PacketPtr pkt);
904 virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
905 virtual void recvFunctional(PacketPtr pkt) { }
906 virtual void recvRangeChange() { }
907 virtual void recvReqRetry();
908 };
909
910 /**
911 * the port intended to communicate between the CU and its LDS
912 */
913 class LDSPort : public RequestPort
914 {
915 public:
916 LDSPort(const std::string &_name, ComputeUnit *_cu, PortID _id)
917 : RequestPort(_name, _cu, _id), computeUnit(_cu)
918 {
919 }
920
921 bool isStalled() const { return stalled; }
922 void stallPort() { stalled = true; }
923 void unstallPort() { stalled = false; }
924
925 /**
926 * here we queue all the requests that were
927 * not successfully sent.
928 */
929 std::queue<PacketPtr> retries;
930
931 /**
932 * SenderState is information carried along with the packet, esp. the
933 * GPUDynInstPtr
934 */
935 class SenderState: public Packet::SenderState
936 {
937 protected:
938 // The actual read/write/atomic request that goes with this command
939 GPUDynInstPtr _gpuDynInst = nullptr;
940
941 public:
942 SenderState(GPUDynInstPtr gpuDynInst):
943 _gpuDynInst(gpuDynInst)
944 {
945 }
946
947 GPUDynInstPtr
948 getMemInst() const
949 {
950 return _gpuDynInst;
951 }
952 };
953
954 virtual bool
955 sendTimingReq(PacketPtr pkt);
956
957 protected:
958
959 bool stalled = false; ///< whether or not it is stalled
960
961 ComputeUnit *computeUnit;
962
963 virtual bool
964 recvTimingResp(PacketPtr pkt);
965
966 virtual Tick
967 recvAtomic(PacketPtr pkt) { return 0; }
968
969 virtual void
970 recvFunctional(PacketPtr pkt)
971 {
972 }
973
974 virtual void
975 recvRangeChange()
976 {
977 }
978
979 virtual void
980 recvReqRetry();
981 };
982
983 /** The port to access the Local Data Store
984 * Can be connected to a LDS object
985 */
986 LDSPort *ldsPort = nullptr;
987
988 LDSPort *
989 getLdsPort() const
990 {
991 return ldsPort;
992 }
993
994 TokenManager *
995 getTokenManager()
996 {
997 return memPortTokens;
998 }
999
1000 /** The memory port for SIMD data accesses.
1001 * Can be connected to PhysMem or Ruby for timing simulations
1002 */
1003 std::vector<DataPort*> memPort;
1004 // port to the TLB hierarchy (i.e., the L1 TLB)
1005 std::vector<DTLBPort*> tlbPort;
1006 // port to the scalar data cache
1007 ScalarDataPort *scalarDataPort;
1008 // port to the scalar data TLB
1009 ScalarDTLBPort *scalarDTLBPort;
1010 // port to the SQC (i.e. the I-cache)
1011 SQCPort *sqcPort;
1012 // port to the SQC TLB (there's a separate TLB for each I-cache)
1013 ITLBPort *sqcTLBPort;
1014
1015 Port &
1016 getPort(const std::string &if_name, PortID idx) override
1017 {
1018 if (if_name == "memory_port") {
1019 memPort[idx] = new DataPort(csprintf("%s-port%d", name(), idx),
1020 this, idx);
1021 return *memPort[idx];
1022 } else if (if_name == "translation_port") {
1023 tlbPort[idx] = new DTLBPort(csprintf("%s-port%d", name(), idx),
1024 this, idx);
1025 return *tlbPort[idx];
1026 } else if (if_name == "scalar_port") {
1027 scalarDataPort = new ScalarDataPort(csprintf("%s-port%d", name(),
1028 idx), this, idx);
1029 return *scalarDataPort;
1030 } else if (if_name == "scalar_tlb_port") {
1031 scalarDTLBPort = new ScalarDTLBPort(csprintf("%s-port", name()),
1032 this);
1033 return *scalarDTLBPort;
1034 } else if (if_name == "sqc_port") {
1035 sqcPort = new SQCPort(csprintf("%s-port%d", name(), idx),
1036 this, idx);
1037 return *sqcPort;
1038 } else if (if_name == "sqc_tlb_port") {
1039 sqcTLBPort = new ITLBPort(csprintf("%s-port", name()), this);
1040 return *sqcTLBPort;
1041 } else if (if_name == "ldsPort") {
1042 if (ldsPort) {
1043 fatal("an LDS port was already allocated");
1044 }
1045 ldsPort = new LDSPort(csprintf("%s-port", name()), this, idx);
1046 return *ldsPort;
1047 } else if (if_name == "gmTokenPort") {
1048 return gmTokenPort;
1049 } else {
1050 panic("incorrect port name");
1051 }
1052 }
1053
1054 InstSeqNum getAndIncSeqNum() { return globalSeqNum++; }
1055
1056 private:
1057 const int _cacheLineSize;
1058 const int _numBarrierSlots;
1059 int cacheLineBits;
1060 InstSeqNum globalSeqNum;
1061 int wavefrontSize;
1062
1063 /**
1064 * TODO: Update these comments once the pipe stage interface has
1065 * been fully refactored.
1066 *
1067 * Pipeline stage interfaces.
1068 *
1069 * Buffers used to communicate between various pipeline stages:
1070 * a list of waves that will be dispatched to
1071 * each execution resource. An EXREADY state implies the
1072 * dispatch list is non-empty and the
1073 * execution unit has something to execute
1074 * this cycle. Currently, the dispatch list of
1075 * an execution resource can hold only one wave because
1076 * an execution resource can execute only one wave per cycle.
1077 * dispatchList is used to communicate between the schedule
1078 * and exec stages.
1079 *
1080 * At a high level, the following intra-/inter-stage communication occurs:
1081 * SCB to SCH: readyList provides per exec resource list of waves that
1082 * passed dependency and readiness checks. If selected by
1083 * scheduler, attempt to add wave to schList conditional on
1084 * RF support.
1085 * SCH: schList holds waves that are gathering operands or waiting
1086 * for execution resource availability. Once ready, waves are
1087 * placed on the dispatchList as candidates for execution. A wave
1088 * may spend multiple cycles in SCH stage, on the schList due to
1089 * RF access conflicts or execution resource contention.
1090 * SCH to EX: dispatchList holds waves that are ready to be executed.
1091 * LM/FLAT arbitration may remove an LM wave and place it
1092 * back on the schList. RF model may also force a wave back
1093 * to the schList if using the detailed model.
1094 */
1095 ScoreboardCheckToSchedule scoreboardCheckToSchedule;
1096 ScheduleToExecute scheduleToExecute;
1097
1098 /**
1099 * The barrier slots for this CU.
1100 */
1101 std::vector<WFBarrier> wfBarrierSlots;
1102 /**
1103 * A set used to easily retrieve a free barrier ID.
1104 */
1105 std::unordered_set<int> freeBarrierIds;
1106
1107 // hold the time of the arrival of the first cache block related to
1108 // a particular GPUDynInst. This is used to calculate the difference
1109 // between the first and last cache block arrival times.
1110 std::unordered_map<GPUDynInstPtr, Tick> headTailMap;
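// Illustrative sketch (not from this file; "all blocks arrived" stands in
// for a hypothetical completion check): record the first block's arrival
// tick, then sample the head-tail spread when the last block for the
// instruction arrives.
//
//     auto it = headTailMap.find(gpuDynInst);
//     if (it == headTailMap.end()) {
//         headTailMap[gpuDynInst] = curTick();   // first block
//     } else if (/* all blocks arrived */) {
//         headTailLatency.sample(curTick() - it->second);
//         headTailMap.erase(it);
//     }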
1111 };
1112
1113 #endif // __COMPUTE_UNIT_HH__