gpu-compute: enable flexible control of kernel boundary syncs
[gem5.git] / src / gpu-compute / shader.hh
/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Steve Reinhardt
 */

#ifndef __SHADER_HH__
#define __SHADER_HH__

#include <functional>
#include <string>

#include "arch/isa.hh"
#include "arch/isa_traits.hh"
#include "base/types.hh"
#include "cpu/simple/atomic.hh"
#include "cpu/simple/timing.hh"
#include "cpu/simple_thread.hh"
#include "cpu/thread_context.hh"
#include "cpu/thread_state.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/gpu_tlb.hh"
#include "gpu-compute/hsa_queue_entry.hh"
#include "gpu-compute/lds_state.hh"
#include "mem/page_table.hh"
#include "mem/port.hh"
#include "mem/request.hh"
#include "params/Shader.hh"
#include "sim/faults.hh"
#include "sim/process.hh"
#include "sim/sim_object.hh"

class BaseTLB;
class GPUCommandProcessor;
class GPUDispatcher;

namespace TheISA
{
    class GpuTLB;
}

static const int LDS_SIZE = 65536;

// Aperture (APE) registers define the base/limit pair for the
// ATC-mapped memory space. Currently the only APEs we consider
// are those for GPUVM/LDS/scratch. The APEs are assigned unique
// values on a per-device basis.
struct ApertureRegister
{
    Addr base;
    Addr limit;
};

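// Illustrative sketch only (not part of the gem5 API): an address belongs
// to an aperture when it falls within the inclusive [base, limit] range of
// that aperture's register, e.g.:
//
//     bool
//     inAperture(const ApertureRegister &ape, Addr addr)
//     {
//         return addr >= ape.base && addr <= ape.limit;
//     }
//
// The Shader::isGpuVmApe()/isLdsApe()/isScratchApe() helpers below perform
// exactly this check against the corresponding APE register.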
// Class Shader: This describes a single shader instance. Most
// configurations will only have a single shader.

class Shader : public ClockedObject
{
  private:
    ApertureRegister _gpuVmApe;
    ApertureRegister _ldsApe;
    ApertureRegister _scratchApe;
    Addr shHiddenPrivateBaseVmid;

    // Number of active CUs attached to this shader
    int _activeCus;

    // Last tick at which all CUs attached to this shader were inactive
    Tick _lastInactiveTick;

    // Some stats for measuring latency
    Stats::Distribution allLatencyDist;
    Stats::Distribution loadLatencyDist;
    Stats::Distribution storeLatencyDist;

    // Average ticks from vmem inst initiateAcc to coalescer issue,
    // average ticks from coalescer issue to coalescer hit callback,
    // average ticks from coalescer hit callback to GM pipe enqueue,
    // and average ticks spent in GM pipe's ordered resp buffer.
    Stats::Distribution initToCoalesceLatency;
    Stats::Distribution rubyNetworkLatency;
    Stats::Distribution gmEnqueueLatency;
    Stats::Distribution gmToCompleteLatency;

    // Average number of cache blocks requested by a vmem inst, and
    // average round-trip ticks to main memory for the Nth cache block
    // generated by a vmem inst.
    Stats::Distribution coalsrLineAddresses;
    Stats::Distribution *cacheBlockRoundTrip;

  public:
    typedef ShaderParams Params;
    enum hsail_mode_e {SIMT, VECTOR_SCALAR};

    GPUDispatcher &dispatcher();
    void sampleLoad(const Tick accessTime);
    void sampleStore(const Tick accessTime);
    void sampleInstRoundTrip(std::vector<Tick> roundTripTime);
    void sampleLineRoundTrip(const std::map<Addr,
                             std::vector<Tick>> &roundTripTime);

    SimpleThread *cpuThread;
    ThreadContext *gpuTc;
    BaseCPU *cpuPointer;

    const ApertureRegister&
    gpuVmApe() const
    {
        return _gpuVmApe;
    }

    const ApertureRegister&
    ldsApe() const
    {
        return _ldsApe;
    }

    const ApertureRegister&
    scratchApe() const
    {
        return _scratchApe;
    }

    bool
    isGpuVmApe(Addr addr) const
    {
        bool is_gpu_vm = addr >= _gpuVmApe.base && addr <= _gpuVmApe.limit;

        return is_gpu_vm;
    }

    bool
    isLdsApe(Addr addr) const
    {
        bool is_lds = addr >= _ldsApe.base && addr <= _ldsApe.limit;

        return is_lds;
    }

    bool
    isScratchApe(Addr addr) const
    {
        bool is_scratch
            = addr >= _scratchApe.base && addr <= _scratchApe.limit;

        return is_scratch;
    }

    Addr
    getScratchBase()
    {
        return _scratchApe.base;
    }

    Addr
    getHiddenPrivateBase()
    {
        return shHiddenPrivateBaseVmid;
    }

    void
    initShHiddenPrivateBase(Addr queueBase, uint32_t offset)
    {
        Addr sh_hidden_base_new = queueBase - offset;

        // We initialize sh_hidden_private_base_vmid from the
        // amd queue descriptor of the first queue.
        // The sh_hidden_private_base_vmid is supposed to be the same
        // for all queues belonging to the same process.
        if (shHiddenPrivateBaseVmid != sh_hidden_base_new) {
            // Do not panic if shHiddenPrivateBaseVmid == 0, that is,
            // if it is uninitialized. Panic only if the value was
            // already initialized and we get a different base later.
            panic_if(shHiddenPrivateBaseVmid != 0,
                     "Currently we support only single process\n");
        }
        shHiddenPrivateBaseVmid = sh_hidden_base_new;
    }

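    // Illustrative usage sketch only; the queue-descriptor field names used
    // here (scratchBase, scratchOffset) are hypothetical:
    //
    //     shader->initShHiddenPrivateBase(q0.scratchBase, q0.scratchOffset);
    //     // A later queue from the same process must yield the same base,
    //     // otherwise the panic_if above fires.
    //     shader->initShHiddenPrivateBase(q1.scratchBase, q1.scratchOffset);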
    EventFunctionWrapper tickEvent;

    // Is this simulation going to use timing-mode memory accesses?
    bool timingSim;
    hsail_mode_e hsail_mode;

    // If set, issue an acquire packet at kernel launch
    int impl_kern_launch_acq;
    // If set, issue a release packet at kernel end
    int impl_kern_end_rel;
    // If set, fetch returns may be coissued with instructions
    int coissue_return;
    // If set, always dump all 64 gprs to trace
    int trace_vgpr_all;
    // Number of CUs in the shader
    int n_cu;
    // Number of wavefront slots per SIMD per CU
    int n_wf;

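    // Illustrative sketch only (assumed, simplified): impl_kern_launch_acq
    // and impl_kern_end_rel above are what make the kernel boundary syncs
    // configurable. Conceptually, the dispatch and completion paths gate the
    // cache invalidate/flush on them, along the lines of:
    //
    //     if (impl_kern_launch_acq)
    //         prepareInvalidate(task);     // acquire: invalidate CU caches
    //     ...
    //     if (impl_kern_end_rel)
    //         prepareFlush(gpuDynInst);    // release: flush dirty data
    //
    // prepareInvalidate() and prepareFlush() are declared further below.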
    // The size of global memory
    int globalMemSize;

    // Tracks the CU that the round-robin dispatcher should attempt
    // to schedule next
    int nextSchedCu;

    // Size of the scheduled-add queue
    uint32_t sa_n;

    // Pointers to the values to be incremented
    std::vector<int*> sa_val;
    // When to do the increment
    std::vector<uint64_t> sa_when;
    // Amount to increment by
    std::vector<int32_t> sa_x;

    // List of Compute Units (CUs)
    std::vector<ComputeUnit*> cuList;

    GPUCommandProcessor &gpuCmdProc;
    GPUDispatcher &_dispatcher;

    /**
     * Statistics
     */
    Stats::Scalar shaderActiveTicks;
    Stats::Vector vectorInstSrcOperand;
    Stats::Vector vectorInstDstOperand;
    void regStats();

    int max_valu_insts;
    int total_valu_insts;

    Shader(const Params *p);
    ~Shader();
    virtual void init();

    // Run shader scheduled adds
    void execScheduledAdds();

    // Schedule a 32-bit value to be incremented some time in the future
    void ScheduleAdd(int *val, Tick when, int x);
    bool processTimingPacket(PacketPtr pkt);

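    // Illustrative sketch only (assumed, simplified) of the scheduled-add
    // mechanism declared above: ScheduleAdd() records a pending increment in
    // the sa_val/sa_when/sa_x vectors, and execScheduledAdds() later applies
    // every entry whose time has arrived, roughly:
    //
    //     for (uint32_t i = 0; i < sa_n; ++i) {
    //         if (sa_when[i] <= curTick())
    //             *sa_val[i] += sa_x[i];   // apply the deferred increment
    //     }
    //
    // The exact bookkeeping (entry removal, event scheduling) lives in
    // shader.cc.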
    void AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
                   MemCmd cmd, bool suppress_func_errors);

    void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id);

    void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id,
                 bool suppress_func_errors);

    void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id);

    void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id,
                  bool suppress_func_errors);

    void doFunctionalAccess(const RequestPtr &req, MemCmd cmd, void *data,
                            bool suppress_func_errors, int cu_id);

    void
    registerCU(int cu_id, ComputeUnit *compute_unit)
    {
        cuList[cu_id] = compute_unit;
    }

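    // Illustrative usage sketch only: each ComputeUnit is expected to
    // register itself with its parent shader under its own id, e.g.
    //
    //     shader->registerCU(cu_id, this);
    //
    // This assumes cuList has already been sized to hold n_cu entries, so
    // the cu_id index is valid; registerCU() itself does no bounds check.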
    void prepareInvalidate(HSAQueueEntry *task);
    void prepareFlush(GPUDynInstPtr gpuDynInst);

    bool dispatchWorkgroups(HSAQueueEntry *task);
    Addr mmap(int length);
    void functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode);
    void updateContext(int cid);
    void hostWakeUp(BaseCPU *cpu);
    void notifyCuSleep();
};


#endif // __SHADER_HH__