gpu-compute: enable flexible control of kernel boundary syncs
[gem5.git] / src / gpu-compute / shader.hh
/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Steve Reinhardt
 */

#ifndef __SHADER_HH__
#define __SHADER_HH__

#include <functional>
#include <string>

#include "arch/isa.hh"
#include "arch/isa_traits.hh"
#include "base/types.hh"
#include "cpu/simple/atomic.hh"
#include "cpu/simple/timing.hh"
#include "cpu/simple_thread.hh"
#include "cpu/thread_context.hh"
#include "cpu/thread_state.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/gpu_tlb.hh"
#include "gpu-compute/hsa_queue_entry.hh"
#include "gpu-compute/lds_state.hh"
#include "mem/page_table.hh"
#include "mem/port.hh"
#include "mem/request.hh"
#include "params/Shader.hh"
#include "sim/faults.hh"
#include "sim/process.hh"
#include "sim/sim_object.hh"

class BaseTLB;
class GPUCommandProcessor;
class GPUDispatcher;

namespace TheISA
{
    class GpuTLB;
}

static const int LDS_SIZE = 65536;

// Aperture (APE) registers define the base/limit pair for the
// ATC-mapped memory space. Currently the only APEs we consider
// are those for GPUVM/LDS/scratch. The APEs are assigned unique
// values on a per-device basis.
struct ApertureRegister
{
    Addr base;
    Addr limit;
};

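// Illustrative sketch only (not part of the gem5 API): an address belongs
// to an aperture when it falls within the inclusive [base, limit] range of
// that aperture's register, e.g.:
//
//     bool
//     inAperture(const ApertureRegister &ape, Addr addr)
//     {
//         return addr >= ape.base && addr <= ape.limit;
//     }
//
// The Shader::isGpuVmApe()/isLdsApe()/isScratchApe() helpers below perform
// exactly this check against the corresponding APE register.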
// Class Shader: This describes a single shader instance. Most
// configurations will only have a single shader.

class Shader : public ClockedObject
{
  private:
    ApertureRegister _gpuVmApe;
    ApertureRegister _ldsApe;
    ApertureRegister _scratchApe;
    Addr shHiddenPrivateBaseVmid;

    // Number of active CUs attached to this shader
    int _activeCus;

    // Last tick at which all CUs attached to this shader were inactive
    Tick _lastInactiveTick;

    // Some stats for measuring latency
    Stats::Distribution allLatencyDist;
    Stats::Distribution loadLatencyDist;
    Stats::Distribution storeLatencyDist;

    // Average ticks from vmem inst initiateAcc to coalescer issue,
    // average ticks from coalescer issue to coalescer hit callback,
    // average ticks from coalescer hit callback to GM pipe enqueue,
    // and average ticks spent in GM pipe's ordered resp buffer.
    Stats::Distribution initToCoalesceLatency;
    Stats::Distribution rubyNetworkLatency;
    Stats::Distribution gmEnqueueLatency;
    Stats::Distribution gmToCompleteLatency;

    // Average number of cache blocks requested by a vmem inst, and
    // average round-trip ticks to main memory for the Nth cache block
    // generated by a vmem inst.
    Stats::Distribution coalsrLineAddresses;
    Stats::Distribution *cacheBlockRoundTrip;

  public:
    typedef ShaderParams Params;
    enum hsail_mode_e {SIMT, VECTOR_SCALAR};

    GPUDispatcher &dispatcher();
    void sampleLoad(const Tick accessTime);
    void sampleStore(const Tick accessTime);
    void sampleInstRoundTrip(std::vector<Tick> roundTripTime);
    void sampleLineRoundTrip(const std::map<Addr,
                             std::vector<Tick>> &roundTripTime);

    SimpleThread *cpuThread;
    ThreadContext *gpuTc;
    BaseCPU *cpuPointer;

    const ApertureRegister&
    gpuVmApe() const
    {
        return _gpuVmApe;
    }

    const ApertureRegister&
    ldsApe() const
    {
        return _ldsApe;
    }

    const ApertureRegister&
    scratchApe() const
    {
        return _scratchApe;
    }

    bool
    isGpuVmApe(Addr addr) const
    {
        bool is_gpu_vm = addr >= _gpuVmApe.base && addr <= _gpuVmApe.limit;

        return is_gpu_vm;
    }

    bool
    isLdsApe(Addr addr) const
    {
        bool is_lds = addr >= _ldsApe.base && addr <= _ldsApe.limit;

        return is_lds;
    }

    bool
    isScratchApe(Addr addr) const
    {
        bool is_scratch
            = addr >= _scratchApe.base && addr <= _scratchApe.limit;

        return is_scratch;
    }

    Addr
    getScratchBase()
    {
        return _scratchApe.base;
    }

    Addr
    getHiddenPrivateBase()
    {
        return shHiddenPrivateBaseVmid;
    }

    void
    initShHiddenPrivateBase(Addr queueBase, uint32_t offset)
    {
        Addr sh_hidden_base_new = queueBase - offset;

        // We initialize sh_hidden_private_base_vmid from the
        // amd queue descriptor of the first queue.
        // The sh_hidden_private_base_vmid is supposed to be the same
        // for all queues belonging to the same process.
        if (shHiddenPrivateBaseVmid != sh_hidden_base_new) {
            // Do not panic if shHiddenPrivateBaseVmid == 0, that is,
            // if it is uninitialized. Panic only if the value was
            // already initialized and we get a different base later.
            panic_if(shHiddenPrivateBaseVmid != 0,
                     "Currently we support only single process\n");
        }
        shHiddenPrivateBaseVmid = sh_hidden_base_new;
    }

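    // Illustrative usage sketch only; the queue-descriptor field names used
    // here (scratchBase, scratchOffset) are hypothetical:
    //
    //     shader->initShHiddenPrivateBase(q0.scratchBase, q0.scratchOffset);
    //     // A later queue from the same process must yield the same base,
    //     // otherwise the panic_if above fires.
    //     shader->initShHiddenPrivateBase(q1.scratchBase, q1.scratchOffset);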
    EventFunctionWrapper tickEvent;

    // Is this simulation going to use timing-mode memory accesses?
    bool timingSim;
    hsail_mode_e hsail_mode;

    // If set, issue an acquire packet at kernel launch
    int impl_kern_launch_acq;
    // If set, issue a release packet at kernel end
    int impl_kern_end_rel;
    // If set, fetch returns may be coissued with instructions
    int coissue_return;
    // If set, always dump all 64 gprs to trace
    int trace_vgpr_all;
    // Number of CUs in the shader
    int n_cu;
    // Number of wavefront slots per SIMD per CU
    int n_wf;

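    // Illustrative sketch only (assumed, simplified): impl_kern_launch_acq
    // and impl_kern_end_rel above are what make the kernel boundary syncs
    // configurable. Conceptually, the dispatch and completion paths gate the
    // cache invalidate/flush on them, along the lines of:
    //
    //     if (impl_kern_launch_acq)
    //         prepareInvalidate(task);     // acquire: invalidate CU caches
    //     ...
    //     if (impl_kern_end_rel)
    //         prepareFlush(gpuDynInst);    // release: flush dirty data
    //
    // prepareInvalidate() and prepareFlush() are declared further below.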
    // The size of global memory
    int globalMemSize;

    // Tracks the CU that the round-robin dispatcher should attempt
    // to schedule next
    int nextSchedCu;

    // Size of the scheduled-add queue
    uint32_t sa_n;

    // Pointers to the values to be incremented
    std::vector<int*> sa_val;
    // When to do the increment
    std::vector<uint64_t> sa_when;
    // Amount to increment by
    std::vector<int32_t> sa_x;

    // List of Compute Units (CUs)
    std::vector<ComputeUnit*> cuList;

    GPUCommandProcessor &gpuCmdProc;
    GPUDispatcher &_dispatcher;

    /**
     * Statistics
     */
    Stats::Scalar shaderActiveTicks;
    Stats::Vector vectorInstSrcOperand;
    Stats::Vector vectorInstDstOperand;
    void regStats();

    int max_valu_insts;
    int total_valu_insts;

    Shader(const Params *p);
    ~Shader();
    virtual void init();

    // Run shader scheduled adds
    void execScheduledAdds();

    // Schedule a 32-bit value to be incremented some time in the future
    void ScheduleAdd(int *val, Tick when, int x);
    bool processTimingPacket(PacketPtr pkt);

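    // Illustrative sketch only (assumed, simplified) of the scheduled-add
    // mechanism declared above: ScheduleAdd() records a pending increment in
    // the sa_val/sa_when/sa_x vectors, and execScheduledAdds() later applies
    // every entry whose time has arrived, roughly:
    //
    //     for (uint32_t i = 0; i < sa_n; ++i) {
    //         if (sa_when[i] <= curTick())
    //             *sa_val[i] += sa_x[i];   // apply the deferred increment
    //     }
    //
    // The exact bookkeeping (entry removal, event scheduling) lives in
    // shader.cc.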
    void AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
                   MemCmd cmd, bool suppress_func_errors);

    void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id);

    void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id,
                 bool suppress_func_errors);

    void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id);

    void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id,
                  bool suppress_func_errors);

    void doFunctionalAccess(const RequestPtr &req, MemCmd cmd, void *data,
                            bool suppress_func_errors, int cu_id);

    void
    registerCU(int cu_id, ComputeUnit *compute_unit)
    {
        cuList[cu_id] = compute_unit;
    }

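    // Illustrative usage sketch only: each ComputeUnit is expected to
    // register itself with its parent shader under its own id, e.g.
    //
    //     shader->registerCU(cu_id, this);
    //
    // This assumes cuList has already been sized to hold n_cu entries, so
    // the cu_id index is valid; registerCU() itself does no bounds check.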
    void prepareInvalidate(HSAQueueEntry *task);
    void prepareFlush(GPUDynInstPtr gpuDynInst);

    bool dispatchWorkgroups(HSAQueueEntry *task);
    Addr mmap(int length);
    void functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode);
    void updateContext(int cid);
    void hostWakeUp(BaseCPU *cpu);
    void notifyCuSleep();
};


#endif // __SHADER_HH__