src/gpu-compute/gpu_dyn_inst.hh
/*
 * Copyright (c) 2015-2017 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef __GPU_DYN_INST_HH__
#define __GPU_DYN_INST_HH__

#include <cstdint>
#include <functional>
#include <string>
#include <unordered_map>
#include <vector>

#include "base/amo.hh"
#include "base/logging.hh"
#include "enums/MemType.hh"
#include "enums/StorageClassType.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_exec_context.hh"

class GPUStaticInst;

// Compare-and-swap functor: if the value in memory equals c, replace it
// with s; otherwise leave memory unchanged. Also updates the compute
// unit's CAS statistics.
template<typename T>
class AtomicOpCAS : public TypedAtomicOpFunctor<T>
{
  public:
    T c;
    T s;

    ComputeUnit *computeUnit;

    AtomicOpCAS(T _c, T _s, ComputeUnit *compute_unit)
      : c(_c), s(_s), computeUnit(compute_unit) { }

    void
    execute(T *b)
    {
        computeUnit->numCASOps++;

        if (*b == c) {
            *b = s;
        } else {
            computeUnit->numFailedCASOps++;
        }

        if (computeUnit->xact_cas_mode) {
            computeUnit->xactCasLoadMap.clear();
        }
    }
    AtomicOpFunctor* clone() { return new AtomicOpCAS(c, s, computeUnit); }
};
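
// Illustrative sketch (not gem5 code): how the CAS functor behaves when
// applied to a memory word. 'cu' stands for a ComputeUnit pointer and is
// only needed here for the statistics updates in execute().
//
//     uint32_t mem = 5;
//     AtomicOpCAS<uint32_t> cas(5, 9, cu);
//     cas.execute(&mem);   // compare hits: mem becomes 9
//     cas.execute(&mem);   // compare misses: mem stays 9, failure counted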

typedef enum
{
    VT_32,
    VT_64,
} vgpr_type;

class GPUDynInst : public GPUExecContext
{
  public:
    GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *static_inst,
               uint64_t instSeqNum);
    ~GPUDynInst();
    void execute(GPUDynInstPtr gpuDynInst);
    int numSrcRegOperands();
    int numDstRegOperands();
    int getNumOperands();
    bool isVectorRegister(int operandIdx);
    bool isScalarRegister(int operandIdx);
    bool isCondRegister(int operandIdx);
    int getRegisterIndex(int operandIdx, GPUDynInstPtr gpuDynInst);
    int getOperandSize(int operandIdx);
    bool isDstOperand(int operandIdx);
    bool isSrcOperand(int operandIdx);

    const std::string &disassemble() const;

    uint64_t seqNum() const;

    Enums::StorageClassType executedAs();

    // The address of the memory operation
    std::vector<Addr> addr;
    Addr pAddr;

    // The data to be written
    uint8_t *d_data;
    // Additional data (for atomics)
    uint8_t *a_data;
    // Additional data (for atomics)
    uint8_t *x_data;
    // The execution mask
    VectorMask exec_mask;

    // The memory type (M_U32, M_S32, ...)
    Enums::MemType m_type;

    // The equivalency class
    int equiv;
    // The return VGPR type (VT_32 or VT_64)
    vgpr_type v_type;
    // Number of VGPRs accessed (1, 2, or 4)
    int n_reg;
    // The return VGPR index
    int dst_reg;
    // There can be at most 4 dest regs
    int dst_reg_vec[4];
    // SIMD unit to which the WF of the memory instruction is mapped
    int simdId;
    // Unique id of the WF to which the memory instruction belongs
    int wfDynId;
    // The kernel id of the requesting WF
    int kern_id;
    // The CU id of the requesting WF
    int cu_id;
    // HW slot id to which the WF is mapped inside a SIMD unit
    int wfSlotId;
    // Execution pipeline id where the memory instruction has been scheduled
    int pipeId;
    // The execution time of this operation
    Tick time;
    // The latency of this operation
    WaitClass latency;
    // A list of bank conflicts for the 4 cycles.
    uint32_t bc[4];

    // A pointer to ROM
    uint8_t *rom;
    // The size of the READONLY segment
    int sz_rom;

    // Initiate the specified memory operation by creating a memory
    // request and sending it off to the memory system.
    void initiateAcc(GPUDynInstPtr gpuDynInst);
    // Complete the specified memory operation by writing the value back
    // to the RF in the case of a load or atomic return; in the case of
    // a store, do nothing.
    void completeAcc(GPUDynInstPtr gpuDynInst);

    void updateStats();

    GPUStaticInst* staticInstruction() { return _staticInst; }

    bool isALU() const;
    bool isBranch() const;
    bool isNop() const;
    bool isReturn() const;
    bool isUnconditionalJump() const;
    bool isSpecialOp() const;
    bool isWaitcnt() const;

    bool isBarrier() const;
    bool isMemFence() const;
    bool isMemRef() const;
    bool isFlat() const;
    bool isLoad() const;
    bool isStore() const;

    bool isAtomic() const;
    bool isAtomicNoRet() const;
    bool isAtomicRet() const;

    bool isScalar() const;
    bool readsSCC() const;
    bool writesSCC() const;
    bool readsVCC() const;
    bool writesVCC() const;

    bool isAtomicAnd() const;
    bool isAtomicOr() const;
    bool isAtomicXor() const;
    bool isAtomicCAS() const;
    bool isAtomicExch() const;
    bool isAtomicAdd() const;
    bool isAtomicSub() const;
    bool isAtomicInc() const;
    bool isAtomicDec() const;
    bool isAtomicMax() const;
    bool isAtomicMin() const;

    bool isArgLoad() const;
    bool isGlobalMem() const;
    bool isLocalMem() const;

    bool isArgSeg() const;
    bool isGlobalSeg() const;
    bool isGroupSeg() const;
    bool isKernArgSeg() const;
    bool isPrivateSeg() const;
    bool isReadOnlySeg() const;
    bool isSpillSeg() const;

    bool isWorkitemScope() const;
    bool isWavefrontScope() const;
    bool isWorkgroupScope() const;
    bool isDeviceScope() const;
    bool isSystemScope() const;
    bool isNoScope() const;

    bool isRelaxedOrder() const;
    bool isAcquire() const;
    bool isRelease() const;
    bool isAcquireRelease() const;
    bool isNoOrder() const;

    bool isGloballyCoherent() const;
    bool isSystemCoherent() const;

    /*
     * Loads/stores/atomics may have acquire/release semantics associated
     * with them. Some protocols want to see the acquire/release as separate
     * requests from the load/store/atomic. We implement that separation
     * using continuations (i.e., a function pointer with an object associated
     * with it). When, for example, the front-end generates a store with
     * release semantics, we will first issue a normal store and set the
     * continuation in the GPUDynInst to a function that generates a
     * release request. That continuation will be called when the normal
     * store completes (in ComputeUnit::DataPort::recvTimingResponse). The
     * continuation will be called in the context of the same GPUDynInst
     * that generated the initial store.
     */
    std::function<void(GPUStaticInst*, GPUDynInstPtr)> execContinuation;

    // when true, call execContinuation when response arrives
    bool useContinuation;
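
    // Illustrative sketch (not gem5 code): how a store with release
    // semantics might arm the continuation, and how a response handler
    // would fire it. The lambda body is a placeholder.
    //
    //     gpuDynInst->useContinuation = true;
    //     gpuDynInst->execContinuation =
    //         [](GPUStaticInst *si, GPUDynInstPtr inst) {
    //             // build and send the standalone release request here
    //         };
    //
    //     // later, when the store's response arrives:
    //     if (gpuDynInst->useContinuation) {
    //         gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
    //                                      gpuDynInst);
    //     }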

    template<typename c0> AtomicOpFunctorPtr
    makeAtomicOpFunctor(c0 *reg0, c0 *reg1)
    {
        if (isAtomicAnd()) {
            return m5::make_unique<AtomicOpAnd<c0>>(*reg0);
        } else if (isAtomicOr()) {
            return m5::make_unique<AtomicOpOr<c0>>(*reg0);
        } else if (isAtomicXor()) {
            return m5::make_unique<AtomicOpXor<c0>>(*reg0);
        } else if (isAtomicCAS()) {
            return m5::make_unique<AtomicOpCAS<c0>>(*reg0, *reg1, cu);
        } else if (isAtomicExch()) {
            return m5::make_unique<AtomicOpExch<c0>>(*reg0);
        } else if (isAtomicAdd()) {
            return m5::make_unique<AtomicOpAdd<c0>>(*reg0);
        } else if (isAtomicSub()) {
            return m5::make_unique<AtomicOpSub<c0>>(*reg0);
        } else if (isAtomicInc()) {
            return m5::make_unique<AtomicOpInc<c0>>();
        } else if (isAtomicDec()) {
            return m5::make_unique<AtomicOpDec<c0>>();
        } else if (isAtomicMax()) {
            return m5::make_unique<AtomicOpMax<c0>>(*reg0);
        } else if (isAtomicMin()) {
            return m5::make_unique<AtomicOpMin<c0>>(*reg0);
        } else {
            fatal("Unrecognized atomic operation");
        }
    }
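
    // Illustrative sketch (not gem5 code): building the functor for the
    // current instruction from two operand values. 'val' and 'cmp' are
    // hypothetical stand-ins for the instruction's atomic operand data.
    //
    //     uint32_t val = 1, cmp = 0;
    //     AtomicOpFunctorPtr amo =
    //         gpuDynInst->makeAtomicOpFunctor<uint32_t>(&val, &cmp);
    //     // 'amo' can then be attached to the memory request so the
    //     // memory system applies the operation atomically.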

    void
    setRequestFlags(RequestPtr req, bool setMemOrder=true)
    {
        // currently these are the easy segments to deduce
        if (isPrivateSeg()) {
            req->setMemSpaceConfigFlags(Request::PRIVATE_SEGMENT);
        } else if (isSpillSeg()) {
            req->setMemSpaceConfigFlags(Request::SPILL_SEGMENT);
        } else if (isGlobalSeg()) {
            req->setMemSpaceConfigFlags(Request::GLOBAL_SEGMENT);
        } else if (isReadOnlySeg()) {
            req->setMemSpaceConfigFlags(Request::READONLY_SEGMENT);
        } else if (isGroupSeg()) {
            req->setMemSpaceConfigFlags(Request::GROUP_SEGMENT);
        } else if (isFlat()) {
            panic("TODO: translate to correct scope");
        } else {
            fatal("%s has bad segment type\n", disassemble());
        }

        if (isWavefrontScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::WAVEFRONT_SCOPE);
        } else if (isWorkgroupScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::WORKGROUP_SCOPE);
        } else if (isDeviceScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::DEVICE_SCOPE);
        } else if (isSystemScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::SYSTEM_SCOPE);
        } else if (!isNoScope() && !isWorkitemScope()) {
            fatal("%s has bad scope type\n", disassemble());
        }

        if (setMemOrder) {
            // set acquire and release flags
            if (isAcquire()) {
                req->setFlags(Request::ACQUIRE);
            } else if (isRelease()) {
                req->setFlags(Request::RELEASE);
            } else if (isAcquireRelease()) {
                req->setFlags(Request::ACQUIRE | Request::RELEASE);
            } else if (!isNoOrder()) {
                fatal("%s has bad memory order\n", disassemble());
            }
        }

        // set atomic type
        // currently, the instruction generator only produces atomic return
        // but a magic instruction can produce atomic no return
        if (isAtomicRet()) {
            req->setFlags(Request::ATOMIC_RETURN_OP);
        } else if (isAtomicNoRet()) {
            req->setFlags(Request::ATOMIC_NO_RETURN_OP);
        }
    }
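
    // Illustrative sketch (not gem5 code): how an issuing memory pipeline
    // might use setRequestFlags on a request it has already created for
    // this instruction.
    //
    //     // 'req' is a RequestPtr built by the issuing memory pipeline
    //     gpuDynInst->setRequestFlags(req);          // segment, scope, order
    //     gpuDynInst->setRequestFlags(req, false);   // skip acquire/release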

    // Map returned packets and the addresses they satisfy to the lanes
    // that requested them
    typedef std::unordered_map<Addr, std::vector<int>> StatusVector;
    StatusVector memStatusVector;

    // Track the status of memory requests per lane, one bit per lane
    VectorMask statusBitVector;
    // for ld_v# or st_v#
    std::vector<int> statusVector;
    std::vector<int> tlbHitLevel;

  private:
    GPUStaticInst *_staticInst;
    uint64_t _seqNum;
};

#endif // __GPU_DYN_INST_HH__