/*
 * Copyright (c) 2015 Advanced Micro Devices, Inc.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Anthony Gutierrez
 */

#ifndef __GPU_DYN_INST_HH__
#define __GPU_DYN_INST_HH__

#include <cstdint>
#include <functional>
#include <string>
#include <unordered_map>
#include <vector>

#include "enums/MemType.hh"
#include "enums/StorageClassType.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_exec_context.hh"

template<typename T>
class AtomicOpAnd : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpAnd(T _a) : a(_a) { }
    void execute(T *b) { *b &= a; }
};

template<typename T>
class AtomicOpOr : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpOr(T _a) : a(_a) { }
    void execute(T *b) { *b |= a; }
};

template<typename T>
class AtomicOpXor : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpXor(T _a) : a(_a) {}
    void execute(T *b) { *b ^= a; }
};

template<typename T>
class AtomicOpCAS : public TypedAtomicOpFunctor<T>
{
  public:
    T c;
    T s;
    ComputeUnit *computeUnit;

    AtomicOpCAS(T _c, T _s, ComputeUnit *compute_unit)
      : c(_c), s(_s), computeUnit(compute_unit) { }

    void
    execute(T *b)
    {
        computeUnit->numCASOps++;

        // swap in the new value only if the compare value matches
        if (*b == c) {
            *b = s;
        } else {
            computeUnit->numFailedCASOps++;
        }

        if (computeUnit->xact_cas_mode) {
            computeUnit->xactCasLoadMap.clear();
        }
    }
};

template<typename T>
class AtomicOpExch : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpExch(T _a) : a(_a) { }
    void execute(T *b) { *b = a; }
};

template<typename T>
class AtomicOpAdd : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpAdd(T _a) : a(_a) { }
    void execute(T *b) { *b += a; }
};

template<typename T>
class AtomicOpSub : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpSub(T _a) : a(_a) { }
    void execute(T *b) { *b -= a; }
};

template<typename T>
class AtomicOpInc : public TypedAtomicOpFunctor<T>
{
  public:
    void execute(T *b) { *b += 1; }
};

template<typename T>
class AtomicOpDec : public TypedAtomicOpFunctor<T>
{
  public:
    void execute(T *b) { *b -= 1; }
};

template<typename T>
class AtomicOpMax : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpMax(T _a) : a(_a) { }
    void execute(T *b) { if (a > *b) *b = a; }
};

template<typename T>
class AtomicOpMin : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpMin(T _a) : a(_a) {}
    void execute(T *b) { if (a < *b) *b = a; }
};
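
// Illustrative use of the functor pattern above (hypothetical values): the
// memory system applies the functor to the word it holds, e.g.
//
//     uint32_t mem_val = 0xF0;
//     AtomicOpAnd<uint32_t> op(0x3C);
//     op.execute(&mem_val);   // mem_val is now 0x30
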
class GPUDynInst : public GPUExecContext
{
  public:
    GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *static_inst,
               uint64_t instSeqNum);

    void execute(GPUDynInstPtr gpuDynInst);
    int numSrcRegOperands();
    int numDstRegOperands();
    int getNumOperands();
    bool isVectorRegister(int operandIdx);
    bool isScalarRegister(int operandIdx);
    bool isCondRegister(int operandIdx);
    int getRegisterIndex(int operandIdx, GPUDynInstPtr gpuDynInst);
    int getOperandSize(int operandIdx);
    bool isDstOperand(int operandIdx);
    bool isSrcOperand(int operandIdx);

    const std::string &disassemble() const;

    uint64_t seqNum() const;

    Enums::StorageClassType executedAs();

    // The address of the memory operation
    std::vector<Addr> addr;
    // The data to get written
    // Additional data (for atomics)
    // Additional data (for atomics)
    // The execution mask
    VectorMask exec_mask;

    // The memory type (M_U32, M_S32, ...)
    Enums::MemType m_type;

    // The equivalency class
    // The return VGPR type (VT_32 or VT_64)
    // Number of VGPRs accessed (1, 2, or 4)
    // The return VGPR index
    // There can be at most 4 destination registers
    // SIMD unit to which the WF of the memory instruction has been mapped
    // unique id of the WF to which the memory instruction belongs
    // The kernel id of the requesting WF
    // The CU id of the requesting WF
    // HW slot id to which the WF is mapped inside a SIMD unit
    // execution pipeline id where the memory instruction has been scheduled
    // The execution time of this operation
    // The latency of this operation
    // A list of bank conflicts for the 4 cycles

    // The size of the READONLY segment

    // Initiate the specified memory operation by creating a memory request
    // and sending it off to the memory system.
    void initiateAcc(GPUDynInstPtr gpuDynInst);
    // Complete the specified memory operation by writing the value back to
    // the register file in the case of a load or atomic return; in the case
    // of a store, nothing needs to be done.
    void completeAcc(GPUDynInstPtr gpuDynInst);
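
    // Illustrative flow (sketch): after execute() has computed the per-lane
    // addresses, the issuing pipeline calls initiateAcc() to build and send
    // the request(s); once the memory system responds, completeAcc() is
    // called on the same dynamic instruction, e.g. (hypothetical call site):
    //
    //     gpuDynInst->initiateAcc(gpuDynInst);
    //     ...   // responses arrive from the memory system
    //     gpuDynInst->completeAcc(gpuDynInst);
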
    GPUStaticInst* staticInstruction() { return _staticInst; }

    bool isBranch() const;
    bool isReturn() const;
    bool isUnconditionalJump() const;
    bool isSpecialOp() const;
    bool isWaitcnt() const;

    bool isBarrier() const;
    bool isMemFence() const;
    bool isMemRef() const;
    bool isFlat() const;
    bool isStore() const;

    bool isAtomic() const;
    bool isAtomicNoRet() const;
    bool isAtomicRet() const;

    bool isScalar() const;
    bool readsSCC() const;
    bool writesSCC() const;
    bool readsVCC() const;
    bool writesVCC() const;

    bool isAtomicAnd() const;
    bool isAtomicOr() const;
    bool isAtomicXor() const;
    bool isAtomicCAS() const;
    bool isAtomicExch() const;
    bool isAtomicAdd() const;
    bool isAtomicSub() const;
    bool isAtomicInc() const;
    bool isAtomicDec() const;
    bool isAtomicMax() const;
    bool isAtomicMin() const;

    bool isArgLoad() const;
    bool isGlobalMem() const;
    bool isLocalMem() const;

    bool isArgSeg() const;
    bool isGlobalSeg() const;
    bool isGroupSeg() const;
    bool isKernArgSeg() const;
    bool isPrivateSeg() const;
    bool isReadOnlySeg() const;
    bool isSpillSeg() const;

    bool isWorkitemScope() const;
    bool isWavefrontScope() const;
    bool isWorkgroupScope() const;
    bool isDeviceScope() const;
    bool isSystemScope() const;
    bool isNoScope() const;

    bool isRelaxedOrder() const;
    bool isAcquire() const;
    bool isRelease() const;
    bool isAcquireRelease() const;
    bool isNoOrder() const;

    bool isGloballyCoherent() const;
    bool isSystemCoherent() const;

    /*
     * Loads/stores/atomics may have acquire/release semantics associated
     * with them. Some protocols want to see the acquire/release as separate
     * requests from the load/store/atomic. We implement that separation
     * using continuations (i.e., a function pointer with an object associated
     * with it). When, for example, the front-end generates a store with
     * release semantics, we first issue a normal store and set the
     * continuation in the GPUDynInst to a function that generates a
     * release request. That continuation will be called when the normal
     * store completes (in ComputeUnit::DataPort::recvTimingResponse). The
     * continuation will be called in the context of the same GPUDynInst
     * that generated the initial store.
     */
    std::function<void(GPUStaticInst*, GPUDynInstPtr)> execContinuation;

    // when true, call execContinuation when a response arrives
    bool useContinuation;
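
    // Illustrative sketch of arming the continuation for a store-release;
    // the lambda body and the issueRelease() helper are hypothetical:
    //
    //     gpuDynInst->useContinuation = true;
    //     gpuDynInst->execContinuation =
    //         [](GPUStaticInst *si, GPUDynInstPtr gdi) {
    //             issueRelease(si, gdi); // send the separate release request
    //         };
    //
    // ComputeUnit::DataPort::recvTimingResponse would then invoke
    // execContinuation(staticInstruction(), gpuDynInst) once the plain store
    // completes.
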
    template<typename c0> AtomicOpFunctor*
    makeAtomicOpFunctor(c0 *reg0, c0 *reg1)
    {
        if (isAtomicAnd()) {
            return new AtomicOpAnd<c0>(*reg0);
        } else if (isAtomicOr()) {
            return new AtomicOpOr<c0>(*reg0);
        } else if (isAtomicXor()) {
            return new AtomicOpXor<c0>(*reg0);
        } else if (isAtomicCAS()) {
            return new AtomicOpCAS<c0>(*reg0, *reg1, cu);
        } else if (isAtomicExch()) {
            return new AtomicOpExch<c0>(*reg0);
        } else if (isAtomicAdd()) {
            return new AtomicOpAdd<c0>(*reg0);
        } else if (isAtomicSub()) {
            return new AtomicOpSub<c0>(*reg0);
        } else if (isAtomicInc()) {
            return new AtomicOpInc<c0>();
        } else if (isAtomicDec()) {
            return new AtomicOpDec<c0>();
        } else if (isAtomicMax()) {
            return new AtomicOpMax<c0>(*reg0);
        } else if (isAtomicMin()) {
            return new AtomicOpMin<c0>(*reg0);
        } else {
            fatal("Unrecognized atomic operation");
        }
    }
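
    // Illustrative sketch: for a 32-bit atomic the caller hands the operands
    // read from the register file to makeAtomicOpFunctor and attaches the
    // resulting functor to the memory request (variable names hypothetical):
    //
    //     uint32_t src0 = ...;   // operand / compare value
    //     uint32_t src1 = ...;   // swap value, only used by CAS
    //     AtomicOpFunctor *amo =
    //         gpuDynInst->makeAtomicOpFunctor<uint32_t>(&src0, &src1);
    //
    // For CAS, reg0 supplies the compare value and reg1 the swap value; all
    // other operations only read reg0.
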
    void
    setRequestFlags(Request *req, bool setMemOrder=true)
    {
        // currently these are the easy scopes to deduce
        if (isPrivateSeg()) {
            req->setMemSpaceConfigFlags(Request::PRIVATE_SEGMENT);
        } else if (isSpillSeg()) {
            req->setMemSpaceConfigFlags(Request::SPILL_SEGMENT);
        } else if (isGlobalSeg()) {
            req->setMemSpaceConfigFlags(Request::GLOBAL_SEGMENT);
        } else if (isReadOnlySeg()) {
            req->setMemSpaceConfigFlags(Request::READONLY_SEGMENT);
        } else if (isGroupSeg()) {
            req->setMemSpaceConfigFlags(Request::GROUP_SEGMENT);
        } else if (isFlat()) {
            // TODO: translate to correct scope
        } else {
            fatal("%s has bad segment type\n", disassemble());
        }

        if (isWavefrontScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::WAVEFRONT_SCOPE);
        } else if (isWorkgroupScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::WORKGROUP_SCOPE);
        } else if (isDeviceScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::DEVICE_SCOPE);
        } else if (isSystemScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::SYSTEM_SCOPE);
        } else if (!isNoScope() && !isWorkitemScope()) {
            fatal("%s has bad scope type\n", disassemble());
        }

        if (setMemOrder) {
            // set acquire and release flags
            if (isAcquire()) {
                req->setFlags(Request::ACQUIRE);
            } else if (isRelease()) {
                req->setFlags(Request::RELEASE);
            } else if (isAcquireRelease()) {
                req->setFlags(Request::ACQUIRE | Request::RELEASE);
            } else if (!isNoOrder()) {
                fatal("%s has bad memory order\n", disassemble());
            }
        }

        // set the atomic type
        // currently, the instruction generator only produces atomic return
        // but a magic instruction can produce atomic no return
        if (isAtomicRet()) {
            req->setFlags(Request::ATOMIC_RETURN_OP);
        } else if (isAtomicNoRet()) {
            req->setFlags(Request::ATOMIC_NO_RETURN_OP);
        }
    }
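
    // Illustrative call (sketch): a memory pipeline stage building the
    // request for this instruction would do something like
    //
    //     gpuDynInst->setRequestFlags(req);
    //
    // Passing setMemOrder=false is intended to skip the acquire/release
    // flags, e.g. when the ordering is handled by a separate request (see
    // execContinuation above).
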
    // Map returned packets and the addresses they satisfy to the lanes that
    // requested them
    typedef std::unordered_map<Addr, std::vector<int>> StatusVector;
    StatusVector memStatusVector;

    // Track the status of memory requests per lane, one bit per lane
    VectorMask statusBitVector;
    // for ld_v# or st_v#
    std::vector<int> statusVector;
    std::vector<int> tlbHitLevel;
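
    // Illustrative sketch (hypothetical values): a coalesced access to
    // address A that serves lanes 3, 4, and 7 would be recorded as
    //
    //     memStatusVector[A] = {3, 4, 7};
    //
    // so that the returned packet for A can be matched back to those lanes.
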
  private:
    GPUStaticInst *_staticInst;
};

#endif // __GPU_DYN_INST_HH__