/*
 * Copyright (c) 2015-2017 Advanced Micro Devices, Inc.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Anthony Gutierrez
 */
#ifndef __GPU_DYN_INST_HH__
#define __GPU_DYN_INST_HH__

#include <cstdint>
#include <functional>
#include <string>
#include <unordered_map>
#include <vector>

#include "enums/MemType.hh"
#include "enums/StorageClassType.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_exec_context.hh"

class GPUStaticInst;
template<typename T>
class AtomicOpAnd : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpAnd(T _a) : a(_a) { }
    void execute(T *b) { *b &= a; }
    AtomicOpFunctor* clone () { return new AtomicOpAnd(a); }
};

template<typename T>
class AtomicOpOr : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpOr(T _a) : a(_a) { }
    void execute(T *b) { *b |= a; }
    AtomicOpFunctor* clone () { return new AtomicOpOr(a); }
};

template<typename T>
class AtomicOpXor : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpXor(T _a) : a(_a) {}
    void execute(T *b) { *b ^= a; }
    AtomicOpFunctor* clone () { return new AtomicOpXor(a); }
};

template<typename T>
class AtomicOpCAS : public TypedAtomicOpFunctor<T>
{
  public:
    T c;
    T s;
    ComputeUnit *computeUnit;

    AtomicOpCAS(T _c, T _s, ComputeUnit *compute_unit)
      : c(_c), s(_s), computeUnit(compute_unit) { }

    void
    execute(T *b)
    {
        computeUnit->numCASOps++;

        if (*b == c) {
            *b = s;
        } else {
            computeUnit->numFailedCASOps++;
        }

        if (computeUnit->xact_cas_mode) {
            computeUnit->xactCasLoadMap.clear();
        }
    }
    AtomicOpFunctor* clone () { return new AtomicOpCAS(c, s, computeUnit); }
};

template<typename T>
class AtomicOpExch : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpExch(T _a) : a(_a) { }
    void execute(T *b) { *b = a; }
    AtomicOpFunctor* clone () { return new AtomicOpExch(a); }
};

template<typename T>
class AtomicOpAdd : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpAdd(T _a) : a(_a) { }
    void execute(T *b) { *b += a; }
    AtomicOpFunctor* clone () { return new AtomicOpAdd(a); }
};

template<typename T>
class AtomicOpSub : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpSub(T _a) : a(_a) { }
    void execute(T *b) { *b -= a; }
    AtomicOpFunctor* clone () { return new AtomicOpSub(a); }
};

template<typename T>
class AtomicOpInc : public TypedAtomicOpFunctor<T>
{
  public:
    AtomicOpInc() { }
    void execute(T *b) { *b += 1; }
    AtomicOpFunctor* clone () { return new AtomicOpInc(); }
};

template<typename T>
class AtomicOpDec : public TypedAtomicOpFunctor<T>
{
  public:
    AtomicOpDec() { }
    void execute(T *b) { *b -= 1; }
    AtomicOpFunctor* clone () { return new AtomicOpDec(); }
};

template<typename T>
class AtomicOpMax : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpMax(T _a) : a(_a) { }

    void
    execute(T *b)
    {
        if (a > *b)
            *b = a;
    }
    AtomicOpFunctor* clone () { return new AtomicOpMax(a); }
};

template<typename T>
class AtomicOpMin : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpMin(T _a) : a(_a) {}

    void
    execute(T *b)
    {
        if (a < *b)
            *b = a;
    }
    AtomicOpFunctor* clone () { return new AtomicOpMin(a); }
};
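
// Usage sketch (illustration only, not part of the original interface): an
// atomic functor captures its operand at construction time and is later
// applied in place to the target memory value; clone() lets the memory
// system keep its own copy of the functor.
//
//     AtomicOpAdd<uint32_t> add(5);
//     uint32_t mem = 10;
//     add.execute(&mem);                    // mem is now 15
//     AtomicOpFunctor *copy = add.clone();  // independent copy of the op
//     delete copy;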

class GPUDynInst : public GPUExecContext
{
  public:
    GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *static_inst,
               uint64_t instSeqNum);
    void execute(GPUDynInstPtr gpuDynInst);
    int numSrcRegOperands();
    int numDstRegOperands();
    int getNumOperands();
    bool isVectorRegister(int operandIdx);
    bool isScalarRegister(int operandIdx);
    bool isCondRegister(int operandIdx);
    int getRegisterIndex(int operandIdx, GPUDynInstPtr gpuDynInst);
    int getOperandSize(int operandIdx);
    bool isDstOperand(int operandIdx);
    bool isSrcOperand(int operandIdx);

    const std::string &disassemble() const;

    uint64_t seqNum() const;

    Enums::StorageClassType executedAs();

    // The address of the memory operation
    std::vector<Addr> addr;

    // The data to get written
    // Additional data (for atomics)
    // Additional data (for atomics)
    // The execution mask
    VectorMask exec_mask;

    // The memory type (M_U32, M_S32, ...)
    Enums::MemType m_type;

    // The equivalency class
    // The return VGPR type (VT_32 or VT_64)
    // Number of VGPRs accessed (1, 2, or 4)
    // The return VGPR index
    // There can be at most 4 dest regs
    // SIMD unit to which the WF of the memory instruction has been mapped
    // unique id of the WF to which the memory instruction belongs
    // The kernel id of the requesting WF
    // The CU id of the requesting WF
    // HW slot id to which the WF is mapped inside a SIMD unit
    // execution pipeline id where the memory instruction has been scheduled
    // The execution time of this operation
    // The latency of this operation
    // A list of bank conflicts for the 4 cycles.

    // The size of the READONLY segment

    // Initiate the specified memory operation by creating a
    // memory request and sending it off to the memory system.
    void initiateAcc(GPUDynInstPtr gpuDynInst);
    // Complete the specified memory operation by writing the value back
    // to the register file in the case of a load or atomic return; in the
    // case of a store, nothing needs to be written back.
    void completeAcc(GPUDynInstPtr gpuDynInst);
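
    // Illustration only (not the actual call sites): the compute unit's
    // memory pipelines drive these two phases roughly as
    //
    //     inst->initiateAcc(inst);   // build and send the lane requests
    //     // ... responses return from the memory system ...
    //     inst->completeAcc(inst);   // write load/atomic results to the RF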

    GPUStaticInst* staticInstruction() { return _staticInst; }

    bool isBranch() const;
    bool isReturn() const;
    bool isUnconditionalJump() const;
    bool isSpecialOp() const;
    bool isWaitcnt() const;

    bool isBarrier() const;
    bool isMemFence() const;
    bool isMemRef() const;
    bool isFlat() const;
    bool isLoad() const;
    bool isStore() const;

    bool isAtomic() const;
    bool isAtomicNoRet() const;
    bool isAtomicRet() const;

    bool isScalar() const;
    bool readsSCC() const;
    bool writesSCC() const;
    bool readsVCC() const;
    bool writesVCC() const;

    bool isAtomicAnd() const;
    bool isAtomicOr() const;
    bool isAtomicXor() const;
    bool isAtomicCAS() const;
    bool isAtomicExch() const;
    bool isAtomicAdd() const;
    bool isAtomicSub() const;
    bool isAtomicInc() const;
    bool isAtomicDec() const;
    bool isAtomicMax() const;
    bool isAtomicMin() const;

    bool isArgLoad() const;
    bool isGlobalMem() const;
    bool isLocalMem() const;

    bool isArgSeg() const;
    bool isGlobalSeg() const;
    bool isGroupSeg() const;
    bool isKernArgSeg() const;
    bool isPrivateSeg() const;
    bool isReadOnlySeg() const;
    bool isSpillSeg() const;

    bool isWorkitemScope() const;
    bool isWavefrontScope() const;
    bool isWorkgroupScope() const;
    bool isDeviceScope() const;
    bool isSystemScope() const;
    bool isNoScope() const;

    bool isRelaxedOrder() const;
    bool isAcquire() const;
    bool isRelease() const;
    bool isAcquireRelease() const;
    bool isNoOrder() const;

    bool isGloballyCoherent() const;
    bool isSystemCoherent() const;

    /*
     * Loads/stores/atomics may have acquire/release semantics associated
     * with them. Some protocols want to see the acquire/release as separate
     * requests from the load/store/atomic. We implement that separation
     * using continuations (i.e., a function pointer with an object associated
     * with it). When, for example, the front-end generates a store with
     * release semantics, we will first issue a normal store and set the
     * continuation in the GPUDynInst to a function that generates a
     * release request. That continuation will be called when the normal
     * store completes (in ComputeUnit::DataPort::recvTimingResponse). The
     * continuation will be called in the context of the same GPUDynInst
     * that generated the initial store.
     */
    std::function<void(GPUStaticInst*, GPUDynInstPtr)> execContinuation;

    // when true, call execContinuation when the response arrives
    bool useContinuation;
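
    // A minimal usage sketch (illustration only; not part of the original
    // interface). The issuing code sends the plain store first and defers
    // the release to the continuation, e.g.:
    //
    //     gpuDynInst->useContinuation = true;
    //     gpuDynInst->execContinuation =
    //         [](GPUStaticInst *staticInst, GPUDynInstPtr dynInst) {
    //             // second phase: build and send the release request here
    //         };
    //
    // and the response path then does, conceptually:
    //
    //     if (dynInst->useContinuation)
    //         dynInst->execContinuation(dynInst->staticInstruction(), dynInst);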

    template<typename c0> AtomicOpFunctor*
    makeAtomicOpFunctor(c0 *reg0, c0 *reg1)
    {
        if (isAtomicAnd()) {
            return new AtomicOpAnd<c0>(*reg0);
        } else if (isAtomicOr()) {
            return new AtomicOpOr<c0>(*reg0);
        } else if (isAtomicXor()) {
            return new AtomicOpXor<c0>(*reg0);
        } else if (isAtomicCAS()) {
            return new AtomicOpCAS<c0>(*reg0, *reg1, cu);
        } else if (isAtomicExch()) {
            return new AtomicOpExch<c0>(*reg0);
        } else if (isAtomicAdd()) {
            return new AtomicOpAdd<c0>(*reg0);
        } else if (isAtomicSub()) {
            return new AtomicOpSub<c0>(*reg0);
        } else if (isAtomicInc()) {
            return new AtomicOpInc<c0>();
        } else if (isAtomicDec()) {
            return new AtomicOpDec<c0>();
        } else if (isAtomicMax()) {
            return new AtomicOpMax<c0>(*reg0);
        } else if (isAtomicMin()) {
            return new AtomicOpMin<c0>(*reg0);
        } else {
            fatal("Unrecognized atomic operation");
        }
    }

    void
    setRequestFlags(RequestPtr req, bool setMemOrder=true)
    {
        // currently these are the easy scopes to deduce
        if (isPrivateSeg()) {
            req->setMemSpaceConfigFlags(Request::PRIVATE_SEGMENT);
        } else if (isSpillSeg()) {
            req->setMemSpaceConfigFlags(Request::SPILL_SEGMENT);
        } else if (isGlobalSeg()) {
            req->setMemSpaceConfigFlags(Request::GLOBAL_SEGMENT);
        } else if (isReadOnlySeg()) {
            req->setMemSpaceConfigFlags(Request::READONLY_SEGMENT);
        } else if (isGroupSeg()) {
            req->setMemSpaceConfigFlags(Request::GROUP_SEGMENT);
        } else if (isFlat()) {
            // TODO: translate to correct scope
        } else {
            fatal("%s has bad segment type\n", disassemble());
        }

        if (isWavefrontScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::WAVEFRONT_SCOPE);
        } else if (isWorkgroupScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::WORKGROUP_SCOPE);
        } else if (isDeviceScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::DEVICE_SCOPE);
        } else if (isSystemScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::SYSTEM_SCOPE);
        } else if (!isNoScope() && !isWorkitemScope()) {
            fatal("%s has bad scope type\n", disassemble());
        }

        if (setMemOrder) {
            // set acquire and release flags
            if (isAcquire()) {
                req->setFlags(Request::ACQUIRE);
            } else if (isRelease()) {
                req->setFlags(Request::RELEASE);
            } else if (isAcquireRelease()) {
                req->setFlags(Request::ACQUIRE | Request::RELEASE);
            } else if (!isNoOrder()) {
                fatal("%s has bad memory order\n", disassemble());
            }
        }

        // currently, the instruction generator only produces atomic-return
        // operations, but a magic instruction can produce atomic no-return
        if (isAtomicRet()) {
            req->setFlags(Request::ATOMIC_RETURN_OP);
        } else if (isAtomicNoRet()) {
            req->setFlags(Request::ATOMIC_NO_RETURN_OP);
        }
    }

    // Map the addresses satisfied by returned packets to the lanes
    // that requested them
    typedef std::unordered_map<Addr, std::vector<int>> StatusVector;
    StatusVector memStatusVector;

    // Track the status of memory requests per lane, one bit per lane
    VectorMask statusBitVector;
    // for ld_v# or st_v#
    std::vector<int> statusVector;
    // per-lane TLB hit level
    std::vector<int> tlbHitLevel;

  private:
    GPUStaticInst *_staticInst;
    uint64_t _seqNum;
};

#endif // __GPU_DYN_INST_HH__