/*
 * Copyright (c) 2015-2017 Advanced Micro Devices, Inc.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
34 #ifndef __GPU_DYN_INST_HH__
35 #define __GPU_DYN_INST_HH__
40 #include "base/amo.hh"
41 #include "base/logging.hh"
42 #include "enums/MemType.hh"
43 #include "enums/StorageClassType.hh"
44 #include "gpu-compute/compute_unit.hh"
45 #include "gpu-compute/gpu_exec_context.hh"
50 class AtomicOpCAS : public TypedAtomicOpFunctor<T>
56 ComputeUnit *computeUnit;
58 AtomicOpCAS(T _c, T _s, ComputeUnit *compute_unit)
59 : c(_c), s(_s), computeUnit(compute_unit) { }
64 computeUnit->numCASOps++;
69 computeUnit->numFailedCASOps++;
72 if (computeUnit->xact_cas_mode) {
73 computeUnit->xactCasLoadMap.clear();
76 AtomicOpFunctor* clone () { return new AtomicOpCAS(c, s, computeUnit); }
/**
 * GPUDynInst: dynamic (per-execution) state of one GPU instruction.
 *
 * Pairs a decoded GPUStaticInst (_staticInst) with everything that varies
 * per dynamic instance: the memory addresses being accessed, the execution
 * mask, and the status of outstanding memory requests.
 *
 * NOTE(review): this view of the file is missing lines (access specifiers
 * and several member declarations whose comments survive below); all code
 * tokens are left exactly as found.
 */
class GPUDynInst : public GPUExecContext
    GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *static_inst,

    // Execute this dynamic instance of the instruction
    void execute(GPUDynInstPtr gpuDynInst);

    // Operand introspection
    int numSrcRegOperands();
    int numDstRegOperands();
    bool isVectorRegister(int operandIdx);
    bool isScalarRegister(int operandIdx);
    bool isCondRegister(int operandIdx);
    int getRegisterIndex(int operandIdx, GPUDynInstPtr gpuDynInst);
    int getOperandSize(int operandIdx);
    bool isDstOperand(int operandIdx);
    bool isSrcOperand(int operandIdx);

    // Human-readable disassembly of the instruction
    const std::string &disassemble() const;

    // Sequence number of this dynamic instruction
    uint64_t seqNum() const;

    // Storage class (segment) this instruction executed against
    Enums::StorageClassType executedAs();

    // The address of the memory operation
    std::vector<Addr> addr;

    // The data to get written
    // Additional data (for atomics)
    // Additional data (for atomics)

    // The execution mask
    VectorMask exec_mask;

    // The memory type (M_U32, M_S32, ...)
    Enums::MemType m_type;

    // The equivalency class
    // The return VGPR type (VT_32 or VT_64)
    // Number of VGPR's accessed (1, 2, or 4)
    // The return VGPR index
    // There can be max 4 dest regs
    // SIMD where the WF of the memory instruction has been mapped to
    // unique id of the WF where the memory instruction belongs to
    // The kernel id of the requesting wf
    // The CU id of the requesting wf
    // HW slot id where the WF is mapped to inside a SIMD unit
    // execution pipeline id where the memory instruction has been scheduled
    // The execution time of this operation
    // The latency of this operation
    // A list of bank conflicts for the 4 cycles.
    // The size of the READONLY segment

    // Initiate the specified memory operation, by creating a
    // memory request and sending it off to the memory system.
    void initiateAcc(GPUDynInstPtr gpuDynInst);
    // Complete the specified memory operation, by writing
    // value back to the RF in the case of a load or atomic
    // return or, in the case of a store, we do nothing
    void completeAcc(GPUDynInstPtr gpuDynInst);

    // The decoded static instruction this dynamic instance executes
    GPUStaticInst* staticInstruction() { return _staticInst; }

    // Instruction classification predicates
    bool isBranch() const;
    bool isReturn() const;
    bool isUnconditionalJump() const;
    bool isSpecialOp() const;
    bool isWaitcnt() const;

    bool isBarrier() const;
    bool isMemFence() const;
    bool isMemRef() const;
    bool isStore() const;

    bool isAtomic() const;
    bool isAtomicNoRet() const;
    bool isAtomicRet() const;

    bool isScalar() const;
    bool readsSCC() const;
    bool writesSCC() const;
    bool readsVCC() const;
    bool writesVCC() const;

    // Which specific atomic operation this instruction performs
    bool isAtomicAnd() const;
    bool isAtomicOr() const;
    bool isAtomicXor() const;
    bool isAtomicCAS() const;
    bool isAtomicExch() const;
    bool isAtomicAdd() const;
    bool isAtomicSub() const;
    bool isAtomicInc() const;
    bool isAtomicDec() const;
    bool isAtomicMax() const;
    bool isAtomicMin() const;

    bool isArgLoad() const;
    bool isGlobalMem() const;
    bool isLocalMem() const;

    // Memory segment of the access
    bool isArgSeg() const;
    bool isGlobalSeg() const;
    bool isGroupSeg() const;
    bool isKernArgSeg() const;
    bool isPrivateSeg() const;
    bool isReadOnlySeg() const;
    bool isSpillSeg() const;

    // Synchronization scope of the access
    bool isWorkitemScope() const;
    bool isWavefrontScope() const;
    bool isWorkgroupScope() const;
    bool isDeviceScope() const;
    bool isSystemScope() const;
    bool isNoScope() const;

    // Memory-ordering semantics of the access
    bool isRelaxedOrder() const;
    bool isAcquire() const;
    bool isRelease() const;
    bool isAcquireRelease() const;
    bool isNoOrder() const;

    bool isGloballyCoherent() const;
    bool isSystemCoherent() const;

    /**
     * Loads/stores/atomics may have acquire/release semantics associated
     * with them. Some protocols want to see the acquire/release as separate
     * requests from the load/store/atomic. We implement that separation
     * using continuations (i.e., a function pointer with an object associated
     * with it). When, for example, the front-end generates a store with
     * release semantics, we will first issue a normal store and set the
     * continuation in the GPUDynInst to a function that generates a
     * release request. That continuation will be called when the normal
     * store completes (in ComputeUnit::DataPort::recvTimingResponse). The
     * continuation will be called in the context of the same GPUDynInst
     * that generated the initial store.
     */
    std::function<void(GPUStaticInst*, GPUDynInstPtr)> execContinuation;

    // when true, call execContinuation when response arrives
    bool useContinuation;

    // Build the atomic functor matching this instruction's atomic kind.
    // reg1 is only consumed by CAS (as the swap value).
    template<typename c0> AtomicOpFunctorPtr
    makeAtomicOpFunctor(c0 *reg0, c0 *reg1)
            return m5::make_unique<AtomicOpAnd<c0>>(*reg0);
        } else if (isAtomicOr()) {
            return m5::make_unique<AtomicOpOr<c0>>(*reg0);
        } else if (isAtomicXor()) {
            return m5::make_unique<AtomicOpXor<c0>>(*reg0);
        } else if (isAtomicCAS()) {
            return m5::make_unique<AtomicOpCAS<c0>>(*reg0, *reg1, cu);
        } else if (isAtomicExch()) {
            return m5::make_unique<AtomicOpExch<c0>>(*reg0);
        } else if (isAtomicAdd()) {
            return m5::make_unique<AtomicOpAdd<c0>>(*reg0);
        } else if (isAtomicSub()) {
            return m5::make_unique<AtomicOpSub<c0>>(*reg0);
        } else if (isAtomicInc()) {
            return m5::make_unique<AtomicOpInc<c0>>();
        } else if (isAtomicDec()) {
            return m5::make_unique<AtomicOpDec<c0>>();
        } else if (isAtomicMax()) {
            return m5::make_unique<AtomicOpMax<c0>>(*reg0);
        } else if (isAtomicMin()) {
            return m5::make_unique<AtomicOpMin<c0>>(*reg0);
            fatal("Unrecognized atomic operation");

    // Stamp a memory request with segment, scope, ordering, and atomic
    // flags derived from this instruction's properties. Unknown segment,
    // scope, or order values are fatal.
    setRequestFlags(RequestPtr req, bool setMemOrder=true)
        // currently these are the easy scopes to deduce
        if (isPrivateSeg()) {
            req->setMemSpaceConfigFlags(Request::PRIVATE_SEGMENT);
        } else if (isSpillSeg()) {
            req->setMemSpaceConfigFlags(Request::SPILL_SEGMENT);
        } else if (isGlobalSeg()) {
            req->setMemSpaceConfigFlags(Request::GLOBAL_SEGMENT);
        } else if (isReadOnlySeg()) {
            req->setMemSpaceConfigFlags(Request::READONLY_SEGMENT);
        } else if (isGroupSeg()) {
            req->setMemSpaceConfigFlags(Request::GROUP_SEGMENT);
        } else if (isFlat()) {
            // Flat accesses cannot be classified here yet
            panic("TODO: translate to correct scope");
            fatal("%s has bad segment type\n", disassemble());

        if (isWavefrontScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::WAVEFRONT_SCOPE);
        } else if (isWorkgroupScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::WORKGROUP_SCOPE);
        } else if (isDeviceScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::DEVICE_SCOPE);
        } else if (isSystemScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::SYSTEM_SCOPE);
        } else if (!isNoScope() && !isWorkitemScope()) {
            fatal("%s has bad scope type\n", disassemble());

        // set acquire and release flags
            req->setFlags(Request::ACQUIRE);
        } else if (isRelease()) {
            req->setFlags(Request::RELEASE);
        } else if (isAcquireRelease()) {
            req->setFlags(Request::ACQUIRE | Request::RELEASE);
        } else if (!isNoOrder()) {
            fatal("%s has bad memory order\n", disassemble());

        // currently, the instruction generator only produces atomic return
        // but a magic instruction can produce atomic no return
            req->setFlags(Request::ATOMIC_RETURN_OP);
        } else if (isAtomicNoRet()) {
            req->setFlags(Request::ATOMIC_NO_RETURN_OP);

    // Map returned packets and the addresses they satisfy with which lane they
    // were requested from
    typedef std::unordered_map<Addr, std::vector<int>> StatusVector;
    StatusVector memStatusVector;

    // Track the status of memory requests per lane, a bit per lane
    VectorMask statusBitVector;
    // for ld_v# or st_v#
    std::vector<int> statusVector;
    // Per-lane entry; presumably the TLB level that serviced each lane's
    // translation — NOTE(review): inferred from the name, confirm.
    std::vector<int> tlbHitLevel;

    // The decoded static instruction backing this dynamic instance
    GPUStaticInst *_staticInst;
359 #endif // __GPU_DYN_INST_HH__