gpu-compute, mem-ruby, configs: Add GCN3 ISA support to GPU model
src/arch/gcn3/insts/op_encodings.hh
/*
 * Copyright (c) 2016-2017 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Anthony Gutierrez
 */

#ifndef __ARCH_GCN3_INSTS_OP_ENCODINGS_HH__
#define __ARCH_GCN3_INSTS_OP_ENCODINGS_HH__

#include "arch/gcn3/gpu_decoder.hh"
#include "arch/gcn3/insts/gpu_static_inst.hh"
#include "arch/gcn3/operand.hh"
#include "debug/GPUExec.hh"
#include "mem/ruby/system/RubySystem.hh"

namespace Gcn3ISA
{
    // --- purely virtual instruction classes ---

    class Inst_SOP2 : public GCN3GPUStaticInst
    {
      public:
        Inst_SOP2(InFmt_SOP2*, const std::string &opcode);

        int instSize() const override;
        void generateDisassembly() override;

        bool isScalarRegister(int opIdx) override;
        bool isVectorRegister(int opIdx) override;
        int getRegisterIndex(int opIdx, GPUDynInstPtr gpuDynInst) override;

      protected:
        // first instruction DWORD
        InFmt_SOP2 instData;
        // possible second DWORD
        InstFormat extData;
        uint32_t varSize;

      private:
        bool hasSecondDword(InFmt_SOP2 *);
    }; // Inst_SOP2

    class Inst_SOPK : public GCN3GPUStaticInst
    {
      public:
        Inst_SOPK(InFmt_SOPK*, const std::string &opcode);
        ~Inst_SOPK();

        int instSize() const override;
        void generateDisassembly() override;

        bool isScalarRegister(int opIdx) override;
        bool isVectorRegister(int opIdx) override;
        int getRegisterIndex(int opIdx, GPUDynInstPtr gpuDynInst) override;

      protected:
        // first instruction DWORD
        InFmt_SOPK instData;
    }; // Inst_SOPK

    class Inst_SOP1 : public GCN3GPUStaticInst
    {
      public:
        Inst_SOP1(InFmt_SOP1*, const std::string &opcode);
        ~Inst_SOP1();

        int instSize() const override;
        void generateDisassembly() override;

        bool isScalarRegister(int opIdx) override;
        bool isVectorRegister(int opIdx) override;
        int getRegisterIndex(int opIdx, GPUDynInstPtr gpuDynInst) override;

      protected:
        // first instruction DWORD
        InFmt_SOP1 instData;
        // possible second DWORD
        InstFormat extData;
        uint32_t varSize;

      private:
        bool hasSecondDword(InFmt_SOP1 *);
    }; // Inst_SOP1

    class Inst_SOPC : public GCN3GPUStaticInst
    {
      public:
        Inst_SOPC(InFmt_SOPC*, const std::string &opcode);
        ~Inst_SOPC();

        int instSize() const override;
        void generateDisassembly() override;

        bool isScalarRegister(int opIdx) override;
        bool isVectorRegister(int opIdx) override;
        int getRegisterIndex(int opIdx, GPUDynInstPtr gpuDynInst) override;

      protected:
        // first instruction DWORD
        InFmt_SOPC instData;
        // possible second DWORD
        InstFormat extData;
        uint32_t varSize;

      private:
        bool hasSecondDword(InFmt_SOPC *);
    }; // Inst_SOPC

    class Inst_SOPP : public GCN3GPUStaticInst
    {
      public:
        Inst_SOPP(InFmt_SOPP*, const std::string &opcode);
        ~Inst_SOPP();

        int instSize() const override;
        void generateDisassembly() override;

        bool isScalarRegister(int opIdx) override;
        bool isVectorRegister(int opIdx) override;
        int getRegisterIndex(int opIdx, GPUDynInstPtr gpuDynInst) override;

      protected:
        // first instruction DWORD
        InFmt_SOPP instData;
    }; // Inst_SOPP

    class Inst_SMEM : public GCN3GPUStaticInst
    {
      public:
        Inst_SMEM(InFmt_SMEM*, const std::string &opcode);
        ~Inst_SMEM();

        int instSize() const override;
        void generateDisassembly() override;

        bool isScalarRegister(int opIdx) override;
        bool isVectorRegister(int opIdx) override;
        int getRegisterIndex(int opIdx, GPUDynInstPtr gpuDynInst) override;

      protected:
        /**
         * initiate a memory read access for N dwords
         */
        template<int N>
        void
        initMemRead(GPUDynInstPtr gpuDynInst)
        {
            int block_size = gpuDynInst->computeUnit()->cacheLineSize();
            int req_size = N * sizeof(ScalarRegU32);
            Addr vaddr = gpuDynInst->scalarAddr;

            /**
             * the base address of the cache line where the last byte of
             * the request will be stored.
             */
            Addr split_addr = roundDown(vaddr + req_size - 1, block_size);

            assert(split_addr <= vaddr || split_addr - vaddr < block_size);
            /**
             * if the base cache line address of the last byte is greater
             * than the address of the first byte then we have a misaligned
             * access.
             */
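            /**
             * a purely illustrative example: with a 64-byte cache line, a
             * 16-byte read starting at vaddr 0x78 has its last byte at
             * 0x87, so split_addr = 0x80 > 0x78 and the access is split
             * into [0x78, 0x80) and [0x80, 0x88).
             */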
            bool misaligned_acc = split_addr > vaddr;

            RequestPtr req = std::make_shared<Request>(vaddr, req_size, 0,
                gpuDynInst->computeUnit()->masterId(), 0,
                gpuDynInst->wfDynId);

            if (misaligned_acc) {
                RequestPtr req1, req2;
                req->splitOnVaddr(split_addr, req1, req2);
                gpuDynInst->numScalarReqs = 2;
                gpuDynInst->setRequestFlags(req1);
                gpuDynInst->setRequestFlags(req2);
                PacketPtr pkt1 = new Packet(req1, MemCmd::ReadReq);
                PacketPtr pkt2 = new Packet(req2, MemCmd::ReadReq);
                pkt1->dataStatic(gpuDynInst->scalar_data);
                pkt2->dataStatic(gpuDynInst->scalar_data + req1->getSize());
                gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt1);
                gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt2);
            } else {
                gpuDynInst->numScalarReqs = 1;
                gpuDynInst->setRequestFlags(req);
                PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
                pkt->dataStatic(gpuDynInst->scalar_data);
                gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt);
            }
        }

        /**
         * initiate a memory write access for N dwords
         */
        template<int N>
        void
        initMemWrite(GPUDynInstPtr gpuDynInst)
        {
            int block_size = gpuDynInst->computeUnit()->cacheLineSize();
            int req_size = N * sizeof(ScalarRegU32);
            Addr vaddr = gpuDynInst->scalarAddr;

            /**
             * the base address of the cache line where the last byte of
             * the request will be stored.
             */
            Addr split_addr = roundDown(vaddr + req_size - 1, block_size);

            assert(split_addr <= vaddr || split_addr - vaddr < block_size);
            /**
             * if the base cache line address of the last byte is greater
             * than the address of the first byte then we have a misaligned
             * access.
             */
            bool misaligned_acc = split_addr > vaddr;

            RequestPtr req = std::make_shared<Request>(vaddr, req_size, 0,
                gpuDynInst->computeUnit()->masterId(), 0,
                gpuDynInst->wfDynId);

            if (misaligned_acc) {
                RequestPtr req1, req2;
                req->splitOnVaddr(split_addr, req1, req2);
                gpuDynInst->numScalarReqs = 2;
                gpuDynInst->setRequestFlags(req1);
                gpuDynInst->setRequestFlags(req2);
                PacketPtr pkt1 = new Packet(req1, MemCmd::WriteReq);
                PacketPtr pkt2 = new Packet(req2, MemCmd::WriteReq);
                pkt1->dataStatic(gpuDynInst->scalar_data);
                pkt2->dataStatic(gpuDynInst->scalar_data + req1->getSize());
                gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt1);
                gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt2);
            } else {
                gpuDynInst->numScalarReqs = 1;
                gpuDynInst->setRequestFlags(req);
                PacketPtr pkt = new Packet(req, MemCmd::WriteReq);
                pkt->dataStatic(gpuDynInst->scalar_data);
                gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt);
            }
        }

        void
        calcAddr(GPUDynInstPtr gpuDynInst, ConstScalarOperandU64 &addr,
                 ScalarRegU32 offset)
        {
            Addr vaddr = addr.rawData();
            vaddr += offset;
            vaddr &= ~0x3;
            gpuDynInst->scalarAddr = vaddr;
        }

        // first instruction DWORD
        InFmt_SMEM instData;
        // second instruction DWORD
        InFmt_SMEM_1 extData;
    }; // Inst_SMEM

    class Inst_VOP2 : public GCN3GPUStaticInst
    {
      public:
        Inst_VOP2(InFmt_VOP2*, const std::string &opcode);
        ~Inst_VOP2();

        int instSize() const override;
        void generateDisassembly() override;

        bool isScalarRegister(int opIdx) override;
        bool isVectorRegister(int opIdx) override;
        int getRegisterIndex(int opIdx, GPUDynInstPtr gpuDynInst) override;

      protected:
        // first instruction DWORD
        InFmt_VOP2 instData;
        // possible second DWORD
        InstFormat extData;
        uint32_t varSize;

      private:
        bool hasSecondDword(InFmt_VOP2 *);
    }; // Inst_VOP2

    class Inst_VOP1 : public GCN3GPUStaticInst
    {
      public:
        Inst_VOP1(InFmt_VOP1*, const std::string &opcode);
        ~Inst_VOP1();

        int instSize() const override;
        void generateDisassembly() override;

        bool isScalarRegister(int opIdx) override;
        bool isVectorRegister(int opIdx) override;
        int getRegisterIndex(int opIdx, GPUDynInstPtr gpuDynInst) override;

      protected:
        // first instruction DWORD
        InFmt_VOP1 instData;
        // possible second DWORD
        InstFormat extData;
        uint32_t varSize;

      private:
        bool hasSecondDword(InFmt_VOP1 *);
    }; // Inst_VOP1

    class Inst_VOPC : public GCN3GPUStaticInst
    {
      public:
        Inst_VOPC(InFmt_VOPC*, const std::string &opcode);
        ~Inst_VOPC();

        int instSize() const override;
        void generateDisassembly() override;

        bool isScalarRegister(int opIdx) override;
        bool isVectorRegister(int opIdx) override;
        int getRegisterIndex(int opIdx, GPUDynInstPtr gpuDynInst) override;

      protected:
        // first instruction DWORD
        InFmt_VOPC instData;
        // possible second DWORD
        InstFormat extData;
        uint32_t varSize;

      private:
        bool hasSecondDword(InFmt_VOPC *);
    }; // Inst_VOPC

    class Inst_VINTRP : public GCN3GPUStaticInst
    {
      public:
        Inst_VINTRP(InFmt_VINTRP*, const std::string &opcode);
        ~Inst_VINTRP();

        int instSize() const override;

      protected:
        // first instruction DWORD
        InFmt_VINTRP instData;
    }; // Inst_VINTRP

    class Inst_VOP3 : public GCN3GPUStaticInst
    {
      public:
        Inst_VOP3(InFmt_VOP3*, const std::string &opcode, bool sgpr_dst);
        ~Inst_VOP3();

        int instSize() const override;
        void generateDisassembly() override;

        bool isScalarRegister(int opIdx) override;
        bool isVectorRegister(int opIdx) override;
        int getRegisterIndex(int opIdx, GPUDynInstPtr gpuDynInst) override;

      protected:
        // first instruction DWORD
        InFmt_VOP3 instData;
        // second instruction DWORD
        InFmt_VOP3_1 extData;

      private:
        bool hasSecondDword(InFmt_VOP3 *);
        /**
         * the v_cmp and readlane instructions in the VOP3
         * encoding are unique because they are the only
         * instructions that use the VDST field to specify
         * a scalar register destination. for VOP3::V_CMP insts
         * VDST specifies the arbitrary SGPR pair used to write
         * VCC. for V_READLANE VDST specifies the SGPR to return
         * the value of the selected lane in the source VGPR
         * from which we are reading.
         */
        const bool sgprDst;
    }; // Inst_VOP3

    class Inst_VOP3_SDST_ENC : public GCN3GPUStaticInst
    {
      public:
        Inst_VOP3_SDST_ENC(InFmt_VOP3_SDST_ENC*, const std::string &opcode);
        ~Inst_VOP3_SDST_ENC();

        int instSize() const override;
        void generateDisassembly() override;

        bool isScalarRegister(int opIdx) override;
        bool isVectorRegister(int opIdx) override;
        int getRegisterIndex(int opIdx, GPUDynInstPtr gpuDynInst) override;

      protected:
        // first instruction DWORD
        InFmt_VOP3_SDST_ENC instData;
        // second instruction DWORD
        InFmt_VOP3_1 extData;

      private:
        bool hasSecondDword(InFmt_VOP3_SDST_ENC *);
    }; // Inst_VOP3_SDST_ENC

    class Inst_DS : public GCN3GPUStaticInst
    {
      public:
        Inst_DS(InFmt_DS*, const std::string &opcode);
        ~Inst_DS();

        int instSize() const override;
        void generateDisassembly() override;

        bool isScalarRegister(int opIdx) override;
        bool isVectorRegister(int opIdx) override;
        int getRegisterIndex(int opIdx, GPUDynInstPtr gpuDynInst) override;

      protected:
        template<typename T>
        void
        initMemRead(GPUDynInstPtr gpuDynInst, Addr offset)
        {
            Wavefront *wf = gpuDynInst->wavefront();

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr = gpuDynInst->addr[lane] + offset;

                    (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]
                        = wf->ldsChunk->read<T>(vaddr);
                }
            }
        }

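        /**
         * per-lane LDS read of two values, one at offset0 and one at
         * offset1, packed into adjacent elements of d_data (the
         * two-address read variants use this).
         */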
        template<typename T>
        void
        initDualMemRead(GPUDynInstPtr gpuDynInst, Addr offset0, Addr offset1)
        {
            Wavefront *wf = gpuDynInst->wavefront();

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr0 = gpuDynInst->addr[lane] + offset0;
                    Addr vaddr1 = gpuDynInst->addr[lane] + offset1;

                    (reinterpret_cast<T*>(gpuDynInst->d_data))[lane * 2]
                        = wf->ldsChunk->read<T>(vaddr0);
                    (reinterpret_cast<T*>(gpuDynInst->d_data))[lane * 2 + 1]
                        = wf->ldsChunk->read<T>(vaddr1);
                }
            }
        }

        template<typename T>
        void
        initMemWrite(GPUDynInstPtr gpuDynInst, Addr offset)
        {
            Wavefront *wf = gpuDynInst->wavefront();

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr = gpuDynInst->addr[lane] + offset;
                    wf->ldsChunk->write<T>(vaddr,
                        (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]);
                }
            }
        }

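        /**
         * per-lane LDS write of two values taken from adjacent elements
         * of d_data, one to offset0 and one to offset1.
         */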
        template<typename T>
        void
        initDualMemWrite(GPUDynInstPtr gpuDynInst, Addr offset0, Addr offset1)
        {
            Wavefront *wf = gpuDynInst->wavefront();

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr0 = gpuDynInst->addr[lane] + offset0;
                    Addr vaddr1 = gpuDynInst->addr[lane] + offset1;
                    wf->ldsChunk->write<T>(vaddr0, (reinterpret_cast<T*>(
                        gpuDynInst->d_data))[lane * 2]);
                    wf->ldsChunk->write<T>(vaddr1, (reinterpret_cast<T*>(
                        gpuDynInst->d_data))[lane * 2 + 1]);
                }
            }
        }

        void
        calcAddr(GPUDynInstPtr gpuDynInst, ConstVecOperandU32 &addr)
        {
            Wavefront *wf = gpuDynInst->wavefront();

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    gpuDynInst->addr.at(lane) = (Addr)addr[lane];
                }
            }
        }

        // first instruction DWORD
        InFmt_DS instData;
        // second instruction DWORD
        InFmt_DS_1 extData;
    }; // Inst_DS

    class Inst_MUBUF : public GCN3GPUStaticInst
    {
      public:
        Inst_MUBUF(InFmt_MUBUF*, const std::string &opcode);
        ~Inst_MUBUF();

        int instSize() const override;
        void generateDisassembly() override;

        bool isScalarRegister(int opIdx) override;
        bool isVectorRegister(int opIdx) override;
        int getRegisterIndex(int opIdx, GPUDynInstPtr gpuDynInst) override;

      protected:
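        /**
         * layout of the buffer resource descriptor read from the SRSRC
         * scalar operand; the bit fields below total 128 bits.
         */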
        struct BufferRsrcDescriptor
        {
            uint64_t baseAddr : 48;
            uint32_t stride : 14;
            uint32_t cacheSwizzle : 1;
            uint32_t swizzleEn : 1;
            uint32_t numRecords : 32;
            uint32_t dstSelX : 3;
            uint32_t dstSelY : 3;
            uint32_t dstSelZ : 3;
            uint32_t dstSelW : 3;
            uint32_t numFmt : 3;
            uint32_t dataFmt : 4;
            uint32_t elemSize : 2;
            uint32_t idxStride : 2;
            uint32_t addTidEn : 1;
            uint32_t atc : 1;
            uint32_t hashEn : 1;
            uint32_t heap : 1;
            uint32_t mType : 3;
            uint32_t type : 2;
        };

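        /**
         * per-lane global memory read of a single element of type T; one
         * request and packet is issued for each active lane.
         */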
        template<typename T>
        void
        initMemRead(GPUDynInstPtr gpuDynInst)
        {
            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr = gpuDynInst->addr[lane];

                    RequestPtr req = std::make_shared<Request>(vaddr,
                        sizeof(T), 0,
                        gpuDynInst->computeUnit()->masterId(), 0,
                        gpuDynInst->wfDynId);

                    gpuDynInst->setRequestFlags(req);

                    PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
                    pkt->dataStatic(&(reinterpret_cast<T*>(
                        gpuDynInst->d_data))[lane]);

                    gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane,
                                                           pkt);
                }
            }
        }

        template<typename T>
        void
        initMemWrite(GPUDynInstPtr gpuDynInst)
        {
            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr = gpuDynInst->addr[lane];

                    RequestPtr req = std::make_shared<Request>(vaddr,
                        sizeof(T), 0,
                        gpuDynInst->computeUnit()->masterId(),
                        0, gpuDynInst->wfDynId);

                    gpuDynInst->setRequestFlags(req);
                    PacketPtr pkt = new Packet(req, MemCmd::WriteReq);
                    pkt->dataStatic(&(reinterpret_cast<T*>(
                        gpuDynInst->d_data))[lane]);
                    gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane,
                                                           pkt);
                }
            }
        }

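        /**
         * issue a single zero-size request that the compute unit turns
         * into a global memory fence.
         */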
        void
        injectGlobalMemFence(GPUDynInstPtr gpuDynInst)
        {
            // create request and set flags
            gpuDynInst->statusBitVector = VectorMask(1);
            RequestPtr req = std::make_shared<Request>(0, 0, 0,
                gpuDynInst->computeUnit()->masterId(), 0,
                gpuDynInst->wfDynId);
            gpuDynInst->setRequestFlags(req);
            gpuDynInst->computeUnit()->
                injectGlobalMemFence(gpuDynInst, false, req);
        }

        /**
         * MUBUF instructions calculate their addresses as follows:
         *
         * index = (IDXEN ? vgpr_idx : 0) + (const_add_tid_en ? TID : 0)
         * offset = (OFFEN ? vgpr_off : 0) + inst_off
         *
         * / ====================== LINEAR ADDRESSING ====================== /
         * VADDR = base + sgpr_off + offset + stride * index
         *
         * / ===================== SWIZZLED ADDRESSING ===================== /
         * index_msb = index / const_index_stride
         * index_lsb = index % const_index_stride
         * offset_msb = offset / const_element_size
         * offset_lsb = offset % const_element_size
         * buffer_offset = ((index_msb * stride + offset_msb *
         *                 const_element_size) * const_index_stride +
         *                 index_lsb * const_element_size + offset_lsb)
         *
         * VADDR = base + sgpr_off + buffer_offset
         */
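        /**
         * a hypothetical swizzled example, with values chosen purely for
         * illustration: IDXSTRIDE = 1 and ELEMSIZE = 1 give
         * const_index_stride = 8 << 1 = 16 and const_element_size =
         * 2 << 1 = 4; for stride = 16, index = 18, offset = 6 this yields
         * index_msb = 1, index_lsb = 2, offset_msb = 1, offset_lsb = 2,
         * so buffer_offset = (1 * 16 + 1 * 4) * 16 + 2 * 4 + 2 = 330.
         */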
        template<typename VOFF, typename VIDX, typename SRSRC, typename SOFF>
        void
        calcAddr(GPUDynInstPtr gpuDynInst, VOFF v_off, VIDX v_idx,
                 SRSRC s_rsrc_desc, SOFF s_offset, int inst_offset)
        {
            Addr vaddr = 0;
            Addr base_addr = 0;
            Addr stride = 0;
            Addr buf_idx = 0;
            Addr buf_off = 0;
            BufferRsrcDescriptor rsrc_desc;

            std::memcpy((void*)&rsrc_desc, s_rsrc_desc.rawDataPtr(),
                        sizeof(BufferRsrcDescriptor));

            base_addr = rsrc_desc.baseAddr;

            stride = rsrc_desc.addTidEn ? ((rsrc_desc.dataFmt << 14)
                + rsrc_desc.stride) : rsrc_desc.stride;

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    vaddr = base_addr + s_offset.rawData();
                    /**
                     * first we calculate the buffer's index and offset.
                     * these will be used for either linear or swizzled
                     * buffers.
                     */
                    buf_idx = v_idx[lane] + (rsrc_desc.addTidEn ? lane : 0);

                    buf_off = v_off[lane] + inst_offset;

                    if (rsrc_desc.swizzleEn) {
                        Addr idx_stride = 8 << rsrc_desc.idxStride;
                        Addr elem_size = 2 << rsrc_desc.elemSize;
                        Addr idx_msb = buf_idx / idx_stride;
                        Addr idx_lsb = buf_idx % idx_stride;
                        Addr off_msb = buf_off / elem_size;
                        Addr off_lsb = buf_off % elem_size;

                        vaddr += ((idx_msb * stride + off_msb * elem_size)
                            * idx_stride + idx_lsb * elem_size + off_lsb);
                    } else {
                        vaddr += buf_off + stride * buf_idx;
                    }

                    gpuDynInst->addr.at(lane) = vaddr;
                }
            }
        }

        // first instruction DWORD
        InFmt_MUBUF instData;
        // second instruction DWORD
        InFmt_MUBUF_1 extData;
    }; // Inst_MUBUF

    class Inst_MTBUF : public GCN3GPUStaticInst
    {
      public:
        Inst_MTBUF(InFmt_MTBUF*, const std::string &opcode);
        ~Inst_MTBUF();

        int instSize() const override;

      protected:
        // first instruction DWORD
        InFmt_MTBUF instData;
        // second instruction DWORD
        InFmt_MTBUF_1 extData;

      private:
        bool hasSecondDword(InFmt_MTBUF *);
    }; // Inst_MTBUF

    class Inst_MIMG : public GCN3GPUStaticInst
    {
      public:
        Inst_MIMG(InFmt_MIMG*, const std::string &opcode);
        ~Inst_MIMG();

        int instSize() const override;

      protected:
        // first instruction DWORD
        InFmt_MIMG instData;
        // second instruction DWORD
        InFmt_MIMG_1 extData;
    }; // Inst_MIMG

    class Inst_EXP : public GCN3GPUStaticInst
    {
      public:
        Inst_EXP(InFmt_EXP*, const std::string &opcode);
        ~Inst_EXP();

        int instSize() const override;

      protected:
        // first instruction DWORD
        InFmt_EXP instData;
        // second instruction DWORD
        InFmt_EXP_1 extData;
    }; // Inst_EXP

    class Inst_FLAT : public GCN3GPUStaticInst
    {
      public:
        Inst_FLAT(InFmt_FLAT*, const std::string &opcode);
        ~Inst_FLAT();

        int instSize() const override;
        void generateDisassembly() override;

        bool isScalarRegister(int opIdx) override;
        bool isVectorRegister(int opIdx) override;
        int getRegisterIndex(int opIdx, GPUDynInstPtr gpuDynInst) override;

      protected:
        template<typename T>
        void
        initMemRead(GPUDynInstPtr gpuDynInst)
        {
            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr = gpuDynInst->addr[lane];

                    RequestPtr req = std::make_shared<Request>(vaddr,
                        sizeof(T), 0,
                        gpuDynInst->computeUnit()->masterId(), 0,
                        gpuDynInst->wfDynId);

                    gpuDynInst->setRequestFlags(req);
                    PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
                    pkt->dataStatic(&(reinterpret_cast<T*>(
                        gpuDynInst->d_data))[lane]);
                    gpuDynInst->computeUnit()
                        ->sendRequest(gpuDynInst, lane, pkt);
                }
            }
        }

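        /**
         * per-lane read of N consecutive dwords; lane i's data lands at
         * element i * N of d_data.
         */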
        template<int N>
        void
        initMemRead(GPUDynInstPtr gpuDynInst)
        {
            int req_size = N * sizeof(VecElemU32);
            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr = gpuDynInst->addr[lane];

                    RequestPtr req = std::make_shared<Request>(vaddr,
                        req_size, 0,
                        gpuDynInst->computeUnit()->masterId(), 0,
                        gpuDynInst->wfDynId);

                    gpuDynInst->setRequestFlags(req);
                    PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
                    pkt->dataStatic(&(reinterpret_cast<VecElemU32*>(
                        gpuDynInst->d_data))[lane * N]);
                    gpuDynInst->computeUnit()
                        ->sendRequest(gpuDynInst, lane, pkt);
                }
            }
        }

        template<typename T>
        void
        initMemWrite(GPUDynInstPtr gpuDynInst)
        {
            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr = gpuDynInst->addr[lane];

                    RequestPtr req = std::make_shared<Request>(vaddr,
                        sizeof(T), 0,
                        gpuDynInst->computeUnit()->masterId(),
                        0, gpuDynInst->wfDynId);

                    gpuDynInst->setRequestFlags(req);
                    PacketPtr pkt = new Packet(req, MemCmd::WriteReq);
                    pkt->dataStatic(&(reinterpret_cast<T*>(
                        gpuDynInst->d_data))[lane]);
                    gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane,
                                                           pkt);
                }
            }
        }

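        /**
         * per-lane write of N consecutive dwords taken from element
         * i * N of d_data for lane i.
         */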
        template<int N>
        void
        initMemWrite(GPUDynInstPtr gpuDynInst)
        {
            int req_size = N * sizeof(VecElemU32);
            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr = gpuDynInst->addr[lane];

                    RequestPtr req = std::make_shared<Request>(vaddr,
                        req_size, 0,
                        gpuDynInst->computeUnit()->masterId(),
                        0, gpuDynInst->wfDynId);

                    gpuDynInst->setRequestFlags(req);
                    PacketPtr pkt = new Packet(req, MemCmd::WriteReq);
                    pkt->dataStatic(&(reinterpret_cast<VecElemU32*>(
                        gpuDynInst->d_data))[lane * N]);
                    gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane,
                                                           pkt);
                }
            }
        }

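        /**
         * per-lane atomic access: the atomic operation is supplied by the
         * functor built from a_data and x_data, and each lane's packet
         * data points into d_data, which receives the response.
         */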
        template<typename T>
        void
        initAtomicAccess(GPUDynInstPtr gpuDynInst)
        {
            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr = gpuDynInst->addr[lane];

                    RequestPtr req = std::make_shared<Request>(vaddr,
                        sizeof(T), 0,
                        gpuDynInst->computeUnit()->masterId(), 0,
                        gpuDynInst->wfDynId,
                        gpuDynInst->makeAtomicOpFunctor<T>(
                            &(reinterpret_cast<T*>(gpuDynInst->a_data))[lane],
                            &(reinterpret_cast<T*>(
                                gpuDynInst->x_data))[lane]));

                    gpuDynInst->setRequestFlags(req);

                    PacketPtr pkt = new Packet(req, MemCmd::SwapReq);
                    pkt->dataStatic(&(reinterpret_cast<T*>(
                        gpuDynInst->d_data))[lane]);

                    gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane,
                                                           pkt);
                }
            }
        }

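        /**
         * copy each active lane's 64-bit address into the dynamic
         * instruction, then resolve which memory segment the flat
         * addresses fall in.
         */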
        void
        calcAddr(GPUDynInstPtr gpuDynInst, ConstVecOperandU64 &addr)
        {
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    gpuDynInst->addr.at(lane) = addr[lane];
                }
            }
            gpuDynInst->resolveFlatSegment(gpuDynInst->exec_mask);
        }

        // first instruction DWORD
        InFmt_FLAT instData;
        // second instruction DWORD
        InFmt_FLAT_1 extData;
    }; // Inst_FLAT
} // namespace Gcn3ISA

#endif // __ARCH_GCN3_INSTS_OP_ENCODINGS_HH__