/*
 * Copyright (c) 2016-2017 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Anthony Gutierrez
 */

#ifndef __ARCH_GCN3_INSTS_OP_ENCODINGS_HH__
#define __ARCH_GCN3_INSTS_OP_ENCODINGS_HH__

#include "arch/gcn3/gpu_decoder.hh"
#include "arch/gcn3/insts/gpu_static_inst.hh"
#include "arch/gcn3/operand.hh"
#include "debug/GPUExec.hh"
#include "mem/ruby/system/RubySystem.hh"

namespace Gcn3ISA
{
    // --- purely virtual instruction classes ---

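    /*
     * Each class below wraps one GCN3 instruction encoding. Every encoding
     * stores the raw first instruction DWORD (instData); formats that may
     * carry a trailing 32-bit literal constant or extended fields also
     * store a possible second DWORD (extData) and use hasSecondDword() so
     * instSize() can report the correct instruction length to the decoder.
     */
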
    class Inst_SOP2 : public GCN3GPUStaticInst
    {
      public:
        Inst_SOP2(InFmt_SOP2*, const std::string &opcode);

        int instSize() const override;
        void generateDisassembly() override;

        bool isScalarRegister(int opIdx) override;
        bool isVectorRegister(int opIdx) override;
        int getRegisterIndex(int opIdx, GPUDynInstPtr gpuDynInst) override;

      protected:
        // first instruction DWORD
        InFmt_SOP2 instData;
        // possible second DWORD
        InstFormat extData;
        uint32_t varSize;

      private:
        bool hasSecondDword(InFmt_SOP2 *);
    }; // Inst_SOP2

    class Inst_SOPK : public GCN3GPUStaticInst
    {
      public:
        Inst_SOPK(InFmt_SOPK*, const std::string &opcode);
        ~Inst_SOPK();

        int instSize() const override;
        void generateDisassembly() override;

        bool isScalarRegister(int opIdx) override;
        bool isVectorRegister(int opIdx) override;
        int getRegisterIndex(int opIdx, GPUDynInstPtr gpuDynInst) override;

      protected:
        // first instruction DWORD
        InFmt_SOPK instData;
    }; // Inst_SOPK

    class Inst_SOP1 : public GCN3GPUStaticInst
    {
      public:
        Inst_SOP1(InFmt_SOP1*, const std::string &opcode);
        ~Inst_SOP1();

        int instSize() const override;
        void generateDisassembly() override;

        bool isScalarRegister(int opIdx) override;
        bool isVectorRegister(int opIdx) override;
        int getRegisterIndex(int opIdx, GPUDynInstPtr gpuDynInst) override;

      protected:
        // first instruction DWORD
        InFmt_SOP1 instData;
        // possible second DWORD
        InstFormat extData;
        uint32_t varSize;

      private:
        bool hasSecondDword(InFmt_SOP1 *);
    }; // Inst_SOP1

    class Inst_SOPC : public GCN3GPUStaticInst
    {
      public:
        Inst_SOPC(InFmt_SOPC*, const std::string &opcode);
        ~Inst_SOPC();

        int instSize() const override;
        void generateDisassembly() override;

        bool isScalarRegister(int opIdx) override;
        bool isVectorRegister(int opIdx) override;
        int getRegisterIndex(int opIdx, GPUDynInstPtr gpuDynInst) override;

      protected:
        // first instruction DWORD
        InFmt_SOPC instData;
        // possible second DWORD
        InstFormat extData;
        uint32_t varSize;

      private:
        bool hasSecondDword(InFmt_SOPC *);
    }; // Inst_SOPC

    class Inst_SOPP : public GCN3GPUStaticInst
    {
      public:
        Inst_SOPP(InFmt_SOPP*, const std::string &opcode);
        ~Inst_SOPP();

        int instSize() const override;
        void generateDisassembly() override;

        bool isScalarRegister(int opIdx) override;
        bool isVectorRegister(int opIdx) override;
        int getRegisterIndex(int opIdx, GPUDynInstPtr gpuDynInst) override;

      protected:
        // first instruction DWORD
        InFmt_SOPP instData;
    }; // Inst_SOPP

    class Inst_SMEM : public GCN3GPUStaticInst
    {
      public:
        Inst_SMEM(InFmt_SMEM*, const std::string &opcode);
        ~Inst_SMEM();

        int instSize() const override;
        void generateDisassembly() override;

        bool isScalarRegister(int opIdx) override;
        bool isVectorRegister(int opIdx) override;
        int getRegisterIndex(int opIdx, GPUDynInstPtr gpuDynInst) override;

      protected:
        /**
         * initiate a memory read access for N dwords
         */
        template<int N>
        void
        initMemRead(GPUDynInstPtr gpuDynInst)
        {
            int block_size = gpuDynInst->computeUnit()->cacheLineSize();
            int req_size = N * sizeof(ScalarRegU32);
            Addr vaddr = gpuDynInst->scalarAddr;

            /**
             * the base address of the cache line where the last byte of
             * the request will be stored.
             */
            Addr split_addr = roundDown(vaddr + req_size - 1, block_size);

            assert(split_addr <= vaddr || split_addr - vaddr < block_size);

            /**
             * if the base cache line address of the last byte is greater
             * than the address of the first byte then we have a
             * misaligned access.
             */
            bool misaligned_acc = split_addr > vaddr;

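            /*
             * Worked example (assuming a 64-byte cache line): an N=2
             * (8-byte) read at vaddr 0x3c ends at byte 0x43, so
             * split_addr = roundDown(0x43, 64) = 0x40 > 0x3c, and the
             * access is split into [0x3c, 0x40) and [0x40, 0x44).
             */
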
            RequestPtr req = std::make_shared<Request>(vaddr, req_size, 0,
                gpuDynInst->computeUnit()->masterId(), 0,
                gpuDynInst->wfDynId);

            if (misaligned_acc) {
                RequestPtr req1, req2;
                req->splitOnVaddr(split_addr, req1, req2);
                gpuDynInst->numScalarReqs = 2;
                gpuDynInst->setRequestFlags(req1);
                gpuDynInst->setRequestFlags(req2);
                PacketPtr pkt1 = new Packet(req1, MemCmd::ReadReq);
                PacketPtr pkt2 = new Packet(req2, MemCmd::ReadReq);
                pkt1->dataStatic(gpuDynInst->scalar_data);
                pkt2->dataStatic(gpuDynInst->scalar_data + req1->getSize());
                gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt1);
                gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt2);
            } else {
                gpuDynInst->numScalarReqs = 1;
                gpuDynInst->setRequestFlags(req);
                PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
                pkt->dataStatic(gpuDynInst->scalar_data);
                gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt);
            }
        }

        /**
         * initiate a memory write access for N dwords
         */
        template<int N>
        void
        initMemWrite(GPUDynInstPtr gpuDynInst)
        {
            int block_size = gpuDynInst->computeUnit()->cacheLineSize();
            int req_size = N * sizeof(ScalarRegU32);
            Addr vaddr = gpuDynInst->scalarAddr;

            /**
             * the base address of the cache line where the last byte of
             * the request will be stored.
             */
            Addr split_addr = roundDown(vaddr + req_size - 1, block_size);

            assert(split_addr <= vaddr || split_addr - vaddr < block_size);

            /**
             * if the base cache line address of the last byte is greater
             * than the address of the first byte then we have a
             * misaligned access.
             */
            bool misaligned_acc = split_addr > vaddr;

            RequestPtr req = std::make_shared<Request>(vaddr, req_size, 0,
                gpuDynInst->computeUnit()->masterId(), 0,
                gpuDynInst->wfDynId);

            if (misaligned_acc) {
                RequestPtr req1, req2;
                req->splitOnVaddr(split_addr, req1, req2);
                gpuDynInst->numScalarReqs = 2;
                gpuDynInst->setRequestFlags(req1);
                gpuDynInst->setRequestFlags(req2);
                PacketPtr pkt1 = new Packet(req1, MemCmd::WriteReq);
                PacketPtr pkt2 = new Packet(req2, MemCmd::WriteReq);
                pkt1->dataStatic(gpuDynInst->scalar_data);
                pkt2->dataStatic(gpuDynInst->scalar_data + req1->getSize());
                gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt1);
                gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt2);
            } else {
                gpuDynInst->numScalarReqs = 1;
                gpuDynInst->setRequestFlags(req);
                PacketPtr pkt = new Packet(req, MemCmd::WriteReq);
                pkt->dataStatic(gpuDynInst->scalar_data);
                gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt);
            }
        }

        void
        calcAddr(GPUDynInstPtr gpuDynInst, ConstScalarOperandU64 &addr,
                 ScalarRegU32 offset)
        {
            Addr vaddr = addr.rawData();
            vaddr += offset;
            gpuDynInst->scalarAddr = vaddr;
        }

        // first instruction DWORD
        InFmt_SMEM instData;
        // second instruction DWORD
        InFmt_SMEM_1 extData;
    }; // Inst_SMEM

    class Inst_VOP2 : public GCN3GPUStaticInst
    {
      public:
        Inst_VOP2(InFmt_VOP2*, const std::string &opcode);
        ~Inst_VOP2();

        int instSize() const override;
        void generateDisassembly() override;

        bool isScalarRegister(int opIdx) override;
        bool isVectorRegister(int opIdx) override;
        int getRegisterIndex(int opIdx, GPUDynInstPtr gpuDynInst) override;

      protected:
        // first instruction DWORD
        InFmt_VOP2 instData;
        // possible second DWORD
        InstFormat extData;
        uint32_t varSize;

      private:
        bool hasSecondDword(InFmt_VOP2 *);
    }; // Inst_VOP2

    class Inst_VOP1 : public GCN3GPUStaticInst
    {
      public:
        Inst_VOP1(InFmt_VOP1*, const std::string &opcode);
        ~Inst_VOP1();

        int instSize() const override;
        void generateDisassembly() override;

        bool isScalarRegister(int opIdx) override;
        bool isVectorRegister(int opIdx) override;
        int getRegisterIndex(int opIdx, GPUDynInstPtr gpuDynInst) override;

      protected:
        // first instruction DWORD
        InFmt_VOP1 instData;
        // possible second DWORD
        InstFormat extData;
        uint32_t varSize;

      private:
        bool hasSecondDword(InFmt_VOP1 *);
    }; // Inst_VOP1

    class Inst_VOPC : public GCN3GPUStaticInst
    {
      public:
        Inst_VOPC(InFmt_VOPC*, const std::string &opcode);
        ~Inst_VOPC();

        int instSize() const override;
        void generateDisassembly() override;

        bool isScalarRegister(int opIdx) override;
        bool isVectorRegister(int opIdx) override;
        int getRegisterIndex(int opIdx, GPUDynInstPtr gpuDynInst) override;

      protected:
        // first instruction DWORD
        InFmt_VOPC instData;
        // possible second DWORD
        InstFormat extData;
        uint32_t varSize;

      private:
        bool hasSecondDword(InFmt_VOPC *);
    }; // Inst_VOPC

    class Inst_VINTRP : public GCN3GPUStaticInst
    {
      public:
        Inst_VINTRP(InFmt_VINTRP*, const std::string &opcode);
        ~Inst_VINTRP();

        int instSize() const override;

      protected:
        // first instruction DWORD
        InFmt_VINTRP instData;
    }; // Inst_VINTRP

    class Inst_VOP3 : public GCN3GPUStaticInst
    {
      public:
        Inst_VOP3(InFmt_VOP3*, const std::string &opcode, bool sgpr_dst);
        ~Inst_VOP3();

        int instSize() const override;
        void generateDisassembly() override;

        bool isScalarRegister(int opIdx) override;
        bool isVectorRegister(int opIdx) override;
        int getRegisterIndex(int opIdx, GPUDynInstPtr gpuDynInst) override;

      protected:
        // first instruction DWORD
        InFmt_VOP3 instData;
        // second instruction DWORD
        InFmt_VOP3_1 extData;

      private:
        bool hasSecondDword(InFmt_VOP3 *);
        /**
         * The v_cmp and readlane instructions in the VOP3 encoding are
         * unique because they are the only instructions that use the
         * VDST field to specify a scalar register destination. For
         * VOP3::V_CMP insts VDST specifies the arbitrary SGPR pair used
         * to write VCC; for V_READLANE VDST specifies the SGPR that
         * receives the value of the selected lane of the source VGPR.
         */
        const bool sgprDst;
    }; // Inst_VOP3

    class Inst_VOP3_SDST_ENC : public GCN3GPUStaticInst
    {
      public:
        Inst_VOP3_SDST_ENC(InFmt_VOP3_SDST_ENC*, const std::string &opcode);
        ~Inst_VOP3_SDST_ENC();

        int instSize() const override;
        void generateDisassembly() override;

        bool isScalarRegister(int opIdx) override;
        bool isVectorRegister(int opIdx) override;
        int getRegisterIndex(int opIdx, GPUDynInstPtr gpuDynInst) override;

      protected:
        // first instruction DWORD
        InFmt_VOP3_SDST_ENC instData;
        // second instruction DWORD
        InFmt_VOP3_1 extData;

      private:
        bool hasSecondDword(InFmt_VOP3_SDST_ENC *);
    }; // Inst_VOP3_SDST_ENC

    class Inst_DS : public GCN3GPUStaticInst
    {
      public:
        Inst_DS(InFmt_DS*, const std::string &opcode);
        ~Inst_DS();

        int instSize() const override;
        void generateDisassembly() override;

        bool isScalarRegister(int opIdx) override;
        bool isVectorRegister(int opIdx) override;
        int getRegisterIndex(int opIdx, GPUDynInstPtr gpuDynInst) override;

      protected:
        template<typename T>
        void
        initMemRead(GPUDynInstPtr gpuDynInst, Addr offset)
        {
            Wavefront *wf = gpuDynInst->wavefront();

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr = gpuDynInst->addr[lane] + offset;

                    (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]
                        = wf->ldsChunk->read<T>(vaddr);
                }
            }
        }

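        /*
         * The "dual" variants below service the two-address DS
         * instructions (e.g., ds_read2_b32 / ds_write2_b32), which touch
         * two LDS locations per lane; the per-lane results are
         * interleaved in d_data at [lane * 2] and [lane * 2 + 1].
         */
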
        template<typename T>
        void
        initDualMemRead(GPUDynInstPtr gpuDynInst, Addr offset0, Addr offset1)
        {
            Wavefront *wf = gpuDynInst->wavefront();

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr0 = gpuDynInst->addr[lane] + offset0;
                    Addr vaddr1 = gpuDynInst->addr[lane] + offset1;

                    (reinterpret_cast<T*>(gpuDynInst->d_data))[lane * 2]
                        = wf->ldsChunk->read<T>(vaddr0);
                    (reinterpret_cast<T*>(gpuDynInst->d_data))[lane * 2 + 1]
                        = wf->ldsChunk->read<T>(vaddr1);
                }
            }
        }

        template<typename T>
        void
        initMemWrite(GPUDynInstPtr gpuDynInst, Addr offset)
        {
            Wavefront *wf = gpuDynInst->wavefront();

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr = gpuDynInst->addr[lane] + offset;
                    wf->ldsChunk->write<T>(vaddr,
                        (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]);
                }
            }
        }

        template<typename T>
        void
        initDualMemWrite(GPUDynInstPtr gpuDynInst, Addr offset0, Addr offset1)
        {
            Wavefront *wf = gpuDynInst->wavefront();

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr0 = gpuDynInst->addr[lane] + offset0;
                    Addr vaddr1 = gpuDynInst->addr[lane] + offset1;
                    wf->ldsChunk->write<T>(vaddr0, (reinterpret_cast<T*>(
                        gpuDynInst->d_data))[lane * 2]);
                    wf->ldsChunk->write<T>(vaddr1, (reinterpret_cast<T*>(
                        gpuDynInst->d_data))[lane * 2 + 1]);
                }
            }
        }

        void
        calcAddr(GPUDynInstPtr gpuDynInst, ConstVecOperandU32 &addr)
        {
            Wavefront *wf = gpuDynInst->wavefront();

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    gpuDynInst->addr.at(lane) = (Addr)addr[lane];
                }
            }
        }

        // first instruction DWORD
        InFmt_DS instData;
        // second instruction DWORD
        InFmt_DS_1 extData;
    }; // Inst_DS

    class Inst_MUBUF : public GCN3GPUStaticInst
    {
      public:
        Inst_MUBUF(InFmt_MUBUF*, const std::string &opcode);
        ~Inst_MUBUF();

        int instSize() const override;
        void generateDisassembly() override;

        bool isScalarRegister(int opIdx) override;
        bool isVectorRegister(int opIdx) override;
        int getRegisterIndex(int opIdx, GPUDynInstPtr gpuDynInst) override;

      protected:
        struct BufferRsrcDescriptor
        {
            uint64_t baseAddr : 48;
            uint32_t stride : 14;
            uint32_t cacheSwizzle : 1;
            uint32_t swizzleEn : 1;
            uint32_t numRecords : 32;
            uint32_t dstSelX : 3;
            uint32_t dstSelY : 3;
            uint32_t dstSelZ : 3;
            uint32_t dstSelW : 3;
            uint32_t numFmt : 3;
            uint32_t dataFmt : 4;
            uint32_t elemSize : 2;
            uint32_t idxStride : 2;
            uint32_t addTidEn : 1;
            uint32_t atc : 1;
            uint32_t hashEn : 1;
            uint32_t heap : 1;
            uint32_t mType : 3;
            uint32_t type : 2;
        };

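        /*
         * This bit-field layout mirrors the 128-bit GCN3 buffer resource
         * descriptor (V#) that the kernel supplies through four
         * consecutive SGPRs; calcAddr() below copies the raw SGPR data
         * into it to decode the base, stride, and swizzle parameters.
         */
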
        template<typename T>
        void
        initMemRead(GPUDynInstPtr gpuDynInst)
        {
            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr = gpuDynInst->addr[lane];

                    RequestPtr req = std::make_shared<Request>(vaddr,
                        sizeof(T), 0,
                        gpuDynInst->computeUnit()->masterId(), 0,
                        gpuDynInst->wfDynId);

                    gpuDynInst->setRequestFlags(req);

                    PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
                    pkt->dataStatic(&(reinterpret_cast<T*>(
                        gpuDynInst->d_data))[lane]);

                    gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane,
                                                           pkt);
                }
            }
        }

        template<typename T>
        void
        initMemWrite(GPUDynInstPtr gpuDynInst)
        {
            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr = gpuDynInst->addr[lane];

                    RequestPtr req = std::make_shared<Request>(vaddr,
                        sizeof(T), 0,
                        gpuDynInst->computeUnit()->masterId(),
                        0, gpuDynInst->wfDynId);

                    gpuDynInst->setRequestFlags(req);
                    PacketPtr pkt = new Packet(req, MemCmd::WriteReq);
                    pkt->dataStatic(&(reinterpret_cast<T*>(
                        gpuDynInst->d_data))[lane]);
                    gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane,
                                                           pkt);
                }
            }
        }

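        /*
         * Builds a zero-size request that the compute unit treats as a
         * wavefront-wide memory fence rather than a data access; the
         * status vector is a single bit because only one such "request"
         * is outstanding per fence instruction.
         */
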
        void
        injectGlobalMemFence(GPUDynInstPtr gpuDynInst)
        {
            // create request and set flags
            gpuDynInst->statusBitVector = VectorMask(1);
            RequestPtr req = std::make_shared<Request>(0, 0, 0,
                gpuDynInst->computeUnit()->
                masterId(), 0,
                gpuDynInst->wfDynId);
            gpuDynInst->setRequestFlags(req);
            gpuDynInst->computeUnit()->
                injectGlobalMemFence(gpuDynInst, false, req);
        }

        /**
         * MUBUF instructions calculate their addresses as follows:
         *
         * index  = (IDXEN ? vgpr_idx : 0) + (const_add_tid_en ? TID : 0)
         * offset = (OFFEN ? vgpr_off : 0) + inst_off
         *
         * / ====================== LINEAR ADDRESSING ====================== /
         * VADDR = base + sgpr_off + offset + stride * index
         *
         * / ===================== SWIZZLED ADDRESSING ===================== /
         * index_msb = index / const_index_stride
         * index_lsb = index % const_index_stride
         * offset_msb = offset / const_element_size
         * offset_lsb = offset % const_element_size
         * buffer_offset = ((index_msb * stride + offset_msb *
         *                 const_element_size) * const_index_stride +
         *                 index_lsb * const_element_size + offset_lsb)
         *
         * VADDR = base + sgpr_off + buffer_offset
         */

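        /*
         * Swizzled example: with idxStride = 0 (const_index_stride = 8),
         * elemSize = 1 (const_element_size = 4), stride = 16, index = 10,
         * and offset = 6: index_msb = 1, index_lsb = 2, offset_msb = 1,
         * offset_lsb = 2, so buffer_offset =
         * ((1 * 16 + 1 * 4) * 8) + (2 * 4) + 2 = 170.
         */
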
        template<typename VOFF, typename VIDX, typename SRSRC, typename SOFF>
        void
        calcAddr(GPUDynInstPtr gpuDynInst, VOFF v_off, VIDX v_idx,
            SRSRC s_rsrc_desc, SOFF s_offset, int inst_offset)
        {
            Addr vaddr = 0;
            Addr base_addr = 0;
            Addr stride = 0;
            Addr buf_idx = 0;
            Addr buf_off = 0;
            BufferRsrcDescriptor rsrc_desc;

            std::memcpy((void*)&rsrc_desc, s_rsrc_desc.rawDataPtr(),
                sizeof(BufferRsrcDescriptor));

            base_addr = rsrc_desc.baseAddr;

            stride = rsrc_desc.addTidEn ? ((rsrc_desc.dataFmt << 14)
                + rsrc_desc.stride) : rsrc_desc.stride;

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    vaddr = base_addr + s_offset.rawData();
                    /**
                     * first we calculate the buffer's index and offset.
                     * these will be used for either linear or swizzled
                     * address calculations.
                     */
                    buf_idx = v_idx[lane] + (rsrc_desc.addTidEn ? lane : 0);

                    buf_off = v_off[lane] + inst_offset;

                    if (rsrc_desc.swizzleEn) {
                        Addr idx_stride = 8 << rsrc_desc.idxStride;
                        Addr elem_size = 2 << rsrc_desc.elemSize;
                        Addr idx_msb = buf_idx / idx_stride;
                        Addr idx_lsb = buf_idx % idx_stride;
                        Addr off_msb = buf_off / elem_size;
                        Addr off_lsb = buf_off % elem_size;

                        vaddr += ((idx_msb * stride + off_msb * elem_size)
                            * idx_stride + idx_lsb * elem_size + off_lsb);
                    } else {
                        vaddr += buf_off + stride * buf_idx;
                    }

                    gpuDynInst->addr.at(lane) = vaddr;
                }
            }
        }

        // first instruction DWORD
        InFmt_MUBUF instData;
        // second instruction DWORD
        InFmt_MUBUF_1 extData;
    }; // Inst_MUBUF

    class Inst_MTBUF : public GCN3GPUStaticInst
    {
      public:
        Inst_MTBUF(InFmt_MTBUF*, const std::string &opcode);
        ~Inst_MTBUF();

        int instSize() const override;

      protected:
        // first instruction DWORD
        InFmt_MTBUF instData;
        // second instruction DWORD
        InFmt_MTBUF_1 extData;

      private:
        bool hasSecondDword(InFmt_MTBUF *);
    }; // Inst_MTBUF

    class Inst_MIMG : public GCN3GPUStaticInst
    {
      public:
        Inst_MIMG(InFmt_MIMG*, const std::string &opcode);
        ~Inst_MIMG();

        int instSize() const override;

      protected:
        // first instruction DWORD
        InFmt_MIMG instData;
        // second instruction DWORD
        InFmt_MIMG_1 extData;
    }; // Inst_MIMG

    class Inst_EXP : public GCN3GPUStaticInst
    {
      public:
        Inst_EXP(InFmt_EXP*, const std::string &opcode);
        ~Inst_EXP();

        int instSize() const override;

      protected:
        // first instruction DWORD
        InFmt_EXP instData;
        // second instruction DWORD
        InFmt_EXP_1 extData;
    }; // Inst_EXP

    class Inst_FLAT : public GCN3GPUStaticInst
    {
      public:
        Inst_FLAT(InFmt_FLAT*, const std::string &opcode);
        ~Inst_FLAT();

        int instSize() const override;
        void generateDisassembly() override;

        bool isScalarRegister(int opIdx) override;
        bool isVectorRegister(int opIdx) override;
        int getRegisterIndex(int opIdx, GPUDynInstPtr gpuDynInst) override;

      protected:
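        /*
         * FLAT accesses take a single 64-bit vector address per lane.
         * calcAddr() below copies the per-lane addresses out of the
         * operand and then calls resolveFlatSegment(), which steers each
         * access to the segment it targets (e.g., global memory vs. the
         * group/LDS aperture).
         */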
        template<typename T>
        void
        initMemRead(GPUDynInstPtr gpuDynInst)
        {
            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr = gpuDynInst->addr[lane];

                    RequestPtr req = std::make_shared<Request>(vaddr,
                        sizeof(T), 0,
                        gpuDynInst->computeUnit()->masterId(), 0,
                        gpuDynInst->wfDynId);

                    gpuDynInst->setRequestFlags(req);
                    PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
                    pkt->dataStatic(&(reinterpret_cast<T*>(
                        gpuDynInst->d_data))[lane]);
                    gpuDynInst->computeUnit()
                        ->sendRequest(gpuDynInst, lane, pkt);
                }
            }
        }

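        /*
         * The N-dword variants below handle multi-dword FLAT accesses
         * (e.g., flat_load_dwordx2): each lane reads or writes N
         * consecutive dwords, stored contiguously per lane in d_data at
         * [lane * N].
         */
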
        template<int N>
        void
        initMemRead(GPUDynInstPtr gpuDynInst)
        {
            int req_size = N * sizeof(VecElemU32);
            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr = gpuDynInst->addr[lane];

                    RequestPtr req = std::make_shared<Request>(vaddr,
                        req_size, 0,
                        gpuDynInst->computeUnit()->masterId(), 0,
                        gpuDynInst->wfDynId);

                    gpuDynInst->setRequestFlags(req);
                    PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
                    pkt->dataStatic(&(reinterpret_cast<VecElemU32*>(
                        gpuDynInst->d_data))[lane * N]);
                    gpuDynInst->computeUnit()
                        ->sendRequest(gpuDynInst, lane, pkt);
                }
            }
        }

        template<typename T>
        void
        initMemWrite(GPUDynInstPtr gpuDynInst)
        {
            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr = gpuDynInst->addr[lane];

                    RequestPtr req = std::make_shared<Request>(vaddr,
                        sizeof(T), 0,
                        gpuDynInst->computeUnit()->masterId(),
                        0, gpuDynInst->wfDynId);

                    gpuDynInst->setRequestFlags(req);
                    PacketPtr pkt = new Packet(req, MemCmd::WriteReq);
                    pkt->dataStatic(&(reinterpret_cast<T*>(
                        gpuDynInst->d_data))[lane]);
                    gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane,
                                                           pkt);
                }
            }
        }

        template<int N>
        void
        initMemWrite(GPUDynInstPtr gpuDynInst)
        {
            int req_size = N * sizeof(VecElemU32);
            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr = gpuDynInst->addr[lane];

                    RequestPtr req = std::make_shared<Request>(vaddr,
                        req_size, 0,
                        gpuDynInst->computeUnit()->masterId(),
                        0, gpuDynInst->wfDynId);

                    gpuDynInst->setRequestFlags(req);
                    PacketPtr pkt = new Packet(req, MemCmd::WriteReq);
                    pkt->dataStatic(&(reinterpret_cast<VecElemU32*>(
                        gpuDynInst->d_data))[lane * N]);
                    gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane,
                                                           pkt);
                }
            }
        }

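        /*
         * For atomics, a_data holds the per-lane operand value and, for
         * compare-and-swap, x_data holds the compare value; the old
         * memory value is returned through d_data. The operation itself
         * is carried by the AtomicOpFunctor attached to the request.
         */
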
        template<typename T>
        void
        initAtomicAccess(GPUDynInstPtr gpuDynInst)
        {
            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr = gpuDynInst->addr[lane];

                    RequestPtr req = std::make_shared<Request>(vaddr,
                        sizeof(T), 0,
                        gpuDynInst->computeUnit()->masterId(), 0,
                        gpuDynInst->wfDynId,
                        gpuDynInst->makeAtomicOpFunctor<T>(
                            &(reinterpret_cast<T*>(gpuDynInst->a_data))[lane],
                            &(reinterpret_cast<T*>(
                                gpuDynInst->x_data))[lane]));

                    gpuDynInst->setRequestFlags(req);

                    PacketPtr pkt = new Packet(req, MemCmd::SwapReq);
                    pkt->dataStatic(&(reinterpret_cast<T*>(
                        gpuDynInst->d_data))[lane]);

                    gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane,
                                                           pkt);
                }
            }
        }

        void
        calcAddr(GPUDynInstPtr gpuDynInst, ConstVecOperandU64 &addr)
        {
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    gpuDynInst->addr.at(lane) = addr[lane];
                }
            }
            gpuDynInst->resolveFlatSegment(gpuDynInst->exec_mask);
        }

        // first instruction DWORD
        InFmt_FLAT instData;
        // second instruction DWORD
        InFmt_FLAT_1 extData;
    }; // Inst_FLAT

} // namespace Gcn3ISA

#endif // __ARCH_GCN3_INSTS_OP_ENCODINGS_HH__