/*
 * Copyright (c) 2016-2017 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Anthony Gutierrez
 */

#ifndef __ARCH_GCN3_INSTS_OP_ENCODINGS_HH__
#define __ARCH_GCN3_INSTS_OP_ENCODINGS_HH__

#include "arch/gcn3/gpu_decoder.hh"
#include "arch/gcn3/insts/gpu_static_inst.hh"
#include "arch/gcn3/operand.hh"
#include "debug/GPUExec.hh"
#include "mem/ruby/system/RubySystem.hh"

namespace Gcn3ISA
{
    // --- purely virtual instruction classes ---

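    /*
     * Each class below wraps one GCN3 instruction encoding. Every encoding
     * stores the raw first instruction DWORD (instData); formats that may
     * carry a trailing 32-bit literal constant or extended fields also
     * store a possible second DWORD (extData) and use hasSecondDword() so
     * instSize() can report the correct instruction length to the decoder.
     */
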
    class Inst_SOP2 : public GCN3GPUStaticInst
    {
      public:
        Inst_SOP2(InFmt_SOP2*, const std::string &opcode);

        int instSize() const override;
        void generateDisassembly() override;

        bool isScalarRegister(int opIdx) override;
        bool isVectorRegister(int opIdx) override;
        int getRegisterIndex(int opIdx, GPUDynInstPtr gpuDynInst) override;

      protected:
        // first instruction DWORD
        InFmt_SOP2 instData;
        // possible second DWORD
        InstFormat extData;
        uint32_t varSize;

      private:
        bool hasSecondDword(InFmt_SOP2 *);
    }; // Inst_SOP2

    class Inst_SOPK : public GCN3GPUStaticInst
    {
      public:
        Inst_SOPK(InFmt_SOPK*, const std::string &opcode);
        ~Inst_SOPK();

        int instSize() const override;
        void generateDisassembly() override;

        bool isScalarRegister(int opIdx) override;
        bool isVectorRegister(int opIdx) override;
        int getRegisterIndex(int opIdx, GPUDynInstPtr gpuDynInst) override;

      protected:
        // first instruction DWORD
        InFmt_SOPK instData;
    }; // Inst_SOPK

    class Inst_SOP1 : public GCN3GPUStaticInst
    {
      public:
        Inst_SOP1(InFmt_SOP1*, const std::string &opcode);
        ~Inst_SOP1();

        int instSize() const override;
        void generateDisassembly() override;

        bool isScalarRegister(int opIdx) override;
        bool isVectorRegister(int opIdx) override;
        int getRegisterIndex(int opIdx, GPUDynInstPtr gpuDynInst) override;

      protected:
        // first instruction DWORD
        InFmt_SOP1 instData;
        // possible second DWORD
        InstFormat extData;
        uint32_t varSize;

      private:
        bool hasSecondDword(InFmt_SOP1 *);
    }; // Inst_SOP1

    class Inst_SOPC : public GCN3GPUStaticInst
    {
      public:
        Inst_SOPC(InFmt_SOPC*, const std::string &opcode);
        ~Inst_SOPC();

        int instSize() const override;
        void generateDisassembly() override;

        bool isScalarRegister(int opIdx) override;
        bool isVectorRegister(int opIdx) override;
        int getRegisterIndex(int opIdx, GPUDynInstPtr gpuDynInst) override;

      protected:
        // first instruction DWORD
        InFmt_SOPC instData;
        // possible second DWORD
        InstFormat extData;
        uint32_t varSize;

      private:
        bool hasSecondDword(InFmt_SOPC *);
    }; // Inst_SOPC

    class Inst_SOPP : public GCN3GPUStaticInst
    {
      public:
        Inst_SOPP(InFmt_SOPP*, const std::string &opcode);
        ~Inst_SOPP();

        int instSize() const override;
        void generateDisassembly() override;

        bool isScalarRegister(int opIdx) override;
        bool isVectorRegister(int opIdx) override;
        int getRegisterIndex(int opIdx, GPUDynInstPtr gpuDynInst) override;

      protected:
        // first instruction DWORD
        InFmt_SOPP instData;
    }; // Inst_SOPP

    class Inst_SMEM : public GCN3GPUStaticInst
    {
      public:
        Inst_SMEM(InFmt_SMEM*, const std::string &opcode);
        ~Inst_SMEM();

        int instSize() const override;
        void generateDisassembly() override;

        bool isScalarRegister(int opIdx) override;
        bool isVectorRegister(int opIdx) override;
        int getRegisterIndex(int opIdx, GPUDynInstPtr gpuDynInst) override;

      protected:
        /**
         * initiate a memory read access for N dwords
         */
        template<int N>
        void
        initMemRead(GPUDynInstPtr gpuDynInst)
        {
            int block_size = gpuDynInst->computeUnit()->cacheLineSize();
            int req_size = N * sizeof(ScalarRegU32);
            Addr vaddr = gpuDynInst->scalarAddr;

            /**
             * the base address of the cache line where the last byte of
             * the request will be stored.
             */
            Addr split_addr = roundDown(vaddr + req_size - 1, block_size);

            assert(split_addr <= vaddr || split_addr - vaddr < block_size);

            /**
             * if the base cache line address of the last byte is greater
             * than the address of the first byte then we have a
             * misaligned access.
             */
            bool misaligned_acc = split_addr > vaddr;

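            /*
             * Worked example (assuming a 64-byte cache line): an N=2
             * (8-byte) read at vaddr 0x3c ends at byte 0x43, so
             * split_addr = roundDown(0x43, 64) = 0x40 > 0x3c, and the
             * access is split into [0x3c, 0x40) and [0x40, 0x44).
             */
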
            RequestPtr req = std::make_shared<Request>(vaddr, req_size, 0,
                gpuDynInst->computeUnit()->masterId(), 0,
                gpuDynInst->wfDynId);

            if (misaligned_acc) {
                RequestPtr req1, req2;
                req->splitOnVaddr(split_addr, req1, req2);
                gpuDynInst->numScalarReqs = 2;
                gpuDynInst->setRequestFlags(req1);
                gpuDynInst->setRequestFlags(req2);
                PacketPtr pkt1 = new Packet(req1, MemCmd::ReadReq);
                PacketPtr pkt2 = new Packet(req2, MemCmd::ReadReq);
                pkt1->dataStatic(gpuDynInst->scalar_data);
                pkt2->dataStatic(gpuDynInst->scalar_data + req1->getSize());
                gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt1);
                gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt2);
            } else {
                gpuDynInst->numScalarReqs = 1;
                gpuDynInst->setRequestFlags(req);
                PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
                pkt->dataStatic(gpuDynInst->scalar_data);
                gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt);
            }
        }

        /**
         * initiate a memory write access for N dwords
         */
        template<int N>
        void
        initMemWrite(GPUDynInstPtr gpuDynInst)
        {
            int block_size = gpuDynInst->computeUnit()->cacheLineSize();
            int req_size = N * sizeof(ScalarRegU32);
            Addr vaddr = gpuDynInst->scalarAddr;

            /**
             * the base address of the cache line where the last byte of
             * the request will be stored.
             */
            Addr split_addr = roundDown(vaddr + req_size - 1, block_size);

            assert(split_addr <= vaddr || split_addr - vaddr < block_size);

            /**
             * if the base cache line address of the last byte is greater
             * than the address of the first byte then we have a
             * misaligned access.
             */
            bool misaligned_acc = split_addr > vaddr;

            RequestPtr req = std::make_shared<Request>(vaddr, req_size, 0,
                gpuDynInst->computeUnit()->masterId(), 0,
                gpuDynInst->wfDynId);

            if (misaligned_acc) {
                RequestPtr req1, req2;
                req->splitOnVaddr(split_addr, req1, req2);
                gpuDynInst->numScalarReqs = 2;
                gpuDynInst->setRequestFlags(req1);
                gpuDynInst->setRequestFlags(req2);
                PacketPtr pkt1 = new Packet(req1, MemCmd::WriteReq);
                PacketPtr pkt2 = new Packet(req2, MemCmd::WriteReq);
                pkt1->dataStatic(gpuDynInst->scalar_data);
                pkt2->dataStatic(gpuDynInst->scalar_data + req1->getSize());
                gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt1);
                gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt2);
            } else {
                gpuDynInst->numScalarReqs = 1;
                gpuDynInst->setRequestFlags(req);
                PacketPtr pkt = new Packet(req, MemCmd::WriteReq);
                pkt->dataStatic(gpuDynInst->scalar_data);
                gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt);
            }
        }

        void
        calcAddr(GPUDynInstPtr gpuDynInst, ConstScalarOperandU64 &addr,
                 ScalarRegU32 offset)
        {
            Addr vaddr = addr.rawData();
            vaddr += offset;
            gpuDynInst->scalarAddr = vaddr;
        }

        // first instruction DWORD
        InFmt_SMEM instData;
        // second instruction DWORD
        InFmt_SMEM_1 extData;
    }; // Inst_SMEM

    class Inst_VOP2 : public GCN3GPUStaticInst
    {
      public:
        Inst_VOP2(InFmt_VOP2*, const std::string &opcode);
        ~Inst_VOP2();

        int instSize() const override;
        void generateDisassembly() override;

        bool isScalarRegister(int opIdx) override;
        bool isVectorRegister(int opIdx) override;
        int getRegisterIndex(int opIdx, GPUDynInstPtr gpuDynInst) override;

      protected:
        // first instruction DWORD
        InFmt_VOP2 instData;
        // possible second DWORD
        InstFormat extData;
        uint32_t varSize;

      private:
        bool hasSecondDword(InFmt_VOP2 *);
    }; // Inst_VOP2

    class Inst_VOP1 : public GCN3GPUStaticInst
    {
      public:
        Inst_VOP1(InFmt_VOP1*, const std::string &opcode);
        ~Inst_VOP1();

        int instSize() const override;
        void generateDisassembly() override;

        bool isScalarRegister(int opIdx) override;
        bool isVectorRegister(int opIdx) override;
        int getRegisterIndex(int opIdx, GPUDynInstPtr gpuDynInst) override;

      protected:
        // first instruction DWORD
        InFmt_VOP1 instData;
        // possible second DWORD
        InstFormat extData;
        uint32_t varSize;

      private:
        bool hasSecondDword(InFmt_VOP1 *);
    }; // Inst_VOP1

    class Inst_VOPC : public GCN3GPUStaticInst
    {
      public:
        Inst_VOPC(InFmt_VOPC*, const std::string &opcode);
        ~Inst_VOPC();

        int instSize() const override;
        void generateDisassembly() override;

        bool isScalarRegister(int opIdx) override;
        bool isVectorRegister(int opIdx) override;
        int getRegisterIndex(int opIdx, GPUDynInstPtr gpuDynInst) override;

      protected:
        // first instruction DWORD
        InFmt_VOPC instData;
        // possible second DWORD
        InstFormat extData;
        uint32_t varSize;

      private:
        bool hasSecondDword(InFmt_VOPC *);
    }; // Inst_VOPC

    class Inst_VINTRP : public GCN3GPUStaticInst
    {
      public:
        Inst_VINTRP(InFmt_VINTRP*, const std::string &opcode);
        ~Inst_VINTRP();

        int instSize() const override;

      protected:
        // first instruction DWORD
        InFmt_VINTRP instData;
    }; // Inst_VINTRP

    class Inst_VOP3 : public GCN3GPUStaticInst
    {
      public:
        Inst_VOP3(InFmt_VOP3*, const std::string &opcode, bool sgpr_dst);
        ~Inst_VOP3();

        int instSize() const override;
        void generateDisassembly() override;

        bool isScalarRegister(int opIdx) override;
        bool isVectorRegister(int opIdx) override;
        int getRegisterIndex(int opIdx, GPUDynInstPtr gpuDynInst) override;

      protected:
        // first instruction DWORD
        InFmt_VOP3 instData;
        // second instruction DWORD
        InFmt_VOP3_1 extData;

      private:
        bool hasSecondDword(InFmt_VOP3 *);
        /**
         * The v_cmp and readlane instructions in the VOP3 encoding are
         * unique because they are the only instructions that use the
         * VDST field to specify a scalar register destination. For
         * VOP3::V_CMP insts VDST specifies the arbitrary SGPR pair used
         * to write VCC; for V_READLANE VDST specifies the SGPR that
         * receives the value of the selected lane of the source VGPR.
         */
        const bool sgprDst;
    }; // Inst_VOP3

    class Inst_VOP3_SDST_ENC : public GCN3GPUStaticInst
    {
      public:
        Inst_VOP3_SDST_ENC(InFmt_VOP3_SDST_ENC*, const std::string &opcode);
        ~Inst_VOP3_SDST_ENC();

        int instSize() const override;
        void generateDisassembly() override;

        bool isScalarRegister(int opIdx) override;
        bool isVectorRegister(int opIdx) override;
        int getRegisterIndex(int opIdx, GPUDynInstPtr gpuDynInst) override;

      protected:
        // first instruction DWORD
        InFmt_VOP3_SDST_ENC instData;
        // second instruction DWORD
        InFmt_VOP3_1 extData;

      private:
        bool hasSecondDword(InFmt_VOP3_SDST_ENC *);
    }; // Inst_VOP3_SDST_ENC

    class Inst_DS : public GCN3GPUStaticInst
    {
      public:
        Inst_DS(InFmt_DS*, const std::string &opcode);
        ~Inst_DS();

        int instSize() const override;
        void generateDisassembly() override;

        bool isScalarRegister(int opIdx) override;
        bool isVectorRegister(int opIdx) override;
        int getRegisterIndex(int opIdx, GPUDynInstPtr gpuDynInst) override;

      protected:
        template<typename T>
        void
        initMemRead(GPUDynInstPtr gpuDynInst, Addr offset)
        {
            Wavefront *wf = gpuDynInst->wavefront();

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr = gpuDynInst->addr[lane] + offset;

                    (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]
                        = wf->ldsChunk->read<T>(vaddr);
                }
            }
        }

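        /*
         * The "dual" variants below service the two-address DS
         * instructions (e.g., ds_read2_b32 / ds_write2_b32), which touch
         * two LDS locations per lane; the per-lane results are
         * interleaved in d_data at [lane * 2] and [lane * 2 + 1].
         */
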
        template<typename T>
        void
        initDualMemRead(GPUDynInstPtr gpuDynInst, Addr offset0, Addr offset1)
        {
            Wavefront *wf = gpuDynInst->wavefront();

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr0 = gpuDynInst->addr[lane] + offset0;
                    Addr vaddr1 = gpuDynInst->addr[lane] + offset1;

                    (reinterpret_cast<T*>(gpuDynInst->d_data))[lane * 2]
                        = wf->ldsChunk->read<T>(vaddr0);
                    (reinterpret_cast<T*>(gpuDynInst->d_data))[lane * 2 + 1]
                        = wf->ldsChunk->read<T>(vaddr1);
                }
            }
        }

        template<typename T>
        void
        initMemWrite(GPUDynInstPtr gpuDynInst, Addr offset)
        {
            Wavefront *wf = gpuDynInst->wavefront();

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr = gpuDynInst->addr[lane] + offset;
                    wf->ldsChunk->write<T>(vaddr,
                        (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]);
                }
            }
        }

        template<typename T>
        void
        initDualMemWrite(GPUDynInstPtr gpuDynInst, Addr offset0, Addr offset1)
        {
            Wavefront *wf = gpuDynInst->wavefront();

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr0 = gpuDynInst->addr[lane] + offset0;
                    Addr vaddr1 = gpuDynInst->addr[lane] + offset1;
                    wf->ldsChunk->write<T>(vaddr0, (reinterpret_cast<T*>(
                        gpuDynInst->d_data))[lane * 2]);
                    wf->ldsChunk->write<T>(vaddr1, (reinterpret_cast<T*>(
                        gpuDynInst->d_data))[lane * 2 + 1]);
                }
            }
        }

        void
        calcAddr(GPUDynInstPtr gpuDynInst, ConstVecOperandU32 &addr)
        {
            Wavefront *wf = gpuDynInst->wavefront();

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (wf->execMask(lane)) {
                    gpuDynInst->addr.at(lane) = (Addr)addr[lane];
                }
            }
        }

        // first instruction DWORD
        InFmt_DS instData;
        // second instruction DWORD
        InFmt_DS_1 extData;
    }; // Inst_DS

    class Inst_MUBUF : public GCN3GPUStaticInst
    {
      public:
        Inst_MUBUF(InFmt_MUBUF*, const std::string &opcode);
        ~Inst_MUBUF();

        int instSize() const override;
        void generateDisassembly() override;

        bool isScalarRegister(int opIdx) override;
        bool isVectorRegister(int opIdx) override;
        int getRegisterIndex(int opIdx, GPUDynInstPtr gpuDynInst) override;

      protected:
        struct BufferRsrcDescriptor
        {
            uint64_t baseAddr : 48;
            uint32_t stride : 14;
            uint32_t cacheSwizzle : 1;
            uint32_t swizzleEn : 1;
            uint32_t numRecords : 32;
            uint32_t dstSelX : 3;
            uint32_t dstSelY : 3;
            uint32_t dstSelZ : 3;
            uint32_t dstSelW : 3;
            uint32_t numFmt : 3;
            uint32_t dataFmt : 4;
            uint32_t elemSize : 2;
            uint32_t idxStride : 2;
            uint32_t addTidEn : 1;
            uint32_t atc : 1;
            uint32_t hashEn : 1;
            uint32_t heap : 1;
            uint32_t mType : 3;
            uint32_t type : 2;
        };

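        /*
         * This bit-field layout mirrors the 128-bit GCN3 buffer resource
         * descriptor (V#) that the kernel supplies through four
         * consecutive SGPRs; calcAddr() below copies the raw SGPR data
         * into it to decode the base, stride, and swizzle parameters.
         */
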
        template<typename T>
        void
        initMemRead(GPUDynInstPtr gpuDynInst)
        {
            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr = gpuDynInst->addr[lane];

                    RequestPtr req = std::make_shared<Request>(vaddr,
                        sizeof(T), 0,
                        gpuDynInst->computeUnit()->masterId(), 0,
                        gpuDynInst->wfDynId);

                    gpuDynInst->setRequestFlags(req);

                    PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
                    pkt->dataStatic(&(reinterpret_cast<T*>(
                        gpuDynInst->d_data))[lane]);

                    gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane,
                                                           pkt);
                }
            }
        }

        template<typename T>
        void
        initMemWrite(GPUDynInstPtr gpuDynInst)
        {
            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr = gpuDynInst->addr[lane];

                    RequestPtr req = std::make_shared<Request>(vaddr,
                        sizeof(T), 0,
                        gpuDynInst->computeUnit()->masterId(),
                        0, gpuDynInst->wfDynId);

                    gpuDynInst->setRequestFlags(req);
                    PacketPtr pkt = new Packet(req, MemCmd::WriteReq);
                    pkt->dataStatic(&(reinterpret_cast<T*>(
                        gpuDynInst->d_data))[lane]);
                    gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane,
                                                           pkt);
                }
            }
        }

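        /*
         * Builds a zero-size request that the compute unit treats as a
         * wavefront-wide memory fence rather than a data access; the
         * status vector is a single bit because only one such "request"
         * is outstanding per fence instruction.
         */
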
        void
        injectGlobalMemFence(GPUDynInstPtr gpuDynInst)
        {
            // create request and set flags
            gpuDynInst->statusBitVector = VectorMask(1);
            RequestPtr req = std::make_shared<Request>(0, 0, 0,
                gpuDynInst->computeUnit()->
                masterId(), 0,
                gpuDynInst->wfDynId);
            gpuDynInst->setRequestFlags(req);
            gpuDynInst->computeUnit()->
                injectGlobalMemFence(gpuDynInst, false, req);
        }

        /**
         * MUBUF instructions calculate their addresses as follows:
         *
         * index  = (IDXEN ? vgpr_idx : 0) + (const_add_tid_en ? TID : 0)
         * offset = (OFFEN ? vgpr_off : 0) + inst_off
         *
         * / ====================== LINEAR ADDRESSING ====================== /
         * VADDR = base + sgpr_off + offset + stride * index
         *
         * / ===================== SWIZZLED ADDRESSING ===================== /
         * index_msb = index / const_index_stride
         * index_lsb = index % const_index_stride
         * offset_msb = offset / const_element_size
         * offset_lsb = offset % const_element_size
         * buffer_offset = ((index_msb * stride + offset_msb *
         *                 const_element_size) * const_index_stride +
         *                 index_lsb * const_element_size + offset_lsb)
         *
         * VADDR = base + sgpr_off + buffer_offset
         */

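        /*
         * Swizzled example: with idxStride = 0 (const_index_stride = 8),
         * elemSize = 1 (const_element_size = 4), stride = 16, index = 10,
         * and offset = 6: index_msb = 1, index_lsb = 2, offset_msb = 1,
         * offset_lsb = 2, so buffer_offset =
         * ((1 * 16 + 1 * 4) * 8) + (2 * 4) + 2 = 170.
         */
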
        template<typename VOFF, typename VIDX, typename SRSRC, typename SOFF>
        void
        calcAddr(GPUDynInstPtr gpuDynInst, VOFF v_off, VIDX v_idx,
            SRSRC s_rsrc_desc, SOFF s_offset, int inst_offset)
        {
            Addr vaddr = 0;
            Addr base_addr = 0;
            Addr stride = 0;
            Addr buf_idx = 0;
            Addr buf_off = 0;
            BufferRsrcDescriptor rsrc_desc;

            std::memcpy((void*)&rsrc_desc, s_rsrc_desc.rawDataPtr(),
                sizeof(BufferRsrcDescriptor));

            base_addr = rsrc_desc.baseAddr;

            stride = rsrc_desc.addTidEn ? ((rsrc_desc.dataFmt << 14)
                + rsrc_desc.stride) : rsrc_desc.stride;

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    vaddr = base_addr + s_offset.rawData();
                    /**
                     * first we calculate the buffer's index and offset.
                     * these will be used for either linear or swizzled
                     * address calculations.
                     */
                    buf_idx = v_idx[lane] + (rsrc_desc.addTidEn ? lane : 0);

                    buf_off = v_off[lane] + inst_offset;

                    if (rsrc_desc.swizzleEn) {
                        Addr idx_stride = 8 << rsrc_desc.idxStride;
                        Addr elem_size = 2 << rsrc_desc.elemSize;
                        Addr idx_msb = buf_idx / idx_stride;
                        Addr idx_lsb = buf_idx % idx_stride;
                        Addr off_msb = buf_off / elem_size;
                        Addr off_lsb = buf_off % elem_size;

                        vaddr += ((idx_msb * stride + off_msb * elem_size)
                            * idx_stride + idx_lsb * elem_size + off_lsb);
                    } else {
                        vaddr += buf_off + stride * buf_idx;
                    }

                    gpuDynInst->addr.at(lane) = vaddr;
                }
            }
        }

        // first instruction DWORD
        InFmt_MUBUF instData;
        // second instruction DWORD
        InFmt_MUBUF_1 extData;
    }; // Inst_MUBUF

    class Inst_MTBUF : public GCN3GPUStaticInst
    {
      public:
        Inst_MTBUF(InFmt_MTBUF*, const std::string &opcode);
        ~Inst_MTBUF();

        int instSize() const override;

      protected:
        // first instruction DWORD
        InFmt_MTBUF instData;
        // second instruction DWORD
        InFmt_MTBUF_1 extData;

      private:
        bool hasSecondDword(InFmt_MTBUF *);
    }; // Inst_MTBUF

    class Inst_MIMG : public GCN3GPUStaticInst
    {
      public:
        Inst_MIMG(InFmt_MIMG*, const std::string &opcode);
        ~Inst_MIMG();

        int instSize() const override;

      protected:
        // first instruction DWORD
        InFmt_MIMG instData;
        // second instruction DWORD
        InFmt_MIMG_1 extData;
    }; // Inst_MIMG

    class Inst_EXP : public GCN3GPUStaticInst
    {
      public:
        Inst_EXP(InFmt_EXP*, const std::string &opcode);
        ~Inst_EXP();

        int instSize() const override;

      protected:
        // first instruction DWORD
        InFmt_EXP instData;
        // second instruction DWORD
        InFmt_EXP_1 extData;
    }; // Inst_EXP

    class Inst_FLAT : public GCN3GPUStaticInst
    {
      public:
        Inst_FLAT(InFmt_FLAT*, const std::string &opcode);
        ~Inst_FLAT();

        int instSize() const override;
        void generateDisassembly() override;

        bool isScalarRegister(int opIdx) override;
        bool isVectorRegister(int opIdx) override;
        int getRegisterIndex(int opIdx, GPUDynInstPtr gpuDynInst) override;

      protected:
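        /*
         * FLAT accesses take a single 64-bit vector address per lane.
         * calcAddr() below copies the per-lane addresses out of the
         * operand and then calls resolveFlatSegment(), which steers each
         * access to the segment it targets (e.g., global memory vs. the
         * group/LDS aperture).
         */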
        template<typename T>
        void
        initMemRead(GPUDynInstPtr gpuDynInst)
        {
            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr = gpuDynInst->addr[lane];

                    RequestPtr req = std::make_shared<Request>(vaddr,
                        sizeof(T), 0,
                        gpuDynInst->computeUnit()->masterId(), 0,
                        gpuDynInst->wfDynId);

                    gpuDynInst->setRequestFlags(req);
                    PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
                    pkt->dataStatic(&(reinterpret_cast<T*>(
                        gpuDynInst->d_data))[lane]);
                    gpuDynInst->computeUnit()
                        ->sendRequest(gpuDynInst, lane, pkt);
                }
            }
        }

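        /*
         * The N-dword variants below handle multi-dword FLAT accesses
         * (e.g., flat_load_dwordx2): each lane reads or writes N
         * consecutive dwords, stored contiguously per lane in d_data at
         * [lane * N].
         */
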
        template<int N>
        void
        initMemRead(GPUDynInstPtr gpuDynInst)
        {
            int req_size = N * sizeof(VecElemU32);
            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr = gpuDynInst->addr[lane];

                    RequestPtr req = std::make_shared<Request>(vaddr,
                        req_size, 0,
                        gpuDynInst->computeUnit()->masterId(), 0,
                        gpuDynInst->wfDynId);

                    gpuDynInst->setRequestFlags(req);
                    PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
                    pkt->dataStatic(&(reinterpret_cast<VecElemU32*>(
                        gpuDynInst->d_data))[lane * N]);
                    gpuDynInst->computeUnit()
                        ->sendRequest(gpuDynInst, lane, pkt);
                }
            }
        }

        template<typename T>
        void
        initMemWrite(GPUDynInstPtr gpuDynInst)
        {
            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr = gpuDynInst->addr[lane];

                    RequestPtr req = std::make_shared<Request>(vaddr,
                        sizeof(T), 0,
                        gpuDynInst->computeUnit()->masterId(),
                        0, gpuDynInst->wfDynId);

                    gpuDynInst->setRequestFlags(req);
                    PacketPtr pkt = new Packet(req, MemCmd::WriteReq);
                    pkt->dataStatic(&(reinterpret_cast<T*>(
                        gpuDynInst->d_data))[lane]);
                    gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane,
                                                           pkt);
                }
            }
        }

        template<int N>
        void
        initMemWrite(GPUDynInstPtr gpuDynInst)
        {
            int req_size = N * sizeof(VecElemU32);
            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr = gpuDynInst->addr[lane];

                    RequestPtr req = std::make_shared<Request>(vaddr,
                        req_size, 0,
                        gpuDynInst->computeUnit()->masterId(),
                        0, gpuDynInst->wfDynId);

                    gpuDynInst->setRequestFlags(req);
                    PacketPtr pkt = new Packet(req, MemCmd::WriteReq);
                    pkt->dataStatic(&(reinterpret_cast<VecElemU32*>(
                        gpuDynInst->d_data))[lane * N]);
                    gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane,
                                                           pkt);
                }
            }
        }

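        /*
         * For atomics, a_data holds the per-lane operand value and, for
         * compare-and-swap, x_data holds the compare value; the old
         * memory value is returned through d_data. The operation itself
         * is carried by the AtomicOpFunctor attached to the request.
         */
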
        template<typename T>
        void
        initAtomicAccess(GPUDynInstPtr gpuDynInst)
        {
            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;

            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    Addr vaddr = gpuDynInst->addr[lane];

                    RequestPtr req = std::make_shared<Request>(vaddr,
                        sizeof(T), 0,
                        gpuDynInst->computeUnit()->masterId(), 0,
                        gpuDynInst->wfDynId,
                        gpuDynInst->makeAtomicOpFunctor<T>(
                            &(reinterpret_cast<T*>(gpuDynInst->a_data))[lane],
                            &(reinterpret_cast<T*>(
                                gpuDynInst->x_data))[lane]));

                    gpuDynInst->setRequestFlags(req);

                    PacketPtr pkt = new Packet(req, MemCmd::SwapReq);
                    pkt->dataStatic(&(reinterpret_cast<T*>(
                        gpuDynInst->d_data))[lane]);

                    gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane,
                                                           pkt);
                }
            }
        }

        void
        calcAddr(GPUDynInstPtr gpuDynInst, ConstVecOperandU64 &addr)
        {
            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                if (gpuDynInst->exec_mask[lane]) {
                    gpuDynInst->addr.at(lane) = addr[lane];
                }
            }
            gpuDynInst->resolveFlatSegment(gpuDynInst->exec_mask);
        }

        // first instruction DWORD
        InFmt_FLAT instData;
        // second instruction DWORD
        InFmt_FLAT_1 extData;
    }; // Inst_FLAT

} // namespace Gcn3ISA

#endif // __ARCH_GCN3_INSTS_OP_ENCODINGS_HH__