arch-gcn3: fix bug with SDWA support

author Matt Sinclair <Matthew.Sinclair@amd.com>

Fri, 22 Jun 2018 06:42:39 +0000 (02:42 -0400)

committer Anthony Gutierrez <anthony.gutierrez@amd.com>

Fri, 19 Jun 2020 20:41:59 +0000 (20:41 +0000)
author Matt Sinclair <Matthew.Sinclair@amd.com>
Fri, 22 Jun 2018 06:42:39 +0000 (02:42 -0400)
committer Anthony Gutierrez <anthony.gutierrez@amd.com>
Fri, 19 Jun 2020 20:41:59 +0000 (20:41 +0000)
diff --git a/src/arch/gcn3/insts/inst_util.hh b/src/arch/gcn3/insts/inst_util.hh

index a3b2f4a51bc63a77279086b586d6b893c4037c43..292e3ba301e322e1e299b50e8c636d86174bb5ba 100644 (file)
--- a/src/arch/gcn3/insts/inst_util.hh
+++ b/src/arch/gcn3/insts/inst_util.hh
@@ -547,8 +547,8 @@ namespace Gcn3ISA
       * operations are done on it.
       */
      template<typename T>
-    T sdwaInstSrcImpl_helper(T currOperVal, T origOperVal, SDWASelVals sel,
-                             bool signExt)
+    T sdwaInstSrcImpl_helper(T currOperVal, const T origOperVal,
+                             const SDWASelVals sel, const bool signExt)
      {
          // local variables
          int first_bit = 0, last_bit = 0;
@@ -635,16 +635,14 @@ namespace Gcn3ISA
       *   2.  if sign extend is set, then sign extend the value
       */
      template<typename T>
-    void sdwaInstSrcImpl(T & currOper, T & origCurrOper, SDWASelVals sel,
-                         bool signExt)
+    void sdwaInstSrcImpl(T & currOper, T & origCurrOper,
+                         const SDWASelVals sel, const bool signExt)
      {
          // iterate over all lanes, setting appropriate, selected value
-        currOper.read();
-        origCurrOper.read();
          for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
              currOper[lane] = sdwaInstSrcImpl_helper(currOper[lane],
-                                                   origCurrOper[lane], sel,
-                                                   signExt);
+                                                    origCurrOper[lane], sel,
+                                                    signExt);
          }
      }
  
@@ -656,8 +654,9 @@ namespace Gcn3ISA
       * operations are done on it.
       */
      template<typename T>
-    T sdwaInstDstImpl_helper(T currDstVal, T origDstVal, bool clamp,
-                             SDWASelVals sel, SDWADstVals unusedBits_format)
+    T sdwaInstDstImpl_helper(T currDstVal, const T origDstVal,
+                             const bool clamp, const SDWASelVals sel,
+                             const SDWADstVals unusedBits_format)
      {
          // local variables
          int first_bit = 0, last_bit = 0;
@@ -756,12 +755,11 @@ namespace Gcn3ISA
       *       2 (SDWA_UNUSED_PRESERVE): select data[31:0]
       */
      template<typename T>
-    void sdwaInstDstImpl(T & dstOper, T & origDstOper, bool clamp,
-                         SDWASelVals sel, SDWADstVals unusedBits_format)
+    void sdwaInstDstImpl(T & dstOper, T & origDstOper, const bool clamp,
+                         const SDWASelVals sel,
+                         const SDWADstVals unusedBits_format)
      {
          // iterate over all lanes, setting appropriate, selected value
-        dstOper.read();
-        origDstOper.read();
          for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
              dstOper[lane] = sdwaInstDstImpl_helper(dstOper[lane],
                                                     origDstOper[lane], clamp,
@@ -779,8 +777,9 @@ namespace Gcn3ISA
       */
      template<typename T>
      void processSDWA_src_helper(T & currSrc, T & origCurrSrc,
-                                SDWASelVals src_sel, bool src_signExt,
-                                bool src_abs, bool src_neg)
+                                const SDWASelVals src_sel,
+                                const bool src_signExt, const bool src_abs,
+                                const bool src_neg)
      {
          /**
           * STEP 1: check if the absolute value (ABS) or negation (NEG) tags
@@ -812,14 +811,13 @@ namespace Gcn3ISA
       * processSDWA_src is called before the math.
       */
      template<typename T>
-    void processSDWA_src(GPUDynInstPtr gpuDynInst, InFmt_VOP_SDWA sdwaInst,
-                         T & src0, T & origSrc0)
+    void processSDWA_src(InFmt_VOP_SDWA sdwaInst, T & src0, T & origSrc0)
      {
          // local variables
-        SDWASelVals src0_sel = (SDWASelVals)sdwaInst.SRC0_SEL;
-        bool src0_signExt = sdwaInst.SRC0_SEXT;
-        bool src0_neg = sdwaInst.SRC0_NEG;
-        bool src0_abs = sdwaInst.SRC0_ABS;
+        const SDWASelVals src0_sel = (SDWASelVals)sdwaInst.SRC0_SEL;
+        const bool src0_signExt = sdwaInst.SRC0_SEXT;
+        const bool src0_neg = sdwaInst.SRC0_NEG;
+        const bool src0_abs = sdwaInst.SRC0_ABS;
  
          // NOTE: difference between VOP1 and VOP2/VOPC is that there is no src1
          // operand.  So ensure that SRC1 fields are not set, then call helper
@@ -841,18 +839,18 @@ namespace Gcn3ISA
       * called before the math.
       */
      template<typename T>
-    void processSDWA_src(GPUDynInstPtr gpuDynInst, InFmt_VOP_SDWA sdwaInst,
-                         T & src0, T & origSrc0, T & src1, T & origSrc1)
+    void processSDWA_src(InFmt_VOP_SDWA sdwaInst, T & src0, T & origSrc0,
+                         T & src1, T & origSrc1)
      {
          // local variables
-        SDWASelVals src0_sel = (SDWASelVals)sdwaInst.SRC0_SEL;
-        bool src0_signExt = sdwaInst.SRC0_SEXT;
-        bool src0_neg = sdwaInst.SRC0_NEG;
-        bool src0_abs = sdwaInst.SRC0_ABS;
-        SDWASelVals src1_sel = (SDWASelVals)sdwaInst.SRC1_SEL;
-        bool src1_signExt = sdwaInst.SRC1_SEXT;
-        bool src1_neg = sdwaInst.SRC1_NEG;
-        bool src1_abs = sdwaInst.SRC1_ABS;
+        const SDWASelVals src0_sel = (SDWASelVals)sdwaInst.SRC0_SEL;
+        const bool src0_signExt = sdwaInst.SRC0_SEXT;
+        const bool src0_neg = sdwaInst.SRC0_NEG;
+        const bool src0_abs = sdwaInst.SRC0_ABS;
+        const SDWASelVals src1_sel = (SDWASelVals)sdwaInst.SRC1_SEL;
+        const bool src1_signExt = sdwaInst.SRC1_SEXT;
+        const bool src1_neg = sdwaInst.SRC1_NEG;
+        const bool src1_abs = sdwaInst.SRC1_ABS;
  
          processSDWA_src_helper(src0, origSrc0, src0_sel, src0_signExt,
                                 src0_abs, src0_neg);
@@ -869,13 +867,13 @@ namespace Gcn3ISA
       * processSDWA_src is called before the math.
       */
      template<typename T>
-    void processSDWA_dst(GPUDynInstPtr gpuDynInst, InFmt_VOP_SDWA sdwaInst,
-                         T & dst, T & origDst)
+    void processSDWA_dst(InFmt_VOP_SDWA sdwaInst, T & dst, T & origDst)
      {
          // local variables
-        SDWADstVals dst_unusedBits_format = (SDWADstVals)sdwaInst.DST_UNUSED;
-        SDWASelVals dst_sel = (SDWASelVals)sdwaInst.DST_SEL;
-        bool clamp = sdwaInst.CLAMP;
+        const SDWADstVals dst_unusedBits_format =
+            (SDWADstVals)sdwaInst.DST_UNUSED;
+        const SDWASelVals dst_sel = (SDWASelVals)sdwaInst.DST_SEL;
+        const bool clamp = sdwaInst.CLAMP;
  
          /**
           * STEP 1: select the appropriate bits for dst and pad/sign-extend as
diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc

index bd6e4f44e7afa1c68eb04061e71c1f03c605fd59..2789f3e7f8fd592bdb9e23a5ac45a6480ceb507f 100644 (file)
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -5543,12 +5543,20 @@ namespace Gcn3ISA
          VecOperandU32 src1(gpuDynInst, instData.VSRC1);
          VecOperandU32 vdst(gpuDynInst, instData.VDST);
  
+        src0.readSrc();
+        src1.read();
+
          if (isSDWAInst()) {
              VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
-            // use copies of original src0 and src1 during selecting
+            // use copies of original src0, src1, and dest during selecting
              VecOperandU32 origSrc0_sdwa(gpuDynInst,
                                          extData.iFmt_VOP_SDWA.SRC0);
              VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1);
+            VecOperandU32 origVdst(gpuDynInst, instData.VDST);
+
+            src0_sdwa.read();
+            origSrc0_sdwa.read();
+            origSrc1.read();
  
              DPRINTF(GCN3, "Handling V_MUL_U32_U24 SRC SDWA. SRC0: register "
                      "v[%d], DST_SEL: %d, DST_UNUSED: %d, CLAMP: %d, SRC0_SEL: "
@@ -5566,27 +5574,27 @@ namespace Gcn3ISA
                      extData.iFmt_VOP_SDWA.SRC1_NEG,
                      extData.iFmt_VOP_SDWA.SRC1_ABS);
  
-            processSDWA_src(gpuDynInst, extData.iFmt_VOP_SDWA, src0_sdwa,
-                            origSrc0_sdwa, src1, origSrc1);
-        }
+            processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa,
+                            src1, origSrc1);
  
-        src0.readSrc();
-        src1.read();
+            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                if (wf->execMask(lane)) {
+                    vdst[lane] = bits(src0_sdwa[lane], 23, 0) *
+                                 bits(src1[lane], 23, 0);
+                    origVdst[lane] = vdst[lane]; // keep copy consistent
+                }
+            }
  
-        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
-            if (wf->execMask(lane)) {
-                vdst[lane] = bits(src0[lane], 23, 0) * bits(src1[lane], 23, 0);
+            processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst);
+        } else {
+            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                if (wf->execMask(lane)) {
+                    vdst[lane] = bits(src0[lane], 23, 0) *
+                                 bits(src1[lane], 23, 0);
+                }
              }
          }
  
-        // SDWA instructions also may select bytes/words of dest register
-        // (vdst)
-        if (isSDWAInst()) {
-            // use extra copy of dest to retain original values
-            VecOperandU32 vdst_orig(gpuDynInst, instData.VDST);
-            processSDWA_dst(gpuDynInst, extData.iFmt_VOP_SDWA, vdst,
-                            vdst_orig);
-        }
  
          vdst.write();
      }
@@ -5895,12 +5903,20 @@ namespace Gcn3ISA
          VecOperandU32 src1(gpuDynInst, instData.VSRC1);
          VecOperandU32 vdst(gpuDynInst, instData.VDST);
  
+        src0.readSrc();
+        src1.read();
+
          if (isSDWAInst()) {
              VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
-            // use copies of original src0 and src1 during selecting
+            // use copies of original src0, src1, and vdst during selecting
              VecOperandU32 origSrc0_sdwa(gpuDynInst,
                                          extData.iFmt_VOP_SDWA.SRC0);
              VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1);
+            VecOperandU32 origVdst(gpuDynInst, instData.VDST);
+
+            src0_sdwa.read();
+            origSrc0_sdwa.read();
+            origSrc1.read();
  
              DPRINTF(GCN3, "Handling V_LSHLREV_B32 SRC SDWA. SRC0: register "
                      "v[%d], DST_SEL: %d, DST_UNUSED: %d, CLAMP: %d, SRC0_SEL: "
@@ -5918,26 +5934,23 @@ namespace Gcn3ISA
                      extData.iFmt_VOP_SDWA.SRC1_NEG,
                      extData.iFmt_VOP_SDWA.SRC1_ABS);
  
-            processSDWA_src(gpuDynInst, extData.iFmt_VOP_SDWA, src0_sdwa,
-                            origSrc0_sdwa, src1, origSrc1);
-        }
-
-        src0.readSrc();
-        src1.read();
+            processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa,
+                            src1, origSrc1);
  
-        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
-            if (wf->execMask(lane)) {
-                vdst[lane] = src1[lane] << bits(src0[lane], 4, 0);
+            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                if (wf->execMask(lane)) {
+                    vdst[lane] = src1[lane] << bits(src0_sdwa[lane], 4, 0);
+                    origVdst[lane] = vdst[lane]; // keep copy consistent
+                }
              }
-        }
  
-        // SDWA instructions also may select bytes/words of dest register
-        // (vdst)
-        if (isSDWAInst()) {
-            // use extra copy of dest to retain original values
-            VecOperandU32 vdst_orig(gpuDynInst, instData.VDST);
-            processSDWA_dst(gpuDynInst, extData.iFmt_VOP_SDWA, vdst,
-                            vdst_orig);
+            processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst);
+        } else {
+            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                if (wf->execMask(lane)) {
+                    vdst[lane] = src1[lane] << bits(src0[lane], 4, 0);
+                }
+            }
          }
  
          vdst.write();
@@ -5995,12 +6008,20 @@ namespace Gcn3ISA
          VecOperandU32 src1(gpuDynInst, instData.VSRC1);
          VecOperandU32 vdst(gpuDynInst, instData.VDST);
  
+        src0.readSrc();
+        src1.read();
+
          if (isSDWAInst()) {
              VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
-            // use copies of original src0 and src1 during selecting
+            // use copies of original src0, src1, and dest during selecting
              VecOperandU32 origSrc0_sdwa(gpuDynInst,
                                          extData.iFmt_VOP_SDWA.SRC0);
              VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1);
+            VecOperandU32 origVdst(gpuDynInst, instData.VDST);
+
+            src0_sdwa.read();
+            origSrc0_sdwa.read();
+            origSrc1.read();
  
              DPRINTF(GCN3, "Handling V_OR_B32 SRC SDWA. SRC0: register v[%d], "
                      "DST_SEL: %d, DST_UNUSED: %d, CLAMP: %d, SRC0_SEL: %d, "
@@ -6018,26 +6039,23 @@ namespace Gcn3ISA
                      extData.iFmt_VOP_SDWA.SRC1_NEG,
                      extData.iFmt_VOP_SDWA.SRC1_ABS);
  
-            processSDWA_src(gpuDynInst, extData.iFmt_VOP_SDWA, src0_sdwa,
-                            origSrc0_sdwa, src1, origSrc1);
-        }
-
-        src0.readSrc();
-        src1.read();
+            processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa,
+                            src1, origSrc1);
  
-        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
-            if (wf->execMask(lane)) {
-                vdst[lane] = src0[lane] | src1[lane];
+            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                if (wf->execMask(lane)) {
+                    vdst[lane] = src0_sdwa[lane] | src1[lane];
+                    origVdst[lane] = vdst[lane]; // keep copy consistent
+                }
              }
-        }
  
-        // SDWA instructions also may select bytes/words of dest register
-        // (vdst)
-        if (isSDWAInst()) {
-            // use extra copy of dest to retain original values
-            VecOperandU32 vdst_orig(gpuDynInst, instData.VDST);
-            processSDWA_dst(gpuDynInst, extData.iFmt_VOP_SDWA, vdst,
-                            vdst_orig);
+            processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst);
+        } else {
+            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                if (wf->execMask(lane)) {
+                    vdst[lane] = src0[lane] | src1[lane];
+                }
+            }
          }
  
          vdst.write();
@@ -6222,12 +6240,20 @@ namespace Gcn3ISA
          VecOperandU32 vdst(gpuDynInst, instData.VDST);
          ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
  
+        src0.readSrc();
+        src1.read();
+
          if (isSDWAInst()) {
              VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
-            // use copies of original src0 and src1 during selecting
+            // use copies of original src0, src1, and dest during selecting
              VecOperandU32 origSrc0_sdwa(gpuDynInst,
                                          extData.iFmt_VOP_SDWA.SRC0);
              VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1);
+            VecOperandU32 origVdst(gpuDynInst, instData.VDST);
+
+            src0_sdwa.read();
+            origSrc0_sdwa.read();
+            origSrc1.read();
  
              DPRINTF(GCN3, "Handling V_ADD_U32 SRC SDWA. SRC0: register v[%d], "
                      "DST_SEL: %d, DST_UNUSED: %d, CLAMP: %d, SRC0_SEL: %d, "
@@ -6245,28 +6271,27 @@ namespace Gcn3ISA
                      extData.iFmt_VOP_SDWA.SRC1_NEG,
                      extData.iFmt_VOP_SDWA.SRC1_ABS);
  
-            processSDWA_src(gpuDynInst, extData.iFmt_VOP_SDWA, src0_sdwa,
-                            origSrc0_sdwa, src1, origSrc1);
-        }
-
-        src0.readSrc();
-        src1.read();
+            processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa,
+                            src1, origSrc1);
  
-        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
-            if (wf->execMask(lane)) {
-                vdst[lane] = src0[lane] + src1[lane];
-                vcc.setBit(lane, ((VecElemU64)src0[lane]
-                    + (VecElemU64)src1[lane] >= 0x100000000ULL) ? 1 : 0);
+            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                if (wf->execMask(lane)) {
+                    vdst[lane] = src0_sdwa[lane] + src1[lane];
+                    origVdst[lane] = vdst[lane]; // keep copy consistent
+                    vcc.setBit(lane, ((VecElemU64)src0_sdwa[lane]
+                        + (VecElemU64)src1[lane] >= 0x100000000ULL) ? 1 : 0);
+                }
              }
-        }
  
-        // SDWA instructions also may select bytes/words of dest register
-        // (vdst)
-        if (isSDWAInst()) {
-            // use extra copy of dest to retain original values
-            VecOperandU32 vdst_orig(gpuDynInst, instData.VDST);
-            processSDWA_dst(gpuDynInst, extData.iFmt_VOP_SDWA, vdst,
-                            vdst_orig);
+            processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst);
+        } else {
+            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                if (wf->execMask(lane)) {
+                    vdst[lane] = src0[lane] + src1[lane];
+                    vcc.setBit(lane, ((VecElemU64)src0[lane]
+                        + (VecElemU64)src1[lane] >= 0x100000000ULL) ? 1 : 0);
+                }
+            }
          }
  
          vcc.write();
author	Matt Sinclair <Matthew.Sinclair@amd.com>
	Fri, 22 Jun 2018 06:42:39 +0000 (02:42 -0400)
committer	Anthony Gutierrez <anthony.gutierrez@amd.com>
	Fri, 19 Jun 2020 20:41:59 +0000 (20:41 +0000)
src/arch/gcn3/insts/inst_util.hh		patch \| blob \| history
src/arch/gcn3/insts/instructions.cc		patch \| blob \| history