From c1ea14de444aafa9f7b11505ac40077f73a26e9d Mon Sep 17 00:00:00 2001
From: Matt Sinclair <Matthew.Sinclair@amd.com>
Date: Fri, 22 Jun 2018 02:42:39 -0400
Subject: [PATCH] arch-gcn3: fix bug with SDWA support

Instructions that use the SDWA field need to use the extra SRC0
register associated with the SDWA instruction instead of the
"default" SRC0 register, since the default SRC0 register contains
the SDWA information when SDWA is being used.  This commit fixes
15de044c to take this into account.  Additionally, this commit
removes reads of the registers from the SDWA helper functions,
since they overwrite any changes made to the destination register.
Finally, this change modifies the instructions that use SDWA to
simplify the flow through the execute() functions.

Change-Id: I3bad83133808dfffc6a4c40bbd49c3d76599e669
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29922
Reviewed-by: Anthony Gutierrez <anthony.gutierrez@amd.com>
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
Maintainer: Anthony Gutierrez <anthony.gutierrez@amd.com>
Tested-by: kokoro <noreply+kokoro@google.com>
---
 src/arch/gcn3/insts/inst_util.hh    |  72 ++++++------
 src/arch/gcn3/insts/instructions.cc | 171 ++++++++++++++++------------
 2 files changed, 133 insertions(+), 110 deletions(-)
diff --git a/src/arch/gcn3/insts/inst_util.hh b/src/arch/gcn3/insts/inst_util.hh
index a3b2f4a51..292e3ba30 100644
--- a/src/arch/gcn3/insts/inst_util.hh
+++ b/src/arch/gcn3/insts/inst_util.hh
@@ -547,8 +547,8 @@ namespace Gcn3ISA
      * operations are done on it.
      */
     template<typename T>
-    T sdwaInstSrcImpl_helper(T currOperVal, T origOperVal, SDWASelVals sel,
-                             bool signExt)
+    T sdwaInstSrcImpl_helper(T currOperVal, const T origOperVal,
+                             const SDWASelVals sel, const bool signExt)
     {
         // local variables
         int first_bit = 0, last_bit = 0;
@@ -635,16 +635,14 @@ namespace Gcn3ISA
      *   2.  if sign extend is set, then sign extend the value
      */
     template<typename T>
-    void sdwaInstSrcImpl(T & currOper, T & origCurrOper, SDWASelVals sel,
-                         bool signExt)
+    void sdwaInstSrcImpl(T & currOper, T & origCurrOper,
+                         const SDWASelVals sel, const bool signExt)
     {
         // iterate over all lanes, setting appropriate, selected value
-        currOper.read();
-        origCurrOper.read();
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             currOper[lane] = sdwaInstSrcImpl_helper(currOper[lane],
-                                                   origCurrOper[lane], sel,
-                                                   signExt);
+                                                    origCurrOper[lane], sel,
+                                                    signExt);
         }
     }
 
@@ -656,8 +654,9 @@ namespace Gcn3ISA
      * operations are done on it.
      */
     template<typename T>
-    T sdwaInstDstImpl_helper(T currDstVal, T origDstVal, bool clamp,
-                             SDWASelVals sel, SDWADstVals unusedBits_format)
+    T sdwaInstDstImpl_helper(T currDstVal, const T origDstVal,
+                             const bool clamp, const SDWASelVals sel,
+                             const SDWADstVals unusedBits_format)
     {
         // local variables
         int first_bit = 0, last_bit = 0;
@@ -756,12 +755,11 @@ namespace Gcn3ISA
      *       2 (SDWA_UNUSED_PRESERVE): select data[31:0]
      */
     template<typename T>
-    void sdwaInstDstImpl(T & dstOper, T & origDstOper, bool clamp,
-                         SDWASelVals sel, SDWADstVals unusedBits_format)
+    void sdwaInstDstImpl(T & dstOper, T & origDstOper, const bool clamp,
+                         const SDWASelVals sel,
+                         const SDWADstVals unusedBits_format)
     {
         // iterate over all lanes, setting appropriate, selected value
-        dstOper.read();
-        origDstOper.read();
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             dstOper[lane] = sdwaInstDstImpl_helper(dstOper[lane],
                                                    origDstOper[lane], clamp,
@@ -779,8 +777,9 @@ namespace Gcn3ISA
      */
     template<typename T>
     void processSDWA_src_helper(T & currSrc, T & origCurrSrc,
-                                SDWASelVals src_sel, bool src_signExt,
-                                bool src_abs, bool src_neg)
+                                const SDWASelVals src_sel,
+                                const bool src_signExt, const bool src_abs,
+                                const bool src_neg)
     {
         /**
          * STEP 1: check if the absolute value (ABS) or negation (NEG) tags
@@ -812,14 +811,13 @@ namespace Gcn3ISA
      * processSDWA_src is called before the math.
      */
     template<typename T>
-    void processSDWA_src(GPUDynInstPtr gpuDynInst, InFmt_VOP_SDWA sdwaInst,
-                         T & src0, T & origSrc0)
+    void processSDWA_src(InFmt_VOP_SDWA sdwaInst, T & src0, T & origSrc0)
     {
         // local variables
-        SDWASelVals src0_sel = (SDWASelVals)sdwaInst.SRC0_SEL;
-        bool src0_signExt = sdwaInst.SRC0_SEXT;
-        bool src0_neg = sdwaInst.SRC0_NEG;
-        bool src0_abs = sdwaInst.SRC0_ABS;
+        const SDWASelVals src0_sel = (SDWASelVals)sdwaInst.SRC0_SEL;
+        const bool src0_signExt = sdwaInst.SRC0_SEXT;
+        const bool src0_neg = sdwaInst.SRC0_NEG;
+        const bool src0_abs = sdwaInst.SRC0_ABS;
 
         // NOTE: difference between VOP1 and VOP2/VOPC is that there is no src1
         // operand.  So ensure that SRC1 fields are not set, then call helper
@@ -841,18 +839,18 @@ namespace Gcn3ISA
      * called before the math.
      */
     template<typename T>
-    void processSDWA_src(GPUDynInstPtr gpuDynInst, InFmt_VOP_SDWA sdwaInst,
-                         T & src0, T & origSrc0, T & src1, T & origSrc1)
+    void processSDWA_src(InFmt_VOP_SDWA sdwaInst, T & src0, T & origSrc0,
+                         T & src1, T & origSrc1)
     {
         // local variables
-        SDWASelVals src0_sel = (SDWASelVals)sdwaInst.SRC0_SEL;
-        bool src0_signExt = sdwaInst.SRC0_SEXT;
-        bool src0_neg = sdwaInst.SRC0_NEG;
-        bool src0_abs = sdwaInst.SRC0_ABS;
-        SDWASelVals src1_sel = (SDWASelVals)sdwaInst.SRC1_SEL;
-        bool src1_signExt = sdwaInst.SRC1_SEXT;
-        bool src1_neg = sdwaInst.SRC1_NEG;
-        bool src1_abs = sdwaInst.SRC1_ABS;
+        const SDWASelVals src0_sel = (SDWASelVals)sdwaInst.SRC0_SEL;
+        const bool src0_signExt = sdwaInst.SRC0_SEXT;
+        const bool src0_neg = sdwaInst.SRC0_NEG;
+        const bool src0_abs = sdwaInst.SRC0_ABS;
+        const SDWASelVals src1_sel = (SDWASelVals)sdwaInst.SRC1_SEL;
+        const bool src1_signExt = sdwaInst.SRC1_SEXT;
+        const bool src1_neg = sdwaInst.SRC1_NEG;
+        const bool src1_abs = sdwaInst.SRC1_ABS;
 
         processSDWA_src_helper(src0, origSrc0, src0_sel, src0_signExt,
                                src0_abs, src0_neg);
@@ -869,13 +867,13 @@ namespace Gcn3ISA
      * processSDWA_src is called before the math.
      */
     template<typename T>
-    void processSDWA_dst(GPUDynInstPtr gpuDynInst, InFmt_VOP_SDWA sdwaInst,
-                         T & dst, T & origDst)
+    void processSDWA_dst(InFmt_VOP_SDWA sdwaInst, T & dst, T & origDst)
     {
         // local variables
-        SDWADstVals dst_unusedBits_format = (SDWADstVals)sdwaInst.DST_UNUSED;
-        SDWASelVals dst_sel = (SDWASelVals)sdwaInst.DST_SEL;
-        bool clamp = sdwaInst.CLAMP;
+        const SDWADstVals dst_unusedBits_format =
+            (SDWADstVals)sdwaInst.DST_UNUSED;
+        const SDWASelVals dst_sel = (SDWASelVals)sdwaInst.DST_SEL;
+        const bool clamp = sdwaInst.CLAMP;
 
         /**
          * STEP 1: select the appropriate bits for dst and pad/sign-extend as
diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc
index bd6e4f44e..2789f3e7f 100644
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -5543,12 +5543,20 @@ namespace Gcn3ISA
         VecOperandU32 src1(gpuDynInst, instData.VSRC1);
         VecOperandU32 vdst(gpuDynInst, instData.VDST);
 
+        src0.readSrc();
+        src1.read();
+
         if (isSDWAInst()) {
             VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
-            // use copies of original src0 and src1 during selecting
+            // use copies of original src0, src1, and dest during selecting
             VecOperandU32 origSrc0_sdwa(gpuDynInst,
                                         extData.iFmt_VOP_SDWA.SRC0);
             VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1);
+            VecOperandU32 origVdst(gpuDynInst, instData.VDST);
+
+            src0_sdwa.read();
+            origSrc0_sdwa.read();
+            origSrc1.read();
 
             DPRINTF(GCN3, "Handling V_MUL_U32_U24 SRC SDWA. SRC0: register "
                     "v[%d], DST_SEL: %d, DST_UNUSED: %d, CLAMP: %d, SRC0_SEL: "
@@ -5566,27 +5574,27 @@ namespace Gcn3ISA
                     extData.iFmt_VOP_SDWA.SRC1_NEG,
                     extData.iFmt_VOP_SDWA.SRC1_ABS);
 
-            processSDWA_src(gpuDynInst, extData.iFmt_VOP_SDWA, src0_sdwa,
-                            origSrc0_sdwa, src1, origSrc1);
-        }
+            processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa,
+                            src1, origSrc1);
 
-        src0.readSrc();
-        src1.read();
+            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                if (wf->execMask(lane)) {
+                    vdst[lane] = bits(src0_sdwa[lane], 23, 0) *
+                                 bits(src1[lane], 23, 0);
+                    origVdst[lane] = vdst[lane]; // keep copy consistent
+                }
+            }
 
-        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
-            if (wf->execMask(lane)) {
-                vdst[lane] = bits(src0[lane], 23, 0) * bits(src1[lane], 23, 0);
+            processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst);
+        } else {
+            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                if (wf->execMask(lane)) {
+                    vdst[lane] = bits(src0[lane], 23, 0) *
+                                 bits(src1[lane], 23, 0);
+                }
             }
         }
 
-        // SDWA instructions also may select bytes/words of dest register
-        // (vdst)
-        if (isSDWAInst()) {
-            // use extra copy of dest to retain original values
-            VecOperandU32 vdst_orig(gpuDynInst, instData.VDST);
-            processSDWA_dst(gpuDynInst, extData.iFmt_VOP_SDWA, vdst,
-                            vdst_orig);
-        }
 
         vdst.write();
     }
@@ -5895,12 +5903,20 @@ namespace Gcn3ISA
         VecOperandU32 src1(gpuDynInst, instData.VSRC1);
         VecOperandU32 vdst(gpuDynInst, instData.VDST);
 
+        src0.readSrc();
+        src1.read();
+
         if (isSDWAInst()) {
             VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
-            // use copies of original src0 and src1 during selecting
+            // use copies of original src0, src1, and vdst during selecting
             VecOperandU32 origSrc0_sdwa(gpuDynInst,
                                         extData.iFmt_VOP_SDWA.SRC0);
             VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1);
+            VecOperandU32 origVdst(gpuDynInst, instData.VDST);
+
+            src0_sdwa.read();
+            origSrc0_sdwa.read();
+            origSrc1.read();
 
             DPRINTF(GCN3, "Handling V_LSHLREV_B32 SRC SDWA. SRC0: register "
                     "v[%d], DST_SEL: %d, DST_UNUSED: %d, CLAMP: %d, SRC0_SEL: "
@@ -5918,26 +5934,23 @@ namespace Gcn3ISA
                     extData.iFmt_VOP_SDWA.SRC1_NEG,
                     extData.iFmt_VOP_SDWA.SRC1_ABS);
 
-            processSDWA_src(gpuDynInst, extData.iFmt_VOP_SDWA, src0_sdwa,
-                            origSrc0_sdwa, src1, origSrc1);
-        }
-
-        src0.readSrc();
-        src1.read();
+            processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa,
+                            src1, origSrc1);
 
-        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
-            if (wf->execMask(lane)) {
-                vdst[lane] = src1[lane] << bits(src0[lane], 4, 0);
+            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                if (wf->execMask(lane)) {
+                    vdst[lane] = src1[lane] << bits(src0_sdwa[lane], 4, 0);
+                    origVdst[lane] = vdst[lane]; // keep copy consistent
+                }
             }
-        }
 
-        // SDWA instructions also may select bytes/words of dest register
-        // (vdst)
-        if (isSDWAInst()) {
-            // use extra copy of dest to retain original values
-            VecOperandU32 vdst_orig(gpuDynInst, instData.VDST);
-            processSDWA_dst(gpuDynInst, extData.iFmt_VOP_SDWA, vdst,
-                            vdst_orig);
+            processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst);
+        } else {
+            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                if (wf->execMask(lane)) {
+                    vdst[lane] = src1[lane] << bits(src0[lane], 4, 0);
+                }
+            }
         }
 
         vdst.write();
@@ -5995,12 +6008,20 @@ namespace Gcn3ISA
         VecOperandU32 src1(gpuDynInst, instData.VSRC1);
         VecOperandU32 vdst(gpuDynInst, instData.VDST);
 
+        src0.readSrc();
+        src1.read();
+
         if (isSDWAInst()) {
             VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
-            // use copies of original src0 and src1 during selecting
+            // use copies of original src0, src1, and dest during selecting
             VecOperandU32 origSrc0_sdwa(gpuDynInst,
                                         extData.iFmt_VOP_SDWA.SRC0);
             VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1);
+            VecOperandU32 origVdst(gpuDynInst, instData.VDST);
+
+            src0_sdwa.read();
+            origSrc0_sdwa.read();
+            origSrc1.read();
 
             DPRINTF(GCN3, "Handling V_OR_B32 SRC SDWA. SRC0: register v[%d], "
                     "DST_SEL: %d, DST_UNUSED: %d, CLAMP: %d, SRC0_SEL: %d, "
@@ -6018,26 +6039,23 @@ namespace Gcn3ISA
                     extData.iFmt_VOP_SDWA.SRC1_NEG,
                     extData.iFmt_VOP_SDWA.SRC1_ABS);
 
-            processSDWA_src(gpuDynInst, extData.iFmt_VOP_SDWA, src0_sdwa,
-                            origSrc0_sdwa, src1, origSrc1);
-        }
-
-        src0.readSrc();
-        src1.read();
+            processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa,
+                            src1, origSrc1);
 
-        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
-            if (wf->execMask(lane)) {
-                vdst[lane] = src0[lane] | src1[lane];
+            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                if (wf->execMask(lane)) {
+                    vdst[lane] = src0_sdwa[lane] | src1[lane];
+                    origVdst[lane] = vdst[lane]; // keep copy consistent
+                }
             }
-        }
 
-        // SDWA instructions also may select bytes/words of dest register
-        // (vdst)
-        if (isSDWAInst()) {
-            // use extra copy of dest to retain original values
-            VecOperandU32 vdst_orig(gpuDynInst, instData.VDST);
-            processSDWA_dst(gpuDynInst, extData.iFmt_VOP_SDWA, vdst,
-                            vdst_orig);
+            processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst);
+        } else {
+            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                if (wf->execMask(lane)) {
+                    vdst[lane] = src0[lane] | src1[lane];
+                }
+            }
         }
 
         vdst.write();
@@ -6222,12 +6240,20 @@ namespace Gcn3ISA
         VecOperandU32 vdst(gpuDynInst, instData.VDST);
         ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
 
+        src0.readSrc();
+        src1.read();
+
         if (isSDWAInst()) {
             VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
-            // use copies of original src0 and src1 during selecting
+            // use copies of original src0, src1, and dest during selecting
             VecOperandU32 origSrc0_sdwa(gpuDynInst,
                                         extData.iFmt_VOP_SDWA.SRC0);
             VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1);
+            VecOperandU32 origVdst(gpuDynInst, instData.VDST);
+
+            src0_sdwa.read();
+            origSrc0_sdwa.read();
+            origSrc1.read();
 
             DPRINTF(GCN3, "Handling V_ADD_U32 SRC SDWA. SRC0: register v[%d], "
                     "DST_SEL: %d, DST_UNUSED: %d, CLAMP: %d, SRC0_SEL: %d, "
@@ -6245,28 +6271,27 @@ namespace Gcn3ISA
                     extData.iFmt_VOP_SDWA.SRC1_NEG,
                     extData.iFmt_VOP_SDWA.SRC1_ABS);
 
-            processSDWA_src(gpuDynInst, extData.iFmt_VOP_SDWA, src0_sdwa,
-                            origSrc0_sdwa, src1, origSrc1);
-        }
-
-        src0.readSrc();
-        src1.read();
+            processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa,
+                            src1, origSrc1);
 
-        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
-            if (wf->execMask(lane)) {
-                vdst[lane] = src0[lane] + src1[lane];
-                vcc.setBit(lane, ((VecElemU64)src0[lane]
-                    + (VecElemU64)src1[lane] >= 0x100000000ULL) ? 1 : 0);
+            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                if (wf->execMask(lane)) {
+                    vdst[lane] = src0_sdwa[lane] + src1[lane];
+                    origVdst[lane] = vdst[lane]; // keep copy consistent
+                    vcc.setBit(lane, ((VecElemU64)src0_sdwa[lane]
+                        + (VecElemU64)src1[lane] >= 0x100000000ULL) ? 1 : 0);
+                }
             }
-        }
 
-        // SDWA instructions also may select bytes/words of dest register
-        // (vdst)
-        if (isSDWAInst()) {
-            // use extra copy of dest to retain original values
-            VecOperandU32 vdst_orig(gpuDynInst, instData.VDST);
-            processSDWA_dst(gpuDynInst, extData.iFmt_VOP_SDWA, vdst,
-                            vdst_orig);
+            processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst);
+        } else {
+            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                if (wf->execMask(lane)) {
+                    vdst[lane] = src0[lane] + src1[lane];
+                    vcc.setBit(lane, ((VecElemU64)src0[lane]
+                        + (VecElemU64)src1[lane] >= 0x100000000ULL) ? 1 : 0);
+                }
+            }
         }
 
         vcc.write();
-- 
2.30.2