From: Matt Sinclair Date: Wed, 27 Jun 2018 06:24:18 +0000 (-0400) Subject: arch-gcn3: fix bug with DPP support X-Git-Tag: v20.1.0.0~449 X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=c7b6e7c61377c8ebce8c82f518756d3d44a8380f;p=gem5.git arch-gcn3: fix bug with DPP support Instructions that use the DPP field need to use the extra SRC0 register associated with the DPP instruction instead of the "default" SRC0 register, since the default SRC0 register contains the DPP information when DPP is being used. This commit fixes 2735c3bb88 to take this into account. Additionally, this commit removes the write of the source register from the DPP helper functions, to avoid overwriting any changes made to the destination register. Finally, this change modifies the instructions that use DPP to simplify the flow through the execute() functions. Change-Id: I80fd0af1f131f287f18ff73b3c1c9122d8c60823 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29947 Maintainer: Anthony Gutierrez Tested-by: kokoro Reviewed-by: Matt Sinclair --- diff --git a/src/arch/gcn3/insts/inst_util.hh b/src/arch/gcn3/insts/inst_util.hh index 433ccbe8d..b40e890f6 100644 --- a/src/arch/gcn3/insts/inst_util.hh +++ b/src/arch/gcn3/insts/inst_util.hh @@ -505,7 +505,6 @@ namespace Gcn3ISA src0[lane] = 0; } - src0.write(); // reset for next iteration laneDisabled = false; } diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc index b852281b1..79e7ddacf 100644 --- a/src/arch/gcn3/insts/instructions.cc +++ b/src/arch/gcn3/insts/instructions.cc @@ -5296,8 +5296,12 @@ namespace Gcn3ISA VecOperandF32 src1(gpuDynInst, instData.VSRC1); VecOperandF32 vdst(gpuDynInst, instData.VDST); + src0.readSrc(); + src1.read(); + if (isDPPInst()) { VecOperandF32 src0_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0); + src0_dpp.read(); DPRINTF(GCN3, "Handling V_ADD_F32 SRC DPP. 
SRC0: register v[%d], " "DPP_CTRL: 0x%#x, SRC0_ABS: %d, SRC0_NEG: %d, " @@ -5313,14 +5317,17 @@ namespace Gcn3ISA extData.iFmt_VOP_DPP.ROW_MASK); processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src0_dpp, src1); - } - - src0.readSrc(); - src1.read(); - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] + src1[lane]; + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0_dpp[lane] + src1[lane]; + } + } + } else { + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] + src1[lane]; + } } } @@ -6164,6 +6171,7 @@ namespace Gcn3ISA if (isDPPInst()) { VecOperandF32 src0_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0); + src0_dpp.read(); DPRINTF(GCN3, "Handling V_MAC_F32 SRC DPP. SRC0: register v[%d], " "DPP_CTRL: 0x%#x, SRC0_ABS: %d, SRC0_NEG: %d, " @@ -6179,11 +6187,18 @@ namespace Gcn3ISA extData.iFmt_VOP_DPP.ROW_MASK); processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src0_dpp, src1); - } - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = std::fma(src0[lane], src1[lane], vdst[lane]); + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::fma(src0_dpp[lane], src1[lane], + vdst[lane]); + } + } + } else { + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = std::fma(src0[lane], src1[lane], vdst[lane]); + } } } @@ -7117,8 +7132,11 @@ namespace Gcn3ISA ConstVecOperandU32 src(gpuDynInst, instData.SRC0); VecOperandU32 vdst(gpuDynInst, instData.VDST); + src.readSrc(); + if (isDPPInst()) { - VecOperandU32 src0_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0); + VecOperandU32 src_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0); + src_dpp.read(); DPRINTF(GCN3, "Handling V_MOV_B32 SRC DPP. 
SRC0: register v[%d], " "DPP_CTRL: 0x%#x, SRC0_ABS: %d, SRC0_NEG: %d, " @@ -7137,14 +7155,18 @@ namespace Gcn3ISA // to negate it or take the absolute value of it assert(!extData.iFmt_VOP_DPP.SRC1_ABS); assert(!extData.iFmt_VOP_DPP.SRC1_NEG); - processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src0_dpp); - } - - src.readSrc(); + processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src_dpp); - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src[lane]; + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src_dpp[lane]; + } + } + } else { + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src[lane]; + } } }