* operations are done on it.
*/
template<typename T>
- T sdwaInstSrcImpl_helper(T currOperVal, T origOperVal, SDWASelVals sel,
- bool signExt)
+ T sdwaInstSrcImpl_helper(T currOperVal, const T origOperVal,
+ const SDWASelVals sel, const bool signExt)
{
// local variables
int first_bit = 0, last_bit = 0;
* 2. if sign extend is set, then sign extend the value
*/
template<typename T>
- void sdwaInstSrcImpl(T & currOper, T & origCurrOper, SDWASelVals sel,
- bool signExt)
+ void sdwaInstSrcImpl(T & currOper, T & origCurrOper,
+ const SDWASelVals sel, const bool signExt)
{
// iterate over all lanes, setting appropriate, selected value
- currOper.read();
- origCurrOper.read();
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
currOper[lane] = sdwaInstSrcImpl_helper(currOper[lane],
- origCurrOper[lane], sel,
- signExt);
+ origCurrOper[lane], sel,
+ signExt);
}
}
* operations are done on it.
*/
template<typename T>
- T sdwaInstDstImpl_helper(T currDstVal, T origDstVal, bool clamp,
- SDWASelVals sel, SDWADstVals unusedBits_format)
+ T sdwaInstDstImpl_helper(T currDstVal, const T origDstVal,
+ const bool clamp, const SDWASelVals sel,
+ const SDWADstVals unusedBits_format)
{
// local variables
int first_bit = 0, last_bit = 0;
* 2 (SDWA_UNUSED_PRESERVE): select data[31:0]
*/
template<typename T>
- void sdwaInstDstImpl(T & dstOper, T & origDstOper, bool clamp,
- SDWASelVals sel, SDWADstVals unusedBits_format)
+ void sdwaInstDstImpl(T & dstOper, T & origDstOper, const bool clamp,
+ const SDWASelVals sel,
+ const SDWADstVals unusedBits_format)
{
// iterate over all lanes, setting appropriate, selected value
- dstOper.read();
- origDstOper.read();
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
dstOper[lane] = sdwaInstDstImpl_helper(dstOper[lane],
origDstOper[lane], clamp,
*/
template<typename T>
void processSDWA_src_helper(T & currSrc, T & origCurrSrc,
- SDWASelVals src_sel, bool src_signExt,
- bool src_abs, bool src_neg)
+ const SDWASelVals src_sel,
+ const bool src_signExt, const bool src_abs,
+ const bool src_neg)
{
/**
* STEP 1: check if the absolute value (ABS) or negation (NEG) tags
* processSDWA_src is called before the math.
*/
template<typename T>
- void processSDWA_src(GPUDynInstPtr gpuDynInst, InFmt_VOP_SDWA sdwaInst,
- T & src0, T & origSrc0)
+ void processSDWA_src(InFmt_VOP_SDWA sdwaInst, T & src0, T & origSrc0)
{
// local variables
- SDWASelVals src0_sel = (SDWASelVals)sdwaInst.SRC0_SEL;
- bool src0_signExt = sdwaInst.SRC0_SEXT;
- bool src0_neg = sdwaInst.SRC0_NEG;
- bool src0_abs = sdwaInst.SRC0_ABS;
+ const SDWASelVals src0_sel = (SDWASelVals)sdwaInst.SRC0_SEL;
+ const bool src0_signExt = sdwaInst.SRC0_SEXT;
+ const bool src0_neg = sdwaInst.SRC0_NEG;
+ const bool src0_abs = sdwaInst.SRC0_ABS;
// NOTE: difference between VOP1 and VOP2/VOPC is that there is no src1
// operand. So ensure that SRC1 fields are not set, then call helper
* called before the math.
*/
template<typename T>
- void processSDWA_src(GPUDynInstPtr gpuDynInst, InFmt_VOP_SDWA sdwaInst,
- T & src0, T & origSrc0, T & src1, T & origSrc1)
+ void processSDWA_src(InFmt_VOP_SDWA sdwaInst, T & src0, T & origSrc0,
+ T & src1, T & origSrc1)
{
// local variables
- SDWASelVals src0_sel = (SDWASelVals)sdwaInst.SRC0_SEL;
- bool src0_signExt = sdwaInst.SRC0_SEXT;
- bool src0_neg = sdwaInst.SRC0_NEG;
- bool src0_abs = sdwaInst.SRC0_ABS;
- SDWASelVals src1_sel = (SDWASelVals)sdwaInst.SRC1_SEL;
- bool src1_signExt = sdwaInst.SRC1_SEXT;
- bool src1_neg = sdwaInst.SRC1_NEG;
- bool src1_abs = sdwaInst.SRC1_ABS;
+ const SDWASelVals src0_sel = (SDWASelVals)sdwaInst.SRC0_SEL;
+ const bool src0_signExt = sdwaInst.SRC0_SEXT;
+ const bool src0_neg = sdwaInst.SRC0_NEG;
+ const bool src0_abs = sdwaInst.SRC0_ABS;
+ const SDWASelVals src1_sel = (SDWASelVals)sdwaInst.SRC1_SEL;
+ const bool src1_signExt = sdwaInst.SRC1_SEXT;
+ const bool src1_neg = sdwaInst.SRC1_NEG;
+ const bool src1_abs = sdwaInst.SRC1_ABS;
processSDWA_src_helper(src0, origSrc0, src0_sel, src0_signExt,
src0_abs, src0_neg);
* processSDWA_src is called before the math.
*/
template<typename T>
- void processSDWA_dst(GPUDynInstPtr gpuDynInst, InFmt_VOP_SDWA sdwaInst,
- T & dst, T & origDst)
+ void processSDWA_dst(InFmt_VOP_SDWA sdwaInst, T & dst, T & origDst)
{
// local variables
- SDWADstVals dst_unusedBits_format = (SDWADstVals)sdwaInst.DST_UNUSED;
- SDWASelVals dst_sel = (SDWASelVals)sdwaInst.DST_SEL;
- bool clamp = sdwaInst.CLAMP;
+ const SDWADstVals dst_unusedBits_format =
+ (SDWADstVals)sdwaInst.DST_UNUSED;
+ const SDWASelVals dst_sel = (SDWASelVals)sdwaInst.DST_SEL;
+ const bool clamp = sdwaInst.CLAMP;
/**
* STEP 1: select the appropriate bits for dst and pad/sign-extend as
VecOperandU32 src1(gpuDynInst, instData.VSRC1);
VecOperandU32 vdst(gpuDynInst, instData.VDST);
+ src0.readSrc();
+ src1.read();
+
if (isSDWAInst()) {
VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
- // use copies of original src0 and src1 during selecting
+ // use copies of original src0, src1, and dest during selecting
VecOperandU32 origSrc0_sdwa(gpuDynInst,
extData.iFmt_VOP_SDWA.SRC0);
VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1);
+ VecOperandU32 origVdst(gpuDynInst, instData.VDST);
+
+ src0_sdwa.read();
+ origSrc0_sdwa.read();
+ origSrc1.read();
DPRINTF(GCN3, "Handling V_MUL_U32_U24 SRC SDWA. SRC0: register "
"v[%d], DST_SEL: %d, DST_UNUSED: %d, CLAMP: %d, SRC0_SEL: "
extData.iFmt_VOP_SDWA.SRC1_NEG,
extData.iFmt_VOP_SDWA.SRC1_ABS);
- processSDWA_src(gpuDynInst, extData.iFmt_VOP_SDWA, src0_sdwa,
- origSrc0_sdwa, src1, origSrc1);
- }
+ processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa,
+ src1, origSrc1);
- src0.readSrc();
- src1.read();
+ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+ if (wf->execMask(lane)) {
+ vdst[lane] = bits(src0_sdwa[lane], 23, 0) *
+ bits(src1[lane], 23, 0);
+ origVdst[lane] = vdst[lane]; // keep copy consistent
+ }
+ }
- for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
- if (wf->execMask(lane)) {
- vdst[lane] = bits(src0[lane], 23, 0) * bits(src1[lane], 23, 0);
+ processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst);
+ } else {
+ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+ if (wf->execMask(lane)) {
+ vdst[lane] = bits(src0[lane], 23, 0) *
+ bits(src1[lane], 23, 0);
+ }
}
}
- // SDWA instructions also may select bytes/words of dest register
- // (vdst)
- if (isSDWAInst()) {
- // use extra copy of dest to retain original values
- VecOperandU32 vdst_orig(gpuDynInst, instData.VDST);
- processSDWA_dst(gpuDynInst, extData.iFmt_VOP_SDWA, vdst,
- vdst_orig);
- }
vdst.write();
}
VecOperandU32 src1(gpuDynInst, instData.VSRC1);
VecOperandU32 vdst(gpuDynInst, instData.VDST);
+ src0.readSrc();
+ src1.read();
+
if (isSDWAInst()) {
VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
- // use copies of original src0 and src1 during selecting
+ // use copies of original src0, src1, and vdst during selecting
VecOperandU32 origSrc0_sdwa(gpuDynInst,
extData.iFmt_VOP_SDWA.SRC0);
VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1);
+ VecOperandU32 origVdst(gpuDynInst, instData.VDST);
+
+ src0_sdwa.read();
+ origSrc0_sdwa.read();
+ origSrc1.read();
DPRINTF(GCN3, "Handling V_LSHLREV_B32 SRC SDWA. SRC0: register "
"v[%d], DST_SEL: %d, DST_UNUSED: %d, CLAMP: %d, SRC0_SEL: "
extData.iFmt_VOP_SDWA.SRC1_NEG,
extData.iFmt_VOP_SDWA.SRC1_ABS);
- processSDWA_src(gpuDynInst, extData.iFmt_VOP_SDWA, src0_sdwa,
- origSrc0_sdwa, src1, origSrc1);
- }
-
- src0.readSrc();
- src1.read();
+ processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa,
+ src1, origSrc1);
- for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
- if (wf->execMask(lane)) {
- vdst[lane] = src1[lane] << bits(src0[lane], 4, 0);
+ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+ if (wf->execMask(lane)) {
+ vdst[lane] = src1[lane] << bits(src0_sdwa[lane], 4, 0);
+ origVdst[lane] = vdst[lane]; // keep copy consistent
+ }
}
- }
- // SDWA instructions also may select bytes/words of dest register
- // (vdst)
- if (isSDWAInst()) {
- // use extra copy of dest to retain original values
- VecOperandU32 vdst_orig(gpuDynInst, instData.VDST);
- processSDWA_dst(gpuDynInst, extData.iFmt_VOP_SDWA, vdst,
- vdst_orig);
+ processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst);
+ } else {
+ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+ if (wf->execMask(lane)) {
+ vdst[lane] = src1[lane] << bits(src0[lane], 4, 0);
+ }
+ }
}
vdst.write();
VecOperandU32 src1(gpuDynInst, instData.VSRC1);
VecOperandU32 vdst(gpuDynInst, instData.VDST);
+ src0.readSrc();
+ src1.read();
+
if (isSDWAInst()) {
VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
- // use copies of original src0 and src1 during selecting
+ // use copies of original src0, src1, and dest during selecting
VecOperandU32 origSrc0_sdwa(gpuDynInst,
extData.iFmt_VOP_SDWA.SRC0);
VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1);
+ VecOperandU32 origVdst(gpuDynInst, instData.VDST);
+
+ src0_sdwa.read();
+ origSrc0_sdwa.read();
+ origSrc1.read();
DPRINTF(GCN3, "Handling V_OR_B32 SRC SDWA. SRC0: register v[%d], "
"DST_SEL: %d, DST_UNUSED: %d, CLAMP: %d, SRC0_SEL: %d, "
extData.iFmt_VOP_SDWA.SRC1_NEG,
extData.iFmt_VOP_SDWA.SRC1_ABS);
- processSDWA_src(gpuDynInst, extData.iFmt_VOP_SDWA, src0_sdwa,
- origSrc0_sdwa, src1, origSrc1);
- }
-
- src0.readSrc();
- src1.read();
+ processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa,
+ src1, origSrc1);
- for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
- if (wf->execMask(lane)) {
- vdst[lane] = src0[lane] | src1[lane];
+ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+ if (wf->execMask(lane)) {
+ vdst[lane] = src0_sdwa[lane] | src1[lane];
+ origVdst[lane] = vdst[lane]; // keep copy consistent
+ }
}
- }
- // SDWA instructions also may select bytes/words of dest register
- // (vdst)
- if (isSDWAInst()) {
- // use extra copy of dest to retain original values
- VecOperandU32 vdst_orig(gpuDynInst, instData.VDST);
- processSDWA_dst(gpuDynInst, extData.iFmt_VOP_SDWA, vdst,
- vdst_orig);
+ processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst);
+ } else {
+ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+ if (wf->execMask(lane)) {
+ vdst[lane] = src0[lane] | src1[lane];
+ }
+ }
}
vdst.write();
VecOperandU32 vdst(gpuDynInst, instData.VDST);
ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO);
+ src0.readSrc();
+ src1.read();
+
if (isSDWAInst()) {
VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
- // use copies of original src0 and src1 during selecting
+ // use copies of original src0, src1, and dest during selecting
VecOperandU32 origSrc0_sdwa(gpuDynInst,
extData.iFmt_VOP_SDWA.SRC0);
VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1);
+ VecOperandU32 origVdst(gpuDynInst, instData.VDST);
+
+ src0_sdwa.read();
+ origSrc0_sdwa.read();
+ origSrc1.read();
DPRINTF(GCN3, "Handling V_ADD_U32 SRC SDWA. SRC0: register v[%d], "
"DST_SEL: %d, DST_UNUSED: %d, CLAMP: %d, SRC0_SEL: %d, "
extData.iFmt_VOP_SDWA.SRC1_NEG,
extData.iFmt_VOP_SDWA.SRC1_ABS);
- processSDWA_src(gpuDynInst, extData.iFmt_VOP_SDWA, src0_sdwa,
- origSrc0_sdwa, src1, origSrc1);
- }
-
- src0.readSrc();
- src1.read();
+ processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa,
+ src1, origSrc1);
- for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
- if (wf->execMask(lane)) {
- vdst[lane] = src0[lane] + src1[lane];
- vcc.setBit(lane, ((VecElemU64)src0[lane]
- + (VecElemU64)src1[lane] >= 0x100000000ULL) ? 1 : 0);
+ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+ if (wf->execMask(lane)) {
+ vdst[lane] = src0_sdwa[lane] + src1[lane];
+ origVdst[lane] = vdst[lane]; // keep copy consistent
+ vcc.setBit(lane, ((VecElemU64)src0_sdwa[lane]
+ + (VecElemU64)src1[lane] >= 0x100000000ULL) ? 1 : 0);
+ }
}
- }
- // SDWA instructions also may select bytes/words of dest register
- // (vdst)
- if (isSDWAInst()) {
- // use extra copy of dest to retain original values
- VecOperandU32 vdst_orig(gpuDynInst, instData.VDST);
- processSDWA_dst(gpuDynInst, extData.iFmt_VOP_SDWA, vdst,
- vdst_orig);
+ processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst);
+ } else {
+ for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+ if (wf->execMask(lane)) {
+ vdst[lane] = src0[lane] + src1[lane];
+ vcc.setBit(lane, ((VecElemU64)src0[lane]
+ + (VecElemU64)src1[lane] >= 0x100000000ULL) ? 1 : 0);
+ }
+ }
}
vcc.write();