From: Matt Sinclair
Date: Wed, 29 Jul 2020 20:16:53 +0000 (-0500)
Subject: arch-gcn3: add support for flat atomic adds, subs, incs, decs
X-Git-Tag: v20.1.0.0~359
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=4d84590deedc79790617e4ca87899ad35ec30951;p=gem5.git

arch-gcn3: add support for flat atomic adds, subs, incs, decs

Add support for all missing flat atomic adds, subtracts, increments,
and decrements, including their x2 variants.

Change-Id: I37a67fcacca91a09a82be6597facaa366105d2dc
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/31974
Reviewed-by: Anthony Gutierrez
Maintainer: Anthony Gutierrez
Tested-by: kokoro
---

diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc
index 426f99161..6e81e2cd9 100644
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -40643,8 +40643,72 @@ namespace Gcn3ISA
     void
     Inst_FLAT__FLAT_ATOMIC_SUB::execute(GPUDynInstPtr gpuDynInst)
     {
-        panicUnimplemented();
+        Wavefront *wf = gpuDynInst->wavefront();
+
+        if (wf->execMask().none()) {
+            wf->decVMemInstsIssued();
+            wf->decLGKMInstsIssued();
+            wf->wrGmReqsInPipe--;
+            wf->rdGmReqsInPipe--;
+            return;
+        }
+
+        gpuDynInst->execUnitId = wf->execUnitId;
+        gpuDynInst->exec_mask = wf->execMask();
+        gpuDynInst->latency.init(gpuDynInst->computeUnit());
+        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
+
+        ConstVecOperandU64 addr(gpuDynInst, extData.ADDR);
+        ConstVecOperandU32 data(gpuDynInst, extData.DATA);
+
+        addr.read();
+        data.read();
+
+        calcAddr(gpuDynInst, addr);
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (gpuDynInst->exec_mask[lane]) {
+                (reinterpret_cast<VecElemU32*>(gpuDynInst->a_data))[lane]
+                    = data[lane];
+            }
+        }
+
+        if (gpuDynInst->executedAs() == Enums::SC_GLOBAL) {
+            gpuDynInst->computeUnit()->globalMemoryPipe.
+                issueRequest(gpuDynInst);
+            wf->wrGmReqsInPipe--;
+            wf->outstandingReqsWrGm++;
+            wf->rdGmReqsInPipe--;
+            wf->outstandingReqsRdGm++;
+        } else {
+            fatal("Non global flat instructions not implemented yet.\n");
+        }
+
+        gpuDynInst->wavefront()->outstandingReqs++;
+        gpuDynInst->wavefront()->validateRequestCounters();
     }
 
+    void
+    Inst_FLAT__FLAT_ATOMIC_SUB::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+        initAtomicAccess<VecElemU32>(gpuDynInst);
+    } // initiateAcc
+
+    void
+    Inst_FLAT__FLAT_ATOMIC_SUB::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+        if (isAtomicRet()) {
+            VecOperandU32 vdst(gpuDynInst, extData.VDST);
+
+            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                if (gpuDynInst->exec_mask[lane]) {
+                    vdst[lane] = (reinterpret_cast<VecElemU32*>(
+                        gpuDynInst->d_data))[lane];
+                }
+            }
+
+            vdst.write();
+        }
+    } // completeAcc
 
     Inst_FLAT__FLAT_ATOMIC_SMIN::Inst_FLAT__FLAT_ATOMIC_SMIN(InFmt_FLAT *iFmt)
         : Inst_FLAT(iFmt, "flat_atomic_smin")
@@ -40843,9 +40907,74 @@ namespace Gcn3ISA
     void
     Inst_FLAT__FLAT_ATOMIC_INC::execute(GPUDynInstPtr gpuDynInst)
     {
-        panicUnimplemented();
+        Wavefront *wf = gpuDynInst->wavefront();
+
+        if (wf->execMask().none()) {
+            wf->decVMemInstsIssued();
+            wf->decLGKMInstsIssued();
+            wf->wrGmReqsInPipe--;
+            wf->rdGmReqsInPipe--;
+            return;
+        }
+
+        gpuDynInst->execUnitId = wf->execUnitId;
+        gpuDynInst->exec_mask = wf->execMask();
+        gpuDynInst->latency.init(gpuDynInst->computeUnit());
+        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
+
+        ConstVecOperandU64 addr(gpuDynInst, extData.ADDR);
+        ConstVecOperandU32 data(gpuDynInst, extData.DATA);
+
+        addr.read();
+        data.read();
+
+        calcAddr(gpuDynInst, addr);
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (gpuDynInst->exec_mask[lane]) {
+                (reinterpret_cast<VecElemU32*>(gpuDynInst->a_data))[lane]
+                    = data[lane];
+            }
+        }
+
+        if (gpuDynInst->executedAs() == Enums::SC_GLOBAL) {
+            gpuDynInst->computeUnit()->globalMemoryPipe.
+                issueRequest(gpuDynInst);
+            wf->wrGmReqsInPipe--;
+            wf->outstandingReqsWrGm++;
+            wf->rdGmReqsInPipe--;
+            wf->outstandingReqsRdGm++;
+        } else {
+            fatal("Non global flat instructions not implemented yet.\n");
+        }
+
+        gpuDynInst->wavefront()->outstandingReqs++;
+        gpuDynInst->wavefront()->validateRequestCounters();
     }
 
+    void
+    Inst_FLAT__FLAT_ATOMIC_INC::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+        initAtomicAccess<VecElemU32>(gpuDynInst);
+    } // initiateAcc
+
+    void
+    Inst_FLAT__FLAT_ATOMIC_INC::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+        if (isAtomicRet()) {
+            VecOperandU32 vdst(gpuDynInst, extData.VDST);
+
+            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                if (gpuDynInst->exec_mask[lane]) {
+                    vdst[lane] = (reinterpret_cast<VecElemU32*>(
+                        gpuDynInst->d_data))[lane];
+                }
+            }
+
+            vdst.write();
+        }
+    } // completeAcc
+
     Inst_FLAT__FLAT_ATOMIC_DEC::Inst_FLAT__FLAT_ATOMIC_DEC(InFmt_FLAT *iFmt)
         : Inst_FLAT(iFmt, "flat_atomic_dec")
     {
@@ -40868,9 +40997,74 @@ namespace Gcn3ISA
     void
     Inst_FLAT__FLAT_ATOMIC_DEC::execute(GPUDynInstPtr gpuDynInst)
     {
-        panicUnimplemented();
+        Wavefront *wf = gpuDynInst->wavefront();
+
+        if (wf->execMask().none()) {
+            wf->decVMemInstsIssued();
+            wf->decLGKMInstsIssued();
+            wf->wrGmReqsInPipe--;
+            wf->rdGmReqsInPipe--;
+            return;
+        }
+
+        gpuDynInst->execUnitId = wf->execUnitId;
+        gpuDynInst->exec_mask = wf->execMask();
+        gpuDynInst->latency.init(gpuDynInst->computeUnit());
+        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
+
+        ConstVecOperandU64 addr(gpuDynInst, extData.ADDR);
+        ConstVecOperandU32 data(gpuDynInst, extData.DATA);
+
+        addr.read();
+        data.read();
+
+        calcAddr(gpuDynInst, addr);
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (gpuDynInst->exec_mask[lane]) {
+                (reinterpret_cast<VecElemU32*>(gpuDynInst->a_data))[lane]
+                    = data[lane];
+            }
+        }
+
+        if (gpuDynInst->executedAs() == Enums::SC_GLOBAL) {
+            gpuDynInst->computeUnit()->globalMemoryPipe.
+                issueRequest(gpuDynInst);
+            wf->wrGmReqsInPipe--;
+            wf->outstandingReqsWrGm++;
+            wf->rdGmReqsInPipe--;
+            wf->outstandingReqsRdGm++;
+        } else {
+            fatal("Non global flat instructions not implemented yet.\n");
+        }
+
+        gpuDynInst->wavefront()->outstandingReqs++;
+        gpuDynInst->wavefront()->validateRequestCounters();
     }
 
+    void
+    Inst_FLAT__FLAT_ATOMIC_DEC::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+        initAtomicAccess<VecElemU32>(gpuDynInst);
+    } // initiateAcc
+
+    void
+    Inst_FLAT__FLAT_ATOMIC_DEC::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+        if (isAtomicRet()) {
+            VecOperandU32 vdst(gpuDynInst, extData.VDST);
+
+            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                if (gpuDynInst->exec_mask[lane]) {
+                    vdst[lane] = (reinterpret_cast<VecElemU32*>(
+                        gpuDynInst->d_data))[lane];
+                }
+            }
+
+            vdst.write();
+        }
+    } // completeAcc
+
     Inst_FLAT__FLAT_ATOMIC_SWAP_X2::Inst_FLAT__FLAT_ATOMIC_SWAP_X2(
         InFmt_FLAT *iFmt)
         : Inst_FLAT(iFmt, "flat_atomic_swap_x2")
@@ -41118,9 +41312,75 @@ namespace Gcn3ISA
     void
     Inst_FLAT__FLAT_ATOMIC_SUB_X2::execute(GPUDynInstPtr gpuDynInst)
     {
-        panicUnimplemented();
+        Wavefront *wf = gpuDynInst->wavefront();
+
+        if (wf->execMask().none()) {
+            wf->decVMemInstsIssued();
+            wf->decLGKMInstsIssued();
+            wf->wrGmReqsInPipe--;
+            wf->rdGmReqsInPipe--;
+            return;
+        }
+
+        gpuDynInst->execUnitId = wf->execUnitId;
+        gpuDynInst->exec_mask = wf->execMask();
+        gpuDynInst->latency.init(gpuDynInst->computeUnit());
+        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
+
+        ConstVecOperandU64 addr(gpuDynInst, extData.ADDR);
+        ConstVecOperandU64 data(gpuDynInst, extData.DATA);
+
+        addr.read();
+        data.read();
+
+        calcAddr(gpuDynInst, addr);
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (gpuDynInst->exec_mask[lane]) {
+                (reinterpret_cast<VecElemU64*>(gpuDynInst->a_data))[lane]
+                    = data[lane];
+            }
+        }
+
+        if (gpuDynInst->executedAs() == Enums::SC_GLOBAL) {
+            gpuDynInst->computeUnit()->globalMemoryPipe.
+                issueRequest(gpuDynInst);
+            wf->wrGmReqsInPipe--;
+            wf->outstandingReqsWrGm++;
+            wf->rdGmReqsInPipe--;
+            wf->outstandingReqsRdGm++;
+        } else {
+            fatal("Non global flat instructions not implemented yet.\n");
+        }
+
+        gpuDynInst->wavefront()->outstandingReqs++;
+        gpuDynInst->wavefront()->validateRequestCounters();
     }
 
+    void
+    Inst_FLAT__FLAT_ATOMIC_SUB_X2::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+        initAtomicAccess<VecElemU64>(gpuDynInst);
+    } // initiateAcc
+
+    void
+    Inst_FLAT__FLAT_ATOMIC_SUB_X2::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+        if (isAtomicRet()) {
+            VecOperandU64 vdst(gpuDynInst, extData.VDST);
+
+
+            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                if (gpuDynInst->exec_mask[lane]) {
+                    vdst[lane] = (reinterpret_cast<VecElemU64*>(
+                        gpuDynInst->d_data))[lane];
+                }
+            }
+
+            vdst.write();
+        }
+    } // completeAcc
+
     Inst_FLAT__FLAT_ATOMIC_SMIN_X2::Inst_FLAT__FLAT_ATOMIC_SMIN_X2(
         InFmt_FLAT *iFmt)
         : Inst_FLAT(iFmt, "flat_atomic_smin_x2")
@@ -41326,9 +41586,75 @@ namespace Gcn3ISA
     void
     Inst_FLAT__FLAT_ATOMIC_INC_X2::execute(GPUDynInstPtr gpuDynInst)
     {
-        panicUnimplemented();
+        Wavefront *wf = gpuDynInst->wavefront();
+
+        if (wf->execMask().none()) {
+            wf->decVMemInstsIssued();
+            wf->decLGKMInstsIssued();
+            wf->wrGmReqsInPipe--;
+            wf->rdGmReqsInPipe--;
+            return;
+        }
+
+        gpuDynInst->execUnitId = wf->execUnitId;
+        gpuDynInst->exec_mask = wf->execMask();
+        gpuDynInst->latency.init(gpuDynInst->computeUnit());
+        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
+
+        ConstVecOperandU64 addr(gpuDynInst, extData.ADDR);
+        ConstVecOperandU64 data(gpuDynInst, extData.DATA);
+
+        addr.read();
+        data.read();
+
+        calcAddr(gpuDynInst, addr);
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (gpuDynInst->exec_mask[lane]) {
+                (reinterpret_cast<VecElemU64*>(gpuDynInst->a_data))[lane]
+                    = data[lane];
+            }
+        }
+
+        if (gpuDynInst->executedAs() == Enums::SC_GLOBAL) {
+            gpuDynInst->computeUnit()->globalMemoryPipe.
+                issueRequest(gpuDynInst);
+            wf->wrGmReqsInPipe--;
+            wf->outstandingReqsWrGm++;
+            wf->rdGmReqsInPipe--;
+            wf->outstandingReqsRdGm++;
+        } else {
+            fatal("Non global flat instructions not implemented yet.\n");
+        }
+
+        gpuDynInst->wavefront()->outstandingReqs++;
+        gpuDynInst->wavefront()->validateRequestCounters();
     }
 
+    void
+    Inst_FLAT__FLAT_ATOMIC_INC_X2::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+        initAtomicAccess<VecElemU64>(gpuDynInst);
+    } // initiateAcc
+
+    void
+    Inst_FLAT__FLAT_ATOMIC_INC_X2::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+        if (isAtomicRet()) {
+            VecOperandU64 vdst(gpuDynInst, extData.VDST);
+
+
+            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                if (gpuDynInst->exec_mask[lane]) {
+                    vdst[lane] = (reinterpret_cast<VecElemU64*>(
+                        gpuDynInst->d_data))[lane];
+                }
+            }
+
+            vdst.write();
+        }
+    } // completeAcc
+
     Inst_FLAT__FLAT_ATOMIC_DEC_X2::Inst_FLAT__FLAT_ATOMIC_DEC_X2(
         InFmt_FLAT *iFmt)
         : Inst_FLAT(iFmt, "flat_atomic_dec_x2")
@@ -41353,6 +41679,72 @@ namespace Gcn3ISA
     void
     Inst_FLAT__FLAT_ATOMIC_DEC_X2::execute(GPUDynInstPtr gpuDynInst)
     {
-        panicUnimplemented();
+        Wavefront *wf = gpuDynInst->wavefront();
+
+        if (wf->execMask().none()) {
+            wf->decVMemInstsIssued();
+            wf->decLGKMInstsIssued();
+            wf->wrGmReqsInPipe--;
+            wf->rdGmReqsInPipe--;
+            return;
+        }
+
+        gpuDynInst->execUnitId = wf->execUnitId;
+        gpuDynInst->exec_mask = wf->execMask();
+        gpuDynInst->latency.init(gpuDynInst->computeUnit());
+        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
+
+        ConstVecOperandU64 addr(gpuDynInst, extData.ADDR);
+        ConstVecOperandU64 data(gpuDynInst, extData.DATA);
+
+        addr.read();
+        data.read();
+
+        calcAddr(gpuDynInst, addr);
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (gpuDynInst->exec_mask[lane]) {
+                (reinterpret_cast<VecElemU64*>(gpuDynInst->a_data))[lane]
+                    = data[lane];
+            }
+        }
+
+        if (gpuDynInst->executedAs() == Enums::SC_GLOBAL) {
+            gpuDynInst->computeUnit()->globalMemoryPipe.
+                issueRequest(gpuDynInst);
+            wf->wrGmReqsInPipe--;
+            wf->outstandingReqsWrGm++;
+            wf->rdGmReqsInPipe--;
+            wf->outstandingReqsRdGm++;
+        } else {
+            fatal("Non global flat instructions not implemented yet.\n");
+        }
+
+        gpuDynInst->wavefront()->outstandingReqs++;
+        gpuDynInst->wavefront()->validateRequestCounters();
     }
+
+    void
+    Inst_FLAT__FLAT_ATOMIC_DEC_X2::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+        initAtomicAccess<VecElemU64>(gpuDynInst);
+    } // initiateAcc
+
+    void
+    Inst_FLAT__FLAT_ATOMIC_DEC_X2::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+        if (isAtomicRet()) {
+            VecOperandU64 vdst(gpuDynInst, extData.VDST);
+
+
+            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                if (gpuDynInst->exec_mask[lane]) {
+                    vdst[lane] = (reinterpret_cast<VecElemU64*>(
+                        gpuDynInst->d_data))[lane];
+                }
+            }
+
+            vdst.write();
+        }
+    } // completeAcc
 } // namespace Gcn3ISA
diff --git a/src/arch/gcn3/insts/instructions.hh b/src/arch/gcn3/insts/instructions.hh
index f561043de..471c13068 100644
--- a/src/arch/gcn3/insts/instructions.hh
+++ b/src/arch/gcn3/insts/instructions.hh
@@ -80189,6 +80189,8 @@ namespace Gcn3ISA
         } // isDstOperand
 
         void execute(GPUDynInstPtr) override;
+        void initiateAcc(GPUDynInstPtr) override;
+        void completeAcc(GPUDynInstPtr) override;
     }; // Inst_FLAT__FLAT_ATOMIC_SUB
 
     class Inst_FLAT__FLAT_ATOMIC_SMIN : public Inst_FLAT
@@ -80717,6 +80719,8 @@ namespace Gcn3ISA
         } // isDstOperand
 
         void execute(GPUDynInstPtr) override;
+        void initiateAcc(GPUDynInstPtr) override;
+        void completeAcc(GPUDynInstPtr) override;
     }; // Inst_FLAT__FLAT_ATOMIC_INC
 
     class Inst_FLAT__FLAT_ATOMIC_DEC : public Inst_FLAT
@@ -80783,6 +80787,8 @@ namespace Gcn3ISA
         } // isDstOperand
 
         void execute(GPUDynInstPtr) override;
+        void initiateAcc(GPUDynInstPtr) override;
+        void completeAcc(GPUDynInstPtr) override;
     }; // Inst_FLAT__FLAT_ATOMIC_DEC
 
     class Inst_FLAT__FLAT_ATOMIC_SWAP_X2 : public Inst_FLAT
@@ -81051,6 +81057,8 @@ namespace Gcn3ISA
         } // isDstOperand
 
         void execute(GPUDynInstPtr) override;
+        void initiateAcc(GPUDynInstPtr) override;
+        void completeAcc(GPUDynInstPtr) override;
     }; // Inst_FLAT__FLAT_ATOMIC_SUB_X2
 
     class Inst_FLAT__FLAT_ATOMIC_SMIN_X2 : public Inst_FLAT
@@ -81579,6 +81587,8 @@ namespace Gcn3ISA
         } // isDstOperand
 
        void execute(GPUDynInstPtr) override;
+        void initiateAcc(GPUDynInstPtr) override;
+        void completeAcc(GPUDynInstPtr) override;
     }; // Inst_FLAT__FLAT_ATOMIC_INC_X2
 
     class Inst_FLAT__FLAT_ATOMIC_DEC_X2 : public Inst_FLAT
@@ -81645,6 +81655,8 @@ namespace Gcn3ISA
         } // isDstOperand
 
         void execute(GPUDynInstPtr) override;
+        void initiateAcc(GPUDynInstPtr) override;
+        void completeAcc(GPUDynInstPtr) override;
     }; // Inst_FLAT__FLAT_ATOMIC_DEC_X2
 } // namespace Gcn3ISA
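
For reference, the execute() methods above only stage each lane's operand
into a_data and issue the request; the atomic arithmetic itself is applied
by the memory system after initAtomicAccess<T>() initiates the access, and
completeAcc() then copies the pre-operation value from d_data back to VDST
for the atomic-with-return forms. The sketch below is a minimal, standalone
C++ model of the per-lane semantics being requested, assuming the wraparound
rules the GCN3 ISA manual gives for atomic sub/inc/dec; atomicSub, atomicInc,
and atomicDec are illustrative helper names, not gem5 functions.

    #include <cstdint>
    #include <iostream>

    // Each helper returns the pre-operation value, mirroring what the
    // returning form of the instruction writes back to VDST.
    uint32_t atomicSub(uint32_t &mem, uint32_t data)
    {
        uint32_t tmp = mem;
        mem = tmp - data;        // plain unsigned wraparound subtract
        return tmp;
    }

    // flat_atomic_inc: increment, wrapping to 0 once the value reaches DATA.
    uint32_t atomicInc(uint32_t &mem, uint32_t data)
    {
        uint32_t tmp = mem;
        mem = (tmp >= data) ? 0 : tmp + 1;
        return tmp;
    }

    // flat_atomic_dec: decrement, resetting to DATA at 0 or above DATA.
    uint32_t atomicDec(uint32_t &mem, uint32_t data)
    {
        uint32_t tmp = mem;
        mem = (tmp == 0 || tmp > data) ? data : tmp - 1;
        return tmp;
    }

    int main()
    {
        uint32_t counter = 6;
        atomicInc(counter, 7);           // 6 < 7: counter becomes 7
        atomicInc(counter, 7);           // 7 >= 7: wraps to 0
        std::cout << counter << '\n';    // prints 0
        atomicDec(counter, 7);           // 0: resets to DATA (7)
        std::cout << counter << '\n';    // prints 7
        return 0;
    }

The x2 variants apply the same rules to 64-bit values, which is why their
execute(), initiateAcc(), and completeAcc() bodies switch the operand and
cast types from U32 to U64.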