From ed3135ea6a65b442628376f678fd9f6684921a22 Mon Sep 17 00:00:00 2001
From: Matt Sinclair
Date: Sat, 7 Jul 2018 19:10:06 -0400
Subject: [PATCH] arch-gcn3: implement multi-dword buffer loads and stores

Add support for all multi-dword buffer loads and stores:
buffer_load_dword x2, x3, and x4 and buffer_store_dword x2, x3, and x4

Change-Id: I4017b6b4f625fc92002ce8ade695ae29700fa55e
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29946
Maintainer: Anthony Gutierrez
Tested-by: kokoro
Reviewed-by: Matt Sinclair
---
 src/arch/gcn3/insts/instructions.cc | 495 +++++++++++++++++++++++++++-
 src/arch/gcn3/insts/op_encodings.hh |  27 ++
 2 files changed, 504 insertions(+), 18 deletions(-)

diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc
index 817b33916..b852281b1 100644
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -34777,7 +34777,11 @@ namespace Gcn3ISA
     {
         setFlag(MemoryRef);
         setFlag(Load);
-        setFlag(GlobalSegment);
+        if (instData.LDS) {
+            setFlag(GroupSegment);
+        } else {
+            setFlag(GlobalSegment);
+        }
     } // Inst_MUBUF__BUFFER_LOAD_DWORDX2

     Inst_MUBUF__BUFFER_LOAD_DWORDX2::~Inst_MUBUF__BUFFER_LOAD_DWORDX2()
@@ -34788,17 +34792,88 @@ namespace Gcn3ISA
     void
     Inst_MUBUF__BUFFER_LOAD_DWORDX2::execute(GPUDynInstPtr gpuDynInst)
     {
-        panicUnimplemented();
-    }
+        Wavefront *wf = gpuDynInst->wavefront();
+        gpuDynInst->execUnitId = wf->execUnitId;
+        gpuDynInst->exec_mask = wf->execMask();
+        gpuDynInst->latency.init(gpuDynInst->computeUnit());
+        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
+
+        ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR);
+        ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1);
+        ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4);
+        ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET);
+
+        rsrcDesc.read();
+        offset.read();
+
+        int inst_offset = instData.OFFSET;
+
+        if (!instData.IDXEN && !instData.OFFEN) {
+            calcAddr<ConstVecOperandU32, ConstVecOperandU32,
+                ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
+                    addr0, addr1, rsrcDesc, offset, inst_offset);
+        } else if (!instData.IDXEN && instData.OFFEN) {
+            addr0.read();
+            calcAddr<ConstVecOperandU32, ConstVecOperandU32,
+                ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
+                    addr0, addr1, rsrcDesc, offset, inst_offset);
+        } else if (instData.IDXEN && !instData.OFFEN) {
+            addr0.read();
+            calcAddr<ConstVecOperandU32, ConstVecOperandU32,
+                ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
+                    addr1, addr0, rsrcDesc, offset, inst_offset);
+        } else {
+            addr0.read();
+            addr1.read();
+            calcAddr<ConstVecOperandU32, ConstVecOperandU32,
+                ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
+                    addr1, addr0, rsrcDesc, offset, inst_offset);
+        }
+
+        if (isLocalMem()) {
+            gpuDynInst->computeUnit()->localMemoryPipe
+                .issueRequest(gpuDynInst);
+            wf->rdLmReqsInPipe--;
+            wf->outstandingReqsRdLm++;
+        } else {
+            gpuDynInst->computeUnit()->globalMemoryPipe
+                .issueRequest(gpuDynInst);
+            wf->rdGmReqsInPipe--;
+            wf->outstandingReqsRdGm++;
+        }
+
+        wf->outstandingReqs++;
+        wf->validateRequestCounters();
+    } // execute

     void
     Inst_MUBUF__BUFFER_LOAD_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst)
     {
+        initMemRead<2>(gpuDynInst);
     } // initiateAcc

     void
     Inst_MUBUF__BUFFER_LOAD_DWORDX2::completeAcc(GPUDynInstPtr gpuDynInst)
     {
+        VecOperandU32 vdst0(gpuDynInst, extData.VDATA);
+        VecOperandU32 vdst1(gpuDynInst, extData.VDATA + 1);
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (gpuDynInst->exec_mask[lane]) {
+                if (!oobMask[lane]) {
+                    vdst0[lane] = (reinterpret_cast<VecElemU32*>(
+                        gpuDynInst->d_data))[lane * 2];
+                    vdst1[lane] = (reinterpret_cast<VecElemU32*>(
+                        gpuDynInst->d_data))[lane * 2 + 1];
+                } else {
+                    vdst0[lane] = 0;
+                    vdst1[lane] = 0;
+                }
+            }
+        }
+
+        vdst0.write();
+        vdst1.write();
     } // completeAcc

     Inst_MUBUF__BUFFER_LOAD_DWORDX3
@@ -34807,7 +34882,11 @@ namespace Gcn3ISA
     {
         setFlag(MemoryRef);
         setFlag(Load);
-        setFlag(GlobalSegment);
+        if (instData.LDS) {
+            setFlag(GroupSegment);
+        } else {
+            setFlag(GlobalSegment);
+        }
     } // Inst_MUBUF__BUFFER_LOAD_DWORDX3

     Inst_MUBUF__BUFFER_LOAD_DWORDX3::~Inst_MUBUF__BUFFER_LOAD_DWORDX3()
@@ -34818,17 +34897,93 @@ namespace Gcn3ISA
     void
     Inst_MUBUF__BUFFER_LOAD_DWORDX3::execute(GPUDynInstPtr gpuDynInst)
     {
-        panicUnimplemented();
-    }
+        Wavefront *wf = gpuDynInst->wavefront();
+        gpuDynInst->execUnitId = wf->execUnitId;
+        gpuDynInst->exec_mask = wf->execMask();
+        gpuDynInst->latency.init(gpuDynInst->computeUnit());
+        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
+
+        ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR);
+        ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1);
+        ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4);
+        ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET);
+
+        rsrcDesc.read();
+        offset.read();
+
+        int inst_offset = instData.OFFSET;
+
+        if (!instData.IDXEN && !instData.OFFEN) {
+            calcAddr<ConstVecOperandU32, ConstVecOperandU32,
+                ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
+                    addr0, addr1, rsrcDesc, offset, inst_offset);
+        } else if (!instData.IDXEN && instData.OFFEN) {
+            addr0.read();
+            calcAddr<ConstVecOperandU32, ConstVecOperandU32,
+                ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
+                    addr0, addr1, rsrcDesc, offset, inst_offset);
+        } else if (instData.IDXEN && !instData.OFFEN) {
+            addr0.read();
+            calcAddr<ConstVecOperandU32, ConstVecOperandU32,
+                ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
+                    addr1, addr0, rsrcDesc, offset, inst_offset);
+        } else {
+            addr0.read();
+            addr1.read();
+            calcAddr<ConstVecOperandU32, ConstVecOperandU32,
+                ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
+                    addr1, addr0, rsrcDesc, offset, inst_offset);
+        }
+
+        if (isLocalMem()) {
+            gpuDynInst->computeUnit()->localMemoryPipe
+                .issueRequest(gpuDynInst);
+            wf->rdLmReqsInPipe--;
+            wf->outstandingReqsRdLm++;
+        } else {
+            gpuDynInst->computeUnit()->globalMemoryPipe
+                .issueRequest(gpuDynInst);
+            wf->rdGmReqsInPipe--;
+            wf->outstandingReqsRdGm++;
+        }
+
+        wf->outstandingReqs++;
+        wf->validateRequestCounters();
+    } // execute

     void
     Inst_MUBUF__BUFFER_LOAD_DWORDX3::initiateAcc(GPUDynInstPtr gpuDynInst)
     {
+        initMemRead<3>(gpuDynInst);
     } // initiateAcc

     void
     Inst_MUBUF__BUFFER_LOAD_DWORDX3::completeAcc(GPUDynInstPtr gpuDynInst)
     {
+        VecOperandU32 vdst0(gpuDynInst, extData.VDATA);
+        VecOperandU32 vdst1(gpuDynInst, extData.VDATA + 1);
+        VecOperandU32 vdst2(gpuDynInst, extData.VDATA + 2);
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (gpuDynInst->exec_mask[lane]) {
+                if (!oobMask[lane]) {
+                    vdst0[lane] = (reinterpret_cast<VecElemU32*>(
+                        gpuDynInst->d_data))[lane * 3];
+                    vdst1[lane] = (reinterpret_cast<VecElemU32*>(
+                        gpuDynInst->d_data))[lane * 3 + 1];
+                    vdst2[lane] = (reinterpret_cast<VecElemU32*>(
+                        gpuDynInst->d_data))[lane * 3 + 2];
+                } else {
+                    vdst0[lane] = 0;
+                    vdst1[lane] = 0;
+                    vdst2[lane] = 0;
+                }
+            }
+        }
+
+        vdst0.write();
+        vdst1.write();
+        vdst2.write();
     } // completeAcc

     Inst_MUBUF__BUFFER_LOAD_DWORDX4
@@ -34837,7 +34992,11 @@ namespace Gcn3ISA
     {
         setFlag(MemoryRef);
         setFlag(Load);
-        setFlag(GlobalSegment);
+        if (instData.LDS) {
+            setFlag(GroupSegment);
+        } else {
+            setFlag(GlobalSegment);
+        }
     } // Inst_MUBUF__BUFFER_LOAD_DWORDX4

     Inst_MUBUF__BUFFER_LOAD_DWORDX4::~Inst_MUBUF__BUFFER_LOAD_DWORDX4()
@@ -34848,17 +35007,98 @@ namespace Gcn3ISA
     void
     Inst_MUBUF__BUFFER_LOAD_DWORDX4::execute(GPUDynInstPtr gpuDynInst)
     {
-        panicUnimplemented();
-    }
+        Wavefront *wf = gpuDynInst->wavefront();
+        gpuDynInst->execUnitId = wf->execUnitId;
+        gpuDynInst->exec_mask = wf->execMask();
+        gpuDynInst->latency.init(gpuDynInst->computeUnit());
+        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
+
+        ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR);
+        ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1);
+        ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4);
+        ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET);
+
+        rsrcDesc.read();
+        offset.read();
+
+        int inst_offset = instData.OFFSET;
+
+        if (!instData.IDXEN && !instData.OFFEN) {
+            calcAddr<ConstVecOperandU32, ConstVecOperandU32,
+                ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
+                    addr0, addr1, rsrcDesc, offset, inst_offset);
+        } else if (!instData.IDXEN && instData.OFFEN) {
+            addr0.read();
+            calcAddr<ConstVecOperandU32, ConstVecOperandU32,
+                ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
+                    addr0, addr1, rsrcDesc, offset, inst_offset);
+        } else if (instData.IDXEN && !instData.OFFEN) {
+            addr0.read();
+            calcAddr<ConstVecOperandU32, ConstVecOperandU32,
+                ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
+                    addr1, addr0, rsrcDesc, offset, inst_offset);
+        } else {
+            addr0.read();
+            addr1.read();
+            calcAddr<ConstVecOperandU32, ConstVecOperandU32,
+                ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
+                    addr1, addr0, rsrcDesc, offset, inst_offset);
+        }
+
+        if (isLocalMem()) {
+            gpuDynInst->computeUnit()->localMemoryPipe
+                .issueRequest(gpuDynInst);
+            wf->rdLmReqsInPipe--;
+            wf->outstandingReqsRdLm++;
+        } else {
+            gpuDynInst->computeUnit()->globalMemoryPipe
+                .issueRequest(gpuDynInst);
+            wf->rdGmReqsInPipe--;
+            wf->outstandingReqsRdGm++;
+        }
+
+        wf->outstandingReqs++;
+        wf->validateRequestCounters();
+    } // execute

     void
     Inst_MUBUF__BUFFER_LOAD_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst)
     {
+        initMemRead<4>(gpuDynInst);
     } // initiateAcc

     void
     Inst_MUBUF__BUFFER_LOAD_DWORDX4::completeAcc(GPUDynInstPtr gpuDynInst)
     {
+        VecOperandU32 vdst0(gpuDynInst, extData.VDATA);
+        VecOperandU32 vdst1(gpuDynInst, extData.VDATA + 1);
+        VecOperandU32 vdst2(gpuDynInst, extData.VDATA + 2);
+        VecOperandU32 vdst3(gpuDynInst, extData.VDATA + 3);
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (gpuDynInst->exec_mask[lane]) {
+                if (!oobMask[lane]) {
+                    vdst0[lane] = (reinterpret_cast<VecElemU32*>(
+                        gpuDynInst->d_data))[lane * 4];
+                    vdst1[lane] = (reinterpret_cast<VecElemU32*>(
+                        gpuDynInst->d_data))[lane * 4 + 1];
+                    vdst2[lane] = (reinterpret_cast<VecElemU32*>(
+                        gpuDynInst->d_data))[lane * 4 + 2];
+                    vdst3[lane] = (reinterpret_cast<VecElemU32*>(
+                        gpuDynInst->d_data))[lane * 4 + 3];
+                } else {
+                    vdst0[lane] = 0;
+                    vdst1[lane] = 0;
+                    vdst2[lane] = 0;
+                    vdst3[lane] = 0;
+                }
+            }
+        }
+
+        vdst0.write();
+        vdst1.write();
+        vdst2.write();
+        vdst3.write();
     } // completeAcc

     Inst_MUBUF__BUFFER_STORE_BYTE
@@ -35155,7 +35395,11 @@ namespace Gcn3ISA
     {
         setFlag(MemoryRef);
         setFlag(Store);
-        setFlag(GlobalSegment);
+        if (instData.LDS) {
+            setFlag(GroupSegment);
+        } else {
+            setFlag(GlobalSegment);
+        }
     } // Inst_MUBUF__BUFFER_STORE_DWORDX2

     Inst_MUBUF__BUFFER_STORE_DWORDX2::~Inst_MUBUF__BUFFER_STORE_DWORDX2()
@@ -35166,12 +35410,77 @@ namespace Gcn3ISA
     void
     Inst_MUBUF__BUFFER_STORE_DWORDX2::execute(GPUDynInstPtr gpuDynInst)
     {
-        panicUnimplemented();
-    }
+        Wavefront *wf = gpuDynInst->wavefront();
+        gpuDynInst->execUnitId = wf->execUnitId;
+        gpuDynInst->exec_mask = wf->execMask();
+        gpuDynInst->latency.init(gpuDynInst->computeUnit());
+        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
+
+        ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR);
+        ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1);
+        ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4);
+        ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET);
+        ConstVecOperandU32 data0(gpuDynInst, extData.VDATA);
+        ConstVecOperandU32 data1(gpuDynInst, extData.VDATA + 1);
+
+        rsrcDesc.read();
+        offset.read();
+        data0.read();
+        data1.read();
+
+        int inst_offset = instData.OFFSET;
+
+        if (!instData.IDXEN && !instData.OFFEN) {
+            calcAddr<ConstVecOperandU32, ConstVecOperandU32,
+                ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
+                    addr0, addr1, rsrcDesc, offset, inst_offset);
+        } else if (!instData.IDXEN && instData.OFFEN) {
+            addr0.read();
+            calcAddr<ConstVecOperandU32, ConstVecOperandU32,
+                ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
+                    addr0, addr1, rsrcDesc, offset, inst_offset);
+        } else if (instData.IDXEN && !instData.OFFEN) {
+            addr0.read();
+            calcAddr<ConstVecOperandU32, ConstVecOperandU32,
+                ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
+                    addr1, addr0, rsrcDesc, offset, inst_offset);
+        } else {
+            addr0.read();
+            addr1.read();
+            calcAddr<ConstVecOperandU32, ConstVecOperandU32,
+                ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
+                    addr1, addr0, rsrcDesc, offset, inst_offset);
+        }
+
+        if (isLocalMem()) {
+            gpuDynInst->computeUnit()->localMemoryPipe
+                .issueRequest(gpuDynInst);
+            wf->wrLmReqsInPipe--;
+            wf->outstandingReqsWrLm++;
+        } else {
+            gpuDynInst->computeUnit()->globalMemoryPipe
+                .issueRequest(gpuDynInst);
+            wf->wrGmReqsInPipe--;
+            wf->outstandingReqsWrGm++;
+        }
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (gpuDynInst->exec_mask[lane]) {
+                (reinterpret_cast<VecElemU32*>(gpuDynInst->d_data))[lane * 2]
+                    = data0[lane];
+                (reinterpret_cast<VecElemU32*>(gpuDynInst->d_data))[lane*2 + 1]
+                    = data1[lane];
+            }
+        }
+
+        wf->outstandingReqs++;
+        wf->validateRequestCounters();
+    } // execute

     void
     Inst_MUBUF__BUFFER_STORE_DWORDX2::initiateAcc(GPUDynInstPtr gpuDynInst)
     {
+        initMemWrite<2>(gpuDynInst);
     } // initiateAcc

     void
@@ -35185,7 +35494,11 @@ namespace Gcn3ISA
     {
         setFlag(MemoryRef);
         setFlag(Store);
-        setFlag(GlobalSegment);
+        if (instData.LDS) {
+            setFlag(GroupSegment);
+        } else {
+            setFlag(GlobalSegment);
+        }
     } // Inst_MUBUF__BUFFER_STORE_DWORDX3

     Inst_MUBUF__BUFFER_STORE_DWORDX3::~Inst_MUBUF__BUFFER_STORE_DWORDX3()
@@ -35196,12 +35509,81 @@ namespace Gcn3ISA
     void
     Inst_MUBUF__BUFFER_STORE_DWORDX3::execute(GPUDynInstPtr gpuDynInst)
     {
-        panicUnimplemented();
-    }
+        Wavefront *wf = gpuDynInst->wavefront();
+        gpuDynInst->execUnitId = wf->execUnitId;
+        gpuDynInst->exec_mask = wf->execMask();
+        gpuDynInst->latency.init(gpuDynInst->computeUnit());
+        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
+
+        ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR);
+        ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1);
+        ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4);
+        ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET);
+        ConstVecOperandU32 data0(gpuDynInst, extData.VDATA);
+        ConstVecOperandU32 data1(gpuDynInst, extData.VDATA + 1);
+        ConstVecOperandU32 data2(gpuDynInst, extData.VDATA + 2);
+
+        rsrcDesc.read();
+        offset.read();
+        data0.read();
+        data1.read();
+        data2.read();
+
+        int inst_offset = instData.OFFSET;
+
+        if (!instData.IDXEN && !instData.OFFEN) {
+            calcAddr<ConstVecOperandU32, ConstVecOperandU32,
+                ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
+                    addr0, addr1, rsrcDesc, offset, inst_offset);
+        } else if (!instData.IDXEN && instData.OFFEN) {
+            addr0.read();
+            calcAddr<ConstVecOperandU32, ConstVecOperandU32,
+                ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
+                    addr0, addr1, rsrcDesc, offset, inst_offset);
+        } else if (instData.IDXEN && !instData.OFFEN) {
+            addr0.read();
+            calcAddr<ConstVecOperandU32, ConstVecOperandU32,
+                ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
+                    addr1, addr0, rsrcDesc, offset, inst_offset);
+        } else {
+            addr0.read();
+            addr1.read();
+            calcAddr<ConstVecOperandU32, ConstVecOperandU32,
+                ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
+                    addr1, addr0, rsrcDesc, offset, inst_offset);
+        }
+
+        if (isLocalMem()) {
+            gpuDynInst->computeUnit()->localMemoryPipe
+                .issueRequest(gpuDynInst);
+            wf->wrLmReqsInPipe--;
+            wf->outstandingReqsWrLm++;
+        } else {
+            gpuDynInst->computeUnit()->globalMemoryPipe
+                .issueRequest(gpuDynInst);
+            wf->wrGmReqsInPipe--;
+            wf->outstandingReqsWrGm++;
+        }
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (gpuDynInst->exec_mask[lane]) {
+                (reinterpret_cast<VecElemU32*>(gpuDynInst->d_data))[lane * 3]
+                    = data0[lane];
+                (reinterpret_cast<VecElemU32*>(gpuDynInst->d_data))[lane*3 + 1]
+                    = data1[lane];
+                (reinterpret_cast<VecElemU32*>(gpuDynInst->d_data))[lane*3 + 2]
+                    = data2[lane];
+            }
+        }
+
+        wf->outstandingReqs++;
+        wf->validateRequestCounters();
+    } // execute

     void
     Inst_MUBUF__BUFFER_STORE_DWORDX3::initiateAcc(GPUDynInstPtr gpuDynInst)
     {
+        initMemWrite<3>(gpuDynInst);
     } // initiateAcc

     void
@@ -35215,7 +35597,11 @@ namespace Gcn3ISA
     {
         setFlag(MemoryRef);
         setFlag(Store);
-        setFlag(GlobalSegment);
+        if (instData.LDS) {
+            setFlag(GroupSegment);
+        } else {
+            setFlag(GlobalSegment);
+        }
     } // Inst_MUBUF__BUFFER_STORE_DWORDX4

     Inst_MUBUF__BUFFER_STORE_DWORDX4::~Inst_MUBUF__BUFFER_STORE_DWORDX4()
@@ -35226,12 +35612,85 @@ namespace Gcn3ISA
     void
     Inst_MUBUF__BUFFER_STORE_DWORDX4::execute(GPUDynInstPtr gpuDynInst)
     {
-        panicUnimplemented();
-    }
+        Wavefront *wf = gpuDynInst->wavefront();
+        gpuDynInst->execUnitId = wf->execUnitId;
+        gpuDynInst->exec_mask = wf->execMask();
+        gpuDynInst->latency.init(gpuDynInst->computeUnit());
+        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
+
+        ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR);
+        ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1);
+        ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4);
+        ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET);
+        ConstVecOperandU32 data0(gpuDynInst, extData.VDATA);
+        ConstVecOperandU32 data1(gpuDynInst, extData.VDATA + 1);
+        ConstVecOperandU32 data2(gpuDynInst, extData.VDATA + 2);
+        ConstVecOperandU32 data3(gpuDynInst, extData.VDATA + 3);
+
+        rsrcDesc.read();
+        offset.read();
+        data0.read();
+        data1.read();
+        data2.read();
+        data3.read();
+
+        int inst_offset = instData.OFFSET;
+
+        if (!instData.IDXEN && !instData.OFFEN) {
+            calcAddr<ConstVecOperandU32, ConstVecOperandU32,
+                ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
+                    addr0, addr1, rsrcDesc, offset, inst_offset);
+        } else if (!instData.IDXEN && instData.OFFEN) {
+            addr0.read();
+            calcAddr<ConstVecOperandU32, ConstVecOperandU32,
+                ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
+                    addr0, addr1, rsrcDesc, offset, inst_offset);
+        } else if (instData.IDXEN && !instData.OFFEN) {
+            addr0.read();
+            calcAddr<ConstVecOperandU32, ConstVecOperandU32,
+                ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
+                    addr1, addr0, rsrcDesc, offset, inst_offset);
+        } else {
+            addr0.read();
+            addr1.read();
+            calcAddr<ConstVecOperandU32, ConstVecOperandU32,
+                ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
+                    addr1, addr0, rsrcDesc, offset, inst_offset);
+        }
+
+        if (isLocalMem()) {
+            gpuDynInst->computeUnit()->localMemoryPipe
+                .issueRequest(gpuDynInst);
+            wf->wrLmReqsInPipe--;
+            wf->outstandingReqsWrLm++;
+        } else {
+            gpuDynInst->computeUnit()->globalMemoryPipe
+                .issueRequest(gpuDynInst);
+            wf->wrGmReqsInPipe--;
+            wf->outstandingReqsWrGm++;
+        }
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (gpuDynInst->exec_mask[lane]) {
+                (reinterpret_cast<VecElemU32*>(gpuDynInst->d_data))[lane * 4]
+                    = data0[lane];
+                (reinterpret_cast<VecElemU32*>(gpuDynInst->d_data))[lane*4 + 1]
+                    = data1[lane];
+                (reinterpret_cast<VecElemU32*>(gpuDynInst->d_data))[lane*4 + 2]
+                    = data2[lane];
+                (reinterpret_cast<VecElemU32*>(gpuDynInst->d_data))[lane*4 + 3]
+                    = data3[lane];
+            }
+        }
+
+        wf->outstandingReqs++;
+        wf->validateRequestCounters();
+    } // execute

     void
     Inst_MUBUF__BUFFER_STORE_DWORDX4::initiateAcc(GPUDynInstPtr gpuDynInst)
     {
+        initMemWrite<4>(gpuDynInst);
     } // initiateAcc

     void
diff --git a/src/arch/gcn3/insts/op_encodings.hh b/src/arch/gcn3/insts/op_encodings.hh
index 4f151b919..4056f0a93 100644
--- a/src/arch/gcn3/insts/op_encodings.hh
+++ b/src/arch/gcn3/insts/op_encodings.hh
@@ -505,6 +505,20 @@ namespace Gcn3ISA
             gpuDynInst->exec_mask = old_exec_mask;
         }
+
+        template<int N>
+        void
+        initMemRead(GPUDynInstPtr gpuDynInst)
+        {
+            // temporarily modify exec_mask to suppress memory accesses to oob
+            // regions. Only issue memory requests for lanes that have their
+            // exec_mask set and are not out of bounds.
+            VectorMask old_exec_mask = gpuDynInst->exec_mask;
+            gpuDynInst->exec_mask &= ~oobMask;
+            initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::ReadReq);
+            gpuDynInst->exec_mask = old_exec_mask;
+        }
+
         template<typename T>
         void
         initMemWrite(GPUDynInstPtr gpuDynInst)
         {
@@ -518,6 +532,19 @@ namespace Gcn3ISA
             gpuDynInst->exec_mask = old_exec_mask;
         }

+        template<int N>
+        void
+        initMemWrite(GPUDynInstPtr gpuDynInst)
+        {
+            // temporarily modify exec_mask to suppress memory accesses to oob
+            // regions. Only issue memory requests for lanes that have their
+            // exec_mask set and are not out of bounds.
+            VectorMask old_exec_mask = gpuDynInst->exec_mask;
+            gpuDynInst->exec_mask &= ~oobMask;
+            initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::WriteReq);
+            gpuDynInst->exec_mask = old_exec_mask;
+        }
+
         void
         injectGlobalMemFence(GPUDynInstPtr gpuDynInst)
         {
-- 
2.30.2
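A note on the d_data layout the patch relies on: initMemReqHelper<VecElemU32, N>
builds one N-dword request per active lane and points each request at
&d_data[lane * N], so the stores' execute() methods must pack lane data with
stride N, and the loads' completeAcc() methods unpack with the same stride.
The standalone sketch below (not part of the applied patch) checks that
contract outside of gem5; the 4-lane count, the register values, and the
main() harness are invented for illustration, while the indexing mirrors the
patch (a real GCN3 wavefront has NumVecElemPerVecReg = 64 lanes).

    // Minimal sketch of the lane-major d_data packing used by the
    // multi-dword MUBUF stores above. Compiles standalone with any C++11
    // compiler; none of this is gem5 code.
    #include <cassert>
    #include <cstdint>
    #include <vector>

    using VecElemU32 = std::uint32_t;

    int
    main()
    {
        constexpr int num_lanes = 4; // stand-in for NumVecElemPerVecReg (64)
        constexpr int N = 2;         // dwords per lane, as in DWORDX2

        // Stand-ins for the data0/data1 vector registers of a store.
        std::vector<VecElemU32> data0 = {10, 11, 12, 13};
        std::vector<VecElemU32> data1 = {20, 21, 22, 23};

        // Pack exactly as BUFFER_STORE_DWORDX2::execute does:
        // element i of lane l goes to d_data[l * N + i].
        std::vector<VecElemU32> d_data(num_lanes * N);
        for (int lane = 0; lane < num_lanes; ++lane) {
            d_data[lane * N] = data0[lane];
            d_data[lane * N + 1] = data1[lane];
        }

        // A helper that issues one N-dword request per lane starting at
        // &d_data[lane * N] must see that lane's own register pair.
        for (int lane = 0; lane < num_lanes; ++lane) {
            const VecElemU32 *req_data = &d_data[lane * N];
            assert(req_data[0] == data0[lane]);
            assert(req_data[1] == data1[lane]);
        }

        return 0;
    }

Running this exits cleanly when the pack stride matches the request stride;
repacking with stride 4 instead of 2, which a dwordx2 store must not do,
trips the assertions, since each lane's request would then read another
lane's data or garbage.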