From ed7daa10aae7639ff3435910721d0b0c292d5d53 Mon Sep 17 00:00:00 2001
From: Michael LeBeane
Date: Thu, 31 May 2018 13:03:38 -0400
Subject: [PATCH] arch-gcn3, gpu-compute: Implement out-of-range accesses

Certain buffer out-of-range memory accesses should be special-cased and
should not generate memory accesses. This patch implements those special
cases and suppresses lanes from accessing memory when the calculated
address falls in an ISA-specified out-of-range condition.

Change-Id: I8298f861c6b59587789853a01e503ba7d98cb13d
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29935
Tested-by: kokoro
Reviewed-by: Anthony Gutierrez
Reviewed-by: Matt Sinclair
Maintainer: Anthony Gutierrez
---
 src/arch/gcn3/insts/instructions.cc       | 24 ++++++---
 src/arch/gcn3/insts/op_encodings.hh       | 64 +++++++++++++++++++++++
 src/gpu-compute/global_memory_pipeline.cc | 14 +++++
 3 files changed, 96 insertions(+), 6 deletions(-)

diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc
index b923eaeff..2e39bf5c4 100644
--- a/src/arch/gcn3/insts/instructions.cc
+++ b/src/arch/gcn3/insts/instructions.cc
@@ -34453,8 +34453,12 @@ namespace Gcn3ISA
 
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (gpuDynInst->exec_mask[lane]) {
-                vdst[lane] = (VecElemU32)((reinterpret_cast<VecElemU8*>(
-                    gpuDynInst->d_data))[lane]);
+                if (!oobMask[lane]) {
+                    vdst[lane] = (VecElemU32)((reinterpret_cast<VecElemU8*>(
+                        gpuDynInst->d_data))[lane]);
+                } else {
+                    vdst[lane] = 0;
+                }
             }
         }
 
@@ -34580,8 +34584,12 @@ namespace Gcn3ISA
 
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (gpuDynInst->exec_mask[lane]) {
-                vdst[lane] = (VecElemU32)((reinterpret_cast<VecElemU16*>(
-                    gpuDynInst->d_data))[lane]);
+                if (!oobMask[lane]) {
+                    vdst[lane] = (VecElemU32)((reinterpret_cast<VecElemU16*>(
+                        gpuDynInst->d_data))[lane]);
+                } else {
+                    vdst[lane] = 0;
+                }
             }
         }
 
@@ -34707,8 +34715,12 @@ namespace Gcn3ISA
 
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (gpuDynInst->exec_mask[lane]) {
-                vdst[lane] = (reinterpret_cast<VecElemU32*>(
-                    gpuDynInst->d_data))[lane];
+                if (!oobMask[lane]) {
+                    vdst[lane] = (reinterpret_cast<VecElemU32*>(
+                        gpuDynInst->d_data))[lane];
+                } else {
+                    vdst[lane] = 0;
+                }
             }
         }
 
diff --git a/src/arch/gcn3/insts/op_encodings.hh b/src/arch/gcn3/insts/op_encodings.hh
index 308560a5f..22c146a78 100644
--- a/src/arch/gcn3/insts/op_encodings.hh
+++ b/src/arch/gcn3/insts/op_encodings.hh
@@ -40,6 +40,7 @@
 #include "arch/gcn3/gpu_mem_helpers.hh"
 #include "arch/gcn3/insts/gpu_static_inst.hh"
 #include "arch/gcn3/operand.hh"
+#include "debug/GCN3.hh"
 #include "debug/GPUExec.hh"
 #include "mem/ruby/system/RubySystem.hh"
 
@@ -489,14 +490,26 @@ namespace Gcn3ISA
         void
         initMemRead(GPUDynInstPtr gpuDynInst)
         {
+            // temporarily modify exec_mask to suppress memory accesses
+            // to oob regions. Only issue memory requests for lanes that
+            // have their exec_mask set and are not out of bounds.
+            VectorMask old_exec_mask = gpuDynInst->exec_mask;
+            gpuDynInst->exec_mask &= ~oobMask;
             initMemReqHelper<T, 1>(gpuDynInst, MemCmd::ReadReq);
+            gpuDynInst->exec_mask = old_exec_mask;
         }
 
         template<typename T>
         void
         initMemWrite(GPUDynInstPtr gpuDynInst)
         {
+            // temporarily modify exec_mask to suppress memory accesses
+            // to oob regions. Only issue memory requests for lanes that
+            // have their exec_mask set and are not out of bounds.
+            VectorMask old_exec_mask = gpuDynInst->exec_mask;
+            gpuDynInst->exec_mask &= ~oobMask;
             initMemReqHelper<T, 1>(gpuDynInst, MemCmd::WriteReq);
+            gpuDynInst->exec_mask = old_exec_mask;
         }
 
         void
@@ -566,6 +579,42 @@ namespace Gcn3ISA
 
                     buf_off = v_off[lane] + inst_offset;
 
+
+                    /**
+                     * Range check behavior causes out of range accesses
+                     * to be treated differently. Out of range accesses
+                     * return 0 for loads and are ignored for stores. For
+                     * non-formatted accesses, this is done on a per-lane
+                     * basis.
+                     */
+                    if (rsrc_desc.stride == 0 || !rsrc_desc.swizzleEn) {
+                        if (buf_off + stride * buf_idx >=
+                            rsrc_desc.numRecords - s_offset.rawData()) {
+                            DPRINTF(GCN3, "mubuf out-of-bounds condition 1: "
+                                    "lane = %d, buffer_offset = %llx, "
+                                    "const_stride = %llx, "
+                                    "const_num_records = %llx\n",
+                                    lane, buf_off + stride * buf_idx,
+                                    rsrc_desc.stride, rsrc_desc.numRecords);
+                            oobMask.set(lane);
+                            continue;
+                        }
+                    }
+
+                    if (rsrc_desc.stride != 0 && rsrc_desc.swizzleEn) {
+                        if (buf_idx >= rsrc_desc.numRecords ||
+                            buf_off >= stride) {
+                            DPRINTF(GCN3, "mubuf out-of-bounds condition 2: "
+                                    "lane = %d, offset = %llx, "
+                                    "index = %llx, "
+                                    "const_num_records = %llx\n",
+                                    lane, buf_off, buf_idx,
+                                    rsrc_desc.numRecords);
+                            oobMask.set(lane);
+                            continue;
+                        }
+                    }
+
                     if (rsrc_desc.swizzleEn) {
                         Addr idx_stride = 8 << rsrc_desc.idxStride;
                         Addr elem_size = 2 << rsrc_desc.elemSize;
@@ -573,6 +622,12 @@ namespace Gcn3ISA
                         Addr idx_lsb = buf_idx % idx_stride;
                         Addr off_msb = buf_off / elem_size;
                         Addr off_lsb = buf_off % elem_size;
+                        DPRINTF(GCN3, "mubuf swizzled lane %d: "
+                                "idx_stride = %llx, elem_size = %llx, "
+                                "idx_msb = %llx, idx_lsb = %llx, "
+                                "off_msb = %llx, off_lsb = %llx\n",
+                                lane, idx_stride, elem_size, idx_msb, idx_lsb,
+                                off_msb, off_lsb);
 
                         vaddr += ((idx_msb * stride + off_msb * elem_size)
                             * idx_stride + idx_lsb * elem_size + off_lsb);
@@ -580,6 +635,11 @@ namespace Gcn3ISA
                         vaddr += buf_off + stride * buf_idx;
                     }
 
+                    DPRINTF(GCN3, "Calculating mubuf address for lane %d: "
+                            "vaddr = %llx, base_addr = %llx, "
+                            "stride = %llx, buf_idx = %llx, buf_off = %llx\n",
+                            lane, vaddr, base_addr, stride,
+                            buf_idx, buf_off);
                     gpuDynInst->addr.at(lane) = vaddr;
                 }
             }
@@ -589,6 +649,10 @@ namespace Gcn3ISA
         InFmt_MUBUF instData;
         // second instruction DWORD
         InFmt_MUBUF_1 extData;
+        // Mask of lanes with out-of-bounds accesses. Needs to be tracked
+        // separately from the exec_mask so that we remember to write zero
+        // to the registers associated with out of bounds lanes.
+        VectorMask oobMask;
     }; // Inst_MUBUF
 
     class Inst_MTBUF : public GCN3GPUStaticInst
diff --git a/src/gpu-compute/global_memory_pipeline.cc b/src/gpu-compute/global_memory_pipeline.cc
index c73184a59..cfd7c3db1 100644
--- a/src/gpu-compute/global_memory_pipeline.cc
+++ b/src/gpu-compute/global_memory_pipeline.cc
@@ -206,6 +206,20 @@ GlobalMemPipeline::exec()
                 std::make_pair(mp, false)));
         }
 
+        if (!mp->isMemSync() && !mp->isEndOfKernel() && mp->allLanesZero()) {
+            /**
+             * Memory access instructions that do not generate any memory
+             * requests (such as out-of-bounds buffer accesses where all
+             * lanes are out of bounds) will not trigger a callback to
+             * complete the request, so we need to mark it as completed as
+             * soon as it is issued. Note that this will still insert an
+             * entry in the ordered return FIFO such that waitcnt is still
+             * resolved correctly.
+             */
+            handleResponse(mp);
+            computeUnit->getTokenManager()->recvTokens(1);
+        }
+
         gmIssuedRequests.pop();
 
         DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping 0 mem_op = \n",
-- 
2.30.2
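
A note on the range-check semantics the op_encodings.hh hunk adds to
calcAddr: below is a minimal, self-contained C++ sketch of the two
out-of-bounds conditions. BufferRsrc and computeOobMask are hypothetical
stand-ins for gem5's resource-descriptor and operand types, not the real
classes; the comparisons themselves mirror the patch, including the
unsigned numRecords - sOffset arithmetic of condition 1.

#include <array>
#include <bitset>
#include <cstdint>
#include <cstdio>

constexpr int NumLanes = 64;

// Hypothetical, simplified stand-in for the buffer resource descriptor
// (V#) fields the range check reads.
struct BufferRsrc {
    uint64_t stride;      // bytes between records (const_stride)
    uint64_t numRecords;  // buffer extent (const_num_records)
    bool swizzleEn;       // swizzle enable bit
};

// Per-lane out-of-bounds check mirroring the patch: linear buffers
// (stride == 0 or swizzling disabled) use "condition 1" on the flat byte
// offset; swizzled, strided buffers use "condition 2", which checks the
// record index and intra-record offset independently. OOB lanes go into
// a mask and will issue no memory request.
std::bitset<NumLanes>
computeOobMask(const BufferRsrc &rsrc,
               const std::array<uint64_t, NumLanes> &bufOff,
               const std::array<uint64_t, NumLanes> &bufIdx,
               uint64_t sOffset)
{
    std::bitset<NumLanes> oob;
    for (int lane = 0; lane < NumLanes; ++lane) {
        if (rsrc.stride == 0 || !rsrc.swizzleEn) {
            // Condition 1: flat offset must stay below numRecords,
            // shrunk by the scalar offset (same expression as the patch).
            if (bufOff[lane] + rsrc.stride * bufIdx[lane] >=
                rsrc.numRecords - sOffset) {
                oob.set(lane);
            }
        } else if (bufIdx[lane] >= rsrc.numRecords ||
                   bufOff[lane] >= rsrc.stride) {
            // Condition 2: swizzled access whose record index exceeds
            // numRecords or whose offset exceeds the record stride.
            oob.set(lane);
        }
    }
    return oob;
}

int main()
{
    BufferRsrc rsrc{4, 16, false};  // unswizzled, stride 4, numRecords 16
    std::array<uint64_t, NumLanes> off{}, idx{};
    off[0] = 8;  idx[0] = 1;        // in bounds: 8 + 4*1 < 16
    off[1] = 20; idx[1] = 0;        // out of bounds: 20 >= 16
    auto oob = computeOobMask(rsrc, off, idx, 0);
    std::printf("lane0 oob=%d lane1 oob=%d\n",
                (int)oob.test(0), (int)oob.test(1));
    return 0;
}

The two conditions are complementary guards on the same descriptor, so
the if/else form above is equivalent to the patch's two sequential ifs
with continue.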
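The exec_mask handling in initMemRead/initMemWrite plus the zero-fill in
the completeAcc hunks form one save/mask/restore pattern. Here is a
hedged sketch under simplified types: VectorMask is modeled as a
std::bitset, and issueRequests, issueSuppressingOob, and writeBack are
invented names standing in for initMemReqHelper and the completeAcc
logic, not gem5's actual API.

#include <bitset>
#include <cstdint>
#include <cstdio>

constexpr int NumLanes = 64;
using VectorMask = std::bitset<NumLanes>;

// Stand-in for gem5's initMemReqHelper (an assumption, not the real
// signature): pretend to issue one request per set exec_mask bit.
static int issueRequests(const VectorMask &execMask)
{
    return static_cast<int>(execMask.count());
}

// Save/mask/restore: clear OOB lanes around the issue call so they
// generate no requests, then restore the architectural exec_mask for
// the rest of the pipeline.
static int issueSuppressingOob(VectorMask &execMask, const VectorMask &oobMask)
{
    VectorMask oldExecMask = execMask;  // remember architectural mask
    execMask &= ~oobMask;               // drop out-of-bounds lanes
    int issued = issueRequests(execMask);
    execMask = oldExecMask;             // restore afterwards
    return issued;
}

// On load completion, executing lanes that were out of bounds get zero
// written to their destination register, as in the completeAcc hunks.
static void writeBack(uint32_t *vdst, const uint32_t *loaded,
                      const VectorMask &execMask, const VectorMask &oobMask)
{
    for (int lane = 0; lane < NumLanes; ++lane) {
        if (execMask[lane]) {
            vdst[lane] = oobMask[lane] ? 0u : loaded[lane];
        }
    }
}

int main()
{
    VectorMask exec; exec.set();  // all 64 lanes executing
    VectorMask oob;  oob.set(1);  // lane 1 is out of bounds
    std::printf("issued %d requests\n", issueSuppressingOob(exec, oob));

    uint32_t vdst[NumLanes] = {}, data[NumLanes];
    for (int i = 0; i < NumLanes; ++i) data[i] = 0xabcd0000u + i;
    writeBack(vdst, data, exec, oob);
    std::printf("lane0 = %#x, lane1 = %#x\n",  // lane1 reads back as 0
                (unsigned)vdst[0], (unsigned)vdst[1]);
    return 0;
}

Keeping oobMask separate from exec_mask (rather than clearing exec_mask
permanently) is what lets completion still see the lane as executing and
write the architecturally required zero.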
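The global_memory_pipeline.cc hunk covers the corner case where every
lane of a memory instruction was suppressed, so zero requests were sent.
The toy model below, assuming a plain std::queue rather than gem5's
gmIssuedRequests and ordered response buffer, illustrates why such an
operation must be retired at issue: no response callback will ever
arrive for it, yet it occupies a slot the in-order (waitcnt-style) drain
logic has to retire.

#include <cstdio>
#include <queue>

// Toy in-order completion FIFO (not gem5's GlobalMemPipeline).
struct MemOp {
    int id;
    int outstandingLanes;  // 0 => every lane suppressed (e.g. all OOB)
};

int main()
{
    std::queue<MemOp> ordered;
    ordered.push({0, 4});
    ordered.push({1, 0});  // all lanes out of bounds: no packets sent
    ordered.push({2, 8});

    while (!ordered.empty()) {
        MemOp op = ordered.front();
        ordered.pop();
        if (op.outstandingLanes == 0) {
            // Nothing will call back; retire immediately so operations
            // behind it in the ordered FIFO are not blocked forever.
            std::printf("op %d retired at issue\n", op.id);
        } else {
            std::printf("op %d waits for %d lane responses\n",
                        op.id, op.outstandingLanes);
        }
    }
    return 0;
}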