arch-gcn3, gpu-compute: Implement out-of-range accesses
author     Michael LeBeane <Michael.Lebeane@amd.com>
Thu, 31 May 2018 17:03:38 +0000 (13:03 -0400)
committer  Anthony Gutierrez <anthony.gutierrez@amd.com>
Mon, 13 Jul 2020 19:48:00 +0000 (19:48 +0000)
Certain out-of-range buffer memory accesses should be special
cased and not generate memory requests. This patch implements
those special cases and suppresses lanes from accessing memory
when the calculated address falls in an ISA-specified out-of-range
condition.

Change-Id: I8298f861c6b59587789853a01e503ba7d98cb13d
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29935
Tested-by: kokoro <noreply+kokoro@google.com>
Reviewed-by: Anthony Gutierrez <anthony.gutierrez@amd.com>
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
Maintainer: Anthony Gutierrez <anthony.gutierrez@amd.com>

src/arch/gcn3/insts/instructions.cc
src/arch/gcn3/insts/op_encodings.hh
src/gpu-compute/global_memory_pipeline.cc

index b923eaefff413ed7804dcd5cafb1322e6aba7f25..2e39bf5c40d44cb2597d200c73ec05870d7729d8 100644 (file)
@@ -34453,8 +34453,12 @@ namespace Gcn3ISA
 
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (gpuDynInst->exec_mask[lane]) {
-                vdst[lane] = (VecElemU32)((reinterpret_cast<VecElemU8*>(
-                    gpuDynInst->d_data))[lane]);
+                if (!oobMask[lane]) {
+                    vdst[lane] = (VecElemU32)((reinterpret_cast<VecElemU8*>(
+                        gpuDynInst->d_data))[lane]);
+                } else {
+                    vdst[lane] = 0;
+                }
             }
         }
 
@@ -34580,8 +34584,12 @@ namespace Gcn3ISA
 
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (gpuDynInst->exec_mask[lane]) {
-                vdst[lane] = (VecElemU32)((reinterpret_cast<VecElemU16*>(
-                    gpuDynInst->d_data))[lane]);
+                if (!oobMask[lane]) {
+                    vdst[lane] = (VecElemU32)((reinterpret_cast<VecElemU16*>(
+                        gpuDynInst->d_data))[lane]);
+                } else {
+                    vdst[lane] = 0;
+                }
             }
         }
 
@@ -34707,8 +34715,12 @@ namespace Gcn3ISA
 
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (gpuDynInst->exec_mask[lane]) {
-                vdst[lane] = (reinterpret_cast<VecElemU32*>(
-                    gpuDynInst->d_data))[lane];
+                if (!oobMask[lane]) {
+                    vdst[lane] = (reinterpret_cast<VecElemU32*>(
+                        gpuDynInst->d_data))[lane];
+                } else {
+                    vdst[lane] = 0;
+                }
             }
         }
 
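The three hunks above apply the same writeback pattern to the byte, short,
and dword buffer-load variants. Below is a minimal standalone sketch of
that pattern; the type aliases, lane count, and function name are
simplified stand-ins for gem5's operand machinery, not its actual
declarations:

    #include <bitset>
    #include <cstdint>

    constexpr int NumVecElemPerVecReg = 64;
    using VecElemU32 = uint32_t;
    using VectorMask = std::bitset<NumVecElemPerVecReg>;

    // Write back a buffer load: active lanes whose address was flagged
    // out-of-bounds receive 0 instead of the data returned from memory.
    void
    completeBufferLoad(const VectorMask &execMask, const VectorMask &oobMask,
                       const VecElemU32 *dData, VecElemU32 *vdst)
    {
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (execMask[lane]) {
                vdst[lane] = oobMask[lane] ? 0 : dData[lane];
            }
        }
    }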
index 308560a5f7819dddab6a6da8a5fe7c8486fdf1b0..22c146a7831b7cd71f57d56cead95d00ae0c7ef9 100644 (file)
@@ -40,6 +40,7 @@
 #include "arch/gcn3/gpu_mem_helpers.hh"
 #include "arch/gcn3/insts/gpu_static_inst.hh"
 #include "arch/gcn3/operand.hh"
+#include "debug/GCN3.hh"
 #include "debug/GPUExec.hh"
 #include "mem/ruby/system/RubySystem.hh"
 
@@ -489,14 +490,26 @@ namespace Gcn3ISA
         void
         initMemRead(GPUDynInstPtr gpuDynInst)
         {
+            // Temporarily modify exec_mask to suppress memory accesses to
+            // oob regions. Only issue memory requests for lanes that have
+            // their exec_mask set and are not out of bounds.
+            VectorMask old_exec_mask = gpuDynInst->exec_mask;
+            gpuDynInst->exec_mask &= ~oobMask;
             initMemReqHelper<T, 1>(gpuDynInst, MemCmd::ReadReq);
+            gpuDynInst->exec_mask = old_exec_mask;
         }
 
         template<typename T>
         void
         initMemWrite(GPUDynInstPtr gpuDynInst)
         {
+            // Temporarily modify exec_mask to suppress memory accesses to
+            // oob regions. Only issue memory requests for lanes that have
+            // their exec_mask set and are not out of bounds.
+            VectorMask old_exec_mask = gpuDynInst->exec_mask;
+            gpuDynInst->exec_mask &= ~oobMask;
             initMemReqHelper<T, 1>(gpuDynInst, MemCmd::WriteReq);
+            gpuDynInst->exec_mask = old_exec_mask;
         }
 
         void
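Both helpers rely on the same save/mask/restore idiom. A hedged sketch of
that idiom in isolation (DynInstStub and issueWithOobSuppressed are
hypothetical names for illustration; the real code above calls
initMemReqHelper directly):

    #include <bitset>

    using VectorMask = std::bitset<64>;

    struct DynInstStub { VectorMask exec_mask; };

    // Clear out-of-bounds lanes from exec_mask only while the memory
    // request is issued, then restore the architectural mask so that
    // writeback still sees every originally active lane.
    template <typename IssueFn>
    void
    issueWithOobSuppressed(DynInstStub &inst, const VectorMask &oobMask,
                           IssueFn issue)
    {
        VectorMask oldExecMask = inst.exec_mask;
        inst.exec_mask &= ~oobMask;
        issue(inst); // e.g., initMemReqHelper<T, 1>(gpuDynInst, cmd)
        inst.exec_mask = oldExecMask;
    }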
@@ -566,6 +579,42 @@ namespace Gcn3ISA
 
                     buf_off = v_off[lane] + inst_offset;
 
+
+                    /**
+                     * Range-checking behavior causes out-of-range accesses
+                     * to be treated differently: out-of-range loads return
+                     * 0 and out-of-range stores are ignored. For
+                     * non-formatted accesses, this is done on a per-lane
+                     * basis.
+                     */
+                    if (rsrc_desc.stride == 0 || !rsrc_desc.swizzleEn) {
+                        if (buf_off + stride * buf_idx >=
+                            rsrc_desc.numRecords - s_offset.rawData()) {
+                            DPRINTF(GCN3, "mubuf out-of-bounds condition 1: "
+                                    "lane = %d, buffer_offset = %llx, "
+                                    "const_stride = %llx, "
+                                    "const_num_records = %llx\n",
+                                    lane, buf_off + stride * buf_idx,
+                                    rsrc_desc.stride, rsrc_desc.numRecords);
+                            oobMask.set(lane);
+                            continue;
+                        }
+                    }
+
+                    if (rsrc_desc.stride != 0 && rsrc_desc.swizzleEn) {
+                        if (buf_idx >= rsrc_desc.numRecords ||
+                            buf_off >= stride) {
+                            DPRINTF(GCN3, "mubuf out-of-bounds condition 2: "
+                                    "lane = %d, offset = %llx, "
+                                    "index = %llx, "
+                                    "const_num_records = %llx\n",
+                                    lane, buf_off, buf_idx,
+                                    rsrc_desc.numRecords);
+                            oobMask.set(lane);
+                            continue;
+                        }
+                    }
+
                     if (rsrc_desc.swizzleEn) {
                         Addr idx_stride = 8 << rsrc_desc.idxStride;
                         Addr elem_size = 2 << rsrc_desc.elemSize;
@@ -573,6 +622,12 @@ namespace Gcn3ISA
                         Addr idx_lsb = buf_idx % idx_stride;
                         Addr off_msb = buf_off / elem_size;
                         Addr off_lsb = buf_off % elem_size;
+                        DPRINTF(GCN3, "mubuf swizzled lane %d: "
+                                "idx_stride = %llx, elem_size = %llx, "
+                                "idx_msb = %llx, idx_lsb = %llx, "
+                                "off_msb = %llx, off_lsb = %llx\n",
+                                lane, idx_stride, elem_size, idx_msb, idx_lsb,
+                                off_msb, off_lsb);
 
                         vaddr += ((idx_msb * stride + off_msb * elem_size)
                             * idx_stride + idx_lsb * elem_size + off_lsb);
@@ -580,6 +635,11 @@ namespace Gcn3ISA
                         vaddr += buf_off + stride * buf_idx;
                     }
 
+                    DPRINTF(GCN3, "Calculating mubuf address for lane %d: "
+                            "vaddr = %llx, base_addr = %llx, "
+                            "stride = %llx, buf_idx = %llx, buf_off = %llx\n",
+                            lane, vaddr, base_addr, stride,
+                            buf_idx, buf_off);
                     gpuDynInst->addr.at(lane) = vaddr;
                 }
             }
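The two range checks in this hunk are complementary: the first covers
linear addressing (stride of zero or swizzling disabled), the second
covers swizzled addressing. A flattened sketch of the predicate follows,
under the assumption that numRecords counts bytes in the linear case and
records in the swizzled case, as the GCN3 buffer resource descriptor
defines it; the function name is illustrative only:

    #include <cstdint>

    // Returns true when a lane's buffer access falls in an ISA-specified
    // out-of-range condition and must be suppressed.
    bool
    bufferAccessOutOfBounds(uint64_t bufOff, uint64_t bufIdx,
                            uint64_t stride, uint64_t numRecords,
                            uint64_t sOffset, bool swizzleEn)
    {
        if (stride == 0 || !swizzleEn) {
            // Linear case: compare the computed byte offset against the
            // buffer extent, less the scalar offset already folded in
            // (mirroring the first check in the hunk above).
            return bufOff + stride * bufIdx >= numRecords - sOffset;
        }
        // Swizzled case: the record index and the intra-record offset
        // are checked independently.
        return bufIdx >= numRecords || bufOff >= stride;
    }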
@@ -589,6 +649,10 @@ namespace Gcn3ISA
         InFmt_MUBUF instData;
         // second instruction DWORD
         InFmt_MUBUF_1 extData;
+        // Mask of lanes with out-of-bounds accesses. Needs to be tracked
+        // separately from the exec_mask so that we remember to write zero
+        // to the registers associated with out-of-bounds lanes.
+        VectorMask oobMask;
     }; // Inst_MUBUF
 
     class Inst_MTBUF : public GCN3GPUStaticInst
index c73184a59b46c8b669eae8d74e56da86320eba6a..cfd7c3db1ced3af80c3618d2936cfc25b50ac140 100644 (file)
@@ -206,6 +206,20 @@ GlobalMemPipeline::exec()
                 std::make_pair(mp, false)));
         }
 
+        if (!mp->isMemSync() && !mp->isEndOfKernel() && mp->allLanesZero()) {
+            /**
+             * Memory access instructions that do not generate any memory
+             * requests (such as out-of-bounds buffer accesses where all
+             * lanes are out of bounds) will not trigger a callback to
+             * complete the request, so we need to mark it as completed as
+             * soon as it is issued. Note that this will still insert an
+             * entry in the ordered return FIFO so that waitcnt is still
+             * resolved correctly.
+             */
+            handleResponse(mp);
+            computeUnit->getTokenManager()->recvTokens(1);
+        }
+
         gmIssuedRequests.pop();
 
         DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping 0 mem_op = \n",
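One plausible reading of the mp->allLanesZero() condition after OOB
suppression is sketched below; the helper is an illustrative stand-in,
not gem5's implementation, which tracks a per-request status bit vector:

    #include <bitset>

    using VectorMask = std::bitset<64>;

    // An instruction issues zero memory requests when every lane is
    // either inactive or suppressed as out of bounds; such an
    // instruction will never receive a memory-port callback, hence the
    // immediate handleResponse() call in the hunk above.
    bool
    issuedNoRequests(const VectorMask &execMask, const VectorMask &oobMask)
    {
        return (execMask & ~oobMask).none();
    }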