i965: Allow 48-bit addressing on Gen8+.

author Kenneth Graunke <kenneth@whitecape.org>

Mon, 12 Feb 2018 15:18:29 +0000 (07:18 -0800)

committer Kenneth Graunke <kenneth@whitecape.org>

Thu, 1 Mar 2018 23:46:11 +0000 (15:46 -0800)
author Kenneth Graunke <kenneth@whitecape.org>
Mon, 12 Feb 2018 15:18:29 +0000 (07:18 -0800)
committer Kenneth Graunke <kenneth@whitecape.org>
Thu, 1 Mar 2018 23:46:11 +0000 (15:46 -0800)
diff --git a/src/mesa/drivers/dri/i965/brw_bufmgr.c b/src/mesa/drivers/dri/i965/brw_bufmgr.c

index fb180289a0c1d4f4788e6386f93b9756a8d3bb25..2e54adb3ed260dd42b614102f29cefd5c041a57a 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_bufmgr.c
+++ b/src/mesa/drivers/dri/i965/brw_bufmgr.c
@@ -119,6 +119,7 @@ struct brw_bufmgr {
     bool has_llc:1;
     bool has_mmap_wc:1;
     bool bo_reuse:1;
+   bool supports_48b_addresses:1;
  };
  
  static int bo_set_tiling_internal(struct brw_bo *bo, uint32_t tiling_mode,
@@ -409,6 +410,8 @@ retry:
     bo->reusable = true;
     bo->cache_coherent = bufmgr->has_llc;
     bo->index = -1;
+   if (bufmgr->supports_48b_addresses)
+      bo->kflags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
  
     mtx_unlock(&bufmgr->lock);
  
@@ -1385,6 +1388,24 @@ gem_param(int fd, int name)
     return v;
  }
  
+static bool
+gem_supports_48b_addresses(int fd)
+{
+   struct drm_i915_gem_exec_object2 obj = {
+      .flags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS,
+   };
+   /* Probe execbuf: its single object carries the 48-bit address flag. */
+   struct drm_i915_gem_execbuffer2 execbuf = {
+      .buffers_ptr = (uintptr_t)&obj,
+      .buffer_count = 1,
+      .rsvd1 = 0xffffffu, /* presumably an invalid context ID -- confirm */
+   };
+   /* NOTE(review): presumably an unknown flag fails earlier with another errno (confirm). */
+   int ret = drmIoctl(fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf);
+   /* True only on failure with ENOENT; success or any other errno => false. */
+   return ret == -1 && errno == ENOENT;
+}
+
  /**
   * Initializes the GEM buffer manager, which uses the kernel to allocate, map,
   * and manage map buffer objections.
@@ -1418,6 +1439,8 @@ brw_bufmgr_init(struct gen_device_info *devinfo, int fd)
  
     bufmgr->has_llc = devinfo->has_llc;
     bufmgr->has_mmap_wc = gem_param(fd, I915_PARAM_MMAP_VERSION) > 0;
+   bufmgr->supports_48b_addresses =
+      devinfo->gen >= 8 && gem_supports_48b_addresses(fd);
  
     init_cache_buckets(bufmgr);
  
diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/dri/i965/brw_misc_state.c

index c4ef6812bff9b7b7fded0414863b747340f53916..29d74876c276eba33756fe3094ca71793a56bad3 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_misc_state.c
+++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
@@ -634,6 +634,12 @@ brw_upload_state_base_address(struct brw_context *brw)
     }
  
     if (devinfo->gen >= 8) {
+      /* STATE_BASE_ADDRESS has issues with 48-bit address spaces.  If the
+       * address + size as seen by STATE_BASE_ADDRESS overflows 48 bits,
+       * the GPU appears to treat all accesses to the buffer as being out
+       * of bounds and returns zero.  To work around this, we pin all SBAs
+       * to the bottom 4GB.
+       */
        uint32_t mocs_wb = devinfo->gen >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
        int pkt_len = devinfo->gen >= 9 ? 19 : 16;
  
@@ -644,15 +650,14 @@ brw_upload_state_base_address(struct brw_context *brw)
        OUT_BATCH(0);
        OUT_BATCH(mocs_wb << 16);
        /* Surface state base address: */
-      OUT_RELOC64(brw->batch.state.bo, 0, mocs_wb << 4 | 1);
+      OUT_RELOC64(brw->batch.state.bo, RELOC_32BIT, mocs_wb << 4 | 1);
        /* Dynamic state base address: */
-      OUT_RELOC64(brw->batch.state.bo, 0, mocs_wb << 4 | 1);
+      OUT_RELOC64(brw->batch.state.bo, RELOC_32BIT, mocs_wb << 4 | 1);
        /* Indirect object base address: MEDIA_OBJECT data */
        OUT_BATCH(mocs_wb << 4 | 1);
        OUT_BATCH(0);
        /* Instruction base address: shader kernels (incl. SIP) */
-      OUT_RELOC64(brw->cache.bo, 0, mocs_wb << 4 | 1);
-
+      OUT_RELOC64(brw->cache.bo, RELOC_32BIT, mocs_wb << 4 | 1);
        /* General state buffer size */
        OUT_BATCH(0xfffff001);
        /* Dynamic state buffer size */
diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c

index 23bf5a266c538f89573fd78393a464420f0a10ed..af2faea3bfe20aeb10ec03550ae82125463fe75c 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
@@ -205,12 +205,23 @@ brw_emit_surface_state(struct brw_context *brw,
         * FIXME: move to the point of assignment.
         */
        assert((aux_offset & 0xfff) == 0);
-      uint32_t *aux_addr = state + brw->isl_dev.ss.aux_addr_offset;
-      *aux_addr = brw_state_reloc(&brw->batch,
-                                  *surf_offset +
-                                  brw->isl_dev.ss.aux_addr_offset,
-                                  aux_bo, *aux_addr,
-                                  reloc_flags);
+
+      if (devinfo->gen >= 8) {
+         uint64_t *aux_addr = state + brw->isl_dev.ss.aux_addr_offset;
+         *aux_addr = brw_state_reloc(&brw->batch,
+                                     *surf_offset +
+                                     brw->isl_dev.ss.aux_addr_offset,
+                                     aux_bo, *aux_addr,
+                                     reloc_flags);
+      } else {
+         uint32_t *aux_addr = state + brw->isl_dev.ss.aux_addr_offset;
+         *aux_addr = brw_state_reloc(&brw->batch,
+                                     *surf_offset +
+                                     brw->isl_dev.ss.aux_addr_offset,
+                                     aux_bo, *aux_addr,
+                                     reloc_flags);
+
+      }
     }
  }
  
diff --git a/src/mesa/drivers/dri/i965/genX_blorp_exec.c b/src/mesa/drivers/dri/i965/genX_blorp_exec.c

index 062171af600312ea83bc49dea1f24b330b98ab74..2bdd93e9bdd8e3c2aeedc99d12cd73f5705f15ee 100644 (file)
--- a/src/mesa/drivers/dri/i965/genX_blorp_exec.c
+++ b/src/mesa/drivers/dri/i965/genX_blorp_exec.c
@@ -166,6 +166,15 @@ blorp_alloc_vertex_buffer(struct blorp_batch *batch, uint32_t size,
        .buffer = brw->batch.state.bo,
        .offset = offset,
  
+      /* The VF cache designers apparently cut corners, and made the cache
+       * only consider the bottom 32 bits of memory addresses.  If you happen
+       * to have two vertex buffers which get placed exactly 4 GiB apart and
+       * use them in back-to-back draw calls, you can get collisions.  To work
+       * around this problem, we restrict vertex buffers to the low 32 bits of
+       * the address space.
+       */
+      .reloc_flags = RELOC_32BIT,
+
  #if GEN_GEN == 10
        .mocs = CNL_MOCS_WB,
  #elif GEN_GEN == 9
diff --git a/src/mesa/drivers/dri/i965/genX_state_upload.c b/src/mesa/drivers/dri/i965/genX_state_upload.c

index e3d4b5e067c80522fe35b4bf249b939cdfb0b688..d43f0ed2adeb179a8edaf496a5e274447c4966af 100644 (file)
--- a/src/mesa/drivers/dri/i965/genX_state_upload.c
+++ b/src/mesa/drivers/dri/i965/genX_state_upload.c
@@ -101,7 +101,7 @@ __gen_combine_address(struct brw_context *brw, void *location,
     }
  }
  
-static struct brw_address
+UNUSED static struct brw_address
  rw_bo(struct brw_bo *bo, uint32_t offset)
  {
     return (struct brw_address) {
@@ -120,6 +120,26 @@ ro_bo(struct brw_bo *bo, uint32_t offset)
     };
  }
  
+static struct brw_address
+rw_32_bo(struct brw_bo *bo, uint32_t offset)
+{
+   return (struct brw_address) { /* brw_address for (bo, offset) */
+            .bo = bo,
+            .offset = offset,
+            .reloc_flags = RELOC_WRITE | RELOC_32BIT, /* writable, low 4GB only */
+   };
+}
+
+static struct brw_address
+ro_32_bo(struct brw_bo *bo, uint32_t offset)
+{
+   return (struct brw_address) { /* brw_address for (bo, offset) */
+            .bo = bo,
+            .offset = offset,
+            .reloc_flags = RELOC_32BIT, /* RELOC_WRITE omitted; low 4GB only */
+   };
+}
+
  UNUSED static struct brw_address
  ggtt_bo(struct brw_bo *bo, uint32_t offset)
  {
@@ -317,7 +337,15 @@ genX(emit_vertex_buffer_state)(struct brw_context *brw,
     struct GENX(VERTEX_BUFFER_STATE) buf_state = {
        .VertexBufferIndex = buffer_nr,
        .BufferPitch = stride,
-      .BufferStartingAddress = ro_bo(bo, start_offset),
+
+      /* The VF cache designers apparently cut corners, and made the cache
+       * only consider the bottom 32 bits of memory addresses.  If you happen
+       * to have two vertex buffers which get placed exactly 4 GiB apart and
+       * use them in back-to-back draw calls, you can get collisions.  To work
+       * around this problem, we restrict vertex buffers to the low 32 bits of
+       * the address space.
+       */
+      .BufferStartingAddress = ro_32_bo(bo, start_offset),
  #if GEN_GEN >= 8
        .BufferSize = end_offset - start_offset,
  #endif
@@ -858,7 +886,15 @@ genX(emit_index_buffer)(struct brw_context *brw)
        ib.CutIndexEnable = brw->prim_restart.enable_cut_index;
  #endif
        ib.IndexFormat = brw_get_index_type(index_buffer->index_size);
-      ib.BufferStartingAddress = ro_bo(brw->ib.bo, 0);
+
+      /* The VF cache designers apparently cut corners, and made the cache
+       * only consider the bottom 32 bits of memory addresses.  If you happen
+       * to have two index buffers which get placed exactly 4 GiB apart and
+       * use them in back-to-back draw calls, you can get collisions.  To work
+       * around this problem, we restrict index buffers to the low 32 bits of
+       * the address space.
+       */
+      ib.BufferStartingAddress = ro_32_bo(brw->ib.bo, 0);
  #if GEN_GEN >= 8
        ib.IndexBufferMOCS = GEN_GEN >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
        ib.BufferSize = brw->ib.size;
@@ -1895,7 +1931,7 @@ genX(upload_wm)(struct brw_context *brw)
  #endif
  
        if (wm_prog_data->base.total_scratch) {
-         wm.ScratchSpaceBasePointer = rw_bo(stage_state->scratch_bo, 0);
+         wm.ScratchSpaceBasePointer = rw_32_bo(stage_state->scratch_bo, 0);
           wm.PerThreadScratchSpace =
              ffs(stage_state->per_thread_scratch) - 11;
        }
@@ -2014,6 +2050,14 @@ static const struct brw_tracked_state genX(wm_state) = {
  
  /* ---------------------------------------------------------------------- */
  
+/* We restrict scratch buffers to the bottom 32 bits of the address space
+ * by using rw_32_bo().
+ *
+ * General State Base Address is a bit broken.  If the address + size as
+ * seen by STATE_BASE_ADDRESS overflows 48 bits, the GPU appears to treat
+ * all accesses to the buffer as being out of bounds and returns zero.
+ */
+
  #define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix) \
     pkt.KernelStartPointer = KSP(brw, stage_state->prog_offset);           \
     pkt.SamplerCount       =                                               \
@@ -2023,7 +2067,7 @@ static const struct brw_tracked_state genX(wm_state) = {
     pkt.FloatingPointMode  = stage_prog_data->use_alt_mode;                \
                                                                            \
     if (stage_prog_data->total_scratch) {                                  \
-      pkt.ScratchSpaceBasePointer = rw_bo(stage_state->scratch_bo, 0);    \
+      pkt.ScratchSpaceBasePointer = rw_32_bo(stage_state->scratch_bo, 0); \
        pkt.PerThreadScratchSpace =                                         \
           ffs(stage_state->per_thread_scratch) - 11;                       \
     }                                                                      \
@@ -3894,8 +3938,8 @@ genX(upload_ps)(struct brw_context *brw)
  
        if (prog_data->base.total_scratch) {
           ps.ScratchSpaceBasePointer =
-            rw_bo(stage_state->scratch_bo,
-                  ffs(stage_state->per_thread_scratch) - 11);
+            rw_32_bo(stage_state->scratch_bo,
+                     ffs(stage_state->per_thread_scratch) - 11);
        }
     }
  }
@@ -4216,7 +4260,7 @@ genX(upload_cs_state)(struct brw_context *brw)
               */
              per_thread_scratch_value = stage_state->per_thread_scratch / 1024 - 1;
           }
-         vfe.ScratchSpaceBasePointer = rw_bo(stage_state->scratch_bo, 0);
+         vfe.ScratchSpaceBasePointer = rw_32_bo(stage_state->scratch_bo, 0);
           vfe.PerThreadScratchSpace = per_thread_scratch_value;
        }
  
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c

index c16f7c4d74be72919e868d5e5fe862034730c776..aabe21b7c8ed077ef83e3e32d78aebed9d2203a3 100644 (file)
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -1095,6 +1095,21 @@ emit_reloc(struct intel_batchbuffer *batch,
     unsigned int index = add_exec_bo(batch, target);
     struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[index];
  
+   if (reloc_flags & RELOC_32BIT) {
+      /* Restrict this buffer to the low 32 bits of the address space.
+       *
+       * Altering the validation list flags restricts it for this batch,
+       * but we also alter the BO's kflags to restrict it permanently
+       * (until the BO is destroyed and put back in the cache).  Buffers
+       * may stay bound across batches, and we want to keep it constrained.
+       */
+      target->kflags &= ~EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
+      entry->flags &= ~EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
+
+      /* RELOC_32BIT is not an EXEC_OBJECT_* flag, so get rid of it. */
+      reloc_flags &= ~RELOC_32BIT;
+   }
+
     if (reloc_flags)
        entry->flags |= reloc_flags & batch->valid_reloc_flags;
  
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.h b/src/mesa/drivers/dri/i965/intel_batchbuffer.h

index a9a34600ad1256f397faf265ff2e70331e8042f2..7be5b10f3ab05b09c6e26391fa5bc8be2fb956ba 100644 (file)
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.h
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
@@ -53,6 +53,8 @@ bool brw_batch_references(struct intel_batchbuffer *batch, struct brw_bo *bo);
  
  #define RELOC_WRITE EXEC_OBJECT_WRITE
  #define RELOC_NEEDS_GGTT EXEC_OBJECT_NEEDS_GTT
+/* Inverted meaning, but using the same bit...emit_reloc will flip it. */
+#define RELOC_32BIT EXEC_OBJECT_SUPPORTS_48B_ADDRESS
  uint64_t brw_batch_reloc(struct intel_batchbuffer *batch,
                           uint32_t batch_offset,
                           struct brw_bo *target,
author	Kenneth Graunke <kenneth@whitecape.org>
	Mon, 12 Feb 2018 15:18:29 +0000 (07:18 -0800)
committer	Kenneth Graunke <kenneth@whitecape.org>
	Thu, 1 Mar 2018 23:46:11 +0000 (15:46 -0800)
src/mesa/drivers/dri/i965/brw_bufmgr.c		patch \| blob \| history
src/mesa/drivers/dri/i965/brw_misc_state.c		patch \| blob \| history
src/mesa/drivers/dri/i965/brw_wm_surface_state.c		patch \| blob \| history
src/mesa/drivers/dri/i965/genX_blorp_exec.c		patch \| blob \| history
src/mesa/drivers/dri/i965/genX_state_upload.c		patch \| blob \| history
src/mesa/drivers/dri/i965/intel_batchbuffer.c		patch \| blob \| history
src/mesa/drivers/dri/i965/intel_batchbuffer.h		patch \| blob \| history