i965: Allow 48-bit addressing on Gen8+.

[mesa.git] / src / mesa / drivers / dri / i965 / genX_state_upload.c
diff --git a/src/mesa/drivers/dri/i965/genX_state_upload.c b/src/mesa/drivers/dri/i965/genX_state_upload.c

index 498c3979ad5f2ea8e30fc9510b0adc3da58c010e..d43f0ed2adeb179a8edaf496a5e274447c4966af 100644 (file)
--- a/src/mesa/drivers/dri/i965/genX_state_upload.c
+++ b/src/mesa/drivers/dri/i965/genX_state_upload.c
@@ -101,7 +101,7 @@ __gen_combine_address(struct brw_context *brw, void *location,
     }
  }
  
-static struct brw_address
+UNUSED static struct brw_address
  rw_bo(struct brw_bo *bo, uint32_t offset)
  {
     return (struct brw_address) {
@@ -120,6 +120,26 @@ ro_bo(struct brw_bo *bo, uint32_t offset)
     };
  }
  
+static struct brw_address
+rw_32_bo(struct brw_bo *bo, uint32_t offset)
+{  /* Build a brw_address for `offset` within `bo`, flagged as below. */
+   return (struct brw_address) {
+            .bo = bo,
+            .offset = offset,
+            .reloc_flags = RELOC_WRITE | RELOC_32BIT, /* used for scratch space, which is kept in the bottom 32 bits of the address space */
+   };
+}
+
+static struct brw_address
+ro_32_bo(struct brw_bo *bo, uint32_t offset)
+{  /* Build a brw_address for `offset` within `bo`, flagged as below. */
+   return (struct brw_address) {
+            .bo = bo,
+            .offset = offset,
+            .reloc_flags = RELOC_32BIT, /* no RELOC_WRITE; used for vertex and index buffers, which are kept in the low 32 bits of the address space */
+   };
+}
+
  UNUSED static struct brw_address
  ggtt_bo(struct brw_bo *bo, uint32_t offset)
  {
@@ -317,7 +337,15 @@ genX(emit_vertex_buffer_state)(struct brw_context *brw,
     struct GENX(VERTEX_BUFFER_STATE) buf_state = {
        .VertexBufferIndex = buffer_nr,
        .BufferPitch = stride,
-      .BufferStartingAddress = ro_bo(bo, start_offset),
+
+      /* The VF cache designers apparently cut corners, and made the cache
+       * only consider the bottom 32 bits of memory addresses.  If you happen
+       * to have two vertex buffers which get placed exactly 4 GiB apart and
+       * use them in back-to-back draw calls, you can get collisions.  To work
+       * around this problem, we restrict vertex buffers to the low 32 bits of
+       * the address space.
+       */
+      .BufferStartingAddress = ro_32_bo(bo, start_offset),
  #if GEN_GEN >= 8
        .BufferSize = end_offset - start_offset,
  #endif
@@ -334,7 +362,9 @@ genX(emit_vertex_buffer_state)(struct brw_context *brw,
  #endif
  #endif
  
-#if GEN_GEN == 10
+#if GEN_GEN == 11
+      .VertexBufferMOCS = ICL_MOCS_WB,
+#elif GEN_GEN == 10
        .VertexBufferMOCS = CNL_MOCS_WB,
  #elif GEN_GEN == 9
        .VertexBufferMOCS = SKL_MOCS_WB,
@@ -856,7 +886,15 @@ genX(emit_index_buffer)(struct brw_context *brw)
        ib.CutIndexEnable = brw->prim_restart.enable_cut_index;
  #endif
        ib.IndexFormat = brw_get_index_type(index_buffer->index_size);
-      ib.BufferStartingAddress = ro_bo(brw->ib.bo, 0);
+
+      /* The VF cache designers apparently cut corners, and made the cache
+       * only consider the bottom 32 bits of memory addresses.  If you happen
+       * to have two index buffers which get placed exactly 4 GiB apart and
+       * use them in back-to-back draw calls, you can get collisions.  To work
+       * around this problem, we restrict index buffers to the low 32 bits of
+       * the address space.
+       */
+      ib.BufferStartingAddress = ro_32_bo(brw->ib.bo, 0);
  #if GEN_GEN >= 8
        ib.IndexBufferMOCS = GEN_GEN >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
        ib.BufferSize = brw->ib.size;
@@ -1893,7 +1931,7 @@ genX(upload_wm)(struct brw_context *brw)
  #endif
  
        if (wm_prog_data->base.total_scratch) {
-         wm.ScratchSpaceBasePointer = rw_bo(stage_state->scratch_bo, 0);
+         wm.ScratchSpaceBasePointer = rw_32_bo(stage_state->scratch_bo, 0);
           wm.PerThreadScratchSpace =
              ffs(stage_state->per_thread_scratch) - 11;
        }
@@ -2012,6 +2050,14 @@ static const struct brw_tracked_state genX(wm_state) = {
  
  /* ---------------------------------------------------------------------- */
  
+/* We restrict scratch buffers to the bottom 32 bits of the address space
+ * by using rw_32_bo().
+ *
+ * General State Base Address is a bit broken.  If the address + size as
+ * seen by STATE_BASE_ADDRESS overflows 48 bits, the GPU appears to treat
+ * all accesses to the buffer as being out of bounds and returns zero.
+ */
+
  #define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix) \
     pkt.KernelStartPointer = KSP(brw, stage_state->prog_offset);           \
     pkt.SamplerCount       =                                               \
@@ -2021,7 +2067,7 @@ static const struct brw_tracked_state genX(wm_state) = {
     pkt.FloatingPointMode  = stage_prog_data->use_alt_mode;                \
                                                                            \
     if (stage_prog_data->total_scratch) {                                  \
-      pkt.ScratchSpaceBasePointer = rw_bo(stage_state->scratch_bo, 0);    \
+      pkt.ScratchSpaceBasePointer = rw_32_bo(stage_state->scratch_bo, 0); \
        pkt.PerThreadScratchSpace =                                         \
           ffs(stage_state->per_thread_scratch) - 11;                       \
     }                                                                      \
@@ -2048,6 +2094,8 @@ genX(upload_vs_state)(struct brw_context *brw)
  
     assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8 ||
            vue_prog_data->dispatch_mode == DISPATCH_MODE_4X2_DUAL_OBJECT);
+   assert(GEN_GEN < 11 ||
+          vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8);
  
  #if GEN_GEN == 6
     /* From the BSpec, 3D Pipeline > Geometry > Vertex Shader > State,
@@ -2465,24 +2513,28 @@ genX(upload_sf_clip_viewport)(struct brw_context *brw)
  #elif GEN_GEN >= 8
        /* _NEW_VIEWPORT | _NEW_BUFFERS: Screen Space Viewport
         * The hardware will take the intersection of the drawing rectangle,
-       * scissor rectangle, and the viewport extents. We don't need to be
-       * smart, and can therefore just program the viewport extents.
+       * scissor rectangle, and the viewport extents.  However, emitting
+       * 3DSTATE_DRAWING_RECTANGLE is expensive, since it requires a full
+       * pipeline stall, so we instead clamp the viewport to the framebuffer
+       * here and emit the drawing rectangle once at context creation time.
         */
+      const float viewport_Xmin = MAX2(ctx->ViewportArray[i].X, 0);
+      const float viewport_Ymin = MAX2(ctx->ViewportArray[i].Y, 0);
        const float viewport_Xmax =
-         ctx->ViewportArray[i].X + ctx->ViewportArray[i].Width;
+         MIN2(ctx->ViewportArray[i].X + ctx->ViewportArray[i].Width, fb_width);
        const float viewport_Ymax =
-         ctx->ViewportArray[i].Y + ctx->ViewportArray[i].Height;
+         MIN2(ctx->ViewportArray[i].Y + ctx->ViewportArray[i].Height, fb_height);
  
        if (render_to_fbo) {
-         sfv.XMinViewPort = ctx->ViewportArray[i].X;
+         sfv.XMinViewPort = viewport_Xmin;
           sfv.XMaxViewPort = viewport_Xmax - 1;
-         sfv.YMinViewPort = ctx->ViewportArray[i].Y;
+         sfv.YMinViewPort = viewport_Ymin;
           sfv.YMaxViewPort = viewport_Ymax - 1;
        } else {
-         sfv.XMinViewPort = ctx->ViewportArray[i].X;
+         sfv.XMinViewPort = viewport_Xmin;
           sfv.XMaxViewPort = viewport_Xmax - 1;
           sfv.YMinViewPort = fb_height - viewport_Ymax;
-         sfv.YMaxViewPort = fb_height - ctx->ViewportArray[i].Y - 1;
+         sfv.YMaxViewPort = fb_height - viewport_Ymin - 1;
        }
  #endif
  
@@ -3456,10 +3508,8 @@ genX(upload_sbe)(struct brw_context *brw)
  
  #if GEN_GEN >= 9
        /* prepare the active component dwords */
-      const int num_inputs = urb_entry_read_length * 2;
-      for (int input_index = 0; input_index < num_inputs; input_index++) {
-         sbe.AttributeActiveComponentFormat[input_index] = ACTIVE_COMPONENT_XYZW;
-      }
+      for (int i = 0; i < 32; i++)
+         sbe.AttributeActiveComponentFormat[i] = ACTIVE_COMPONENT_XYZW;
  #endif
     }
  
@@ -3815,11 +3865,12 @@ genX(upload_ps)(struct brw_context *brw)
        ps.SampleMask = genX(determine_sample_mask(brw));
  #endif
  
-      /* 3DSTATE_PS expects the number of threads per PSD, which is always 64;
-       * it implicitly scales for different GT levels (which have some # of
-       * PSDs).
+      /* 3DSTATE_PS expects the number of threads per PSD, which is always 64
+       * before Gen11 and 128 on Gen11+.  On Gen11+, a programmed value of k
+       * implies 2(k+1) threads.  The value implicitly scales for different
+       * GT levels (which have some # of PSDs).
         *
-       * In Gen8 the format is U8-2 whereas in Gen9 it is U8-1.
+       * In Gen8 the format is U8-2 whereas in Gen9+ it is U9-1.
         */
  #if GEN_GEN >= 9
        ps.MaximumNumberofThreadsPerPSD = 64 - 1;
@@ -3887,8 +3938,8 @@ genX(upload_ps)(struct brw_context *brw)
  
        if (prog_data->base.total_scratch) {
           ps.ScratchSpaceBasePointer =
-            rw_bo(stage_state->scratch_bo,
-                  ffs(stage_state->per_thread_scratch) - 11);
+            rw_32_bo(stage_state->scratch_bo,
+                     ffs(stage_state->per_thread_scratch) - 11);
        }
     }
  }
@@ -3964,6 +4015,9 @@ genX(upload_ds_state)(struct brw_context *brw)
     if (!tes_prog_data) {
        brw_batch_emit(brw, GENX(3DSTATE_DS), ds);
     } else {
+      assert(GEN_GEN < 11 ||
+             vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8);
+
        brw_batch_emit(brw, GENX(3DSTATE_DS), ds) {
           INIT_THREAD_DISPATCH_FIELDS(ds, Patch);
  
@@ -4206,7 +4260,7 @@ genX(upload_cs_state)(struct brw_context *brw)
               */
              per_thread_scratch_value = stage_state->per_thread_scratch / 1024 - 1;
           }
-         vfe.ScratchSpaceBasePointer = rw_bo(stage_state->scratch_bo, 0);
+         vfe.ScratchSpaceBasePointer = rw_32_bo(stage_state->scratch_bo, 0);
           vfe.PerThreadScratchSpace = per_thread_scratch_value;
        }
  
@@ -4220,8 +4274,10 @@ genX(upload_cs_state)(struct brw_context *brw)
        const uint32_t subslices = MAX2(brw->screen->subslice_total, 1);
        vfe.MaximumNumberofThreads = devinfo->max_cs_threads * subslices - 1;
        vfe.NumberofURBEntries = GEN_GEN >= 8 ? 2 : 0;
+#if GEN_GEN < 11
        vfe.ResetGatewayTimer =
           Resettingrelativetimerandlatchingtheglobaltimestamp;
+#endif
  #if GEN_GEN < 9
        vfe.BypassGatewayControl = BypassingOpenGatewayCloseGatewayprotocol;
  #endif