i965: Fix cross-primitive scratch corruption when changing the per-thread allocation.
authorFrancisco Jerez <currojerez@riseup.net>
Fri, 10 Jun 2016 23:41:59 +0000 (16:41 -0700)
committerFrancisco Jerez <currojerez@riseup.net>
Mon, 13 Jun 2016 22:55:58 +0000 (15:55 -0700)
I haven't found any mention of this in the hardware docs, but
experimentally what seems to be going on is that when the per-thread
scratch slot size is changed between two pipelined draw calls, shader
invocations using the old and new scratch size setting may end up
being executed in parallel, causing their scratch offset calculations
to be based in a different partitioning of the scratch space, which
can cause their thread-local scratch space to overlap leading to
cross-thread scratch corruption.

I've been experimenting with alternative workarounds, like emitting a
PIPE_CONTROL with DC flush and CS stall between draw (or dispatch
compute) calls using different per-thread scratch allocation settings,
or avoiding reuse of the scratch BO if the per-thread scratch
allocation doesn't exactly match the original.  Both seem to be as
effective as this workaround, but they have potential performance
implications, while this should be basically for free.

Fixes over 40 failures in our CI system with spilling forced on
(including CTS, dEQP and Piglit failures) on a number of different
platforms from Gen4 to Gen9.  The 'glsl-max-varyings' piglit test
seems to be able to reproduce this bug consistently in the vertex
shader on at least Gen4, Gen8 and Gen9 with spilling forced on.

Cc: <mesa-stable@lists.freedesktop.org>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
17 files changed:
src/mesa/drivers/dri/i965/brw_context.h
src/mesa/drivers/dri/i965/brw_vs_state.c
src/mesa/drivers/dri/i965/brw_wm_state.c
src/mesa/drivers/dri/i965/gen6_gs_state.c
src/mesa/drivers/dri/i965/gen6_vs_state.c
src/mesa/drivers/dri/i965/gen6_wm_state.c
src/mesa/drivers/dri/i965/gen7_cs_state.c
src/mesa/drivers/dri/i965/gen7_ds_state.c
src/mesa/drivers/dri/i965/gen7_gs_state.c
src/mesa/drivers/dri/i965/gen7_hs_state.c
src/mesa/drivers/dri/i965/gen7_vs_state.c
src/mesa/drivers/dri/i965/gen7_wm_state.c
src/mesa/drivers/dri/i965/gen8_ds_state.c
src/mesa/drivers/dri/i965/gen8_gs_state.c
src/mesa/drivers/dri/i965/gen8_hs_state.c
src/mesa/drivers/dri/i965/gen8_ps_state.c
src/mesa/drivers/dri/i965/gen8_vs_state.c

index 9618b4a81827f5303b649550fffa302de96ec1af..6e84506e1841d718c885cf0a193dbf0caaa66ae3 100644 (file)
@@ -674,6 +674,19 @@ struct brw_stage_state
    /**
     * Optional scratch buffer used to store spilled register values and
     * variably-indexed GRF arrays.
+    *
+    * The contents of this buffer are short-lived so the same memory can be
+    * re-used at will for multiple shader programs (executed by the same fixed
+    * function).  However reusing a scratch BO for which shader invocations
+    * are still in flight with a per-thread scratch slot size other than the
+    * original can cause threads with different scratch slot size and FFTID
+    * (which may be executed in parallel depending on the shader stage and
+    * hardware generation) to map to an overlapping region of the scratch
+    * space, which can potentially lead to mutual scratch space corruption.
+    * For that reason if you borrow this scratch buffer you should only be
+    * using the slot size given by the \c per_thread_scratch member below,
+    * unless you're taking additional measures to synchronize thread execution
+    * across slot size changes.
     */
    drm_intel_bo *scratch_bo;
 
index c728f09cac7a8a23cb27d39c7dbc7c0ff4fe75d8..331949aa1f131660fd1cb08a46d055c378b33c00 100644 (file)
@@ -83,7 +83,7 @@ brw_upload_vs_unit(struct brw_context *brw)
       vs->thread2.scratch_space_base_pointer =
         stage_state->scratch_bo->offset64 >> 10; /* reloc */
       vs->thread2.per_thread_scratch_space =
-        ffs(brw->vs.prog_data->base.base.total_scratch) - 11;
+        ffs(stage_state->per_thread_scratch) - 11;
    } else {
       vs->thread2.scratch_space_base_pointer = 0;
       vs->thread2.per_thread_scratch_space = 0;
index bf1bdc9948f67282c52f29b6e502e0912d7386c2..dda4f23e5541b4ef0cc007325ffc09cd0a2d387f 100644 (file)
@@ -133,7 +133,7 @@ brw_upload_wm_unit(struct brw_context *brw)
       wm->thread2.scratch_space_base_pointer =
         brw->wm.base.scratch_bo->offset64 >> 10; /* reloc */
       wm->thread2.per_thread_scratch_space =
-        ffs(prog_data->base.total_scratch) - 11;
+        ffs(brw->wm.base.per_thread_scratch) - 11;
    } else {
       wm->thread2.scratch_space_base_pointer = 0;
       wm->thread2.per_thread_scratch_space = 0;
index 4e4b9463464875c0b8fa36e565f96bf392226438..da7322ebe9e6da162e7ea240728cac3083cb11e9 100644 (file)
@@ -140,7 +140,7 @@ upload_gs_state(struct brw_context *brw)
       if (prog_data->base.total_scratch) {
          OUT_RELOC(stage_state->scratch_bo,
                    I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                   ffs(prog_data->base.total_scratch) - 11);
+                   ffs(stage_state->per_thread_scratch) - 11);
       } else {
          OUT_BATCH(0); /* no scratch space */
       }
index 3ae00ec29c0954b7fd905aa66c29cbe1bed01077..0ad74c4fd3801ac8285b9cdd6d4ee23a1df6dcac 100644 (file)
@@ -126,7 +126,7 @@ upload_vs_state(struct brw_context *brw)
    if (brw->vs.prog_data->base.base.total_scratch) {
       OUT_RELOC(stage_state->scratch_bo,
                I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-               ffs(brw->vs.prog_data->base.base.total_scratch) - 11);
+               ffs(stage_state->per_thread_scratch) - 11);
    } else {
       OUT_BATCH(0);
    }
index 3e872af4894584b143d87a45c5d995ff39bf4958..34aa1218b9674ca303253a34a4291cee2a69e306 100644 (file)
@@ -220,7 +220,7 @@ gen6_upload_wm_state(struct brw_context *brw,
    if (prog_data->base.total_scratch) {
       OUT_RELOC(stage_state->scratch_bo,
                 I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-               ffs(prog_data->base.total_scratch) - 11);
+               ffs(stage_state->per_thread_scratch) - 11);
    } else {
       OUT_BATCH(0);
    }
index ff308e6f79064c231aad89dfe5e6696e1e0d3847..5427fa5af195c16405dcad3d23480e46d03984f8 100644 (file)
@@ -70,21 +70,21 @@ brw_upload_cs_state(struct brw_context *brw)
           */
          OUT_RELOC64(stage_state->scratch_bo,
                      I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                     ffs(prog_data->total_scratch) - 11);
+                     ffs(stage_state->per_thread_scratch) - 11);
       } else if (brw->is_haswell) {
          /* Haswell's Per Thread Scratch Space is in the range [0, 10]
           * where 0 = 2k, 1 = 4k, 2 = 8k, ..., 10 = 2M.
           */
          OUT_RELOC(stage_state->scratch_bo,
                    I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                   ffs(prog_data->total_scratch) - 12);
+                   ffs(stage_state->per_thread_scratch) - 12);
       } else {
          /* Earlier platforms use the range [0, 11] to mean [1kB, 12kB]
           * where 0 = 1kB, 1 = 2kB, 2 = 3kB, ..., 11 = 12kB.
           */
          OUT_RELOC(stage_state->scratch_bo,
                    I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                   prog_data->total_scratch / 1024 - 1);
+                   stage_state->per_thread_scratch / 1024 - 1);
       }
    } else {
       OUT_BATCH(0);
index 2fe0d88f11913114ff94ed60fe7cdbfa1ea88a3b..3e1a03b230d34b5f803dbabda705b254e433ed51 100644 (file)
@@ -82,7 +82,7 @@ gen7_upload_ds_state(struct brw_context *brw)
       if (prog_data->total_scratch) {
          OUT_RELOC(stage_state->scratch_bo,
                    I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                   ffs(prog_data->total_scratch) - 11);
+                   ffs(stage_state->per_thread_scratch) - 11);
       } else {
          OUT_BATCH(0);
       }
index 6b126fe8fdb3990a8218cf26b0dee2a5de18d27c..7d733933e2b0035f80e6743718e148214404b393 100644 (file)
@@ -64,7 +64,7 @@ upload_gs_state(struct brw_context *brw)
       if (brw->gs.prog_data->base.base.total_scratch) {
          OUT_RELOC(stage_state->scratch_bo,
                    I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                   ffs(brw->gs.prog_data->base.base.total_scratch) - 11);
+                   ffs(stage_state->per_thread_scratch) - 11);
       } else {
          OUT_BATCH(0);
       }
index 4f948dc22e462affd68ff01660470262ccefdde5..297d61b36f08eb16770c4c19af75580497b36a49 100644 (file)
@@ -83,7 +83,7 @@ gen7_upload_hs_state(struct brw_context *brw)
       if (prog_data->base.total_scratch) {
          OUT_RELOC(stage_state->scratch_bo,
                    I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                   ffs(prog_data->base.total_scratch) - 11);
+                   ffs(stage_state->per_thread_scratch) - 11);
       } else {
          OUT_BATCH(0);
       }
index bb3179df2f79b7f52c52a3d2d182a787232ed3bc..2715b37118adc1ec50430c6671b7b93724cebe8f 100644 (file)
@@ -56,7 +56,7 @@ upload_vs_state(struct brw_context *brw)
    if (prog_data->base.total_scratch) {
       OUT_RELOC(stage_state->scratch_bo,
                I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-               ffs(prog_data->base.total_scratch) - 11);
+               ffs(stage_state->per_thread_scratch) - 11);
    } else {
       OUT_BATCH(0);
    }
index 8d4d4fc60693b1386e34788185fae023162048ee..8243905a8df2e3c11edc0e46fd2cfdf0007130c2 100644 (file)
@@ -235,7 +235,7 @@ gen7_upload_ps_state(struct brw_context *brw,
    if (prog_data->base.total_scratch) {
       OUT_RELOC(brw->wm.base.scratch_bo,
                I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-               ffs(prog_data->base.total_scratch) - 11);
+               ffs(stage_state->per_thread_scratch) - 11);
    } else {
       OUT_BATCH(0);
    }
index 0219d072c726298f6467128c0da63370a38c32e5..6f01abb76fe265682eaee017024d49c447906dd9 100644 (file)
@@ -52,7 +52,7 @@ gen8_upload_ds_state(struct brw_context *brw)
       if (prog_data->total_scratch) {
          OUT_RELOC64(stage_state->scratch_bo,
                      I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                     ffs(prog_data->total_scratch) - 11);
+                     ffs(stage_state->per_thread_scratch) - 11);
       } else {
          OUT_BATCH(0);
          OUT_BATCH(0);
index 2741330106d385e82540fc8f1fe28a23720f81f6..eb0c3dff9cb29a43d6b11b4542a38c29ea929d38 100644 (file)
@@ -57,7 +57,7 @@ gen8_upload_gs_state(struct brw_context *brw)
       if (brw->gs.prog_data->base.base.total_scratch) {
          OUT_RELOC64(stage_state->scratch_bo,
                      I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                     ffs(brw->gs.prog_data->base.base.total_scratch) - 11);
+                     ffs(stage_state->per_thread_scratch) - 11);
       } else {
          OUT_BATCH(0);
          OUT_BATCH(0);
index 4f8eba6cd53c911a7ea97a07a5e635015c80f1eb..e759a648af32eceda093c710d084e5e0ba6d3d3c 100644 (file)
@@ -52,7 +52,7 @@ gen8_upload_hs_state(struct brw_context *brw)
       if (prog_data->base.total_scratch) {
          OUT_RELOC64(stage_state->scratch_bo,
                      I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                     ffs(prog_data->base.total_scratch) - 11);
+                     ffs(stage_state->per_thread_scratch) - 11);
       } else {
          OUT_BATCH(0);
          OUT_BATCH(0);
index 51a3121c72388104e41ebc8d8c10a0a4a43f0f38..0e7d258e1aa19f6fee396e2f95c512c429ebcf4b 100644 (file)
@@ -255,7 +255,7 @@ gen8_upload_ps_state(struct brw_context *brw,
    if (prog_data->base.total_scratch) {
       OUT_RELOC64(stage_state->scratch_bo,
                   I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                  ffs(prog_data->base.total_scratch) - 11);
+                  ffs(stage_state->per_thread_scratch) - 11);
    } else {
       OUT_BATCH(0);
       OUT_BATCH(0);
index d4a345583d68e91cc372469d915053290bd678eb..b7682b553b49bc8b0e6c9c63248e97e5cb12e593 100644 (file)
@@ -58,7 +58,7 @@ upload_vs_state(struct brw_context *brw)
    if (prog_data->base.total_scratch) {
       OUT_RELOC64(stage_state->scratch_bo,
                   I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                  ffs(prog_data->base.total_scratch) - 11);
+                  ffs(stage_state->per_thread_scratch) - 11);
    } else {
       OUT_BATCH(0);
       OUT_BATCH(0);