From: Francisco Jerez Date: Fri, 10 Jun 2016 23:41:59 +0000 (-0700) Subject: i965: Fix cross-primitive scratch corruption when changing the per-thread allocation. X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=a84b5d43e2e54dbebe3600111f4f35c29411f831;p=mesa.git i965: Fix cross-primitive scratch corruption when changing the per-thread allocation. I haven't found any mention of this in the hardware docs, but experimentally what seems to be going on is that when the per-thread scratch slot size is changed between two pipelined draw calls, shader invocations using the old and new scratch size setting may end up being executed in parallel, causing their scratch offset calculations to be based in a different partitioning of the scratch space, which can cause their thread-local scratch space to overlap leading to cross-thread scratch corruption. I've been experimenting with alternative workarounds, like emitting a PIPE_CONTROL with DC flush and CS stall between draw (or dispatch compute) calls using different per-thread scratch allocation settings, or avoiding reuse of the scratch BO if the per-thread scratch allocation doesn't exactly match the original. Both seem to be as effective as this workaround, but they have potential performance implications, while this should be basically for free. Fixes over 40 failures in our CI system with spilling forced on (including CTS, dEQP and Piglit failures) on a number of different platforms from Gen4 to Gen9. The 'glsl-max-varyings' piglit test seems to be able to reproduce this bug consistently in the vertex shader on at least Gen4, Gen8 and Gen9 with spilling forced on. Cc: Reviewed-by: Kenneth Graunke --- diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h index 9618b4a8182..6e84506e184 100644 --- a/src/mesa/drivers/dri/i965/brw_context.h +++ b/src/mesa/drivers/dri/i965/brw_context.h @@ -674,6 +674,19 @@ struct brw_stage_state /** * Optional scratch buffer used to store spilled register values and * variably-indexed GRF arrays. + * + * The contents of this buffer are short-lived so the same memory can be + * re-used at will for multiple shader programs (executed by the same fixed + * function). However reusing a scratch BO for which shader invocations + * are still in flight with a per-thread scratch slot size other than the + * original can cause threads with different scratch slot size and FFTID + * (which may be executed in parallel depending on the shader stage and + * hardware generation) to map to an overlapping region of the scratch + * space, which can potentially lead to mutual scratch space corruption. + * For that reason if you borrow this scratch buffer you should only be + * using the slot size given by the \c per_thread_scratch member below, + * unless you're taking additional measures to synchronize thread execution + * across slot size changes. */ drm_intel_bo *scratch_bo; diff --git a/src/mesa/drivers/dri/i965/brw_vs_state.c b/src/mesa/drivers/dri/i965/brw_vs_state.c index c728f09cac7..331949aa1f1 100644 --- a/src/mesa/drivers/dri/i965/brw_vs_state.c +++ b/src/mesa/drivers/dri/i965/brw_vs_state.c @@ -83,7 +83,7 @@ brw_upload_vs_unit(struct brw_context *brw) vs->thread2.scratch_space_base_pointer = stage_state->scratch_bo->offset64 >> 10; /* reloc */ vs->thread2.per_thread_scratch_space = - ffs(brw->vs.prog_data->base.base.total_scratch) - 11; + ffs(stage_state->per_thread_scratch) - 11; } else { vs->thread2.scratch_space_base_pointer = 0; vs->thread2.per_thread_scratch_space = 0; diff --git a/src/mesa/drivers/dri/i965/brw_wm_state.c b/src/mesa/drivers/dri/i965/brw_wm_state.c index bf1bdc9948f..dda4f23e554 100644 --- a/src/mesa/drivers/dri/i965/brw_wm_state.c +++ b/src/mesa/drivers/dri/i965/brw_wm_state.c @@ -133,7 +133,7 @@ brw_upload_wm_unit(struct brw_context *brw) wm->thread2.scratch_space_base_pointer = brw->wm.base.scratch_bo->offset64 >> 10; /* reloc */ wm->thread2.per_thread_scratch_space = - ffs(prog_data->base.total_scratch) - 11; + ffs(brw->wm.base.per_thread_scratch) - 11; } else { wm->thread2.scratch_space_base_pointer = 0; wm->thread2.per_thread_scratch_space = 0; diff --git a/src/mesa/drivers/dri/i965/gen6_gs_state.c b/src/mesa/drivers/dri/i965/gen6_gs_state.c index 4e4b9463464..da7322ebe9e 100644 --- a/src/mesa/drivers/dri/i965/gen6_gs_state.c +++ b/src/mesa/drivers/dri/i965/gen6_gs_state.c @@ -140,7 +140,7 @@ upload_gs_state(struct brw_context *brw) if (prog_data->base.total_scratch) { OUT_RELOC(stage_state->scratch_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, - ffs(prog_data->base.total_scratch) - 11); + ffs(stage_state->per_thread_scratch) - 11); } else { OUT_BATCH(0); /* no scratch space */ } diff --git a/src/mesa/drivers/dri/i965/gen6_vs_state.c b/src/mesa/drivers/dri/i965/gen6_vs_state.c index 3ae00ec29c0..0ad74c4fd38 100644 --- a/src/mesa/drivers/dri/i965/gen6_vs_state.c +++ b/src/mesa/drivers/dri/i965/gen6_vs_state.c @@ -126,7 +126,7 @@ upload_vs_state(struct brw_context *brw) if (brw->vs.prog_data->base.base.total_scratch) { OUT_RELOC(stage_state->scratch_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, - ffs(brw->vs.prog_data->base.base.total_scratch) - 11); + ffs(stage_state->per_thread_scratch) - 11); } else { OUT_BATCH(0); } diff --git a/src/mesa/drivers/dri/i965/gen6_wm_state.c b/src/mesa/drivers/dri/i965/gen6_wm_state.c index 3e872af4894..34aa1218b96 100644 --- a/src/mesa/drivers/dri/i965/gen6_wm_state.c +++ b/src/mesa/drivers/dri/i965/gen6_wm_state.c @@ -220,7 +220,7 @@ gen6_upload_wm_state(struct brw_context *brw, if (prog_data->base.total_scratch) { OUT_RELOC(stage_state->scratch_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, - ffs(prog_data->base.total_scratch) - 11); + ffs(stage_state->per_thread_scratch) - 11); } else { OUT_BATCH(0); } diff --git a/src/mesa/drivers/dri/i965/gen7_cs_state.c b/src/mesa/drivers/dri/i965/gen7_cs_state.c index ff308e6f790..5427fa5af19 100644 --- a/src/mesa/drivers/dri/i965/gen7_cs_state.c +++ b/src/mesa/drivers/dri/i965/gen7_cs_state.c @@ -70,21 +70,21 @@ brw_upload_cs_state(struct brw_context *brw) */ OUT_RELOC64(stage_state->scratch_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, - ffs(prog_data->total_scratch) - 11); + ffs(stage_state->per_thread_scratch) - 11); } else if (brw->is_haswell) { /* Haswell's Per Thread Scratch Space is in the range [0, 10] * where 0 = 2k, 1 = 4k, 2 = 8k, ..., 10 = 2M. */ OUT_RELOC(stage_state->scratch_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, - ffs(prog_data->total_scratch) - 12); + ffs(stage_state->per_thread_scratch) - 12); } else { /* Earlier platforms use the range [0, 11] to mean [1kB, 12kB] * where 0 = 1kB, 1 = 2kB, 2 = 3kB, ..., 11 = 12kB. */ OUT_RELOC(stage_state->scratch_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, - prog_data->total_scratch / 1024 - 1); + stage_state->per_thread_scratch / 1024 - 1); } } else { OUT_BATCH(0); diff --git a/src/mesa/drivers/dri/i965/gen7_ds_state.c b/src/mesa/drivers/dri/i965/gen7_ds_state.c index 2fe0d88f119..3e1a03b230d 100644 --- a/src/mesa/drivers/dri/i965/gen7_ds_state.c +++ b/src/mesa/drivers/dri/i965/gen7_ds_state.c @@ -82,7 +82,7 @@ gen7_upload_ds_state(struct brw_context *brw) if (prog_data->total_scratch) { OUT_RELOC(stage_state->scratch_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, - ffs(prog_data->total_scratch) - 11); + ffs(stage_state->per_thread_scratch) - 11); } else { OUT_BATCH(0); } diff --git a/src/mesa/drivers/dri/i965/gen7_gs_state.c b/src/mesa/drivers/dri/i965/gen7_gs_state.c index 6b126fe8fdb..7d733933e2b 100644 --- a/src/mesa/drivers/dri/i965/gen7_gs_state.c +++ b/src/mesa/drivers/dri/i965/gen7_gs_state.c @@ -64,7 +64,7 @@ upload_gs_state(struct brw_context *brw) if (brw->gs.prog_data->base.base.total_scratch) { OUT_RELOC(stage_state->scratch_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, - ffs(brw->gs.prog_data->base.base.total_scratch) - 11); + ffs(stage_state->per_thread_scratch) - 11); } else { OUT_BATCH(0); } diff --git a/src/mesa/drivers/dri/i965/gen7_hs_state.c b/src/mesa/drivers/dri/i965/gen7_hs_state.c index 4f948dc22e4..297d61b36f0 100644 --- a/src/mesa/drivers/dri/i965/gen7_hs_state.c +++ b/src/mesa/drivers/dri/i965/gen7_hs_state.c @@ -83,7 +83,7 @@ gen7_upload_hs_state(struct brw_context *brw) if (prog_data->base.total_scratch) { OUT_RELOC(stage_state->scratch_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, - ffs(prog_data->base.total_scratch) - 11); + ffs(stage_state->per_thread_scratch) - 11); } else { OUT_BATCH(0); } diff --git a/src/mesa/drivers/dri/i965/gen7_vs_state.c b/src/mesa/drivers/dri/i965/gen7_vs_state.c index bb3179df2f7..2715b37118a 100644 --- a/src/mesa/drivers/dri/i965/gen7_vs_state.c +++ b/src/mesa/drivers/dri/i965/gen7_vs_state.c @@ -56,7 +56,7 @@ upload_vs_state(struct brw_context *brw) if (prog_data->base.total_scratch) { OUT_RELOC(stage_state->scratch_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, - ffs(prog_data->base.total_scratch) - 11); + ffs(stage_state->per_thread_scratch) - 11); } else { OUT_BATCH(0); } diff --git a/src/mesa/drivers/dri/i965/gen7_wm_state.c b/src/mesa/drivers/dri/i965/gen7_wm_state.c index 8d4d4fc6069..8243905a8df 100644 --- a/src/mesa/drivers/dri/i965/gen7_wm_state.c +++ b/src/mesa/drivers/dri/i965/gen7_wm_state.c @@ -235,7 +235,7 @@ gen7_upload_ps_state(struct brw_context *brw, if (prog_data->base.total_scratch) { OUT_RELOC(brw->wm.base.scratch_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, - ffs(prog_data->base.total_scratch) - 11); + ffs(stage_state->per_thread_scratch) - 11); } else { OUT_BATCH(0); } diff --git a/src/mesa/drivers/dri/i965/gen8_ds_state.c b/src/mesa/drivers/dri/i965/gen8_ds_state.c index 0219d072c72..6f01abb76fe 100644 --- a/src/mesa/drivers/dri/i965/gen8_ds_state.c +++ b/src/mesa/drivers/dri/i965/gen8_ds_state.c @@ -52,7 +52,7 @@ gen8_upload_ds_state(struct brw_context *brw) if (prog_data->total_scratch) { OUT_RELOC64(stage_state->scratch_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, - ffs(prog_data->total_scratch) - 11); + ffs(stage_state->per_thread_scratch) - 11); } else { OUT_BATCH(0); OUT_BATCH(0); diff --git a/src/mesa/drivers/dri/i965/gen8_gs_state.c b/src/mesa/drivers/dri/i965/gen8_gs_state.c index 2741330106d..eb0c3dff9cb 100644 --- a/src/mesa/drivers/dri/i965/gen8_gs_state.c +++ b/src/mesa/drivers/dri/i965/gen8_gs_state.c @@ -57,7 +57,7 @@ gen8_upload_gs_state(struct brw_context *brw) if (brw->gs.prog_data->base.base.total_scratch) { OUT_RELOC64(stage_state->scratch_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, - ffs(brw->gs.prog_data->base.base.total_scratch) - 11); + ffs(stage_state->per_thread_scratch) - 11); } else { OUT_BATCH(0); OUT_BATCH(0); diff --git a/src/mesa/drivers/dri/i965/gen8_hs_state.c b/src/mesa/drivers/dri/i965/gen8_hs_state.c index 4f8eba6cd53..e759a648af3 100644 --- a/src/mesa/drivers/dri/i965/gen8_hs_state.c +++ b/src/mesa/drivers/dri/i965/gen8_hs_state.c @@ -52,7 +52,7 @@ gen8_upload_hs_state(struct brw_context *brw) if (prog_data->base.total_scratch) { OUT_RELOC64(stage_state->scratch_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, - ffs(prog_data->base.total_scratch) - 11); + ffs(stage_state->per_thread_scratch) - 11); } else { OUT_BATCH(0); OUT_BATCH(0); diff --git a/src/mesa/drivers/dri/i965/gen8_ps_state.c b/src/mesa/drivers/dri/i965/gen8_ps_state.c index 51a3121c723..0e7d258e1aa 100644 --- a/src/mesa/drivers/dri/i965/gen8_ps_state.c +++ b/src/mesa/drivers/dri/i965/gen8_ps_state.c @@ -255,7 +255,7 @@ gen8_upload_ps_state(struct brw_context *brw, if (prog_data->base.total_scratch) { OUT_RELOC64(stage_state->scratch_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, - ffs(prog_data->base.total_scratch) - 11); + ffs(stage_state->per_thread_scratch) - 11); } else { OUT_BATCH(0); OUT_BATCH(0); diff --git a/src/mesa/drivers/dri/i965/gen8_vs_state.c b/src/mesa/drivers/dri/i965/gen8_vs_state.c index d4a345583d6..b7682b553b4 100644 --- a/src/mesa/drivers/dri/i965/gen8_vs_state.c +++ b/src/mesa/drivers/dri/i965/gen8_vs_state.c @@ -58,7 +58,7 @@ upload_vs_state(struct brw_context *brw) if (prog_data->base.total_scratch) { OUT_RELOC64(stage_state->scratch_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, - ffs(prog_data->base.total_scratch) - 11); + ffs(stage_state->per_thread_scratch) - 11); } else { OUT_BATCH(0); OUT_BATCH(0);