From: Kenneth Graunke
Date: Mon, 9 Apr 2018 22:39:56 +0000 (-0700)
Subject: i965: Emit VF cache invalidates for 48-bit addressing bugs with softpin.
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=92f01fc5f914fd500497d0c3aed75f3ac8dc054d;p=mesa.git

i965: Emit VF cache invalidates for 48-bit addressing bugs with softpin.

We'd like to start using soft-pin to assign BO addresses up front, and
never move them again. Our previous plan for dealing with 48-bit VF
cache bugs was to relocate vertex buffers to the low 4GB, so we'd never
have addresses that alias in the low 32 bits. But that requires moving
buffers dynamically.

This patch tracks the last seen BO address for each vertex/index buffer,
and emits a VF cache invalidate if the high bits change. (Ideally, we
won't hit this case very often.) This should work for the soft-pin case,
but unfortunately won't work in the relocation case, as we don't actually
know the addresses. So, we have to use both methods.

v2: Mention that the cache uses a tuple more explicitly (suggested by
    Scott). Mention "single batch" too (suggested by Chris).

Reviewed-by: Scott D Phillips
---

diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 0844400bc53..773f104824d 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -966,6 +966,9 @@ struct brw_context
        * These bitfields indicate which workarounds are needed.
        */
       uint8_t attrib_wa_flags[VERT_ATTRIB_MAX];
+
+      /* High bits of the last seen vertex buffer address (for workarounds). */
+      uint16_t last_bo_high_bits[33];
    } vb;
 
    struct {
@@ -986,6 +989,9 @@ struct brw_context
        * referencing the same index buffer.
        */
       unsigned int start_vertex_offset;
+
+      /* High bits of the last seen index buffer address (for workarounds). */
+      uint16_t last_bo_high_bits;
    } ib;
 
    /* Active vertex program:
diff --git a/src/mesa/drivers/dri/i965/genX_state_upload.c b/src/mesa/drivers/dri/i965/genX_state_upload.c
index 6178bfa3f88..4f44b9965e6 100644
--- a/src/mesa/drivers/dri/i965/genX_state_upload.c
+++ b/src/mesa/drivers/dri/i965/genX_state_upload.c
@@ -480,6 +480,74 @@ upload_format_size(uint32_t upload_format)
    }
 }
 
+/* Return the bits of bo->gtt_offset above the low 32 (truncated to 16 bits)
+ * for a BO flagged EXEC_OBJECT_PINNED, or 0 for any other BO, whose final
+ * address we don't actually know.
+ */
+static UNUSED uint16_t
+pinned_bo_high_bits(struct brw_bo *bo)
+{
+   return (bo->kflags & EXEC_OBJECT_PINNED) ? bo->gtt_offset >> 32ull : 0;
+}
+
+/* The VF cache designers apparently cut corners, and made the cache key's
+ * <VertexBufferIndex, Memory Address> tuple only consider the bottom 32 bits
+ * of the address. If you happen to have two vertex buffers which get placed
+ * exactly 4 GiB apart and use them in back-to-back draw calls, you can get
+ * collisions. (These collisions can happen within a single batch.)
+ *
+ * In the soft-pin world, we'd like to assign addresses up front, and never
+ * move buffers. So, we need to do a VF cache invalidate if the buffer for
+ * a particular VB slot has different [48:32] address bits than the last one.
+ *
+ * In the relocation world, we have no idea what the addresses will be, so
+ * we can't apply this workaround. Instead, we tell the kernel to move it
+ * to the low 4GB regardless.
+ */
+static void
+vf_invalidate_for_vb_48bit_transitions(struct brw_context *brw)
+{
+#if GEN_GEN >= 8
+   /* Only flush if some slot's high address bits changed below. */
+   bool need_invalidate = false;
+   unsigned i;
+
+   for (i = 0; i < brw->vb.nr_buffers; i++) {
+      uint16_t high_bits = pinned_bo_high_bits(brw->vb.buffers[i].bo);
+
+      if (high_bits != brw->vb.last_bo_high_bits[i]) {
+         need_invalidate = true;
+         brw->vb.last_bo_high_bits[i] = high_bits;
+      }
+   }
+
+   /* Don't bother with draw parameter buffers - those are generated by
+    * the driver so we can select a consistent memory zone.
+    */
+
+   if (need_invalidate) {
+      brw_emit_pipe_control_flush(brw, PIPE_CONTROL_VF_CACHE_INVALIDATE);
+   }
+#endif
+}
+
+/* Index buffer counterpart of the workaround above: emit a VF cache
+ * invalidate when the index buffer's high address bits differ from the last
+ * ones recorded, then remember the new value.
+ */
+static void
+vf_invalidate_for_ib_48bit_transition(struct brw_context *brw)
+{
+#if GEN_GEN >= 8
+   uint16_t high_bits = pinned_bo_high_bits(brw->ib.bo);
+
+   if (high_bits != brw->ib.last_bo_high_bits) {
+      brw_emit_pipe_control_flush(brw, PIPE_CONTROL_VF_CACHE_INVALIDATE);
+      brw->ib.last_bo_high_bits = high_bits;
+   }
+#endif
+}
+
 static void
 genX(emit_vertices)(struct brw_context *brw)
 {
@@ -594,6 +662,8 @@ genX(emit_vertices)(struct brw_context *brw)
    const unsigned nr_buffers = brw->vb.nr_buffers +
       uses_draw_params + uses_derived_draw_params;
 
+   vf_invalidate_for_vb_48bit_transitions(brw);
+
    if (nr_buffers) {
       assert(nr_buffers <= (GEN_GEN >= 6 ? 33 : 17));
 
@@ -886,6 +956,8 @@ genX(emit_index_buffer)(struct brw_context *brw)
    if (index_buffer == NULL)
       return;
 
+   vf_invalidate_for_ib_48bit_transition(brw);
+
    brw_batch_emit(brw, GENX(3DSTATE_INDEX_BUFFER), ib) {
 #if GEN_GEN < 8 && !GEN_IS_HASWELL
       ib.CutIndexEnable = brw->prim_restart.enable_cut_index;