From ae45a5a28d8c8a48e7353e37da2ce28a6f2bdef4 Mon Sep 17 00:00:00 2001 From: Kenneth Graunke Date: Wed, 3 Dec 2014 14:26:48 -0800 Subject: [PATCH] i965: Compute VS attribute WA bits earlier and check if they changed. BRW_NEW_VERTICES is flagged every time we draw a primitive. Having the brw_vs_prog atom depend on BRW_NEW_VERTICES meant that we had to compute the VS program key and do a program cache lookup for every single primitive. This is painfully expensive. The workaround bit computation is almost entirely based on the vertex attribute arrays (brw->vb.inputs[i]), which are set by brw_merge_inputs. The only thing it uses the VS program for is to see which VS inputs are actually read. brw_merge_inputs() happens once per primitive, and can safely look at the currently bound vertex program, as it doesn't change in the middle of a draw. This patch moves the workaround bit computation to brw_merge_inputs(), right after assigning brw->vb.inputs[i], and stores the previous WA bit values in the context. If they've actually changed from the last draw (which is uncommon), we signal that we need a new vertex program, causing brw_vs_prog to compute a new key. Improves performance in Gl32Batch7 by 13.6123% +/- 0.739652% (n=166) on Haswell GT3e. I'm told Baytrail shows similar gains. v2: Introduce a new BRW_NEW_VS_ATTRIB_WORKAROUNDS dirty bit, rather than reusing BRW_NEW_VERTEX_PROGRAM (suggested by Chris Forbes). This prevents unnecessary re-emission of surface/sampler related atoms (and an SOL atom on Sandybridge). 
Signed-off-by: Kenneth Graunke Reviewed-by: Chris Forbes --- src/mesa/drivers/dri/i965/brw_context.h | 10 +++++ src/mesa/drivers/dri/i965/brw_draw.c | 42 ++++++++++++++++++++ src/mesa/drivers/dri/i965/brw_state_upload.c | 1 + src/mesa/drivers/dri/i965/brw_vs.c | 41 ++----------------- 4 files changed, 57 insertions(+), 37 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h index 4bb3b17ff97..ce67e292b37 100644 --- a/src/mesa/drivers/dri/i965/brw_context.h +++ b/src/mesa/drivers/dri/i965/brw_context.h @@ -201,6 +201,7 @@ enum brw_state_id { BRW_STATE_SF_VP, BRW_STATE_CLIP_VP, BRW_STATE_SAMPLER_STATE_TABLE, + BRW_STATE_VS_ATTRIB_WORKAROUNDS, BRW_NUM_STATE_BITS }; @@ -279,6 +280,7 @@ enum brw_state_id { #define BRW_NEW_SF_VP (1ull << BRW_STATE_SF_VP) #define BRW_NEW_CLIP_VP (1ull << BRW_STATE_CLIP_VP) #define BRW_NEW_SAMPLER_STATE_TABLE (1ull << BRW_STATE_SAMPLER_STATE_TABLE) +#define BRW_NEW_VS_ATTRIB_WORKAROUNDS (1ull << BRW_STATE_VS_ATTRIB_WORKAROUNDS) struct brw_state_flags { /** State update flags signalled by mesa internals */ @@ -1133,6 +1135,14 @@ struct brw_context * the same VB packed over and over again. */ unsigned int start_vertex_bias; + + /** + * Certain vertex attribute formats aren't natively handled by the + * hardware and require special VS code to fix up their values. + * + * These bitfields indicate which workarounds are needed. 
+ */ + uint8_t attrib_wa_flags[VERT_ATTRIB_MAX]; } vb; struct { diff --git a/src/mesa/drivers/dri/i965/brw_draw.c b/src/mesa/drivers/dri/i965/brw_draw.c index 4c2802ac66f..c581cc0f5c8 100644 --- a/src/mesa/drivers/dri/i965/brw_draw.c +++ b/src/mesa/drivers/dri/i965/brw_draw.c @@ -46,6 +46,7 @@ #include "brw_defines.h" #include "brw_context.h" #include "brw_state.h" +#include "brw_vs.h" #include "intel_batchbuffer.h" #include "intel_buffers.h" @@ -281,6 +282,7 @@ static void brw_emit_prim(struct brw_context *brw, static void brw_merge_inputs( struct brw_context *brw, const struct gl_client_array *arrays[]) { + const struct gl_context *ctx = &brw->ctx; GLuint i; for (i = 0; i < brw->vb.nr_buffers; i++) { @@ -293,6 +295,46 @@ static void brw_merge_inputs( struct brw_context *brw, brw->vb.inputs[i].buffer = -1; brw->vb.inputs[i].glarray = arrays[i]; } + + if (brw->gen < 8 && !brw->is_haswell) { + struct gl_program *vp = &ctx->VertexProgram._Current->Base; + /* Prior to Haswell, the hardware can't natively support GL_FIXED or + * 2_10_10_10_REV vertex formats. Set appropriate workaround flags. 
+ */ + for (i = 0; i < VERT_ATTRIB_MAX; i++) { + if (!(vp->InputsRead & BITFIELD64_BIT(i))) + continue; + + uint8_t wa_flags = 0; + + switch (brw->vb.inputs[i].glarray->Type) { + + case GL_FIXED: + wa_flags = brw->vb.inputs[i].glarray->Size; + break; + + case GL_INT_2_10_10_10_REV: + wa_flags |= BRW_ATTRIB_WA_SIGN; + /* fallthrough */ + + case GL_UNSIGNED_INT_2_10_10_10_REV: + if (brw->vb.inputs[i].glarray->Format == GL_BGRA) + wa_flags |= BRW_ATTRIB_WA_BGRA; + + if (brw->vb.inputs[i].glarray->Normalized) + wa_flags |= BRW_ATTRIB_WA_NORMALIZE; + else if (!brw->vb.inputs[i].glarray->Integer) + wa_flags |= BRW_ATTRIB_WA_SCALE; + + break; + } + + if (brw->vb.attrib_wa_flags[i] != wa_flags) { + brw->vb.attrib_wa_flags[i] = wa_flags; + brw->state.dirty.brw |= BRW_NEW_VS_ATTRIB_WORKAROUNDS; + } + } + } } /** diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c index b3f34647772..a579781a851 100644 --- a/src/mesa/drivers/dri/i965/brw_state_upload.c +++ b/src/mesa/drivers/dri/i965/brw_state_upload.c @@ -525,6 +525,7 @@ static struct dirty_bit_map brw_bits[] = { DEFINE_BIT(BRW_NEW_SF_VP), DEFINE_BIT(BRW_NEW_CLIP_VP), DEFINE_BIT(BRW_NEW_SAMPLER_STATE_TABLE), + DEFINE_BIT(BRW_NEW_VS_ATTRIB_WORKAROUNDS), {0, 0, 0} }; diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c index 970d86cc698..cc2af35ab50 100644 --- a/src/mesa/drivers/dri/i965/brw_vs.c +++ b/src/mesa/drivers/dri/i965/brw_vs.c @@ -453,42 +453,9 @@ static void brw_upload_vs_prog(struct brw_context *brw) brw_populate_sampler_prog_key_data(ctx, prog, brw->vs.base.sampler_count, &key.base.tex); - /* BRW_NEW_VERTICES */ - if (brw->gen < 8 && !brw->is_haswell) { - /* Prior to Haswell, the hardware can't natively support GL_FIXED or - * 2_10_10_10_REV vertex formats. Set appropriate workaround flags. 
- */ - for (i = 0; i < VERT_ATTRIB_MAX; i++) { - if (!(vp->program.Base.InputsRead & BITFIELD64_BIT(i))) - continue; - - uint8_t wa_flags = 0; - - switch (brw->vb.inputs[i].glarray->Type) { - - case GL_FIXED: - wa_flags = brw->vb.inputs[i].glarray->Size; - break; - - case GL_INT_2_10_10_10_REV: - wa_flags |= BRW_ATTRIB_WA_SIGN; - /* fallthough */ - - case GL_UNSIGNED_INT_2_10_10_10_REV: - if (brw->vb.inputs[i].glarray->Format == GL_BGRA) - wa_flags |= BRW_ATTRIB_WA_BGRA; - - if (brw->vb.inputs[i].glarray->Normalized) - wa_flags |= BRW_ATTRIB_WA_NORMALIZE; - else if (!brw->vb.inputs[i].glarray->Integer) - wa_flags |= BRW_ATTRIB_WA_SCALE; - - break; - } - - key.gl_attrib_wa_flags[i] = wa_flags; - } - } + /* BRW_NEW_VS_ATTRIB_WORKAROUNDS */ + memcpy(key.gl_attrib_wa_flags, brw->vb.attrib_wa_flags, + sizeof(brw->vb.attrib_wa_flags)); if (!brw_search_cache(&brw->cache, BRW_CACHE_VS_PROG, &key, sizeof(key), @@ -526,7 +493,7 @@ const struct brw_tracked_state brw_vs_prog = { _NEW_TEXTURE | _NEW_TRANSFORM, .brw = BRW_NEW_VERTEX_PROGRAM | - BRW_NEW_VERTICES, + BRW_NEW_VS_ATTRIB_WORKAROUNDS, }, .emit = brw_upload_vs_prog }; -- 2.30.2