i965: Compute VS attribute WA bits earlier and check if they changed.
author Kenneth Graunke <kenneth@whitecape.org>
Wed, 3 Dec 2014 22:26:48 +0000 (14:26 -0800)
committer Kenneth Graunke <kenneth@whitecape.org>
Fri, 5 Dec 2014 01:50:52 +0000 (17:50 -0800)
BRW_NEW_VERTICES is flagged every time we draw a primitive.  Having
the brw_vs_prog atom depend on BRW_NEW_VERTICES meant that we had to
compute the VS program key and do a program cache lookup for every
single primitive.  This is painfully expensive.

The workaround bit computation is almost entirely based on the vertex
attribute arrays (brw->vb.inputs[i]), which are set by brw_merge_inputs.
The only thing it uses the VS program for is to see which VS inputs are
actually read.  brw_merge_inputs() happens once per primitive, and can
safely look at the currently bound vertex program, as it doesn't change
in the middle of a draw.

This patch moves the workaround bit computation to brw_merge_inputs(),
right after assigning brw->vb.inputs[i], and stores the previous WA bit
values in the context.  If they've actually changed from the last draw
(which is uncommon), we signal that we need a new vertex program,
causing brw_vs_prog to compute a new key.

Improves performance in Gl32Batch7 by 13.6123% +/- 0.739652% (n=166)
on Haswell GT3e.  I'm told Baytrail shows similar gains.

v2: Introduce a new BRW_NEW_VS_ATTRIB_WORKAROUNDS dirty bit, rather
    than reusing BRW_NEW_VERTEX_PROGRAM (suggested by Chris Forbes).
    This prevents unnecessary re-emission of surface/sampler related
    atoms (and an SOL atom on Sandybridge).

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Chris Forbes <chrisf@ijw.co.nz>
src/mesa/drivers/dri/i965/brw_context.h
src/mesa/drivers/dri/i965/brw_draw.c
src/mesa/drivers/dri/i965/brw_state_upload.c
src/mesa/drivers/dri/i965/brw_vs.c

index 4bb3b17ff975436e3c8bbf0070cae054d11cdc8c..ce67e292b37d83a3f779d7382f998174beaa128c 100644 (file)
@@ -201,6 +201,7 @@ enum brw_state_id {
    BRW_STATE_SF_VP,
    BRW_STATE_CLIP_VP,
    BRW_STATE_SAMPLER_STATE_TABLE,
+   BRW_STATE_VS_ATTRIB_WORKAROUNDS,
    BRW_NUM_STATE_BITS
 };
 
@@ -279,6 +280,7 @@ enum brw_state_id {
 #define BRW_NEW_SF_VP                   (1ull << BRW_STATE_SF_VP)
 #define BRW_NEW_CLIP_VP                 (1ull << BRW_STATE_CLIP_VP)
 #define BRW_NEW_SAMPLER_STATE_TABLE     (1ull << BRW_STATE_SAMPLER_STATE_TABLE)
+#define BRW_NEW_VS_ATTRIB_WORKAROUNDS   (1ull << BRW_STATE_VS_ATTRIB_WORKAROUNDS)
 
 struct brw_state_flags {
    /** State update flags signalled by mesa internals */
@@ -1133,6 +1135,14 @@ struct brw_context
        * the same VB packed over and over again.
        */
       unsigned int start_vertex_bias;
+
+      /**
+       * Certain vertex attribute formats aren't natively handled by the
+       * hardware and require special VS code to fix up their values.
+       *
+       * These bitfields indicate which workarounds are needed.
+       */
+      uint8_t attrib_wa_flags[VERT_ATTRIB_MAX];
    } vb;
 
    struct {
index 4c2802ac66f0da9cc8fa186aefc93a32bcbd4b82..c581cc0f5c8299a0dca17d12592d36cb15e8030b 100644 (file)
@@ -46,6 +46,7 @@
 #include "brw_defines.h"
 #include "brw_context.h"
 #include "brw_state.h"
+#include "brw_vs.h"
 
 #include "intel_batchbuffer.h"
 #include "intel_buffers.h"
@@ -281,6 +282,7 @@ static void brw_emit_prim(struct brw_context *brw,
 static void brw_merge_inputs( struct brw_context *brw,
                       const struct gl_client_array *arrays[])
 {
+   const struct gl_context *ctx = &brw->ctx;
    GLuint i;
 
    for (i = 0; i < brw->vb.nr_buffers; i++) {
@@ -293,6 +295,46 @@ static void brw_merge_inputs( struct brw_context *brw,
       brw->vb.inputs[i].buffer = -1;
       brw->vb.inputs[i].glarray = arrays[i];
    }
+
+   if (brw->gen < 8 && !brw->is_haswell) {
+      struct gl_program *vp = &ctx->VertexProgram._Current->Base;
+      /* Prior to Haswell, the hardware can't natively support GL_FIXED or
+       * 2_10_10_10_REV vertex formats.  Set appropriate workaround flags.
+       */
+      for (i = 0; i < VERT_ATTRIB_MAX; i++) {
+         if (!(vp->InputsRead & BITFIELD64_BIT(i)))
+            continue;
+
+         uint8_t wa_flags = 0;
+
+         switch (brw->vb.inputs[i].glarray->Type) {
+
+         case GL_FIXED:
+            wa_flags = brw->vb.inputs[i].glarray->Size;
+            break;
+
+         case GL_INT_2_10_10_10_REV:
+            wa_flags |= BRW_ATTRIB_WA_SIGN;
+            /* fallthough */
+
+         case GL_UNSIGNED_INT_2_10_10_10_REV:
+            if (brw->vb.inputs[i].glarray->Format == GL_BGRA)
+               wa_flags |= BRW_ATTRIB_WA_BGRA;
+
+            if (brw->vb.inputs[i].glarray->Normalized)
+               wa_flags |= BRW_ATTRIB_WA_NORMALIZE;
+            else if (!brw->vb.inputs[i].glarray->Integer)
+               wa_flags |= BRW_ATTRIB_WA_SCALE;
+
+            break;
+         }
+
+         if (brw->vb.attrib_wa_flags[i] != wa_flags) {
+            brw->vb.attrib_wa_flags[i] = wa_flags;
+            brw->state.dirty.brw |= BRW_NEW_VS_ATTRIB_WORKAROUNDS;
+         }
+      }
+   }
 }
 
 /**
index b3f34647772f8da9fa38b547f348605004c9fbf0..a579781a8517ab862a12beecf8fefbf48b796092 100644 (file)
@@ -525,6 +525,7 @@ static struct dirty_bit_map brw_bits[] = {
    DEFINE_BIT(BRW_NEW_SF_VP),
    DEFINE_BIT(BRW_NEW_CLIP_VP),
    DEFINE_BIT(BRW_NEW_SAMPLER_STATE_TABLE),
+   DEFINE_BIT(BRW_NEW_VS_ATTRIB_WORKAROUNDS),
    {0, 0, 0}
 };
 
index 970d86cc69860cffe7eb30176e17eb04538f515c..cc2af35ab503679bfc80203b00117fbd15fd2789 100644 (file)
@@ -453,42 +453,9 @@ static void brw_upload_vs_prog(struct brw_context *brw)
    brw_populate_sampler_prog_key_data(ctx, prog, brw->vs.base.sampler_count,
                                       &key.base.tex);
 
-   /* BRW_NEW_VERTICES */
-   if (brw->gen < 8 && !brw->is_haswell) {
-      /* Prior to Haswell, the hardware can't natively support GL_FIXED or
-       * 2_10_10_10_REV vertex formats.  Set appropriate workaround flags.
-       */
-      for (i = 0; i < VERT_ATTRIB_MAX; i++) {
-         if (!(vp->program.Base.InputsRead & BITFIELD64_BIT(i)))
-            continue;
-
-         uint8_t wa_flags = 0;
-
-         switch (brw->vb.inputs[i].glarray->Type) {
-
-         case GL_FIXED:
-            wa_flags = brw->vb.inputs[i].glarray->Size;
-            break;
-
-         case GL_INT_2_10_10_10_REV:
-            wa_flags |= BRW_ATTRIB_WA_SIGN;
-            /* fallthough */
-
-         case GL_UNSIGNED_INT_2_10_10_10_REV:
-            if (brw->vb.inputs[i].glarray->Format == GL_BGRA)
-               wa_flags |= BRW_ATTRIB_WA_BGRA;
-
-            if (brw->vb.inputs[i].glarray->Normalized)
-               wa_flags |= BRW_ATTRIB_WA_NORMALIZE;
-            else if (!brw->vb.inputs[i].glarray->Integer)
-               wa_flags |= BRW_ATTRIB_WA_SCALE;
-
-            break;
-         }
-
-         key.gl_attrib_wa_flags[i] = wa_flags;
-      }
-   }
+   /* BRW_NEW_VS_ATTRIB_WORKAROUNDS */
+   memcpy(key.gl_attrib_wa_flags, brw->vb.attrib_wa_flags,
+          sizeof(brw->vb.attrib_wa_flags));
 
    if (!brw_search_cache(&brw->cache, BRW_CACHE_VS_PROG,
                         &key, sizeof(key),
@@ -526,7 +493,7 @@ const struct brw_tracked_state brw_vs_prog = {
                _NEW_TEXTURE |
                _NEW_TRANSFORM,
       .brw   = BRW_NEW_VERTEX_PROGRAM |
-               BRW_NEW_VERTICES,
+               BRW_NEW_VS_ATTRIB_WORKAROUNDS,
    },
    .emit = brw_upload_vs_prog
 };