i965: Upload as many VS constants as possible through the push constants.
author: Eric Anholt <eric@anholt.net>
Mon, 18 Jan 2010 23:12:40 +0000 (15:12 -0800)
committer: Eric Anholt <eric@anholt.net>
Tue, 19 Jan 2010 19:31:23 +0000 (11:31 -0800)
The pull constants require sending out to an overworked shared unit
and waiting for a response, while push constants are nicely loaded in
for us at thread dispatch time.  By putting things we access in every
VS invocation there, ETQW performance improved by 2.5% +/- 1.6% (n=6).

src/mesa/drivers/dri/i965/brw_context.h
src/mesa/drivers/dri/i965/brw_curbe.c
src/mesa/drivers/dri/i965/brw_vs.c
src/mesa/drivers/dri/i965/brw_vs.h
src/mesa/drivers/dri/i965/brw_vs_emit.c

index 21d82977b7c810572af1787ee1dbf900a32e1624..cbf022ca0c76e11682d1cde7e4c40223d78d377f 100644 (file)
@@ -582,6 +582,7 @@ struct brw_context
 
    struct {
       struct brw_vs_prog_data *prog_data;
+      int8_t *constant_map; /* variable array following prog_data */
 
       dri_bo *prog_bo;
       dri_bo *state_bo;
index 190310afbb034f1e3002f44dffa392900785a8c3..22e3e732f40601099fff8dd27d1ab5d23a8fbdff 100644 (file)
@@ -256,13 +256,24 @@ static void prepare_constant_buffer(struct brw_context *brw)
        */
       _mesa_load_state_parameters(ctx, vp->program.Base.Parameters); 
 
-      /* XXX just use a memcpy here */
-      for (i = 0; i < nr; i++) {
-         const GLfloat *value = vp->program.Base.Parameters->ParameterValues[i];
-        buf[offset + i * 4 + 0] = value[0];
-        buf[offset + i * 4 + 1] = value[1];
-        buf[offset + i * 4 + 2] = value[2];
-        buf[offset + i * 4 + 3] = value[3];
+      if (vp->use_const_buffer) {
+        /* Load the subset of push constants that will get used when
+         * we also have a pull constant buffer.
+         */
+        for (i = 0; i < vp->program.Base.Parameters->NumParameters; i++) {
+           if (brw->vs.constant_map[i] != -1) {
+              assert(brw->vs.constant_map[i] <= nr);
+              memcpy(buf + offset + brw->vs.constant_map[i] * 4,
+                     vp->program.Base.Parameters->ParameterValues[i],
+                     4 * sizeof(float));
+           }
+        }
+      } else {
+        for (i = 0; i < nr; i++) {
+           memcpy(buf + offset + i * 4,
+                  vp->program.Base.Parameters->ParameterValues[i],
+                  4 * sizeof(float));
+        }
       }
    }
 
index a66927235e57cd89003ab04b312b8a8426fbe24b..44b085e214b8654e0b37c2a24c259afa0508c0ca 100644 (file)
@@ -35,6 +35,7 @@
 #include "brw_util.h"
 #include "brw_state.h"
 #include "shader/prog_print.h"
+#include "shader/prog_parameter.h"
 
 
 
@@ -42,9 +43,11 @@ static void do_vs_prog( struct brw_context *brw,
                        struct brw_vertex_program *vp,
                        struct brw_vs_prog_key *key )
 {
+   GLcontext *ctx = &brw->intel.ctx;
    GLuint program_size;
    const GLuint *program;
    struct brw_vs_compile c;
+   int aux_size;
 
    memset(&c, 0, sizeof(c));
    memcpy(&c.key, key, sizeof(*key));
@@ -73,13 +76,26 @@ static void do_vs_prog( struct brw_context *brw,
     */
    program = brw_get_program(&c.func, &program_size);
 
+   /* We upload from &c.prog_data including the constant_map assuming
+    * they're packed together.  It would be nice to have a
+    * compile-time assert macro here.
+    */
+   assert(c.constant_map == (int8_t *)&c.prog_data +
+         sizeof(c.prog_data));
+   assert(ctx->Const.VertexProgram.MaxNativeParameters ==
+         ARRAY_SIZE(c.constant_map));
+
+   aux_size = sizeof(c.prog_data);
+   if (c.vp->use_const_buffer)
+      aux_size += c.vp->program.Base.Parameters->NumParameters;
+
    dri_bo_unreference(brw->vs.prog_bo);
    brw->vs.prog_bo = brw_upload_cache_with_auxdata(&brw->cache, BRW_VS_PROG,
                                                   &c.key, sizeof(c.key),
                                                   NULL, 0,
                                                   program, program_size,
                                                   &c.prog_data,
-                                                  sizeof(c.prog_data),
+                                                  aux_size,
                                                   &brw->vs.prog_data);
 }
 
@@ -110,6 +126,8 @@ static void brw_upload_vs_prog(struct brw_context *brw)
                                      &brw->vs.prog_data);
    if (brw->vs.prog_bo == NULL)
       do_vs_prog(brw, vp, &key);
+   brw->vs.constant_map = ((int8_t *)brw->vs.prog_data +
+                          sizeof(*brw->vs.prog_data));
 }
 
 
index 249d48bc1806229c0193ec7536727e43932b9b6b..95e0501b1ebb3262bfb12936401aa56040b4b0cc 100644 (file)
@@ -51,6 +51,7 @@ struct brw_vs_compile {
    struct brw_compile func;
    struct brw_vs_prog_key key;
    struct brw_vs_prog_data prog_data;
+   int8_t constant_map[1024];
 
    struct brw_vertex_program *vp;
 
index 7a252dde6c00576d514809c424ba22196c47bcaf..52cc04fee8747f9a21ef46b3e4afe00f8461d5d6 100644 (file)
@@ -104,9 +104,47 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
    /* Vertex program parameters from curbe:
     */
    if (c->vp->use_const_buffer) {
-      /* get constants from a real constant buffer */
-      c->prog_data.curb_read_length = 0;
-      c->prog_data.nr_params = 4; /* XXX 0 causes a bug elsewhere... */
+      int max_constant = BRW_MAX_GRF - 20 - c->vp->program.Base.NumTemporaries;
+      int constant = 0;
+
+      /* We've got more constants than we can load with the push
+       * mechanism.  This is often correlated with reladdr loads where
+       * we should probably be using a pull mechanism anyway to avoid
+       * excessive reading.  However, the pull mechanism is slow in
+       * general.  So, we try to allocate as many non-reladdr-loaded
+       * constants through the push buffer as we can before giving up.
+       */
+      memset(c->constant_map, -1, c->vp->program.Base.Parameters->NumParameters);
+      for (i = 0;
+          i < c->vp->program.Base.NumInstructions && constant < max_constant;
+          i++) {
+        struct prog_instruction *inst = &c->vp->program.Base.Instructions[i];
+        int arg;
+
+        for (arg = 0; arg < 3 && constant < max_constant; arg++) {
+           if ((inst->SrcReg[arg].File != PROGRAM_STATE_VAR &&
+                inst->SrcReg[arg].File != PROGRAM_CONSTANT &&
+                inst->SrcReg[arg].File != PROGRAM_UNIFORM &&
+                inst->SrcReg[arg].File != PROGRAM_ENV_PARAM &&
+                inst->SrcReg[arg].File != PROGRAM_LOCAL_PARAM) ||
+               inst->SrcReg[arg].RelAddr)
+              continue;
+
+           if (c->constant_map[inst->SrcReg[arg].Index] == -1) {
+              c->constant_map[inst->SrcReg[arg].Index] = constant++;
+           }
+        }
+      }
+
+      for (i = 0; i < constant; i++) {
+         c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2,
+                                                             (i%2) * 4),
+                                                0, 4, 1);
+      }
+      reg += (constant + 1) / 2;
+      c->prog_data.curb_read_length = reg - 1;
+      /* XXX 0 causes a bug elsewhere... */
+      c->prog_data.nr_params = MAX2(constant * 4, 4);
    }
    else {
       /* use a section of the GRF for constants */
@@ -955,7 +993,10 @@ get_src_reg( struct brw_vs_compile *c,
    case PROGRAM_ENV_PARAM:
    case PROGRAM_LOCAL_PARAM:
       if (c->vp->use_const_buffer) {
-        if (relAddr)
+        if (!relAddr && c->constant_map[index] != -1) {
+           assert(c->regs[PROGRAM_STATE_VAR][c->constant_map[index]].nr != 0);
+           return c->regs[PROGRAM_STATE_VAR][c->constant_map[index]];
+        } else if (relAddr)
            return get_reladdr_constant(c, inst, argIndex);
         else
            return get_constant(c, inst, argIndex);