freedreno/a3xx: use INDIRECT state load for shaders
author Rob Clark <robclark@freedesktop.org>
Fri, 6 Sep 2013 22:21:25 +0000 (18:21 -0400)
committer Rob Clark <robclark@freedesktop.org>
Sat, 14 Sep 2013 17:31:58 +0000 (13:31 -0400)
With a debug option to force DIRECT (mainly to make it easier for
capturing cmdstream dumps).  Using INDIRECT for large shaders at least
makes a noticeable reduction in CPU load, which helps for CPU-limited
games.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
src/gallium/drivers/freedreno/a3xx/fd3_program.c
src/gallium/drivers/freedreno/freedreno_screen.c
src/gallium/drivers/freedreno/freedreno_util.h

index d84bbe9c36fad299bb42adf45f806a88d7463ba2..b0eec6e66d3820dbef1573c48ff0be80349d13a9 100644 (file)
@@ -186,7 +186,8 @@ emit_shader(struct fd_ringbuffer *ring, struct fd3_shader_stateobj *so)
 {
        struct ir3_shader_info *si = &so->info;
        enum adreno_state_block sb;
-       uint32_t i, *bin;
+       enum adreno_state_src src;
+       uint32_t i, sz, *bin;
 
        if (so->type == SHADER_VERTEX) {
                sb = SB_VERT_SHADER;
@@ -194,17 +195,31 @@ emit_shader(struct fd_ringbuffer *ring, struct fd3_shader_stateobj *so)
                sb = SB_FRAG_SHADER;
        }
 
-       // XXX use SS_INDIRECT
-       bin = fd_bo_map(so->bo);
-       OUT_PKT3(ring, CP_LOAD_STATE, 2 + si->sizedwords);
+       if (fd_mesa_debug & FD_DBG_DIRECT) {
+               sz = si->sizedwords;
+               src = SS_DIRECT;
+               bin = fd_bo_map(so->bo);
+       } else {
+               sz = 0;
+               src = SS_INDIRECT;
+               bin = NULL;
+       }
+
+       OUT_PKT3(ring, CP_LOAD_STATE, 2 + sz);
        OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(0) |
-                       CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) |
+                       CP_LOAD_STATE_0_STATE_SRC(src) |
                        CP_LOAD_STATE_0_STATE_BLOCK(sb) |
                        CP_LOAD_STATE_0_NUM_UNIT(so->instrlen));
-       OUT_RING(ring, CP_LOAD_STATE_1_STATE_TYPE(ST_SHADER) |
-                       CP_LOAD_STATE_1_EXT_SRC_ADDR(0));
-       for (i = 0; i < si->sizedwords; i++)
+       if (bin) {
+               OUT_RING(ring, CP_LOAD_STATE_1_EXT_SRC_ADDR(0) |
+                               CP_LOAD_STATE_1_STATE_TYPE(ST_SHADER));
+       } else {
+               OUT_RELOC(ring, so->bo, 0,
+                               CP_LOAD_STATE_1_STATE_TYPE(ST_SHADER), 0);
+       }
+       for (i = 0; i < sz; i++) {
                OUT_RING(ring, bin[i]);
+       }
 }
 
 void
@@ -223,6 +238,10 @@ fd3_program_emit(struct fd_ringbuffer *ring,
 
        OUT_PKT0(ring, REG_A3XX_HLSQ_CONTROL_0_REG, 6);
        OUT_RING(ring, A3XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(FOUR_QUADS) |
+                       /* NOTE:  I guess SHADERRESTART and CONSTFULLUPDATE maybe
+                        * flush some caches? I think we only need to set those
+                        * bits if we have updated const or shader..
+                        */
                        A3XX_HLSQ_CONTROL_0_REG_SPSHADERRESTART |
                        A3XX_HLSQ_CONTROL_0_REG_SPCONSTFULLUPDATE);
        OUT_RING(ring, A3XX_HLSQ_CONTROL_1_REG_VSTHREADSIZE(TWO_QUADS) |
index 7412e3dca96b41c3e084abcc9aece423773dadf9..eada1af9892483037f6ff66f33670ca633732f1b 100644 (file)
@@ -61,6 +61,7 @@ static const struct debug_named_value debug_options[] = {
                {"dclear",    FD_DBG_DCLEAR, "Mark all state dirty after clear"},
                {"dgmem",     FD_DBG_DGMEM,  "Mark all state dirty after GMEM tile pass"},
                {"dscis",     FD_DBG_DSCIS,  "Disable scissor optimization"},
+               {"direct",    FD_DBG_DIRECT, "Force inline (SS_DIRECT) state loads"},
                DEBUG_NAMED_VALUE_END
 };
 
index f8672339cfff87ef5f00545d9546f1b391c0bc20..4c7c78b955d0551c57df12988c7876cda5a3c5e9 100644 (file)
@@ -57,6 +57,7 @@ enum adreno_stencil_op fd_stencil_op(unsigned op);
 #define FD_DBG_DCLEAR   0x04
 #define FD_DBG_DGMEM    0x08
 #define FD_DBG_DSCIS    0x10
+#define FD_DBG_DIRECT   0x20
 extern int fd_mesa_debug;
 
 #define DBG(fmt, ...) \