freedreno/a4xx: use smaller threadsize for more registers
author Rob Clark <robclark@freedesktop.org>
Mon, 18 Jan 2016 20:30:53 +0000 (15:30 -0500)
committer Rob Clark <robclark@freedesktop.org>
Mon, 18 Jan 2016 21:58:25 +0000 (16:58 -0500)
Once we go past half of the "GPR" register file, it seems like we need
to run the fragment shader with a smaller threadsize.  (The vertex shader
already runs at TWO_QUADS, which is the minimum.)

Signed-off-by: Rob Clark <robclark@freedesktop.org>
src/gallium/drivers/freedreno/a4xx/fd4_program.c

index 32b8fce161364540944cafdf6d91c9220e611fc1..74716fb733f1b59cdc08f9fe029039c7a51343fc 100644 (file)
@@ -217,6 +217,7 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit,
        struct stage s[MAX_STAGES];
        uint32_t pos_regid, posz_regid, psize_regid, color_regid[8];
        uint32_t face_regid, coord_regid, zwcoord_regid;
+       enum a3xx_threadsize fssz;
        int constmode;
        int i, j, k;
 
@@ -224,6 +225,8 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit,
 
        setup_stages(emit, s);
 
+       fssz = (s[FS].i->max_reg >= 24) ? TWO_QUADS : FOUR_QUADS;
+
        /* blob seems to always use constmode currently: */
        constmode = 1;
 
@@ -258,7 +261,7 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit,
        OUT_RING(ring, 0x00000003);
 
        OUT_PKT0(ring, REG_A4XX_HLSQ_CONTROL_0_REG, 5);
-       OUT_RING(ring, A4XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(FOUR_QUADS) |
+       OUT_RING(ring, A4XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(fssz) |
                        A4XX_HLSQ_CONTROL_0_REG_CONSTMODE(constmode) |
                        A4XX_HLSQ_CONTROL_0_REG_FSSUPERTHREADENABLE |
                        /* NOTE:  I guess SHADERRESTART and CONSTFULLUPDATE maybe
@@ -385,7 +388,7 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit,
                        A4XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(s[FS].i->max_half_reg + 1) |
                        A4XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(s[FS].i->max_reg + 1) |
                        A4XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP(1) |
-                       A4XX_SP_FS_CTRL_REG0_THREADSIZE(FOUR_QUADS) |
+                       A4XX_SP_FS_CTRL_REG0_THREADSIZE(fssz) |
                        A4XX_SP_FS_CTRL_REG0_SUPERTHREADMODE |
                        COND(s[FS].v->has_samp, A4XX_SP_FS_CTRL_REG0_PIXLODENABLE));
        OUT_RING(ring, A4XX_SP_FS_CTRL_REG1_CONSTLENGTH(s[FS].constlen) |