freedreno/a5xx: SSBO support
author    Rob Clark <robdclark@gmail.com>  Mon, 17 Apr 2017 15:25:29 +0000 (11:25 -0400)
committer Rob Clark <robdclark@gmail.com>  Thu, 4 May 2017 17:48:06 +0000 (13:48 -0400)
To simplify things for now, since all the gfx shader stages share a
single SSBO state block, only advertise SSBO support for the fragment
shader (and for compute once we have that).  We could possibly use a
fixed partitioning of the SSBO index space to support SSBOs on the
other stages without having to resort to shader variants.
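
A sketch of what such a fixed partitioning could look like (purely
hypothetical, nothing in this patch implements it): give each gfx
stage a fixed base offset into the shared index space, sized to match
the per-stage limits the blob advertises (4 SSBOs per gfx stage):

    /* hypothetical helper; compute would keep its own state block */
    static unsigned
    ssbo_base(enum pipe_shader_type shader)
    {
        switch (shader) {
        case PIPE_SHADER_VERTEX:    return 0;
        case PIPE_SHADER_TESS_CTRL: return 4;
        case PIPE_SHADER_TESS_EVAL: return 8;
        case PIPE_SHADER_GEOMETRY:  return 12;
        case PIPE_SHADER_FRAGMENT:  return 16;
        default:                    return 0;
        }
    }

Shader-visible SSBO index N for a given stage would then map to slot
ssbo_base(stage) + N in the shared state block, so no per-variant
index patching would be needed.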

Signed-off-by: Rob Clark <robdclark@gmail.com>
src/gallium/drivers/freedreno/a5xx/fd5_emit.c
src/gallium/drivers/freedreno/a5xx/fd5_program.c
src/gallium/drivers/freedreno/freedreno_screen.c

index 5b25257167b0f2fea67a670aa3a4cce854950f3a..a51401b4cfecb5b9d0c6e32f14909f1c9b86458e 100644
@@ -345,6 +345,72 @@ emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring,
        return needs_border;
 }
 
+static void
+emit_ssbos(struct fd_context *ctx, struct fd_ringbuffer *ring,
+               enum a4xx_state_block sb, struct fd_shaderbuf_stateobj *so)
+{
+       unsigned count = util_last_bit(so->enabled_mask);
+
+       if (count == 0)
+               return;
+
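+       /* state type 0: four dwords per buffer; the first two hold the
+        * 64b buffer address, the other two are not yet understood:
+        */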
+       OUT_PKT7(ring, CP_LOAD_STATE4, 3 + (4 * count));
+       OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(0) |
+                       CP_LOAD_STATE4_0_STATE_SRC(SS4_DIRECT) |
+                       CP_LOAD_STATE4_0_STATE_BLOCK(sb) |
+                       CP_LOAD_STATE4_0_NUM_UNIT(count));
+       OUT_RING(ring, CP_LOAD_STATE4_1_STATE_TYPE(0) |
+                       CP_LOAD_STATE4_1_EXT_SRC_ADDR(0));
+       OUT_RING(ring, CP_LOAD_STATE4_2_EXT_SRC_ADDR_HI(0));
+       for (unsigned i = 0; i < count; i++) {
+               struct pipe_shader_buffer *buf = &so->sb[i];
+               if (buf->buffer) {
+                       struct fd_resource *rsc = fd_resource(buf->buffer);
+                       OUT_RELOCW(ring, rsc->bo, 0, 0, 0);
+               } else {
+                       OUT_RING(ring, 0x00000000);
+                       OUT_RING(ring, 0x00000000);
+               }
+               OUT_RING(ring, 0x00000000);
+               OUT_RING(ring, 0x00000000);
+       }
+
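+       /* state type 1: two dwords per buffer, with the buffer size in
+        * the upper 16 bits of the first dword:
+        */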
+       OUT_PKT7(ring, CP_LOAD_STATE4, 3 + (2 * count));
+       OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(0) |
+                       CP_LOAD_STATE4_0_STATE_SRC(SS4_DIRECT) |
+                       CP_LOAD_STATE4_0_STATE_BLOCK(sb) |
+                       CP_LOAD_STATE4_0_NUM_UNIT(count));
+       OUT_RING(ring, CP_LOAD_STATE4_1_STATE_TYPE(1) |
+                       CP_LOAD_STATE4_1_EXT_SRC_ADDR(0));
+       OUT_RING(ring, CP_LOAD_STATE4_2_EXT_SRC_ADDR_HI(0));
+       for (unsigned i = 0; i < count; i++) {
+               struct pipe_shader_buffer *buf = &so->sb[i];
+
+               // TODO maybe offset encoded somewhere here??
+               OUT_RING(ring, (buf->buffer_size << 16));
+               OUT_RING(ring, 0x00000000);
+       }
+
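+       /* state type 2: two dwords per buffer, seemingly the 64b buffer
+        * address once more:
+        */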
+       OUT_PKT7(ring, CP_LOAD_STATE4, 3 + (2 * count));
+       OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(0) |
+                       CP_LOAD_STATE4_0_STATE_SRC(SS4_DIRECT) |
+                       CP_LOAD_STATE4_0_STATE_BLOCK(sb) |
+                       CP_LOAD_STATE4_0_NUM_UNIT(count));
+       OUT_RING(ring, CP_LOAD_STATE4_1_STATE_TYPE(2) |
+                       CP_LOAD_STATE4_1_EXT_SRC_ADDR(0));
+       OUT_RING(ring, CP_LOAD_STATE4_2_EXT_SRC_ADDR_HI(0));
+       for (unsigned i = 0; i < count; i++) {
+               struct pipe_shader_buffer *buf = &so->sb[i];
+               if (buf->buffer) {
+                       struct fd_resource *rsc = fd_resource(buf->buffer);
+                       OUT_RELOCW(ring, rsc->bo, 0, 0, 0);
+               } else {
+                       OUT_RING(ring, 0x00000000);
+                       OUT_RING(ring, 0x00000000);
+               }
+       }
+}
+
 void
 fd5_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd5_emit *emit)
 {
@@ -663,6 +729,9 @@ fd5_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
 
        if (needs_border)
                emit_border_color(ctx, ring);
+
+       if (ctx->dirty_shader[PIPE_SHADER_FRAGMENT] & FD_DIRTY_SHADER_SSBO)
+               emit_ssbos(ctx, ring, SB4_SSBO, &ctx->shaderbuf[PIPE_SHADER_FRAGMENT]);
 }
 
 /* emit setup at begin of new cmdstream buffer (don't rely on previous
index 232b3fb877574d2771a464c8b7e465964f68fe46..54b5d8063c695a716d9ed2fd758ecf37139b1798 100644
@@ -389,11 +389,16 @@ fd5_program_emit(struct fd_ringbuffer *ring, struct fd5_emit *emit)
        OUT_RING(ring, 0x00000000);
 
        OUT_PKT4(ring, REG_A5XX_HLSQ_VS_CNTL, 5);
-       OUT_RING(ring, A5XX_HLSQ_VS_CNTL_INSTRLEN(s[VS].instrlen));
-       OUT_RING(ring, A5XX_HLSQ_FS_CNTL_INSTRLEN(s[FS].instrlen));
-       OUT_RING(ring, A5XX_HLSQ_HS_CNTL_INSTRLEN(s[HS].instrlen));
-       OUT_RING(ring, A5XX_HLSQ_DS_CNTL_INSTRLEN(s[DS].instrlen));
-       OUT_RING(ring, A5XX_HLSQ_GS_CNTL_INSTRLEN(s[GS].instrlen));
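+       /* set SSBO_ENABLE for stages whose shader actually uses SSBOs: */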
+       OUT_RING(ring, A5XX_HLSQ_VS_CNTL_INSTRLEN(s[VS].instrlen) |
+                       COND(s[VS].v && s[VS].v->has_ssbo, A5XX_HLSQ_VS_CNTL_SSBO_ENABLE));
+       OUT_RING(ring, A5XX_HLSQ_FS_CNTL_INSTRLEN(s[FS].instrlen) |
+                       COND(s[FS].v && s[FS].v->has_ssbo, A5XX_HLSQ_FS_CNTL_SSBO_ENABLE));
+       OUT_RING(ring, A5XX_HLSQ_HS_CNTL_INSTRLEN(s[HS].instrlen) |
+                       COND(s[HS].v && s[HS].v->has_ssbo, A5XX_HLSQ_HS_CNTL_SSBO_ENABLE));
+       OUT_RING(ring, A5XX_HLSQ_DS_CNTL_INSTRLEN(s[DS].instrlen) |
+                       COND(s[DS].v && s[DS].v->has_ssbo, A5XX_HLSQ_DS_CNTL_SSBO_ENABLE));
+       OUT_RING(ring, A5XX_HLSQ_GS_CNTL_INSTRLEN(s[GS].instrlen) |
+                       COND(s[GS].v && s[GS].v->has_ssbo, A5XX_HLSQ_GS_CNTL_SSBO_ENABLE));
 
        OUT_PKT4(ring, REG_A5XX_SP_VS_CONFIG, 5);
        OUT_RING(ring, A5XX_SP_VS_CONFIG_CONSTOBJECTOFFSET(s[VS].constoff) |
index 93b434b0bac564f58fe9caef0ee0bdebd4f1d6d8..15293b1b3be3cf2f22abaa1451edc3849e4c7632 100644
@@ -250,6 +250,11 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
                        return 120;
                return is_ir3(screen) ? 140 : 120;
 
+       case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
+               if (is_a5xx(screen))
+                       return 4;
+               return 0;
+
        /* Unsupported features. */
        case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT:
        case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER:
@@ -282,7 +287,6 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
        case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
        case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
        case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
-       case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
        case PIPE_CAP_INVALIDATE_BUFFER:
        case PIPE_CAP_GENERATE_MIPMAP:
        case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS:
@@ -439,7 +443,7 @@ fd_screen_get_paramf(struct pipe_screen *pscreen, enum pipe_capf param)
 
 static int
 fd_screen_get_shader_param(struct pipe_screen *pscreen,
-                                                  enum pipe_shader_type shader,
+               enum pipe_shader_type shader,
                enum pipe_shader_cap param)
 {
        struct fd_screen *screen = fd_screen(pscreen);
@@ -518,6 +522,35 @@ fd_screen_get_shader_param(struct pipe_screen *pscreen,
        case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
                return 32;
        case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
+               if (is_a5xx(screen)) {
+                       /* a5xx (and a4xx for that matter) has one state-block
+                        * for compute-shader SSBOs and another that is shared
+                        * by VS/HS/DS/GS/FS, so to simplify things for now
+                        * just advertise SSBOs for FS and CS.  We could possibly
+                        * do what the blob does, and partition the space among
+                        * VS/HS/DS/GS/FS.  The blob advertises:
+                        *
+                        *   GL_MAX_VERTEX_SHADER_STORAGE_BLOCKS: 4
+                        *   GL_MAX_GEOMETRY_SHADER_STORAGE_BLOCKS: 4
+                        *   GL_MAX_TESS_CONTROL_SHADER_STORAGE_BLOCKS: 4
+                        *   GL_MAX_TESS_EVALUATION_SHADER_STORAGE_BLOCKS: 4
+                        *   GL_MAX_FRAGMENT_SHADER_STORAGE_BLOCKS: 4
+                        *   GL_MAX_COMPUTE_SHADER_STORAGE_BLOCKS: 24
+                        *   GL_MAX_COMBINED_SHADER_STORAGE_BLOCKS: 24
+                        *
+                        * With a static partitioning like that we could avoid
+                        * having to patch actual SSBO indexes into shaders.
+                        */
+                       switch (shader) {
+                       case PIPE_SHADER_FRAGMENT:
+                       case PIPE_SHADER_COMPUTE:
+                               return 24;
+                       default:
+                               return 0;
+                       }
+               }
+               return 0;
        case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
        case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD:
        case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS: