freedreno/a6xx: move const emit to state group
authorRob Clark <robdclark@gmail.com>
Sun, 7 Oct 2018 17:59:27 +0000 (13:59 -0400)
committerRob Clark <robdclark@gmail.com>
Wed, 17 Oct 2018 16:44:48 +0000 (12:44 -0400)
Eventually we want to move nearly everything, but no other state depends
on const state, so this is the easiest one to move first.

For webgl aquarium, this reduces GPU load by about 10%, since for each
fish it does a uniform upload plus draw.. fish frequently are visible in
only a single tile, so this skips the uniform uploads for other tiles.

The additional step of avoiding WFI's when using CP_SET_DRAW_STATE seems
to be worth an additional 10% gain for aquarium.

Signed-off-by: Rob Clark <robdclark@gmail.com>
src/gallium/drivers/freedreno/a6xx/fd6_emit.c
src/gallium/drivers/freedreno/a6xx/fd6_emit.h
src/gallium/drivers/freedreno/a6xx/fd6_gmem.c
src/gallium/drivers/freedreno/ir3/ir3_shader.c

index fc4a53f865186da9c72d89481b6b977e75a332de..93f6a267fa98595e2c463e5fb7446532a802b008 100644 (file)
@@ -359,7 +359,8 @@ emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring,
 
        if (tex->num_samplers > 0) {
                struct fd_ringbuffer *state =
-                       fd_ringbuffer_new_object(ctx->pipe, tex->num_samplers * 4);
+                       fd_ringbuffer_new_flags(ctx->pipe, tex->num_samplers * 4 * 4,
+                                       FD_RINGBUFFER_OBJECT | FD_RINGBUFFER_STREAMING);
                for (unsigned i = 0; i < tex->num_samplers; i++) {
                        static const struct fd6_sampler_stateobj dummy_sampler = {};
                        const struct fd6_sampler_stateobj *sampler = tex->samplers[i] ?
@@ -389,7 +390,8 @@ emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring,
 
        if (tex->num_textures > 0) {
                struct fd_ringbuffer *state =
-                       fd_ringbuffer_new_object(ctx->pipe, tex->num_textures * 16);
+                       fd_ringbuffer_new_flags(ctx->pipe, tex->num_textures * 16 * 4,
+                                       FD_RINGBUFFER_OBJECT | FD_RINGBUFFER_STREAMING);
                for (unsigned i = 0; i < tex->num_textures; i++) {
                        static const struct fd6_pipe_sampler_view dummy_view = {};
                        const struct fd6_pipe_sampler_view *view = tex->textures[i] ?
@@ -791,9 +793,29 @@ fd6_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
                OUT_RING(ring, A6XX_SP_FS_OUTPUT_CNTL1_MRT(nr));
        }
 
-       ir3_emit_vs_consts(vp, ring, ctx, emit->info);
-       if (!emit->key.binning_pass)
-               ir3_emit_fs_consts(fp, ring, ctx);
+#define DIRTY_CONST (FD_DIRTY_SHADER_PROG | FD_DIRTY_SHADER_CONST | \
+                                        FD_DIRTY_SHADER_SSBO | FD_DIRTY_SHADER_IMAGE)
+
+       if (ctx->dirty_shader[PIPE_SHADER_VERTEX] & DIRTY_CONST) {
+               struct fd_ringbuffer *vsconstobj =
+                       fd_ringbuffer_new_flags(ctx->pipe, 0x1000,
+                                       FD_RINGBUFFER_OBJECT | FD_RINGBUFFER_STREAMING);
+
+               ir3_emit_vs_consts(vp, vsconstobj, ctx, emit->info);
+               fd6_emit_add_group(emit, vsconstobj, FD6_GROUP_VS_CONST, 0x7);
+               fd_ringbuffer_del(vsconstobj);
+       }
+
+       if ((ctx->dirty_shader[PIPE_SHADER_FRAGMENT] & DIRTY_CONST) &&
+                       !emit->key.binning_pass) {
+               struct fd_ringbuffer *fsconstobj =
+                       fd_ringbuffer_new_flags(ctx->pipe, 0x1000,
+                                       FD_RINGBUFFER_OBJECT | FD_RINGBUFFER_STREAMING);
+
+               ir3_emit_fs_consts(fp, fsconstobj, ctx);
+               fd6_emit_add_group(emit, fsconstobj, FD6_GROUP_FS_CONST, 0x7);
+               fd_ringbuffer_del(fsconstobj);
+       }
 
        struct pipe_stream_output_info *info = &vp->shader->stream_output;
        if (info->num_outputs) {
index a2117a1b2442aca97c0d30d0e159c8d1b98a9327..4e27597a70b91ef9466396f693e8d0d4bd680230 100644 (file)
@@ -43,7 +43,8 @@ struct fd_ringbuffer;
  * need to be emit'd.
  */
 enum fd6_state_id {
-       FD6_GROUP_CONST,
+       FD6_GROUP_VS_CONST,
+       FD6_GROUP_FS_CONST,
 };
 
 struct fd6_state_group {
@@ -116,7 +117,7 @@ fd6_emit_add_group(struct fd6_emit *emit, struct fd_ringbuffer *stateobj,
        if (fd_ringbuffer_size(stateobj) == 0)
                return;
        struct fd6_state_group *g = &emit->groups[emit->num_groups++];
-       g->stateobj = stateobj;
+       g->stateobj = fd_ringbuffer_ref(stateobj);
        g->group_id = group_id;
        g->enable_mask = enable_mask;
 }
index 0c96250f974abd973893d14d749481fe8beba766..11673992959e56d4c34d349b3b89b0eedeb29be3 100644 (file)
@@ -751,6 +751,13 @@ fd6_emit_tile_gmem2mem(struct fd_batch *batch, struct fd_tile *tile)
                OUT_RING(ring, A2XX_CP_SET_MARKER_0_MODE(0x5) | 0x10);
        }
 
+       OUT_PKT7(ring, CP_SET_DRAW_STATE, 3);
+       OUT_RING(ring, CP_SET_DRAW_STATE__0_COUNT(0) |
+                       CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS |
+                       CP_SET_DRAW_STATE__0_GROUP_ID(0));
+       OUT_RING(ring, CP_SET_DRAW_STATE__1_ADDR_LO(0));
+       OUT_RING(ring, CP_SET_DRAW_STATE__2_ADDR_HI(0));
+
        OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
        OUT_RING(ring, 0x0);
 
index 5532a7f346720f076ce02835ca728dd389a4ea44..ee063f84d73088fd7649db4dc1f051aa1f3267f3 100644 (file)
@@ -552,6 +552,18 @@ ir3_shader_outputs(const struct ir3_shader *so)
 
 #include "freedreno_resource.h"
 
+static inline void
+ring_wfi(struct fd_batch *batch, struct fd_ringbuffer *ring)
+{
+       /* when we emit const state via ring (IB2) we need a WFI, but when
+        * it is emit'd via stateobj, we don't
+        */
+       if (ring->flags & FD_RINGBUFFER_OBJECT)
+               return;
+
+       fd_wfi(batch, ring);
+}
+
 static void
 emit_user_consts(struct fd_context *ctx, const struct ir3_shader_variant *v,
                struct fd_ringbuffer *ring, struct fd_constbuf_stateobj *constbuf)
@@ -579,7 +591,7 @@ emit_user_consts(struct fd_context *ctx, const struct ir3_shader_variant *v,
                size = MIN2(size, 4 * max_const);
 
                if (size > 0) {
-                       fd_wfi(ctx->batch, ring);
+                       ring_wfi(ctx->batch, ring);
                        ctx->emit_const(ring, v->type, 0,
                                        cb->buffer_offset, size,
                                        cb->user_buffer, cb->buffer);
@@ -611,7 +623,7 @@ emit_ubos(struct fd_context *ctx, const struct ir3_shader_variant *v,
                        }
                }
 
-               fd_wfi(ctx->batch, ring);
+               ring_wfi(ctx->batch, ring);
                ctx->emit_const_bo(ring, v->type, false, offset * 4, params, prscs, offsets);
        }
 }
@@ -631,7 +643,7 @@ emit_ssbo_sizes(struct fd_context *ctx, const struct ir3_shader_variant *v,
                        sizes[off] = sb->sb[index].buffer_size;
                }
 
-               fd_wfi(ctx->batch, ring);
+               ring_wfi(ctx->batch, ring);
                ctx->emit_const(ring, v->type, offset * 4,
                        0, ARRAY_SIZE(sizes), sizes, NULL);
        }
@@ -673,7 +685,7 @@ emit_image_dims(struct fd_context *ctx, const struct ir3_shader_variant *v,
                        }
                }
 
-               fd_wfi(ctx->batch, ring);
+               ring_wfi(ctx->batch, ring);
                ctx->emit_const(ring, v->type, offset * 4,
                        0, ARRAY_SIZE(dims), dims, NULL);
        }
@@ -696,7 +708,7 @@ emit_immediates(struct fd_context *ctx, const struct ir3_shader_variant *v,
        size *= 4;
 
        if (size > 0) {
-               fd_wfi(ctx->batch, ring);
+               ring_wfi(ctx->batch, ring);
                ctx->emit_const(ring, v->type, base,
                        0, size, v->immediates[0].val, NULL);
        }
@@ -729,7 +741,7 @@ emit_tfbos(struct fd_context *ctx, const struct ir3_shader_variant *v,
                        }
                }
 
-               fd_wfi(ctx->batch, ring);
+               ring_wfi(ctx->batch, ring);
                ctx->emit_const_bo(ring, v->type, true, offset * 4, params, prscs, offsets);
        }
 }
@@ -787,6 +799,19 @@ emit_common_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *rin
 {
        enum fd_dirty_shader_state dirty = ctx->dirty_shader[t];
 
+       /* When we use CP_SET_DRAW_STATE objects to emit constant state,
+        * if we emit any of it we need to emit all.  This is because
+        * we are using the same state-group-id each time for uniform
+        * state, and if previous update is never evaluated (due to no
+        * visible primitives in the current tile) then the new stateobj
+        * completely replaces the old one.
+        *
+        * Possibly if we split up different parts of the const state to
+        * different state-objects we could avoid this.
+        */
+       if (dirty && (ring->flags & FD_RINGBUFFER_OBJECT))
+               dirty = ~0;
+
        if (dirty & (FD_DIRTY_SHADER_PROG | FD_DIRTY_SHADER_CONST)) {
                struct fd_constbuf_stateobj *constbuf;
                bool shader_dirty;
@@ -846,7 +871,7 @@ ir3_emit_vs_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *rin
                                vertex_params_size = ARRAY_SIZE(vertex_params);
                        }
 
-                       fd_wfi(ctx->batch, ring);
+                       ring_wfi(ctx->batch, ring);
 
                        bool needs_vtxid_base =
                                ir3_find_sysval_regid(v, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) != regid(63, 0);
@@ -918,7 +943,7 @@ ir3_emit_cs_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *rin
        /* emit compute-shader driver-params: */
        uint32_t offset = v->constbase.driver_param;
        if (v->constlen > offset) {
-               fd_wfi(ctx->batch, ring);
+               ring_wfi(ctx->batch, ring);
 
                if (info->indirect) {
                        struct pipe_resource *indirect = NULL;