From: Rob Clark <robdclark@gmail.com>
Date: Sun, 7 Oct 2018 17:59:27 +0000 (-0400)
Subject: freedreno/a6xx: move const emit to state group
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=abcdf5627a29b7f1856b86bce4ff9bd0029a3099;p=mesa.git

freedreno/a6xx: move const emit to state group

Eventually we want to move nearly everything, but no other state depends
on const state, so this is the easiest one to move first.

For webgl aquarium, this reduces GPU load by about 10%, since for each
fish it does a uniform upload plus draw.  Fish are frequently visible in
only a single tile, so this skips the uniform uploads for other tiles.

The additional step of avoiding WFI's when using CP_SET_DRAW_STATE seems
to be worth an additional 10% gain for aquarium.

Signed-off-by: Rob Clark <robdclark@gmail.com>
---

diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_emit.c b/src/gallium/drivers/freedreno/a6xx/fd6_emit.c
index fc4a53f8651..93f6a267fa9 100644
--- a/src/gallium/drivers/freedreno/a6xx/fd6_emit.c
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_emit.c
@@ -359,7 +359,8 @@ emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring,
 
 	if (tex->num_samplers > 0) {
 		struct fd_ringbuffer *state =
-			fd_ringbuffer_new_object(ctx->pipe, tex->num_samplers * 4);
+			fd_ringbuffer_new_flags(ctx->pipe, tex->num_samplers * 4 * 4,
+					FD_RINGBUFFER_OBJECT | FD_RINGBUFFER_STREAMING);
 		for (unsigned i = 0; i < tex->num_samplers; i++) {
 			static const struct fd6_sampler_stateobj dummy_sampler = {};
 			const struct fd6_sampler_stateobj *sampler = tex->samplers[i] ?
@@ -389,7 +390,8 @@ emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring,
 
 	if (tex->num_textures > 0) {
 		struct fd_ringbuffer *state =
-			fd_ringbuffer_new_object(ctx->pipe, tex->num_textures * 16);
+			fd_ringbuffer_new_flags(ctx->pipe, tex->num_textures * 16 * 4,
+					FD_RINGBUFFER_OBJECT | FD_RINGBUFFER_STREAMING);
 		for (unsigned i = 0; i < tex->num_textures; i++) {
 			static const struct fd6_pipe_sampler_view dummy_view = {};
 			const struct fd6_pipe_sampler_view *view = tex->textures[i] ?
@@ -791,9 +793,29 @@ fd6_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
 		OUT_RING(ring, A6XX_SP_FS_OUTPUT_CNTL1_MRT(nr));
 	}
 
-	ir3_emit_vs_consts(vp, ring, ctx, emit->info);
-	if (!emit->key.binning_pass)
-		ir3_emit_fs_consts(fp, ring, ctx);
+#define DIRTY_CONST (FD_DIRTY_SHADER_PROG | FD_DIRTY_SHADER_CONST | \
+					 FD_DIRTY_SHADER_SSBO | FD_DIRTY_SHADER_IMAGE)
+
+	if (ctx->dirty_shader[PIPE_SHADER_VERTEX] & DIRTY_CONST) {
+		struct fd_ringbuffer *vsconstobj =
+			fd_ringbuffer_new_flags(ctx->pipe, 0x1000,
+					FD_RINGBUFFER_OBJECT | FD_RINGBUFFER_STREAMING);
+
+		ir3_emit_vs_consts(vp, vsconstobj, ctx, emit->info);
+		fd6_emit_add_group(emit, vsconstobj, FD6_GROUP_VS_CONST, 0x7);
+		fd_ringbuffer_del(vsconstobj);
+	}
+
+	if ((ctx->dirty_shader[PIPE_SHADER_FRAGMENT] & DIRTY_CONST) &&
+			!emit->key.binning_pass) {
+		struct fd_ringbuffer *fsconstobj =
+			fd_ringbuffer_new_flags(ctx->pipe, 0x1000,
+					FD_RINGBUFFER_OBJECT | FD_RINGBUFFER_STREAMING);
+
+		ir3_emit_fs_consts(fp, fsconstobj, ctx);
+		fd6_emit_add_group(emit, fsconstobj, FD6_GROUP_FS_CONST, 0x7);
+		fd_ringbuffer_del(fsconstobj);
+	}
 
 	struct pipe_stream_output_info *info = &vp->shader->stream_output;
 	if (info->num_outputs) {
diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_emit.h b/src/gallium/drivers/freedreno/a6xx/fd6_emit.h
index a2117a1b244..4e27597a70b 100644
--- a/src/gallium/drivers/freedreno/a6xx/fd6_emit.h
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_emit.h
@@ -43,7 +43,8 @@ struct fd_ringbuffer;
  * need to be emit'd.
  */
 enum fd6_state_id {
-	FD6_GROUP_CONST,
+	FD6_GROUP_VS_CONST,
+	FD6_GROUP_FS_CONST,
 };
 
 struct fd6_state_group {
@@ -116,7 +117,7 @@ fd6_emit_add_group(struct fd6_emit *emit, struct fd_ringbuffer *stateobj,
 	if (fd_ringbuffer_size(stateobj) == 0)
 		return;
 	struct fd6_state_group *g = &emit->groups[emit->num_groups++];
-	g->stateobj = stateobj;
+	g->stateobj = fd_ringbuffer_ref(stateobj);
 	g->group_id = group_id;
 	g->enable_mask = enable_mask;
 }
diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_gmem.c b/src/gallium/drivers/freedreno/a6xx/fd6_gmem.c
index 0c96250f974..11673992959 100644
--- a/src/gallium/drivers/freedreno/a6xx/fd6_gmem.c
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_gmem.c
@@ -751,6 +751,13 @@ fd6_emit_tile_gmem2mem(struct fd_batch *batch, struct fd_tile *tile)
 		OUT_RING(ring, A2XX_CP_SET_MARKER_0_MODE(0x5) | 0x10);
 	}
 
+	OUT_PKT7(ring, CP_SET_DRAW_STATE, 3);
+	OUT_RING(ring, CP_SET_DRAW_STATE__0_COUNT(0) |
+			CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS |
+			CP_SET_DRAW_STATE__0_GROUP_ID(0));
+	OUT_RING(ring, CP_SET_DRAW_STATE__1_ADDR_LO(0));
+	OUT_RING(ring, CP_SET_DRAW_STATE__2_ADDR_HI(0));
+
 	OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
 	OUT_RING(ring, 0x0);
 
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.c b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
index 5532a7f3467..ee063f84d73 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
@@ -552,6 +552,18 @@ ir3_shader_outputs(const struct ir3_shader *so)
 
 #include "freedreno_resource.h"
 
+static inline void
+ring_wfi(struct fd_batch *batch, struct fd_ringbuffer *ring)
+{
+	/* when we emit const state via ring (IB2) we need a WFI, but when
+	 * it is emit'd via stateobj, we don't
+	 */
+	if (ring->flags & FD_RINGBUFFER_OBJECT)
+		return;
+
+	fd_wfi(batch, ring);
+}
+
 static void
 emit_user_consts(struct fd_context *ctx, const struct ir3_shader_variant *v,
 		struct fd_ringbuffer *ring, struct fd_constbuf_stateobj *constbuf)
@@ -579,7 +591,7 @@ emit_user_consts(struct fd_context *ctx, const struct ir3_shader_variant *v,
 		size = MIN2(size, 4 * max_const);
 
 		if (size > 0) {
-			fd_wfi(ctx->batch, ring);
+			ring_wfi(ctx->batch, ring);
 			ctx->emit_const(ring, v->type, 0,
 					cb->buffer_offset, size,
 					cb->user_buffer, cb->buffer);
@@ -611,7 +623,7 @@ emit_ubos(struct fd_context *ctx, const struct ir3_shader_variant *v,
 			}
 		}
 
-		fd_wfi(ctx->batch, ring);
+		ring_wfi(ctx->batch, ring);
 		ctx->emit_const_bo(ring, v->type, false, offset * 4, params, prscs, offsets);
 	}
 }
@@ -631,7 +643,7 @@ emit_ssbo_sizes(struct fd_context *ctx, const struct ir3_shader_variant *v,
 			sizes[off] = sb->sb[index].buffer_size;
 		}
 
-		fd_wfi(ctx->batch, ring);
+		ring_wfi(ctx->batch, ring);
 		ctx->emit_const(ring, v->type, offset * 4,
 			0, ARRAY_SIZE(sizes), sizes, NULL);
 	}
@@ -673,7 +685,7 @@ emit_image_dims(struct fd_context *ctx, const struct ir3_shader_variant *v,
 			}
 		}
 
-		fd_wfi(ctx->batch, ring);
+		ring_wfi(ctx->batch, ring);
 		ctx->emit_const(ring, v->type, offset * 4,
 			0, ARRAY_SIZE(dims), dims, NULL);
 	}
@@ -696,7 +708,7 @@ emit_immediates(struct fd_context *ctx, const struct ir3_shader_variant *v,
 	size *= 4;
 
 	if (size > 0) {
-		fd_wfi(ctx->batch, ring);
+		ring_wfi(ctx->batch, ring);
 		ctx->emit_const(ring, v->type, base,
 			0, size, v->immediates[0].val, NULL);
 	}
@@ -729,7 +741,7 @@ emit_tfbos(struct fd_context *ctx, const struct ir3_shader_variant *v,
 			}
 		}
 
-		fd_wfi(ctx->batch, ring);
+		ring_wfi(ctx->batch, ring);
 		ctx->emit_const_bo(ring, v->type, true, offset * 4, params, prscs, offsets);
 	}
 }
@@ -787,6 +799,19 @@ emit_common_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *rin
 {
 	enum fd_dirty_shader_state dirty = ctx->dirty_shader[t];
 
+	/* When we use CP_SET_DRAW_STATE objects to emit constant state,
+	 * if we emit any of it we need to emit all.  This is because
+	 * we are using the same state-group-id each time for uniform
+	 * state, and if previous update is never evaluated (due to no
+	 * visible primitives in the current tile) then the new stateobj
+	 * completely replaces the old one.
+	 *
+	 * Possibly if we split up different parts of the const state to
+	 * different state-objects we could avoid this.
+	 */
+	if (dirty && (ring->flags & FD_RINGBUFFER_OBJECT))
+		dirty = ~0;
+
 	if (dirty & (FD_DIRTY_SHADER_PROG | FD_DIRTY_SHADER_CONST)) {
 		struct fd_constbuf_stateobj *constbuf;
 		bool shader_dirty;
@@ -846,7 +871,7 @@ ir3_emit_vs_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *rin
 				vertex_params_size = ARRAY_SIZE(vertex_params);
 			}
 
-			fd_wfi(ctx->batch, ring);
+			ring_wfi(ctx->batch, ring);
 
 			bool needs_vtxid_base =
 				ir3_find_sysval_regid(v, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) != regid(63, 0);
@@ -918,7 +943,7 @@ ir3_emit_cs_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *rin
 	/* emit compute-shader driver-params: */
 	uint32_t offset = v->constbase.driver_param;
 	if (v->constlen > offset) {
-		fd_wfi(ctx->batch, ring);
+		ring_wfi(ctx->batch, ring);
 
 		if (info->indirect) {
 			struct pipe_resource *indirect = NULL;