From 893425a607a63a83e8a4c13fd963367c8d174678 Mon Sep 17 00:00:00 2001
From: "Kristian H. Kristensen" <hoegsberg@chromium.org>
Date: Tue, 26 Mar 2019 10:31:54 -0700
Subject: [PATCH] freedreno/ir3: Push UBOs to constant file

We have a rather big constant file and it seems that the best way to
use it is to upload all UBOs and lower UBO access to load_uniform.

Signed-off-by: Kristian H. Kristensen <hoegsberg@chromium.org>
Reviewed-by: Rob Clark <robdclark@gmail.com>
---
 src/freedreno/ir3/ir3_context.c               |   2 +-
 .../ir3/ir3_nir_analyze_ubo_ranges.c          | 111 ++++++++++++++++--
 src/freedreno/ir3/ir3_shader.h                |  17 +++
 src/gallium/drivers/freedreno/a6xx/fd6_emit.c |  15 ++-
 .../drivers/freedreno/ir3/ir3_gallium.c       |  16 +++
 5 files changed, 145 insertions(+), 16 deletions(-)

diff --git a/src/freedreno/ir3/ir3_context.c b/src/freedreno/ir3/ir3_context.c
index 7c35b9ba65f..d6267165ec7 100644
--- a/src/freedreno/ir3/ir3_context.c
+++ b/src/freedreno/ir3/ir3_context.c
@@ -124,7 +124,7 @@ ir3_context_init(struct ir3_compiler *compiler,
 	 * Immediates go last mostly because they are inserted in the CP pass
 	 * after the nir -> ir3 frontend.
 	 */
-	unsigned constoff = align(ctx->s->num_uniforms, 4);
+	unsigned constoff = align(ctx->so->shader->ubo_state.size / 16, 4);
 	unsigned ptrsz = ir3_pointer_size(ctx);
 
 	memset(&so->constbase, ~0, sizeof(so->constbase));
diff --git a/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c b/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c
index 35b921990a2..aaa2a8684a2 100644
--- a/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c
+++ b/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c
@@ -27,9 +27,38 @@
 #include "util/u_dynarray.h"
 #include "mesa/main/macros.h"
 
-struct ir3_ubo_analysis_state {
-	unsigned lower_count;
-};
+static inline struct ir3_ubo_range
+get_ubo_load_range(nir_intrinsic_instr *instr)
+{
+	struct ir3_ubo_range r;
+
+	const int bytes = nir_intrinsic_dest_components(instr) *
+		(nir_dest_bit_size(instr->dest) / 8);
+
+	r.start = ROUND_DOWN_TO(nir_src_as_uint(instr->src[1]), 16 * 4);
+	r.end = ALIGN(r.start + bytes, 16 * 4);
+
+	return r;
+}
+
+static void
+gather_ubo_ranges(nir_intrinsic_instr *instr,
+				  struct ir3_ubo_analysis_state *state)
+{
+	if (!nir_src_is_const(instr->src[0]))
+		return;
+
+	if (!nir_src_is_const(instr->src[1]))
+		return;
+
+	const struct ir3_ubo_range r = get_ubo_load_range(instr);
+	const uint32_t block = nir_src_as_uint(instr->src[0]);
+
+	if (r.start < state->range[block].start)
+		state->range[block].start = r.start;
+	if (state->range[block].end < r.end)
+		state->range[block].end = r.end;
+}
 
 static void
 lower_ubo_load_to_uniform(nir_intrinsic_instr *instr, nir_builder *b,
@@ -43,15 +72,37 @@ lower_ubo_load_to_uniform(nir_intrinsic_instr *instr, nir_builder *b,
 		return;
 
 	const uint32_t block = nir_src_as_uint(instr->src[0]);
-	if (block > 0)
-		return;
+
+	if (block > 0) {
+		/* We don't lower dynamic array indexing either, but we definitely should.
+		 * We don't have a good way of determining the range of the dynamic
+		 * access, so for now just fall back to pulling.
+		 */
+		if (!nir_src_is_const(instr->src[1]))
+			return;
+
+		/* After gathering the UBO access ranges, we limit the total
+		 * upload. Reject if we're now outside the range.
+		 */
+		const struct ir3_ubo_range r = get_ubo_load_range(instr);
+		if (!(state->range[block].start <= r.start &&
+			  r.end <= state->range[block].end))
+			return;
+	}
 
 	b->cursor = nir_before_instr(&instr->instr);
 
 	nir_ssa_def *ubo_offset = nir_ssa_for_src(b, instr->src[1], 1);
-	nir_ssa_def *uniform_offset = ir3_nir_try_propagate_bit_shift(b, ubo_offset, -2);
-	if (uniform_offset == NULL)
-		uniform_offset = nir_ushr(b, ubo_offset, nir_imm_int(b, 2));
+	nir_ssa_def *new_offset = ir3_nir_try_propagate_bit_shift(b, ubo_offset, -2);
+	if (new_offset)
+		ubo_offset = new_offset;
+	else
+		ubo_offset = nir_ushr(b, ubo_offset, nir_imm_int(b, 2));
+
+	const int range_offset =
+		(state->range[block].offset - state->range[block].start) / 4;
+	nir_ssa_def *uniform_offset =
+		nir_iadd(b, ubo_offset, nir_imm_int(b, range_offset));
 
 	nir_intrinsic_instr *uniform =
 		nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_uniform);
@@ -72,7 +123,45 @@ lower_ubo_load_to_uniform(nir_intrinsic_instr *instr, nir_builder *b,
 bool
 ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader *shader)
 {
-	struct ir3_ubo_analysis_state state = { 0 };
+	struct ir3_ubo_analysis_state *state = &shader->ubo_state;
+
+	memset(state, 0, sizeof(*state));
+	state->range[0].end = nir->num_uniforms * 16;
+
+	nir_foreach_function(function, nir) {
+		if (function->impl) {
+			nir_foreach_block(block, function->impl) {
+				nir_foreach_instr(instr, block) {
+					if (instr->type == nir_instr_type_intrinsic &&
+						nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_ubo)
+						gather_ubo_ranges(nir_instr_as_intrinsic(instr), state);
+				}
+			}
+		}
+	}
+
+	/* For now, everything we upload is accessed statically and thus will be
+	 * used by the shader. Once we can upload dynamically indexed data, we may
+	 * upload sparsely accessed arrays, at which point we probably want to
+	 * give priority to smaller UBOs, on the assumption that big UBOs will be
+	 * accessed dynamically.  Alternatively, we can track statically and
+	 * dynamically accessed ranges separately and upload static ranges
+	 * first.
+	 */
+	const uint32_t max_upload = 16 * 1024;
+	uint32_t offset = 0;
+	for (uint32_t i = 0; i < ARRAY_SIZE(state->range); i++) {
+		uint32_t range_size = state->range[i].end - state->range[i].start;
+
+		debug_assert(offset <= max_upload);
+		state->range[i].offset = offset;
+		if (offset + range_size > max_upload) {
+			range_size = max_upload - offset;
+			state->range[i].end = state->range[i].start + range_size;
+		}
+		offset += range_size;
+	}
+	state->size = offset;
 
 	nir_foreach_function(function, nir) {
 		if (function->impl) {
@@ -82,7 +171,7 @@ ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader *shader)
 				nir_foreach_instr_safe(instr, block) {
 					if (instr->type == nir_instr_type_intrinsic &&
 						nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_ubo)
-						lower_ubo_load_to_uniform(nir_instr_as_intrinsic(instr), &builder, &state);
+						lower_ubo_load_to_uniform(nir_instr_as_intrinsic(instr), &builder, state);
 				}
 			}
 
@@ -91,5 +180,5 @@ ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader *shader)
 		}
 	}
 
-	return state.lower_count > 0;
+	return state->lower_count > 0;
 }
diff --git a/src/freedreno/ir3/ir3_shader.h b/src/freedreno/ir3/ir3_shader.h
index 647651c03b0..58d14197879 100644
--- a/src/freedreno/ir3/ir3_shader.h
+++ b/src/freedreno/ir3/ir3_shader.h
@@ -67,6 +67,8 @@ enum ir3_driver_param {
 #define IR3_MAX_SHADER_IMAGES    32
 #define IR3_MAX_SO_BUFFERS        4
 #define IR3_MAX_SO_OUTPUTS       64
+#define IR3_MAX_CONSTANT_BUFFERS 32
+
 
 /**
  * For consts needed to pass internal values to shader which may or may not
@@ -474,6 +476,19 @@ struct ir3_shader_variant {
 	struct ir3_shader *shader;
 };
 
+struct ir3_ubo_range {
+	uint32_t offset; /* start offset of this block in const register file */
+	uint32_t start, end; /* range of block that's actually used */
+};
+
+struct ir3_ubo_analysis_state
+{
+	struct ir3_ubo_range range[IR3_MAX_CONSTANT_BUFFERS];
+	uint32_t size;
+	uint32_t lower_count;
+};
+
+
 struct ir3_shader {
 	gl_shader_stage type;
 
@@ -486,6 +501,8 @@ struct ir3_shader {
 
 	struct ir3_compiler *compiler;
 
+	struct ir3_ubo_analysis_state ubo_state;
+
 	struct nir_shader *nir;
 	struct ir3_stream_output_info stream_output;
 
diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_emit.c b/src/gallium/drivers/freedreno/a6xx/fd6_emit.c
index b48a0d68fde..75c8c91d897 100644
--- a/src/gallium/drivers/freedreno/a6xx/fd6_emit.c
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_emit.c
@@ -72,11 +72,10 @@ fd6_emit_const(struct fd_ringbuffer *ring, gl_shader_stage type,
 		uint32_t regid, uint32_t offset, uint32_t sizedwords,
 		const uint32_t *dwords, struct pipe_resource *prsc)
 {
-	uint32_t i, sz;
+	uint32_t i, sz, align_sz;
 	enum a6xx_state_src src;
 
 	debug_assert((regid % 4) == 0);
-	debug_assert((sizedwords % 4) == 0);
 
 	if (prsc) {
 		sz = 0;
@@ -86,12 +85,14 @@ fd6_emit_const(struct fd_ringbuffer *ring, gl_shader_stage type,
 		src = SS6_DIRECT;
 	}
 
-	OUT_PKT7(ring, shader_t_to_opcode(type), 3 + sz);
+	align_sz = align(sz, 4);
+
+	OUT_PKT7(ring, shader_t_to_opcode(type), 3 + align_sz);
 	OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(regid/4) |
 			CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
 			CP_LOAD_STATE6_0_STATE_SRC(src) |
 			CP_LOAD_STATE6_0_STATE_BLOCK(fd6_stage2shadersb(type)) |
-			CP_LOAD_STATE6_0_NUM_UNIT(sizedwords/4));
+			CP_LOAD_STATE6_0_NUM_UNIT(DIV_ROUND_UP(sizedwords, 4)));
 	if (prsc) {
 		struct fd_bo *bo = fd_resource(prsc)->bo;
 		OUT_RELOC(ring, bo, offset, 0, 0);
@@ -100,9 +101,15 @@ fd6_emit_const(struct fd_ringbuffer *ring, gl_shader_stage type,
 		OUT_RING(ring, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
 		dwords = (uint32_t *)&((uint8_t *)dwords)[offset];
 	}
+
 	for (i = 0; i < sz; i++) {
 		OUT_RING(ring, dwords[i]);
 	}
+
+	/* Zero-pad to multiple of 4 dwords */
+	for (i = sz; i < align_sz; i++) {
+		OUT_RING(ring, 0);
+	}
 }
 
 static void
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_gallium.c b/src/gallium/drivers/freedreno/ir3/ir3_gallium.c
index 4481c544217..2d9516ade5c 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_gallium.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_gallium.c
@@ -254,6 +254,22 @@ emit_user_consts(struct fd_context *ctx, const struct ir3_shader_variant *v,
 					cb->user_buffer, cb->buffer);
 		}
 	}
+
+	struct ir3_ubo_analysis_state *state;
+	state = &v->shader->ubo_state;
+
+	for (uint32_t i = 1; i < ARRAY_SIZE(state->range); i++) {
+		struct pipe_constant_buffer *cb = &constbuf->cb[i];
+
+		if (state->range[i].start < state->range[i].end &&
+			constbuf->enabled_mask & (1 << i)) {
+
+			ctx->emit_const(ring, v->type, state->range[i].offset / 4,
+							cb->buffer_offset + state->range[i].start,
+							(state->range[i].end - state->range[i].start) / 4,
+							cb->user_buffer, cb->buffer);
+		}
+	}
 }
 
 static void
-- 
2.30.2