From 9be624b3ef32ae6311010cf05531e12051b647dc Mon Sep 17 00:00:00 2001
From: Vadim Girlin <vadimgirlin@gmail.com>
Date: Tue, 2 Apr 2013 19:33:40 +0400
Subject: [PATCH] r600g: don't reserve more stack space than required v5

Reduced stack size allows to run more threads in some cases,
improving performance for the shaders that use stack (that is, for the
shaders with control flow instructions). E.g. with unigine-based apps.

v4: implement exact computation taking into account wavefront size
v5: add cases for RV620, RS880

Signed-off-by: Vadim Girlin <vadimgirlin@gmail.com>
---
 src/gallium/drivers/r600/r600_asm.c    |  43 +++++++-
 src/gallium/drivers/r600/r600_asm.h    |  24 +++--
 src/gallium/drivers/r600/r600_shader.c | 131 ++++++++++++++++---------
 3 files changed, 142 insertions(+), 56 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_asm.c b/src/gallium/drivers/r600/r600_asm.c
index 65c705d0aa8..c88b48dc96b 100644
--- a/src/gallium/drivers/r600/r600_asm.c
+++ b/src/gallium/drivers/r600/r600_asm.c
@@ -87,6 +87,40 @@ static struct r600_bytecode_tex *r600_bytecode_tex(void)
 	return tex;
 }
 
+static unsigned stack_entry_size(enum radeon_family chip) {
+	/* Wavefront size:
+	 *   64: R600/RV670/RV770/Cypress/R740/Barts/Turks/Caicos/
+	 *       Aruba/Sumo/Sumo2/redwood/juniper
+	 *   32: R630/R730/R710/Palm/Cedar
+	 *   16: R610/Rs780
+	 *
+	 * Stack row size:
+	 * 	Wavefront Size                        16  32  48  64
+	 * 	Columns per Row (R6xx/R7xx/R8xx only)  8   8   4   4
+	 * 	Columns per Row (R9xx+)                8   4   4   4 */
+
+	switch (chip) {
+	/* FIXME: are some chips missing here? */
+	/* wavefront size 16 */
+	case CHIP_RV610:
+	case CHIP_RS780:
+	case CHIP_RV620:
+	case CHIP_RS880:
+	/* wavefront size 32 */
+	case CHIP_RV630:
+	case CHIP_RV635:
+	case CHIP_RV730:
+	case CHIP_RV710:
+	case CHIP_PALM:
+	case CHIP_CEDAR:
+		return 8;
+
+	/* wavefront size 64 */
+	default:
+		return 4;
+	}
+}
+
 void r600_bytecode_init(struct r600_bytecode *bc,
 			enum chip_class chip_class,
 			enum radeon_family family,
@@ -104,6 +138,7 @@ void r600_bytecode_init(struct r600_bytecode *bc,
 	LIST_INITHEAD(&bc->cf);
 	bc->chip_class = chip_class;
 	bc->msaa_texture_mode = msaa_texture_mode;
+	bc->stack.entry_size = stack_entry_size(family);
 }
 
 int r600_bytecode_add_cf(struct r600_bytecode *bc)
@@ -1522,8 +1557,8 @@ int r600_bytecode_build(struct r600_bytecode *bc)
 	unsigned addr;
 	int i, r;
 
-	if (bc->callstack[0].max > 0)
-		bc->nstack = ((bc->callstack[0].max + 3) >> 2) + 2;
+	bc->nstack = bc->stack.max_entries;
+
 	if (bc->type == TGSI_PROCESSOR_VERTEX && !bc->nstack) {
 		bc->nstack = 1;
 	}
@@ -1824,8 +1859,8 @@ void r600_bytecode_disasm(struct r600_bytecode *bc)
 		chip = '6';
 		break;
 	}
-	fprintf(stderr, "bytecode %d dw -- %d gprs ---------------------\n",
-	        bc->ndw, bc->ngpr);
+	fprintf(stderr, "bytecode %d dw -- %d gprs -- %d nstack -------------\n",
+	        bc->ndw, bc->ngpr, bc->nstack);
 	fprintf(stderr, "shader %d -- %c\n", index++, chip);
 
 	LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
diff --git a/src/gallium/drivers/r600/r600_asm.h b/src/gallium/drivers/r600/r600_asm.h
index c1aa3bae4e3..c052ceabfc7 100644
--- a/src/gallium/drivers/r600/r600_asm.h
+++ b/src/gallium/drivers/r600/r600_asm.h
@@ -173,16 +173,25 @@ struct r600_cf_stack_entry {
 };
 
 #define SQ_MAX_CALL_DEPTH 0x00000020
-struct r600_cf_callstack {
-	unsigned			fc_sp_before_entry;
-	int				sub_desc_index;
-	int				current;
-	int				max;
-};
 
 #define AR_HANDLE_NORMAL 0
 #define AR_HANDLE_RV6XX 1 /* except RV670 */
 
+struct r600_stack_info {
+	/* current level of non-WQM PUSH operations
+	 * (PUSH, PUSH_ELSE, ALU_PUSH_BEFORE) */
+	int push;
+	/* current level of WQM PUSH operations
+	 * (PUSH, PUSH_ELSE, PUSH_WQM) */
+	int push_wqm;
+	/* current loop level */
+	int loop;
+
+	/* required depth */
+	int max_entries;
+	/* subentries per entry */
+	int entry_size;
+};
 
 struct r600_bytecode {
 	enum chip_class			chip_class;
@@ -199,8 +208,7 @@ struct r600_bytecode {
 	uint32_t			*bytecode;
 	uint32_t			fc_sp;
 	struct r600_cf_stack_entry	fc_stack[32];
-	unsigned			call_sp;
-	struct r600_cf_callstack	callstack[SQ_MAX_CALL_DEPTH];
+	struct r600_stack_info		stack;
 	unsigned	ar_loaded;
 	unsigned	ar_reg;
 	unsigned	ar_chan;
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index e74ed54443d..82885d1370e 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -245,7 +245,7 @@ struct r600_shader_tgsi_instruction {
 
 static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], cm_shader_tgsi_instruction[];
 static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx);
-static inline void callstack_check_depth(struct r600_shader_ctx *ctx, unsigned reason, unsigned check_max_only);
+static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned reason);
 static void fc_pushlevel(struct r600_shader_ctx *ctx, int type);
 static int tgsi_else(struct r600_shader_ctx *ctx);
 static int tgsi_endif(struct r600_shader_ctx *ctx);
@@ -419,7 +419,7 @@ static void llvm_if(struct r600_shader_ctx *ctx)
 {
 	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
 	fc_pushlevel(ctx, FC_IF);
-	callstack_check_depth(ctx, FC_PUSH_VPM, 0);
+	callstack_push(ctx, FC_PUSH_VPM);
 }
 
 static void r600_break_from_byte_stream(struct r600_shader_ctx *ctx)
@@ -5551,63 +5551,107 @@ static int pops(struct r600_shader_ctx *ctx, int pops)
 	return 0;
 }
 
-static inline void callstack_decrease_current(struct r600_shader_ctx *ctx, unsigned reason)
+static inline void callstack_update_max_depth(struct r600_shader_ctx *ctx,
+                                              unsigned reason)
+{
+	struct r600_stack_info *stack = &ctx->bc->stack;
+	unsigned elements, entries;
+
+	unsigned entry_size = stack->entry_size;
+
+	elements = (stack->loop + stack->push_wqm ) * entry_size;
+	elements += stack->push;
+
+	switch (ctx->bc->chip_class) {
+	case R600:
+	case R700:
+		/* pre-r8xx: if any non-WQM PUSH instruction is invoked, 2 elements on
+		 * the stack must be reserved to hold the current active/continue
+		 * masks */
+		if (reason == FC_PUSH_VPM) {
+			elements += 2;
+		}
+		break;
+
+	case CAYMAN:
+		/* r9xx: any stack operation on empty stack consumes 2 additional
+		 * elements */
+		elements += 2;
+
+		/* fallthrough */
+		/* FIXME: do the two elements added above cover the cases for the
+		 * r8xx+ below? */
+
+	case EVERGREEN:
+		/* r8xx+: 2 extra elements are not always required, but one extra
+		 * element must be added for each of the following cases:
+		 * 1. There is an ALU_ELSE_AFTER instruction at the point of greatest
+		 *    stack usage.
+		 *    (Currently we don't use ALU_ELSE_AFTER.)
+		 * 2. There are LOOP/WQM frames on the stack when any flavor of non-WQM
+		 *    PUSH instruction executed.
+		 *
+		 *    NOTE: it seems we also need to reserve additional element in some
+		 *    other cases, e.g. when we have 4 levels of PUSH_VPM in the shader,
+		 *    then STACK_SIZE should be 2 instead of 1 */
+		if (reason == FC_PUSH_VPM) {
+			elements += 1;
+		}
+		break;
+
+	default:
+		assert(0);
+		break;
+	}
+
+	/* NOTE: it seems STACK_SIZE is interpreted by hw as if entry_size is 4
+	 * for all chips, so we use 4 in the final formula, not the real entry_size
+	 * for the chip */
+	entry_size = 4;
+
+	entries = (elements + (entry_size - 1)) / entry_size;
+
+	if (entries > stack->max_entries)
+		stack->max_entries = entries;
+}
+
+static inline void callstack_pop(struct r600_shader_ctx *ctx, unsigned reason)
 {
 	switch(reason) {
 	case FC_PUSH_VPM:
-		ctx->bc->callstack[ctx->bc->call_sp].current--;
+		--ctx->bc->stack.push;
+		assert(ctx->bc->stack.push >= 0);
 		break;
 	case FC_PUSH_WQM:
+		--ctx->bc->stack.push_wqm;
+		assert(ctx->bc->stack.push_wqm >= 0);
+		break;
 	case FC_LOOP:
-		ctx->bc->callstack[ctx->bc->call_sp].current -= 4;
+		--ctx->bc->stack.loop;
+		assert(ctx->bc->stack.loop >= 0);
 		break;
-	case FC_REP:
-		/* TOODO : for 16 vp asic should -= 2; */
-		ctx->bc->callstack[ctx->bc->call_sp].current --;
+	default:
+		assert(0);
 		break;
 	}
 }
 
-static inline void callstack_check_depth(struct r600_shader_ctx *ctx, unsigned reason, unsigned check_max_only)
+static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned reason)
 {
-	if (check_max_only) {
-		int diff;
-		switch (reason) {
-		case FC_PUSH_VPM:
-			diff = 1;
-			break;
-		case FC_PUSH_WQM:
-			diff = 4;
-			break;
-		default:
-			assert(0);
-			diff = 0;
-		}
-		if ((ctx->bc->callstack[ctx->bc->call_sp].current + diff) >
-		    ctx->bc->callstack[ctx->bc->call_sp].max) {
-			ctx->bc->callstack[ctx->bc->call_sp].max =
-				ctx->bc->callstack[ctx->bc->call_sp].current + diff;
-		}
-		return;
-	}
 	switch (reason) {
 	case FC_PUSH_VPM:
-		ctx->bc->callstack[ctx->bc->call_sp].current++;
+		++ctx->bc->stack.push;
 		break;
 	case FC_PUSH_WQM:
+		++ctx->bc->stack.push_wqm;
 	case FC_LOOP:
-		ctx->bc->callstack[ctx->bc->call_sp].current += 4;
-		break;
-	case FC_REP:
-		ctx->bc->callstack[ctx->bc->call_sp].current++;
+		++ctx->bc->stack.loop;
 		break;
+	default:
+		assert(0);
 	}
 
-	if ((ctx->bc->callstack[ctx->bc->call_sp].current) >
-	    ctx->bc->callstack[ctx->bc->call_sp].max) {
-		ctx->bc->callstack[ctx->bc->call_sp].max =
-			ctx->bc->callstack[ctx->bc->call_sp].current;
-	}
+	callstack_update_max_depth(ctx, reason);
 }
 
 static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp)
@@ -5694,7 +5738,7 @@ static int tgsi_if(struct r600_shader_ctx *ctx)
 
 	fc_pushlevel(ctx, FC_IF);
 
-	callstack_check_depth(ctx, FC_PUSH_VPM, 0);
+	callstack_push(ctx, FC_PUSH_VPM);
 	return 0;
 }
 
@@ -5724,7 +5768,7 @@ static int tgsi_endif(struct r600_shader_ctx *ctx)
 	}
 	fc_poplevel(ctx);
 
-	callstack_decrease_current(ctx, FC_PUSH_VPM);
+	callstack_pop(ctx, FC_PUSH_VPM);
 	return 0;
 }
 
@@ -5737,7 +5781,7 @@ static int tgsi_bgnloop(struct r600_shader_ctx *ctx)
 	fc_pushlevel(ctx, FC_LOOP);
 
 	/* check stack depth */
-	callstack_check_depth(ctx, FC_LOOP, 0);
+	callstack_push(ctx, FC_LOOP);
 	return 0;
 }
 
@@ -5766,7 +5810,7 @@ static int tgsi_endloop(struct r600_shader_ctx *ctx)
 	}
 	/* XXX add LOOPRET support */
 	fc_poplevel(ctx);
-	callstack_decrease_current(ctx, FC_LOOP);
+	callstack_pop(ctx, FC_LOOP);
 	return 0;
 }
 
@@ -5789,7 +5833,6 @@ static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx)
 
 	fc_set_mid(ctx, fscp);
 
-	callstack_check_depth(ctx, FC_PUSH_VPM, 1);
 	return 0;
 }
 
-- 
2.30.2