From af4a6eba55ad81e343db788a97f3eaa3cf184369 Mon Sep 17 00:00:00 2001
From: Luca Barbieri <luca@luca-barbieri.com>
Date: Fri, 20 Aug 2010 21:16:49 +0200
Subject: [PATCH] nv40: add fragment program control flow

---
 src/gallium/drivers/nvfx/nvfx_fragprog.c | 247 ++++++++++++++++++++++-
 src/gallium/drivers/nvfx/nvfx_shader.h   |   5 +
 2 files changed, 247 insertions(+), 5 deletions(-)

diff --git a/src/gallium/drivers/nvfx/nvfx_fragprog.c b/src/gallium/drivers/nvfx/nvfx_fragprog.c
index f8c4eae3873..91776371e46 100644
--- a/src/gallium/drivers/nvfx/nvfx_fragprog.c
+++ b/src/gallium/drivers/nvfx/nvfx_fragprog.c
@@ -15,6 +15,7 @@
 
 #define MAX_CONSTS 128
 #define MAX_IMM 32
+
 struct nvfx_fpc {
 	struct nvfx_fragment_program *fp;
 
@@ -38,6 +39,10 @@ struct nvfx_fpc {
 	unsigned nr_imm;
 
 	unsigned char generic_to_slot[256]; /* semantic idx for each input semantic */
+
+	struct util_dynarray if_stack;
+	//struct util_dynarray loop_stack;
+	struct util_dynarray label_relocs;
 };
 
 static INLINE struct nvfx_reg
@@ -233,6 +238,134 @@ nvfx_fp_emit(struct nvfx_fpc *fpc, struct nvfx_insn insn)
 	nvfx_insn((s), NVFX_FP_OP_OPCODE_##o, (u), \
                    (d), (m), (s0), none, none)
 
+/* IF src.x != 0, as TGSI specifies */
+static void
+nv40_fp_if(struct nvfx_fpc *fpc, struct nvfx_src src)
+{
+	const struct nvfx_src none = nvfx_src(nvfx_reg(NVFXSR_NONE, 0));
+	struct nvfx_insn insn = arith(0, MOV, none.reg, NVFX_FP_MASK_X, src, none, none);
+	insn.cc_update = 1;
+	nvfx_fp_emit(fpc, insn);
+
+	fpc->inst_offset = fpc->fp->insn_len;
+	grow_insns(fpc, 4);
+	uint32_t *hw = &fpc->fp->insn[fpc->inst_offset];
+	/* I really wonder why fp16 precision is used. Presumably the hardware ignores it? */
+	hw[0] = (NV40_FP_OP_BRA_OPCODE_IF << NVFX_FP_OP_OPCODE_SHIFT) |
+		NV40_FP_OP_OUT_NONE |
+		(NVFX_FP_PRECISION_FP16 << NVFX_FP_OP_PRECISION_SHIFT);
+	/* Use .xxxx swizzle so that we check only src[0].x*/
+	hw[1] = (0 << NVFX_FP_OP_COND_SWZ_X_SHIFT) |
+			(0 << NVFX_FP_OP_COND_SWZ_Y_SHIFT) |
+			(0 << NVFX_FP_OP_COND_SWZ_Z_SHIFT) |
+			(0 << NVFX_FP_OP_COND_SWZ_W_SHIFT) |
+			(NVFX_FP_OP_COND_NE << NVFX_FP_OP_COND_SHIFT);
+	hw[2] = 0; /* | NV40_FP_OP_OPCODE_IS_BRANCH | else_offset */
+	hw[3] = 0; /* | endif_offset */
+	util_dynarray_append(&fpc->if_stack, unsigned, fpc->inst_offset);
+}
+
+/* IF src.x != 0, as TGSI specifies */
+static void
+nv40_fp_cal(struct nvfx_fpc *fpc, unsigned target)
+{
+        struct nvfx_label_relocation reloc;
+        fpc->inst_offset = fpc->fp->insn_len;
+        grow_insns(fpc, 4);
+        uint32_t *hw = &fpc->fp->insn[fpc->inst_offset];
+        /* I really wonder why fp16 precision is used. Presumably the hardware ignores it? */
+        hw[0] = (NV40_FP_OP_BRA_OPCODE_CAL << NVFX_FP_OP_OPCODE_SHIFT);
+        /* Use .xxxx swizzle so that we check only src[0].x*/
+        hw[1] = (NVFX_SWZ_IDENTITY << NVFX_FP_OP_COND_SWZ_ALL_SHIFT) |
+                        (NVFX_FP_OP_COND_TR << NVFX_FP_OP_COND_SHIFT);
+        hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH; /* | call_offset */
+        hw[3] = 0;
+        reloc.target = target;
+        reloc.location = fpc->inst_offset + 2;
+        util_dynarray_append(&fpc->label_relocs, struct nvfx_label_relocation, reloc);
+}
+
+static void
+nv40_fp_ret(struct nvfx_fpc *fpc)
+{
+	fpc->inst_offset = fpc->fp->insn_len;
+	grow_insns(fpc, 4);
+	uint32_t *hw = &fpc->fp->insn[fpc->inst_offset];
+	/* I really wonder why fp16 precision is used. Presumably the hardware ignores it? */
+	hw[0] = (NV40_FP_OP_BRA_OPCODE_RET << NVFX_FP_OP_OPCODE_SHIFT);
+	/* Use .xxxx swizzle so that we check only src[0].x*/
+	hw[1] = (NVFX_SWZ_IDENTITY << NVFX_FP_OP_COND_SWZ_ALL_SHIFT) |
+			(NVFX_FP_OP_COND_TR << NVFX_FP_OP_COND_SHIFT);
+	hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH; /* | call_offset */
+	hw[3] = 0;
+}
+
+static void
+nv40_fp_rep(struct nvfx_fpc *fpc, unsigned count, unsigned target)
+{
+        struct nvfx_label_relocation reloc;
+        fpc->inst_offset = fpc->fp->insn_len;
+        grow_insns(fpc, 4);
+        uint32_t *hw = &fpc->fp->insn[fpc->inst_offset];
+        /* I really wonder why fp16 precision is used. Presumably the hardware ignores it? */
+        hw[0] = (NV40_FP_OP_BRA_OPCODE_REP << NVFX_FP_OP_OPCODE_SHIFT) |
+                        NV40_FP_OP_OUT_NONE |
+                        (NVFX_FP_PRECISION_FP16 << NVFX_FP_OP_PRECISION_SHIFT);
+        /* Use .xxxx swizzle so that we check only src[0].x*/
+        hw[1] = (NVFX_SWZ_IDENTITY << NVFX_FP_OP_COND_SWZ_ALL_SHIFT) |
+                        (NVFX_FP_OP_COND_TR << NVFX_FP_OP_COND_SHIFT);
+        hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH |
+                        (count << NV40_FP_OP_REP_COUNT1_SHIFT) |
+                        (count << NV40_FP_OP_REP_COUNT2_SHIFT) |
+                        (count << NV40_FP_OP_REP_COUNT3_SHIFT);
+        hw[3] = 0; /* | end_offset */
+        reloc.target = target;
+        reloc.location = fpc->inst_offset + 3;
+        util_dynarray_append(&fpc->label_relocs, struct nvfx_label_relocation, reloc);
+        //util_dynarray_append(&fpc->loop_stack, unsigned, target);
+}
+
+/* warning: this only works forward, and probably only if not inside any IF */
+static void
+nv40_fp_bra(struct nvfx_fpc *fpc, unsigned target)
+{
+        struct nvfx_label_relocation reloc;
+        fpc->inst_offset = fpc->fp->insn_len;
+        grow_insns(fpc, 4);
+        uint32_t *hw = &fpc->fp->insn[fpc->inst_offset];
+        /* I really wonder why fp16 precision is used. Presumably the hardware ignores it? */
+        hw[0] = (NV40_FP_OP_BRA_OPCODE_IF << NVFX_FP_OP_OPCODE_SHIFT) |
+                NV40_FP_OP_OUT_NONE |
+                (NVFX_FP_PRECISION_FP16 << NVFX_FP_OP_PRECISION_SHIFT);
+        /* Use .xxxx swizzle so that we check only src[0].x*/
+        hw[1] = (NVFX_SWZ_IDENTITY << NVFX_FP_OP_COND_SWZ_X_SHIFT) |
+                        (NVFX_FP_OP_COND_FL << NVFX_FP_OP_COND_SHIFT);
+        hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH; /* | else_offset */
+        hw[3] = 0; /* | endif_offset */
+        reloc.target = target;
+        reloc.location = fpc->inst_offset + 2;
+        util_dynarray_append(&fpc->label_relocs, struct nvfx_label_relocation, reloc);
+        reloc.target = target;
+        reloc.location = fpc->inst_offset + 3;
+        util_dynarray_append(&fpc->label_relocs, struct nvfx_label_relocation, reloc);
+}
+
+static void
+nv40_fp_brk(struct nvfx_fpc *fpc)
+{
+	fpc->inst_offset = fpc->fp->insn_len;
+	grow_insns(fpc, 4);
+	uint32_t *hw = &fpc->fp->insn[fpc->inst_offset];
+	/* I really wonder why fp16 precision is used. Presumably the hardware ignores it? */
+	hw[0] = (NV40_FP_OP_BRA_OPCODE_BRK << NVFX_FP_OP_OPCODE_SHIFT) |
+		NV40_FP_OP_OUT_NONE;
+	/* Use .xxxx swizzle so that we check only src[0].x*/
+	hw[1] = (NVFX_SWZ_IDENTITY << NVFX_FP_OP_COND_SWZ_X_SHIFT) |
+			(NVFX_FP_OP_COND_TR << NVFX_FP_OP_COND_SHIFT);
+	hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH;
+	hw[3] = 0;
+}
+
 static INLINE struct nvfx_src
 tgsi_src(struct nvfx_fpc *fpc, const struct tgsi_full_src_register *fsrc)
 {
@@ -516,9 +649,6 @@ nvfx_fragprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_fpc *fpc,
 	case TGSI_OPCODE_RCP:
 		nvfx_fp_emit(fpc, arith(sat, RCP, dst, mask, src[0], none, none));
 		break;
-	case TGSI_OPCODE_RET:
-		assert(0);
-		break;
 	case TGSI_OPCODE_RFL:
 		if(!nvfx->is_nv4x)
 			nvfx_fp_emit(fpc, arith(0, RFL_NV30, dst, mask, src[0], src[1], none));
@@ -604,13 +734,106 @@ nvfx_fragprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_fpc *fpc,
 		nvfx_fp_emit(fpc, arith(0, MUL, tmp.reg, mask, swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none));
 		nvfx_fp_emit(fpc, arith(sat, MAD, dst, (mask & ~NVFX_FP_MASK_W), swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y), neg(tmp)));
 		break;
-	default:
+
+	case TGSI_OPCODE_IF:
+		// MOVRC0 R31 (TR0.xyzw), R<src>:
+		// IF (NE.xxxx) ELSE <else> END <end>
+		if(!nvfx->is_nv4x)
+			goto nv3x_cflow;
+		nv40_fp_if(fpc, src[0]);
+		break;
+
+	case TGSI_OPCODE_ELSE:
+	{
+		if(!nvfx->is_nv4x)
+			goto nv3x_cflow;
+		assert(util_dynarray_contains(&fpc->if_stack, unsigned));
+		uint32_t *hw = &fpc->fp->insn[util_dynarray_top(&fpc->if_stack, unsigned)];
+		hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH | fpc->fp->insn_len;
+		break;
+	}
+
+	case TGSI_OPCODE_ENDIF:
+	{
+		if(!nvfx->is_nv4x)
+			goto nv3x_cflow;
+		assert(util_dynarray_contains(&fpc->if_stack, unsigned));
+		uint32_t *hw = &fpc->fp->insn[util_dynarray_pop(&fpc->if_stack, unsigned)];
+		if(!hw[2])
+			hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH | fpc->fp->insn_len;
+		hw[3] = fpc->fp->insn_len;
+		break;
+	}
+
+	case TGSI_OPCODE_BRA:
+		/* This can in limited cases be implemented with an IF with the else and endif labels pointing to the target */
+		/* no state tracker uses this, so don't implement this for now */
+		assert(0);
+		nv40_fp_bra(fpc, finst->Label.Label);
+		break;
+
+	case TGSI_OPCODE_BGNSUB:
+	case TGSI_OPCODE_ENDSUB:
+		/* nothing to do here */
+		break;
+
+	case TGSI_OPCODE_CAL:
+		if(!nvfx->is_nv4x)
+			goto nv3x_cflow;
+		nv40_fp_cal(fpc, finst->Label.Label);
+		break;
+
+	case TGSI_OPCODE_RET:
+		if(!nvfx->is_nv4x)
+			goto nv3x_cflow;
+		nv40_fp_ret(fpc);
+		break;
+
+	case TGSI_OPCODE_BGNLOOP:
+		if(!nvfx->is_nv4x)
+			goto nv3x_cflow;
+		/* TODO: we should support using two nested REPs to allow a > 255 iteration count */
+		nv40_fp_rep(fpc, 255, finst->Label.Label);
+		break;
+
+	case TGSI_OPCODE_ENDLOOP:
+		break;
+
+	case TGSI_OPCODE_BRK:
+		if(!nvfx->is_nv4x)
+			goto nv3x_cflow;
+		nv40_fp_brk(fpc);
+		break;
+
+	case TGSI_OPCODE_CONT:
+	{
+		static int warned = 0;
+		if(!warned) {
+			NOUVEAU_ERR("Sorry, the continue keyword is not implemented: ignoring it.\n");
+			warned = 1;
+		}
+		break;
+	}
+
+        default:
 		NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode);
 		return FALSE;
 	}
 
+out:
 	release_temps(fpc);
 	return TRUE;
+nv3x_cflow:
+	{
+		static int warned = 0;
+		if(!warned) {
+			NOUVEAU_ERR(
+					"Sorry, control flow instructions are not supported in hardware on nv3x: ignoring them\n"
+					"If rendering is incorrect, try to disable GLSL support in the application.\n");
+			warned = 1;
+		}
+	}
+	goto out;
 }
 
 static boolean
@@ -734,6 +957,7 @@ nvfx_fragprog_translate(struct nvfx_context *nvfx,
 {
 	struct tgsi_parse_context parse;
 	struct nvfx_fpc *fpc = NULL;
+	struct util_dynarray insns;
 
 	fpc = CALLOC(1, sizeof(struct nvfx_fpc));
 	if (!fpc)
@@ -748,6 +972,7 @@ nvfx_fragprog_translate(struct nvfx_context *nvfx,
 
 	tgsi_parse_init(&parse, fp->pipe.tokens);
 
+	util_dynarray_init(&insns);
 	while (!tgsi_parse_end_of_tokens(&parse)) {
 		tgsi_parse_token(&parse);
 
@@ -756,6 +981,7 @@ nvfx_fragprog_translate(struct nvfx_context *nvfx,
 		{
 			const struct tgsi_full_instruction *finst;
 
+			util_dynarray_append(&insns, unsigned, fp->insn_len);
 			finst = &parse.FullToken.FullInstruction;
 			if (!nvfx_fragprog_parse_instruction(nvfx, fpc, finst))
 				goto out_err;
@@ -765,6 +991,14 @@ nvfx_fragprog_translate(struct nvfx_context *nvfx,
 			break;
 		}
 	}
+	util_dynarray_append(&insns, unsigned, fp->insn_len);
+
+	for(unsigned i = 0; i < fpc->label_relocs.size; i += sizeof(struct nvfx_label_relocation))
+	{
+		struct nvfx_label_relocation* label_reloc = (struct nvfx_label_relocation*)((char*)fpc->label_relocs.data + i);
+		fp->insn[label_reloc->location] |= ((unsigned*)insns.data)[label_reloc->target];
+	}
+	util_dynarray_fini(&insns);
 
 	if(!nvfx->is_nv4x)
 		fp->fp_control |= (fpc->num_regs-1)/2;
@@ -775,7 +1009,7 @@ nvfx_fragprog_translate(struct nvfx_context *nvfx,
 	if(fp->insn)
 		fp->insn[fpc->inst_offset] |= 0x00000001;
 
-	/* Append NOP + END instruction, may or may not be necessary. */
+	/* Append NOP + END instruction for branches to the end of the program */
 	fpc->inst_offset = fp->insn_len;
 	grow_insns(fpc, 4);
 	fp->insn[fpc->inst_offset + 0] = 0x00000001;
@@ -799,6 +1033,9 @@ out_err:
 	tgsi_parse_free(&parse);
 	if (fpc->r_temp)
 		FREE(fpc->r_temp);
+	util_dynarray_fini(&fpc->if_stack);
+	util_dynarray_fini(&fpc->label_relocs);
+	//util_dynarray_fini(&fpc->loop_stack);
 	FREE(fpc);
 }
 
diff --git a/src/gallium/drivers/nvfx/nvfx_shader.h b/src/gallium/drivers/nvfx/nvfx_shader.h
index fcf577ca74c..52e684aec3b 100644
--- a/src/gallium/drivers/nvfx/nvfx_shader.h
+++ b/src/gallium/drivers/nvfx/nvfx_shader.h
@@ -509,4 +509,9 @@ nvfx_src_abs(struct nvfx_src src)
 	return src;
 }
 
+struct nvfx_label_relocation {
+        unsigned location;
+        unsigned target;
+};
+
 #endif
-- 
2.30.2