From af4a6eba55ad81e343db788a97f3eaa3cf184369 Mon Sep 17 00:00:00 2001 From: Luca Barbieri Date: Fri, 20 Aug 2010 21:16:49 +0200 Subject: [PATCH] nv40: add fragment program control flow --- src/gallium/drivers/nvfx/nvfx_fragprog.c | 247 ++++++++++++++++++++++- src/gallium/drivers/nvfx/nvfx_shader.h | 5 + 2 files changed, 247 insertions(+), 5 deletions(-) diff --git a/src/gallium/drivers/nvfx/nvfx_fragprog.c b/src/gallium/drivers/nvfx/nvfx_fragprog.c index f8c4eae3873..91776371e46 100644 --- a/src/gallium/drivers/nvfx/nvfx_fragprog.c +++ b/src/gallium/drivers/nvfx/nvfx_fragprog.c @@ -15,6 +15,7 @@ #define MAX_CONSTS 128 #define MAX_IMM 32 + struct nvfx_fpc { struct nvfx_fragment_program *fp; @@ -38,6 +39,10 @@ struct nvfx_fpc { unsigned nr_imm; unsigned char generic_to_slot[256]; /* semantic idx for each input semantic */ + + struct util_dynarray if_stack; + //struct util_dynarray loop_stack; + struct util_dynarray label_relocs; }; static INLINE struct nvfx_reg @@ -233,6 +238,134 @@ nvfx_fp_emit(struct nvfx_fpc *fpc, struct nvfx_insn insn) nvfx_insn((s), NVFX_FP_OP_OPCODE_##o, (u), \ (d), (m), (s0), none, none) +/* IF src.x != 0, as TGSI specifies */ +static void +nv40_fp_if(struct nvfx_fpc *fpc, struct nvfx_src src) +{ + const struct nvfx_src none = nvfx_src(nvfx_reg(NVFXSR_NONE, 0)); + struct nvfx_insn insn = arith(0, MOV, none.reg, NVFX_FP_MASK_X, src, none, none); + insn.cc_update = 1; + nvfx_fp_emit(fpc, insn); + + fpc->inst_offset = fpc->fp->insn_len; + grow_insns(fpc, 4); + uint32_t *hw = &fpc->fp->insn[fpc->inst_offset]; + /* I really wonder why fp16 precision is used. Presumably the hardware ignores it? */ + hw[0] = (NV40_FP_OP_BRA_OPCODE_IF << NVFX_FP_OP_OPCODE_SHIFT) | + NV40_FP_OP_OUT_NONE | + (NVFX_FP_PRECISION_FP16 << NVFX_FP_OP_PRECISION_SHIFT); + /* Use .xxxx swizzle so that we check only src[0].x*/ + hw[1] = (0 << NVFX_FP_OP_COND_SWZ_X_SHIFT) | + (0 << NVFX_FP_OP_COND_SWZ_Y_SHIFT) | + (0 << NVFX_FP_OP_COND_SWZ_Z_SHIFT) | + (0 << NVFX_FP_OP_COND_SWZ_W_SHIFT) | + (NVFX_FP_OP_COND_NE << NVFX_FP_OP_COND_SHIFT); + hw[2] = 0; /* | NV40_FP_OP_OPCODE_IS_BRANCH | else_offset */ + hw[3] = 0; /* | endif_offset */ + util_dynarray_append(&fpc->if_stack, unsigned, fpc->inst_offset); +} + +/* IF src.x != 0, as TGSI specifies */ +static void +nv40_fp_cal(struct nvfx_fpc *fpc, unsigned target) +{ + struct nvfx_label_relocation reloc; + fpc->inst_offset = fpc->fp->insn_len; + grow_insns(fpc, 4); + uint32_t *hw = &fpc->fp->insn[fpc->inst_offset]; + /* I really wonder why fp16 precision is used. Presumably the hardware ignores it? */ + hw[0] = (NV40_FP_OP_BRA_OPCODE_CAL << NVFX_FP_OP_OPCODE_SHIFT); + /* Use .xxxx swizzle so that we check only src[0].x*/ + hw[1] = (NVFX_SWZ_IDENTITY << NVFX_FP_OP_COND_SWZ_ALL_SHIFT) | + (NVFX_FP_OP_COND_TR << NVFX_FP_OP_COND_SHIFT); + hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH; /* | call_offset */ + hw[3] = 0; + reloc.target = target; + reloc.location = fpc->inst_offset + 2; + util_dynarray_append(&fpc->label_relocs, struct nvfx_label_relocation, reloc); +} + +static void +nv40_fp_ret(struct nvfx_fpc *fpc) +{ + fpc->inst_offset = fpc->fp->insn_len; + grow_insns(fpc, 4); + uint32_t *hw = &fpc->fp->insn[fpc->inst_offset]; + /* I really wonder why fp16 precision is used. Presumably the hardware ignores it? */ + hw[0] = (NV40_FP_OP_BRA_OPCODE_RET << NVFX_FP_OP_OPCODE_SHIFT); + /* Use .xxxx swizzle so that we check only src[0].x*/ + hw[1] = (NVFX_SWZ_IDENTITY << NVFX_FP_OP_COND_SWZ_ALL_SHIFT) | + (NVFX_FP_OP_COND_TR << NVFX_FP_OP_COND_SHIFT); + hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH; /* | call_offset */ + hw[3] = 0; +} + +static void +nv40_fp_rep(struct nvfx_fpc *fpc, unsigned count, unsigned target) +{ + struct nvfx_label_relocation reloc; + fpc->inst_offset = fpc->fp->insn_len; + grow_insns(fpc, 4); + uint32_t *hw = &fpc->fp->insn[fpc->inst_offset]; + /* I really wonder why fp16 precision is used. Presumably the hardware ignores it? */ + hw[0] = (NV40_FP_OP_BRA_OPCODE_REP << NVFX_FP_OP_OPCODE_SHIFT) | + NV40_FP_OP_OUT_NONE | + (NVFX_FP_PRECISION_FP16 << NVFX_FP_OP_PRECISION_SHIFT); + /* Use .xxxx swizzle so that we check only src[0].x*/ + hw[1] = (NVFX_SWZ_IDENTITY << NVFX_FP_OP_COND_SWZ_ALL_SHIFT) | + (NVFX_FP_OP_COND_TR << NVFX_FP_OP_COND_SHIFT); + hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH | + (count << NV40_FP_OP_REP_COUNT1_SHIFT) | + (count << NV40_FP_OP_REP_COUNT2_SHIFT) | + (count << NV40_FP_OP_REP_COUNT3_SHIFT); + hw[3] = 0; /* | end_offset */ + reloc.target = target; + reloc.location = fpc->inst_offset + 3; + util_dynarray_append(&fpc->label_relocs, struct nvfx_label_relocation, reloc); + //util_dynarray_append(&fpc->loop_stack, unsigned, target); +} + +/* warning: this only works forward, and probably only if not inside any IF */ +static void +nv40_fp_bra(struct nvfx_fpc *fpc, unsigned target) +{ + struct nvfx_label_relocation reloc; + fpc->inst_offset = fpc->fp->insn_len; + grow_insns(fpc, 4); + uint32_t *hw = &fpc->fp->insn[fpc->inst_offset]; + /* I really wonder why fp16 precision is used. Presumably the hardware ignores it? */ + hw[0] = (NV40_FP_OP_BRA_OPCODE_IF << NVFX_FP_OP_OPCODE_SHIFT) | + NV40_FP_OP_OUT_NONE | + (NVFX_FP_PRECISION_FP16 << NVFX_FP_OP_PRECISION_SHIFT); + /* Use .xxxx swizzle so that we check only src[0].x*/ + hw[1] = (NVFX_SWZ_IDENTITY << NVFX_FP_OP_COND_SWZ_X_SHIFT) | + (NVFX_FP_OP_COND_FL << NVFX_FP_OP_COND_SHIFT); + hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH; /* | else_offset */ + hw[3] = 0; /* | endif_offset */ + reloc.target = target; + reloc.location = fpc->inst_offset + 2; + util_dynarray_append(&fpc->label_relocs, struct nvfx_label_relocation, reloc); + reloc.target = target; + reloc.location = fpc->inst_offset + 3; + util_dynarray_append(&fpc->label_relocs, struct nvfx_label_relocation, reloc); +} + +static void +nv40_fp_brk(struct nvfx_fpc *fpc) +{ + fpc->inst_offset = fpc->fp->insn_len; + grow_insns(fpc, 4); + uint32_t *hw = &fpc->fp->insn[fpc->inst_offset]; + /* I really wonder why fp16 precision is used. Presumably the hardware ignores it? */ + hw[0] = (NV40_FP_OP_BRA_OPCODE_BRK << NVFX_FP_OP_OPCODE_SHIFT) | + NV40_FP_OP_OUT_NONE; + /* Use .xxxx swizzle so that we check only src[0].x*/ + hw[1] = (NVFX_SWZ_IDENTITY << NVFX_FP_OP_COND_SWZ_X_SHIFT) | + (NVFX_FP_OP_COND_TR << NVFX_FP_OP_COND_SHIFT); + hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH; + hw[3] = 0; +} + static INLINE struct nvfx_src tgsi_src(struct nvfx_fpc *fpc, const struct tgsi_full_src_register *fsrc) { @@ -516,9 +649,6 @@ nvfx_fragprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_fpc *fpc, case TGSI_OPCODE_RCP: nvfx_fp_emit(fpc, arith(sat, RCP, dst, mask, src[0], none, none)); break; - case TGSI_OPCODE_RET: - assert(0); - break; case TGSI_OPCODE_RFL: if(!nvfx->is_nv4x) nvfx_fp_emit(fpc, arith(0, RFL_NV30, dst, mask, src[0], src[1], none)); @@ -604,13 +734,106 @@ nvfx_fragprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_fpc *fpc, nvfx_fp_emit(fpc, arith(0, MUL, tmp.reg, mask, swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none)); nvfx_fp_emit(fpc, arith(sat, MAD, dst, (mask & ~NVFX_FP_MASK_W), swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y), neg(tmp))); break; - default: + + case TGSI_OPCODE_IF: + // MOVRC0 R31 (TR0.xyzw), R: + // IF (NE.xxxx) ELSE END + if(!nvfx->is_nv4x) + goto nv3x_cflow; + nv40_fp_if(fpc, src[0]); + break; + + case TGSI_OPCODE_ELSE: + { + if(!nvfx->is_nv4x) + goto nv3x_cflow; + assert(util_dynarray_contains(&fpc->if_stack, unsigned)); + uint32_t *hw = &fpc->fp->insn[util_dynarray_top(&fpc->if_stack, unsigned)]; + hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH | fpc->fp->insn_len; + break; + } + + case TGSI_OPCODE_ENDIF: + { + if(!nvfx->is_nv4x) + goto nv3x_cflow; + assert(util_dynarray_contains(&fpc->if_stack, unsigned)); + uint32_t *hw = &fpc->fp->insn[util_dynarray_pop(&fpc->if_stack, unsigned)]; + if(!hw[2]) + hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH | fpc->fp->insn_len; + hw[3] = fpc->fp->insn_len; + break; + } + + case TGSI_OPCODE_BRA: + /* This can in limited cases be implemented with an IF with the else and endif labels pointing to the target */ + /* no state tracker uses this, so don't implement this for now */ + assert(0); + nv40_fp_bra(fpc, finst->Label.Label); + break; + + case TGSI_OPCODE_BGNSUB: + case TGSI_OPCODE_ENDSUB: + /* nothing to do here */ + break; + + case TGSI_OPCODE_CAL: + if(!nvfx->is_nv4x) + goto nv3x_cflow; + nv40_fp_cal(fpc, finst->Label.Label); + break; + + case TGSI_OPCODE_RET: + if(!nvfx->is_nv4x) + goto nv3x_cflow; + nv40_fp_ret(fpc); + break; + + case TGSI_OPCODE_BGNLOOP: + if(!nvfx->is_nv4x) + goto nv3x_cflow; + /* TODO: we should support using two nested REPs to allow a > 255 iteration count */ + nv40_fp_rep(fpc, 255, finst->Label.Label); + break; + + case TGSI_OPCODE_ENDLOOP: + break; + + case TGSI_OPCODE_BRK: + if(!nvfx->is_nv4x) + goto nv3x_cflow; + nv40_fp_brk(fpc); + break; + + case TGSI_OPCODE_CONT: + { + static int warned = 0; + if(!warned) { + NOUVEAU_ERR("Sorry, the continue keyword is not implemented: ignoring it.\n"); + warned = 1; + } + break; + } + + default: NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode); return FALSE; } +out: release_temps(fpc); return TRUE; +nv3x_cflow: + { + static int warned = 0; + if(!warned) { + NOUVEAU_ERR( + "Sorry, control flow instructions are not supported in hardware on nv3x: ignoring them\n" + "If rendering is incorrect, try to disable GLSL support in the application.\n"); + warned = 1; + } + } + goto out; } static boolean @@ -734,6 +957,7 @@ nvfx_fragprog_translate(struct nvfx_context *nvfx, { struct tgsi_parse_context parse; struct nvfx_fpc *fpc = NULL; + struct util_dynarray insns; fpc = CALLOC(1, sizeof(struct nvfx_fpc)); if (!fpc) @@ -748,6 +972,7 @@ nvfx_fragprog_translate(struct nvfx_context *nvfx, tgsi_parse_init(&parse, fp->pipe.tokens); + util_dynarray_init(&insns); while (!tgsi_parse_end_of_tokens(&parse)) { tgsi_parse_token(&parse); @@ -756,6 +981,7 @@ nvfx_fragprog_translate(struct nvfx_context *nvfx, { const struct tgsi_full_instruction *finst; + util_dynarray_append(&insns, unsigned, fp->insn_len); finst = &parse.FullToken.FullInstruction; if (!nvfx_fragprog_parse_instruction(nvfx, fpc, finst)) goto out_err; @@ -765,6 +991,14 @@ nvfx_fragprog_translate(struct nvfx_context *nvfx, break; } } + util_dynarray_append(&insns, unsigned, fp->insn_len); + + for(unsigned i = 0; i < fpc->label_relocs.size; i += sizeof(struct nvfx_label_relocation)) + { + struct nvfx_label_relocation* label_reloc = (struct nvfx_label_relocation*)((char*)fpc->label_relocs.data + i); + fp->insn[label_reloc->location] |= ((unsigned*)insns.data)[label_reloc->target]; + } + util_dynarray_fini(&insns); if(!nvfx->is_nv4x) fp->fp_control |= (fpc->num_regs-1)/2; @@ -775,7 +1009,7 @@ nvfx_fragprog_translate(struct nvfx_context *nvfx, if(fp->insn) fp->insn[fpc->inst_offset] |= 0x00000001; - /* Append NOP + END instruction, may or may not be necessary. */ + /* Append NOP + END instruction for branches to the end of the program */ fpc->inst_offset = fp->insn_len; grow_insns(fpc, 4); fp->insn[fpc->inst_offset + 0] = 0x00000001; @@ -799,6 +1033,9 @@ out_err: tgsi_parse_free(&parse); if (fpc->r_temp) FREE(fpc->r_temp); + util_dynarray_fini(&fpc->if_stack); + util_dynarray_fini(&fpc->label_relocs); + //util_dynarray_fini(&fpc->loop_stack); FREE(fpc); } diff --git a/src/gallium/drivers/nvfx/nvfx_shader.h b/src/gallium/drivers/nvfx/nvfx_shader.h index fcf577ca74c..52e684aec3b 100644 --- a/src/gallium/drivers/nvfx/nvfx_shader.h +++ b/src/gallium/drivers/nvfx/nvfx_shader.h @@ -509,4 +509,9 @@ nvfx_src_abs(struct nvfx_src src) return src; } +struct nvfx_label_relocation { + unsigned location; + unsigned target; +}; + #endif -- 2.30.2