From 6f9978050eb8648888a728fc09b99e279c2b7b15 Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Tue, 31 Aug 2010 19:17:46 +0200 Subject: [PATCH] nv50: re-add proper TEXBIAS sequence --- src/gallium/drivers/nv50/nv50_pc.c | 29 +++ src/gallium/drivers/nv50/nv50_pc.h | 9 +- src/gallium/drivers/nv50/nv50_pc_emit.c | 28 ++- src/gallium/drivers/nv50/nv50_pc_optimize.c | 34 +--- src/gallium/drivers/nv50/nv50_pc_print.c | 5 +- src/gallium/drivers/nv50/nv50_pc_regalloc.c | 8 +- src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 212 +++++++++++++++++--- 7 files changed, 258 insertions(+), 67 deletions(-) diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c index b03f5b27f60..28e32eadb7d 100644 --- a/src/gallium/drivers/nv50/nv50_pc.c +++ b/src/gallium/drivers/nv50/nv50_pc.c @@ -204,6 +204,35 @@ nvcg_replace_value(struct nv_pc *pc, struct nv_value *old_val, return n; } +struct nv_value * +nvcg_find_constant(struct nv_ref *ref) +{ + struct nv_value *src; + + if (!ref) + return NULL; + + src = ref->value; + while (src->insn && src->insn->opcode == NV_OP_MOV) { + assert(!src->insn->src[0]->mod); + src = src->insn->src[0]->value; + } + if ((src->reg.file == NV_FILE_IMM) || + (src->insn && src->insn->opcode == NV_OP_LDA && + src->insn->src[0]->value->reg.file >= NV_FILE_MEM_C(0) && + src->insn->src[0]->value->reg.file <= NV_FILE_MEM_C(15))) + return src; + return NULL; +} + +struct nv_value * +nvcg_find_immediate(struct nv_ref *ref) +{ + struct nv_value *src = nvcg_find_constant(ref); + + return (src && src->reg.file == NV_FILE_IMM) ? src : NULL; +} + static void nv_pc_free_refs(struct nv_pc *pc) { diff --git a/src/gallium/drivers/nv50/nv50_pc.h b/src/gallium/drivers/nv50/nv50_pc.h index 2bb3ea4374c..adc46dec8db 100644 --- a/src/gallium/drivers/nv50/nv50_pc.h +++ b/src/gallium/drivers/nv50/nv50_pc.h @@ -126,6 +126,7 @@ #define NV_TYPE_ISINT(t) ((t) <= 5) #define NV_TYPE_ISFLT(t) ((t) & 0x08) +/* $cX registers contain 4 bits: OCSZ (Z is bit 0) */ #define NV_CC_FL 0x0 #define NV_CC_LT 0x1 #define NV_CC_EQ 0x2 @@ -135,6 +136,10 @@ #define NV_CC_GE 0x6 #define NV_CC_U 0x8 #define NV_CC_TR 0xf +#define NV_CC_O 0x10 +#define NV_CC_C 0x11 +#define NV_CC_A 0x12 +#define NV_CC_S 0x13 #define NV_PC_MAX_INSTRUCTIONS 2048 #define NV_PC_MAX_VALUES (NV_PC_MAX_INSTRUCTIONS * 4) @@ -241,7 +246,7 @@ struct nv_instruction { ubyte saturate : 1; ubyte centroid : 1; ubyte flat : 1; - ubyte padding : 4; + ubyte lanes : 4; ubyte tex_live : 1; /* */ ubyte tex_t; /* TIC binding */ @@ -459,6 +464,8 @@ boolean nvbb_reachable_by(struct nv_basic_block *, struct nv_basic_block *, struct nv_basic_block *nvbb_dom_frontier(struct nv_basic_block *); int nvcg_replace_value(struct nv_pc *pc, struct nv_value *old_val, struct nv_value *new_val); +struct nv_value *nvcg_find_immediate(struct nv_ref *); +struct nv_value *nvcg_find_constant(struct nv_ref *); typedef void (*nv_pc_pass_func)(void *priv, struct nv_basic_block *b); diff --git a/src/gallium/drivers/nv50/nv50_pc_emit.c b/src/gallium/drivers/nv50/nv50_pc_emit.c index e1d7bc64593..bb0a6f32d1e 100644 --- a/src/gallium/drivers/nv50/nv50_pc_emit.c +++ b/src/gallium/drivers/nv50/nv50_pc_emit.c @@ -540,8 +540,9 @@ emit_mov(struct nv_pc *pc, struct nv_instruction *i) set_a16_bits(pc, SREG(i->src[0])->id); } else if (DFILE(i, 0) == NV_FILE_FLAGS) { - pc->emit[0] = 0x000001fd; - pc->emit[1] = 0xa0000788 | (1 << 6); + pc->emit[0] = 0x00000001; + pc->emit[1] = 0xa0000000 | (1 << 6); + set_pred(pc, i); pc->emit[0] |= SREG(i->src[0])->id << 9; pc->emit[1] |= DREG(i->def[0])->id << 4; } else @@ -984,7 +985,7 @@ emit_tex(struct nv_pc *pc, struct nv_instruction *i) pc->emit[0] |= i->tex_t << 9; pc->emit[0] |= i->tex_s << 17; - pc->emit[0] |= i->tex_argc << 22; + pc->emit[0] |= (i->tex_argc - 1) << 22; pc->emit[0] |= (i->tex_mask & 0x3) << 25; pc->emit[1] |= (i->tex_mask & 0xc) << 12; @@ -1000,8 +1001,6 @@ emit_tex(struct nv_pc *pc, struct nv_instruction *i) else if (i->opcode == NV_OP_TXL) pc->emit[1] |= 0x40000000; - else - pc->emit[0] -= 1 << 22; } static void @@ -1053,6 +1052,20 @@ emit_ddy(struct nv_pc *pc, struct nv_instruction *i) set_pred_wr(pc, i); } +static void +emit_quadop(struct nv_pc *pc, struct nv_instruction *i) +{ + pc->emit[0] = 0xc0000000; + pc->emit[1] = 0x80000000; + + emit_form_ADD(pc, i); + + pc->emit[0] |= i->lanes << 16; + + pc->emit[0] |= (i->quadop & 0x03) << 20; + pc->emit[1] |= (i->quadop & 0xfc) << 20; +} + void nv50_emit_instruction(struct nv_pc *pc, struct nv_instruction *i) { @@ -1132,6 +1145,9 @@ nv50_emit_instruction(struct nv_pc *pc, struct nv_instruction *i) case NV_OP_TXL: emit_tex(pc, i); break; + case NV_OP_QUADOP: + emit_quadop(pc, i); + break; case NV_OP_KIL: emit_flow(pc, i, 0x0); break; @@ -1162,7 +1178,7 @@ nv50_emit_instruction(struct nv_pc *pc, struct nv_instruction *i) case NV_OP_UNDEF: case NV_OP_SUB: NOUVEAU_ERR("operation \"%s\" should have been eliminated\n", - nv_opcode_name(i->opcode)); + nv_opcode_name(i->opcode)); break; default: NOUVEAU_ERR("unhandled NV_OP: %d\n", i->opcode); diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c index 4a3a51512e2..fb95da30f23 100644 --- a/src/gallium/drivers/nv50/nv50_pc_optimize.c +++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c @@ -94,14 +94,17 @@ nvi_isnop(struct nv_instruction *nvi) if (nvi->opcode == NV_OP_EXPORT || nvi->opcode == NV_OP_UNDEF) return TRUE; - if (nvi->fixed || - nvi->is_terminator || - nvi->flags_src || + /* NOTE: 'fixed' now only means that it shouldn't be optimized away, + * but we can still remove it if it is a no-op move. + */ + if (/* nvi->fixed || */ + /* nvi->flags_src || */ /* cond. MOV to same register is still NOP */ nvi->flags_def || + nvi->is_terminator || nvi->is_join) return FALSE; - if (nvi->def[0]->join->reg.id < 0) + if (nvi->def[0] && nvi->def[0]->join->reg.id < 0) return TRUE; if (nvi->opcode != NV_OP_MOV && nvi->opcode != NV_OP_SELECT) @@ -436,22 +439,6 @@ nv_pass_lower_mods(struct nv_pass *ctx, struct nv_basic_block *b) #define SRC_IS_MUL(s) ((s)->insn && (s)->insn->opcode == NV_OP_MUL) -static struct nv_value * -find_immediate(struct nv_ref *ref) -{ - struct nv_value *src; - - if (!ref) - return NULL; - - src = ref->value; - while (src->insn && src->insn->opcode == NV_OP_MOV) { - assert(!src->insn->src[0]->mod); - src = src->insn->src[0]->value; - } - return (src->reg.file == NV_FILE_IMM) ? src : NULL; -} - static void modifiers_apply(uint32_t *val, ubyte type, ubyte mod) { @@ -663,8 +650,8 @@ nv_pass_lower_arith(struct nv_pass *ctx, struct nv_basic_block *b) next = nvi->next; - src0 = find_immediate(nvi->src[0]); - src1 = find_immediate(nvi->src[1]); + src0 = nvcg_find_immediate(nvi->src[0]); + src1 = nvcg_find_immediate(nvi->src[1]); if (src0 && src1) constant_expression(ctx->pc, nvi, src0, src1); @@ -778,6 +765,7 @@ nv_pass_reload_elim(struct nv_pass_reld_elim *ctx, struct nv_basic_block *b) if (ld->def[0]->reg.id >= 0) it->value = ld->def[0]; else + if (!ld->fixed) nvcg_replace_value(ctx->pc, ld->def[0], it->value); } else { if (ctx->alloc == LOAD_RECORD_POOL_SIZE) @@ -979,7 +967,7 @@ nv_pass_cse(struct nv_pass *ctx, struct nv_basic_block *b) for (ir = entry; ir; ir = next) { next = ir->next; for (ik = entry; ik != ir; ik = ik->next) { - if (ir->opcode != ik->opcode) + if (ir->opcode != ik->opcode || ir->fixed) continue; if (!ir->def[0] || !ik->def[0] || diff --git a/src/gallium/drivers/nv50/nv50_pc_print.c b/src/gallium/drivers/nv50/nv50_pc_print.c index 7bdeb1c78d4..01a6f009979 100644 --- a/src/gallium/drivers/nv50/nv50_pc_print.c +++ b/src/gallium/drivers/nv50/nv50_pc_print.c @@ -102,7 +102,8 @@ static const char *nv_opcode_names[NV_OP_COUNT + 1] = { static const char *nv_cond_names[] = { "never", "lt" , "eq" , "le" , "gt" , "ne" , "ge" , "", - "never", "ltu", "equ", "leu", "gtu", "neu", "geu", "" + "never", "ltu", "equ", "leu", "gtu", "neu", "geu", "", + "o", "c", "a", "s" }; static const char *nv_modifier_strings[] = @@ -144,7 +145,7 @@ nv_type_name(ubyte type) static INLINE const char * nv_cond_name(ubyte cc) { - return nv_cond_names[MIN2(cc, 15)]; + return nv_cond_names[MIN2(cc, 19)]; } static INLINE const char * diff --git a/src/gallium/drivers/nv50/nv50_pc_regalloc.c b/src/gallium/drivers/nv50/nv50_pc_regalloc.c index 81decf8d4a5..e689d349f1d 100644 --- a/src/gallium/drivers/nv50/nv50_pc_regalloc.c +++ b/src/gallium/drivers/nv50/nv50_pc_regalloc.c @@ -493,10 +493,10 @@ pass_join_values(struct nv_pc_pass *ctx, int iter) case NV_OP_SELECT: if (!iter) break; - assert(join_allowed(ctx, i->def[0], i->src[0]->value)); - assert(join_allowed(ctx, i->def[0], i->src[1]->value)); - do_join_values(ctx, i->def[0], i->src[0]->value); - do_join_values(ctx, i->def[0], i->src[1]->value); + for (c = 0; c < 4 && i->src[c]; ++c) { + assert(join_allowed(ctx, i->def[0], i->src[c]->value)); + do_join_values(ctx, i->def[0], i->src[c]->value); + } break; case NV_OP_TEX: case NV_OP_TXB: diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c index c98d5e126ad..27d851e9fdb 100644 --- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c +++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c @@ -1156,8 +1156,8 @@ get_tex_dim(const struct tgsi_full_instruction *insn, int *dim, int *arg) static void load_proj_tex_coords(struct bld_context *bld, - struct nv_value *t[4], int dim, - const struct tgsi_full_instruction *insn) + struct nv_value *t[4], int dim, + const struct tgsi_full_instruction *insn) { int c, mask = 0; @@ -1188,59 +1188,209 @@ load_proj_tex_coords(struct bld_context *bld, } } +/* For a quad of threads / top left, top right, bottom left, bottom right + * pixels, do a different operation, and take src0 from a specific thread. + */ +#define QOP_ADD 0 +#define QOP_SUBR 1 +#define QOP_SUB 2 +#define QOP_MOV1 3 + +#define QOP(a, b, c, d) \ + ((QOP_##a << 0) | (QOP_##b << 2) | (QOP_##c << 4) | (QOP_##d << 6)) + +static INLINE struct nv_value * +bld_quadop(struct bld_context *bld, ubyte qop, struct nv_value *src0, int lane, + struct nv_value *src1, boolean wp) +{ + struct nv_value *val = bld_insn_2(bld, NV_OP_QUADOP, src0, src1); + val->insn->lanes = lane; + val->insn->quadop = qop; + if (wp) { + val->insn->flags_def = new_value(bld->pc, NV_FILE_FLAGS, NV_TYPE_U16); + val->insn->flags_def->insn = val->insn; + } + return val; +} + +static INLINE struct nv_value * +bld_cmov(struct bld_context *bld, + struct nv_value *src, ubyte cc, struct nv_value *cr) +{ + src = bld_insn_1(bld, NV_OP_MOV, src); + + src->insn->cc = cc; + src->insn->flags_src = new_ref(bld->pc, cr); + + return src; +} + +static struct nv_instruction * +emit_tex(struct bld_context *bld, uint opcode, + struct nv_value *dst[4], struct nv_value *t_in[4], + int argc, int tic, int tsc, int cube) +{ + struct nv_value *t[4]; + struct nv_instruction *nvi; + int c; + + /* the inputs to a tex instruction must be separate values */ + for (c = 0; c < argc; ++c) { + t[c] = bld_insn_1(bld, NV_OP_MOV, t_in[c]); + t[c]->reg.type = NV_TYPE_F32; + t[c]->insn->fixed = 1; + } + + nvi = new_instruction(bld->pc, opcode); + + for (c = 0; c < 4; ++c) + dst[c] = bld_def(nvi, c, new_value(bld->pc, NV_FILE_GPR, NV_TYPE_F32)); + + for (c = 0; c < argc; ++c) + nvi->src[c] = new_ref(bld->pc, t[c]); + + nvi->tex_t = tic; + nvi->tex_s = tsc; + nvi->tex_mask = 0xf; + nvi->tex_cube = cube; + nvi->tex_live = 0; + nvi->tex_argc = argc; + + return nvi; +} + +static void +bld_texlod_sequence(struct bld_context *bld, + struct nv_value *dst[4], struct nv_value *t[4], int arg, + int tic, int tsc, int cube) +{ + emit_tex(bld, NV_OP_TXL, dst, t, arg, tic, tsc, cube); /* TODO */ +} + + +/* The lanes of a quad are grouped by the bit in the condition register + * they have set, which is selected by differing bias values. + * Move the input values for TEX into a new register set for each group + * and execute TEX only for a specific group. + * We always need to use 4 new registers for the inputs/outputs because + * the implicitly calculated derivatives must be correct. + */ +static void +bld_texbias_sequence(struct bld_context *bld, + struct nv_value *dst[4], struct nv_value *t[4], int arg, + int tic, int tsc, int cube) +{ + struct nv_instruction *sel, *tex; + struct nv_value *bit[4], *cr[4], *res[4][4], *val; + int l, c; + + const ubyte cc[4] = { NV_CC_EQ, NV_CC_S, NV_CC_C, NV_CC_O }; + + for (l = 0; l < 4; ++l) { + bit[l] = bld_load_imm_u32(bld, 1 << l); + + val = bld_quadop(bld, QOP(SUBR, SUBR, SUBR, SUBR), + t[arg - 1], l, t[arg - 1], TRUE); + + cr[l] = bld_cmov(bld, bit[l], NV_CC_EQ, val->insn->flags_def); + + cr[l]->reg.file = NV_FILE_FLAGS; + cr[l]->reg.type = NV_TYPE_U16; + } + + sel = new_instruction(bld->pc, NV_OP_SELECT); + + for (l = 0; l < 4; ++l) + sel->src[l] = new_ref(bld->pc, cr[l]); + + bld_def(sel, 0, new_value(bld->pc, NV_FILE_FLAGS, NV_TYPE_U16)); + + for (l = 0; l < 4; ++l) { + tex = emit_tex(bld, NV_OP_TXB, dst, t, arg, tic, tsc, cube); + + tex->cc = cc[l]; + tex->flags_src = new_ref(bld->pc, sel->def[0]); + + for (c = 0; c < 4; ++c) + res[l][c] = tex->def[c]; + } + + for (l = 0; l < 4; ++l) + for (c = 0; c < 4; ++c) + res[l][c] = bld_cmov(bld, res[l][c], cc[l], sel->def[0]); + + for (c = 0; c < 4; ++c) { + sel = new_instruction(bld->pc, NV_OP_SELECT); + + for (l = 0; l < 4; ++l) + sel->src[l] = new_ref(bld->pc, res[l][c]); + + bld_def(sel, 0, (dst[c] = new_value(bld->pc, NV_FILE_GPR, NV_TYPE_F32))); + } +} + +static boolean +bld_is_constant(struct nv_value *val) +{ + if (val->reg.file == NV_FILE_IMM) + return TRUE; + return val->insn && nvcg_find_constant(val->insn->src[0]); +} + static void bld_tex(struct bld_context *bld, struct nv_value *dst0[4], const struct tgsi_full_instruction *insn) { - struct nv_value *t[4]; - struct nv_instruction *nvi; + struct nv_value *t[4], *s[3]; uint opcode = translate_opcode(insn->Instruction.Opcode); int arg, dim, c; + const int tic = insn->Src[1].Register.Index; + const int tsc = 0; + const int cube = (insn->Texture.Texture == TGSI_TEXTURE_CUBE) ? 1 : 0; get_tex_dim(insn, &dim, &arg); - if (insn->Texture.Texture == TGSI_TEXTURE_CUBE) { - } - // else - if (insn->Instruction.Opcode == TGSI_OPCODE_TXP) { + if (!cube && insn->Instruction.Opcode == TGSI_OPCODE_TXP) load_proj_tex_coords(bld, t, dim, insn); - } else + else for (c = 0; c < dim; ++c) t[c] = emit_fetch(bld, insn, 0, c); - if (arg != dim) - t[dim] = emit_fetch(bld, insn, 0, 2); + if (cube) { + assert(dim >= 3); + for (c = 0; c < 3; ++c) + s[c] = bld_insn_1(bld, NV_OP_ABS, t[c]); - if (insn->Instruction.Opcode == TGSI_OPCODE_TXB || - insn->Instruction.Opcode == TGSI_OPCODE_TXL) { - t[arg++] = emit_fetch(bld, insn, 0, 3); - } + s[0] = bld_insn_2(bld, NV_OP_MAX, s[0], s[1]); + s[0] = bld_insn_2(bld, NV_OP_MAX, s[0], s[2]); + s[0] = bld_insn_1(bld, NV_OP_RCP, s[0]); - for (c = 0; c < arg; ++c) { - t[c] = bld_insn_1(bld, NV_OP_MOV, t[c]); - t[c]->reg.type = NV_TYPE_F32; + for (c = 0; c < 3; ++c) + t[c] = bld_insn_2(bld, NV_OP_MUL, t[c], s[0]); } - nvi = new_instruction(bld->pc, opcode); + if (arg != dim) + t[dim] = emit_fetch(bld, insn, 0, 2); - for (c = 0; c < 4; ++c) { - nvi->def[c] = dst0[c] = new_value(bld->pc, NV_FILE_GPR, NV_TYPE_F32); - nvi->def[c]->insn = nvi; + if (opcode == NV_OP_TXB || opcode == NV_OP_TXL) { + t[arg++] = emit_fetch(bld, insn, 0, 3); + + if ((bld->ti->p->type == PIPE_SHADER_FRAGMENT) && + !bld_is_constant(t[arg - 1])) { + if (opcode == NV_OP_TXB) + bld_texbias_sequence(bld, dst0, t, arg, tic, tsc, cube); + else + bld_texlod_sequence(bld, dst0, t, arg, tic, tsc, cube); + return; + } } - for (c = 0; c < arg; ++c) - nvi->src[c] = new_ref(bld->pc, t[c]); - nvi->tex_t = insn->Src[1].Register.Index; - nvi->tex_s = 0; - nvi->tex_mask = 0xf; - nvi->tex_cube = (insn->Texture.Texture == TGSI_TEXTURE_CUBE) ? 1 : 0; - nvi->tex_live = 0; - nvi->tex_argc = arg; + emit_tex(bld, opcode, dst0, t, arg, tic, tsc, cube); } static INLINE struct nv_value * bld_dot(struct bld_context *bld, const struct tgsi_full_instruction *insn, - int n) + int n) { struct nv_value *dotp, *src0, *src1; int c; -- 2.30.2