From 3c4c6d1f80ba4881155eceba18c634099383346c Mon Sep 17 00:00:00 2001 From: Ben Skeggs Date: Sun, 5 Jun 2005 08:25:54 +0000 Subject: [PATCH] Fix xyz/w interaction (needs a cleanup still..) Use SRC0A instead of WZY/XXX combination for W in XYZ positions. Remove dodgy hack from POW opcode, now works correctly without it --- src/mesa/drivers/dri/r300/r300_context.c | 4 +- src/mesa/drivers/dri/r300/r300_fragprog.c | 241 +++++++++++----------- 2 files changed, 128 insertions(+), 117 deletions(-) diff --git a/src/mesa/drivers/dri/r300/r300_context.c b/src/mesa/drivers/dri/r300/r300_context.c index f4ed7159dd2..f949e92081e 100644 --- a/src/mesa/drivers/dri/r300/r300_context.c +++ b/src/mesa/drivers/dri/r300/r300_context.c @@ -74,9 +74,10 @@ static const char *const card_extensions[] = { "GL_ARB_texture_border_clamp", "GL_ARB_texture_compression", /* disable until we support it, fixes a few things in ut2004 */ -// "GL_ARB_texture_cube_map", +/* "GL_ARB_texture_cube_map", */ "GL_ARB_texture_env_add", "GL_ARB_texture_env_combine", + "GL_ARB_texture_env_crossbar", "GL_ARB_texture_env_dot3", "GL_ARB_texture_mirrored_repeat", "GL_ARB_vertex_buffer_object", @@ -104,7 +105,6 @@ static const char *const card_extensions[] = { "GL_NV_blend_square", "GL_NV_vertex_program", "GL_SGIS_generate_mipmap", - "GL_ARB_texture_env_crossbar", NULL }; diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.c b/src/mesa/drivers/dri/r300/r300_fragprog.c index de6a36f50bc..e6a68ab89b1 100644 --- a/src/mesa/drivers/dri/r300/r300_fragprog.c +++ b/src/mesa/drivers/dri/r300/r300_fragprog.c @@ -40,8 +40,6 @@ * fglrx does (see r300_reg.h). * - Verify results of opcodes for accuracy, I've only checked them * in specific cases. - * - Learn more about interaction between xyz/w units.. A few bugs are - * caused by something I'm missing.. * - and more... */ @@ -112,11 +110,13 @@ static const struct r300_pfv_swizzle { { "xxx", MAKE_SWZ3(X, X, X), GL_TRUE, R300_FPI0_ARGC_SRC0C_XXX, 4, GL_FALSE }, { "yyy", MAKE_SWZ3(Y, Y, Y), GL_TRUE, R300_FPI0_ARGC_SRC0C_YYY, 4, GL_FALSE }, { "zzz", MAKE_SWZ3(Z, Z, Z), GL_TRUE, R300_FPI0_ARGC_SRC0C_ZZZ, 4, GL_FALSE }, + { "www", MAKE_SWZ3(W, W, W), GL_TRUE, R300_FPI0_ARGC_SRC0A, 1, GL_TRUE }, { "yzx", MAKE_SWZ3(Y, Z, X), GL_TRUE, R300_FPI0_ARGC_SRC0C_YZX, 1, GL_FALSE }, { "zxy", MAKE_SWZ3(Z, X, Y), GL_TRUE, R300_FPI0_ARGC_SRC0C_ZXY, 1, GL_FALSE }, - { "wzy", MAKE_SWZ3(W, Z, Y), GL_TRUE, R300_FPI0_ARGC_SRC0CA_WZY, 1, GL_TRUE }, +/* disable this for now, until I find a clean way of making sure xyz/w streams + * have a source in the same register slot.. */ +// { "wzy", MAKE_SWZ3(W, Z, Y), GL_TRUE, R300_FPI0_ARGC_SRC0CA_WZY, 1, GL_TRUE }, /* special cases */ - { NULL, MAKE_SWZ3(W, W, W), GL_FALSE, 0, 0, GL_FALSE}, { NULL, MAKE_SWZ3(ONE, ONE, ONE), GL_FALSE, R300_FPI0_ARGC_ONE, 0, GL_FALSE}, { NULL, MAKE_SWZ3(ZERO, ZERO, ZERO), GL_FALSE, R300_FPI0_ARGC_ZERO, 0, GL_FALSE}, { NULL, PFS_INVAL, GL_FALSE, R300_FPI0_ARGC_HALF, 0, GL_FALSE}, @@ -124,10 +124,10 @@ static const struct r300_pfv_swizzle { }; #define SWIZZLE_XYZ 0 #define SWIZZLE_XXX 1 -#define SWIZZLE_WZY 6 -#define SWIZZLE_111 8 -#define SWIZZLE_000 9 -#define SWIZZLE_HHH 10 +#define SWIZZLE_WWW 4 +#define SWIZZLE_111 7 +#define SWIZZLE_000 8 +#define SWIZZLE_HHH 9 #define SWZ_X_MASK (7 << 0) #define SWZ_Y_MASK (7 << 3) @@ -320,30 +320,6 @@ static int swz_special_case(struct r300_fragment_program *rp, pfs_reg_t ssrc = pfs_default_reg; switch(GET_SWZ(v_swiz[src.v_swz].hash, 0)) { - case SWIZZLE_W: - ssrc = get_temp_reg(rp); - src.v_swz = SWIZZLE_WZY; - if (s_mask[mask].count == 3) { - emit_arith(rp, PFS_OP_MAD, ssrc, WRITEMASK_XW, src, pfs_one, pfs_zero, 0); - *r = ssrc; - r->v_swz = SWIZZLE_XXX; - r->s_swz = SWIZZLE_W; - } else if (mc + s_mask[mask].count == 3) { - if (!r->valid) - *r = get_temp_reg(rp); - emit_arith(rp, PFS_OP_MAD, ssrc, WRITEMASK_XW, src, pfs_one, pfs_zero, 0); - ssrc.v_swz = SWIZZLE_XXX; - emit_arith(rp, PFS_OP_MAD, *r, s_mask[mask].mask|WRITEMASK_W, ssrc, pfs_one, pfs_zero, 0); - free_temp(rp, ssrc); - } else { - if (!r->valid) - *r = get_temp_reg(rp); - emit_arith(rp, PFS_OP_MAD, ssrc, WRITEMASK_X, src, pfs_one, pfs_zero, 0); - ssrc.v_swz = SWIZZLE_XXX; - emit_arith(rp, PFS_OP_MAD, *r, s_mask[mask].mask, ssrc, pfs_one, pfs_zero, 0); - free_temp(rp, ssrc); - } - break; case SWIZZLE_ONE: case SWIZZLE_ZERO: if (!r->valid) @@ -472,16 +448,16 @@ static void sync_streams(struct r300_fragment_program *rp) { /* Bring vector/scalar streams into sync, inserting nops into * whatever stream is lagging behind * - * I'm using "MAD t0, t0, 1.0, 0.0" as a NOP + * Using NOP == MAD out.none, 0, 0, 0 */ while (rp->v_pos != rp->s_pos) { if (rp->s_pos > rp->v_pos) { - rp->alu.inst[rp->v_pos].inst0 = 0x00050A80; - rp->alu.inst[rp->v_pos].inst1 = 0x03820800; + rp->alu.inst[rp->v_pos].inst0 = 0x00050A14; + rp->alu.inst[rp->v_pos].inst1 = 0x00020820; rp->v_pos++; } else { - rp->alu.inst[rp->s_pos].inst2 = 0x00040889; - rp->alu.inst[rp->s_pos].inst3 = 0x00820800; + rp->alu.inst[rp->s_pos].inst2 = 0x00040810; + rp->alu.inst[rp->s_pos].inst3 = 0x00020820; rp->s_pos++; } } @@ -550,25 +526,68 @@ static void emit_tex(struct r300_fragment_program *rp, rp->node[rp->cur_node].tex_end++; } +#define ARG_NEG (1<<5) +#define ARG_ABS (1<<6) +#define SRC_CONST (1<<5) +#define SRC_STRIDE 6 + +static int t_hw_src(struct r300_fragment_program *rp, pfs_reg_t src) +{ + int idx; + + switch (src.type) { + case REG_TYPE_TEMP: + idx = rp->temps[src.index]; + break; + case REG_TYPE_INPUT: + idx = rp->inputs[src.index]; + break; + case REG_TYPE_CONST: + return (src.index | SRC_CONST); + default: + ERROR("Invalid type for source reg\n"); + return (0 | SRC_CONST); + } + + rp->used_in_node |= (1 << idx); + return idx; +} + +/* Add sources to FPI1/FPI3 lists. If source is already on list, + * reuse the index instead of wasting a source. + */ +static inline int add_src(int src[3], int *cnt, int reg) { + int i; + + for (i=0;i<*cnt;i++) + if (src[i] == reg) return i; + + if (*cnt == 3) assert(0); /* I don't *think* this can happen */ + + src[*cnt] = reg; + return (*cnt)++; +} + static void emit_arith(struct r300_fragment_program *rp, int op, pfs_reg_t dest, int mask, pfs_reg_t src0, pfs_reg_t src1, pfs_reg_t src2, int flags) { pfs_reg_t src[3] = { src0, src1, src2 }; + /* XYZ/W emit control */ + int v_idx = rp->v_pos, s_idx = rp->s_pos; + GLboolean emit_v = GL_FALSE, emit_s = GL_FALSE; + /* INST1/INST3 sources */ + int vsrc[3], ssrc[3]; + int nvs = 0, nss = 0; + /* INST0/INST2 sources */ + int vswz[3], sswz[3]; + /* temp stuff */ int hwdest, hwsrc; int argc; - int v_idx = rp->v_pos, s_idx = rp->s_pos; - GLuint inst[4] = { 0, 0, 0, 0 }; int vop, sop; int i; - -#define ARG_NEG (1<<5) -#define ARG_ABS (1<<6) -#define ARG_STRIDE 7 -#define SRC_CONST (1<<5) -#define SRC_STRIDE 6 - + if (!dest.valid || !src0.valid || !src1.valid || !src2.valid) { ERROR("invalid register. dest/src0/src1/src2 valid = %d/%d/%d/%d\n", dest.valid, src0.valid, src1.valid, src2.valid); @@ -598,96 +617,91 @@ static void emit_arith(struct r300_fragment_program *rp, int op, ERROR("invalid dest reg type %d\n", dest.type); return; } - - /* grab hwregs of sources */ + + int str; for (i=0;i<3;i++) { if (iinputs[src[i].index]; - rp->used_in_node |= (1 << hwsrc); - - inst[1] |= hwsrc << (i * SRC_STRIDE); - inst[3] |= hwsrc << (i * SRC_STRIDE); - break; - case REG_TYPE_TEMP: - /* make sure insn ordering is right... */ - if ((v_swiz[src[i].v_swz].dep_sca && v_idx < s_idx) || - (s_swiz[src[i].s_swz].dep_vec && s_idx < v_idx)) { + hwsrc = t_hw_src(rp, src[i]); + if (mask & WRITEMASK_XYZ && vop != R300_FPI0_OUTC_REPL_ALPHA) { + if (v_swiz[src[i].v_swz].dep_sca) { sync_streams(rp); v_idx = s_idx = rp->v_pos; - } + emit_s = GL_TRUE; + str = add_src(ssrc, &nss, hwsrc); + } else + str = add_src(vsrc, &nvs, hwsrc); + vswz[i] = v_swiz[src[i].v_swz].base + (str * v_swiz[src[i].v_swz].stride); + } else + vswz[i] = R300_FPI0_ARGC_ZERO; + + if (mask & WRITEMASK_W || vop == R300_FPI0_OUTC_REPL_ALPHA) { + if (s_swiz[src[i].s_swz].dep_vec) { + sync_streams(rp); + v_idx = s_idx = rp->v_pos; + emit_v = GL_TRUE; + str = add_src(vsrc, &nvs, hwsrc); + } else + str = add_src(ssrc, &nss, hwsrc); + sswz[i] = s_swiz[src[i].s_swz].base + (str * s_swiz[src[i].s_swz].stride); + } else + sswz[i] = R300_FPI2_ARGA_ZERO; - hwsrc = rp->temps[src[i].index]; - rp->used_in_node |= (1 << hwsrc); - - inst[1] |= hwsrc << (i * SRC_STRIDE); - inst[3] |= hwsrc << (i * SRC_STRIDE); - break; - case REG_TYPE_CONST: - hwsrc = src[i].index; - - inst[1] |= ((hwsrc | SRC_CONST) << (i * SRC_STRIDE)); - inst[3] |= ((hwsrc | SRC_CONST) << (i * SRC_STRIDE)); - break; - default: - ERROR("invalid source reg\n"); - return; - } - - /* Swizzling/Negation */ - if (vop == R300_FPI0_OUTC_REPL_ALPHA) - inst[0] |= R300_FPI0_ARGC_ZERO << (i * ARG_STRIDE); - else - inst[0] |= (v_swiz[src[i].v_swz].base + (i * v_swiz[src[i].v_swz].stride)) << (i*ARG_STRIDE); - inst[2] |= (s_swiz[src[i].s_swz].base + (i * s_swiz[src[i].s_swz].stride)) << (i*ARG_STRIDE); - if (src[i].negate) { - inst[0] |= ARG_NEG << (i * ARG_STRIDE); - inst[2] |= ARG_NEG << (i * ARG_STRIDE); + vswz[i] |= ARG_NEG; + sswz[i] |= ARG_NEG; } - + if (flags & PFS_FLAG_ABS) { - inst[0] |= ARG_ABS << (i * ARG_STRIDE); - inst[2] |= ARG_ABS << (i * ARG_STRIDE); + vswz[i] |= ARG_ABS; + sswz[i] |= ARG_ABS; } } else { - /* read constant 0, use zero swizzle aswell */ - inst[0] |= R300_FPI0_ARGC_ZERO << (i*ARG_STRIDE); - inst[1] |= SRC_CONST << (i*SRC_STRIDE); - inst[2] |= R300_FPI2_ARGA_ZERO << (i*ARG_STRIDE); - inst[3] |= SRC_CONST << (i*SRC_STRIDE); + vswz[i] = R300_FPI0_ARGC_ZERO; + sswz[i] = R300_FPI2_ARGA_ZERO; } } + /* Unused sources, read constant reg 0 */ + for (i=nvs;i<3;i++) + vsrc[i] = 0 | SRC_CONST; + for (i=nss;i<3;i++) + ssrc[i] = 0 | SRC_CONST; if (flags & PFS_FLAG_SAT) { vop |= R300_FPI0_OUTC_SAT; sop |= R300_FPI2_OUTA_SAT; } - - if (mask & WRITEMASK_XYZ) { + + if (mask & WRITEMASK_XYZ || emit_v) { if (r300_fpop[op].v_op == R300_FPI0_OUTC_REPL_ALPHA) { sync_streams(rp); s_idx = v_idx = rp->v_pos; } - rp->alu.inst[v_idx].inst0 = inst[0] | vop; - rp->alu.inst[v_idx].inst1 = inst[1] | - (hwdest << R300_FPI1_DSTC_SHIFT) | + rp->alu.inst[v_idx].inst0 = vop | + vswz[0] << R300_FPI0_ARG0C_SHIFT | + vswz[1] << R300_FPI0_ARG1C_SHIFT | + vswz[2] << R300_FPI0_ARG2C_SHIFT; + rp->alu.inst[v_idx].inst1 = hwdest << R300_FPI1_DSTC_SHIFT | + vsrc[0] << R300_FPI1_SRC0C_SHIFT | + vsrc[1] << R300_FPI1_SRC1C_SHIFT | + vsrc[2] << R300_FPI1_SRC2C_SHIFT | ((mask & WRITEMASK_XYZ) << (dest.type == REG_TYPE_OUTPUT ? 26 : 23)); rp->v_pos = v_idx + 1; } - - if ((mask & WRITEMASK_W) || r300_fpop[op].v_op == R300_FPI0_OUTC_REPL_ALPHA) { - rp->alu.inst[s_idx].inst2 = inst[2] | sop; - rp->alu.inst[s_idx].inst3 = inst[3] | - (hwdest << R300_FPI3_DSTA_SHIFT) | + + if (mask & WRITEMASK_W || emit_s || vop == R300_FPI0_OUTC_REPL_ALPHA) { + rp->alu.inst[s_idx].inst2 = sop | + sswz[0] << R300_FPI2_ARG0A_SHIFT | + sswz[1] << R300_FPI2_ARG1A_SHIFT | + sswz[2] << R300_FPI2_ARG2A_SHIFT; + rp->alu.inst[s_idx].inst3 = hwdest << R300_FPI3_DSTA_SHIFT | + ssrc[0] << R300_FPI3_SRC0A_SHIFT | + ssrc[1] << R300_FPI3_SRC1A_SHIFT | + ssrc[2] << R300_FPI3_SRC2A_SHIFT | (((mask & WRITEMASK_W)?1:0) << (dest.type == REG_TYPE_OUTPUT ? 24 : 23)); rp->s_pos = s_idx + 1; } -/* Force this for now */ - sync_streams(rp); +/* sync_streams(rp); */ return; }; @@ -791,17 +805,14 @@ static GLboolean parse_program(struct r300_fragment_program *rp) flags); break; case FP_OPCODE_POW: - /* I don't like this, and it's probably wrong in some - * circumstances... Needs checking */ src0 = t_src(rp, fpi->SrcReg[0]); src1 = t_src(rp, fpi->SrcReg[1]); dest = t_dst(rp, fpi->DstReg); temp = get_temp_reg(rp); - temp.s_swz = SWIZZLE_X; /* cheat, bypass swizzle code */ - emit_arith(rp, PFS_OP_LG2, temp, WRITEMASK_X, + emit_arith(rp, PFS_OP_LG2, temp, WRITEMASK_W, src0, pfs_zero, pfs_zero, 0); - emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_X, + emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_W, temp, src1, pfs_zero, 0); emit_arith(rp, PFS_OP_EX2, dest, fpi->DstReg.WriteMask, temp, pfs_zero, pfs_zero, 0); @@ -969,12 +980,12 @@ void translate_fragment_shader(struct r300_fragment_program *rp) if (!rp->translated) { init_program(rp); - + if (parse_program(rp) == GL_FALSE) { dump_program(rp); return; } - + /* Finish off */ sync_streams(rp); rp->node[rp->cur_node].alu_end = rp->v_pos - 1; -- 2.30.2