* fglrx does (see r300_reg.h).
* - Verify results of opcodes for accuracy, I've only checked them
* in specific cases.
- * - Learn more about interaction between xyz/w units.. A few bugs are
- * caused by something I'm missing..
* - and more...
*/
{ "xxx", MAKE_SWZ3(X, X, X), GL_TRUE, R300_FPI0_ARGC_SRC0C_XXX, 4, GL_FALSE },
{ "yyy", MAKE_SWZ3(Y, Y, Y), GL_TRUE, R300_FPI0_ARGC_SRC0C_YYY, 4, GL_FALSE },
{ "zzz", MAKE_SWZ3(Z, Z, Z), GL_TRUE, R300_FPI0_ARGC_SRC0C_ZZZ, 4, GL_FALSE },
+ { "www", MAKE_SWZ3(W, W, W), GL_TRUE, R300_FPI0_ARGC_SRC0A, 1, GL_TRUE },
{ "yzx", MAKE_SWZ3(Y, Z, X), GL_TRUE, R300_FPI0_ARGC_SRC0C_YZX, 1, GL_FALSE },
{ "zxy", MAKE_SWZ3(Z, X, Y), GL_TRUE, R300_FPI0_ARGC_SRC0C_ZXY, 1, GL_FALSE },
- { "wzy", MAKE_SWZ3(W, Z, Y), GL_TRUE, R300_FPI0_ARGC_SRC0CA_WZY, 1, GL_TRUE },
+/* disable this for now, until I find a clean way of making sure xyz/w streams
+ * have a source in the same register slot.. */
+// { "wzy", MAKE_SWZ3(W, Z, Y), GL_TRUE, R300_FPI0_ARGC_SRC0CA_WZY, 1, GL_TRUE },
/* special cases */
- { NULL, MAKE_SWZ3(W, W, W), GL_FALSE, 0, 0, GL_FALSE},
{ NULL, MAKE_SWZ3(ONE, ONE, ONE), GL_FALSE, R300_FPI0_ARGC_ONE, 0, GL_FALSE},
{ NULL, MAKE_SWZ3(ZERO, ZERO, ZERO), GL_FALSE, R300_FPI0_ARGC_ZERO, 0, GL_FALSE},
{ NULL, PFS_INVAL, GL_FALSE, R300_FPI0_ARGC_HALF, 0, GL_FALSE},
};
#define SWIZZLE_XYZ 0
#define SWIZZLE_XXX 1
-#define SWIZZLE_WZY 6
-#define SWIZZLE_111 8
-#define SWIZZLE_000 9
-#define SWIZZLE_HHH 10
+#define SWIZZLE_WWW 4
+#define SWIZZLE_111 7
+#define SWIZZLE_000 8
+#define SWIZZLE_HHH 9
#define SWZ_X_MASK (7 << 0)
#define SWZ_Y_MASK (7 << 3)
pfs_reg_t ssrc = pfs_default_reg;
switch(GET_SWZ(v_swiz[src.v_swz].hash, 0)) {
- case SWIZZLE_W:
- ssrc = get_temp_reg(rp);
- src.v_swz = SWIZZLE_WZY;
- if (s_mask[mask].count == 3) {
- emit_arith(rp, PFS_OP_MAD, ssrc, WRITEMASK_XW, src, pfs_one, pfs_zero, 0);
- *r = ssrc;
- r->v_swz = SWIZZLE_XXX;
- r->s_swz = SWIZZLE_W;
- } else if (mc + s_mask[mask].count == 3) {
- if (!r->valid)
- *r = get_temp_reg(rp);
- emit_arith(rp, PFS_OP_MAD, ssrc, WRITEMASK_XW, src, pfs_one, pfs_zero, 0);
- ssrc.v_swz = SWIZZLE_XXX;
- emit_arith(rp, PFS_OP_MAD, *r, s_mask[mask].mask|WRITEMASK_W, ssrc, pfs_one, pfs_zero, 0);
- free_temp(rp, ssrc);
- } else {
- if (!r->valid)
- *r = get_temp_reg(rp);
- emit_arith(rp, PFS_OP_MAD, ssrc, WRITEMASK_X, src, pfs_one, pfs_zero, 0);
- ssrc.v_swz = SWIZZLE_XXX;
- emit_arith(rp, PFS_OP_MAD, *r, s_mask[mask].mask, ssrc, pfs_one, pfs_zero, 0);
- free_temp(rp, ssrc);
- }
- break;
case SWIZZLE_ONE:
case SWIZZLE_ZERO:
if (!r->valid)
/* Bring vector/scalar streams into sync, inserting nops into
* whatever stream is lagging behind
*
- * I'm using "MAD t0, t0, 1.0, 0.0" as a NOP
+ * Using NOP == MAD out.none, 0, 0, 0
*/
while (rp->v_pos != rp->s_pos) {
if (rp->s_pos > rp->v_pos) {
- rp->alu.inst[rp->v_pos].inst0 = 0x00050A80;
- rp->alu.inst[rp->v_pos].inst1 = 0x03820800;
+ rp->alu.inst[rp->v_pos].inst0 = 0x00050A14;
+ rp->alu.inst[rp->v_pos].inst1 = 0x00020820;
rp->v_pos++;
} else {
- rp->alu.inst[rp->s_pos].inst2 = 0x00040889;
- rp->alu.inst[rp->s_pos].inst3 = 0x00820800;
+ rp->alu.inst[rp->s_pos].inst2 = 0x00040810;
+ rp->alu.inst[rp->s_pos].inst3 = 0x00020820;
rp->s_pos++;
}
}
rp->node[rp->cur_node].tex_end++;
}
+#define ARG_NEG (1<<5)
+#define ARG_ABS (1<<6)
+#define SRC_CONST (1<<5)
+#define SRC_STRIDE 6
+
+static int t_hw_src(struct r300_fragment_program *rp, pfs_reg_t src)
+{
+ int idx;
+
+ switch (src.type) {
+ case REG_TYPE_TEMP:
+ idx = rp->temps[src.index];
+ break;
+ case REG_TYPE_INPUT:
+ idx = rp->inputs[src.index];
+ break;
+ case REG_TYPE_CONST:
+ return (src.index | SRC_CONST);
+ default:
+ ERROR("Invalid type for source reg\n");
+ return (0 | SRC_CONST);
+ }
+
+ rp->used_in_node |= (1 << idx);
+ return idx;
+}
+
+/* Add sources to FPI1/FPI3 lists. If source is already on list,
+ * reuse the index instead of wasting a source.
+ */
+static inline int add_src(int src[3], int *cnt, int reg) {
+ int i;
+
+ for (i=0;i<*cnt;i++)
+ if (src[i] == reg) return i;
+
+ if (*cnt == 3) assert(0); /* I don't *think* this can happen */
+
+ src[*cnt] = reg;
+ return (*cnt)++;
+}
+
static void emit_arith(struct r300_fragment_program *rp, int op,
pfs_reg_t dest, int mask,
pfs_reg_t src0, pfs_reg_t src1, pfs_reg_t src2,
int flags)
{
pfs_reg_t src[3] = { src0, src1, src2 };
+ /* XYZ/W emit control */
+ int v_idx = rp->v_pos, s_idx = rp->s_pos;
+ GLboolean emit_v = GL_FALSE, emit_s = GL_FALSE;
+ /* INST1/INST3 sources */
+ int vsrc[3], ssrc[3];
+ int nvs = 0, nss = 0;
+ /* INST0/INST2 sources */
+ int vswz[3], sswz[3];
+ /* temp stuff */
int hwdest, hwsrc;
int argc;
- int v_idx = rp->v_pos, s_idx = rp->s_pos;
- GLuint inst[4] = { 0, 0, 0, 0 };
int vop, sop;
int i;
-
-#define ARG_NEG (1<<5)
-#define ARG_ABS (1<<6)
-#define ARG_STRIDE 7
-#define SRC_CONST (1<<5)
-#define SRC_STRIDE 6
-
+
if (!dest.valid || !src0.valid || !src1.valid || !src2.valid) {
ERROR("invalid register. dest/src0/src1/src2 valid = %d/%d/%d/%d\n",
dest.valid, src0.valid, src1.valid, src2.valid);
ERROR("invalid dest reg type %d\n", dest.type);
return;
}
-
- /* grab hwregs of sources */
+
+ int str;
for (i=0;i<3;i++) {
if (i<argc) {
- /* Decide on hardware source index */
- switch (src[i].type) {
- case REG_TYPE_INPUT:
- hwsrc = rp->inputs[src[i].index];
- rp->used_in_node |= (1 << hwsrc);
-
- inst[1] |= hwsrc << (i * SRC_STRIDE);
- inst[3] |= hwsrc << (i * SRC_STRIDE);
- break;
- case REG_TYPE_TEMP:
- /* make sure insn ordering is right... */
- if ((v_swiz[src[i].v_swz].dep_sca && v_idx < s_idx) ||
- (s_swiz[src[i].s_swz].dep_vec && s_idx < v_idx)) {
+ hwsrc = t_hw_src(rp, src[i]);
+ if (mask & WRITEMASK_XYZ && vop != R300_FPI0_OUTC_REPL_ALPHA) {
+ if (v_swiz[src[i].v_swz].dep_sca) {
sync_streams(rp);
v_idx = s_idx = rp->v_pos;
- }
+ emit_s = GL_TRUE;
+ str = add_src(ssrc, &nss, hwsrc);
+ } else
+ str = add_src(vsrc, &nvs, hwsrc);
+ vswz[i] = v_swiz[src[i].v_swz].base + (str * v_swiz[src[i].v_swz].stride);
+ } else
+ vswz[i] = R300_FPI0_ARGC_ZERO;
+
+ if (mask & WRITEMASK_W || vop == R300_FPI0_OUTC_REPL_ALPHA) {
+ if (s_swiz[src[i].s_swz].dep_vec) {
+ sync_streams(rp);
+ v_idx = s_idx = rp->v_pos;
+ emit_v = GL_TRUE;
+ str = add_src(vsrc, &nvs, hwsrc);
+ } else
+ str = add_src(ssrc, &nss, hwsrc);
+ sswz[i] = s_swiz[src[i].s_swz].base + (str * s_swiz[src[i].s_swz].stride);
+ } else
+ sswz[i] = R300_FPI2_ARGA_ZERO;
- hwsrc = rp->temps[src[i].index];
- rp->used_in_node |= (1 << hwsrc);
-
- inst[1] |= hwsrc << (i * SRC_STRIDE);
- inst[3] |= hwsrc << (i * SRC_STRIDE);
- break;
- case REG_TYPE_CONST:
- hwsrc = src[i].index;
-
- inst[1] |= ((hwsrc | SRC_CONST) << (i * SRC_STRIDE));
- inst[3] |= ((hwsrc | SRC_CONST) << (i * SRC_STRIDE));
- break;
- default:
- ERROR("invalid source reg\n");
- return;
- }
-
- /* Swizzling/Negation */
- if (vop == R300_FPI0_OUTC_REPL_ALPHA)
- inst[0] |= R300_FPI0_ARGC_ZERO << (i * ARG_STRIDE);
- else
- inst[0] |= (v_swiz[src[i].v_swz].base + (i * v_swiz[src[i].v_swz].stride)) << (i*ARG_STRIDE);
- inst[2] |= (s_swiz[src[i].s_swz].base + (i * s_swiz[src[i].s_swz].stride)) << (i*ARG_STRIDE);
-
if (src[i].negate) {
- inst[0] |= ARG_NEG << (i * ARG_STRIDE);
- inst[2] |= ARG_NEG << (i * ARG_STRIDE);
+ vswz[i] |= ARG_NEG;
+ sswz[i] |= ARG_NEG;
}
-
+
if (flags & PFS_FLAG_ABS) {
- inst[0] |= ARG_ABS << (i * ARG_STRIDE);
- inst[2] |= ARG_ABS << (i * ARG_STRIDE);
+ vswz[i] |= ARG_ABS;
+ sswz[i] |= ARG_ABS;
}
} else {
- /* read constant 0, use zero swizzle aswell */
- inst[0] |= R300_FPI0_ARGC_ZERO << (i*ARG_STRIDE);
- inst[1] |= SRC_CONST << (i*SRC_STRIDE);
- inst[2] |= R300_FPI2_ARGA_ZERO << (i*ARG_STRIDE);
- inst[3] |= SRC_CONST << (i*SRC_STRIDE);
+ vswz[i] = R300_FPI0_ARGC_ZERO;
+ sswz[i] = R300_FPI2_ARGA_ZERO;
}
}
+ /* Unused sources, read constant reg 0 */
+ for (i=nvs;i<3;i++)
+ vsrc[i] = 0 | SRC_CONST;
+ for (i=nss;i<3;i++)
+ ssrc[i] = 0 | SRC_CONST;
if (flags & PFS_FLAG_SAT) {
vop |= R300_FPI0_OUTC_SAT;
sop |= R300_FPI2_OUTA_SAT;
}
-
- if (mask & WRITEMASK_XYZ) {
+
+ if (mask & WRITEMASK_XYZ || emit_v) {
if (r300_fpop[op].v_op == R300_FPI0_OUTC_REPL_ALPHA) {
sync_streams(rp);
s_idx = v_idx = rp->v_pos;
}
- rp->alu.inst[v_idx].inst0 = inst[0] | vop;
- rp->alu.inst[v_idx].inst1 = inst[1] |
- (hwdest << R300_FPI1_DSTC_SHIFT) |
+ rp->alu.inst[v_idx].inst0 = vop |
+ vswz[0] << R300_FPI0_ARG0C_SHIFT |
+ vswz[1] << R300_FPI0_ARG1C_SHIFT |
+ vswz[2] << R300_FPI0_ARG2C_SHIFT;
+ rp->alu.inst[v_idx].inst1 = hwdest << R300_FPI1_DSTC_SHIFT |
+ vsrc[0] << R300_FPI1_SRC0C_SHIFT |
+ vsrc[1] << R300_FPI1_SRC1C_SHIFT |
+ vsrc[2] << R300_FPI1_SRC2C_SHIFT |
((mask & WRITEMASK_XYZ) << (dest.type == REG_TYPE_OUTPUT ? 26 : 23));
rp->v_pos = v_idx + 1;
}
-
- if ((mask & WRITEMASK_W) || r300_fpop[op].v_op == R300_FPI0_OUTC_REPL_ALPHA) {
- rp->alu.inst[s_idx].inst2 = inst[2] | sop;
- rp->alu.inst[s_idx].inst3 = inst[3] |
- (hwdest << R300_FPI3_DSTA_SHIFT) |
+
+ if (mask & WRITEMASK_W || emit_s || vop == R300_FPI0_OUTC_REPL_ALPHA) {
+ rp->alu.inst[s_idx].inst2 = sop |
+ sswz[0] << R300_FPI2_ARG0A_SHIFT |
+ sswz[1] << R300_FPI2_ARG1A_SHIFT |
+ sswz[2] << R300_FPI2_ARG2A_SHIFT;
+ rp->alu.inst[s_idx].inst3 = hwdest << R300_FPI3_DSTA_SHIFT |
+ ssrc[0] << R300_FPI3_SRC0A_SHIFT |
+ ssrc[1] << R300_FPI3_SRC1A_SHIFT |
+ ssrc[2] << R300_FPI3_SRC2A_SHIFT |
(((mask & WRITEMASK_W)?1:0) << (dest.type == REG_TYPE_OUTPUT ? 24 : 23));
rp->s_pos = s_idx + 1;
}
-/* Force this for now */
- sync_streams(rp);
+/* sync_streams(rp); */
return;
};
flags);
break;
case FP_OPCODE_POW:
- /* I don't like this, and it's probably wrong in some
- * circumstances... Needs checking */
src0 = t_src(rp, fpi->SrcReg[0]);
src1 = t_src(rp, fpi->SrcReg[1]);
dest = t_dst(rp, fpi->DstReg);
temp = get_temp_reg(rp);
- temp.s_swz = SWIZZLE_X; /* cheat, bypass swizzle code */
- emit_arith(rp, PFS_OP_LG2, temp, WRITEMASK_X,
+ emit_arith(rp, PFS_OP_LG2, temp, WRITEMASK_W,
src0, pfs_zero, pfs_zero, 0);
- emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_X,
+ emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_W,
temp, src1, pfs_zero, 0);
emit_arith(rp, PFS_OP_EX2, dest, fpi->DstReg.WriteMask,
temp, pfs_zero, pfs_zero, 0);
if (!rp->translated) {
init_program(rp);
-
+
if (parse_program(rp) == GL_FALSE) {
dump_program(rp);
return;
}
-
+
/* Finish off */
sync_streams(rp);
rp->node[rp->cur_node].alu_end = rp->v_pos - 1;