- Remove one of the loops in emit_arith
author: Ben Skeggs <darktama@iinet.net.au>
Wed, 25 May 2005 06:46:10 +0000 (06:46 +0000)
committer: Ben Skeggs <darktama@iinet.net.au>
Wed, 25 May 2005 06:46:10 +0000 (06:46 +0000)
- Handle REPL_ALPHA in emit_arith (possibly incorrect for some things)
- Start on getting demos/arbfplight.c to look right.  It won't be animated
  yet: const emit needs to be reworked so that consts can be updated without
  re-translating the entire program.

The assertion in r300_state.c::setup_rs_unit needs to be disabled for the
demo to work.

src/mesa/drivers/dri/r300/r300_fragprog.c
src/mesa/drivers/dri/r300/r300_fragprog.h
src/mesa/drivers/dri/r300/r300_reg.h

index dbc650d437c2d2786515443a18be3ccfbed7b565..fc10e368d682e8e178d25e507f183b537a26c615 100644 (file)
@@ -38,6 +38,8 @@
  * - Reuse input/temp regs, if they're no longer needed.
  * - Find out whether there's any benifit in ordering registers the way
  *   fglrx does (see r300_reg.h).
+ * - Verify results of opcodes for accuracy, I've only checked them
+ *   in specific cases.
  * - and more...
  */
 
@@ -78,11 +80,11 @@ const struct {
        { "MAX", 2, R300_FPI0_OUTC_MAX, R300_FPI2_OUTA_MAX },
        { "CMP", 3, R300_FPI0_OUTC_CMP, R300_FPI2_OUTA_CMP },
        { "FRC", 1, R300_FPI0_OUTC_FRC, R300_FPI2_OUTA_FRC },
-/* should the vector insns below be REPL_ALPHA? */
-       { "EX2", 1, PFS_INVAL, R300_FPI2_OUTA_EX2 },
-       { "LG2", 1, PFS_INVAL, R300_FPI2_OUTA_LG2 },
-       { "RCP", 1, PFS_INVAL, R300_FPI2_OUTA_RCP },
-       { "RSQ", 1, PFS_INVAL, R300_FPI2_OUTA_RSQ },
+       { "EX2", 1, R300_FPI0_OUTC_REPL_ALPHA, R300_FPI2_OUTA_EX2 },
+       { "LG2", 1, R300_FPI0_OUTC_REPL_ALPHA, R300_FPI2_OUTA_LG2 },
+       { "RCP", 1, R300_FPI0_OUTC_REPL_ALPHA, R300_FPI2_OUTA_RCP },
+       { "RSQ", 1, R300_FPI0_OUTC_REPL_ALPHA, R300_FPI2_OUTA_RSQ },
+       { "REPL_ALPHA", 1, R300_FPI0_OUTC_REPL_ALPHA, PFS_INVAL }
 };
 
 #define MAKE_SWZ3(x, y, z) (MAKE_SWIZZLE4(SWIZZLE_##x, \
@@ -545,12 +547,19 @@ static void emit_arith(struct r300_fragment_program *rp, int op,
                                int flags)
 {
        pfs_reg_t src[3] = { src0, src1, src2 };
-       int hwdest, hwsrc[3];
+       int hwdest, hwsrc;
        int argc;
        int v_idx = rp->v_pos, s_idx = rp->s_pos;
        GLuint inst[4] = { 0, 0, 0, 0 }; 
+       int vop, sop;
        int i;
 
+#define ARG_NEG        (1<<5)
+#define ARG_ABS (1<<6)
+#define ARG_STRIDE 7
+#define SRC_CONST (1<<5)
+#define SRC_STRIDE 6
+
        if (!dest.valid || !src0.valid || !src1.valid || !src2.valid) {
                ERROR("invalid register.  dest/src0/src1/src2 valid = %d/%d/%d/%d\n",
                                                dest.valid, src0.valid, src1.valid, src2.valid);
@@ -563,34 +572,9 @@ static void emit_arith(struct r300_fragment_program *rp, int op,
                return;
        }
        argc = r300_fpop[op].argc;
+       vop = r300_fpop[op].v_op;
+       sop = r300_fpop[op].s_op;
 
-       /* grab hwregs of sources */
-       for (i=0;i<argc;i++) {
-               switch (src[i].type) {
-               case REG_TYPE_INPUT:
-                       hwsrc[i] = rp->inputs[src[i].index];
-                       rp->used_in_node |= (1 << hwsrc[i]);
-                       break;
-               case REG_TYPE_TEMP:
-                       /* make sure insn ordering is right... */
-                       if ((src[i].vcross && v_idx < s_idx) ||
-                               (src[i].scross && s_idx < v_idx)) {
-                               sync_streams(rp);
-                               v_idx = s_idx = rp->v_pos;
-                       }
-                       
-                       hwsrc[i] = rp->temps[src[i].index];
-                       rp->used_in_node |= (1 << hwsrc[i]);
-                       break;
-               case REG_TYPE_CONST:
-                       hwsrc[i] = src[i].index;
-                       break;
-               default:
-                       ERROR("invalid source reg\n");
-                       return;
-               }
-       }
-       
        /* grab hwregs of dest */
        switch (dest.type) {
        case REG_TYPE_TEMP:
@@ -606,42 +590,90 @@ static void emit_arith(struct r300_fragment_program *rp, int op,
                return;
        }
 
+       /* grab hwregs of sources */
        for (i=0;i<3;i++) {
-               if (i < argc) {
-                       inst[0] |= (v_swiz[src[i].v_swz].base + (i * v_swiz[src[i].v_swz].stride)) << (i * 7);
-                       inst[2] |= (s_swiz[src[i].s_swz].base + (i * s_swiz[src[i].s_swz].stride)) << (i * 7);
+               if (i<argc) {
+                       /* Decide on hardware source index */
+                       switch (src[i].type) {
+                       case REG_TYPE_INPUT:
+                               hwsrc = rp->inputs[src[i].index];
+                               rp->used_in_node |= (1 << hwsrc);
+
+                               inst[1] |= hwsrc << (i * SRC_STRIDE);
+                               inst[3] |= hwsrc << (i * SRC_STRIDE);
+                               break;
+                       case REG_TYPE_TEMP:
+                               /* make sure insn ordering is right... */
+                               if ((src[i].vcross && v_idx < s_idx) ||
+                                       (src[i].scross && s_idx < v_idx)) {
+                                       sync_streams(rp);
+                                       v_idx = s_idx = rp->v_pos;
+                               }
+               
+                               hwsrc = rp->temps[src[i].index];
+                               rp->used_in_node |= (1 << hwsrc);
+
+                               inst[1] |= hwsrc << (i * SRC_STRIDE);
+                               inst[3] |= hwsrc << (i * SRC_STRIDE);
+                               break;
+                       case REG_TYPE_CONST:
+                               hwsrc = src[i].index;
+
+                               inst[1] |= ((hwsrc | SRC_CONST) << (i * SRC_STRIDE));
+                               inst[3] |= ((hwsrc | SRC_CONST) << (i * SRC_STRIDE));
+                               break;
+                       default:
+                               ERROR("invalid source reg\n");
+                               return;
+                       }
+
+                       /* Swizzling/Negation */
+                       if (vop == R300_FPI0_OUTC_REPL_ALPHA)
+                               inst[0] |= R300_FPI0_ARGC_ZERO << (i * ARG_STRIDE);
+                       else
+                               inst[0] |= (v_swiz[src[i].v_swz].base + (i * v_swiz[src[i].v_swz].stride)) << (i*ARG_STRIDE);
+                       inst[2] |= (s_swiz[src[i].s_swz].base + (i * s_swiz[src[i].s_swz].stride)) << (i*ARG_STRIDE);
+
                        if (src[i].negate) {
-                               inst[0] |= (1<<5) << (i*7);
-                               inst[2] |= (1<<5) << (i*7);
+                               inst[0] |= ARG_NEG << (i * ARG_STRIDE);
+                               inst[2] |= ARG_NEG << (i * ARG_STRIDE);
                        }
-                       inst[1] |= hwsrc[i] << (i*6);
-                       inst[3] |= hwsrc[i] << (i*6);
-                       if (src[i].type == REG_TYPE_CONST) {
-                               inst[1] |= (1<<5) << (i*6);
-                               inst[3] |= (1<<5) << (i*6);
+                       
+                       if (flags & PFS_FLAG_ABS) {
+                               inst[0] |= ARG_ABS << (i * ARG_STRIDE);
+                               inst[2] |= ARG_ABS << (i * ARG_STRIDE); 
                        }
                } else {
-                       /* read constant zero, may aswell use a ZERO swizzle aswell.. */
-                       inst[0] |= R300_FPI0_ARGC_ZERO << (i*7);
-                       inst[2] |= R300_FPI2_ARGA_ZERO << (i*7);
-                       inst[1] |= (1<<5) << (i*6);
-                       inst[3] |= (1<<5) << (i*6);
+                       /* read constant 0, use zero swizzle aswell */
+                       inst[0] |= R300_FPI0_ARGC_ZERO << (i*ARG_STRIDE);
+                       inst[1] |= SRC_CONST << (i*SRC_STRIDE);
+                       inst[2] |= R300_FPI2_ARGA_ZERO << (i*ARG_STRIDE);
+                       inst[3] |= SRC_CONST << (i*SRC_STRIDE);
                }
        }
 
+       if (flags & PFS_FLAG_SAT) {
+               vop |= R300_FPI0_OUTC_SAT;
+               sop |= R300_FPI2_OUTA_SAT;
+       }
+               
        if (mask & WRITEMASK_XYZ) {
-               rp->alu.inst[v_idx].inst0 = inst[0] | r300_fpop[op].v_op | flags;
+               if (r300_fpop[op].v_op == R300_FPI0_OUTC_REPL_ALPHA) {
+                       sync_streams(rp);
+                       s_idx = v_idx = rp->v_pos;
+               }
+               rp->alu.inst[v_idx].inst0 = inst[0] | vop;
                rp->alu.inst[v_idx].inst1 = inst[1] |
                                (hwdest << R300_FPI1_DSTC_SHIFT) |
                                ((mask & WRITEMASK_XYZ) << (dest.type == REG_TYPE_OUTPUT ? 26 : 23));
                rp->v_pos = v_idx + 1;
        }
        
-       if (mask & WRITEMASK_W) {
-               rp->alu.inst[s_idx].inst2 = inst[2] | r300_fpop[op].s_op | flags;
+       if ((mask & WRITEMASK_W) || r300_fpop[op].v_op == R300_FPI0_OUTC_REPL_ALPHA) {
+               rp->alu.inst[s_idx].inst2 = inst[2] | sop;
                rp->alu.inst[s_idx].inst3 = inst[3] |
                                (hwdest << R300_FPI3_DSTA_SHIFT) |
-                               (1 << (dest.type == REG_TYPE_OUTPUT ? 24 : 23));
+                               (((mask & WRITEMASK_W)?1:0) << (dest.type == REG_TYPE_OUTPUT ? 24 : 23));
                rp->s_pos = s_idx + 1;
        }
 
@@ -663,7 +695,9 @@ static GLboolean parse_program(struct r300_fragment_program *rp)
        }
 
        for (fpi=mp->Instructions; fpi->Opcode != FP_OPCODE_END; fpi++) {
-               if (inst->Saturate) flags = R300_FPI0_OUTC_SAT; /* same for OUTA */
+               if (fpi->Saturate) {
+                       flags = PFS_FLAG_SAT;
+               }
                
                switch (fpi->Opcode) {
                case FP_OPCODE_ABS:
@@ -681,6 +715,20 @@ static GLboolean parse_program(struct r300_fragment_program *rp)
                        ERROR("unknown fpi->Opcode %d\n", fpi->Opcode);
                        break;
                case FP_OPCODE_DP3:
+                       dest = t_dst(rp, fpi->DstReg);
+                       if (fpi->DstReg.WriteMask & WRITEMASK_W) {
+                               /* I assume these need to share the same alu slot */
+                               sync_streams(rp);
+                               emit_arith(rp, PFS_OP_DP4, dest, WRITEMASK_W, 
+                                                               pfs_zero, pfs_zero, pfs_zero,
+                                                               flags);
+                       }
+                       emit_arith(rp, PFS_OP_DP3, t_dst(rp, fpi->DstReg),
+                                                       fpi->DstReg.WriteMask & WRITEMASK_XYZ,
+                                                       t_src(rp, fpi->SrcReg[0]),
+                                                       t_src(rp, fpi->SrcReg[1]),
+                                                       pfs_zero, flags);
+                       break;
                case FP_OPCODE_DP4:
                case FP_OPCODE_DPH:
                case FP_OPCODE_DST:
@@ -732,8 +780,31 @@ static GLboolean parse_program(struct r300_fragment_program *rp)
                                                        flags);
                        break;
                case FP_OPCODE_POW:
+                       /* I don't like this, and it's probably wrong in some
+                        * circumstances... Needs checking */
+                       src0 = t_src(rp, fpi->SrcReg[0]);
+                       src1 = t_src(rp, fpi->SrcReg[1]);
+                       dest = t_dst(rp, fpi->DstReg);
+                       temp = get_temp_reg(rp);
+                       temp.s_swz = SWIZZLE_X; /* cheat, bypass swizzle code */
+
+                       emit_arith(rp, PFS_OP_LG2, temp, WRITEMASK_X,
+                                                       src0, pfs_zero, pfs_zero, 0);
+                       emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_X,
+                                                       temp, src1, pfs_zero, 0);
+                       emit_arith(rp, PFS_OP_EX2, dest, fpi->DstReg.WriteMask,
+                                                       temp, pfs_zero, pfs_zero, 0);
+                       free_temp(rp, temp);
+                       break;
                case FP_OPCODE_RCP:
+                       ERROR("unknown fpi->Opcode %d\n", fpi->Opcode);
+                       break;
                case FP_OPCODE_RSQ:
+                       emit_arith(rp, PFS_OP_RSQ, t_dst(rp, fpi->DstReg),
+                                                       fpi->DstReg.WriteMask,
+                                                       t_src(rp, fpi->SrcReg[0]), pfs_zero, pfs_zero,
+                                                       flags | PFS_FLAG_ABS);
+                       break;
                case FP_OPCODE_SCS:
                case FP_OPCODE_SGE:
                case FP_OPCODE_SIN:
@@ -873,7 +944,7 @@ void init_program(struct r300_fragment_program *rp)
 void translate_fragment_shader(struct r300_fragment_program *rp)
 {
        int i;
-       
+
        init_program(rp);
        
        if (parse_program(rp) == GL_FALSE) {
index b98c6c03a772881006cd8c398aba0442aa2bd992..26e4ae56a95dc69a73e082fff92d605f211882c0 100644 (file)
@@ -39,8 +39,12 @@ typedef struct _pfs_reg_t {
 #define PFS_OP_LG2 8
 #define PFS_OP_RCP 9
 #define PFS_OP_RSQ 10
-#define MAX_PFS_OP 10
+#define PFS_OP_REPL_ALPHA 11
+#define MAX_PFS_OP 11
 #define OP(n) PFS_OP_##n
 
+#define PFS_FLAG_SAT   (1 << 0)
+#define PFS_FLAG_ABS   (1 << 1)
+
 #endif
 
index 3d090c3710843cd527b4d9d2ac54acf13bd32f42..0beef346793d1627f42ca97d7b988b67e66350d7 100644 (file)
@@ -1000,12 +1000,15 @@ I am fairly certain that they are correct unless stated otherwise in comments.
 #       define R300_FPI2_ARG0A_SHIFT             0
 #       define R300_FPI2_ARG0A_MASK              (31 << 0)
 #       define R300_FPI2_ARG0A_NEG               (1 << 5)
+#              define R300_FPI2_ARG0A_ABS                               (1 << 6) /* GUESS */
 #       define R300_FPI2_ARG1A_SHIFT             7
 #       define R300_FPI2_ARG1A_MASK              (31 << 7)
 #       define R300_FPI2_ARG1A_NEG               (1 << 12)
+#              define R300_FPI2_ARG1A_ABS                               (1 << 13) /* GUESS */
 #       define R300_FPI2_ARG2A_SHIFT             14
 #       define R300_FPI2_ARG2A_MASK              (31 << 14)
 #       define R300_FPI2_ARG2A_NEG               (1 << 19)
+#              define R300_FPI2_ARG2A_ABS                               (1 << 20) /* GUESS */
 #       define R300_FPI2_SPECIAL_LRP             (1 << 21)
 #       define R300_FPI2_OUTA_MAD                (0 << 23)
 #       define R300_FPI2_OUTA_DP4                (1 << 23)