r300: Fix fragment program instruction pairing and register allocation
authorNicolai Haehnle <prefect@upb.de>
Sun, 18 Mar 2007 01:15:56 +0000 (02:15 +0100)
committerNicolai Haehnle <nhaehnle@gmail.com>
Mon, 19 Mar 2007 17:38:07 +0000 (18:38 +0100)
There were a number of bugs related to the pairing of vector and scalar
operations where swizzles ended up using the wrong source register,
or an instruction was moved forward and ended up overwriting an aliased
register.

The new algorithm for register allocation is quite conservative and may
run out of registers before necessary. On the plus side, It Just Works.

Pairing is done whenever possible, and in more cases than before, so
in practice this change should be a net win.

src/mesa/drivers/dri/r300/r300_context.h
src/mesa/drivers/dri/r300/r300_fragprog.c
src/mesa/drivers/dri/r300/r300_reg.h

index bd9ed6f170e6d3ce60402f56e997431183bd949f..bc43953ff38099800251cdd173b39b4749767933 100644 (file)
@@ -647,38 +647,84 @@ struct r300_vertex_program_cont {
 #define PFS_NUM_TEMP_REGS      32
 #define PFS_NUM_CONST_REGS     16
 
-/* Tracking data for Mesa registers */
+/* Mapping Mesa registers to R300 temporaries */
 struct reg_acc {
        int reg;        /* Assigned hw temp */
        unsigned int refcount; /* Number of uses by mesa program */
 };
 
+/**
+ * Describe the current lifetime information for an R300 temporary
+ */
+struct reg_lifetime {
+       /* Index of the first slot where this register is free in the sense
+          that it can be used as a new destination register.
+          This is -1 if the register has been assigned to a Mesa register
+          and the last access to the register has not yet been emitted */
+       int free;
+       
+       /* Index of the first slot where this register is currently reserved.
+          This is used to stop e.g. a scalar operation from being moved
+          before the allocation time of a register that was first allocated
+          for a vector operation. */
+       int reserved;
+       
+       /* Index of the first slot in which the register can be used as a
+          source without losing the value that is written by the last
+          emitted instruction that writes to the register */
+       int vector_valid;
+       int scalar_valid;
+};
+
+
+/**
+ * Store usage information about an ALU instruction slot during the
+ * compilation of a fragment program.
+ */
+#define SLOT_SRC_VECTOR  (1<<0)
+#define SLOT_SRC_SCALAR  (1<<3)
+#define SLOT_SRC_BOTH    (SLOT_SRC_VECTOR | SLOT_SRC_SCALAR)
+#define SLOT_OP_VECTOR   (1<<16)
+#define SLOT_OP_SCALAR   (1<<17)
+#define SLOT_OP_BOTH     (SLOT_OP_VECTOR | SLOT_OP_SCALAR)
+
+struct r300_pfs_compile_slot {
+       /* Bitmask indicating which parts of the slot are used, using SLOT_ constants 
+          defined above */
+       unsigned int used;
+
+       /* Selected sources */
+       int vsrc[3];
+       int ssrc[3];
+};
+
+/**
+ * Store information during compilation of fragment programs.
+ */
 struct r300_pfs_compile_state {
-       int v_pos, s_pos;       /* highest ALU slots used */
-
-       /* Track some information gathered during opcode
-        * construction.
-        * 
-        * NOTE: Data is only set by the code, and isn't used yet.
-        */
-       struct {
-               int vsrc[3];
-               int ssrc[3];
-               int umask;
-       } slot[PFS_MAX_ALU_INST];
-
-       /* Used to map Mesa's inputs/temps onto hardware temps */
-       int temp_in_use;
-       struct reg_acc temps[PFS_NUM_TEMP_REGS];
-       struct reg_acc inputs[32]; /* don't actually need 32... */
-
-       /* Track usage of hardware temps, for register allocation,
-        * indirection detection, etc. */
-       int hwreg_in_use;
-       GLuint used_in_node;
-       GLuint dest_in_node;
+       int nrslots;       /* number of ALU slots used so far */
+       
+       /* Track which (parts of) slots are already filled with instructions */
+       struct r300_pfs_compile_slot slot[PFS_MAX_ALU_INST];
+       
+       /* Track the validity of R300 temporaries */
+       struct reg_lifetime hwtemps[PFS_NUM_TEMP_REGS];
+       
+       /* Used to map Mesa's inputs/temps onto hardware temps */
+       int temp_in_use;
+       struct reg_acc temps[PFS_NUM_TEMP_REGS];
+       struct reg_acc inputs[32]; /* don't actually need 32... */
+       
+       /* Track usage of hardware temps, for register allocation,
+        * indirection detection, etc. */
+       GLuint used_in_node;
+       GLuint dest_in_node;
 };
 
+/**
+ * Store everything about a fragment program that is needed
+ * to render with that program.
+ */
 struct r300_fragment_program {
        struct gl_fragment_program mesa_program;
 
index 251fd2608213161191ef6292c13219dd73c5c95f..b2c89ccb364f1990dbd7cf7fc3691434de5468e2 100644 (file)
@@ -94,8 +94,9 @@
 #define REG_NEGV_SHIFT         18
 #define REG_NEGS_SHIFT         19
 #define REG_ABS_SHIFT          20
-#define REG_NO_USE_SHIFT       21
-#define REG_VALID_SHIFT                22
+#define REG_NO_USE_SHIFT       21 // Hack for refcounting
+#define REG_VALID_SHIFT                22 // Does the register contain a defined value?
+#define REG_BUILTIN_SHIFT   23 // Is it a builtin (like all zero/all one)?
 
 #define REG_TYPE_MASK          (0x03 << REG_TYPE_SHIFT)
 #define REG_INDEX_MASK         (0x3F << REG_INDEX_SHIFT)
 #define REG_ABS_MASK           (0x01 << REG_ABS_SHIFT)
 #define REG_NO_USE_MASK                (0x01 << REG_NO_USE_SHIFT)
 #define REG_VALID_MASK         (0x01 << REG_VALID_SHIFT)
+#define REG_BUILTIN_MASK       (0x01 << REG_BUILTIN_SHIFT)
 
-#define REG(type, index, vswz, sswz, nouse, valid)                     \
+#define REG(type, index, vswz, sswz, nouse, valid, builtin)    \
        (((type << REG_TYPE_SHIFT) & REG_TYPE_MASK) |                   \
         ((index << REG_INDEX_SHIFT) & REG_INDEX_MASK) |                \
         ((nouse << REG_NO_USE_SHIFT) & REG_NO_USE_MASK) |              \
         ((valid << REG_VALID_SHIFT) & REG_VALID_MASK) |                \
+        ((builtin << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK) |  \
         ((vswz << REG_VSWZ_SHIFT) & REG_VSWZ_MASK) |                   \
         ((sswz << REG_SSWZ_SHIFT) & REG_SSWZ_MASK))
 #define REG_GET_TYPE(reg)                                              \
        ((reg & REG_NO_USE_MASK) >> REG_NO_USE_SHIFT)
 #define REG_GET_VALID(reg)                                             \
        ((reg & REG_VALID_MASK) >> REG_VALID_SHIFT)
+#define REG_GET_BUILTIN(reg)                                           \
+       ((reg & REG_BUILTIN_MASK) >> REG_BUILTIN_SHIFT)
 #define REG_SET_TYPE(reg, type)                                                \
        reg = ((reg & ~REG_TYPE_MASK) |                                 \
               ((type << REG_TYPE_SHIFT) & REG_TYPE_MASK))
 #define REG_SET_VALID(reg, valid)                                      \
        reg = ((reg & ~REG_VALID_MASK) |                                \
               ((valid << REG_VALID_SHIFT) & REG_VALID_MASK))
+#define REG_SET_BUILTIN(reg, builtin)                                  \
+       reg = ((reg & ~REG_BUILTIN_MASK) |                              \
+              ((builtin << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK))
 #define REG_ABS(reg)                                                   \
        reg = (reg | REG_ABS_MASK)
 #define REG_NEGV(reg)                                                  \
@@ -184,9 +192,6 @@ static const struct {
  *
  * REG_VSWZ/REG_SSWZ is an index into this table
  */
-#define SLOT_VECTOR    (1<<0)
-#define SLOT_SCALAR    (1<<3)
-#define SLOT_BOTH      (SLOT_VECTOR | SLOT_SCALAR)
 
 /* mapping from SWIZZLE_* to r300 native values for scalar insns */
 #define SWIZZLE_HALF 6
@@ -202,14 +207,14 @@ static const struct r300_pfs_swizzle {
        GLuint flags;
 } v_swiz[] = {
 /* native swizzles */
-       { MAKE_SWZ3(X, Y, Z), R300_FPI0_ARGC_SRC0C_XYZ, 4, SLOT_VECTOR },
-       { MAKE_SWZ3(X, X, X), R300_FPI0_ARGC_SRC0C_XXX, 4, SLOT_VECTOR },
-       { MAKE_SWZ3(Y, Y, Y), R300_FPI0_ARGC_SRC0C_YYY, 4, SLOT_VECTOR },
-       { MAKE_SWZ3(Z, Z, Z), R300_FPI0_ARGC_SRC0C_ZZZ, 4, SLOT_VECTOR },
-       { MAKE_SWZ3(W, W, W), R300_FPI0_ARGC_SRC0A,     1, SLOT_SCALAR },
-       { MAKE_SWZ3(Y, Z, X), R300_FPI0_ARGC_SRC0C_YZX, 1, SLOT_VECTOR },
-       { MAKE_SWZ3(Z, X, Y), R300_FPI0_ARGC_SRC0C_ZXY, 1, SLOT_VECTOR },
-       { MAKE_SWZ3(W, Z, Y), R300_FPI0_ARGC_SRC0CA_WZY, 1, SLOT_BOTH },
+       { MAKE_SWZ3(X, Y, Z), R300_FPI0_ARGC_SRC0C_XYZ, 4, SLOT_SRC_VECTOR },
+       { MAKE_SWZ3(X, X, X), R300_FPI0_ARGC_SRC0C_XXX, 4, SLOT_SRC_VECTOR },
+       { MAKE_SWZ3(Y, Y, Y), R300_FPI0_ARGC_SRC0C_YYY, 4, SLOT_SRC_VECTOR },
+       { MAKE_SWZ3(Z, Z, Z), R300_FPI0_ARGC_SRC0C_ZZZ, 4, SLOT_SRC_VECTOR },
+       { MAKE_SWZ3(W, W, W), R300_FPI0_ARGC_SRC0A,     1, SLOT_SRC_SCALAR },
+       { MAKE_SWZ3(Y, Z, X), R300_FPI0_ARGC_SRC0C_YZX, 1, SLOT_SRC_VECTOR },
+       { MAKE_SWZ3(Z, X, Y), R300_FPI0_ARGC_SRC0C_ZXY, 1, SLOT_SRC_VECTOR },
+       { MAKE_SWZ3(W, Z, Y), R300_FPI0_ARGC_SRC0CA_WZY, 1, SLOT_SRC_BOTH },
        { MAKE_SWZ3(ONE, ONE, ONE), R300_FPI0_ARGC_ONE, 0, 0},
        { MAKE_SWZ3(ZERO, ZERO, ZERO), R300_FPI0_ARGC_ZERO, 0, 0},
        { MAKE_SWZ3(HALF, HALF, HALF), R300_FPI0_ARGC_HALF, 0, 0},
@@ -241,10 +246,10 @@ static const struct {
        int stride;     /* difference between SRC0/1/2 */
        GLuint flags;
 } s_swiz[] = {
-       { R300_FPI2_ARGA_SRC0C_X, 3, SLOT_VECTOR },
-       { R300_FPI2_ARGA_SRC0C_Y, 3, SLOT_VECTOR },
-       { R300_FPI2_ARGA_SRC0C_Z, 3, SLOT_VECTOR },
-       { R300_FPI2_ARGA_SRC0A  , 1, SLOT_SCALAR },
+       { R300_FPI2_ARGA_SRC0C_X, 3, SLOT_SRC_VECTOR },
+       { R300_FPI2_ARGA_SRC0C_Y, 3, SLOT_SRC_VECTOR },
+       { R300_FPI2_ARGA_SRC0C_Z, 3, SLOT_SRC_VECTOR },
+       { R300_FPI2_ARGA_SRC0A  , 1, SLOT_SRC_SCALAR },
        { R300_FPI2_ARGA_ZERO   , 0, 0 },
        { R300_FPI2_ARGA_ONE    , 0, 0 },
        { R300_FPI2_ARGA_HALF   , 0, 0 }
@@ -256,6 +261,7 @@ static const GLuint undef = REG(REG_TYPE_TEMP,
                                SWIZZLE_XYZ,
                                SWIZZLE_W,
                                GL_FALSE,
+                               GL_FALSE,
                                GL_FALSE);
 
 /* constant one source */
@@ -264,6 +270,7 @@ static const GLuint pfs_one = REG(REG_TYPE_CONST,
                                  SWIZZLE_111,
                                  SWIZZLE_ONE,
                                  GL_FALSE,
+                                 GL_TRUE,
                                  GL_TRUE);
 
 /* constant half source */
@@ -272,6 +279,7 @@ static const GLuint pfs_half = REG(REG_TYPE_CONST,
                                   SWIZZLE_HHH,
                                   SWIZZLE_HALF,
                                   GL_FALSE,
+                                  GL_TRUE,
                                   GL_TRUE);
 
 /* constant zero source */
@@ -280,6 +288,7 @@ static const GLuint pfs_zero = REG(REG_TYPE_CONST,
                                   SWIZZLE_000,
                                   SWIZZLE_ZERO,
                                   GL_FALSE,
+                                  GL_TRUE,
                                   GL_TRUE);
 
 /*
@@ -291,47 +300,105 @@ static void emit_arith(struct r300_fragment_program *rp, int op,
                                GLuint src0, GLuint src1, GLuint src2,
                                int flags);
 
-/*
- * Helper functions prototypes
+/**
+ * Get an R300 temporary that can be written to in the given slot.
  */
-static int get_hw_temp(struct r300_fragment_program *rp)
+static int get_hw_temp(struct r300_fragment_program *rp, int slot)
 {
        COMPILE_STATE;
-       int r = ffs(~cs->hwreg_in_use);
-       if (!r) {
+       int r;
+       
+       for(r = 0; r < PFS_NUM_TEMP_REGS; ++r) {
+               if (cs->hwtemps[r].free >= 0 && cs->hwtemps[r].free <= slot)
+                       break;
+       }
+       
+       if (r >= PFS_NUM_TEMP_REGS) {
                ERROR("Out of hardware temps\n");
                return 0;
        }
-
-       cs->hwreg_in_use |= (1 << --r);
+       
+       // Reserved is used to avoid the following scenario:
+       //  R300 temporary X is first assigned to Mesa temporary Y during vector ops
+       //  R300 temporary X is then assigned to Mesa temporary Z for further vector ops
+       //  Then scalar ops on Mesa temporary Z are emitted and move back in time
+       //  to overwrite the value of temporary Y.
+       // End scenario.
+       cs->hwtemps[r].reserved = cs->hwtemps[r].free;
+       cs->hwtemps[r].free = -1;
+       
+       // Reset to some value that won't mess things up when the user
+       // tries to read from a temporary that hasn't been assigned a value yet.
+       // In the normal case, vector_valid and scalar_valid should be set to
+       // a sane value by the first emit that writes to this temporary.
+       cs->hwtemps[r].vector_valid = 0;
+       cs->hwtemps[r].scalar_valid = 0;
+       
        if (r > rp->max_temp_idx)
                rp->max_temp_idx = r;
-
+       
        return r;
 }
 
+/**
+ * Get an R300 temporary that will act as a TEX destination register.
+ */
 static int get_hw_temp_tex(struct r300_fragment_program *rp)
 {
        COMPILE_STATE;
        int r;
 
-       r = ffs(~(cs->hwreg_in_use | cs->used_in_node));
-       if (!r)
-               return get_hw_temp(rp); /* Will cause an indirection */
+       for(r = 0; r < PFS_NUM_TEMP_REGS; ++r) {
+               if (cs->used_in_node & (1 << r))
+                       continue;
+               
+               // Note: Be very careful here
+               if (cs->hwtemps[r].free >= 0 && cs->hwtemps[r].free <= 0)
+                       break;
+       }
+       
+       if (r >= PFS_NUM_TEMP_REGS)
+               return get_hw_temp(rp, 0); /* Will cause an indirection */
 
-       cs->hwreg_in_use |= (1 << --r);
+       cs->hwtemps[r].reserved = cs->hwtemps[r].free;
+       cs->hwtemps[r].free = -1;
+       
+       // Reset to some value that won't mess things up when the user
+       // tries to read from a temporary that hasn't been assigned a value yet.
+       // In the normal case, vector_valid and scalar_valid should be set to
+       // a sane value by the first emit that writes to this temporary.
+       cs->hwtemps[r].vector_valid = cs->nrslots;
+       cs->hwtemps[r].scalar_valid = cs->nrslots;
+       
        if (r > rp->max_temp_idx)
                rp->max_temp_idx = r;
 
        return r;
 }
 
+/**
+ * Mark the given hardware register as free.
+ */
 static void free_hw_temp(struct r300_fragment_program *rp, int idx)
 {
        COMPILE_STATE;
-       cs->hwreg_in_use &= ~(1<<idx);
+       
+       // Be very careful here. Consider sequences like
+       //  MAD r0, r1,r2,r3
+       //  TEX r4, ...
+       // The TEX instruction may be moved in front of the MAD instruction
+       // due to the way nodes work. We don't want to alias r1 and r4 in
+       // this case.
+       // I'm certain the register allocation could be further sanitized,
+       // but it's tricky because of stuff that can happen inside emit_tex
+       // and emit_arith.
+       cs->hwtemps[idx].free = cs->nrslots+1;
 }
 
+
+/**
+ * Create a new Mesa temporary register.
+ */
 static GLuint get_temp_reg(struct r300_fragment_program *rp)
 {
        COMPILE_STATE;
@@ -354,6 +421,10 @@ static GLuint get_temp_reg(struct r300_fragment_program *rp)
        return r;
 }
 
+/**
+ * Create a new Mesa temporary register that will act as the destination
+ * register for a texture read.
+ */
 static GLuint get_temp_reg_tex(struct r300_fragment_program *rp)
 {
        COMPILE_STATE;
@@ -376,6 +447,9 @@ static GLuint get_temp_reg_tex(struct r300_fragment_program *rp)
        return r;
 }
 
+/**
+ * Free a Mesa temporary and the associated R300 temporary.
+ */
 static void free_temp(struct r300_fragment_program *rp, GLuint r)
 {
        COMPILE_STATE;
@@ -762,10 +836,10 @@ static int t_hw_src(struct r300_fragment_program *rp,
        switch(REG_GET_TYPE(src)) {
        case REG_TYPE_TEMP:
                /* NOTE: if reg==-1 here, a source is being read that
-                *       hasn't been written to. Undefined results
+                *       hasn't been written to. Undefined results.
                 */
                if (cs->temps[index].reg == -1)
-                       cs->temps[index].reg = get_hw_temp(rp);
+                       cs->temps[index].reg = get_hw_temp(rp, cs->nrslots);
 
                idx = cs->temps[index].reg;
 
@@ -795,7 +869,8 @@ static int t_hw_src(struct r300_fragment_program *rp,
 
 static int t_hw_dst(struct r300_fragment_program *rp,
                    GLuint dest,
-                   GLboolean tex)
+                   GLboolean tex,
+                   int slot)
 {
        COMPILE_STATE;
        int idx;
@@ -806,7 +881,7 @@ static int t_hw_dst(struct r300_fragment_program *rp,
        case REG_TYPE_TEMP:
                if (cs->temps[REG_GET_INDEX(dest)].reg == -1) {
                        if (!tex) {
-                               cs->temps[index].reg = get_hw_temp(rp);
+                               cs->temps[index].reg = get_hw_temp(rp, slot);
                        } else {
                                cs->temps[index].reg = get_hw_temp_tex(rp);
                        }
@@ -839,26 +914,20 @@ static int t_hw_dst(struct r300_fragment_program *rp,
        return idx;
 }
 
-static void emit_nop(struct r300_fragment_program *rp,
-                    GLuint mask,
-                    GLboolean sync)
+static void emit_nop(struct r300_fragment_program *rp)
 {
        COMPILE_STATE;
        
-       if (sync)
-               cs->v_pos = cs->s_pos = MAX2(cs->v_pos, cs->s_pos);
-
-       if (mask & WRITEMASK_XYZ) {
-               rp->alu.inst[cs->v_pos].inst0 = NOP_INST0;
-               rp->alu.inst[cs->v_pos].inst1 = NOP_INST1;
-               cs->v_pos++;
-       }
-
-       if (mask & WRITEMASK_W) {
-               rp->alu.inst[cs->s_pos].inst2 = NOP_INST2;
-               rp->alu.inst[cs->s_pos].inst3 = NOP_INST3;
-               cs->s_pos++;
+       if (cs->nrslots >= PFS_MAX_ALU_INST) {
+               ERROR("Out of ALU instruction slots\n");
+               return;
        }
+       
+       rp->alu.inst[cs->nrslots].inst0 = NOP_INST0;
+       rp->alu.inst[cs->nrslots].inst1 = NOP_INST1;
+       rp->alu.inst[cs->nrslots].inst2 = NOP_INST2;
+       rp->alu.inst[cs->nrslots].inst3 = NOP_INST3;
+       cs->nrslots++;
 }
 
 static void emit_tex(struct r300_fragment_program *rp,
@@ -882,7 +951,7 @@ static void emit_tex(struct r300_fragment_program *rp,
                        rdest = dest;
                        dest = get_temp_reg_tex(rp);
                }
-               hwdest = t_hw_dst(rp, dest, GL_TRUE);
+               hwdest = t_hw_dst(rp, dest, GL_TRUE, rp->node[rp->cur_node].alu_offset);
                
                /* Use a temp that hasn't been used in this node, rather
                 * than causing an indirection
@@ -904,15 +973,11 @@ static void emit_tex(struct r300_fragment_program *rp,
             (din & (1<<hwsrc))) || (uin & (1<<hwdest))) {
                        
                /* Finish off current node */
-               cs->v_pos = cs->s_pos = MAX2(cs->v_pos, cs->s_pos);
-               if (rp->node[rp->cur_node].alu_offset == cs->v_pos) {
-                       /* No alu instructions in the node? Emit a NOP. */
-                       emit_nop(rp, WRITEMASK_XYZW, GL_TRUE);
-                       cs->v_pos = cs->s_pos = MAX2(cs->v_pos, cs->s_pos);
-               }
+               if (rp->node[rp->cur_node].alu_offset == cs->nrslots)
+                       emit_nop(rp);
                                
                rp->node[rp->cur_node].alu_end =
-                               cs->v_pos - rp->node[rp->cur_node].alu_offset - 1;
+                               cs->nrslots - rp->node[rp->cur_node].alu_offset - 1;
                assert(rp->node[rp->cur_node].alu_end >= 0);
 
                if (++rp->cur_node >= PFS_MAX_TEX_INDIRECT) {
@@ -922,7 +987,7 @@ static void emit_tex(struct r300_fragment_program *rp,
 
                /* Start new node */
                rp->node[rp->cur_node].tex_offset = rp->tex.length;
-               rp->node[rp->cur_node].alu_offset = cs->v_pos;
+               rp->node[rp->cur_node].alu_offset = cs->nrslots;
                rp->node[rp->cur_node].tex_end = -1;
                rp->node[rp->cur_node].alu_end = -1;    
                rp->node[rp->cur_node].flags = 0;
@@ -954,84 +1019,243 @@ static void emit_tex(struct r300_fragment_program *rp,
        }
 }
 
-/* Add sources to FPI1/FPI3 lists.  If source is already on list,
- * reuse the index instead of wasting a source.
+
+/**
+ * Returns the first slot where we could possibly allow writing to dest,
+ * according to register allocation.
  */
-static int add_src(struct r300_fragment_program *rp,
-                  int reg,
-                  int pos,
-                  int srcmask)
+static int get_earliest_allowed_write(
+               struct r300_fragment_program* rp,
+               GLuint dest)
 {
        COMPILE_STATE;
-       int csm, i;
-       
-       /* Look for matches */
-       for (i=0,csm=srcmask; i<3; i++,csm=csm<<1) {    
-               /* If sources have been allocated in this position(s)... */
-               if ((cs->slot[pos].umask & csm) == csm) {
-                       /* ... and the register number(s) match, re-use the
-                          source */
-                       if (srcmask == SLOT_VECTOR &&
-                           cs->slot[pos].vsrc[i] == reg)
-                               return i;
-                       if (srcmask == SLOT_SCALAR &&
-                           cs->slot[pos].ssrc[i] == reg)
-                               return i;
-                       if (srcmask == SLOT_BOTH &&
-                           cs->slot[pos].vsrc[i] == reg &&
-                           cs->slot[pos].ssrc[i] == reg)
-                               return i;
-               }
-       }
+       int idx;
+       GLuint index = REG_GET_INDEX(dest);
+       assert(REG_GET_VALID(dest));
 
-       /* Look for free spaces */
-       for (i=0,csm=srcmask; i<3; i++,csm=csm<<1) {
-               /* If the position(s) haven't been allocated */
-               if ((cs->slot[pos].umask & csm) == 0) {
-                       cs->slot[pos].umask |= csm;
-
-                       if (srcmask & SLOT_VECTOR)
-                               cs->slot[pos].vsrc[i] = reg;
-                       if (srcmask & SLOT_SCALAR)
-                               cs->slot[pos].ssrc[i] = reg;
-                       return i;
-               }       
+       switch(REG_GET_TYPE(dest)) {
+               case REG_TYPE_TEMP:
+                       if (cs->temps[index].reg == -1)
+                               return 0;
+                       
+                       idx = cs->temps[index].reg;
+                       break;
+               case REG_TYPE_OUTPUT:
+                       return 0;
+               default:
+                       ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest));
+                       return 0;
        }
        
-       //ERROR("Failed to allocate sources in FPI1/FPI3!\n");
-       return 0;
+       return cs->hwtemps[idx].reserved;
 }
 
-/* Determine whether or not to position opcode in the same ALU slot for both
- * vector and scalar portions of an instruction.
+
+/**
+ * Allocates a slot for an ALU instruction that can consist of
+ * a vertex part or a scalar part or both.
+ *
+ * Sources from src (src[0] to src[argc-1]) are added to the slot in the
+ * appropriate position (vector and/or scalar), and their positions are
+ * recorded in the srcpos array.
+ *
+ * This function emits instruction code for the source fetch and the
+ * argument selection. It does not emit instruction code for the
+ * opcode or the destination selection.
  *
- * It's not necessary to force the first case, but it makes disassembled
- * shaders easier to read.
+ * @return the index of the slot
  */
-static GLboolean force_same_slot(int vop,
-                                int sop,
-                                GLboolean emit_vop,
-                                GLboolean emit_sop,
-                                int argc,
-                                GLuint *src)
+static int find_and_prepare_slot(struct r300_fragment_program* rp,
+               GLboolean emit_vop,
+               GLboolean emit_sop,
+               int argc,
+               GLuint* src,
+               GLuint dest)
 {
-       int i;
-
-       if (emit_vop && emit_sop)
-               return GL_TRUE;
+       COMPILE_STATE;
+       int hwsrc[3];
+       int srcpos[3];
+       unsigned int used;
+       int tempused;
+       int tempvsrc[3];
+       int tempssrc[3];
+       int pos;
+       int regnr;
+       int i,j;
+       
+       // Determine instruction slots, whether sources are required on
+       // vector or scalar side, and the smallest slot number where
+       // all source registers are available
+       used = 0;
+       if (emit_vop)
+               used |= SLOT_OP_VECTOR;
+       if (emit_sop)
+               used |= SLOT_OP_SCALAR;
+       
+       pos = get_earliest_allowed_write(rp, dest);
+       
+       if (rp->node[rp->cur_node].alu_offset > pos)
+               pos = rp->node[rp->cur_node].alu_offset;
+       for(i = 0; i < argc; ++i) {
+               if (!REG_GET_BUILTIN(src[i])) {
+                       if (emit_vop)
+                               used |= v_swiz[REG_GET_VSWZ(src[i])].flags << i;
+                       if (emit_sop)
+                               used |= s_swiz[REG_GET_SSWZ(src[i])].flags << i;
+               }
+               
+               hwsrc[i] = t_hw_src(rp, src[i], GL_FALSE); /* Note: sideeffects wrt refcounting! */
+               regnr = hwsrc[i] & 31;
+               
+               if (REG_GET_TYPE(src[i]) == REG_TYPE_TEMP) {
+                       if (used & (SLOT_SRC_VECTOR << i)) {
+                               if (cs->hwtemps[regnr].vector_valid > pos)
+                                       pos = cs->hwtemps[regnr].vector_valid;
+                       }
+                       if (used & (SLOT_SRC_SCALAR << i)) {
+                               if (cs->hwtemps[regnr].scalar_valid > pos)
+                                       pos = cs->hwtemps[regnr].scalar_valid;
+                       }
+               }
+       }
+       
+       // Find a slot that fits
+       for(; ; ++pos) {
+               if (cs->slot[pos].used & used & SLOT_OP_BOTH)
+                       continue;
+               
+               if (pos >= cs->nrslots) {
+                       if (cs->nrslots >= PFS_MAX_ALU_INST) {
+                               ERROR("Out of ALU instruction slots\n");
+                               return -1;
+                       }
 
-       if (emit_vop && vop == R300_FPI0_OUTC_REPL_ALPHA)
-               return GL_TRUE;
+                       rp->alu.inst[pos].inst0 = NOP_INST0;
+                       rp->alu.inst[pos].inst2 = NOP_INST2;
 
+                       cs->nrslots++;
+               }
+               
+               // Note: When we need both parts (vector and scalar) of a source,
+               // we always try to put them into the same position. This makes the
+               // code easier to read, and it is optimal (i.e. one doesn't gain
+               // anything by splitting the parts).
+               // It also avoids headaches with swizzles that access both parts (i.e WXY)
+               tempused = cs->slot[pos].used;
+               for(i = 0; i < 3; ++i) {
+                       tempvsrc[i] = cs->slot[pos].vsrc[i];
+                       tempssrc[i] = cs->slot[pos].ssrc[i];
+               }
+               
+               for(i = 0; i < argc; ++i) {
+                       int flags = (used >> i) & SLOT_SRC_BOTH;
+                       
+                       if (!flags) {
+                               srcpos[i] = 0;
+                               continue;
+                       }
+                       
+                       for(j = 0; j < 3; ++j) {
+                               if ((tempused >> j) & flags & SLOT_SRC_VECTOR) {
+                                       if (tempvsrc[j] != hwsrc[i])
+                                               continue;
+                               }
+                       
+                               if ((tempused >> j) & flags & SLOT_SRC_SCALAR) {
+                                       if (tempssrc[j] != hwsrc[i])
+                                               continue;
+                               }
+                               
+                               break;
+                       }
+                       
+                       if (j == 3)
+                               break;
+                       
+                       srcpos[i] = j;
+                       tempused |= flags << j;
+                       if (flags & SLOT_SRC_VECTOR)
+                               tempvsrc[j] = hwsrc[i];
+                       if (flags & SLOT_SRC_SCALAR)
+                               tempssrc[j] = hwsrc[i];
+               }
+               
+               if (i == argc)
+                       break;
+       }
+       
+       // Found a slot, reserve it
+       cs->slot[pos].used = tempused | (used & SLOT_OP_BOTH);
+       for(i = 0; i < 3; ++i) {
+               cs->slot[pos].vsrc[i] = tempvsrc[i];
+               cs->slot[pos].ssrc[i] = tempssrc[i];
+       }
+       
+       // Emit the source fetch code
+       rp->alu.inst[pos].inst1 &= ~R300_FPI1_SRC_MASK;
+       rp->alu.inst[pos].inst1 |=
+                       ((cs->slot[pos].vsrc[0] << R300_FPI1_SRC0C_SHIFT) |
+                        (cs->slot[pos].vsrc[1] << R300_FPI1_SRC1C_SHIFT) |
+                        (cs->slot[pos].vsrc[2] << R300_FPI1_SRC2C_SHIFT));
+       
+       rp->alu.inst[pos].inst3 &= ~R300_FPI3_SRC_MASK;
+       rp->alu.inst[pos].inst3 |=
+                       ((cs->slot[pos].ssrc[0] << R300_FPI3_SRC0A_SHIFT) |
+                        (cs->slot[pos].ssrc[1] << R300_FPI3_SRC1A_SHIFT) |
+                        (cs->slot[pos].ssrc[2] << R300_FPI3_SRC2A_SHIFT));
+       
+       // Emit the argument selection code
        if (emit_vop) {
-               for (i=0;i<argc;i++)
-                       if (REG_GET_VSWZ(src[i]) == SWIZZLE_WZY)
-                               return GL_TRUE;
+               int swz[3];
+               
+               for(i = 0; i < 3; ++i) {
+                       if (i < argc) {
+                               swz[i] = (v_swiz[REG_GET_VSWZ(src[i])].base +
+                                           (srcpos[i] * v_swiz[REG_GET_VSWZ(src[i])].stride)) |
+                                       ((src[i] & REG_NEGV_MASK) ? ARG_NEG : 0) |
+                                       ((src[i] & REG_ABS_MASK) ? ARG_ABS : 0);
+                       } else {
+                               swz[i] = R300_FPI0_ARGC_ZERO;
+                       }
+               }
+               
+               rp->alu.inst[pos].inst0 &=
+                               ~(R300_FPI0_ARG0C_MASK|R300_FPI0_ARG1C_MASK|R300_FPI0_ARG2C_MASK);
+               rp->alu.inst[pos].inst0 |=
+                               (swz[0] << R300_FPI0_ARG0C_SHIFT) |
+                               (swz[1] << R300_FPI0_ARG1C_SHIFT) |
+                               (swz[2] << R300_FPI0_ARG2C_SHIFT);
+       }
+       
+       if (emit_sop) {
+               int swz[3];
+               
+               for(i = 0; i < 3; ++i) {
+                       if (i < argc) {
+                               swz[i] = (s_swiz[REG_GET_SSWZ(src[i])].base +
+                                               (srcpos[i] * s_swiz[REG_GET_SSWZ(src[i])].stride)) |
+                                               ((src[i] & REG_NEGV_MASK) ? ARG_NEG : 0) |
+                                               ((src[i] & REG_ABS_MASK) ? ARG_ABS : 0);
+                       } else {
+                               swz[i] = R300_FPI2_ARGA_ZERO;
+                       }
+               }
+               
+               rp->alu.inst[pos].inst2 &=
+                               ~(R300_FPI2_ARG0A_MASK|R300_FPI2_ARG1A_MASK|R300_FPI2_ARG2A_MASK);
+               rp->alu.inst[pos].inst2 |=
+                               (swz[0] << R300_FPI2_ARG0A_SHIFT) |
+                               (swz[1] << R300_FPI2_ARG1A_SHIFT) |
+                               (swz[2] << R300_FPI2_ARG2A_SHIFT);
        }
 
-       return GL_FALSE;
+       return pos;
 }
 
+
+/**
+ * Append an ALU instruction to the instruction list.
+ */
 static void emit_arith(struct r300_fragment_program *rp,
                       int op,
                       GLuint dest,
@@ -1043,87 +1267,31 @@ static void emit_arith(struct r300_fragment_program *rp,
 {
        COMPILE_STATE;
        GLuint src[3] = { src0, src1, src2 };
-       int hwsrc[3], sswz[3], vswz[3];
        int hwdest;
-       GLboolean emit_vop = GL_FALSE, emit_sop = GL_FALSE;
+       GLboolean emit_vop, emit_sop;
        int vop, sop, argc;
-       int vpos, spos;
-       int i;
+       int pos;
 
        vop = r300_fpop[op].v_op;
        sop = r300_fpop[op].s_op;
        argc = r300_fpop[op].argc;
 
+       if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT &&
+           REG_GET_INDEX(dest) == FRAG_RESULT_DEPR)
+               mask &= ~WRITEMASK_XYZ;
+       
+       emit_vop = GL_FALSE;
+       emit_sop = GL_FALSE;
        if ((mask & WRITEMASK_XYZ) || vop == R300_FPI0_OUTC_DP3)
                emit_vop = GL_TRUE;
        if ((mask & WRITEMASK_W) || vop == R300_FPI0_OUTC_REPL_ALPHA)
                emit_sop = GL_TRUE;
 
-       if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT &&
-           REG_GET_INDEX(dest) == FRAG_RESULT_DEPR)
-               emit_vop = GL_FALSE;
-                                       
-       if (force_same_slot(vop, sop, emit_vop, emit_sop, argc, src)) {
-               vpos = spos = MAX2(cs->v_pos, cs->s_pos);
-       } else {
-               vpos = cs->v_pos;
-               spos = cs->s_pos;
-               /* Here is where we'd decide on where a safe place is to
-                * combine this instruction with a previous one.
-                *
-                * This is extremely simple for now.. if a source depends
-                * on the opposite stream, force the same instruction.
-                */
-               for (i=0;i<3;i++) {
-                       if (emit_vop &&
-                           (v_swiz[REG_GET_VSWZ(src[i])].flags & SLOT_SCALAR)) {
-                               vpos = spos = MAX2(vpos, spos);
-                               break;
-                       }
-                       if (emit_sop &&
-                           (s_swiz[REG_GET_SSWZ(src[i])].flags & SLOT_VECTOR)) {
-                               vpos = spos = MAX2(vpos, spos);
-                               break;
-                       }
-               }
-       }
+       pos = find_and_prepare_slot(rp, emit_vop, emit_sop, argc, src, dest);
+       if (pos < 0)
+               return;
        
-       /* - Convert src->hwsrc, record for FPI1/FPI3
-        * - Determine ARG parts of FPI0/FPI2, unused args are filled
-        *   with ARG_ZERO.
-        */     
-       for (i=0;i<3;i++) {
-               int srcpos;
-               
-               if (i >= argc) {
-                       vswz[i] = R300_FPI0_ARGC_ZERO;
-                       sswz[i] = R300_FPI2_ARGA_ZERO;
-                       continue;
-               }
-               
-               hwsrc[i] = t_hw_src(rp, src[i], GL_FALSE);      
-
-               if (emit_vop && vop != R300_FPI0_OUTC_REPL_ALPHA) {
-                       srcpos = add_src(rp, hwsrc[i], vpos,
-                                        v_swiz[REG_GET_VSWZ(src[i])].flags);
-                       vswz[i] = (v_swiz[REG_GET_VSWZ(src[i])].base +
-                                  (srcpos *
-                                   v_swiz[REG_GET_VSWZ(src[i])].stride)) |
-                               ((src[i] & REG_NEGV_MASK) ? ARG_NEG : 0) |
-                               ((src[i] & REG_ABS_MASK) ? ARG_ABS : 0);
-               } else vswz[i] = R300_FPI0_ARGC_ZERO;
-               
-               if (emit_sop) {
-                       srcpos = add_src(rp, hwsrc[i], spos,
-                                        s_swiz[REG_GET_SSWZ(src[i])].flags);
-                       sswz[i] = (s_swiz[REG_GET_SSWZ(src[i])].base +
-                                  (srcpos *
-                                   s_swiz[REG_GET_SSWZ(src[i])].stride)) |
-                               ((src[i] & REG_NEGS_MASK) ? ARG_NEG : 0) |
-                               ((src[i] & REG_ABS_MASK) ? ARG_ABS : 0);
-               } else sswz[i] = R300_FPI2_ARGA_ZERO;
-       }
-       hwdest = t_hw_dst(rp, dest, GL_FALSE);
+       hwdest = t_hw_dst(rp, dest, GL_FALSE, pos); /* Note: Side effects wrt register allocation */
        
        if (flags & PFS_FLAG_SAT) {
                vop |= R300_FPI0_OUTC_SAT;
@@ -1131,58 +1299,45 @@ static void emit_arith(struct r300_fragment_program *rp,
        }
 
        /* Throw the pieces together and get FPI0/1 */
-       rp->alu.inst[vpos].inst1 =
-                       ((cs->slot[vpos].vsrc[0] << R300_FPI1_SRC0C_SHIFT) |
-                        (cs->slot[vpos].vsrc[1] << R300_FPI1_SRC1C_SHIFT) |
-                        (cs->slot[vpos].vsrc[2] << R300_FPI1_SRC2C_SHIFT));
        if (emit_vop) {
-               rp->alu.inst[vpos].inst0 = vop |
-                               (vswz[0] << R300_FPI0_ARG0C_SHIFT) |
-                               (vswz[1] << R300_FPI0_ARG1C_SHIFT) |
-                               (vswz[2] << R300_FPI0_ARG2C_SHIFT);
+               rp->alu.inst[pos].inst0 |= vop;
 
-               rp->alu.inst[vpos].inst1 |= hwdest << R300_FPI1_DSTC_SHIFT;
+               rp->alu.inst[pos].inst1 |= hwdest << R300_FPI1_DSTC_SHIFT;
+               
                if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
                        if (REG_GET_INDEX(dest) == FRAG_RESULT_COLR) {
-                               rp->alu.inst[vpos].inst1 |=
+                               rp->alu.inst[pos].inst1 |=
                                        (mask & WRITEMASK_XYZ) << R300_FPI1_DSTC_OUTPUT_MASK_SHIFT;
                        } else assert(0);
                } else {
-                       rp->alu.inst[vpos].inst1 |=
+                       rp->alu.inst[pos].inst1 |=
                                        (mask & WRITEMASK_XYZ) << R300_FPI1_DSTC_REG_MASK_SHIFT;
+                       
+                       cs->hwtemps[hwdest].vector_valid = pos+1;
                }
-               cs->v_pos = vpos+1;
-       } else if (spos >= vpos)
-               rp->alu.inst[spos].inst0 = NOP_INST0;
+       }
 
        /* And now FPI2/3 */
-       rp->alu.inst[spos].inst3 =
-                       ((cs->slot[spos].ssrc[0] << R300_FPI3_SRC0A_SHIFT) |
-                        (cs->slot[spos].ssrc[1] << R300_FPI3_SRC1A_SHIFT) |
-                        (cs->slot[spos].ssrc[2] << R300_FPI3_SRC2A_SHIFT));
        if (emit_sop) {
-               rp->alu.inst[spos].inst2 = sop |
-                               sswz[0] << R300_FPI2_ARG0A_SHIFT |
-                               sswz[1] << R300_FPI2_ARG1A_SHIFT |
-                               sswz[2] << R300_FPI2_ARG2A_SHIFT;
+               rp->alu.inst[pos].inst2 |= sop;
 
                if (mask & WRITEMASK_W) {
                        if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
                                if (REG_GET_INDEX(dest) == FRAG_RESULT_COLR) {
-                                       rp->alu.inst[spos].inst3 |= 
+                                       rp->alu.inst[pos].inst3 |= 
                                                        (hwdest << R300_FPI3_DSTA_SHIFT) | R300_FPI3_DSTA_OUTPUT;
                                } else if (REG_GET_INDEX(dest) == FRAG_RESULT_DEPR) {
-                                       rp->alu.inst[spos].inst3 |= R300_FPI3_DSTA_DEPTH;
+                                       rp->alu.inst[pos].inst3 |= R300_FPI3_DSTA_DEPTH;
                                } else assert(0);
                        } else {
-                               rp->alu.inst[spos].inst3 |=
+                               rp->alu.inst[pos].inst3 |=
                                                (hwdest << R300_FPI3_DSTA_SHIFT) | R300_FPI3_DSTA_REG;
+                       
+                               cs->hwtemps[hwdest].scalar_valid = pos+1;
                        }
                }
-               cs->s_pos = spos+1;
-       } else if (vpos >= spos)
-               rp->alu.inst[vpos].inst2 = NOP_INST2;
-
+       }
+       
        return;
 }
 
@@ -1922,7 +2077,7 @@ static void init_program(r300ContextPtr r300, struct r300_fragment_program *rp)
        for (i=0;i<rp->ctx->Const.MaxTextureUnits;i++) {
                if (InputsRead & (FRAG_BIT_TEX0 << i)) {
                        cs->inputs[FRAG_ATTRIB_TEX0+i].refcount = 0;
-                       cs->inputs[FRAG_ATTRIB_TEX0+i].reg = get_hw_temp(rp);
+                       cs->inputs[FRAG_ATTRIB_TEX0+i].reg = get_hw_temp(rp, 0);
                }
        }
        InputsRead &= ~FRAG_BITS_TEX_ANY;
@@ -1930,7 +2085,7 @@ static void init_program(r300ContextPtr r300, struct r300_fragment_program *rp)
        /* fragment position treated as a texcoord */
        if (InputsRead & FRAG_BIT_WPOS) {
                cs->inputs[FRAG_ATTRIB_WPOS].refcount = 0;
-               cs->inputs[FRAG_ATTRIB_WPOS].reg = get_hw_temp(rp);
+               cs->inputs[FRAG_ATTRIB_WPOS].reg = get_hw_temp(rp, 0);
                insert_wpos(&mp->Base);
        }
        InputsRead &= ~FRAG_BIT_WPOS;
@@ -1938,14 +2093,14 @@ static void init_program(r300ContextPtr r300, struct r300_fragment_program *rp)
        /* Then primary colour */
        if (InputsRead & FRAG_BIT_COL0) {
                cs->inputs[FRAG_ATTRIB_COL0].refcount = 0;
-               cs->inputs[FRAG_ATTRIB_COL0].reg = get_hw_temp(rp);
+               cs->inputs[FRAG_ATTRIB_COL0].reg = get_hw_temp(rp, 0);
        }
        InputsRead &= ~FRAG_BIT_COL0;
        
        /* Secondary color */
        if (InputsRead & FRAG_BIT_COL1) {
                cs->inputs[FRAG_ATTRIB_COL1].refcount = 0;
-               cs->inputs[FRAG_ATTRIB_COL1].reg = get_hw_temp(rp);
+               cs->inputs[FRAG_ATTRIB_COL1].reg = get_hw_temp(rp, 0);
        }
        InputsRead &= ~FRAG_BIT_COL1;
 
@@ -2030,13 +2185,12 @@ void r300_translate_fragment_shader(r300ContextPtr r300, struct r300_fragment_pr
                }
                
                /* Finish off */
-               cs->v_pos = cs->s_pos = MAX2(cs->v_pos, cs->s_pos);
                rp->node[rp->cur_node].alu_end =
-                               cs->v_pos - rp->node[rp->cur_node].alu_offset - 1;
+                               cs->nrslots - rp->node[rp->cur_node].alu_offset - 1;
                if (rp->node[rp->cur_node].tex_end < 0)
                        rp->node[rp->cur_node].tex_end = 0;
                rp->alu_offset = 0;
-               rp->alu_end    = cs->v_pos - 1;
+               rp->alu_end    = cs->nrslots - 1;
                rp->tex_offset = 0;
                rp->tex_end    = rp->tex.length ? rp->tex.length - 1 : 0;
                assert(rp->node[rp->cur_node].alu_end >= 0);
@@ -2053,7 +2207,7 @@ void r300_translate_fragment_shader(r300ContextPtr r300, struct r300_fragment_pr
 /* just some random things... */
 static void dump_program(struct r300_fragment_program *rp)
 {
-       int i;
+       int n, i, j;
        static int pc = 0;
 
        fprintf(stderr, "pc=%d*************************************\n", pc++);
@@ -2066,46 +2220,136 @@ static void dump_program(struct r300_fragment_program *rp)
        fprintf(stderr, "Hardware program\n");
        fprintf(stderr, "----------------\n");
        
-       fprintf(stderr, "tex:\n");
-       
-       for(i=0;i<rp->tex.length;i++) {
-               fprintf(stderr, "%08x\n", rp->tex.inst[i]);
-       }
-       
-       for (i=0;i<(rp->cur_node+1);i++) {
+       for (n = 0; n < (rp->cur_node+1); n++) {
                fprintf(stderr, "NODE %d: alu_offset: %d, tex_offset: %d, "\
-                       "alu_end: %d, tex_end: %d\n", i,
-                       rp->node[i].alu_offset,
-                       rp->node[i].tex_offset,
-                       rp->node[i].alu_end,
-                       rp->node[i].tex_end);
+                       "alu_end: %d, tex_end: %d\n", n,
+                       rp->node[n].alu_offset,
+                       rp->node[n].tex_offset,
+                       rp->node[n].alu_end,
+                       rp->node[n].tex_end);
+               
+               if (rp->tex.length) {
+                       fprintf(stderr, "  TEX:\n");
+                       for(i = rp->node[n].tex_offset; i <= rp->node[n].tex_end; ++i)
+                               fprintf(stderr, "    %08x\n", rp->tex.inst[i]);
+               }
+               
+               for(i = rp->node[n].alu_offset; i <= rp->node[n].alu_end; ++i) {
+                       char srcc[3][10], dstc[20];
+                       char srca[3][10], dsta[20];
+                       char argc[3][20];
+                       char arga[3][20];
+                       
+                       for(j = 0; j < 3; ++j) {
+                               int regc = rp->alu.inst[i].inst1 >> (j*6);
+                               int rega = rp->alu.inst[i].inst3 >> (j*6);
+                               
+                               sprintf(srcc[j], "%c%i", (regc & 32) ? 'c' : 't', regc & 31);
+                               sprintf(srca[j], "%c%i", (rega & 32) ? 'c' : 't', rega & 31);
+                       }
+                       
+                       sprintf(dstc, "t%i.%c%c%c o%i.%c%c%c",
+                                       (rp->alu.inst[i].inst1 >> R300_FPI1_DSTC_SHIFT) & 31,
+                                       (rp->alu.inst[i].inst1 & R300_FPI1_DSTC_REG_X) ? 'x' : ' ',
+                                       (rp->alu.inst[i].inst1 & R300_FPI1_DSTC_REG_Y) ? 'y' : ' ',
+                                       (rp->alu.inst[i].inst1 & R300_FPI1_DSTC_REG_Z) ? 'z' : ' ',
+                                       (rp->alu.inst[i].inst1 >> R300_FPI1_DSTC_SHIFT) & 31,
+                                       (rp->alu.inst[i].inst1 & R300_FPI1_DSTC_OUTPUT_X) ? 'x' : ' ',
+                                       (rp->alu.inst[i].inst1 & R300_FPI1_DSTC_OUTPUT_Y) ? 'y' : ' ',
+                                       (rp->alu.inst[i].inst1 & R300_FPI1_DSTC_OUTPUT_Z) ? 'z' : ' ');
+                       
+                       sprintf(dsta, "t%i.%c o%i.%c %c",
+                                       (rp->alu.inst[i].inst3 >> R300_FPI3_DSTA_SHIFT) & 31,
+                                       (rp->alu.inst[i].inst3 & R300_FPI3_DSTA_REG) ? 'w' : ' ',
+                                       (rp->alu.inst[i].inst3 >> R300_FPI3_DSTA_SHIFT) & 31,
+                                       (rp->alu.inst[i].inst3 & R300_FPI3_DSTA_OUTPUT) ? 'w' : ' ',
+                                       (rp->alu.inst[i].inst3 & R300_FPI3_DSTA_DEPTH) ? 'Z' : ' ');
+                       
+                       fprintf(stderr, "%3i: xyz: %3s %3s %3s -> %-20s (%08x)\n"
+                                       "       w: %3s %3s %3s -> %-20s (%08x)\n",
+                                       i,
+                                       srcc[0], srcc[1], srcc[2], dstc, rp->alu.inst[i].inst1,
+                                       srca[0], srca[1], srca[2], dsta, rp->alu.inst[i].inst3);
+                       
+                       for(j = 0; j < 3; ++j) {
+                               int regc = rp->alu.inst[i].inst0 >> (j*7);
+                               int rega = rp->alu.inst[i].inst2 >> (j*7);
+                               int d;
+                               char buf[20];
+
+                               d = regc & 31;
+                               if (d < 12) {
+                                       switch(d % 4) {
+                                               case R300_FPI0_ARGC_SRC0C_XYZ:
+                                                       sprintf(buf, "%s.xyz", srcc[d / 4]);
+                                                       break;
+                                               case R300_FPI0_ARGC_SRC0C_XXX:
+                                                       sprintf(buf, "%s.xxx", srcc[d / 4]);
+                                                       break;
+                                               case R300_FPI0_ARGC_SRC0C_YYY:
+                                                       sprintf(buf, "%s.yyy", srcc[d / 4]);
+                                                       break;
+                                               case R300_FPI0_ARGC_SRC0C_ZZZ:
+                                                       sprintf(buf, "%s.zzz", srcc[d / 4]);
+                                                       break;
+                                       }
+                               } else if (d < 15) {
+                                       sprintf(buf, "%s.www", srca[d-12]);
+                               } else if (d == 20) {
+                                       sprintf(buf, "0.0");
+                               } else if (d == 21) {
+                                       sprintf(buf, "1.0");
+                               } else if (d == 22) {
+                                       sprintf(buf, "0.5");
+                               } else if (d >= 23 && d < 32) {
+                                       d -= 23;
+                                       switch(d/3) {
+                                               case 0:
+                                                       sprintf(buf, "%s.yzx", srcc[d % 3]);
+                                                       break;
+                                               case 1:
+                                                       sprintf(buf, "%s.zxy", srcc[d % 3]);
+                                                       break;
+                                               case 2:
+                                                       sprintf(buf, "%s.Wzy", srcc[d % 3]);
+                                                       break;
+                                       }
+                               } else {
+                                       sprintf(buf, "%i", d);
+                               }
+                               
+                               sprintf(argc[j], "%s%s%s%s",
+                                               (regc & 32) ? "-" : "",
+                                               (regc & 64) ? "|" : "",
+                                               buf,
+                                               (regc & 64) ? "|" : "");
+                       
+                               d = rega & 31;
+                               if (d < 9) {
+                                       sprintf(buf, "%s.%c", srcc[d / 3], 'x' + (char)(d%3));
+                               } else if (d < 12) {
+                                       sprintf(buf, "%s.w", srca[d-9]);
+                               } else if (d == 16) {
+                                       sprintf(buf, "0.0");
+                               } else if (d == 17) {
+                                       sprintf(buf, "1.0");
+                               } else if (d == 18) {
+                                       sprintf(buf, "0.5");
+                               } else {
+                                       sprintf(buf, "%i", d);
+                               }
+                               
+                               sprintf(arga[j], "%s%s%s%s",
+                                               (rega & 32) ? "-" : "",
+                                               (rega & 64) ? "|" : "",
+                                               buf,
+                                               (rega & 64) ? "|" : "");
+                       }
+                       
+                       fprintf(stderr, "     xyz: %8s %8s %8s    op: %08x\n"
+                                       "       w: %8s %8s %8s    op: %08x\n",
+                                       argc[0], argc[1], argc[2], rp->alu.inst[i].inst0,
+                                       arga[0], arga[1], arga[2], rp->alu.inst[i].inst2);
+               }
        }
-       
-       fprintf(stderr, "%08x\n",
-               ((rp->tex_end << 16) | (R300_PFS_TEXI_0 >> 2)));
-       for (i=0;i<=rp->tex_end;i++)
-               fprintf(stderr, "%08x\n", rp->tex.inst[i]);
-
-       /* dump program in pretty_print_command_stream.tcl-readable format */
-       fprintf(stderr, "%08x\n",
-               ((rp->alu_end << 16) | (R300_PFS_INSTR0_0 >> 2)));
-       for (i=0;i<=rp->alu_end;i++)
-               fprintf(stderr, "%08x\n", rp->alu.inst[i].inst0);
-
-       fprintf(stderr, "%08x\n",
-               ((rp->alu_end << 16) | (R300_PFS_INSTR1_0 >> 2)));
-       for (i=0;i<=rp->alu_end;i++)
-               fprintf(stderr, "%08x\n", rp->alu.inst[i].inst1);
-
-       fprintf(stderr, "%08x\n",
-               ((rp->alu_end << 16) | (R300_PFS_INSTR2_0 >> 2)));
-       for (i=0;i<=rp->alu_end;i++)
-               fprintf(stderr, "%08x\n", rp->alu.inst[i].inst2);
-
-       fprintf(stderr, "%08x\n",
-               ((rp->alu_end << 16) | (R300_PFS_INSTR3_0 >> 2)));
-       for (i=0;i<=rp->alu_end;i++)
-               fprintf(stderr, "%08x\n", rp->alu.inst[i].inst3);
-
-       fprintf(stderr, "00000000\n");
 }
index 3de15752b164aa5addffbd831f786b0fa508b656..1f4a2d2e64722a6d4a24ef9caea9fd19ed126bee 100644 (file)
@@ -1047,7 +1047,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
  * WRT swizzling. If, for example, you want to load an R component into an
  * Alpha operand, this R component is taken from a *color* source, not from
  * an alpha source. The corresponding register doesn't even have to appear in
- * the alpha sources list. (I hope this alll makes sense to you)
+ * the alpha sources list. (I hope this all makes sense to you)
  *
  * Destination selection
  * The destination register index is in FPI1 (color) and FPI3 (alpha)
@@ -1074,6 +1074,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #       define R300_FPI1_SRC2C_SHIFT             12
 #       define R300_FPI1_SRC2C_MASK              (31 << 12)
 #       define R300_FPI1_SRC2C_CONST             (1 << 17)
+#       define R300_FPI1_SRC_MASK                0x0003ffff
 #       define R300_FPI1_DSTC_SHIFT              18
 #       define R300_FPI1_DSTC_MASK               (31 << 18)
 #              define R300_FPI1_DSTC_REG_MASK_SHIFT     23
@@ -1095,6 +1096,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #       define R300_FPI3_SRC2A_SHIFT             12
 #       define R300_FPI3_SRC2A_MASK              (31 << 12)
 #       define R300_FPI3_SRC2A_CONST             (1 << 17)
+#       define R300_FPI3_SRC_MASK                0x0003ffff
 #       define R300_FPI3_DSTA_SHIFT              18
 #       define R300_FPI3_DSTA_MASK               (31 << 18)
 #       define R300_FPI3_DSTA_REG                (1 << 23)