r300/vertprog: Refactor wpos rewrite using rc_program
[mesa.git] / src / mesa / drivers / dri / r300 / compiler / r3xx_vertprog.c
index ff6575b303c4b39f24c482d771a6ebb149c3bf10..c05b488645b7520091fa8167a2196a9b14877ed2 100644 (file)
 #include "radeon_program.h"
 #include "radeon_program_alu.h"
 
-#include "shader/prog_optimize.h"
 #include "shader/prog_print.h"
 
 
-/* TODO: Get rid of t_src_class call */
-#define CMP_SRCS(a, b) ((a.RelAddr != b.RelAddr) || (a.Index != b.Index && \
-                      ((t_src_class(a.File) == PVS_SRC_REG_CONSTANT && \
-                        t_src_class(b.File) == PVS_SRC_REG_CONSTANT) || \
-                       (t_src_class(a.File) == PVS_SRC_REG_INPUT && \
-                        t_src_class(b.File) == PVS_SRC_REG_INPUT)))) \
-
 /*
  * Take an already-setup and valid source then swizzle it appropriately to
  * obtain a constant ZERO or ONE source.
  */
 #define __CONST(x, y)  \
-       (PVS_SRC_OPERAND(t_src_index(vp, &src[x]),      \
+       (PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[x]),      \
                           t_swizzle(y),        \
                           t_swizzle(y),        \
                           t_swizzle(y),        \
                           t_swizzle(y),        \
-                          t_src_class(src[x].File), \
-                          NEGATE_NONE) | (src[x].RelAddr << 4))
-
-
+                          t_src_class(vpi->SrcReg[x].File), \
+                          NEGATE_NONE) | (vpi->SrcReg[x].RelAddr << 4))
 
 
 static unsigned long t_dst_mask(GLuint mask)
@@ -121,6 +111,24 @@ static unsigned long t_src_class(gl_register_file file)
        }
 }
 
+/**
+ * Tell whether two source operands conflict, i.e. whether one of them has
+ * to be copied elsewhere before both can be used by a single instruction.
+ *
+ * Operands conflict only when they share the same non-temporary source
+ * class and either one of them is relatively addressed or their indices
+ * differ.  Two reads of the very same register never conflict.
+ *
+ * \return GL_TRUE on conflict, GL_FALSE otherwise.
+ */
+static GLboolean t_src_conflict(struct prog_src_register a, struct prog_src_register b)
+{
+       const unsigned long class_a = t_src_class(a.File);
+       const unsigned long class_b = t_src_class(b.File);
+
+       /* Different classes, or two temporaries: no restriction applies. */
+       if (class_a != class_b || class_a == PVS_SRC_REG_TEMPORARY)
+               return GL_FALSE;
+
+       /* Same restricted class: relative addressing or distinct indices clash. */
+       if (a.RelAddr || b.RelAddr || a.Index != b.Index)
+               return GL_TRUE;
+
+       return GL_FALSE;
+}
+
 static INLINE unsigned long t_swizzle(GLubyte swizzle)
 {
        /* this is in fact a NOP as the Mesa SWIZZLE_* are all identical to VSF_IN_COMPONENT_* */
@@ -188,11 +196,10 @@ static GLboolean valid_dst(struct r300_vertex_program_code *vp,
        return GL_TRUE;
 }
 
-static GLuint * ei_vector1(struct r300_vertex_program_code *vp,
+static void ei_vector1(struct r300_vertex_program_code *vp,
                                GLuint hw_opcode,
                                struct prog_instruction *vpi,
-                               GLuint * inst,
-                               struct prog_src_register src[3])
+                               GLuint * inst)
 {
        inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
                                     GL_FALSE,
@@ -200,18 +207,15 @@ static GLuint * ei_vector1(struct r300_vertex_program_code *vp,
                                     t_dst_index(vp, &vpi->DstReg),
                                     t_dst_mask(vpi->DstReg.WriteMask),
                                     t_dst_class(vpi->DstReg.File));
-       inst[1] = t_src(vp, &src[0]);
+       inst[1] = t_src(vp, &vpi->SrcReg[0]);
        inst[2] = __CONST(0, SWIZZLE_ZERO);
        inst[3] = __CONST(0, SWIZZLE_ZERO);
-
-       return inst;
 }
 
-static GLuint * ei_vector2(struct r300_vertex_program_code *vp,
+static void ei_vector2(struct r300_vertex_program_code *vp,
                                GLuint hw_opcode,
                                struct prog_instruction *vpi,
-                               GLuint * inst,
-                               struct prog_src_register src[3])
+                               GLuint * inst)
 {
        inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
                                     GL_FALSE,
@@ -219,18 +223,15 @@ static GLuint * ei_vector2(struct r300_vertex_program_code *vp,
                                     t_dst_index(vp, &vpi->DstReg),
                                     t_dst_mask(vpi->DstReg.WriteMask),
                                     t_dst_class(vpi->DstReg.File));
-       inst[1] = t_src(vp, &src[0]);
-       inst[2] = t_src(vp, &src[1]);
+       inst[1] = t_src(vp, &vpi->SrcReg[0]);
+       inst[2] = t_src(vp, &vpi->SrcReg[1]);
        inst[3] = __CONST(1, SWIZZLE_ZERO);
-
-       return inst;
 }
 
-static GLuint *ei_math1(struct r300_vertex_program_code *vp,
+static void ei_math1(struct r300_vertex_program_code *vp,
                                GLuint hw_opcode,
                                struct prog_instruction *vpi,
-                               GLuint * inst,
-                               struct prog_src_register src[3])
+                               GLuint * inst)
 {
        inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
                                     GL_TRUE,
@@ -238,17 +239,14 @@ static GLuint *ei_math1(struct r300_vertex_program_code *vp,
                                     t_dst_index(vp, &vpi->DstReg),
                                     t_dst_mask(vpi->DstReg.WriteMask),
                                     t_dst_class(vpi->DstReg.File));
-       inst[1] = t_src_scalar(vp, &src[0]);
+       inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
        inst[2] = __CONST(0, SWIZZLE_ZERO);
        inst[3] = __CONST(0, SWIZZLE_ZERO);
-
-       return inst;
 }
 
-static GLuint *ei_lit(struct r300_vertex_program_code *vp,
+static void ei_lit(struct r300_vertex_program_code *vp,
                                      struct prog_instruction *vpi,
-                                     GLuint * inst,
-                                     struct prog_src_register src[3])
+                                     GLuint * inst)
 {
        //LIT TMP 1.Y Z TMP 1{} {X W Z Y} TMP 1{} {Y W Z X} TMP 1{} {Y X Z W}
 
@@ -259,35 +257,32 @@ static GLuint *ei_lit(struct r300_vertex_program_code *vp,
                                     t_dst_mask(vpi->DstReg.WriteMask),
                                     t_dst_class(vpi->DstReg.File));
        /* NOTE: Users swizzling might not work. */
-       inst[1] = PVS_SRC_OPERAND(t_src_index(vp, &src[0]), t_swizzle(GET_SWZ(src[0].Swizzle, 0)),      // X
-                                 t_swizzle(GET_SWZ(src[0].Swizzle, 3)),        // W
+       inst[1] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)),      // X
+                                 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)),        // W
                                  PVS_SRC_SELECT_FORCE_0,       // Z
-                                 t_swizzle(GET_SWZ(src[0].Swizzle, 1)),        // Y
-                                 t_src_class(src[0].File),
-                                 src[0].Negate ? NEGATE_XYZW : NEGATE_NONE) |
-           (src[0].RelAddr << 4);
-       inst[2] = PVS_SRC_OPERAND(t_src_index(vp, &src[0]), t_swizzle(GET_SWZ(src[0].Swizzle, 1)),      // Y
-                                 t_swizzle(GET_SWZ(src[0].Swizzle, 3)),        // W
+                                 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)),        // Y
+                                 t_src_class(vpi->SrcReg[0].File),
+                                 vpi->SrcReg[0].Negate ? NEGATE_XYZW : NEGATE_NONE) |
+           (vpi->SrcReg[0].RelAddr << 4);
+       inst[2] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)),      // Y
+                                 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)),        // W
                                  PVS_SRC_SELECT_FORCE_0,       // Z
-                                 t_swizzle(GET_SWZ(src[0].Swizzle, 0)),        // X
-                                 t_src_class(src[0].File),
-                                 src[0].Negate ? NEGATE_XYZW : NEGATE_NONE) |
-           (src[0].RelAddr << 4);
-       inst[3] = PVS_SRC_OPERAND(t_src_index(vp, &src[0]), t_swizzle(GET_SWZ(src[0].Swizzle, 1)),      // Y
-                                 t_swizzle(GET_SWZ(src[0].Swizzle, 0)),        // X
+                                 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)),        // X
+                                 t_src_class(vpi->SrcReg[0].File),
+                                 vpi->SrcReg[0].Negate ? NEGATE_XYZW : NEGATE_NONE) |
+           (vpi->SrcReg[0].RelAddr << 4);
+       inst[3] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)),      // Y
+                                 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)),        // X
                                  PVS_SRC_SELECT_FORCE_0,       // Z
-                                 t_swizzle(GET_SWZ(src[0].Swizzle, 3)),        // W
-                                 t_src_class(src[0].File),
-                                 src[0].Negate ? NEGATE_XYZW : NEGATE_NONE) |
-           (src[0].RelAddr << 4);
-
-       return inst;
+                                 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)),        // W
+                                 t_src_class(vpi->SrcReg[0].File),
+                                 vpi->SrcReg[0].Negate ? NEGATE_XYZW : NEGATE_NONE) |
+           (vpi->SrcReg[0].RelAddr << 4);
 }
 
-static GLuint *ei_mad(struct r300_vertex_program_code *vp,
+static void ei_mad(struct r300_vertex_program_code *vp,
                                      struct prog_instruction *vpi,
-                                     GLuint * inst,
-                                     struct prog_src_register src[3])
+                                     GLuint * inst)
 {
        inst[0] = PVS_OP_DST_OPERAND(PVS_MACRO_OP_2CLK_MADD,
                                     GL_FALSE,
@@ -295,17 +290,14 @@ static GLuint *ei_mad(struct r300_vertex_program_code *vp,
                                     t_dst_index(vp, &vpi->DstReg),
                                     t_dst_mask(vpi->DstReg.WriteMask),
                                     t_dst_class(vpi->DstReg.File));
-       inst[1] = t_src(vp, &src[0]);
-       inst[2] = t_src(vp, &src[1]);
-       inst[3] = t_src(vp, &src[2]);
-
-       return inst;
+       inst[1] = t_src(vp, &vpi->SrcReg[0]);
+       inst[2] = t_src(vp, &vpi->SrcReg[1]);
+       inst[3] = t_src(vp, &vpi->SrcReg[2]);
 }
 
-static GLuint *ei_pow(struct r300_vertex_program_code *vp,
+static void ei_pow(struct r300_vertex_program_code *vp,
                                      struct prog_instruction *vpi,
-                                     GLuint * inst,
-                                     struct prog_src_register src[3])
+                                     GLuint * inst)
 {
        inst[0] = PVS_OP_DST_OPERAND(ME_POWER_FUNC_FF,
                                     GL_TRUE,
@@ -313,42 +305,40 @@ static GLuint *ei_pow(struct r300_vertex_program_code *vp,
                                     t_dst_index(vp, &vpi->DstReg),
                                     t_dst_mask(vpi->DstReg.WriteMask),
                                     t_dst_class(vpi->DstReg.File));
-       inst[1] = t_src_scalar(vp, &src[0]);
+       inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
        inst[2] = __CONST(0, SWIZZLE_ZERO);
-       inst[3] = t_src_scalar(vp, &src[1]);
-
-       return inst;
+       inst[3] = t_src_scalar(vp, &vpi->SrcReg[1]);
 }
 
-static void t_inputs_outputs(struct r300_vertex_program_code *vp, struct gl_program * glvp)
+static void t_inputs_outputs(struct r300_vertex_program_compiler * c)
 {
        int i;
        int cur_reg;
        GLuint OutputsWritten, InputsRead;
 
-       OutputsWritten = glvp->OutputsWritten;
-       InputsRead = glvp->InputsRead;
+       OutputsWritten = c->Base.Program.OutputsWritten;
+       InputsRead = c->Base.Program.InputsRead;
 
        cur_reg = -1;
        for (i = 0; i < VERT_ATTRIB_MAX; i++) {
                if (InputsRead & (1 << i))
-                       vp->inputs[i] = ++cur_reg;
+                       c->code->inputs[i] = ++cur_reg;
                else
-                       vp->inputs[i] = -1;
+                       c->code->inputs[i] = -1;
        }
 
        cur_reg = 0;
        for (i = 0; i < VERT_RESULT_MAX; i++)
-               vp->outputs[i] = -1;
+               c->code->outputs[i] = -1;
 
        assert(OutputsWritten & (1 << VERT_RESULT_HPOS));
 
        if (OutputsWritten & (1 << VERT_RESULT_HPOS)) {
-               vp->outputs[VERT_RESULT_HPOS] = cur_reg++;
+               c->code->outputs[VERT_RESULT_HPOS] = cur_reg++;
        }
 
        if (OutputsWritten & (1 << VERT_RESULT_PSIZ)) {
-               vp->outputs[VERT_RESULT_PSIZ] = cur_reg++;
+               c->code->outputs[VERT_RESULT_PSIZ] = cur_reg++;
        }
 
        /* If we're writing back facing colors we need to send
@@ -358,355 +348,272 @@ static void t_inputs_outputs(struct r300_vertex_program_code *vp, struct gl_prog
         * get written into appropriate output vectors.
         */
        if (OutputsWritten & (1 << VERT_RESULT_COL0)) {
-               vp->outputs[VERT_RESULT_COL0] = cur_reg++;
+               c->code->outputs[VERT_RESULT_COL0] = cur_reg++;
        } else if (OutputsWritten & (1 << VERT_RESULT_BFC0) ||
                OutputsWritten & (1 << VERT_RESULT_BFC1)) {
                cur_reg++;
        }
 
        if (OutputsWritten & (1 << VERT_RESULT_COL1)) {
-               vp->outputs[VERT_RESULT_COL1] = cur_reg++;
+               c->code->outputs[VERT_RESULT_COL1] = cur_reg++;
        } else if (OutputsWritten & (1 << VERT_RESULT_BFC0) ||
                OutputsWritten & (1 << VERT_RESULT_BFC1)) {
                cur_reg++;
        }
 
        if (OutputsWritten & (1 << VERT_RESULT_BFC0)) {
-               vp->outputs[VERT_RESULT_BFC0] = cur_reg++;
+               c->code->outputs[VERT_RESULT_BFC0] = cur_reg++;
        } else if (OutputsWritten & (1 << VERT_RESULT_BFC1)) {
                cur_reg++;
        }
 
        if (OutputsWritten & (1 << VERT_RESULT_BFC1)) {
-               vp->outputs[VERT_RESULT_BFC1] = cur_reg++;
+               c->code->outputs[VERT_RESULT_BFC1] = cur_reg++;
        } else if (OutputsWritten & (1 << VERT_RESULT_BFC0)) {
                cur_reg++;
        }
 
        for (i = VERT_RESULT_TEX0; i <= VERT_RESULT_TEX7; i++) {
                if (OutputsWritten & (1 << i)) {
-                       vp->outputs[i] = cur_reg++;
+                       c->code->outputs[i] = cur_reg++;
                }
        }
 
        if (OutputsWritten & (1 << VERT_RESULT_FOGC)) {
-               vp->outputs[VERT_RESULT_FOGC] = cur_reg++;
+               c->code->outputs[VERT_RESULT_FOGC] = cur_reg++;
        }
 }
 
-static GLboolean translate_vertex_program(struct r300_vertex_program_compiler * compiler)
+/**
+ * Encode every instruction of the compiler's program into hardware code.
+ *
+ * Walks the rc_instruction list in compiler->Base.Program and, for each
+ * instruction whose destination passes valid_dst(), writes four dwords at
+ * compiler->code->body.d + compiler->code->length and advances length by 4.
+ * Instructions with a non-existing destination are skipped entirely.
+ *
+ * Failures (too many instructions, unknown opcode) are reported through
+ * rc_error() and stop the translation; the function itself returns nothing,
+ * so callers presumably inspect compiler->Base.Error afterwards.
+ */
+static void translate_vertex_program(struct r300_vertex_program_compiler * compiler)
 {
-       struct prog_instruction *vpi = compiler->program->Instructions;
-       int i;
-       GLuint *inst;
-       unsigned long num_operands;
-       /* Initial value should be last tmp reg that hw supports.
-          Strangely enough r300 doesnt mind even though these would be out of range.
-          Smart enough to realize that it doesnt need it? */
-       int u_temp_i = VSF_MAX_FRAGMENT_TEMPS - 1;
-       struct prog_src_register src[3];
-       struct r300_vertex_program_code * vp = compiler->code;
+       struct rc_instruction *rci;
 
        compiler->code->pos_end = 0;    /* Not supported yet */
        compiler->code->length = 0;
 
-       t_inputs_outputs(compiler->code, compiler->program);
-
-       for (inst = compiler->code->body.d; vpi->Opcode != OPCODE_END;
-            vpi++, inst += 4) {
-
-               {
-                       int u_temp_used = (VSF_MAX_FRAGMENT_TEMPS - 1) - u_temp_i;
-                       if((compiler->code->num_temporaries + u_temp_used) > VSF_MAX_FRAGMENT_TEMPS) {
-                               fprintf(stderr, "Ran out of temps, num temps %d, us %d\n", compiler->code->num_temporaries, u_temp_used);
-                               return GL_FALSE;
-                       }
-                       u_temp_i=VSF_MAX_FRAGMENT_TEMPS-1;
-               }
-
-               if (!valid_dst(compiler->code, &vpi->DstReg)) {
-                       /* redirect result to unused temp */
-                       vpi->DstReg.File = PROGRAM_TEMPORARY;
-                       vpi->DstReg.Index = u_temp_i;
-               }
+       t_inputs_outputs(compiler);
 
-               num_operands = _mesa_num_inst_src_regs(vpi->Opcode);
+       for(rci = compiler->Base.Program.Instructions.Next; rci != &compiler->Base.Program.Instructions; rci = rci->Next) {
+               struct prog_instruction *vpi = &rci->I;
+               /* Output slot for this instruction: the current end of the emitted code. */
+               GLuint *inst = compiler->code->body.d + compiler->code->length;
 
-               /* copy the sources (src) from mesa into a local variable... is this needed? */
-               for (i = 0; i < num_operands; i++) {
-                       src[i] = vpi->SrcReg[i];
-               }
+               /* Skip instructions writing to non-existing destination */
+               if (!valid_dst(compiler->code, &vpi->DstReg))
+                       continue;
 
-               if (num_operands == 3) {        /* TODO: scalars */
-                       if (CMP_SRCS(src[1], src[2])
-                           || CMP_SRCS(src[0], src[2])) {
-                               inst[0] = PVS_OP_DST_OPERAND(VE_ADD,
-                                                            GL_FALSE,
-                                                            GL_FALSE,
-                                                            u_temp_i,
-                                                            WRITEMASK_XYZW,
-                                                            PVS_DST_REG_TEMPORARY);
-                               inst[1] =
-                                   PVS_SRC_OPERAND(t_src_index(compiler->code, &src[2]),
-                                                   SWIZZLE_X,
-                                                   SWIZZLE_Y,
-                                                   SWIZZLE_Z,
-                                                   SWIZZLE_W,
-                                                   t_src_class(src[2].File),
-                                                   NEGATE_NONE) | (src[2].
-                                                                     RelAddr <<
-                                                                     4);
-                               inst[2] = __CONST(2, SWIZZLE_ZERO);
-                               inst[3] = __CONST(2, SWIZZLE_ZERO);
-                               inst += 4;
-
-                               src[2].File = PROGRAM_TEMPORARY;
-                               src[2].Index = u_temp_i;
-                               src[2].RelAddr = 0;
-                               u_temp_i--;
-                       }
-               }
-
-               if (num_operands >= 2) {
-                       if (CMP_SRCS(src[1], src[0])) {
-                               inst[0] = PVS_OP_DST_OPERAND(VE_ADD,
-                                                            GL_FALSE,
-                                                            GL_FALSE,
-                                                            u_temp_i,
-                                                            WRITEMASK_XYZW,
-                                                            PVS_DST_REG_TEMPORARY);
-                               inst[1] =
-                                   PVS_SRC_OPERAND(t_src_index(compiler->code, &src[0]),
-                                                   SWIZZLE_X,
-                                                   SWIZZLE_Y,
-                                                   SWIZZLE_Z,
-                                                   SWIZZLE_W,
-                                                   t_src_class(src[0].File),
-                                                   NEGATE_NONE) | (src[0].
-                                                                     RelAddr <<
-                                                                     4);
-                               inst[2] = __CONST(0, SWIZZLE_ZERO);
-                               inst[3] = __CONST(0, SWIZZLE_ZERO);
-                               inst += 4;
-
-                               src[0].File = PROGRAM_TEMPORARY;
-                               src[0].Index = u_temp_i;
-                               src[0].RelAddr = 0;
-                               u_temp_i--;
-                       }
+               /* Checked before emitting, so nothing is written once the limit is reached. */
+               if (compiler->code->length >= VSF_MAX_FRAGMENT_LENGTH) {
+                       rc_error(&compiler->Base, "Vertex program has too many instructions\n");
+                       return;
                }
 
                switch (vpi->Opcode) {
-               case OPCODE_ADD: inst = ei_vector2(compiler->code, VE_ADD, vpi, inst, src); break;
-               case OPCODE_ARL: inst = ei_vector1(compiler->code, VE_FLT2FIX_DX, vpi, inst, src); break;
-               case OPCODE_DP4: inst = ei_vector2(compiler->code, VE_DOT_PRODUCT, vpi, inst, src); break;
-               case OPCODE_DST: inst = ei_vector2(compiler->code, VE_DISTANCE_VECTOR, vpi, inst, src); break;
-               case OPCODE_EX2: inst = ei_math1(compiler->code, ME_EXP_BASE2_FULL_DX, vpi, inst, src); break;
-               case OPCODE_EXP: inst = ei_math1(compiler->code, ME_EXP_BASE2_DX, vpi, inst, src); break;
-               case OPCODE_FRC: inst = ei_vector1(compiler->code, VE_FRACTION, vpi, inst, src); break;
-               case OPCODE_LG2: inst = ei_math1(compiler->code, ME_LOG_BASE2_FULL_DX, vpi, inst, src); break;
-               case OPCODE_LIT: inst = ei_lit(compiler->code, vpi, inst, src); break;
-               case OPCODE_LOG: inst = ei_math1(compiler->code, ME_LOG_BASE2_DX, vpi, inst, src); break;
-               case OPCODE_MAD: inst = ei_mad(compiler->code, vpi, inst, src); break;
-               case OPCODE_MAX: inst = ei_vector2(compiler->code, VE_MAXIMUM, vpi, inst, src); break;
-               case OPCODE_MIN: inst = ei_vector2(compiler->code, VE_MINIMUM, vpi, inst, src); break;
-               case OPCODE_MOV: inst = ei_vector1(compiler->code, VE_ADD, vpi, inst, src); break;
-               case OPCODE_MUL: inst = ei_vector2(compiler->code, VE_MULTIPLY, vpi, inst, src); break;
-               case OPCODE_POW: inst = ei_pow(compiler->code, vpi, inst, src); break;
-               case OPCODE_RCP: inst = ei_math1(compiler->code, ME_RECIP_DX, vpi, inst, src); break;
-               case OPCODE_RSQ: inst = ei_math1(compiler->code, ME_RECIP_SQRT_DX, vpi, inst, src); break;
-               case OPCODE_SGE: inst = ei_vector2(compiler->code, VE_SET_GREATER_THAN_EQUAL, vpi, inst, src); break;
-               case OPCODE_SLT: inst = ei_vector2(compiler->code, VE_SET_LESS_THAN, vpi, inst, src); break;
+               case OPCODE_ADD: ei_vector2(compiler->code, VE_ADD, vpi, inst); break;
+               case OPCODE_ARL: ei_vector1(compiler->code, VE_FLT2FIX_DX, vpi, inst); break;
+               case OPCODE_DP4: ei_vector2(compiler->code, VE_DOT_PRODUCT, vpi, inst); break;
+               case OPCODE_DST: ei_vector2(compiler->code, VE_DISTANCE_VECTOR, vpi, inst); break;
+               case OPCODE_EX2: ei_math1(compiler->code, ME_EXP_BASE2_FULL_DX, vpi, inst); break;
+               case OPCODE_EXP: ei_math1(compiler->code, ME_EXP_BASE2_DX, vpi, inst); break;
+               case OPCODE_FRC: ei_vector1(compiler->code, VE_FRACTION, vpi, inst); break;
+               case OPCODE_LG2: ei_math1(compiler->code, ME_LOG_BASE2_FULL_DX, vpi, inst); break;
+               case OPCODE_LIT: ei_lit(compiler->code, vpi, inst); break;
+               case OPCODE_LOG: ei_math1(compiler->code, ME_LOG_BASE2_DX, vpi, inst); break;
+               case OPCODE_MAD: ei_mad(compiler->code, vpi, inst); break;
+               case OPCODE_MAX: ei_vector2(compiler->code, VE_MAXIMUM, vpi, inst); break;
+               case OPCODE_MIN: ei_vector2(compiler->code, VE_MINIMUM, vpi, inst); break;
+               case OPCODE_MOV: ei_vector1(compiler->code, VE_ADD, vpi, inst); break;
+               case OPCODE_MUL: ei_vector2(compiler->code, VE_MULTIPLY, vpi, inst); break;
+               case OPCODE_POW: ei_pow(compiler->code, vpi, inst); break;
+               case OPCODE_RCP: ei_math1(compiler->code, ME_RECIP_DX, vpi, inst); break;
+               case OPCODE_RSQ: ei_math1(compiler->code, ME_RECIP_SQRT_DX, vpi, inst); break;
+               case OPCODE_SGE: ei_vector2(compiler->code, VE_SET_GREATER_THAN_EQUAL, vpi, inst); break;
+               case OPCODE_SLT: ei_vector2(compiler->code, VE_SET_LESS_THAN, vpi, inst); break;
                default:
-                       fprintf(stderr, "Unknown opcode %i\n", vpi->Opcode);
-                       return GL_FALSE;
+                       rc_error(&compiler->Base, "Unknown opcode %i\n", vpi->Opcode);
+                       return;
                }
-       }
 
-       compiler->code->length = (inst - compiler->code->body.d);
-       if (compiler->code->length >= VSF_MAX_FRAGMENT_LENGTH) {
-               return GL_FALSE;
-       }
+               /* The emit helpers fill inst[0]..inst[3], i.e. four dwords per instruction. */
+               compiler->code->length += 4;
 
-       return GL_TRUE;
+               if (compiler->Base.Error)
+                       return;
+       }
 }
 
-static void insert_wpos(struct gl_program *prog, GLuint temp_index, int tex_id)
-{
-       struct prog_instruction *vpi;
-
-       _mesa_insert_instructions(prog, prog->NumInstructions - 1, 2);
-
-       vpi = &prog->Instructions[prog->NumInstructions - 3];
+/* Per-temporary bookkeeping used while mapping original temporaries to hardware ones. */
+struct temporary_allocation {
+       GLuint Allocated:1;     /* set once a hardware temporary has been assigned */
+       GLuint HwTemp:15;       /* index of the assigned hardware temporary */
+       struct rc_instruction * LastRead;       /* last instruction reading this temporary */
+};
 
-       vpi->Opcode = OPCODE_MOV;
-
-       vpi->DstReg.File = PROGRAM_OUTPUT;
-       vpi->DstReg.Index = VERT_RESULT_HPOS;
-       vpi->DstReg.WriteMask = WRITEMASK_XYZW;
-       vpi->DstReg.CondMask = COND_TR;
+/**
+ * Remap every PROGRAM_TEMPORARY index in the program onto a hardware
+ * temporary in the range [0, VSF_MAX_FRAGMENT_TEMPS).
+ *
+ * Works in three passes over the instruction list:
+ *  1. find the highest temporary index in use and allocate one
+ *     temporary_allocation record per original temporary,
+ *  2. record, for each original temporary, the last instruction reading it,
+ *  3. rewrite source and destination indices, assigning the lowest free
+ *     hardware temporary on first write and releasing it at the last read.
+ *
+ * On return compiler->code->num_temporaries holds the number of hardware
+ * temporaries used.  If the hardware runs out of temporaries the failure is
+ * reported through rc_error() and the pass stops, leaving the program only
+ * partially rewritten.
+ */
+static void allocate_temporary_registers(struct r300_vertex_program_compiler * compiler)
+{
+       struct rc_instruction *inst;
+       GLuint num_orig_temps = 0;
+       GLboolean hwtemps[VSF_MAX_FRAGMENT_TEMPS];
+       struct temporary_allocation * ta;
+       GLuint i, j;
+
+       compiler->code->num_temporaries = 0;
+       memset(hwtemps, 0, sizeof(hwtemps));
+
+       /* Pass 1: Count original temporaries and allocate structures */
+       for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
+               GLuint numsrcs = _mesa_num_inst_src_regs(inst->I.Opcode);
+               GLuint numdsts = _mesa_num_inst_dst_regs(inst->I.Opcode);
+
+               for (i = 0; i < numsrcs; ++i) {
+                       if (inst->I.SrcReg[i].File == PROGRAM_TEMPORARY) {
+                               if (inst->I.SrcReg[i].Index >= num_orig_temps)
+                                       num_orig_temps = inst->I.SrcReg[i].Index + 1;
+                       }
+               }
 
-       vpi->SrcReg[0].File = PROGRAM_TEMPORARY;
-       vpi->SrcReg[0].Index = temp_index;
-       vpi->SrcReg[0].Swizzle = SWIZZLE_XYZW;
+               if (numdsts) {
+                       if (inst->I.DstReg.File == PROGRAM_TEMPORARY) {
+                               if (inst->I.DstReg.Index >= num_orig_temps)
+                                       num_orig_temps = inst->I.DstReg.Index + 1;
+                       }
+               }
+       }
 
-       ++vpi;
+       ta = (struct temporary_allocation*)memory_pool_malloc(&compiler->Base.Pool,
+                       sizeof(struct temporary_allocation) * num_orig_temps);
+       memset(ta, 0, sizeof(struct temporary_allocation) * num_orig_temps);
 
-       vpi->Opcode = OPCODE_MOV;
+       /* Pass 2: Determine original temporary lifetimes */
+       for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
+               GLuint numsrcs = _mesa_num_inst_src_regs(inst->I.Opcode);
 
-       vpi->DstReg.File = PROGRAM_OUTPUT;
-       vpi->DstReg.Index = VERT_RESULT_TEX0 + tex_id;
-       vpi->DstReg.WriteMask = WRITEMASK_XYZW;
-       vpi->DstReg.CondMask = COND_TR;
+               for (i = 0; i < numsrcs; ++i) {
+                       if (inst->I.SrcReg[i].File == PROGRAM_TEMPORARY)
+                               ta[inst->I.SrcReg[i].Index].LastRead = inst;
+               }
+       }
 
-       vpi->SrcReg[0].File = PROGRAM_TEMPORARY;
-       vpi->SrcReg[0].Index = temp_index;
-       vpi->SrcReg[0].Swizzle = SWIZZLE_XYZW;
+       /* Pass 3: Register allocation */
+       for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
+               GLuint numsrcs = _mesa_num_inst_src_regs(inst->I.Opcode);
+               GLuint numdsts = _mesa_num_inst_dst_regs(inst->I.Opcode);
 
-       ++vpi;
+               /* Sources are handled first, so a hardware temporary released at
+                * its last read can be reused by this instruction's destination. */
+               for (i = 0; i < numsrcs; ++i) {
+                       if (inst->I.SrcReg[i].File == PROGRAM_TEMPORARY) {
+                               GLuint orig = inst->I.SrcReg[i].Index;
+                               inst->I.SrcReg[i].Index = ta[orig].HwTemp;
 
-       vpi->Opcode = OPCODE_END;
-}
+                               if (ta[orig].Allocated && inst == ta[orig].LastRead)
+                                       hwtemps[ta[orig].HwTemp] = GL_FALSE;
+                       }
+               }
 
-static void pos_as_texcoord(struct gl_program *prog, int tex_id)
-{
-       struct prog_instruction *vpi;
-       GLuint tempregi = prog->NumTemporaries;
+               if (numdsts) {
+                       if (inst->I.DstReg.File == PROGRAM_TEMPORARY) {
+                               GLuint orig = inst->I.DstReg.Index;
 
-       prog->NumTemporaries++;
+                               if (!ta[orig].Allocated) {
+                                       /* Pick the lowest-numbered free hardware temporary. */
+                                       for(j = 0; j < VSF_MAX_FRAGMENT_TEMPS; ++j) {
+                                               if (!hwtemps[j])
+                                                       break;
+                                       }
+                                       if (j >= VSF_MAX_FRAGMENT_TEMPS) {
+                                               /* Report through the compiler, like the rest of this
+                                                * file, and stop instead of silently falling back to
+                                                * hardware temporary 0. */
+                                               rc_error(&compiler->Base, "Out of hw temporaries\n");
+                                               return;
+                                       } else {
+                                               ta[orig].Allocated = GL_TRUE;
+                                               ta[orig].HwTemp = j;
+                                               hwtemps[j] = GL_TRUE;
+
+                                               if (j >= compiler->code->num_temporaries)
+                                                       compiler->code->num_temporaries = j + 1;
+                                       }
+                               }
 
-       for (vpi = prog->Instructions; vpi->Opcode != OPCODE_END; vpi++) {
-               if (vpi->DstReg.File == PROGRAM_OUTPUT && vpi->DstReg.Index == VERT_RESULT_HPOS) {
-                       vpi->DstReg.File = PROGRAM_TEMPORARY;
-                       vpi->DstReg.Index = tempregi;
+                               inst->I.DstReg.Index = ta[orig].HwTemp;
+                       }
                }
+       }
-
-       insert_wpos(prog, tempregi, tex_id);
-
-       prog->OutputsWritten |= 1 << (VERT_RESULT_TEX0 + tex_id);
 }
 
+
 /**
- * The fogcoord attribute is special in that only the first component
- * is relevant, and the remaining components are always fixed (when read
- * from by the fragment program) to yield an X001 pattern.
- *
- * We need to enforce this either in the vertex program or in the fragment
- * program, and this code chooses not to enforce it in the vertex program.
- * This is slightly cheaper, as long as the fragment program does not use
- * weird swizzles.
- *
- * And it seems that usually, weird swizzles are not used, so...
- *
- * See also the counterpart rewriting for fragment programs.
+ * Vertex engine cannot read two inputs or two constants at the same time.
+ * Introduce intermediate MOVs to temporary registers to account for this.
  */
-static void fog_as_texcoord(struct gl_program *prog, int tex_id)
+static GLboolean transform_source_conflicts(
+       struct radeon_compiler *c,
+       struct rc_instruction* inst,
+       void* unused)
 {
-       struct prog_instruction *vpi;
-
-       vpi = prog->Instructions;
-       while (vpi->Opcode != OPCODE_END) {
-               if (vpi->DstReg.File == PROGRAM_OUTPUT && vpi->DstReg.Index == VERT_RESULT_FOGC) {
-                       vpi->DstReg.Index = VERT_RESULT_TEX0 + tex_id;
-                       vpi->DstReg.WriteMask = WRITEMASK_X;
+       GLuint num_operands = _mesa_num_inst_src_regs(inst->I.Opcode);
+
+       if (num_operands == 3) {
+               if (t_src_conflict(inst->I.SrcReg[1], inst->I.SrcReg[2])
+                   || t_src_conflict(inst->I.SrcReg[0], inst->I.SrcReg[2])) {
+                       int tmpreg = rc_find_free_temporary(c);
+                       struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev);
+                       inst_mov->I.Opcode = OPCODE_MOV;
+                       inst_mov->I.DstReg.File = PROGRAM_TEMPORARY;
+                       inst_mov->I.DstReg.Index = tmpreg;
+                       inst_mov->I.SrcReg[0] = inst->I.SrcReg[2];
+
+                       reset_srcreg(&inst->I.SrcReg[2]);
+                       inst->I.SrcReg[2].File = PROGRAM_TEMPORARY;
+                       inst->I.SrcReg[2].Index = tmpreg;
                }
+       }
 
-               ++vpi;
+       if (num_operands >= 2) {
+               if (t_src_conflict(inst->I.SrcReg[1], inst->I.SrcReg[0])) {
+                       int tmpreg = rc_find_free_temporary(c);
+                       struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev);
+                       inst_mov->I.Opcode = OPCODE_MOV;
+                       inst_mov->I.DstReg.File = PROGRAM_TEMPORARY;
+                       inst_mov->I.DstReg.Index = tmpreg;
+                       inst_mov->I.SrcReg[0] = inst->I.SrcReg[1];
+
+                       reset_srcreg(&inst->I.SrcReg[1]);
+                       inst->I.SrcReg[1].File = PROGRAM_TEMPORARY;
+                       inst->I.SrcReg[1].Index = tmpreg;
+               }
        }
 
-       prog->OutputsWritten &= ~(1 << VERT_RESULT_FOGC);
-       prog->OutputsWritten |= 1 << (VERT_RESULT_TEX0 + tex_id);
+       return GL_TRUE;
 }
 
-
-#define ADD_OUTPUT(fp_attr, vp_result) \
-       do { \
-               if ((FpReads & (1 << (fp_attr))) && !(compiler->program->OutputsWritten & (1 << (vp_result)))) { \
-                       OutputsAdded |= 1 << (vp_result); \
-                       count++; \
-               } \
-       } while (0)
-
 static void addArtificialOutputs(struct r300_vertex_program_compiler * compiler)
 {
-       GLuint OutputsAdded, FpReads;
-       int i, count;
-
-       OutputsAdded = 0;
-       count = 0;
-       FpReads = compiler->state.FpReads;
-
-       ADD_OUTPUT(FRAG_ATTRIB_COL0, VERT_RESULT_COL0);
-       ADD_OUTPUT(FRAG_ATTRIB_COL1, VERT_RESULT_COL1);
-
-       for (i = 0; i < 7; ++i) {
-               ADD_OUTPUT(FRAG_ATTRIB_TEX0 + i, VERT_RESULT_TEX0 + i);
-       }
-
-       /* Some outputs may be artificially added, to match the inputs of the fragment program.
-        * Issue 16 of vertex program spec says that all vertex attributes that are unwritten by
-        * vertex program are undefined, so just use MOV [vertex_result], CONST[0]
-        */
-       if (count > 0) {
-               struct prog_instruction *inst;
-
-               _mesa_insert_instructions(compiler->program, compiler->program->NumInstructions - 1, count);
-               inst = &compiler->program->Instructions[compiler->program->NumInstructions - 1 - count];
+       int i;
 
-               for (i = 0; i < VERT_RESULT_MAX; ++i) {
-                       if (OutputsAdded & (1 << i)) {
-                               inst->Opcode = OPCODE_MOV;
+       for(i = 0; i < 32; ++i) {
+               if ((compiler->RequiredOutputs & (1 << i)) &&
+                   !(compiler->Base.Program.OutputsWritten & (1 << i))) {
+                       struct rc_instruction * inst = rc_insert_new_instruction(&compiler->Base, compiler->Base.Program.Instructions.Prev);
+                       inst->I.Opcode = OPCODE_MOV;
 
-                               inst->DstReg.File = PROGRAM_OUTPUT;
-                               inst->DstReg.Index = i;
-                               inst->DstReg.WriteMask = WRITEMASK_XYZW;
-                               inst->DstReg.CondMask = COND_TR;
+                       inst->I.DstReg.File = PROGRAM_OUTPUT;
+                       inst->I.DstReg.Index = i;
+                       inst->I.DstReg.WriteMask = WRITEMASK_XYZW;
 
-                               inst->SrcReg[0].File = PROGRAM_CONSTANT;
-                               inst->SrcReg[0].Index = 0;
-                               inst->SrcReg[0].Swizzle = SWIZZLE_XYZW;
+                       inst->I.SrcReg[0].File = PROGRAM_CONSTANT;
+                       inst->I.SrcReg[0].Index = 0;
+                       inst->I.SrcReg[0].Swizzle = SWIZZLE_XYZW;
 
-                               ++inst;
-                       }
+                       compiler->Base.Program.OutputsWritten |= 1 << i;
                }
-
-               compiler->program->OutputsWritten |= OutputsAdded;
        }
 }
 
-#undef ADD_OUTPUT
-
 static void nqssadceInit(struct nqssadce_state* s)
 {
        struct r300_vertex_program_compiler * compiler = s->UserData;
-       GLuint fp_reads;
-
-       fp_reads = compiler->state.FpReads;
-       {
-               if (fp_reads & FRAG_BIT_COL0) {
-                               s->Outputs[VERT_RESULT_COL0].Sourced = WRITEMASK_XYZW;
-                               s->Outputs[VERT_RESULT_BFC0].Sourced = WRITEMASK_XYZW;
-               }
-
-               if (fp_reads & FRAG_BIT_COL1) {
-                               s->Outputs[VERT_RESULT_COL1].Sourced = WRITEMASK_XYZW;
-                               s->Outputs[VERT_RESULT_BFC1].Sourced = WRITEMASK_XYZW;
-               }
-       }
+       int i;
 
-       {
-               int i;
-               for (i = 0; i < 8; ++i) {
-                       if (fp_reads & FRAG_BIT_TEX(i)) {
-                               s->Outputs[VERT_RESULT_TEX0 + i].Sourced = WRITEMASK_XYZW;
-                       }
+       for(i = 0; i < VERT_RESULT_MAX; ++i) {
+               if (compiler->RequiredOutputs & (1 << i)) {
+                       if (i != VERT_RESULT_PSIZ)
+                               s->Outputs[i].Sourced = WRITEMASK_XYZW;
+                       else
+                               s->Outputs[i].Sourced = WRITEMASK_X; /* ugly hack! */
                }
        }
-
-       s->Outputs[VERT_RESULT_HPOS].Sourced = WRITEMASK_XYZW;
-       if (s->Program->OutputsWritten & (1 << VERT_RESULT_PSIZ))
-               s->Outputs[VERT_RESULT_PSIZ].Sourced = WRITEMASK_X;
 }
 
 static GLboolean swizzleIsNative(GLuint opcode, struct prog_src_register reg)
@@ -719,16 +626,21 @@ static GLboolean swizzleIsNative(GLuint opcode, struct prog_src_register reg)
 
 
 
-GLboolean r3xx_compile_vertex_program(struct r300_vertex_program_compiler* compiler, GLcontext * ctx)
+void r3xx_compile_vertex_program(struct r300_vertex_program_compiler* compiler)
 {
-       GLboolean success;
+       rc_mesa_to_rc_program(&compiler->Base, compiler->program);
+       compiler->program = 0;
 
        if (compiler->state.WPosAttr != FRAG_ATTRIB_MAX) {
-               pos_as_texcoord(compiler->program, compiler->state.WPosAttr - FRAG_ATTRIB_TEX0);
+               rc_copy_output(&compiler->Base,
+                       VERT_RESULT_HPOS,
+                       compiler->state.WPosAttr - FRAG_ATTRIB_TEX0 + VERT_RESULT_TEX0);
        }
 
        if (compiler->state.FogAttr != FRAG_ATTRIB_MAX) {
-               fog_as_texcoord(compiler->program, compiler->state.FogAttr - FRAG_ATTRIB_TEX0);
+               rc_move_output(&compiler->Base,
+                       VERT_RESULT_FOGC,
+                       compiler->state.FogAttr - FRAG_ATTRIB_TEX0 + VERT_RESULT_TEX0, WRITEMASK_X);
        }
 
        addArtificialOutputs(compiler);
@@ -737,12 +649,29 @@ GLboolean r3xx_compile_vertex_program(struct r300_vertex_program_compiler* compi
                struct radeon_program_transformation transformations[] = {
                        { &r300_transform_vertex_alu, 0 },
                };
-               radeonLocalTransform(compiler->program, 1, transformations);
+               radeonLocalTransform(&compiler->Base, 1, transformations);
        }
 
        if (compiler->Base.Debug) {
                fprintf(stderr, "Vertex program after native rewrite:\n");
-               _mesa_print_program(compiler->program);
+               rc_print_program(&compiler->Base.Program);
+               fflush(stdout);
+       }
+
+       {
+               /* Note: This pass has to be done separately from ALU rewrite,
+                * otherwise non-native ALU instructions with source conflicts
+                * will not be treated properly.
+                */
+               struct radeon_program_transformation transformations[] = {
+                       { &transform_source_conflicts, 0 },
+               };
+               radeonLocalTransform(&compiler->Base, 1, transformations);
+       }
+
+       if (compiler->Base.Debug) {
+               fprintf(stderr, "Vertex program after source conflict resolve:\n");
+               rc_print_program(&compiler->Base.Program);
                fflush(stdout);
        }
 
@@ -752,56 +681,22 @@ GLboolean r3xx_compile_vertex_program(struct r300_vertex_program_compiler* compi
                        .IsNativeSwizzle = &swizzleIsNative,
                        .BuildSwizzle = NULL
                };
-               radeonNqssaDce(compiler->program, &nqssadce, compiler);
+               radeonNqssaDce(&compiler->Base, &nqssadce, compiler);
 
                /* We need this step for reusing temporary registers */
-               _mesa_optimize_program(ctx, compiler->program);
+               allocate_temporary_registers(compiler);
 
                if (compiler->Base.Debug) {
                        fprintf(stderr, "Vertex program after NQSSADCE:\n");
-                       _mesa_print_program(compiler->program);
+                       rc_print_program(&compiler->Base.Program);
                        fflush(stdout);
                }
        }
 
-       assert(compiler->program->NumInstructions);
-       {
-               struct prog_instruction *inst;
-               int max, i, tmp;
-
-               inst = compiler->program->Instructions;
-               max = -1;
-               while (inst->Opcode != OPCODE_END) {
-                       tmp = _mesa_num_inst_src_regs(inst->Opcode);
-                       for (i = 0; i < tmp; ++i) {
-                               if (inst->SrcReg[i].File == PROGRAM_TEMPORARY) {
-                                       if ((int) inst->SrcReg[i].Index > max) {
-                                               max = inst->SrcReg[i].Index;
-                                       }
-                               }
-                       }
-
-                       if (_mesa_num_inst_dst_regs(inst->Opcode)) {
-                               if (inst->DstReg.File == PROGRAM_TEMPORARY) {
-                                       if ((int) inst->DstReg.Index > max) {
-                                               max = inst->DstReg.Index;
-                                       }
-                               }
-                       }
-                       ++inst;
-               }
-
-               /* We actually want highest index of used temporary register,
-                * not the number of temporaries used.
-                * These values aren't always the same.
-                */
-               compiler->code->num_temporaries = max + 1;
-       }
-
-       success = translate_vertex_program(compiler);
+       translate_vertex_program(compiler);
 
-       compiler->code->InputsRead = compiler->program->InputsRead;
-       compiler->code->OutputsWritten = compiler->program->OutputsWritten;
+       rc_constants_copy(&compiler->code->constants, &compiler->Base.Program.Constants);
 
-       return success;
+       compiler->code->InputsRead = compiler->Base.Program.InputsRead;
+       compiler->code->OutputsWritten = compiler->Base.Program.OutputsWritten;
 }