Merge branch 'master' into r300-compiler
[mesa.git] / src / mesa / drivers / dri / r300 / compiler / r3xx_vertprog.c
index fc9c8f805ae0091751065bd4cc52d2356e385d10..ac72f8cbb615658e7ccb046c632368bd02c9617a 100644 (file)
 
 #include "radeon_compiler.h"
 
+#include <stdio.h>
+
 #include "../r300_reg.h"
 
-#include "radeon_nqssadce.h"
-#include "radeon_program.h"
+#include "radeon_dataflow.h"
 #include "radeon_program_alu.h"
-
-#include "shader/prog_print.h"
+#include "radeon_swizzle.h"
 
 
 /*
                           t_swizzle(y),        \
                           t_swizzle(y),        \
                           t_src_class(vpi->SrcReg[x].File), \
-                          NEGATE_NONE) | (vpi->SrcReg[x].RelAddr << 4))
+                          RC_MASK_NONE) | (vpi->SrcReg[x].RelAddr << 4))
 
 
-static unsigned long t_dst_mask(GLuint mask)
+static unsigned long t_dst_mask(unsigned int mask)
 {
-       /* WRITEMASK_* is equivalent to VSF_FLAG_* */
-       return mask & WRITEMASK_XYZW;
+       /* RC_MASK_* is equivalent to VSF_FLAG_* */
+       return mask & RC_MASK_XYZW;
 }
 
-static unsigned long t_dst_class(gl_register_file file)
+static unsigned long t_dst_class(rc_register_file file)
 {
-
        switch (file) {
-       case PROGRAM_TEMPORARY:
+       default:
+               fprintf(stderr, "%s: Bad register file %i\n", __FUNCTION__, file);
+               /* fall-through */
+       case RC_FILE_TEMPORARY:
                return PVS_DST_REG_TEMPORARY;
-       case PROGRAM_OUTPUT:
+       case RC_FILE_OUTPUT:
                return PVS_DST_REG_OUT;
-       case PROGRAM_ADDRESS:
+       case RC_FILE_ADDRESS:
                return PVS_DST_REG_A0;
-               /*
-                  case PROGRAM_INPUT:
-                  case PROGRAM_LOCAL_PARAM:
-                  case PROGRAM_ENV_PARAM:
-                  case PROGRAM_NAMED_PARAM:
-                  case PROGRAM_STATE_VAR:
-                  case PROGRAM_WRITE_ONLY:
-                  case PROGRAM_ADDRESS:
-                */
-       default:
-               fprintf(stderr, "problem in %s", __FUNCTION__);
-               _mesa_exit(-1);
-               return -1;
        }
 }
 
 static unsigned long t_dst_index(struct r300_vertex_program_code *vp,
-                                struct prog_dst_register *dst)
+                                struct rc_dst_register *dst)
 {
-       if (dst->File == PROGRAM_OUTPUT)
+       if (dst->File == RC_FILE_OUTPUT)
                return vp->outputs[dst->Index];
 
        return dst->Index;
 }
 
-static unsigned long t_src_class(gl_register_file file)
+static unsigned long t_src_class(rc_register_file file)
 {
        switch (file) {
-       case PROGRAM_TEMPORARY:
+       default:
+               fprintf(stderr, "%s: Bad register file %i\n", __FUNCTION__, file);
+               /* fall-through */
+       case RC_FILE_NONE:
+       case RC_FILE_TEMPORARY:
                return PVS_SRC_REG_TEMPORARY;
-       case PROGRAM_INPUT:
+       case RC_FILE_INPUT:
                return PVS_SRC_REG_INPUT;
-       case PROGRAM_LOCAL_PARAM:
-       case PROGRAM_ENV_PARAM:
-       case PROGRAM_NAMED_PARAM:
-       case PROGRAM_CONSTANT:
-       case PROGRAM_STATE_VAR:
+       case RC_FILE_CONSTANT:
                return PVS_SRC_REG_CONSTANT;
-               /*
-                  case PROGRAM_OUTPUT:
-                  case PROGRAM_WRITE_ONLY:
-                  case PROGRAM_ADDRESS:
-                */
-       default:
-               fprintf(stderr, "problem in %s", __FUNCTION__);
-               _mesa_exit(-1);
-               return -1;
        }
 }
 
-static GLboolean t_src_conflict(struct prog_src_register a, struct prog_src_register b)
+static int t_src_conflict(struct rc_src_register a, struct rc_src_register b)
 {
        unsigned long aclass = t_src_class(a.File);
        unsigned long bclass = t_src_class(b.File);
 
        if (aclass != bclass)
-               return GL_FALSE;
+               return 0;
        if (aclass == PVS_SRC_REG_TEMPORARY)
-               return GL_FALSE;
+               return 0;
 
        if (a.RelAddr || b.RelAddr)
-               return GL_TRUE;
+               return 1;
        if (a.Index != b.Index)
-               return GL_TRUE;
+               return 1;
 
-       return GL_FALSE;
+       return 0;
 }
 
-static INLINE unsigned long t_swizzle(GLubyte swizzle)
+static inline unsigned long t_swizzle(unsigned int swizzle)
 {
-       /* this is in fact a NOP as the Mesa SWIZZLE_* are all identical to VSF_IN_COMPONENT_* */
+       /* this is in fact a NOP as the RC_SWIZZLE_* values are all identical to VSF_IN_COMPONENT_* */
        return swizzle;
 }
 
 static unsigned long t_src_index(struct r300_vertex_program_code *vp,
-                                struct prog_src_register *src)
+                                struct rc_src_register *src)
 {
-       if (src->File == PROGRAM_INPUT) {
+       if (src->File == RC_FILE_INPUT) {
                assert(vp->inputs[src->Index] != -1);
                return vp->inputs[src->Index];
        } else {
@@ -154,9 +134,9 @@ static unsigned long t_src_index(struct r300_vertex_program_code *vp,
 /* these two functions should probably be merged... */
 
 static unsigned long t_src(struct r300_vertex_program_code *vp,
-                          struct prog_src_register *src)
+                          struct rc_src_register *src)
 {
-       /* src->Negate uses the NEGATE_ flags from program_instruction.h,
+       /* src->Negate uses the RC_MASK_ flags,
         * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
         */
        return PVS_SRC_OPERAND(t_src_index(vp, src),
@@ -169,9 +149,9 @@ static unsigned long t_src(struct r300_vertex_program_code *vp,
 }
 
 static unsigned long t_src_scalar(struct r300_vertex_program_code *vp,
-                                 struct prog_src_register *src)
+                                 struct rc_src_register *src)
 {
-       /* src->Negate uses the NEGATE_ flags from program_instruction.h,
+       /* src->Negate uses the RC_MASK_ flags,
         * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
         */
        return PVS_SRC_OPERAND(t_src_index(vp, src),
@@ -180,79 +160,79 @@ static unsigned long t_src_scalar(struct r300_vertex_program_code *vp,
                               t_swizzle(GET_SWZ(src->Swizzle, 0)),
                               t_swizzle(GET_SWZ(src->Swizzle, 0)),
                               t_src_class(src->File),
-                              src->Negate ? NEGATE_XYZW : NEGATE_NONE) |
+                              src->Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
            (src->RelAddr << 4);
 }
 
-static GLboolean valid_dst(struct r300_vertex_program_code *vp,
-                          struct prog_dst_register *dst)
+static int valid_dst(struct r300_vertex_program_code *vp,
+                          struct rc_dst_register *dst)
 {
-       if (dst->File == PROGRAM_OUTPUT && vp->outputs[dst->Index] == -1) {
-               return GL_FALSE;
-       } else if (dst->File == PROGRAM_ADDRESS) {
+       if (dst->File == RC_FILE_OUTPUT && vp->outputs[dst->Index] == -1) {
+               return 0;
+       } else if (dst->File == RC_FILE_ADDRESS) {
                assert(dst->Index == 0);
        }
 
-       return GL_TRUE;
+       return 1;
 }
 
 static void ei_vector1(struct r300_vertex_program_code *vp,
-                               GLuint hw_opcode,
-                               struct prog_instruction *vpi,
-                               GLuint * inst)
+                               unsigned int hw_opcode,
+                               struct rc_sub_instruction *vpi,
+                               unsigned int * inst)
 {
        inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
-                                    GL_FALSE,
-                                    GL_FALSE,
+                                    0,
+                                    0,
                                     t_dst_index(vp, &vpi->DstReg),
                                     t_dst_mask(vpi->DstReg.WriteMask),
                                     t_dst_class(vpi->DstReg.File));
        inst[1] = t_src(vp, &vpi->SrcReg[0]);
-       inst[2] = __CONST(0, SWIZZLE_ZERO);
-       inst[3] = __CONST(0, SWIZZLE_ZERO);
+       inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
+       inst[3] = __CONST(0, RC_SWIZZLE_ZERO);
 }
 
 static void ei_vector2(struct r300_vertex_program_code *vp,
-                               GLuint hw_opcode,
-                               struct prog_instruction *vpi,
-                               GLuint * inst)
+                               unsigned int hw_opcode,
+                               struct rc_sub_instruction *vpi,
+                               unsigned int * inst)
 {
        inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
-                                    GL_FALSE,
-                                    GL_FALSE,
+                                    0,
+                                    0,
                                     t_dst_index(vp, &vpi->DstReg),
                                     t_dst_mask(vpi->DstReg.WriteMask),
                                     t_dst_class(vpi->DstReg.File));
        inst[1] = t_src(vp, &vpi->SrcReg[0]);
        inst[2] = t_src(vp, &vpi->SrcReg[1]);
-       inst[3] = __CONST(1, SWIZZLE_ZERO);
+       inst[3] = __CONST(1, RC_SWIZZLE_ZERO);
 }
 
 static void ei_math1(struct r300_vertex_program_code *vp,
-                               GLuint hw_opcode,
-                               struct prog_instruction *vpi,
-                               GLuint * inst)
+                               unsigned int hw_opcode,
+                               struct rc_sub_instruction *vpi,
+                               unsigned int * inst)
 {
        inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
-                                    GL_TRUE,
-                                    GL_FALSE,
+                                    1,
+                                    0,
                                     t_dst_index(vp, &vpi->DstReg),
                                     t_dst_mask(vpi->DstReg.WriteMask),
                                     t_dst_class(vpi->DstReg.File));
        inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
-       inst[2] = __CONST(0, SWIZZLE_ZERO);
-       inst[3] = __CONST(0, SWIZZLE_ZERO);
+       inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
+       inst[3] = __CONST(0, RC_SWIZZLE_ZERO);
 }
 
 static void ei_lit(struct r300_vertex_program_code *vp,
-                                     struct prog_instruction *vpi,
-                                     GLuint * inst)
+                                     struct rc_sub_instruction *vpi,
+                                     unsigned int * inst)
 {
        //LIT TMP 1.Y Z TMP 1{} {X W Z Y} TMP 1{} {Y W Z X} TMP 1{} {Y X Z W}
 
        inst[0] = PVS_OP_DST_OPERAND(ME_LIGHT_COEFF_DX,
-                                    GL_TRUE,
-                                    GL_FALSE,
+                                    1,
+                                    0,
                                     t_dst_index(vp, &vpi->DstReg),
                                     t_dst_mask(vpi->DstReg.WriteMask),
                                     t_dst_class(vpi->DstReg.File));
@@ -262,51 +242,91 @@ static void ei_lit(struct r300_vertex_program_code *vp,
                                  PVS_SRC_SELECT_FORCE_0,       // Z
                                  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)),        // Y
                                  t_src_class(vpi->SrcReg[0].File),
-                                 vpi->SrcReg[0].Negate ? NEGATE_XYZW : NEGATE_NONE) |
+                                 vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
            (vpi->SrcReg[0].RelAddr << 4);
        inst[2] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)),      // Y
                                  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)),        // W
                                  PVS_SRC_SELECT_FORCE_0,       // Z
                                  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)),        // X
                                  t_src_class(vpi->SrcReg[0].File),
-                                 vpi->SrcReg[0].Negate ? NEGATE_XYZW : NEGATE_NONE) |
+                                 vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
            (vpi->SrcReg[0].RelAddr << 4);
        inst[3] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)),      // Y
                                  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)),        // X
                                  PVS_SRC_SELECT_FORCE_0,       // Z
                                  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)),        // W
                                  t_src_class(vpi->SrcReg[0].File),
-                                 vpi->SrcReg[0].Negate ? NEGATE_XYZW : NEGATE_NONE) |
+                                 vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
            (vpi->SrcReg[0].RelAddr << 4);
 }
 
 static void ei_mad(struct r300_vertex_program_code *vp,
-                                     struct prog_instruction *vpi,
-                                     GLuint * inst)
+                                     struct rc_sub_instruction *vpi,
+                                     unsigned int * inst)
 {
-       inst[0] = PVS_OP_DST_OPERAND(PVS_MACRO_OP_2CLK_MADD,
-                                    GL_FALSE,
-                                    GL_TRUE,
-                                    t_dst_index(vp, &vpi->DstReg),
-                                    t_dst_mask(vpi->DstReg.WriteMask),
-                                    t_dst_class(vpi->DstReg.File));
+       /* Remarks about hardware limitations of MAD
+        * (please preserve this comment, as this information is _NOT_
+        * in the documentation provided by AMD).
+        *
+        * As described in the documentation, MAD with three unique temporary
+        * source registers requires the use of the macro version.
+        *
+        * However (and this is not mentioned in the documentation), apparently
+        * the macro version is _NOT_ a full superset of the normal version.
+        * In particular, the macro version does not always work when relative
+        * addressing is used in the source operands.
+        *
+        * This limitation caused incorrect rendering in Sauerbraten's OpenGL
+        * assembly shader path when using medium quality animations
+        * (i.e. animations with matrix blending instead of quaternion blending).
+        *
+        * Unfortunately, I (nha) have been unable to extract a Piglit regression
+        * test for this issue - for some reason, it is possible to have vertex
+        * programs whose prefix is *exactly* the same as the prefix of the
+        * offending program in Sauerbraten up to the offending instruction
+        * without causing any trouble.
+        *
+        * Bottom line: Use the macro version only when really necessary;
+        * according to AMD docs, this should improve performance by one clock
+        * as a nice side bonus.
+        */
+       if (vpi->SrcReg[0].File == RC_FILE_TEMPORARY &&
+           vpi->SrcReg[1].File == RC_FILE_TEMPORARY &&
+           vpi->SrcReg[2].File == RC_FILE_TEMPORARY &&
+           vpi->SrcReg[0].Index != vpi->SrcReg[1].Index &&
+           vpi->SrcReg[0].Index != vpi->SrcReg[2].Index &&
+           vpi->SrcReg[1].Index != vpi->SrcReg[2].Index) {
+               inst[0] = PVS_OP_DST_OPERAND(PVS_MACRO_OP_2CLK_MADD,
+                               0,
+                               1,
+                               t_dst_index(vp, &vpi->DstReg),
+                               t_dst_mask(vpi->DstReg.WriteMask),
+                               t_dst_class(vpi->DstReg.File));
+       } else {
+               inst[0] = PVS_OP_DST_OPERAND(VE_MULTIPLY_ADD,
+                               0,
+                               0,
+                               t_dst_index(vp, &vpi->DstReg),
+                               t_dst_mask(vpi->DstReg.WriteMask),
+                               t_dst_class(vpi->DstReg.File));
+       }
        inst[1] = t_src(vp, &vpi->SrcReg[0]);
        inst[2] = t_src(vp, &vpi->SrcReg[1]);
        inst[3] = t_src(vp, &vpi->SrcReg[2]);
 }
 
 static void ei_pow(struct r300_vertex_program_code *vp,
-                                     struct prog_instruction *vpi,
-                                     GLuint * inst)
+                                     struct rc_sub_instruction *vpi,
+                                     unsigned int * inst)
 {
        inst[0] = PVS_OP_DST_OPERAND(ME_POWER_FUNC_FF,
-                                    GL_TRUE,
-                                    GL_FALSE,
+                                    1,
+                                    0,
                                     t_dst_index(vp, &vpi->DstReg),
                                     t_dst_mask(vpi->DstReg.WriteMask),
                                     t_dst_class(vpi->DstReg.File));
        inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
-       inst[2] = __CONST(0, SWIZZLE_ZERO);
+       inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
        inst[3] = t_src_scalar(vp, &vpi->SrcReg[1]);
 }
 
@@ -321,8 +341,8 @@ static void translate_vertex_program(struct r300_vertex_program_compiler * compi
        compiler->SetHwInputOutput(compiler);
 
        for(rci = compiler->Base.Program.Instructions.Next; rci != &compiler->Base.Program.Instructions; rci = rci->Next) {
-               struct prog_instruction *vpi = &rci->I;
-               GLuint *inst = compiler->code->body.d + compiler->code->length;
+               struct rc_sub_instruction *vpi = &rci->I;
+               unsigned int *inst = compiler->code->body.d + compiler->code->length;
 
                /* Skip instructions writing to non-existing destination */
                if (!valid_dst(compiler->code, &vpi->DstReg))
@@ -334,26 +354,26 @@ static void translate_vertex_program(struct r300_vertex_program_compiler * compi
                }
 
                switch (vpi->Opcode) {
-               case OPCODE_ADD: ei_vector2(compiler->code, VE_ADD, vpi, inst); break;
-               case OPCODE_ARL: ei_vector1(compiler->code, VE_FLT2FIX_DX, vpi, inst); break;
-               case OPCODE_DP4: ei_vector2(compiler->code, VE_DOT_PRODUCT, vpi, inst); break;
-               case OPCODE_DST: ei_vector2(compiler->code, VE_DISTANCE_VECTOR, vpi, inst); break;
-               case OPCODE_EX2: ei_math1(compiler->code, ME_EXP_BASE2_FULL_DX, vpi, inst); break;
-               case OPCODE_EXP: ei_math1(compiler->code, ME_EXP_BASE2_DX, vpi, inst); break;
-               case OPCODE_FRC: ei_vector1(compiler->code, VE_FRACTION, vpi, inst); break;
-               case OPCODE_LG2: ei_math1(compiler->code, ME_LOG_BASE2_FULL_DX, vpi, inst); break;
-               case OPCODE_LIT: ei_lit(compiler->code, vpi, inst); break;
-               case OPCODE_LOG: ei_math1(compiler->code, ME_LOG_BASE2_DX, vpi, inst); break;
-               case OPCODE_MAD: ei_mad(compiler->code, vpi, inst); break;
-               case OPCODE_MAX: ei_vector2(compiler->code, VE_MAXIMUM, vpi, inst); break;
-               case OPCODE_MIN: ei_vector2(compiler->code, VE_MINIMUM, vpi, inst); break;
-               case OPCODE_MOV: ei_vector1(compiler->code, VE_ADD, vpi, inst); break;
-               case OPCODE_MUL: ei_vector2(compiler->code, VE_MULTIPLY, vpi, inst); break;
-               case OPCODE_POW: ei_pow(compiler->code, vpi, inst); break;
-               case OPCODE_RCP: ei_math1(compiler->code, ME_RECIP_DX, vpi, inst); break;
-               case OPCODE_RSQ: ei_math1(compiler->code, ME_RECIP_SQRT_DX, vpi, inst); break;
-               case OPCODE_SGE: ei_vector2(compiler->code, VE_SET_GREATER_THAN_EQUAL, vpi, inst); break;
-               case OPCODE_SLT: ei_vector2(compiler->code, VE_SET_LESS_THAN, vpi, inst); break;
+               case RC_OPCODE_ADD: ei_vector2(compiler->code, VE_ADD, vpi, inst); break;
+               case RC_OPCODE_ARL: ei_vector1(compiler->code, VE_FLT2FIX_DX, vpi, inst); break;
+               case RC_OPCODE_DP4: ei_vector2(compiler->code, VE_DOT_PRODUCT, vpi, inst); break;
+               case RC_OPCODE_DST: ei_vector2(compiler->code, VE_DISTANCE_VECTOR, vpi, inst); break;
+               case RC_OPCODE_EX2: ei_math1(compiler->code, ME_EXP_BASE2_FULL_DX, vpi, inst); break;
+               case RC_OPCODE_EXP: ei_math1(compiler->code, ME_EXP_BASE2_DX, vpi, inst); break;
+               case RC_OPCODE_FRC: ei_vector1(compiler->code, VE_FRACTION, vpi, inst); break;
+               case RC_OPCODE_LG2: ei_math1(compiler->code, ME_LOG_BASE2_FULL_DX, vpi, inst); break;
+               case RC_OPCODE_LIT: ei_lit(compiler->code, vpi, inst); break;
+               case RC_OPCODE_LOG: ei_math1(compiler->code, ME_LOG_BASE2_DX, vpi, inst); break;
+               case RC_OPCODE_MAD: ei_mad(compiler->code, vpi, inst); break;
+               case RC_OPCODE_MAX: ei_vector2(compiler->code, VE_MAXIMUM, vpi, inst); break;
+               case RC_OPCODE_MIN: ei_vector2(compiler->code, VE_MINIMUM, vpi, inst); break;
+               case RC_OPCODE_MOV: ei_vector1(compiler->code, VE_ADD, vpi, inst); break;
+               case RC_OPCODE_MUL: ei_vector2(compiler->code, VE_MULTIPLY, vpi, inst); break;
+               case RC_OPCODE_POW: ei_pow(compiler->code, vpi, inst); break;
+               case RC_OPCODE_RCP: ei_math1(compiler->code, ME_RECIP_DX, vpi, inst); break;
+               case RC_OPCODE_RSQ: ei_math1(compiler->code, ME_RECIP_SQRT_DX, vpi, inst); break;
+               case RC_OPCODE_SGE: ei_vector2(compiler->code, VE_SET_GREATER_THAN_EQUAL, vpi, inst); break;
+               case RC_OPCODE_SLT: ei_vector2(compiler->code, VE_SET_LESS_THAN, vpi, inst); break;
                default:
                        rc_error(&compiler->Base, "Unknown opcode %i\n", vpi->Opcode);
                        return;
@@ -367,36 +387,35 @@ static void translate_vertex_program(struct r300_vertex_program_compiler * compi
 }
 
 struct temporary_allocation {
-       GLuint Allocated:1;
-       GLuint HwTemp:15;
+       unsigned int Allocated:1;
+       unsigned int HwTemp:15;
        struct rc_instruction * LastRead;
 };
 
 static void allocate_temporary_registers(struct r300_vertex_program_compiler * compiler)
 {
        struct rc_instruction *inst;
-       GLuint num_orig_temps = 0;
-       GLboolean hwtemps[VSF_MAX_FRAGMENT_TEMPS];
+       unsigned int num_orig_temps = 0;
+       char hwtemps[VSF_MAX_FRAGMENT_TEMPS];
        struct temporary_allocation * ta;
-       GLuint i, j;
+       unsigned int i, j;
 
        compiler->code->num_temporaries = 0;
        memset(hwtemps, 0, sizeof(hwtemps));
 
        /* Pass 1: Count original temporaries and allocate structures */
        for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
-               GLuint numsrcs = _mesa_num_inst_src_regs(inst->I.Opcode);
-               GLuint numdsts = _mesa_num_inst_dst_regs(inst->I.Opcode);
+               const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->I.Opcode);
 
-               for (i = 0; i < numsrcs; ++i) {
-                       if (inst->I.SrcReg[i].File == PROGRAM_TEMPORARY) {
+               for (i = 0; i < opcode->NumSrcRegs; ++i) {
+                       if (inst->I.SrcReg[i].File == RC_FILE_TEMPORARY) {
                                if (inst->I.SrcReg[i].Index >= num_orig_temps)
                                        num_orig_temps = inst->I.SrcReg[i].Index + 1;
                        }
                }
 
-               if (numdsts) {
-                       if (inst->I.DstReg.File == PROGRAM_TEMPORARY) {
+               if (opcode->HasDstReg) {
+                       if (inst->I.DstReg.File == RC_FILE_TEMPORARY) {
                                if (inst->I.DstReg.Index >= num_orig_temps)
                                        num_orig_temps = inst->I.DstReg.Index + 1;
                        }
@@ -409,32 +428,31 @@ static void allocate_temporary_registers(struct r300_vertex_program_compiler * c
 
        /* Pass 2: Determine original temporary lifetimes */
        for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
-               GLuint numsrcs = _mesa_num_inst_src_regs(inst->I.Opcode);
+               const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->I.Opcode);
 
-               for (i = 0; i < numsrcs; ++i) {
-                       if (inst->I.SrcReg[i].File == PROGRAM_TEMPORARY)
+               for (i = 0; i < opcode->NumSrcRegs; ++i) {
+                       if (inst->I.SrcReg[i].File == RC_FILE_TEMPORARY)
                                ta[inst->I.SrcReg[i].Index].LastRead = inst;
                }
        }
 
        /* Pass 3: Register allocation */
        for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
-               GLuint numsrcs = _mesa_num_inst_src_regs(inst->I.Opcode);
-               GLuint numdsts = _mesa_num_inst_dst_regs(inst->I.Opcode);
+               const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->I.Opcode);
 
-               for (i = 0; i < numsrcs; ++i) {
-                       if (inst->I.SrcReg[i].File == PROGRAM_TEMPORARY) {
-                               GLuint orig = inst->I.SrcReg[i].Index;
+               for (i = 0; i < opcode->NumSrcRegs; ++i) {
+                       if (inst->I.SrcReg[i].File == RC_FILE_TEMPORARY) {
+                               unsigned int orig = inst->I.SrcReg[i].Index;
                                inst->I.SrcReg[i].Index = ta[orig].HwTemp;
 
                                if (ta[orig].Allocated && inst == ta[orig].LastRead)
-                                       hwtemps[ta[orig].HwTemp] = GL_FALSE;
+                                       hwtemps[ta[orig].HwTemp] = 0;
                        }
                }
 
-               if (numdsts) {
-                       if (inst->I.DstReg.File == PROGRAM_TEMPORARY) {
-                               GLuint orig = inst->I.DstReg.Index;
+               if (opcode->HasDstReg) {
+                       if (inst->I.DstReg.File == RC_FILE_TEMPORARY) {
+                               unsigned int orig = inst->I.DstReg.Index;
 
                                if (!ta[orig].Allocated) {
                                        for(j = 0; j < VSF_MAX_FRAGMENT_TEMPS; ++j) {
@@ -444,9 +462,9 @@ static void allocate_temporary_registers(struct r300_vertex_program_compiler * c
                                        if (j >= VSF_MAX_FRAGMENT_TEMPS) {
                                                fprintf(stderr, "Out of hw temporaries\n");
                                        } else {
-                                               ta[orig].Allocated = GL_TRUE;
+                                               ta[orig].Allocated = 1;
                                                ta[orig].HwTemp = j;
-                                               hwtemps[j] = GL_TRUE;
+                                               hwtemps[j] = 1;
 
                                                if (j >= compiler->code->num_temporaries)
                                                        compiler->code->num_temporaries = j + 1;
@@ -464,45 +482,45 @@ static void allocate_temporary_registers(struct r300_vertex_program_compiler * c
  * Vertex engine cannot read two inputs or two constants at the same time.
  * Introduce intermediate MOVs to temporary registers to account for this.
  */
-static GLboolean transform_source_conflicts(
+static int transform_source_conflicts(
        struct radeon_compiler *c,
        struct rc_instruction* inst,
        void* unused)
 {
-       GLuint num_operands = _mesa_num_inst_src_regs(inst->I.Opcode);
+       const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->I.Opcode);
 
-       if (num_operands == 3) {
+       if (opcode->NumSrcRegs == 3) {
                if (t_src_conflict(inst->I.SrcReg[1], inst->I.SrcReg[2])
                    || t_src_conflict(inst->I.SrcReg[0], inst->I.SrcReg[2])) {
                        int tmpreg = rc_find_free_temporary(c);
                        struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev);
-                       inst_mov->I.Opcode = OPCODE_MOV;
-                       inst_mov->I.DstReg.File = PROGRAM_TEMPORARY;
+                       inst_mov->I.Opcode = RC_OPCODE_MOV;
+                       inst_mov->I.DstReg.File = RC_FILE_TEMPORARY;
                        inst_mov->I.DstReg.Index = tmpreg;
                        inst_mov->I.SrcReg[0] = inst->I.SrcReg[2];
 
                        reset_srcreg(&inst->I.SrcReg[2]);
-                       inst->I.SrcReg[2].File = PROGRAM_TEMPORARY;
+                       inst->I.SrcReg[2].File = RC_FILE_TEMPORARY;
                        inst->I.SrcReg[2].Index = tmpreg;
                }
        }
 
-       if (num_operands >= 2) {
+       if (opcode->NumSrcRegs >= 2) {
                if (t_src_conflict(inst->I.SrcReg[1], inst->I.SrcReg[0])) {
                        int tmpreg = rc_find_free_temporary(c);
                        struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev);
-                       inst_mov->I.Opcode = OPCODE_MOV;
-                       inst_mov->I.DstReg.File = PROGRAM_TEMPORARY;
+                       inst_mov->I.Opcode = RC_OPCODE_MOV;
+                       inst_mov->I.DstReg.File = RC_FILE_TEMPORARY;
                        inst_mov->I.DstReg.Index = tmpreg;
                        inst_mov->I.SrcReg[0] = inst->I.SrcReg[1];
 
                        reset_srcreg(&inst->I.SrcReg[1]);
-                       inst->I.SrcReg[1].File = PROGRAM_TEMPORARY;
+                       inst->I.SrcReg[1].File = RC_FILE_TEMPORARY;
                        inst->I.SrcReg[1].Index = tmpreg;
                }
        }
 
-       return GL_TRUE;
+       return 1;
 }
 
 static void addArtificialOutputs(struct r300_vertex_program_compiler * compiler)
@@ -513,44 +531,52 @@ static void addArtificialOutputs(struct r300_vertex_program_compiler * compiler)
                if ((compiler->RequiredOutputs & (1 << i)) &&
                    !(compiler->Base.Program.OutputsWritten & (1 << i))) {
                        struct rc_instruction * inst = rc_insert_new_instruction(&compiler->Base, compiler->Base.Program.Instructions.Prev);
-                       inst->I.Opcode = OPCODE_MOV;
+                       inst->I.Opcode = RC_OPCODE_MOV;
 
-                       inst->I.DstReg.File = PROGRAM_OUTPUT;
+                       inst->I.DstReg.File = RC_FILE_OUTPUT;
                        inst->I.DstReg.Index = i;
-                       inst->I.DstReg.WriteMask = WRITEMASK_XYZW;
+                       inst->I.DstReg.WriteMask = RC_MASK_XYZW;
 
-                       inst->I.SrcReg[0].File = PROGRAM_CONSTANT;
+                       inst->I.SrcReg[0].File = RC_FILE_CONSTANT;
                        inst->I.SrcReg[0].Index = 0;
-                       inst->I.SrcReg[0].Swizzle = SWIZZLE_XYZW;
+                       inst->I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
 
                        compiler->Base.Program.OutputsWritten |= 1 << i;
                }
        }
 }
 
-static void nqssadceInit(struct nqssadce_state* s)
+static void dataflow_outputs_mark_used(void * userdata, void * data,
+               void (*callback)(void *, unsigned int, unsigned int))
 {
-       struct r300_vertex_program_compiler * compiler = s->UserData;
+       struct r300_vertex_program_compiler * c = userdata;
        int i;
 
-       for(i = 0; i < VERT_RESULT_MAX; ++i) {
-               if (compiler->RequiredOutputs & (1 << i))
-                       s->Outputs[i].Sourced = WRITEMASK_XYZW;
+       for(i = 0; i < 32; ++i) {
+               if (c->RequiredOutputs & (1 << i))
+                       callback(data, i, RC_MASK_XYZW);
        }
 }
 
-static GLboolean swizzleIsNative(GLuint opcode, struct prog_src_register reg)
+static int swizzle_is_native(rc_opcode opcode, struct rc_src_register reg)
 {
        (void) opcode;
        (void) reg;
 
-       return GL_TRUE;
+       return 1;
 }
 
 
+static struct rc_swizzle_caps r300_vertprog_swizzle_caps = {
+       .IsNative = &swizzle_is_native,
+       .Split = 0 /* should never be called */
+};
+
 
 void r3xx_compile_vertex_program(struct r300_vertex_program_compiler* compiler)
 {
+       compiler->Base.SwizzleCaps = &r300_vertprog_swizzle_caps;
+
        addArtificialOutputs(compiler);
 
        {
@@ -562,8 +588,8 @@ void r3xx_compile_vertex_program(struct r300_vertex_program_compiler* compiler)
 
        if (compiler->Base.Debug) {
                fprintf(stderr, "Vertex program after native rewrite:\n");
-               rc_print_program(&compiler->Base.Program);
-               fflush(stdout);
+               rc_print_program(&compiler->Base.Program, 0);
+               fflush(stderr);
        }
 
        {
@@ -579,26 +605,22 @@ void r3xx_compile_vertex_program(struct r300_vertex_program_compiler* compiler)
 
        if (compiler->Base.Debug) {
                fprintf(stderr, "Vertex program after source conflict resolve:\n");
-               rc_print_program(&compiler->Base.Program);
-               fflush(stdout);
+               rc_print_program(&compiler->Base.Program, 0);
+               fflush(stderr);
        }
 
-       {
-               struct radeon_nqssadce_descr nqssadce = {
-                       .Init = &nqssadceInit,
-                       .IsNativeSwizzle = &swizzleIsNative,
-                       .BuildSwizzle = NULL
-               };
-               radeonNqssaDce(&compiler->Base, &nqssadce, compiler);
+       rc_dataflow_annotate(&compiler->Base, &dataflow_outputs_mark_used, compiler);
+       rc_dataflow_dealias(&compiler->Base);
+       rc_dataflow_swizzles(&compiler->Base);
 
-               /* We need this step for reusing temporary registers */
-               allocate_temporary_registers(compiler);
+       /* This invalidates dataflow annotations and should be replaced
+        * by a future generic register allocation pass. */
+       allocate_temporary_registers(compiler);
 
-               if (compiler->Base.Debug) {
-                       fprintf(stderr, "Vertex program after NQSSADCE:\n");
-                       rc_print_program(&compiler->Base.Program);
-                       fflush(stdout);
-               }
+       if (compiler->Base.Debug) {
+               fprintf(stderr, "Vertex program after dataflow:\n");
+               rc_print_program(&compiler->Base.Program, 0);
+               fflush(stderr);
        }
 
        translate_vertex_program(compiler);
@@ -609,7 +631,7 @@ void r3xx_compile_vertex_program(struct r300_vertex_program_compiler* compiler)
        compiler->code->OutputsWritten = compiler->Base.Program.OutputsWritten;
 
        if (compiler->Base.Debug) {
-               printf("Final vertex program code:\n");
+               fprintf(stderr, "Final vertex program code:\n");
                r300_vertex_program_dump(compiler->code);
        }
 }