Merge branch 'llvm-cliptest-viewport'
[mesa.git] / src / mesa / drivers / dri / r300 / compiler / r3xx_vertprog.c
index 02bef5603f38d9e91884e312cb3e4f5f4ecfdc9c..bf8341f0173662a826c003e7e8fbd4b8ed4e75be 100644 (file)
 #include "radeon_program_alu.h"
 #include "radeon_swizzle.h"
 #include "radeon_emulate_branches.h"
+#include "radeon_emulate_loops.h"
+#include "radeon_remove_constants.h"
+
+struct loop {  /* Bookkeeping for one BGNLOOP that is still open during translation. */
+       int BgnLoop;    /* compiler->code->length / 4 at the moment BGNLOOP was seen */
+
+};
 
 /*
  * Take an already-setup and valid source then swizzle it appropriately to
@@ -145,7 +152,8 @@ static unsigned long t_src(struct r300_vertex_program_code *vp,
                               t_swizzle(GET_SWZ(src->Swizzle, 2)),
                               t_swizzle(GET_SWZ(src->Swizzle, 3)),
                               t_src_class(src->File),
-                              src->Negate) | (src->RelAddr << 4);
+                              src->Negate) |
+              (src->RelAddr << 4) | (src->Abs << 3);
 }
 
 static unsigned long t_src_scalar(struct r300_vertex_program_code *vp,
@@ -161,7 +169,7 @@ static unsigned long t_src_scalar(struct r300_vertex_program_code *vp,
                               t_swizzle(GET_SWZ(src->Swizzle, 0)),
                               t_src_class(src->File),
                               src->Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
-           (src->RelAddr << 4);
+              (src->RelAddr << 4) | (src->Abs << 3);
 }
 
 static int valid_dst(struct r300_vertex_program_code *vp,
@@ -330,37 +338,192 @@ static void ei_pow(struct r300_vertex_program_code *vp,
        inst[3] = t_src_scalar(vp, &vpi->SrcReg[1]);
 }
 
+/**
+ * Callback for rc_for_all_writes_mask(): accumulates, per temporary
+ * register, the components that are written.
+ *
+ * \param userdata  array of write masks indexed by temporary register index
+ * \param inst      instruction performing the write (not used here)
+ * \param file      register file of the register being written
+ * \param index     index of the register being written
+ * \param mask      components being written
+ */
+static void mark_write(void * userdata,        struct rc_instruction * inst,
+               rc_register_file file,  unsigned int index, unsigned int mask)
+{
+       unsigned int * written = userdata;
+
+       /* Only temporaries below R300_VS_MAX_TEMPS are tracked; every other
+        * write is ignored. */
+       if (file == RC_FILE_TEMPORARY && index < R300_VS_MAX_TEMPS)
+               written[index] |= mask;
+}
+
+/**
+ * Build the source operand that reads the predicate register: the temporary
+ * at compiler->PredicateIndex, swizzled (ZERO, ZERO, ZERO, W), with 0 passed
+ * as the final (negate) argument.
+ */
+static unsigned long t_pred_src(struct r300_vertex_program_compiler * compiler)
+{
+       unsigned long operand;
+
+       operand = PVS_SRC_OPERAND(compiler->PredicateIndex,
+               t_swizzle(RC_SWIZZLE_ZERO),
+               t_swizzle(RC_SWIZZLE_ZERO),
+               t_swizzle(RC_SWIZZLE_ZERO),
+               t_swizzle(RC_SWIZZLE_W),
+               t_src_class(RC_FILE_TEMPORARY),
+               0);
+
+       return operand;
+}
+
+/**
+ * Build the opcode/destination word of an instruction that writes the
+ * predicate register: the W component of the temporary at
+ * compiler->PredicateIndex.
+ *
+ * \param hw_opcode  hardware opcode to encode
+ * \param is_math    passed through as the second PVS_OP_DST_OPERAND argument;
+ *                   the callers in this file pass 1 for ME_* opcodes and 0
+ *                   for VE_* opcodes
+ */
+static unsigned long t_pred_dst(struct r300_vertex_program_compiler * compiler,
+                                       unsigned int hw_opcode, int is_math)
+{
+       unsigned long dst;
+
+       dst = PVS_OP_DST_OPERAND(hw_opcode,
+            is_math,
+            0,
+            compiler->PredicateIndex,
+            RC_MASK_W,
+            t_dst_class(RC_FILE_TEMPORARY));
+
+       return dst;
+}
+
+/**
+ * Emit the hardware instruction for an IF. Only supported on R500.
+ *
+ * The first time it is needed, a temporary whose W component is never
+ * written by the program is reserved as the predicate stack counter and
+ * recorded in compiler->PredicateIndex / compiler->PredicateMask.
+ *
+ * At branch depth 0 a math-engine ME_PRED_SET_NEQ is emitted that reads only
+ * the condition; inside an enclosing IF a vector-engine VE_PRED_SET_NEQ_PUSH
+ * is emitted that reads the predicate register and the condition.
+ *
+ * Side effect: the swizzle of rci->U.I.SrcReg[0] is overwritten so that all
+ * four channels select the condition's first component.
+ *
+ * On error (not R500, or no free temporary) the problem is reported through
+ * rc_error() and inst is left untouched.
+ *
+ * \param compiler      vertex program compiler state
+ * \param rci           the IF instruction being translated
+ * \param inst          output: the four words of the hardware instruction
+ * \param branch_depth  number of enclosing IFs that are still open
+ */
+static void ei_if(struct r300_vertex_program_compiler * compiler,
+                                       struct rc_instruction *rci,
+                                       unsigned int * inst,
+                                       unsigned int branch_depth)
+{
+       unsigned int predicate_opcode;
+       int is_math = 0;
+
+       if (!compiler->Base.is_r500) {
+               rc_error(&compiler->Base,"Opcode IF not supported\n");
+               return;
+       }
+
+       /* Reserve a temporary to use as our predicate stack counter, if we
+        * don't already have one. */
+       if (!compiler->PredicateMask) {
+               unsigned int writemasks[RC_REGISTER_MAX_INDEX];
+               /* Named "scan" so it does not shadow the "inst" output
+                * parameter. */
+               struct rc_instruction * scan;
+               unsigned int i;
+
+               /* Collect, per temporary, every component written anywhere
+                * in the program. */
+               memset(writemasks, 0, sizeof(writemasks));
+               for(scan = compiler->Base.Program.Instructions.Next;
+                               scan != &compiler->Base.Program.Instructions;
+                                                       scan = scan->Next) {
+                       rc_for_all_writes_mask(scan, mark_write, writemasks);
+               }
+               for(i = 0; i < compiler->Base.max_temp_regs; i++) {
+                       unsigned int mask = ~writemasks[i] & RC_MASK_XYZW;
+                       /* Only the W component can be used for the predicate
+                        * stack counter. */
+                       if (mask & RC_MASK_W) {
+                               compiler->PredicateMask = RC_MASK_W;
+                               compiler->PredicateIndex = i;
+                               break;
+                       }
+               }
+               if (i == compiler->Base.max_temp_regs) {
+                       rc_error(&compiler->Base, "No free temporary to use for"
+                                       " predicate stack counter.\n");
+                       return;
+               }
+       }
+
+       /* Broadcast the first component of the condition to all channels. */
+       rci->U.I.SrcReg[0].Swizzle = RC_MAKE_SWIZZLE_SMEAR(GET_SWZ(rci->U.I.SrcReg[0].Swizzle,0));
+       if (branch_depth == 0) {
+               /* Outermost IF: math op reading only the condition. */
+               is_math = 1;
+               predicate_opcode = ME_PRED_SET_NEQ;
+               inst[1] = t_src(compiler->code, &rci->U.I.SrcReg[0]);
+               inst[2] = 0;
+       } else {
+               /* Nested IF: vector op reading the predicate register and
+                * the condition. */
+               predicate_opcode = VE_PRED_SET_NEQ_PUSH;
+               inst[1] = t_pred_src(compiler);
+               inst[2] = t_src(compiler->code, &rci->U.I.SrcReg[0]);
+       }
+
+       inst[0] = t_pred_dst(compiler, predicate_opcode, is_math);
+       inst[3] = 0;
+}
+
+/**
+ * Emit the hardware instruction for an ELSE: a math-engine ME_PRED_SET_INV
+ * that reads and writes the predicate register. Only supported on R500; on
+ * other chips an error is reported and inst is left untouched.
+ *
+ * \param compiler  vertex program compiler state
+ * \param inst      output: the four words of the hardware instruction
+ */
+static void ei_else(struct r300_vertex_program_compiler * compiler,
+                                                       unsigned int * inst)
+{
+       if (compiler->Base.is_r500) {
+               inst[0] = t_pred_dst(compiler, ME_PRED_SET_INV, 1);
+               inst[1] = t_pred_src(compiler);
+               inst[2] = 0;
+               inst[3] = 0;
+       } else {
+               rc_error(&compiler->Base,"Opcode ELSE not supported\n");
+       }
+}
+
+/**
+ * Emit the hardware instruction for an ENDIF: a math-engine ME_PRED_SET_POP
+ * that reads and writes the predicate register. Only supported on R500; on
+ * other chips an error is reported and inst is left untouched.
+ *
+ * \param compiler  vertex program compiler state
+ * \param inst      output: the four words of the hardware instruction
+ */
+static void ei_endif(struct r300_vertex_program_compiler *compiler,
+                                                       unsigned int * inst)
+{
+       if (compiler->Base.is_r500) {
+               inst[0] = t_pred_dst(compiler, ME_PRED_SET_POP, 1);
+               inst[1] = t_pred_src(compiler);
+               inst[2] = 0;
+               inst[3] = 0;
+       } else {
+               rc_error(&compiler->Base,"Opcode ENDIF not supported\n");
+       }
+}
 
-static void translate_vertex_program(struct r300_vertex_program_compiler * compiler)
+static void translate_vertex_program(struct radeon_compiler *c, void *user)
 {
+       struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler*)c;
        struct rc_instruction *rci;
 
+       struct loop * loops = NULL;
+       int current_loop_depth = 0;
+       int loops_reserved = 0;
+
+       unsigned int branch_depth = 0;
+
        compiler->code->pos_end = 0;    /* Not supported yet */
        compiler->code->length = 0;
+       compiler->code->num_temporaries = 0;
 
        compiler->SetHwInputOutput(compiler);
 
        for(rci = compiler->Base.Program.Instructions.Next; rci != &compiler->Base.Program.Instructions; rci = rci->Next) {
                struct rc_sub_instruction *vpi = &rci->U.I;
                unsigned int *inst = compiler->code->body.d + compiler->code->length;
+               const struct rc_opcode_info *info = rc_get_opcode_info(vpi->Opcode);
 
                /* Skip instructions writing to non-existing destination */
                if (!valid_dst(compiler->code, &vpi->DstReg))
                        continue;
 
-               if (compiler->code->length >= VSF_MAX_FRAGMENT_LENGTH) {
+               if (info->HasDstReg) {
+                       /* Relative addressing of destination operands is not supported yet. */
+                       if (vpi->DstReg.RelAddr) {
+                               rc_error(&compiler->Base, "Vertex program does not support relative "
+                                        "addressing of destination operands (yet).\n");
+                               return;
+                       }
+
+                       /* Neither is Saturate. */
+                       if (vpi->SaturateMode != RC_SATURATE_NONE) {
+                               rc_error(&compiler->Base, "Vertex program does not support the Saturate "
+                                        "modifier (yet).\n");
+                       }
+               }
+
+               if (compiler->code->length >= c->max_alu_insts * 4) {
                        rc_error(&compiler->Base, "Vertex program has too many instructions\n");
                        return;
                }
 
+               assert(compiler->Base.is_r500 ||
+                      (vpi->Opcode != RC_OPCODE_SEQ &&
+                       vpi->Opcode != RC_OPCODE_SNE));
+
                switch (vpi->Opcode) {
                case RC_OPCODE_ADD: ei_vector2(compiler->code, VE_ADD, vpi, inst); break;
                case RC_OPCODE_ARL: ei_vector1(compiler->code, VE_FLT2FIX_DX, vpi, inst); break;
+               case RC_OPCODE_COS: ei_math1(compiler->code, ME_COS, vpi, inst); break;
                case RC_OPCODE_DP4: ei_vector2(compiler->code, VE_DOT_PRODUCT, vpi, inst); break;
                case RC_OPCODE_DST: ei_vector2(compiler->code, VE_DISTANCE_VECTOR, vpi, inst); break;
+               case RC_OPCODE_ELSE: ei_else(compiler, inst); break;
+               case RC_OPCODE_ENDIF: ei_endif(compiler, inst); branch_depth--; break;
                case RC_OPCODE_EX2: ei_math1(compiler->code, ME_EXP_BASE2_FULL_DX, vpi, inst); break;
                case RC_OPCODE_EXP: ei_math1(compiler->code, ME_EXP_BASE2_DX, vpi, inst); break;
                case RC_OPCODE_FRC: ei_vector1(compiler->code, VE_FRACTION, vpi, inst); break;
+               case RC_OPCODE_IF: ei_if(compiler, rci, inst, branch_depth); branch_depth++; break;
                case RC_OPCODE_LG2: ei_math1(compiler->code, ME_LOG_BASE2_FULL_DX, vpi, inst); break;
                case RC_OPCODE_LIT: ei_lit(compiler->code, vpi, inst); break;
                case RC_OPCODE_LOG: ei_math1(compiler->code, ME_LOG_BASE2_DX, vpi, inst); break;
@@ -372,10 +535,113 @@ static void translate_vertex_program(struct r300_vertex_program_compiler * compi
                case RC_OPCODE_POW: ei_pow(compiler->code, vpi, inst); break;
                case RC_OPCODE_RCP: ei_math1(compiler->code, ME_RECIP_DX, vpi, inst); break;
                case RC_OPCODE_RSQ: ei_math1(compiler->code, ME_RECIP_SQRT_DX, vpi, inst); break;
+               case RC_OPCODE_SEQ: ei_vector2(compiler->code, VE_SET_EQUAL, vpi, inst); break;
                case RC_OPCODE_SGE: ei_vector2(compiler->code, VE_SET_GREATER_THAN_EQUAL, vpi, inst); break;
+               case RC_OPCODE_SIN: ei_math1(compiler->code, ME_SIN, vpi, inst); break;
                case RC_OPCODE_SLT: ei_vector2(compiler->code, VE_SET_LESS_THAN, vpi, inst); break;
+               case RC_OPCODE_SNE: ei_vector2(compiler->code, VE_SET_NOT_EQUAL, vpi, inst); break;
+               case RC_OPCODE_BGNLOOP:
+               {
+                       struct loop * l;
+
+                       if ((!compiler->Base.is_r500
+                               && loops_reserved >= R300_VS_MAX_LOOP_DEPTH)
+                               || loops_reserved >= R500_VS_MAX_FC_DEPTH) {
+                               rc_error(&compiler->Base,
+                                               "Loops are nested too deep.");
+                               return;
+                       }
+                       memory_pool_array_reserve(&compiler->Base.Pool,
+                                       struct loop, loops, current_loop_depth,
+                                       loops_reserved, 1);
+                       l = &loops[current_loop_depth++];
+                       memset(l , 0, sizeof(struct loop));
+                       l->BgnLoop = (compiler->code->length / 4);
+                       continue;
+               }
+               case RC_OPCODE_ENDLOOP:
+               {
+                       struct loop * l;
+                       unsigned int act_addr;
+                       unsigned int last_addr;
+                       unsigned int ret_addr;
+
+                       assert(loops);
+                       l = &loops[current_loop_depth - 1];
+                       act_addr = l->BgnLoop - 1;
+                       last_addr = (compiler->code->length / 4) - 1;
+                       ret_addr = l->BgnLoop;
+
+                       if (loops_reserved >= R300_VS_MAX_FC_OPS) {
+                               rc_error(&compiler->Base,
+                                       "Too many flow control instructions.");
+                               return;
+                       }
+                       if (compiler->Base.is_r500) {
+                               compiler->code->fc_op_addrs.r500
+                                       [compiler->code->num_fc_ops].lw =
+                                       R500_PVS_FC_ACT_ADRS(act_addr)
+                                       | R500_PVS_FC_LOOP_CNT_JMP_INST(0xffff)
+                                       ;
+                               compiler->code->fc_op_addrs.r500
+                                       [compiler->code->num_fc_ops].uw =
+                                       R500_PVS_FC_LAST_INST(last_addr)
+                                       | R500_PVS_FC_RTN_INST(ret_addr)
+                                       ;
+                       } else {
+                               compiler->code->fc_op_addrs.r300
+                                       [compiler->code->num_fc_ops] =
+                                       R300_PVS_FC_ACT_ADRS(act_addr)
+                                       | R300_PVS_FC_LOOP_CNT_JMP_INST(0xff)
+                                       | R300_PVS_FC_LAST_INST(last_addr)
+                                       | R300_PVS_FC_RTN_INST(ret_addr)
+                                       ;
+                       }
+                       compiler->code->fc_loop_index[compiler->code->num_fc_ops] =
+                               R300_PVS_FC_LOOP_INIT_VAL(0x0)
+                               | R300_PVS_FC_LOOP_STEP_VAL(0x1)
+                               ;
+                       compiler->code->fc_ops |= R300_VAP_PVS_FC_OPC_LOOP(
+                                               compiler->code->num_fc_ops);
+                       compiler->code->num_fc_ops++;
+                       current_loop_depth--;
+                       continue;
+               }
+
                default:
-                       rc_error(&compiler->Base, "Unknown opcode %s\n", rc_get_opcode_info(vpi->Opcode)->Name);
+                       rc_error(&compiler->Base, "Unknown opcode %s\n", info->Name);
+                       return;
+               }
+
+               /* Non-flow control instructions that are inside an if statement
+                * need to pay attention to the predicate bit. */
+               if (branch_depth
+                       && vpi->Opcode != RC_OPCODE_IF
+                       && vpi->Opcode != RC_OPCODE_ELSE
+                       && vpi->Opcode != RC_OPCODE_ENDIF) {
+
+                       inst[0] |= (PVS_DST_PRED_ENABLE_MASK
+                                               << PVS_DST_PRED_ENABLE_SHIFT);
+                       inst[0] |= (PVS_DST_PRED_SENSE_MASK
+                                               << PVS_DST_PRED_SENSE_SHIFT);
+               }
+
+               /* Update the number of temporaries. */
+               if (info->HasDstReg && vpi->DstReg.File == RC_FILE_TEMPORARY &&
+                   vpi->DstReg.Index >= compiler->code->num_temporaries)
+                       compiler->code->num_temporaries = vpi->DstReg.Index + 1;
+
+               for (unsigned i = 0; i < info->NumSrcRegs; i++)
+                       if (vpi->SrcReg[i].File == RC_FILE_TEMPORARY &&
+                           vpi->SrcReg[i].Index >= compiler->code->num_temporaries)
+                               compiler->code->num_temporaries = vpi->SrcReg[i].Index + 1;
+
+               if (compiler->PredicateMask)
+                       if (compiler->PredicateIndex >= compiler->code->num_temporaries)
+                               compiler->code->num_temporaries = compiler->PredicateIndex + 1;
+
+               if (compiler->code->num_temporaries > compiler->Base.max_temp_regs) {
+                       rc_error(&compiler->Base, "Too many temporaries.\n");
                        return;
                }
 
@@ -392,18 +658,22 @@ struct temporary_allocation {
        struct rc_instruction * LastRead;
 };
 
-static void allocate_temporary_registers(struct r300_vertex_program_compiler * compiler)
+static void allocate_temporary_registers(struct radeon_compiler *c, void *user)
 {
+       struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler*)c;
        struct rc_instruction *inst;
+       struct rc_instruction *end_loop = NULL;
        unsigned int num_orig_temps = 0;
-       char hwtemps[VSF_MAX_FRAGMENT_TEMPS];
+       char hwtemps[RC_REGISTER_MAX_INDEX];
        struct temporary_allocation * ta;
        unsigned int i, j;
+       struct rc_instruction *last_inst_src_reladdr = NULL;
 
-       compiler->code->num_temporaries = 0;
        memset(hwtemps, 0, sizeof(hwtemps));
 
-       /* Pass 1: Count original temporaries and allocate structures */
+       rc_recompute_ips(c);
+
+       /* Pass 1: Count original temporaries. */
        for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
                const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
 
@@ -422,31 +692,92 @@ static void allocate_temporary_registers(struct r300_vertex_program_compiler * c
                }
        }
 
+       /* Pass 2: If there is relative addressing of dst temporaries, we cannot change register indices. Give up.
+        * For src temporaries, save the last instruction which uses relative addressing. */
+       for (inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
+               const struct rc_opcode_info *opcode = rc_get_opcode_info(inst->U.I.Opcode);
+
+               if (opcode->HasDstReg)
+                       if (inst->U.I.DstReg.RelAddr)
+                               return;
+
+               for (i = 0; i < opcode->NumSrcRegs; ++i) {
+                       if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY &&
+                           inst->U.I.SrcReg[i].RelAddr) {
+                               last_inst_src_reladdr = inst;
+                       }
+               }
+       }
+
        ta = (struct temporary_allocation*)memory_pool_malloc(&compiler->Base.Pool,
                        sizeof(struct temporary_allocation) * num_orig_temps);
        memset(ta, 0, sizeof(struct temporary_allocation) * num_orig_temps);
 
-       /* Pass 2: Determine original temporary lifetimes */
+       /* Pass 3: Determine original temporary lifetimes */
        for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
                const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
+               /* Instructions inside of loops need to use the ENDLOOP
+                * instruction as their LastRead. */
+               if (!end_loop && inst->U.I.Opcode == RC_OPCODE_BGNLOOP) {
+                       int endloops = 1;
+                       struct rc_instruction * ptr;
+                       for(ptr = inst->Next;
+                               ptr != &compiler->Base.Program.Instructions;
+                                                       ptr = ptr->Next){
+                               if (ptr->U.I.Opcode == RC_OPCODE_BGNLOOP) {
+                                       endloops++;
+                               } else if (ptr->U.I.Opcode == RC_OPCODE_ENDLOOP) {
+                                       endloops--;
+                                       if (endloops <= 0) {
+                                               end_loop = ptr;
+                                               break;
+                                       }
+                               }
+                       }
+               }
+
+               if (inst == end_loop) {
+                       end_loop = NULL;
+                       continue;
+               }
 
                for (i = 0; i < opcode->NumSrcRegs; ++i) {
-                       if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY)
-                               ta[inst->U.I.SrcReg[i].Index].LastRead = inst;
+                       if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
+                               struct rc_instruction *last_read;
+
+                               /* From "last_inst_src_reladdr", "end_loop", and "inst",
+                                * select the instruction with the highest instruction index (IP).
+                                * Note that "end_loop", if available, has always a higher index than "inst". */
+                               if (last_inst_src_reladdr) {
+                                       if (end_loop) {
+                                               last_read = last_inst_src_reladdr->IP > end_loop->IP ?
+                                                           last_inst_src_reladdr : end_loop;
+                                       } else {
+                                               last_read = last_inst_src_reladdr->IP > inst->IP ?
+                                                           last_inst_src_reladdr : inst;
+                                       }
+                               } else {
+                                       last_read = end_loop ? end_loop : inst;
+                               }
+
+                               ta[inst->U.I.SrcReg[i].Index].LastRead = last_read;
+                       }
                }
        }
 
-       /* Pass 3: Register allocation */
+       /* Pass 4: Register allocation */
        for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
                const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
 
-               for (i = 0; i < opcode->NumSrcRegs; ++i) {
-                       if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
-                               unsigned int orig = inst->U.I.SrcReg[i].Index;
-                               inst->U.I.SrcReg[i].Index = ta[orig].HwTemp;
+               if (!last_inst_src_reladdr || last_inst_src_reladdr->IP < inst->IP) {
+                       for (i = 0; i < opcode->NumSrcRegs; ++i) {
+                               if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
+                                       unsigned int orig = inst->U.I.SrcReg[i].Index;
+                                       inst->U.I.SrcReg[i].Index = ta[orig].HwTemp;
 
-                               if (ta[orig].Allocated && inst == ta[orig].LastRead)
-                                       hwtemps[ta[orig].HwTemp] = 0;
+                                       if (ta[orig].Allocated && inst == ta[orig].LastRead)
+                                               hwtemps[ta[orig].HwTemp] = 0;
+                               }
                        }
                }
 
@@ -455,19 +786,22 @@ static void allocate_temporary_registers(struct r300_vertex_program_compiler * c
                                unsigned int orig = inst->U.I.DstReg.Index;
 
                                if (!ta[orig].Allocated) {
-                                       for(j = 0; j < VSF_MAX_FRAGMENT_TEMPS; ++j) {
+                                       for(j = 0; j < c->max_temp_regs; ++j) {
                                                if (!hwtemps[j])
                                                        break;
                                        }
-                                       if (j >= VSF_MAX_FRAGMENT_TEMPS) {
-                                               fprintf(stderr, "Out of hw temporaries\n");
+                                       if (j >= c->max_temp_regs) {
+                                               rc_error(c, "Too many temporaries\n");
+                                               return;
                                        } else {
                                                ta[orig].Allocated = 1;
-                                               ta[orig].HwTemp = j;
-                                               hwtemps[j] = 1;
-
-                                               if (j >= compiler->code->num_temporaries)
-                                                       compiler->code->num_temporaries = j + 1;
+                                               if (last_inst_src_reladdr &&
+                                                   last_inst_src_reladdr->IP > inst->IP) {
+                                                       ta[orig].HwTemp = orig;
+                                               } else {
+                                                       ta[orig].HwTemp = j;
+                                               }
+                                               hwtemps[ta[orig].HwTemp] = 1;
                                        }
                                }
 
@@ -477,6 +811,44 @@ static void allocate_temporary_registers(struct r300_vertex_program_compiler * c
        }
 }
 
+/**
+ * The R3xx-R4xx vertex engine supports neither the Absolute source operand
+ * modifier nor the Saturate opcode modifier. This transformation handles
+ * Absolute only: each source carrying it is replaced by a fresh temporary
+ * that a newly inserted MAX(a, -a) in front of the instruction computes.
+ *
+ * \param c       compiler the instruction belongs to
+ * \param inst    instruction whose sources are rewritten in place
+ * \param unused  ignored
+ * \return always 1
+ */
+static int transform_nonnative_modifiers(
+       struct radeon_compiler *c,
+       struct rc_instruction *inst,
+       void* unused)
+{
+       const struct rc_opcode_info *info = rc_get_opcode_info(inst->U.I.Opcode);
+       unsigned src;
+
+       for (src = 0; src < info->NumSrcRegs; src++) {
+               struct rc_instruction *max_inst;
+               unsigned tmp_index;
+
+               if (!inst->U.I.SrcReg[src].Abs)
+                       continue;
+
+               /* Clear the modifier first so that the operand copies made
+                * below do not carry it. */
+               inst->U.I.SrcReg[src].Abs = 0;
+
+               tmp_index = rc_find_free_temporary(c);
+
+               /* tmp = MAX(a, -a), placed directly before inst. */
+               max_inst = rc_insert_new_instruction(c, inst->Prev);
+               max_inst->U.I.Opcode = RC_OPCODE_MAX;
+               max_inst->U.I.DstReg.File = RC_FILE_TEMPORARY;
+               max_inst->U.I.DstReg.Index = tmp_index;
+               max_inst->U.I.SrcReg[0] = inst->U.I.SrcReg[src];
+               max_inst->U.I.SrcReg[1] = inst->U.I.SrcReg[src];
+               max_inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
+
+               /* The original operand now reads the temporary unmodified. */
+               memset(&inst->U.I.SrcReg[src], 0, sizeof(inst->U.I.SrcReg[src]));
+               inst->U.I.SrcReg[src].File = RC_FILE_TEMPORARY;
+               inst->U.I.SrcReg[src].Index = tmp_index;
+               inst->U.I.SrcReg[src].Swizzle = RC_SWIZZLE_XYZW;
+       }
+
+       return 1;
+}
 
 /**
  * Vertex engine cannot read two inputs or two constants at the same time.
@@ -523,8 +895,9 @@ static int transform_source_conflicts(
        return 1;
 }
 
-static void addArtificialOutputs(struct r300_vertex_program_compiler * compiler)
+static void rc_vs_add_artificial_outputs(struct radeon_compiler *c, void *user)
 {
+       struct r300_vertex_program_compiler * compiler = (struct r300_vertex_program_compiler*)c;
        int i;
 
        for(i = 0; i < 32; ++i) {
@@ -566,77 +939,143 @@ static int swizzle_is_native(rc_opcode opcode, struct rc_src_register reg)
        return 1;
 }
 
-static void debug_program_log(struct r300_vertex_program_compiler* c, const char * where)
+static void transform_negative_addressing(struct r300_vertex_program_compiler *c,
+                                         struct rc_instruction *arl,
+                                         struct rc_instruction *end,
+                                         int min_offset)
 {
-       if (c->Base.Debug) {
-               fprintf(stderr, "Vertex Program: %s\n", where);
-               rc_print_program(&c->Base.Program);
+       struct rc_instruction *inst, *add;
+       unsigned const_swizzle;
+
+       /* Transform ARL: insert an ADD in front of it that computes
+        * (ARL source + min_offset) into the X channel of a fresh temporary,
+        * with min_offset supplied as an immediate scalar constant, then make
+        * the ARL read the X channel of that temporary instead. */
+       add = rc_insert_new_instruction(&c->Base, arl->Prev);
+       add->U.I.Opcode = RC_OPCODE_ADD;
+       add->U.I.DstReg.File = RC_FILE_TEMPORARY;
+       add->U.I.DstReg.Index = rc_find_free_temporary(&c->Base);
+       add->U.I.DstReg.WriteMask = RC_MASK_X;
+       add->U.I.SrcReg[0] = arl->U.I.SrcReg[0];
+       add->U.I.SrcReg[1].File = RC_FILE_CONSTANT;
+       add->U.I.SrcReg[1].Index = rc_constants_add_immediate_scalar(&c->Base.Program.Constants,
+                                                                    min_offset, &const_swizzle);
+       add->U.I.SrcReg[1].Swizzle = const_swizzle;
+
+       arl->U.I.SrcReg[0].File = RC_FILE_TEMPORARY;
+       arl->U.I.SrcReg[0].Index = add->U.I.DstReg.Index;
+       arl->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XXXX;
+
+       /* Rewrite the offsets of relatively addressed sources in the
+        * instructions following arl, up to and excluding end, by
+        * subtracting min_offset from each of them. */
+       for (inst = arl->Next; inst != end; inst = inst->Next) {
+               const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
+
+               for (unsigned i = 0; i < opcode->NumSrcRegs; i++)
+                       if (inst->U.I.SrcReg[i].RelAddr)
+                               inst->U.I.SrcReg[i].Index -= min_offset;
        }
 }
 
-
-static struct rc_swizzle_caps r300_vertprog_swizzle_caps = {
-       .IsNative = &swizzle_is_native,
-       .Split = 0 /* should never be called */
-};
-
-
-void r3xx_compile_vertex_program(struct r300_vertex_program_compiler* compiler)
+static void rc_emulate_negative_addressing(struct radeon_compiler *compiler, void *user)
 {
-       compiler->Base.SwizzleCaps = &r300_vertprog_swizzle_caps;
-
-       addArtificialOutputs(compiler);
+       struct r300_vertex_program_compiler * c = (struct r300_vertex_program_compiler*)compiler;
+       struct rc_instruction *inst, *lastARL = NULL;
+       int min_offset = 0; /* lowest negative relative index seen since lastARL */
 
-       debug_program_log(compiler, "before compilation");
+       for (inst = c->Base.Program.Instructions.Next; inst != &c->Base.Program.Instructions; inst = inst->Next) {
+               const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
 
-       /* XXX Ideally this should be done only for r3xx, but since
-        * we don't have branching support for r5xx, we use the emulation
-        * on all chipsets. */
-       rc_emulate_branches(&compiler->Base);
+               if (inst->U.I.Opcode == RC_OPCODE_ARL) {
+                       if (lastARL != NULL && min_offset < 0) /* fix up the range of the previous ARL */
+                               transform_negative_addressing(c, lastARL, inst, min_offset);
 
-       debug_program_log(compiler, "after emulate branches");
+                       lastARL = inst;
+                       min_offset = 0;
+                       continue;
+               }
 
-       {
-               struct radeon_program_transformation transformations[] = {
-                       { &r300_transform_vertex_alu, 0 },
-               };
-               radeonLocalTransform(&compiler->Base, 1, transformations);
-       }
+               for (unsigned i = 0; i < opcode->NumSrcRegs; i++) {
+                       if (inst->U.I.SrcReg[i].RelAddr &&
+                           inst->U.I.SrcReg[i].Index < 0) {
+                               /* ARL must precede any indirect addressing.
+                                * Only sources with a negative relative index
+                                * reach this check. */
+                               if (lastARL == NULL) {
+                                       rc_error(&c->Base, "Vertex shader: Found relative addressing without ARL.");
+                                       return;
+                               }
 
-       debug_program_log(compiler, "after native rewrite");
-
-       {
-               /* Note: This pass has to be done seperately from ALU rewrite,
-                * otherwise non-native ALU instructions with source conflits
-                * will not be treated properly.
-                */
-               struct radeon_program_transformation transformations[] = {
-                       { &transform_source_conflicts, 0 },
-               };
-               radeonLocalTransform(&compiler->Base, 1, transformations);
+                               if (inst->U.I.SrcReg[i].Index < min_offset)
+                                       min_offset = inst->U.I.SrcReg[i].Index;
+                       }
+               }
        }
 
-       debug_program_log(compiler, "after source conflict resolve");
-
-       rc_dataflow_deadcode(&compiler->Base, &dataflow_outputs_mark_used, compiler);
-
-       debug_program_log(compiler, "after deadcode");
-
-       rc_dataflow_swizzles(&compiler->Base);
-
-       allocate_temporary_registers(compiler);
-
-       debug_program_log(compiler, "after dataflow");
-
-       translate_vertex_program(compiler);
-
-       rc_constants_copy(&compiler->code->constants, &compiler->Base.Program.Constants);
+       if (lastARL != NULL && min_offset < 0) /* inst is the list head here, so the range runs to the end of the program */
+               transform_negative_addressing(c, lastARL, inst, min_offset);
+}
 
-       compiler->code->InputsRead = compiler->Base.Program.InputsRead;
-       compiler->code->OutputsWritten = compiler->Base.Program.OutputsWritten;
+static struct rc_swizzle_caps r300_vertprog_swizzle_caps = {
+       .IsNative = &swizzle_is_native,
+       .Split = 0 /* no split callback is provided; it should never be called */
+};
 
-       if (compiler->Base.Debug) {
-               fprintf(stderr, "Final vertex program code:\n");
-               r300_vertex_program_dump(compiler->code);
-       }
+void r3xx_compile_vertex_program(struct r300_vertex_program_compiler *c)
+{
+       int is_r500 = c->Base.is_r500;
+       int kill_consts = c->Base.remove_unused_constants;
+       int opt = !c->Base.disable_optimizations;
+
+       /* Lists of instruction transformations, each terminated by a
+        * { 0, 0 } entry. */
+       struct radeon_program_transformation alu_rewrite_r500[] = {
+               { &r300_transform_vertex_alu, 0 },
+               { &r300_transform_trig_scale_vertex, 0 },
+               { 0, 0 }
+       };
+
+       struct radeon_program_transformation alu_rewrite_r300[] = {
+               { &r300_transform_vertex_alu, 0 },
+               { &r300_transform_trig_simple, 0 },
+               { 0, 0 }
+       };
+
+       /* Note: These passes have to be done separately from ALU rewrite,
+        * otherwise non-native ALU instructions with source conflicts
+        * or non-native modifiers will not be treated properly.
+        */
+       struct radeon_program_transformation emulate_modifiers[] = {
+               { &transform_nonnative_modifiers, 0 },
+               { 0, 0 }
+       };
+
+       struct radeon_program_transformation resolve_src_conflicts[] = {
+               { &transform_source_conflicts, 0 },
+               { 0, 0 }
+       };
+
+       /* List of compiler passes, handed to rc_run_compiler() below and
+        * terminated by an all-NULL entry. */
+       struct radeon_compiler_pass vs_list[] = {
+               /* NAME                         DUMP PREDICATE  FUNCTION                        PARAM */
+               {"add artificial outputs",      0, 1,           rc_vs_add_artificial_outputs,   NULL},
+               {"transform loops",             1, 1,           rc_transform_loops,             NULL},
+               {"emulate branches",            1, !is_r500,    rc_emulate_branches,            NULL},
+               {"emulate negative addressing", 1, 1,           rc_emulate_negative_addressing, NULL},
+               {"native rewrite",              1, is_r500,     rc_local_transform,             alu_rewrite_r500},
+               {"native rewrite",              1, !is_r500,    rc_local_transform,             alu_rewrite_r300},
+               {"emulate modifiers",           1, !is_r500,    rc_local_transform,             emulate_modifiers},
+               {"deadcode",                    1, opt,         rc_dataflow_deadcode,           dataflow_outputs_mark_used},
+               {"dataflow optimize",           1, opt,         rc_optimize,                    NULL},
+               /* This pass must be done after optimizations. */
+               {"source conflict resolve",     1, 1,           rc_local_transform,             resolve_src_conflicts},
+               {"dataflow swizzles",           1, 1,           rc_dataflow_swizzles,           NULL},
+               {"register allocation",         1, opt,         allocate_temporary_registers,   NULL},
+               {"dead constants",              1, kill_consts, rc_remove_unused_constants,     &c->code->constants_remap_table},
+               {"final code validation",       0, 1,           rc_validate_final_shader,       NULL},
+               {"machine code generation",     0, 1,           translate_vertex_program,       NULL},
+               {"dump machine code",           0, c->Base.Debug & RC_DBG_LOG, r300_vertex_program_dump,        NULL},
+               {NULL, 0, 0, NULL, NULL}
+       };
+
+       c->Base.SwizzleCaps = &r300_vertprog_swizzle_caps;
+
+       rc_run_compiler(&c->Base, vs_list, "Vertex Program");
+
+       c->code->InputsRead = c->Base.Program.InputsRead;
+       c->code->OutputsWritten = c->Base.Program.OutputsWritten;
+       rc_constants_copy(&c->code->constants, &c->Base.Program.Constants);
 }