r300/compiler: Rewrite register allocator
[mesa.git] / src / mesa / drivers / dri / r300 / compiler / radeon_pair_regalloc.c
index 828d0c8e28e7f514aa5ef8d12890cf20652478c2..52c0216b64b0965e68eef47d408c1a95df65fd64 100644 (file)
@@ -1,5 +1,6 @@
 /*
  * Copyright (C) 2009 Nicolai Haehnle.
+ * Copyright 2011 Tom Stellard <tstellar@gmail.com>
  *
  * All Rights Reserved.
  *
 
 #include <stdio.h>
 
+#include "main/glheader.h"
+#include "program/register_allocate.h"
+#include "ralloc.h"
+
 #include "radeon_compiler.h"
+#include "radeon_compiler_util.h"
 #include "radeon_dataflow.h"
-
+#include "radeon_list.h"
+#include "radeon_variable.h"
 
 #define VERBOSE 0
 
 #define DBG(...) do { if (VERBOSE) fprintf(stderr, __VA_ARGS__); } while(0)
 
 
-struct live_intervals {
-       int Start;
-       int End;
-       struct live_intervals * Next;
-};
 
 struct register_info {
-       struct live_intervals Live;
+       struct live_intervals Live[4];
 
        unsigned int Used:1;
        unsigned int Allocated:1;
-       rc_register_file File:3;
+       unsigned int File:3;
        unsigned int Index:RC_REGISTER_INDEX_BITS;
-};
-
-struct hardware_register {
-       struct live_intervals * Used;
+       unsigned int Writemask;
 };
 
 struct regalloc_state {
        struct radeon_compiler * C;
 
-       struct register_info Input[RC_REGISTER_MAX_INDEX];
-       struct register_info Temporary[RC_REGISTER_MAX_INDEX];
+       struct register_info * Input;
+       unsigned int NumInputs;
+
+       struct register_info * Temporary;
+       unsigned int NumTemporaries;
+
+       unsigned int Simple;
+       unsigned int HasLoop;
+};
+
+enum rc_reg_class {
+       RC_REG_CLASS_SINGLE,
+       RC_REG_CLASS_DOUBLE,
+       RC_REG_CLASS_TRIPLE,
+       RC_REG_CLASS_ALPHA,
+       RC_REG_CLASS_SINGLE_PLUS_ALPHA,
+       RC_REG_CLASS_DOUBLE_PLUS_ALPHA,
+       RC_REG_CLASS_TRIPLE_PLUS_ALPHA,
+       RC_REG_CLASS_X,
+       RC_REG_CLASS_Y,
+       RC_REG_CLASS_Z,
+       RC_REG_CLASS_XY,
+       RC_REG_CLASS_YZ,
+       RC_REG_CLASS_XZ,
+       RC_REG_CLASS_XW,
+       RC_REG_CLASS_YW,
+       RC_REG_CLASS_ZW,
+       RC_REG_CLASS_XYW,
+       RC_REG_CLASS_YZW,
+       RC_REG_CLASS_XZW,
+       RC_REG_CLASS_COUNT
+};
+
+struct rc_class {
+       enum rc_reg_class Class;
+
+       unsigned int WritemaskCount;
+
+       /** This is 1 if this class is being used by the register allocator
+        * and 0 otherwise */
+       unsigned int Used;
+
+       /** This is the ID number assigned to this class by ra. */
+       unsigned int Id;
+
+       /** List of writemasks that belong to this class */
+       unsigned int Writemasks[3];
+
 
-       struct hardware_register * HwTemporary;
-       unsigned int NumHwTemporaries;
 };
 
 static void print_live_intervals(struct live_intervals * src)
 {
-       if (!src) {
+       if (!src || !src->Used) {
                DBG("(null)");
                return;
        }
 
-       while(src) {
-               DBG("(%i,%i)", src->Start, src->End);
-               src = src->Next;
-       }
+       DBG("(%i,%i)", src->Start, src->End);
 }
 
-static void add_live_intervals(struct regalloc_state * s,
-               struct live_intervals ** dst, struct live_intervals * src)
+static int overlap_live_intervals(struct live_intervals * a, struct live_intervals * b)
 {
-       struct live_intervals ** dst_backup = dst;
-
        if (VERBOSE) {
-               DBG("add_live_intervals: ");
-               print_live_intervals(*dst);
+               DBG("overlap_live_intervals: ");
+               print_live_intervals(a);
                DBG(" to ");
-               print_live_intervals(src);
+               print_live_intervals(b);
                DBG("\n");
        }
 
-       while(src) {
-               if (*dst && (*dst)->End < src->Start) {
-                       dst = &(*dst)->Next;
-               } else if (!*dst || (*dst)->Start > src->End) {
-                       struct live_intervals * li = memory_pool_malloc(&s->C->Pool, sizeof(*li));
-                       li->Start = src->Start;
-                       li->End = src->End;
-                       li->Next = *dst;
-                       *dst = li;
-                       src = src->Next;
-               } else {
-                       if (src->End > (*dst)->End)
-                               (*dst)->End = src->End;
-                       if (src->Start < (*dst)->Start)
-                               (*dst)->Start = src->Start;
-                       src = src->Next;
-               }
-       }
-
-       if (VERBOSE) {
-               DBG("    result: ");
-               print_live_intervals(*dst_backup);
-               DBG("\n");
-       }
-}
-
-static int overlap_live_intervals(struct live_intervals * dst, struct live_intervals * src)
-{
-       if (VERBOSE) {
-               DBG("overlap_live_intervals: ");
-               print_live_intervals(dst);
-               DBG(" to ");
-               print_live_intervals(src);
-               DBG("\n");
+       if (!a->Used || !b->Used) {
+               DBG("    unused interval\n");
+               return 0;
        }
 
-       while(src && dst) {
-               if (dst->End <= src->Start) {
-                       dst = dst->Next;
-               } else if (dst->End <= src->End) {
+       if (a->Start > b->Start) {
+               if (a->Start < b->End) {
+                       DBG("    overlap\n");
+                       return 1;
+               }
+       } else if (b->Start > a->Start) {
+               if (b->Start < a->End) {
                        DBG("    overlap\n");
                        return 1;
-               } else if (dst->Start < src->End) {
+               }
+       } else { /* a->Start == b->Start */
+               if (a->Start != a->End && b->Start != b->End) {
                        DBG("    overlap\n");
                        return 1;
-               } else {
-                       src = src->Next;
                }
        }
 
@@ -148,60 +157,36 @@ static int overlap_live_intervals(struct live_intervals * dst, struct live_inter
        return 0;
 }
 
-static int try_add_live_intervals(struct regalloc_state * s,
-               struct live_intervals ** dst, struct live_intervals * src)
-{
-       if (overlap_live_intervals(*dst, src))
-               return 0;
-
-       add_live_intervals(s, dst, src);
-       return 1;
-}
-
-static void scan_callback(void * data, struct rc_instruction * inst,
-               rc_register_file file, unsigned int index, unsigned int chan)
+static void scan_read_callback(void * data, struct rc_instruction * inst,
+               rc_register_file file, unsigned int index, unsigned int mask)
 {
        struct regalloc_state * s = data;
        struct register_info * reg;
+       unsigned int i;
 
-       if (file == RC_FILE_TEMPORARY)
-               reg = &s->Temporary[index];
-       else if (file == RC_FILE_INPUT)
-               reg = &s->Input[index];
-       else
+       if (file != RC_FILE_INPUT)
                return;
 
-       if (!reg->Used) {
-               reg->Used = 1;
-               if (file == RC_FILE_INPUT)
-                       reg->Live.Start = -1;
-               else
-                       reg->Live.Start = inst->IP;
-               reg->Live.End = inst->IP;
-       } else {
-               if (inst->IP > reg->Live.End)
-                       reg->Live.End = inst->IP;
-       }
-}
-
-static void compute_live_intervals(struct regalloc_state * s)
-{
-       rc_recompute_ips(s->C);
+       s->Input[index].Used = 1;
+       reg = &s->Input[index];
 
-       for(struct rc_instruction * inst = s->C->Program.Instructions.Next;
-           inst != &s->C->Program.Instructions;
-           inst = inst->Next) {
-               rc_for_all_reads(inst, scan_callback, s);
-               rc_for_all_writes(inst, scan_callback, s);
+       for (i = 0; i < 4; i++) {
+               if (!((mask >> i) & 0x1)) {
+                       continue;
+               }
+               reg->Live[i].Used = 1;
+               reg->Live[i].Start = 0;
+               reg->Live[i].End = inst->IP;
        }
 }
 
-static void rewrite_register(struct regalloc_state * s,
+static void remap_register(void * data, struct rc_instruction * inst,
                rc_register_file * file, unsigned int * index)
 {
+       struct regalloc_state * s = data;
        const struct register_info * reg;
 
-       if (*file == RC_FILE_TEMPORARY)
+       if (*file == RC_FILE_TEMPORARY && s->Simple)
                reg = &s->Temporary[*index];
        else if (*file == RC_FILE_INPUT)
                reg = &s->Input[*index];
@@ -209,142 +194,473 @@ static void rewrite_register(struct regalloc_state * s,
                return;
 
        if (reg->Allocated) {
-               *file = reg->File;
                *index = reg->Index;
        }
 }
 
-static void rewrite_normal_instruction(struct regalloc_state * s, struct rc_sub_instruction * inst)
+static void alloc_input_simple(void * data, unsigned int input,
+                                                       unsigned int hwreg)
 {
-       const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->Opcode);
+       struct regalloc_state * s = data;
 
-       if (opcode->HasDstReg) {
-               rc_register_file file = inst->DstReg.File;
-               unsigned int index = inst->DstReg.Index;
+       if (input >= s->NumInputs)
+               return;
 
-               rewrite_register(s, &file, &index);
+       s->Input[input].Allocated = 1;
+       s->Input[input].File = RC_FILE_TEMPORARY;
+       s->Input[input].Index = hwreg;
+}
 
-               inst->DstReg.File = file;
-               inst->DstReg.Index = index;
+/* This function offsets the temporary register indices by the number
+ * of input registers, because input registers are actually temporaries and
+ * should not occupy the same space.
+ *
+ * This pass is supposed to be used to maintain correct allocation of inputs
+ * if the standard register allocation is disabled. */
+static void do_regalloc_inputs_only(struct regalloc_state * s)
+{
+       for (unsigned i = 0; i < s->NumTemporaries; i++) {
+               s->Temporary[i].Allocated = 1;
+               s->Temporary[i].File = RC_FILE_TEMPORARY;
+               s->Temporary[i].Index = i + s->NumInputs;
        }
+}
 
-       for(unsigned int src = 0; src < opcode->NumSrcRegs; ++src) {
-               rc_register_file file = inst->SrcReg[src].File;
-               unsigned int index = inst->SrcReg[src].Index;
+static unsigned int is_derivative(rc_opcode op)
+{
+       return (op == RC_OPCODE_DDX || op == RC_OPCODE_DDY);
+}
 
-               rewrite_register(s, &file, &index);
+static enum rc_reg_class variable_get_class(
+       struct rc_variable * variable,
+       struct rc_class * classes)
+{
+       unsigned int i;
+       unsigned int can_change_writemask= 1;
+       unsigned int writemask = rc_variable_writemask_sum(variable);
+       struct rc_list * readers = rc_variable_readers_union(variable);
+
+       if (!variable->C->is_r500) {
+               unsigned int mask_count = 0;
+               /* The assumption here is that if an instruction has type
+                * RC_INSTRUCTION_NORMAL then it is a TEX instruction.
+                * r300 and r400 can't swizzle the result of a TEX lookup. */
+               if (variable->Inst->Type == RC_INSTRUCTION_NORMAL) {
+                       writemask = RC_MASK_XYZW;
+               }
+               for (i = 0; i < 4; i++) {
+                       if (GET_BIT(writemask, i)) {
+                               mask_count++;
+                       }
+               }
+               /* XXX We should do swizzle packing for r300 and r400 here.
+                * We need to figure out how not to create non-native
+                * swizzles. */
+               if (mask_count > 1) {
+                       can_change_writemask = 0;
+               }
+       }
 
-               inst->SrcReg[src].File = file;
-               inst->SrcReg[src].Index = index;
+       if (variable->Inst->Type == RC_INSTRUCTION_PAIR) {
+               /* DDX/DDY seem to always fail when their writemasks are
+                * changed.*/
+               if (is_derivative(variable->Inst->U.P.RGB.Opcode)
+                   || is_derivative(variable->Inst->U.P.Alpha.Opcode)) {
+                       can_change_writemask = 0;
+               }
        }
+       for ( ; readers; readers = readers->Next) {
+               struct rc_reader * r = readers->Item;
+               if (r->Inst->Type == RC_INSTRUCTION_PAIR) {
+                       if (r->U.P.Arg->Source == RC_PAIR_PRESUB_SRC) {
+                               can_change_writemask = 0;
+                               break;
+                       }
+                       /* DDX/DDY also fail when their swizzles are changed. */
+                       if (is_derivative(r->Inst->U.P.RGB.Opcode)
+                           || is_derivative(r->Inst->U.P.Alpha.Opcode)) {
+                               can_change_writemask = 0;
+                               break;
+                       }
+               }
+       }
+       for (i = 0; i < RC_REG_CLASS_COUNT; i++) {
+               unsigned int j;
+               if (!can_change_writemask && classes[i].WritemaskCount > 1) {
+                       continue;
+               }
+               for (j = 0; j < 3; j++) {
+                       if (classes[i].Writemasks[j] == writemask) {
+                               return classes[i].Class;
+                       }
+               }
+       }
+       rc_error(variable->C, "Could not find class for index=%u mask=%u\n",
+                               variable->Dst.Index, writemask);
+       return 0;
 }
 
-static void rewrite_pair_instruction(struct regalloc_state * s, struct rc_pair_instruction * inst)
+static unsigned int overlap_live_intervals_array(
+       struct live_intervals * a,
+       struct live_intervals * b)
 {
-       if (inst->RGB.WriteMask) {
-               rc_register_file file = RC_FILE_TEMPORARY;
-               unsigned int index = inst->RGB.DestIndex;
-
-               rewrite_register(s, &file, &index);
-
-               inst->RGB.DestIndex = index;
+       unsigned int a_chan, b_chan;
+       for (a_chan = 0; a_chan < 4; a_chan++) {
+               for (b_chan = 0; b_chan < 4; b_chan++) {
+                       if (overlap_live_intervals(&a[a_chan], &b[b_chan])) {
+                                       return 1;
+                       }
+               }
        }
+       return 0;
+}
 
-       if (inst->Alpha.WriteMask) {
-               rc_register_file file = RC_FILE_TEMPORARY;
-               unsigned int index = inst->Alpha.DestIndex;
+static unsigned int reg_get_index(int reg)
+{
+       return reg / RC_MASK_XYZW;
+}
 
-               rewrite_register(s, &file, &index);
+static unsigned int reg_get_writemask(int reg)
+{
+       return (reg % RC_MASK_XYZW) + 1;
+}
 
-               inst->Alpha.DestIndex = index;
+static int get_reg_id(unsigned int index, unsigned int writemask)
+{
+       assert(writemask);
+       if (writemask == 0) {
+               return 0;
        }
+       return (index * RC_MASK_XYZW) + (writemask - 1);
+}
 
-       for(unsigned int src = 0; src < 3; ++src) {
-               if (inst->RGB.Src[src].Used) {
-                       rc_register_file file = inst->RGB.Src[src].File;
-                       unsigned int index = inst->RGB.Src[src].Index;
+#if VERBOSE
+static void print_reg(int reg)
+{
+       unsigned int index = reg_get_index(reg);
+       unsigned int mask = reg_get_writemask(reg);
+       fprintf(stderr, "Temp[%u].%c%c%c%c", index,
+               mask & RC_MASK_X ? 'x' : '_',
+               mask & RC_MASK_Y ? 'y' : '_',
+               mask & RC_MASK_Z ? 'z' : '_',
+               mask & RC_MASK_W ? 'w' : '_');
+}
+#endif
 
-                       rewrite_register(s, &file, &index);
+static void add_register_conflicts(
+       struct ra_regs * regs,
+       unsigned int max_temp_regs)
+{
+       unsigned int index, a_mask, b_mask;
+       for (index = 0; index < max_temp_regs; index++) {
+               for(a_mask = 1; a_mask <= RC_MASK_XYZW; a_mask++) {
+                       for (b_mask = a_mask + 1; b_mask <= RC_MASK_XYZW;
+                                                               b_mask++) {
+                               if (a_mask & b_mask) {
+                                       ra_add_reg_conflict(regs,
+                                               get_reg_id(index, a_mask),
+                                               get_reg_id(index, b_mask));
+                               }
+                       }
+               }
+       }
+}
 
-                       inst->RGB.Src[src].File = file;
-                       inst->RGB.Src[src].Index = index;
+static void do_advanced_regalloc(struct regalloc_state * s)
+{
+       struct rc_class rc_class_list [] = {
+               {RC_REG_CLASS_SINGLE, 3, 0, 0,
+                       {RC_MASK_X,
+                        RC_MASK_Y,
+                        RC_MASK_Z}},
+               {RC_REG_CLASS_DOUBLE, 3, 0, 0,
+                       {RC_MASK_X | RC_MASK_Y,
+                        RC_MASK_X | RC_MASK_Z,
+                        RC_MASK_Y | RC_MASK_Z}},
+               {RC_REG_CLASS_TRIPLE, 1, 0, 0,
+                       {RC_MASK_X | RC_MASK_Y | RC_MASK_Z,
+                        RC_MASK_NONE,
+                        RC_MASK_NONE}},
+               {RC_REG_CLASS_ALPHA, 1, 0, 0,
+                       {RC_MASK_W,
+                        RC_MASK_NONE,
+                        RC_MASK_NONE}},
+               {RC_REG_CLASS_SINGLE_PLUS_ALPHA, 3, 0, 0,
+                       {RC_MASK_X | RC_MASK_W,
+                        RC_MASK_Y | RC_MASK_W,
+                        RC_MASK_Z | RC_MASK_W}},
+               {RC_REG_CLASS_DOUBLE_PLUS_ALPHA, 3, 0, 0,
+                       {RC_MASK_X | RC_MASK_Y | RC_MASK_W,
+                        RC_MASK_X | RC_MASK_Z | RC_MASK_W,
+                        RC_MASK_Y | RC_MASK_Z | RC_MASK_W}},
+               {RC_REG_CLASS_TRIPLE_PLUS_ALPHA, 1, 0, 0,
+                       {RC_MASK_X | RC_MASK_Y | RC_MASK_Z | RC_MASK_W,
+                       RC_MASK_NONE,
+                       RC_MASK_NONE}},
+               {RC_REG_CLASS_X, 1, 0, 0,
+                       {RC_MASK_X,
+                       RC_MASK_NONE,
+                       RC_MASK_NONE}},
+               {RC_REG_CLASS_Y, 1, 0, 0,
+                       {RC_MASK_Y,
+                       RC_MASK_NONE,
+                       RC_MASK_NONE}},
+               {RC_REG_CLASS_Z, 1, 0, 0,
+                       {RC_MASK_Z,
+                       RC_MASK_NONE,
+                       RC_MASK_NONE}},
+               {RC_REG_CLASS_XY, 1, 0, 0,
+                       {RC_MASK_X | RC_MASK_Y,
+                       RC_MASK_NONE,
+                       RC_MASK_NONE}},
+               {RC_REG_CLASS_YZ, 1, 0, 0,
+                       {RC_MASK_Y | RC_MASK_Z,
+                       RC_MASK_NONE,
+                       RC_MASK_NONE}},
+               {RC_REG_CLASS_XZ, 1, 0, 0,
+                       {RC_MASK_X | RC_MASK_Z,
+                       RC_MASK_NONE,
+                       RC_MASK_NONE}},
+               {RC_REG_CLASS_XW, 1, 0, 0,
+                       {RC_MASK_X | RC_MASK_W,
+                       RC_MASK_NONE,
+                       RC_MASK_NONE}},
+               {RC_REG_CLASS_YW, 1, 0, 0,
+                       {RC_MASK_Y | RC_MASK_W,
+                       RC_MASK_NONE,
+                       RC_MASK_NONE}},
+               {RC_REG_CLASS_ZW, 1, 0, 0,
+                       {RC_MASK_Z | RC_MASK_W,
+                       RC_MASK_NONE,
+                       RC_MASK_NONE}},
+               {RC_REG_CLASS_XYW, 1, 0, 0,
+                       {RC_MASK_X | RC_MASK_Y | RC_MASK_W,
+                       RC_MASK_NONE,
+                       RC_MASK_NONE}},
+               {RC_REG_CLASS_YZW, 1, 0, 0,
+                       {RC_MASK_Y | RC_MASK_Z | RC_MASK_W,
+                       RC_MASK_NONE,
+                       RC_MASK_NONE}},
+               {RC_REG_CLASS_XZW, 1, 0, 0,
+                       {RC_MASK_X | RC_MASK_Z | RC_MASK_W,
+                       RC_MASK_NONE,
+                       RC_MASK_NONE}}
+       };
+
+       unsigned int i, j, index, input_node, node_count, node_index;
+       unsigned int * node_classes;
+       unsigned int * input_classes;
+       struct rc_instruction * inst;
+       struct rc_list * var_ptr;
+       struct rc_list * variables;
+       struct ra_regs * regs;
+       struct ra_graph * graph;
+
+       /* Allocate the main ra data structure */
+       regs = ra_alloc_reg_set(s->C->max_temp_regs * RC_MASK_XYZW);
+
+       /* Get list of program variables */
+       variables = rc_get_variables(s->C);
+       node_count = rc_list_count(variables);
+       node_classes = memory_pool_malloc(&s->C->Pool,
+                       node_count * sizeof(unsigned int));
+       input_classes = memory_pool_malloc(&s->C->Pool,
+                       s->NumInputs * sizeof(unsigned int));
+
+       for (var_ptr = variables, node_index = 0; var_ptr;
+                                       var_ptr = var_ptr->Next, node_index++) {
+               unsigned int class_index;
+               /* Compute the live intervals */
+               rc_variable_compute_live_intervals(var_ptr->Item);
+
+               class_index = variable_get_class(var_ptr->Item, rc_class_list);
+
+               /* If we haven't used this register class yet, mark it
+                * as used and allocate space for it. */
+               if (!rc_class_list[class_index].Used) {
+                       rc_class_list[class_index].Used = 1;
+                       rc_class_list[class_index].Id = ra_alloc_reg_class(regs);
                }
 
-               if (inst->Alpha.Src[src].Used) {
-                       rc_register_file file = inst->Alpha.Src[src].File;
-                       unsigned int index = inst->Alpha.Src[src].Index;
+               node_classes[node_index] = rc_class_list[class_index].Id;
+       }
 
-                       rewrite_register(s, &file, &index);
 
-                       inst->Alpha.Src[src].File = file;
-                       inst->Alpha.Src[src].Index = index;
+       /* Assign registers to the classes */
+       for (i = 0; i < RC_REG_CLASS_COUNT; i++) {
+               struct rc_class class = rc_class_list[i];
+               if (!class.Used) {
+                       continue;
+               }
+
+               for (index = 0; index < s->C->max_temp_regs; index++) {
+                       for (j = 0; j < class.WritemaskCount; j++) {
+                               int reg_id = get_reg_id(index,
+                                                       class.Writemasks[j]);
+                               ra_class_add_reg(regs, class.Id, reg_id);
+                       }
                }
        }
-}
 
-static void do_regalloc(struct regalloc_state * s)
-{
-       /* Simple and stupid greedy register allocation */
-       for(unsigned int index = 0; index < RC_REGISTER_MAX_INDEX; ++index) {
-               struct register_info * reg = &s->Temporary[index];
+       /* Add register conflicts */
+       add_register_conflicts(regs, s->C->max_temp_regs);
 
-               if (!reg->Used)
-                       continue;
+       /* Calculate live intervals for input registers */
+       for (inst = s->C->Program.Instructions.Next;
+                                       inst != &s->C->Program.Instructions;
+                                       inst = inst->Next) {
+               rc_for_all_reads_mask(inst, scan_read_callback, s);
+       }
 
-               for(unsigned int hwreg = 0; hwreg < s->NumHwTemporaries; ++hwreg) {
-                       if (try_add_live_intervals(s, &s->HwTemporary[hwreg].Used, &reg->Live)) {
-                               reg->Allocated = 1;
-                               reg->File = RC_FILE_TEMPORARY;
-                               reg->Index = hwreg;
-                               goto success;
+       /* Create classes for input registers */
+       for (i = 0; i < s->NumInputs; i++) {
+               unsigned int chan, class_id, writemask = 0;
+               for (chan = 0; chan < 4; chan++) {
+                       if (s->Input[i].Live[chan].Used) {
+                               writemask |= (1 << chan);
                        }
                }
+               s->Input[i].Writemask = writemask;
+               if (!writemask) {
+                       continue;
+               }
 
-               rc_error(s->C, "Ran out of hardware temporaries\n");
-               return;
-
-       success:;
+               class_id = ra_alloc_reg_class(regs);
+               input_classes[i] = class_id;
+               ra_class_add_reg(regs, class_id,
+                               get_reg_id(s->Input[i].Index, writemask));
        }
 
-       /* Rewrite all instructions based on the translation table we built */
-       for(struct rc_instruction * inst = s->C->Program.Instructions.Next;
-           inst != &s->C->Program.Instructions;
-           inst = inst->Next) {
-               if (inst->Type == RC_INSTRUCTION_NORMAL)
-                       rewrite_normal_instruction(s, &inst->U.I);
-               else
-                       rewrite_pair_instruction(s, &inst->U.P);
+       ra_set_finalize(regs);
+
+       graph = ra_alloc_interference_graph(regs, node_count + s->NumInputs);
+
+       /* Build the interference graph */
+       for (var_ptr = variables, node_index = 0; var_ptr;
+                                       var_ptr = var_ptr->Next,node_index++) {
+               struct rc_list * a, * b;
+               unsigned int b_index;
+
+               ra_set_node_class(graph, node_index, node_classes[node_index]);
+
+               for (a = var_ptr, b = var_ptr->Next, b_index = node_index + 1;
+                                               b; b = b->Next, b_index++) {
+                       struct rc_variable * var_a = a->Item;
+                       while (var_a) {
+                               struct rc_variable * var_b = b->Item;
+                               while (var_b) {
+                                       if (overlap_live_intervals_array(var_a->Live, var_b->Live)) {
+                                               ra_add_node_interference(graph,
+                                                       node_index, b_index);
+                                       }
+                                       var_b = var_b->Friend;
+                               }
+                               var_a = var_a->Friend;
+                       }
+               }
        }
-}
 
-static void alloc_input(void * data, unsigned int input, unsigned int hwreg)
-{
-       struct regalloc_state * s = data;
+       /* Add input registers to the interference graph */
+       for (i = 0, input_node = 0; i< s->NumInputs; i++) {
+               if (!s->Input[i].Writemask) {
+                       continue;
+               }
+               ra_set_node_class(graph, node_count + input_node,
+                                                       input_classes[i]);
+               for (var_ptr = variables, node_index = 0;
+                               var_ptr; var_ptr = var_ptr->Next, node_index++) {
+                       struct rc_variable * var = var_ptr->Item;
+                       if (overlap_live_intervals_array(s->Input[i].Live,
+                                                               var->Live)) {
+                               ra_add_node_interference(graph, node_index,
+                                               node_count + input_node);
+                       }
+               }
+               /* Manually allocate a register for this input */
+               ra_set_node_reg(graph, node_count + input_node, get_reg_id(
+                               s->Input[i].Index, s->Input[i].Writemask));
+               input_node++;
+       }
 
-       if (!s->Input[input].Used)
+       if (!ra_allocate_no_spills(graph)) {
+               rc_error(s->C, "Ran out of hardware temporaries\n");
                return;
+       }
 
-       add_live_intervals(s, &s->HwTemporary[hwreg].Used, &s->Input[input].Live);
+       /* Rewrite the registers */
+       for (var_ptr = variables, node_index = 0; var_ptr;
+                               var_ptr = var_ptr->Next, node_index++) {
+               int reg = ra_get_node_reg(graph, node_index);
+               unsigned int writemask = reg_get_writemask(reg);
+               unsigned int index = reg_get_index(reg);
+               struct rc_variable * var = var_ptr->Item;
 
-       s->Input[input].Allocated = 1;
-       s->Input[input].File = RC_FILE_TEMPORARY;
-       s->Input[input].Index = hwreg;
+               if (!s->C->is_r500 && var->Inst->Type == RC_INSTRUCTION_NORMAL) {
+                       writemask = rc_variable_writemask_sum(var);
+               }
 
+               if (var->Dst.File == RC_FILE_INPUT) {
+                       continue;
+               }
+               rc_variable_change_dst(var, index, writemask);
+       }
+
+       ralloc_free(graph);
+       ralloc_free(regs);
 }
 
-void rc_pair_regalloc(struct r300_fragment_program_compiler *c, unsigned maxtemps)
+/**
+ * @param user Nominally a pointer to an integer, but the code casts the
+ * pointer itself to int.  If that value is zero, a simple allocator is used that
+ * only allocates space for input registers (\sa do_regalloc_inputs_only).  If
+ * user is non-zero, then the regular register allocator will be used
+ * (\sa do_advanced_regalloc), unless the program contains a loop.
+  */
+void rc_pair_regalloc(struct radeon_compiler *cc, void *user)
 {
+       struct r300_fragment_program_compiler *c =
+                               (struct r300_fragment_program_compiler*)cc;
        struct regalloc_state s;
+       int do_full_regalloc = (int)user;
+       struct rc_instruction * inst;
 
        memset(&s, 0, sizeof(s));
-       s.C = &c->Base;
-       s.NumHwTemporaries = maxtemps;
-       s.HwTemporary = memory_pool_malloc(&s.C->Pool, maxtemps*sizeof(struct hardware_register));
-       memset(s.HwTemporary, 0, maxtemps*sizeof(struct hardware_register));
+       s.C = cc;
+       s.NumInputs = rc_get_max_index(cc, RC_FILE_INPUT) + 1;
+       s.Input = memory_pool_malloc(&cc->Pool,
+                       s.NumInputs * sizeof(struct register_info));
+       memset(s.Input, 0, s.NumInputs * sizeof(struct register_info));
+
+       s.NumTemporaries = rc_get_max_index(cc, RC_FILE_TEMPORARY) + 1;
+       s.Temporary = memory_pool_malloc(&cc->Pool,
+                       s.NumTemporaries * sizeof(struct register_info));
+       memset(s.Temporary, 0, s.NumTemporaries * sizeof(struct register_info));
+
+       for(inst = cc->Program.Instructions.Next;
+           inst != &cc->Program.Instructions;
+           inst = inst->Next) {
+
+               if (inst->U.I.Opcode == RC_OPCODE_BGNLOOP) {
+                       s.HasLoop = 1;
+                       break;
+               }
+       }
 
-       compute_live_intervals(&s);
+       rc_recompute_ips(s.C);
 
-       c->AllocateHwInputs(c, &alloc_input, &s);
+       c->AllocateHwInputs(c, &alloc_input_simple, &s);
+       if (!s.HasLoop && do_full_regalloc) {
+               do_advanced_regalloc(&s);
+       } else {
+               s.Simple = 1;
+               do_regalloc_inputs_only(&s);
+       }
 
-       do_regalloc(&s);
+       /* Rewrite inputs and if we are doing the simple allocation, rewrite
+        * temporaries too. */
+       for (struct rc_instruction *inst = s.C->Program.Instructions.Next;
+                                       inst != &s.C->Program.Instructions;
+                                       inst = inst->Next) {
+               rc_remap_registers(inst, &remap_register, &s);
+       }
 }