Merge branch 'llvm-cliptest-viewport'
[mesa.git] / src / mesa / drivers / dri / r300 / compiler / r500_fragprog_emit.c
index d694725c9bb9df5ef728b7f7f1b6e9ea73684f49..6f101c68eb68630f872a2d2b681b9cf0283b8693 100644 (file)
  *
  * \author Corbin Simpson <MostAwesomeDude@gmail.com>
  *
- * \todo Depth write, WPOS/FOGC inputs
- *
- * \todo FogOption
- *
  */
 
 #include "r500_fragprog.h"
 
 #include "radeon_program_pair.h"
 
+#define MAX_BRANCH_DEPTH_FULL 32
+#define MAX_BRANCH_DEPTH_PARTIAL 4
 
 #define PROG_CODE \
-       struct r300_fragment_program_compiler *c = (struct r300_fragment_program_compiler*)data; \
        struct r500_fragment_program_code *code = &c->code->code.r500
 
 #define error(fmt, args...) do {                       \
        } while(0)
 
 
-static GLuint translate_rgb_op(struct r300_fragment_program_compiler *c, GLuint opcode)
+struct branch_info {     /* instruction slots recorded for one IF/ELSE/ENDIF construct */
+       int If;           /* ip of the IF instruction */
+       int Else;         /* ip of the ELSE instruction; stays -1 when the branch has no ELSE */
+       int Endif;        /* ip of the ENDIF instruction; -1 until ENDIF is emitted */
+};
+
+struct r500_loop_info {  /* bookkeeping for one open BGNLOOP..ENDLOOP region */
+       int BgnLoop;      /* ip of the BGNLOOP instruction */
+
+       int BranchDepth;  /* CurrentBranchDepth at the time the loop was opened */
+       int * Brks;       /* ips of BRK instructions; their jump address is written at ENDLOOP */
+       int BrkCount;     /* number of entries stored in Brks */
+       int BrkReserved;  /* reserved-count argument handed to memory_pool_array_reserve for Brks */
+
+       int * Conts;      /* ips of CONT instructions; their jump address is written at ENDLOOP */
+       int ContCount;    /* number of entries stored in Conts */
+       int ContReserved; /* reserved-count argument handed to memory_pool_array_reserve for Conts */
+};
+
+struct emit_state {      /* state carried across emit_flowcontrol calls for one program */
+       struct radeon_compiler * C;                /* compiler; used for rc_error and its memory pool */
+       struct r500_fragment_program_code * Code;  /* hardware code being filled in */
+
+       struct branch_info * Branches;             /* open branches, innermost at [CurrentBranchDepth - 1] */
+       unsigned int CurrentBranchDepth;           /* number of IFs not yet closed by ENDIF */
+       unsigned int BranchesReserved;             /* reserved-count argument for memory_pool_array_reserve */
+
+       struct r500_loop_info * Loops;             /* open loops, innermost at [CurrentLoopDepth - 1] */
+       unsigned int CurrentLoopDepth;             /* number of BGNLOOPs not yet closed by ENDLOOP */
+       unsigned int LoopsReserved;                /* reserved-count argument; also tested (> 0) to detect that loops were used */
+
+       unsigned int MaxBranchDepth;               /* largest CurrentBranchDepth reached so far */
+
+};
+
+static unsigned int translate_rgb_op(struct r300_fragment_program_compiler *c, rc_opcode opcode)
 {
        switch(opcode) {
-       case OPCODE_CMP: return R500_ALU_RGBA_OP_CMP;
-       case OPCODE_DDX: return R500_ALU_RGBA_OP_MDH;
-       case OPCODE_DDY: return R500_ALU_RGBA_OP_MDV;
-       case OPCODE_DP3: return R500_ALU_RGBA_OP_DP3;
-       case OPCODE_DP4: return R500_ALU_RGBA_OP_DP4;
-       case OPCODE_FRC: return R500_ALU_RGBA_OP_FRC;
+       case RC_OPCODE_CMP: return R500_ALU_RGBA_OP_CMP;
+       case RC_OPCODE_DDX: return R500_ALU_RGBA_OP_MDH;
+       case RC_OPCODE_DDY: return R500_ALU_RGBA_OP_MDV;
+       case RC_OPCODE_DP3: return R500_ALU_RGBA_OP_DP3;
+       case RC_OPCODE_DP4: return R500_ALU_RGBA_OP_DP4;
+       case RC_OPCODE_FRC: return R500_ALU_RGBA_OP_FRC;
        default:
-               error("translate_rgb_op(%d): unknown opcode\n", opcode);
+               error("translate_rgb_op: unknown opcode %s\n", rc_get_opcode_info(opcode)->Name);
                /* fall through */
-       case OPCODE_NOP:
+       case RC_OPCODE_NOP:
                /* fall through */
-       case OPCODE_MAD: return R500_ALU_RGBA_OP_MAD;
-       case OPCODE_MAX: return R500_ALU_RGBA_OP_MAX;
-       case OPCODE_MIN: return R500_ALU_RGBA_OP_MIN;
-       case OPCODE_REPL_ALPHA: return R500_ALU_RGBA_OP_SOP;
+       case RC_OPCODE_MAD: return R500_ALU_RGBA_OP_MAD;
+       case RC_OPCODE_MAX: return R500_ALU_RGBA_OP_MAX;
+       case RC_OPCODE_MIN: return R500_ALU_RGBA_OP_MIN;
+       case RC_OPCODE_REPL_ALPHA: return R500_ALU_RGBA_OP_SOP;
        }
 }
 
-static GLuint translate_alpha_op(struct r300_fragment_program_compiler *c, GLuint opcode)
+static unsigned int translate_alpha_op(struct r300_fragment_program_compiler *c, rc_opcode opcode)
 {
        switch(opcode) {
-       case OPCODE_CMP: return R500_ALPHA_OP_CMP;
-       case OPCODE_COS: return R500_ALPHA_OP_COS;
-       case OPCODE_DDX: return R500_ALPHA_OP_MDH;
-       case OPCODE_DDY: return R500_ALPHA_OP_MDV;
-       case OPCODE_DP3: return R500_ALPHA_OP_DP;
-       case OPCODE_DP4: return R500_ALPHA_OP_DP;
-       case OPCODE_EX2: return R500_ALPHA_OP_EX2;
-       case OPCODE_FRC: return R500_ALPHA_OP_FRC;
-       case OPCODE_LG2: return R500_ALPHA_OP_LN2;
+       case RC_OPCODE_CMP: return R500_ALPHA_OP_CMP;
+       case RC_OPCODE_COS: return R500_ALPHA_OP_COS;
+       case RC_OPCODE_DDX: return R500_ALPHA_OP_MDH;
+       case RC_OPCODE_DDY: return R500_ALPHA_OP_MDV;
+       case RC_OPCODE_DP3: return R500_ALPHA_OP_DP;
+       case RC_OPCODE_DP4: return R500_ALPHA_OP_DP;
+       case RC_OPCODE_EX2: return R500_ALPHA_OP_EX2;
+       case RC_OPCODE_FRC: return R500_ALPHA_OP_FRC;
+       case RC_OPCODE_LG2: return R500_ALPHA_OP_LN2;
        default:
-               error("translate_alpha_op(%d): unknown opcode\n", opcode);
+               error("translate_alpha_op: unknown opcode %s\n", rc_get_opcode_info(opcode)->Name);
                /* fall through */
-       case OPCODE_NOP:
+       case RC_OPCODE_NOP:
                /* fall through */
-       case OPCODE_MAD: return R500_ALPHA_OP_MAD;
-       case OPCODE_MAX: return R500_ALPHA_OP_MAX;
-       case OPCODE_MIN: return R500_ALPHA_OP_MIN;
-       case OPCODE_RCP: return R500_ALPHA_OP_RCP;
-       case OPCODE_RSQ: return R500_ALPHA_OP_RSQ;
-       case OPCODE_SIN: return R500_ALPHA_OP_SIN;
+       case RC_OPCODE_MAD: return R500_ALPHA_OP_MAD;
+       case RC_OPCODE_MAX: return R500_ALPHA_OP_MAX;
+       case RC_OPCODE_MIN: return R500_ALPHA_OP_MIN;
+       case RC_OPCODE_RCP: return R500_ALPHA_OP_RCP;
+       case RC_OPCODE_RSQ: return R500_ALPHA_OP_RSQ;
+       case RC_OPCODE_SIN: return R500_ALPHA_OP_SIN;
        }
 }
 
-static GLuint fix_hw_swizzle(GLuint swz)
+static unsigned int fix_hw_swizzle(unsigned int swz)
 {
-       if (swz == 5) swz = 6;
-       if (swz == SWIZZLE_NIL) swz = 4;
+    switch (swz) {
+        case RC_SWIZZLE_ZERO:
+        case RC_SWIZZLE_UNUSED:
+            swz = 4;
+            break;
+        case RC_SWIZZLE_HALF:
+            swz = 5;
+            break;
+        case RC_SWIZZLE_ONE:
+            swz = 6;
+            break;
+    }
+
        return swz;
 }
 
-static GLuint translate_arg_rgb(struct radeon_pair_instruction *inst, int arg)
+static unsigned int translate_arg_rgb(struct rc_pair_instruction *inst, int arg)
 {
-       GLuint t = inst->RGB.Arg[arg].Source;
+       unsigned int t = inst->RGB.Arg[arg].Source;
        int comp;
        t |= inst->RGB.Arg[arg].Negate << 11;
        t |= inst->RGB.Arg[arg].Abs << 12;
@@ -127,57 +170,103 @@ static GLuint translate_arg_rgb(struct radeon_pair_instruction *inst, int arg)
        return t;
 }
 
-static GLuint translate_arg_alpha(struct radeon_pair_instruction *inst, int i)
+static unsigned int translate_arg_alpha(struct rc_pair_instruction *inst, int i)
 {
-       GLuint t = inst->Alpha.Arg[i].Source;
+       unsigned int t = inst->Alpha.Arg[i].Source;
        t |= fix_hw_swizzle(inst->Alpha.Arg[i].Swizzle) << 2;
        t |= inst->Alpha.Arg[i].Negate << 5;
        t |= inst->Alpha.Arg[i].Abs << 6;
        return t;
 }
 
-static void use_temporary(struct r500_fragment_program_code* code, GLuint index)
+static uint32_t translate_alu_result_op(struct r300_fragment_program_compiler * c, rc_compare_func func)
+{      /* Map a compare func to the R500_INST_ALU_RESULT_OP_* value that emit_paired ORs into inst0. */
+       switch(func) {
+       case RC_COMPARE_FUNC_EQUAL: return R500_INST_ALU_RESULT_OP_EQ;
+       case RC_COMPARE_FUNC_LESS: return R500_INST_ALU_RESULT_OP_LT;
+       case RC_COMPARE_FUNC_GEQUAL: return R500_INST_ALU_RESULT_OP_GE;
+       case RC_COMPARE_FUNC_NOTEQUAL: return R500_INST_ALU_RESULT_OP_NE;
+       default: /* only the four funcs above are handled; anything else is reported and yields 0 */
+               rc_error(&c->Base, "%s: unsupported compare func %i\n", __FUNCTION__, func);
+               return 0;
+       }
+}
+
+static void use_temporary(struct r500_fragment_program_code* code, unsigned int index)
 {
        if (index > code->max_temp_idx)
                code->max_temp_idx = index;
 }
 
-static GLuint use_source(struct r500_fragment_program_code* code, struct radeon_pair_instruction_source src)
+static unsigned int use_source(struct r500_fragment_program_code* code, struct rc_pair_instruction_source src)
 {
-       if (!src.Constant)
+       if (src.File == RC_FILE_CONSTANT) {
+               return src.Index | 0x100;
+       } else if (src.File == RC_FILE_TEMPORARY) {
                use_temporary(code, src.Index);
-       return src.Index | src.Constant << 8;
+               return src.Index;
+       }
+
+       return 0;
 }
 
+/**
+ * Set the NOP flag in inst0 of instruction number ip, unless that instruction has type TEX.
+ */
+static void alu_nop(struct r300_fragment_program_compiler *c, int ip)
+{
+       PROG_CODE; /* declares `code`, pointing at c->code->code.r500 */
+
+       if ((code->inst[ip].inst0 & 0x3) != R500_INST_TYPE_TEX) { /* NOTE(review): 0x3 presumably equals R500_INST_TYPE_MASK -- confirm */
+               code->inst[ip].inst0 |= R500_INST_NOP;
+       }
+}
 
 /**
  * Emit a paired ALU instruction.
  */
-static GLboolean emit_paired(void *data, struct radeon_pair_instruction *inst)
+static void emit_paired(struct r300_fragment_program_compiler *c, struct rc_pair_instruction *inst)
 {
+       int ip;
        PROG_CODE;
 
-       if (code->inst_end >= 511) {
+       if (code->inst_end >= c->Base.max_alu_insts-1) {
                error("emit_alu: Too many instructions");
-               return GL_FALSE;
+               return;
        }
 
-       int ip = ++code->inst_end;
+       ip = ++code->inst_end;
+
+       /* Quirk: MDH/MDV (DDX/DDY) need a NOP on previous non-TEX instructions. */
+       if (inst->RGB.Opcode == RC_OPCODE_DDX || inst->Alpha.Opcode == RC_OPCODE_DDX ||
+               inst->RGB.Opcode == RC_OPCODE_DDY || inst->Alpha.Opcode == RC_OPCODE_DDY) {
+               if (ip > 0) {
+                       alu_nop(c, ip - 1);
+               }
+       }
 
        code->inst[ip].inst5 = translate_rgb_op(c, inst->RGB.Opcode);
        code->inst[ip].inst4 = translate_alpha_op(c, inst->Alpha.Opcode);
 
-       if (inst->RGB.OutputWriteMask || inst->Alpha.OutputWriteMask || inst->Alpha.DepthWriteMask)
+       if (inst->RGB.OutputWriteMask || inst->Alpha.OutputWriteMask || inst->Alpha.DepthWriteMask) {
                code->inst[ip].inst0 = R500_INST_TYPE_OUT;
-       else
+               if (inst->WriteALUResult) {
+                       error("Cannot write output and ALU result at the same time");
+                       return;
+               }
+       } else {
                code->inst[ip].inst0 = R500_INST_TYPE_ALU;
+       }
        code->inst[ip].inst0 |= R500_INST_TEX_SEM_WAIT;
 
        code->inst[ip].inst0 |= (inst->RGB.WriteMask << 11) | (inst->Alpha.WriteMask << 14);
        code->inst[ip].inst0 |= (inst->RGB.OutputWriteMask << 15) | (inst->Alpha.OutputWriteMask << 18);
+       if (inst->Nop) {
+               code->inst[ip].inst0 |= R500_INST_NOP;
+       }
        if (inst->Alpha.DepthWriteMask) {
                code->inst[ip].inst4 |= R500_ALPHA_W_OMASK;
-               c->code->writes_depth = GL_TRUE;
+               c->code->writes_depth = 1;
        }
 
        code->inst[ip].inst4 |= R500_ALPHA_ADDRD(inst->Alpha.DestIndex);
@@ -190,6 +279,40 @@ static GLboolean emit_paired(void *data, struct radeon_pair_instruction *inst)
        if (inst->Alpha.Saturate)
                code->inst[ip].inst0 |= R500_INST_ALPHA_CLAMP;
 
+       /* Set the presubtract operation. */
+       switch(inst->RGB.Src[RC_PAIR_PRESUB_SRC].Index) {
+               case RC_PRESUB_BIAS:
+                       code->inst[ip].inst1 |= R500_RGB_SRCP_OP_1_MINUS_2RGB0;
+                       break;
+               case RC_PRESUB_SUB:
+                       code->inst[ip].inst1 |= R500_RGB_SRCP_OP_RGB1_MINUS_RGB0;
+                       break;
+               case RC_PRESUB_ADD:
+                       code->inst[ip].inst1 |= R500_RGB_SRCP_OP_RGB1_PLUS_RGB0;
+                       break;
+               case RC_PRESUB_INV:
+                       code->inst[ip].inst1 |= R500_RGB_SRCP_OP_1_MINUS_RGB0;
+                       break;
+               default:
+                       break;
+       }
+       switch(inst->Alpha.Src[RC_PAIR_PRESUB_SRC].Index) {
+               case RC_PRESUB_BIAS:
+                       code->inst[ip].inst2 |= R500_ALPHA_SRCP_OP_1_MINUS_2A0;
+                       break;
+               case RC_PRESUB_SUB:
+                       code->inst[ip].inst2 |= R500_ALPHA_SRCP_OP_A1_MINUS_A0;
+                       break;
+               case RC_PRESUB_ADD:
+                       code->inst[ip].inst2 |= R500_ALPHA_SRCP_OP_A1_PLUS_A0;
+                       break;
+               case RC_PRESUB_INV:
+                       code->inst[ip].inst2 |= R500_ALPHA_SRCP_OP_1_MINUS_A0;
+                       break;
+               default:
+                       break;
+       }
+
        code->inst[ip].inst1 |= R500_RGB_ADDR0(use_source(code, inst->RGB.Src[0]));
        code->inst[ip].inst1 |= R500_RGB_ADDR1(use_source(code, inst->RGB.Src[1]));
        code->inst[ip].inst1 |= R500_RGB_ADDR2(use_source(code, inst->RGB.Src[2]));
@@ -206,12 +329,24 @@ static GLboolean emit_paired(void *data, struct radeon_pair_instruction *inst)
        code->inst[ip].inst4 |= translate_arg_alpha(inst, 1) << R500_ALPHA_SEL_B_SHIFT;
        code->inst[ip].inst5 |= translate_arg_alpha(inst, 2) << R500_ALU_RGBA_ALPHA_SEL_C_SHIFT;
 
-       return GL_TRUE;
+       code->inst[ip].inst3 |= R500_ALU_RGB_TARGET(inst->RGB.Target);
+       code->inst[ip].inst4 |= R500_ALPHA_TARGET(inst->Alpha.Target);
+
+       if (inst->WriteALUResult) {
+               code->inst[ip].inst3 |= R500_ALU_RGB_WMASK;
+
+               if (inst->WriteALUResult == RC_ALURESULT_X)
+                       code->inst[ip].inst0 |= R500_INST_ALU_RESULT_SEL_RED;
+               else
+                       code->inst[ip].inst0 |= R500_INST_ALU_RESULT_SEL_ALPHA;
+
+               code->inst[ip].inst0 |= translate_alu_result_op(c, inst->ALUResultCompare);
+       }
 }
 
-static GLuint translate_strq_swizzle(GLuint swizzle)
+static unsigned int translate_strq_swizzle(unsigned int swizzle)
 {
-       GLuint swiz = 0;
+       unsigned int swiz = 0;
        int i;
        for (i = 0; i < 4; i++)
                swiz |= (GET_SWZ(swizzle, i) & 0x3) << i*2;
@@ -221,79 +356,296 @@ static GLuint translate_strq_swizzle(GLuint swizzle)
 /**
  * Emit a single TEX instruction
  */
-static GLboolean emit_tex(void *data, struct radeon_pair_texture_instruction *inst)
+static int emit_tex(struct r300_fragment_program_compiler *c, struct rc_sub_instruction *inst)
 {
+       int ip;
        PROG_CODE;
 
-       if (code->inst_end >= 511) {
+       if (code->inst_end >= c->Base.max_alu_insts-1) {
                error("emit_tex: Too many instructions");
-               return GL_FALSE;
+               return 0;
        }
 
-       int ip = ++code->inst_end;
+       ip = ++code->inst_end;
 
        code->inst[ip].inst0 = R500_INST_TYPE_TEX
-               | (inst->WriteMask << 11)
+               | (inst->DstReg.WriteMask << 11)
                | R500_INST_TEX_SEM_WAIT;
        code->inst[ip].inst1 = R500_TEX_ID(inst->TexSrcUnit)
                | R500_TEX_SEM_ACQUIRE | R500_TEX_IGNORE_UNCOVERED;
 
-       if (inst->TexSrcTarget == TEXTURE_RECT_INDEX)
-               code->inst[ip].inst1 |= R500_TEX_UNSCALED;
+       if (inst->TexSrcTarget == RC_TEXTURE_RECT)
+               code->inst[ip].inst1 |= R500_TEX_UNSCALED;
 
        switch (inst->Opcode) {
-       case RADEON_OPCODE_KIL:
+       case RC_OPCODE_KIL:
                code->inst[ip].inst1 |= R500_TEX_INST_TEXKILL;
                break;
-       case RADEON_OPCODE_TEX:
+       case RC_OPCODE_TEX:
                code->inst[ip].inst1 |= R500_TEX_INST_LD;
                break;
-       case RADEON_OPCODE_TXB:
+       case RC_OPCODE_TXB:
                code->inst[ip].inst1 |= R500_TEX_INST_LODBIAS;
                break;
-       case RADEON_OPCODE_TXP:
+       case RC_OPCODE_TXP:
                code->inst[ip].inst1 |= R500_TEX_INST_PROJ;
                break;
        default:
-               error("emit_tex can't handle opcode %x\n", inst->Opcode);
+               error("emit_tex can't handle opcode %s\n", rc_get_opcode_info(inst->Opcode)->Name);
        }
 
-       code->inst[ip].inst2 = R500_TEX_SRC_ADDR(inst->SrcIndex)
-               | (translate_strq_swizzle(inst->SrcSwizzle) << 8)
-               | R500_TEX_DST_ADDR(inst->DestIndex)
+       use_temporary(code, inst->SrcReg[0].Index);
+       if (inst->Opcode != RC_OPCODE_KIL)
+               use_temporary(code, inst->DstReg.Index);
+
+       code->inst[ip].inst2 = R500_TEX_SRC_ADDR(inst->SrcReg[0].Index)
+               | (translate_strq_swizzle(inst->SrcReg[0].Swizzle) << 8)
+               | R500_TEX_DST_ADDR(inst->DstReg.Index)
                | R500_TEX_DST_R_SWIZ_R | R500_TEX_DST_G_SWIZ_G
                | R500_TEX_DST_B_SWIZ_B | R500_TEX_DST_A_SWIZ_A;
 
-       return GL_TRUE;
+       return 1;
 }
 
-static const struct radeon_pair_handler pair_handler = {
-       .EmitPaired = emit_paired,
-       .EmitTex = emit_tex,
-       .MaxHwTemps = 128
-};
+static void emit_flowcontrol(struct emit_state * s, struct rc_instruction * inst)
+{      /* Emit one flow-control instruction for inst; IF/ELSE words are written at ENDIF, loop jump addresses at ENDLOOP. */
+       unsigned int newip;
 
-void r500BuildFragmentProgramHwCode(struct r300_fragment_program_compiler *compiler)
+       if (s->Code->inst_end >= s->C->max_alu_insts-1) {
+               rc_error(s->C, "emit_flowcontrol: Too many instructions");
+               return;
+       }
+
+       newip = ++s->Code->inst_end;
+
+       /* Currently all loops use the same integer constant to initialize
+        * the loop variables. */
+       if(!s->Code->int_constants[0]) { /* only set up the shared constant once */
+               s->Code->int_constants[0] = R500_FC_INT_CONST_KR(0xff);
+               s->Code->int_constant_count = 1;
+       }
+       s->Code->inst[newip].inst0 = R500_INST_TYPE_FC | R500_INST_ALU_WAIT;
+
+       switch(inst->U.I.Opcode){
+       struct branch_info * branch;
+       struct r500_loop_info * loop;
+       case RC_OPCODE_BGNLOOP:
+               memory_pool_array_reserve(&s->C->Pool, struct r500_loop_info,
+                       s->Loops, s->CurrentLoopDepth, s->LoopsReserved, 1);
+
+               loop = &s->Loops[s->CurrentLoopDepth++];
+               memset(loop, 0, sizeof(struct r500_loop_info));
+               loop->BranchDepth = s->CurrentBranchDepth;
+               loop->BgnLoop = newip;
+
+               s->Code->inst[newip].inst2 = R500_FC_OP_LOOP
+                       | R500_FC_JUMP_FUNC(0x00)
+                       | R500_FC_IGNORE_UNCOVERED
+                       ;
+               break;
+       case RC_OPCODE_BRK:
+               loop = &s->Loops[s->CurrentLoopDepth - 1]; /* NOTE(review): unlike ELSE/ENDIF, nothing checks that a loop is open */
+               memory_pool_array_reserve(&s->C->Pool, int, loop->Brks,
+                                       loop->BrkCount, loop->BrkReserved, 1);
+
+               loop->Brks[loop->BrkCount++] = newip; /* jump address is filled in at ENDLOOP */
+               s->Code->inst[newip].inst2 = R500_FC_OP_BREAKLOOP
+                       | R500_FC_JUMP_FUNC(0xff)
+                       | R500_FC_B_OP1_DECR
+                       | R500_FC_B_POP_CNT(
+                               s->CurrentBranchDepth - loop->BranchDepth) /* branches opened since BGNLOOP */
+                       | R500_FC_IGNORE_UNCOVERED
+                       ;
+               break;
+
+       case RC_OPCODE_CONT:
+               loop = &s->Loops[s->CurrentLoopDepth - 1]; /* NOTE(review): no open-loop check here either */
+               memory_pool_array_reserve(&s->C->Pool, int, loop->Conts,
+                                       loop->ContCount, loop->ContReserved, 1);
+               loop->Conts[loop->ContCount++] = newip; /* jump address is filled in at ENDLOOP */
+               s->Code->inst[newip].inst2 = R500_FC_OP_CONTINUE
+                       | R500_FC_JUMP_FUNC(0xff)
+                       | R500_FC_B_OP1_DECR
+                       | R500_FC_B_POP_CNT(
+                               s->CurrentBranchDepth - loop->BranchDepth) /* branches opened since BGNLOOP */
+                       | R500_FC_IGNORE_UNCOVERED
+                       ;
+               break;
+
+       case RC_OPCODE_ENDLOOP:
+       {
+               loop = &s->Loops[s->CurrentLoopDepth - 1]; /* NOTE(review): no open-loop check here either */
+               /* Emit ENDLOOP */
+               s->Code->inst[newip].inst2 = R500_FC_OP_ENDLOOP
+                       | R500_FC_JUMP_FUNC(0xff)
+                       | R500_FC_JUMP_ANY
+                       | R500_FC_IGNORE_UNCOVERED
+                       ;
+               /* The constant integer at index 0 is used by all loops. */
+               s->Code->inst[newip].inst3 = R500_FC_INT_ADDR(0)
+                       | R500_FC_JUMP_ADDR(loop->BgnLoop + 1)
+                       ;
+
+               /* Set jump address and int constant for BGNLOOP */
+               s->Code->inst[loop->BgnLoop].inst3 = R500_FC_INT_ADDR(0)
+                       | R500_FC_JUMP_ADDR(newip)
+                       ;
+
+               /* Set jump address for the BRK instructions. */
+               while(loop->BrkCount--) {
+                       s->Code->inst[loop->Brks[loop->BrkCount]].inst3 =
+                                               R500_FC_JUMP_ADDR(newip + 1);
+               }
+
+               /* Set jump address for CONT instructions. */
+               while(loop->ContCount--) {
+                       s->Code->inst[loop->Conts[loop->ContCount]].inst3 =
+                                               R500_FC_JUMP_ADDR(newip);
+               }
+               s->CurrentLoopDepth--;
+               break;
+       }
+       case RC_OPCODE_IF:
+               if ( s->CurrentBranchDepth >= MAX_BRANCH_DEPTH_FULL) {
+                       rc_error(s->C, "Branch depth exceeds hardware limit");
+                       return;
+               }
+               memory_pool_array_reserve(&s->C->Pool, struct branch_info,
+                               s->Branches, s->CurrentBranchDepth, s->BranchesReserved, 1);
+
+               branch = &s->Branches[s->CurrentBranchDepth++];
+               branch->If = newip;
+               branch->Else = -1;
+               branch->Endif = -1;
+
+               if (s->CurrentBranchDepth > s->MaxBranchDepth)
+                       s->MaxBranchDepth = s->CurrentBranchDepth;
+
+               /* actual instruction is filled in at ENDIF time */
+               break;
+
+       case RC_OPCODE_ELSE:
+               if (!s->CurrentBranchDepth) {
+                       rc_error(s->C, "%s: got ELSE outside a branch", __FUNCTION__);
+                       return;
+               }
+
+               branch = &s->Branches[s->CurrentBranchDepth - 1];
+               branch->Else = newip;
+
+               /* actual instruction is filled in at ENDIF time */
+               break;
+
+       case RC_OPCODE_ENDIF:
+               if (!s->CurrentBranchDepth) {
+                       rc_error(s->C, "%s: got ENDIF outside a branch", __FUNCTION__);
+                       return;
+               }
+
+               branch = &s->Branches[s->CurrentBranchDepth - 1];
+               branch->Endif = newip;
+
+               s->Code->inst[branch->Endif].inst2 = R500_FC_OP_JUMP
+                       | R500_FC_A_OP_NONE /* no address stack */
+                       | R500_FC_JUMP_ANY /* docs say set this, but I don't understand why */
+                       | R500_FC_B_OP0_DECR /* decrement branch counter if stay */
+                       | R500_FC_B_OP1_NONE /* no branch counter op if jump */
+                       | R500_FC_B_POP_CNT(1)
+                       ;
+               s->Code->inst[branch->Endif].inst3 = R500_FC_JUMP_ADDR(branch->Endif + 1);
+               s->Code->inst[branch->If].inst2 = R500_FC_OP_JUMP
+                       | R500_FC_A_OP_NONE /* no address stack */
+                       | R500_FC_JUMP_FUNC(0x0f) /* jump if ALU result is false */
+                       | R500_FC_B_OP0_INCR /* increment branch counter if stay */
+                       | R500_FC_IGNORE_UNCOVERED
+               ;
+
+               if (branch->Else >= 0) {
+                       /* increment branch counter also if jump */
+                       s->Code->inst[branch->If].inst2 |= R500_FC_B_OP1_INCR;
+                       s->Code->inst[branch->If].inst3 = R500_FC_JUMP_ADDR(branch->Else + 1);
+
+                       s->Code->inst[branch->Else].inst2 = R500_FC_OP_JUMP
+                               | R500_FC_A_OP_NONE /* no address stack */
+                               | R500_FC_B_ELSE /* all active pixels want to jump */
+                               | R500_FC_B_OP0_NONE /* no counter op if stay */
+                               | R500_FC_B_OP1_DECR /* decrement branch counter if jump */
+                               | R500_FC_B_POP_CNT(1)
+                       ;
+                       s->Code->inst[branch->Else].inst3 = R500_FC_JUMP_ADDR(branch->Endif + 1);
+               } else {
+                       /* don't touch branch counter on jump */
+                       s->Code->inst[branch->If].inst2 |= R500_FC_B_OP1_NONE;
+                       s->Code->inst[branch->If].inst3 = R500_FC_JUMP_ADDR(branch->Endif + 1);
+               }
+
+
+               s->CurrentBranchDepth--;
+               break;
+       default:
+               rc_error(s->C, "%s: unknown opcode %s\n", __FUNCTION__, rc_get_opcode_info(inst->U.I.Opcode)->Name);
+       }
+}
+
+void r500BuildFragmentProgramHwCode(struct radeon_compiler *c, void *user)
 {
+       struct r300_fragment_program_compiler *compiler = (struct r300_fragment_program_compiler*)c;
+       struct emit_state s;
        struct r500_fragment_program_code *code = &compiler->code->code.r500;
 
-       _mesa_bzero(code, sizeof(*code));
+       memset(&s, 0, sizeof(s));
+       s.C = &compiler->Base;
+       s.Code = code;
+
+       memset(code, 0, sizeof(*code));
        code->max_temp_idx = 1;
        code->inst_end = -1;
 
-       radeonPairProgram(compiler, &pair_handler, compiler);
+       for(struct rc_instruction * inst = compiler->Base.Program.Instructions.Next;
+           inst != &compiler->Base.Program.Instructions && !compiler->Base.Error;
+           inst = inst->Next) {
+               if (inst->Type == RC_INSTRUCTION_NORMAL) {
+                       const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
+
+                       if (opcode->IsFlowControl) {
+                               emit_flowcontrol(&s, inst);
+                       } else if (inst->U.I.Opcode == RC_OPCODE_BEGIN_TEX) {
+                               continue;
+                       } else {
+                               emit_tex(compiler, &inst->U.I);
+                       }
+               } else {
+                       emit_paired(compiler, &inst->U.P);
+               }
+       }
+
+       if (code->max_temp_idx >= compiler->Base.max_temp_regs)
+               rc_error(&compiler->Base, "Too many hardware temporaries used");
+
        if (compiler->Base.Error)
                return;
 
-       if ((code->inst[code->inst_end].inst0 & R500_INST_TYPE_MASK) != R500_INST_TYPE_OUT) {
+       if (code->inst_end == -1 ||
+           (code->inst[code->inst_end].inst0 & R500_INST_TYPE_MASK) != R500_INST_TYPE_OUT) {
+               int ip;
+
                /* This may happen when dead-code elimination is disabled or
                 * when most of the fragment program logic is leading to a KIL */
-               if (code->inst_end >= 511) {
+               if (code->inst_end >= compiler->Base.max_alu_insts-1) {
                        rc_error(&compiler->Base, "Introducing fake OUT: Too many instructions");
                        return;
                }
 
-               int ip = ++code->inst_end;
+               ip = ++code->inst_end;
                code->inst[ip].inst0 = R500_INST_TYPE_OUT | R500_INST_TEX_SEM_WAIT;
        }
+
+       /* Enable full flow control mode if we are using loops or have if
+        * statements nested at least four deep. */
+       if (s.MaxBranchDepth >= 4 || s.LoopsReserved > 0) {
+               if (code->max_temp_idx < 1)
+                       code->max_temp_idx = 1;
+
+               code->us_fc_ctrl |= R500_FC_FULL_FC_EN;
+       }
 }