r300g: implement TRUNC correctly
[mesa.git] / src / gallium / drivers / r300 / compiler / radeon_program_alu.c
index 9fc991166a3aa712bb1f063729c52613174ed37e..f4ee86de5d0424d74999d27eb358051f4cb35605 100644 (file)
 
 static struct rc_instruction *emit1(
        struct radeon_compiler * c, struct rc_instruction * after,
-       rc_opcode Opcode, rc_saturate_mode Saturate, struct rc_dst_register DstReg,
-       struct rc_src_register SrcReg)
+       rc_opcode Opcode, struct rc_sub_instruction * base,
+       struct rc_dst_register DstReg, struct rc_src_register SrcReg)
 {
        struct rc_instruction *fpi = rc_insert_new_instruction(c, after);
 
+       if (base) {
+               memcpy(&fpi->U.I, base, sizeof(struct rc_sub_instruction));
+       }
+
        fpi->U.I.Opcode = Opcode;
-       fpi->U.I.SaturateMode = Saturate;
        fpi->U.I.DstReg = DstReg;
        fpi->U.I.SrcReg[0] = SrcReg;
        return fpi;
@@ -55,13 +58,17 @@ static struct rc_instruction *emit1(
 
 static struct rc_instruction *emit2(
        struct radeon_compiler * c, struct rc_instruction * after,
-       rc_opcode Opcode, rc_saturate_mode Saturate, struct rc_dst_register DstReg,
+       rc_opcode Opcode, struct rc_sub_instruction * base,
+       struct rc_dst_register DstReg,
        struct rc_src_register SrcReg0, struct rc_src_register SrcReg1)
 {
        struct rc_instruction *fpi = rc_insert_new_instruction(c, after);
 
+       if (base) {
+               memcpy(&fpi->U.I, base, sizeof(struct rc_sub_instruction));
+       }
+
        fpi->U.I.Opcode = Opcode;
-       fpi->U.I.SaturateMode = Saturate;
        fpi->U.I.DstReg = DstReg;
        fpi->U.I.SrcReg[0] = SrcReg0;
        fpi->U.I.SrcReg[1] = SrcReg1;
@@ -70,14 +77,18 @@ static struct rc_instruction *emit2(
 
 static struct rc_instruction *emit3(
        struct radeon_compiler * c, struct rc_instruction * after,
-       rc_opcode Opcode, rc_saturate_mode Saturate, struct rc_dst_register DstReg,
+       rc_opcode Opcode, struct rc_sub_instruction * base,
+       struct rc_dst_register DstReg,
        struct rc_src_register SrcReg0, struct rc_src_register SrcReg1,
        struct rc_src_register SrcReg2)
 {
        struct rc_instruction *fpi = rc_insert_new_instruction(c, after);
 
+       if (base) {
+               memcpy(&fpi->U.I, base, sizeof(struct rc_sub_instruction));
+       }
+
        fpi->U.I.Opcode = Opcode;
-       fpi->U.I.SaturateMode = Saturate;
        fpi->U.I.DstReg = DstReg;
        fpi->U.I.SrcReg[0] = SrcReg0;
        fpi->U.I.SrcReg[1] = SrcReg1;
@@ -87,7 +98,7 @@ static struct rc_instruction *emit3(
 
 static struct rc_dst_register dstregtmpmask(int index, int mask)
 {
-       struct rc_dst_register dst = {0};
+       struct rc_dst_register dst = {0, 0, 0};
        dst.File = RC_FILE_TEMPORARY;
        dst.Index = index;
        dst.WriteMask = mask;
@@ -104,6 +115,13 @@ static const struct rc_src_register builtin_one = {
        .Index = 0,
        .Swizzle = RC_SWIZZLE_1111
 };
+
+static const struct rc_src_register builtin_half = {
+       .File = RC_FILE_NONE,
+       .Index = 0,
+       .Swizzle = RC_SWIZZLE_HHHH
+};
+
 static const struct rc_src_register srcreg_undefined = {
        .File = RC_FILE_NONE,
        .Index = 0,
@@ -214,7 +232,7 @@ static void transform_ABS(struct radeon_compiler* c,
        struct rc_src_register src = inst->U.I.SrcReg[0];
        src.Abs = 1;
        src.Negate = RC_MASK_NONE;
-       emit1(c, inst->Prev, RC_OPCODE_MOV, inst->U.I.SaturateMode, inst->U.I.DstReg, src);
+       emit1(c, inst->Prev, RC_OPCODE_MOV, &inst->U.I, inst->U.I.DstReg, src);
        rc_remove_instruction(inst);
 }
 
@@ -233,7 +251,7 @@ static void transform_CEIL(struct radeon_compiler* c,
 
        struct rc_dst_register dst = try_to_reuse_dst(c, inst);
        emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dst, negate(inst->U.I.SrcReg[0]));
-       emit2(c, inst->Prev, RC_OPCODE_ADD, inst->U.I.SaturateMode, inst->U.I.DstReg,
+       emit2(c, inst->Prev, RC_OPCODE_ADD, &inst->U.I, inst->U.I.DstReg,
                inst->U.I.SrcReg[0], srcreg(RC_FILE_TEMPORARY, dst.Index));
        rc_remove_instruction(inst);
 }
@@ -249,7 +267,7 @@ static void transform_CLAMP(struct radeon_compiler *c,
        struct rc_dst_register dst = try_to_reuse_dst(c, inst);
        emit2(c, inst->Prev, RC_OPCODE_MIN, 0, dst,
                inst->U.I.SrcReg[0], inst->U.I.SrcReg[2]);
-       emit2(c, inst->Prev, RC_OPCODE_MAX, inst->U.I.SaturateMode, inst->U.I.DstReg,
+       emit2(c, inst->Prev, RC_OPCODE_MAX, &inst->U.I, inst->U.I.DstReg,
                srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[1]);
        rc_remove_instruction(inst);
 }
@@ -265,7 +283,7 @@ static void transform_DP2(struct radeon_compiler* c,
        src1.Negate &= ~(RC_MASK_Z | RC_MASK_W);
        src1.Swizzle &= ~(63 << (3 * 2));
        src1.Swizzle |= (RC_SWIZZLE_ZERO << (3 * 2)) | (RC_SWIZZLE_ZERO << (3 * 3));
-       emit2(c, inst->Prev, RC_OPCODE_DP3, inst->U.I.SaturateMode, inst->U.I.DstReg, src0, src1);
+       emit2(c, inst->Prev, RC_OPCODE_DP3, &inst->U.I, inst->U.I.DstReg, src0, src1);
        rc_remove_instruction(inst);
 }
 
@@ -276,7 +294,7 @@ static void transform_DPH(struct radeon_compiler* c,
        src0.Negate &= ~RC_MASK_W;
        src0.Swizzle &= ~(7 << (3 * 3));
        src0.Swizzle |= RC_SWIZZLE_ONE << (3 * 3);
-       emit2(c, inst->Prev, RC_OPCODE_DP4, inst->U.I.SaturateMode, inst->U.I.DstReg, src0, inst->U.I.SrcReg[1]);
+       emit2(c, inst->Prev, RC_OPCODE_DP4, &inst->U.I, inst->U.I.DstReg, src0, inst->U.I.SrcReg[1]);
        rc_remove_instruction(inst);
 }
 
@@ -287,7 +305,7 @@ static void transform_DPH(struct radeon_compiler* c,
 static void transform_DST(struct radeon_compiler* c,
        struct rc_instruction* inst)
 {
-       emit2(c, inst->Prev, RC_OPCODE_MUL, inst->U.I.SaturateMode, inst->U.I.DstReg,
+       emit2(c, inst->Prev, RC_OPCODE_MUL, &inst->U.I, inst->U.I.DstReg,
                swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_ONE, RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_ONE),
                swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_ONE, RC_SWIZZLE_Y, RC_SWIZZLE_ONE, RC_SWIZZLE_W));
        rc_remove_instruction(inst);
@@ -298,11 +316,29 @@ static void transform_FLR(struct radeon_compiler* c,
 {
        struct rc_dst_register dst = try_to_reuse_dst(c, inst);
        emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dst, inst->U.I.SrcReg[0]);
-       emit2(c, inst->Prev, RC_OPCODE_ADD, inst->U.I.SaturateMode, inst->U.I.DstReg,
+       emit2(c, inst->Prev, RC_OPCODE_ADD, &inst->U.I, inst->U.I.DstReg,
                inst->U.I.SrcReg[0], negate(srcreg(RC_FILE_TEMPORARY, dst.Index)));
        rc_remove_instruction(inst);
 }
 
+static void transform_TRUNC(struct radeon_compiler* c,
+       struct rc_instruction* inst)
+{
+       /* Lower TRUNC, using trunc(x) = (abs(x) - fract(abs(x))) * sgn(x):
+        *   tmp = FRC(abs(x)), then tmp = abs(x) - tmp.
+        *
+        * The multiplication by sgn(x) is folded into the final CMP, which is
+        * based on the original instruction: dst = (x < 0 ? -tmp : tmp)
+        */
+       struct rc_dst_register dst = try_to_reuse_dst(c, inst);
+       emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dst, absolute(inst->U.I.SrcReg[0]));
+       emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, absolute(inst->U.I.SrcReg[0]),
+             negate(srcreg(RC_FILE_TEMPORARY, dst.Index)));
+       emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg, inst->U.I.SrcReg[0],
+             negate(srcreg(RC_FILE_TEMPORARY, dst.Index)), srcreg(RC_FILE_TEMPORARY, dst.Index));
+       rc_remove_instruction(inst);
+}
+
 /**
  * Definition of LIT (from ARB_fragment_program):
  *
@@ -372,14 +408,14 @@ static void transform_LIT(struct radeon_compiler* c,
                swizzle_wwww(srctemp));
 
        /* tmp.z = (tmp.x > 0) ? tmp.w : 0.0 */
-       emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode,
+       emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I,
                dstregtmpmask(temp, RC_MASK_Z),
                negate(swizzle_xxxx(srctemp)),
                swizzle_wwww(srctemp),
                builtin_zero);
 
        /* tmp.x, tmp.y, tmp.w = 1.0, tmp.x, 1.0 */
-       emit1(c, inst->Prev, RC_OPCODE_MOV, inst->U.I.SaturateMode,
+       emit1(c, inst->Prev, RC_OPCODE_MOV, &inst->U.I,
                dstregtmpmask(temp, RC_MASK_XYW),
                swizzle(srctemp, RC_SWIZZLE_ONE, RC_SWIZZLE_X, RC_SWIZZLE_ONE, RC_SWIZZLE_ONE));
 
@@ -394,7 +430,7 @@ static void transform_LRP(struct radeon_compiler* c,
        emit2(c, inst->Prev, RC_OPCODE_ADD, 0,
                dst,
                inst->U.I.SrcReg[1], negate(inst->U.I.SrcReg[2]));
-       emit3(c, inst->Prev, RC_OPCODE_MAD, inst->U.I.SaturateMode,
+       emit3(c, inst->Prev, RC_OPCODE_MAD, &inst->U.I,
                inst->U.I.DstReg,
                inst->U.I.SrcReg[0], srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[2]);
 
@@ -411,8 +447,45 @@ static void transform_POW(struct radeon_compiler* c,
 
        emit1(c, inst->Prev, RC_OPCODE_LG2, 0, tempdst, swizzle_xxxx(inst->U.I.SrcReg[0]));
        emit2(c, inst->Prev, RC_OPCODE_MUL, 0, tempdst, tempsrc, swizzle_xxxx(inst->U.I.SrcReg[1]));
-       emit1(c, inst->Prev, RC_OPCODE_EX2, inst->U.I.SaturateMode, inst->U.I.DstReg, tempsrc);
+       emit1(c, inst->Prev, RC_OPCODE_EX2, &inst->U.I, inst->U.I.DstReg, tempsrc);
+
+       rc_remove_instruction(inst);
+}
 
+/* dst = ROUND(src) :
+ *   add = src + .5
+ *   frac = FRC(add)
+ *   dst = add - frac
+ *
+ * According to the GLSL spec, the implementor can decide which way to round
+ * when the fraction is .5.  This computes FLR(src + .5), so .5 rounds up.
+ *
+ */
+static void transform_ROUND(struct radeon_compiler* c,
+       struct rc_instruction* inst)
+{
+       unsigned int mask = inst->U.I.DstReg.WriteMask;
+       unsigned int frac_index, add_index;
+       struct rc_dst_register frac_dst, add_dst;
+       struct rc_src_register frac_src, add_src;
+
+       /* add = src + .5 */
+       add_index = rc_find_free_temporary(c);
+       add_dst = dstregtmpmask(add_index, mask);
+       emit2(c, inst->Prev, RC_OPCODE_ADD, 0, add_dst, inst->U.I.SrcReg[0],
+                                                               builtin_half);
+       add_src = srcreg(RC_FILE_TEMPORARY, add_dst.Index);
+
+
+       /* frac = FRC(add) */
+       frac_index = rc_find_free_temporary(c);
+       frac_dst = dstregtmpmask(frac_index, mask);
+       emit1(c, inst->Prev, RC_OPCODE_FRC, 0, frac_dst, add_src);
+       frac_src = srcreg(RC_FILE_TEMPORARY, frac_dst.Index);
+
+       /* dst = add - frac; based on inst so its modifiers (e.g. saturate) survive */
+       emit2(c, inst->Prev, RC_OPCODE_ADD, &inst->U.I, inst->U.I.DstReg,
+                                               add_src, negate(frac_src));
        rc_remove_instruction(inst);
 }
 
@@ -428,7 +501,7 @@ static void transform_SEQ(struct radeon_compiler* c,
        struct rc_dst_register dst = try_to_reuse_dst(c, inst);
 
        emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
-       emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
+       emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
                negate(absolute(srcreg(RC_FILE_TEMPORARY, dst.Index))), builtin_zero, builtin_one);
 
        rc_remove_instruction(inst);
@@ -437,7 +510,7 @@ static void transform_SEQ(struct radeon_compiler* c,
 static void transform_SFL(struct radeon_compiler* c,
        struct rc_instruction* inst)
 {
-       emit1(c, inst->Prev, RC_OPCODE_MOV, inst->U.I.SaturateMode, inst->U.I.DstReg, builtin_zero);
+       emit1(c, inst->Prev, RC_OPCODE_MOV, &inst->U.I, inst->U.I.DstReg, builtin_zero);
        rc_remove_instruction(inst);
 }
 
@@ -447,7 +520,7 @@ static void transform_SGE(struct radeon_compiler* c,
        struct rc_dst_register dst = try_to_reuse_dst(c, inst);
 
        emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
-       emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
+       emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
                srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_zero, builtin_one);
 
        rc_remove_instruction(inst);
@@ -459,7 +532,7 @@ static void transform_SGT(struct radeon_compiler* c,
        struct rc_dst_register dst = try_to_reuse_dst(c, inst);
 
        emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, negate(inst->U.I.SrcReg[0]), inst->U.I.SrcReg[1]);
-       emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
+       emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
                srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_one, builtin_zero);
 
        rc_remove_instruction(inst);
@@ -471,7 +544,7 @@ static void transform_SLE(struct radeon_compiler* c,
        struct rc_dst_register dst = try_to_reuse_dst(c, inst);
 
        emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, negate(inst->U.I.SrcReg[0]), inst->U.I.SrcReg[1]);
-       emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
+       emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
                srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_zero, builtin_one);
 
        rc_remove_instruction(inst);
@@ -483,7 +556,7 @@ static void transform_SLT(struct radeon_compiler* c,
        struct rc_dst_register dst = try_to_reuse_dst(c, inst);
 
        emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
-       emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
+       emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
                srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_one, builtin_zero);
 
        rc_remove_instruction(inst);
@@ -495,7 +568,7 @@ static void transform_SNE(struct radeon_compiler* c,
        struct rc_dst_register dst = try_to_reuse_dst(c, inst);
 
        emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
-       emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
+       emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
                negate(absolute(srcreg(RC_FILE_TEMPORARY, dst.Index))), builtin_one, builtin_zero);
 
        rc_remove_instruction(inst);
@@ -560,7 +633,7 @@ static void transform_XPD(struct radeon_compiler* c,
        emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dst,
                swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_W),
                swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_W));
-       emit3(c, inst->Prev, RC_OPCODE_MAD, inst->U.I.SaturateMode, inst->U.I.DstReg,
+       emit3(c, inst->Prev, RC_OPCODE_MAD, &inst->U.I, inst->U.I.DstReg,
                swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_W),
                swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_W),
                negate(srcreg(RC_FILE_TEMPORARY, dst.Index)));
@@ -599,6 +672,7 @@ int radeonTransformALU(
        case RC_OPCODE_LIT: transform_LIT(c, inst); return 1;
        case RC_OPCODE_LRP: transform_LRP(c, inst); return 1;
        case RC_OPCODE_POW: transform_POW(c, inst); return 1;
+       case RC_OPCODE_ROUND: transform_ROUND(c, inst); return 1;
        case RC_OPCODE_RSQ: transform_RSQ(c, inst); return 1;
        case RC_OPCODE_SEQ: transform_SEQ(c, inst); return 1;
        case RC_OPCODE_SFL: transform_SFL(c, inst); return 1;
@@ -610,6 +684,7 @@ int radeonTransformALU(
        case RC_OPCODE_SSG: transform_SSG(c, inst); return 1;
        case RC_OPCODE_SUB: transform_SUB(c, inst); return 1;
        case RC_OPCODE_SWZ: transform_SWZ(c, inst); return 1;
+       case RC_OPCODE_TRUNC: transform_TRUNC(c, inst); return 1;
        case RC_OPCODE_XPD: transform_XPD(c, inst); return 1;
        default:
                return 0;
@@ -674,7 +749,7 @@ static void transform_r300_vertex_DP3(struct radeon_compiler* c,
        src1.Negate &= ~RC_MASK_W;
        src1.Swizzle &= ~(7 << (3 * 3));
        src1.Swizzle |= RC_SWIZZLE_ZERO << (3 * 3);
-       emit2(c, inst->Prev, RC_OPCODE_DP4, inst->U.I.SaturateMode, inst->U.I.DstReg, src0, src1);
+       emit2(c, inst->Prev, RC_OPCODE_DP4, &inst->U.I, inst->U.I.DstReg, src0, src1);
        rc_remove_instruction(inst);
 }
 
@@ -810,6 +885,17 @@ static void transform_r300_vertex_SSG(struct radeon_compiler* c,
        rc_remove_instruction(inst);
 }
 
+static void transform_vertex_TRUNC(struct radeon_compiler* c,
+       struct rc_instruction* inst)
+{
+       struct rc_instruction *next = inst->Next;
+
+       /* transform_TRUNC() removes inst and leaves its final CMP as
+        * next->Prev; that CMP is then given to the vertex CMP transform. */
+       transform_TRUNC(c, next->Prev);
+       transform_r300_vertex_CMP(c, next->Prev);
+}
+
 /**
  * For use with rc_local_transform, this transforms non-native ALU
  * instructions of the r300 up to r500 vertex engine.
@@ -848,6 +934,7 @@ int r300_transform_vertex_alu(
        case RC_OPCODE_SSG: transform_r300_vertex_SSG(c, inst); return 1;
        case RC_OPCODE_SUB: transform_SUB(c, inst); return 1;
        case RC_OPCODE_SWZ: transform_SWZ(c, inst); return 1;
+       case RC_OPCODE_TRUNC: transform_vertex_TRUNC(c, inst); return 1;
        case RC_OPCODE_XPD: transform_XPD(c, inst); return 1;
        default:
                return 0;
@@ -998,22 +1085,22 @@ static void r300_transform_SIN_COS_SCS(struct radeon_compiler *c,
        unsigned srctmp)
 {
        if (inst->U.I.Opcode == RC_OPCODE_COS) {
-               emit1(c, inst->Prev, RC_OPCODE_COS, inst->U.I.SaturateMode, inst->U.I.DstReg,
+               emit1(c, inst->Prev, RC_OPCODE_COS, &inst->U.I, inst->U.I.DstReg,
                        srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
        } else if (inst->U.I.Opcode == RC_OPCODE_SIN) {
-               emit1(c, inst->Prev, RC_OPCODE_SIN, inst->U.I.SaturateMode,
+               emit1(c, inst->Prev, RC_OPCODE_SIN, &inst->U.I,
                        inst->U.I.DstReg, srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
        } else if (inst->U.I.Opcode == RC_OPCODE_SCS) {
                struct rc_dst_register moddst = inst->U.I.DstReg;
 
                if (inst->U.I.DstReg.WriteMask & RC_MASK_X) {
                        moddst.WriteMask = RC_MASK_X;
-                       emit1(c, inst->Prev, RC_OPCODE_COS, inst->U.I.SaturateMode, moddst,
+                       emit1(c, inst->Prev, RC_OPCODE_COS, &inst->U.I, moddst,
                                srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
                }
                if (inst->U.I.DstReg.WriteMask & RC_MASK_Y) {
                        moddst.WriteMask = RC_MASK_Y;
-                       emit1(c, inst->Prev, RC_OPCODE_SIN, inst->U.I.SaturateMode, moddst,
+                       emit1(c, inst->Prev, RC_OPCODE_SIN, &inst->U.I, moddst,
                                srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
                }
        }
@@ -1120,35 +1207,79 @@ int radeonTransformDeriv(struct radeon_compiler* c,
 }
 
 /**
+ * IF Temp[0].x -> IF Temp[0].x
+ * ...          -> ...
+ * KILP         -> KIL -abs(Temp[0].x)
+ * ...          -> ...
+ * ENDIF        -> ENDIF
+ *
+ * === OR ===
+ *
  * IF Temp[0].x -\
  * KILP         - > KIL -abs(Temp[0].x)
  * ENDIF        -/
  *
- * This needs to be done in its own pass, because it modifies the instructions
- * before and after KILP.
+ * === OR ===
+ *
+ * IF Temp[0].x -> IF Temp[0].x
+ * ...          -> ...
+ * ELSE         -> ELSE
+ * ...          -> ...
+ * KILP         -> KIL -abs(Temp[0].x)
+ * ...          -> ...
+ * ENDIF        -> ENDIF
+ *
+ * === OR ===
+ *
+ * KILP         -> KIL -none.1111
+ *
+ * This needs to be done in its own pass, because it might modify the
+ * instructions before and after KILP.
  */
 void rc_transform_KILP(struct radeon_compiler * c, void *user)
 {
        struct rc_instruction * inst;
        for (inst = c->Program.Instructions.Next;
                        inst != &c->Program.Instructions; inst = inst->Next) {
+               struct rc_instruction * if_inst;
+               unsigned in_if = 0;
 
                if (inst->U.I.Opcode != RC_OPCODE_KILP)
                        continue;
 
+               for (if_inst = inst->Prev; if_inst != &c->Program.Instructions;
+                                               if_inst = if_inst->Prev) {
+
+                       if (if_inst->U.I.Opcode == RC_OPCODE_IF) {
+                               in_if = 1;
+                               break;
+                       }
+               }
+
                inst->U.I.Opcode = RC_OPCODE_KIL;
 
-               if (inst->Prev->U.I.Opcode != RC_OPCODE_IF
-                               || inst->Next->U.I.Opcode != RC_OPCODE_ENDIF) {
+               if (!in_if) {
                        inst->U.I.SrcReg[0] = negate(builtin_one);
                } else {
-
+                       /* This should work even if the KILP is inside the ELSE
+                        * block, because -0.0 is considered negative. */
                        inst->U.I.SrcReg[0] =
-                               negate(absolute(inst->Prev->U.I.SrcReg[0]));
-                       /* Remove IF */
-                       rc_remove_instruction(inst->Prev);
-                       /* Remove ENDIF */
-                       rc_remove_instruction(inst->Next);
+                               negate(absolute(if_inst->U.I.SrcReg[0]));
+
+                       if (inst->Prev->U.I.Opcode != RC_OPCODE_IF
+                               && inst->Next->U.I.Opcode != RC_OPCODE_ENDIF) {
+
+                               /* Optimize the special case:
+                                * IF Temp[0].x
+                                * KILP
+                                * ENDIF
+                                */
+
+                               /* Remove IF */
+                               rc_remove_instruction(inst->Prev);
+                               /* Remove ENDIF */
+                               rc_remove_instruction(inst->Next);
+                       }
                }
        }
 }