r300/compiler: let lowering passes use up to two fewer temps
author Marek Olšák <maraeo@gmail.com>
Tue, 7 Dec 2010 20:57:18 +0000 (21:57 +0100)
committer Marek Olšák <maraeo@gmail.com>
Wed, 8 Dec 2010 03:39:51 +0000 (04:39 +0100)
CMP may now use two fewer temps; other non-native instructions may end up
using one fewer temp, except for SIN/COS/SCS, which I am leaving unchanged
for now.

This may reduce register pressure inside loops, because the register
allocator doesn't do a very good job there.

src/mesa/drivers/dri/r300/compiler/radeon_program_alu.c

index 01c2e74e7b324666204dfccf113e9579e602be51..58977a40c7c2ca71a4351c8b06bd786a848e48eb 100644 (file)
@@ -85,16 +85,6 @@ static struct rc_instruction *emit3(
        return fpi;
 }
 
-static struct rc_dst_register dstreg(int file, int index)
-{
-       struct rc_dst_register dst;
-       dst.File = file;
-       dst.Index = index;
-       dst.WriteMask = RC_MASK_XYZW;
-       dst.RelAddr = 0;
-       return dst;
-}
-
 static struct rc_dst_register dstregtmpmask(int index, int mask)
 {
        struct rc_dst_register dst = {0};
@@ -187,6 +177,38 @@ static struct rc_src_register swizzle_wwww(struct rc_src_register reg)
        return swizzle_smear(reg, RC_SWIZZLE_W);
 }
 
+static int is_dst_safe_to_reuse(struct rc_instruction *inst)
+{
+       const struct rc_opcode_info *info = rc_get_opcode_info(inst->U.I.Opcode);
+       unsigned i;
+       /* Returns 1 if inst's destination can double as the scratch temp of its lowering, else 0. */
+       assert(info->HasDstReg);
+       /* Only a destination in the temporary file can stand in for a temporary. */
+       if (inst->U.I.DstReg.File != RC_FILE_TEMPORARY)
+               return 0;
+       /* Reject if any temporary source shares dst's index: the scratch write would clobber it. */
+       for (i = 0; i < info->NumSrcRegs; i++) {
+               if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY &&
+                   inst->U.I.SrcReg[i].Index == inst->U.I.DstReg.Index)
+                       return 0;
+       }
+       /* dst is a temporary that none of the opcode's source operands read. */
+       return 1;
+}
+
+static struct rc_dst_register try_to_reuse_dst(struct radeon_compiler *c,
+                                              struct rc_instruction *inst)
+{
+       unsigned tmp;
+       /* Pick the scratch index: inst's own dst when that is safe, otherwise a free temporary. */
+       if (is_dst_safe_to_reuse(inst))
+               tmp = inst->U.I.DstReg.Index;
+       else
+               tmp = rc_find_free_temporary(c);
+       /* Hand that index and inst's write mask to dstregtmpmask to build the returned register. */
+       return dstregtmpmask(tmp, inst->U.I.DstReg.WriteMask);
+}
+
 static void transform_ABS(struct radeon_compiler* c,
        struct rc_instruction* inst)
 {
@@ -210,10 +232,10 @@ static void transform_CEIL(struct radeon_compiler* c,
         *     ceil(x) = x+frac(-x)
         */
 
-       int tempreg = rc_find_free_temporary(c);
-       emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstreg(RC_FILE_TEMPORARY, tempreg), negate(inst->U.I.SrcReg[0]));
+       struct rc_dst_register dst = try_to_reuse_dst(c, inst);
+       emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dst, negate(inst->U.I.SrcReg[0]));
        emit2(c, inst->Prev, RC_OPCODE_ADD, inst->U.I.SaturateMode, inst->U.I.DstReg,
-               inst->U.I.SrcReg[0], srcreg(RC_FILE_TEMPORARY, tempreg));
+               inst->U.I.SrcReg[0], srcreg(RC_FILE_TEMPORARY, dst.Index));
        rc_remove_instruction(inst);
 }
 
@@ -225,11 +247,11 @@ static void transform_CLAMP(struct radeon_compiler *c,
         * MIN tmp, src, max
         * MAX dst, tmp, min
         */
-       int tempreg = rc_find_free_temporary(c);
-       emit2(c, inst->Prev, RC_OPCODE_MIN, 0, dstreg(RC_FILE_TEMPORARY, tempreg),
+       struct rc_dst_register dst = try_to_reuse_dst(c, inst);
+       emit2(c, inst->Prev, RC_OPCODE_MIN, 0, dst,
                inst->U.I.SrcReg[0], inst->U.I.SrcReg[2]);
        emit2(c, inst->Prev, RC_OPCODE_MAX, inst->U.I.SaturateMode, inst->U.I.DstReg,
-               srcreg(RC_FILE_TEMPORARY, tempreg), inst->U.I.SrcReg[1]);
+               srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[1]);
        rc_remove_instruction(inst);
 }
 
@@ -275,10 +297,10 @@ static void transform_DST(struct radeon_compiler* c,
 static void transform_FLR(struct radeon_compiler* c,
        struct rc_instruction* inst)
 {
-       int tempreg = rc_find_free_temporary(c);
-       emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstreg(RC_FILE_TEMPORARY, tempreg), inst->U.I.SrcReg[0]);
+       struct rc_dst_register dst = try_to_reuse_dst(c, inst);
+       emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dst, inst->U.I.SrcReg[0]);
        emit2(c, inst->Prev, RC_OPCODE_ADD, inst->U.I.SaturateMode, inst->U.I.DstReg,
-               inst->U.I.SrcReg[0], negate(srcreg(RC_FILE_TEMPORARY, tempreg)));
+               inst->U.I.SrcReg[0], negate(srcreg(RC_FILE_TEMPORARY, dst.Index)));
        rc_remove_instruction(inst);
 }
 
@@ -368,14 +390,14 @@ static void transform_LIT(struct radeon_compiler* c,
 static void transform_LRP(struct radeon_compiler* c,
        struct rc_instruction* inst)
 {
-       int tempreg = rc_find_free_temporary(c);
+       struct rc_dst_register dst = try_to_reuse_dst(c, inst);
 
        emit2(c, inst->Prev, RC_OPCODE_ADD, 0,
-               dstreg(RC_FILE_TEMPORARY, tempreg),
+               dst,
                inst->U.I.SrcReg[1], negate(inst->U.I.SrcReg[2]));
        emit3(c, inst->Prev, RC_OPCODE_MAD, inst->U.I.SaturateMode,
                inst->U.I.DstReg,
-               inst->U.I.SrcReg[0], srcreg(RC_FILE_TEMPORARY, tempreg), inst->U.I.SrcReg[2]);
+               inst->U.I.SrcReg[0], srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[2]);
 
        rc_remove_instruction(inst);
 }
@@ -383,9 +405,8 @@ static void transform_LRP(struct radeon_compiler* c,
 static void transform_POW(struct radeon_compiler* c,
        struct rc_instruction* inst)
 {
-       int tempreg = rc_find_free_temporary(c);
-       struct rc_dst_register tempdst = dstreg(RC_FILE_TEMPORARY, tempreg);
-       struct rc_src_register tempsrc = srcreg(RC_FILE_TEMPORARY, tempreg);
+       struct rc_dst_register tempdst = try_to_reuse_dst(c, inst);
+       struct rc_src_register tempsrc = srcreg(RC_FILE_TEMPORARY, tempdst.Index);
        tempdst.WriteMask = RC_MASK_W;
        tempsrc.Swizzle = RC_SWIZZLE_WWWW;
 
@@ -405,11 +426,11 @@ static void transform_RSQ(struct radeon_compiler* c,
 static void transform_SEQ(struct radeon_compiler* c,
        struct rc_instruction* inst)
 {
-       int tempreg = rc_find_free_temporary(c);
+       struct rc_dst_register dst = try_to_reuse_dst(c, inst);
 
-       emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dstreg(RC_FILE_TEMPORARY, tempreg), inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
+       emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
        emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
-               negate(absolute(srcreg(RC_FILE_TEMPORARY, tempreg))), builtin_zero, builtin_one);
+               negate(absolute(srcreg(RC_FILE_TEMPORARY, dst.Index))), builtin_zero, builtin_one);
 
        rc_remove_instruction(inst);
 }
@@ -424,11 +445,11 @@ static void transform_SFL(struct radeon_compiler* c,
 static void transform_SGE(struct radeon_compiler* c,
        struct rc_instruction* inst)
 {
-       int tempreg = rc_find_free_temporary(c);
+       struct rc_dst_register dst = try_to_reuse_dst(c, inst);
 
-       emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dstreg(RC_FILE_TEMPORARY, tempreg), inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
+       emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
        emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
-               srcreg(RC_FILE_TEMPORARY, tempreg), builtin_zero, builtin_one);
+               srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_zero, builtin_one);
 
        rc_remove_instruction(inst);
 }
@@ -436,11 +457,11 @@ static void transform_SGE(struct radeon_compiler* c,
 static void transform_SGT(struct radeon_compiler* c,
        struct rc_instruction* inst)
 {
-       int tempreg = rc_find_free_temporary(c);
+       struct rc_dst_register dst = try_to_reuse_dst(c, inst);
 
-       emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dstreg(RC_FILE_TEMPORARY, tempreg), negate(inst->U.I.SrcReg[0]), inst->U.I.SrcReg[1]);
+       emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, negate(inst->U.I.SrcReg[0]), inst->U.I.SrcReg[1]);
        emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
-               srcreg(RC_FILE_TEMPORARY, tempreg), builtin_one, builtin_zero);
+               srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_one, builtin_zero);
 
        rc_remove_instruction(inst);
 }
@@ -448,11 +469,11 @@ static void transform_SGT(struct radeon_compiler* c,
 static void transform_SLE(struct radeon_compiler* c,
        struct rc_instruction* inst)
 {
-       int tempreg = rc_find_free_temporary(c);
+       struct rc_dst_register dst = try_to_reuse_dst(c, inst);
 
-       emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dstreg(RC_FILE_TEMPORARY, tempreg), negate(inst->U.I.SrcReg[0]), inst->U.I.SrcReg[1]);
+       emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, negate(inst->U.I.SrcReg[0]), inst->U.I.SrcReg[1]);
        emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
-               srcreg(RC_FILE_TEMPORARY, tempreg), builtin_zero, builtin_one);
+               srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_zero, builtin_one);
 
        rc_remove_instruction(inst);
 }
@@ -460,11 +481,11 @@ static void transform_SLE(struct radeon_compiler* c,
 static void transform_SLT(struct radeon_compiler* c,
        struct rc_instruction* inst)
 {
-       int tempreg = rc_find_free_temporary(c);
+       struct rc_dst_register dst = try_to_reuse_dst(c, inst);
 
-       emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dstreg(RC_FILE_TEMPORARY, tempreg), inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
+       emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
        emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
-               srcreg(RC_FILE_TEMPORARY, tempreg), builtin_one, builtin_zero);
+               srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_one, builtin_zero);
 
        rc_remove_instruction(inst);
 }
@@ -472,11 +493,11 @@ static void transform_SLT(struct radeon_compiler* c,
 static void transform_SNE(struct radeon_compiler* c,
        struct rc_instruction* inst)
 {
-       int tempreg = rc_find_free_temporary(c);
+       struct rc_dst_register dst = try_to_reuse_dst(c, inst);
 
-       emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dstreg(RC_FILE_TEMPORARY, tempreg), inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
+       emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
        emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
-               negate(absolute(srcreg(RC_FILE_TEMPORARY, tempreg))), builtin_one, builtin_zero);
+               negate(absolute(srcreg(RC_FILE_TEMPORARY, dst.Index))), builtin_one, builtin_zero);
 
        rc_remove_instruction(inst);
 }
@@ -490,12 +511,13 @@ static void transform_SSG(struct radeon_compiler* c,
         *   CMP tmp1, x, 1, 0
         *   ADD result, tmp0, -tmp1;
         */
-       unsigned tmp0, tmp1;
+       struct rc_dst_register dst0;
+       unsigned tmp1;
 
        /* 0 < x */
-       tmp0 = rc_find_free_temporary(c);
+       dst0 = try_to_reuse_dst(c, inst);
        emit3(c, inst->Prev, RC_OPCODE_CMP, 0,
-             dstregtmpmask(tmp0, inst->U.I.DstReg.WriteMask),
+             dst0,
              negate(inst->U.I.SrcReg[0]),
              builtin_one,
              builtin_zero);
@@ -512,7 +534,7 @@ static void transform_SSG(struct radeon_compiler* c,
        /* result = tmp0 - tmp1 */
        emit2(c, inst->Prev, RC_OPCODE_ADD, 0,
              inst->U.I.DstReg,
-             srcreg(RC_FILE_TEMPORARY, tmp0),
+             srcreg(RC_FILE_TEMPORARY, dst0.Index),
              negate(srcreg(RC_FILE_TEMPORARY, tmp1)));
 
        rc_remove_instruction(inst);
@@ -534,15 +556,15 @@ static void transform_SWZ(struct radeon_compiler* c,
 static void transform_XPD(struct radeon_compiler* c,
        struct rc_instruction* inst)
 {
-       int tempreg = rc_find_free_temporary(c);
+       struct rc_dst_register dst = try_to_reuse_dst(c, inst);
 
-       emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dstreg(RC_FILE_TEMPORARY, tempreg),
+       emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dst,
                swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_W),
                swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_W));
        emit3(c, inst->Prev, RC_OPCODE_MAD, inst->U.I.SaturateMode, inst->U.I.DstReg,
                swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_W),
                swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_W),
-               negate(srcreg(RC_FILE_TEMPORARY, tempreg)));
+               negate(srcreg(RC_FILE_TEMPORARY, dst.Index)));
 
        rc_remove_instruction(inst);
 }
@@ -610,7 +632,7 @@ static void transform_r300_vertex_CMP(struct radeon_compiler* c,
 {
        /* There is no decent CMP available, so let's rig one up.
         * CMP is defined as dst = src0 < 0.0 ? src1 : src2
-        * The following sequence consumes two temps and two extra slots
+        * The following sequence consumes zero to two temps and two extra slots
         * (the second temp and the second slot is consumed by transform_LRP),
         * but should be equivalent:
         *
@@ -618,18 +640,18 @@ static void transform_r300_vertex_CMP(struct radeon_compiler* c,
         * LRP dst, tmp0, src1, src2
         *
         * Yes, I know, I'm a mad scientist. ~ C. & M. */
-       int tempreg0 = rc_find_free_temporary(c);
+       struct rc_dst_register dst = try_to_reuse_dst(c, inst);
 
        /* SLT tmp0, src0, 0.0 */
        emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
-               dstreg(RC_FILE_TEMPORARY, tempreg0),
+               dst,
                inst->U.I.SrcReg[0], builtin_zero);
 
        /* LRP dst, tmp0, src1, src2 */
        transform_LRP(c,
                emit3(c, inst->Prev, RC_OPCODE_LRP, 0,
                      inst->U.I.DstReg,
-                     srcreg(RC_FILE_TEMPORARY, tempreg0), inst->U.I.SrcReg[1],  inst->U.I.SrcReg[2]));
+                     srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[1],  inst->U.I.SrcReg[2]));
 
        rc_remove_instruction(inst);
 }
@@ -660,7 +682,7 @@ static void transform_r300_vertex_DP3(struct radeon_compiler* c,
 static void transform_r300_vertex_fix_LIT(struct radeon_compiler* c,
        struct rc_instruction* inst)
 {
-       int tempreg = rc_find_free_temporary(c);
+       struct rc_dst_register dst = try_to_reuse_dst(c, inst);
        unsigned constant_swizzle;
        int constant = rc_constants_add_immediate_scalar(&c->Program.Constants,
                                                         0.0000000000000000001,
@@ -668,16 +690,16 @@ static void transform_r300_vertex_fix_LIT(struct radeon_compiler* c,
 
        /* MOV dst, src */
        emit1(c, inst->Prev, RC_OPCODE_MOV, 0,
-               dstreg(RC_FILE_TEMPORARY, tempreg),
+               dst,
                inst->U.I.SrcReg[0]);
 
        /* MAX dst.z, src, 0.00...001 */
        emit2(c, inst->Prev, RC_OPCODE_MAX, 0,
-               dstregtmpmask(tempreg, RC_MASK_Y),
-               srcreg(RC_FILE_TEMPORARY, tempreg),
+               dstregtmpmask(dst.Index, RC_MASK_Y),
+               srcreg(RC_FILE_TEMPORARY, dst.Index),
                srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle));
 
-       inst->U.I.SrcReg[0] = srcreg(RC_FILE_TEMPORARY, tempreg);
+       inst->U.I.SrcReg[0] = srcreg(RC_FILE_TEMPORARY, dst.Index);
 }
 
 static void transform_r300_vertex_SEQ(struct radeon_compiler *c,
@@ -761,12 +783,13 @@ static void transform_r300_vertex_SSG(struct radeon_compiler* c,
         *   SLT tmp1, x, 0;
         *   ADD result, tmp0, -tmp1;
         */
-       unsigned tmp0, tmp1;
+       struct rc_dst_register dst0 = try_to_reuse_dst(c, inst);
+       unsigned tmp1;
 
        /* 0 < x */
-       tmp0 = rc_find_free_temporary(c);
+       dst0 = try_to_reuse_dst(c, inst);
        emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
-             dstregtmpmask(tmp0, inst->U.I.DstReg.WriteMask),
+             dst0,
              builtin_zero,
              inst->U.I.SrcReg[0]);
 
@@ -781,7 +804,7 @@ static void transform_r300_vertex_SSG(struct radeon_compiler* c,
        /* result = tmp0 - tmp1 */
        emit2(c, inst->Prev, RC_OPCODE_ADD, 0,
              inst->U.I.DstReg,
-             srcreg(RC_FILE_TEMPORARY, tmp0),
+             srcreg(RC_FILE_TEMPORARY, dst0.Index),
              negate(srcreg(RC_FILE_TEMPORARY, tmp1)));
 
        rc_remove_instruction(inst);