Merge branch '7.8'
[mesa.git] / src / mesa / drivers / dri / r300 / compiler / radeon_program_alu.c
index ced66af1eb59a7fda54f89e2594c3c96ba38c50a..05b874ba7cf7f41e36580a263348f533ca75b888 100644 (file)
@@ -175,6 +175,26 @@ static void transform_ABS(struct radeon_compiler* c,
        rc_remove_instruction(inst);
 }
 
+static void transform_CEIL(struct radeon_compiler* c,
+       struct rc_instruction* inst)
+{
+       /* ceil(x) = -floor(-x) and floor(y) = y - frac(y), hence
+        *     ceil(x) = -(-x - frac(-x)) = x + frac(-x)
+        * so CEIL is rewritten as an FRC into a temporary plus an ADD.
+        */
+       const int frac_tmp = rc_find_free_temporary(c);
+
+       /* frac_tmp = frac(-x) */
+       emit1(c, inst->Prev, RC_OPCODE_FRC, 0,
+               dstreg(RC_FILE_TEMPORARY, frac_tmp), negate(inst->U.I.SrcReg[0]));
+
+       /* dst = x + frac_tmp; the original saturate mode carries over. */
+       emit2(c, inst->Prev, RC_OPCODE_ADD, inst->U.I.SaturateMode,
+               inst->U.I.DstReg, inst->U.I.SrcReg[0],
+               srcreg(RC_FILE_TEMPORARY, frac_tmp));
+       rc_remove_instruction(inst);
+}
+
 static void transform_DP3(struct radeon_compiler* c,
        struct rc_instruction* inst)
 {
@@ -267,9 +287,9 @@ static void transform_LIT(struct radeon_compiler* c,
        temp = inst->U.I.DstReg.Index;
        srctemp = srcreg(RC_FILE_TEMPORARY, temp);
 
-       // tmp.x = max(0.0, Src.x);
-       // tmp.y = max(0.0, Src.y);
-       // tmp.w = clamp(Src.z, -128+eps, 128-eps);
+       /* tmp.x = max(0.0, Src.x); */
+       /* tmp.y = max(0.0, Src.y); */
+       /* tmp.w = clamp(Src.z, -128+eps, 128-eps); */
        emit2(c, inst->Prev, RC_OPCODE_MAX, 0,
                dstregtmpmask(temp, RC_MASK_XYW),
                inst->U.I.SrcReg[0],
@@ -280,7 +300,7 @@ static void transform_LIT(struct radeon_compiler* c,
                swizzle(srctemp, RC_SWIZZLE_W, RC_SWIZZLE_W, RC_SWIZZLE_W, RC_SWIZZLE_W),
                negate(srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle)));
 
-       // tmp.w = Pow(tmp.y, tmp.w)
+       /* tmp.w = Pow(tmp.y, tmp.w) */
        emit1(c, inst->Prev, RC_OPCODE_LG2, 0,
                dstregtmpmask(temp, RC_MASK_W),
                swizzle(srctemp, RC_SWIZZLE_Y, RC_SWIZZLE_Y, RC_SWIZZLE_Y, RC_SWIZZLE_Y));
@@ -292,14 +312,14 @@ static void transform_LIT(struct radeon_compiler* c,
                dstregtmpmask(temp, RC_MASK_W),
                swizzle(srctemp, RC_SWIZZLE_W, RC_SWIZZLE_W, RC_SWIZZLE_W, RC_SWIZZLE_W));
 
-       // tmp.z = (tmp.x > 0) ? tmp.w : 0.0
+       /* tmp.z = (tmp.x > 0) ? tmp.w : 0.0 */
        emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode,
                dstregtmpmask(temp, RC_MASK_Z),
                negate(swizzle(srctemp, RC_SWIZZLE_X, RC_SWIZZLE_X, RC_SWIZZLE_X, RC_SWIZZLE_X)),
                swizzle(srctemp, RC_SWIZZLE_W, RC_SWIZZLE_W, RC_SWIZZLE_W, RC_SWIZZLE_W),
                builtin_zero);
 
-       // tmp.x, tmp.y, tmp.w = 1.0, tmp.x, 1.0
+       /* tmp.x, tmp.y, tmp.w = 1.0, tmp.x, 1.0 */
        emit1(c, inst->Prev, RC_OPCODE_MOV, inst->U.I.SaturateMode,
                dstregtmpmask(temp, RC_MASK_XYW),
                swizzle(srctemp, RC_SWIZZLE_ONE, RC_SWIZZLE_X, RC_SWIZZLE_ONE, RC_SWIZZLE_ONE));
@@ -458,7 +478,7 @@ static void transform_XPD(struct radeon_compiler* c,
  * no userData necessary.
  *
  * Eliminates the following ALU instructions:
- *  ABS, DPH, DST, FLR, LIT, LRP, POW, SEQ, SFL, SGE, SGT, SLE, SLT, SNE, SUB, SWZ, XPD
+ *  ABS, CEIL, DPH, DST, FLR, LIT, LRP, POW, SEQ, SFL, SGE, SGT, SLE, SLT, SNE, SUB, SWZ, XPD
  * using:
  *  MOV, ADD, MUL, MAD, FRC, DP3, LG2, EX2, CMP
  *
@@ -474,6 +494,7 @@ int radeonTransformALU(
 {
        switch(inst->U.I.Opcode) {
        case RC_OPCODE_ABS: transform_ABS(c, inst); return 1;
+       case RC_OPCODE_CEIL: transform_CEIL(c, inst); return 1;
        case RC_OPCODE_DPH: transform_DPH(c, inst); return 1;
        case RC_OPCODE_DST: transform_DST(c, inst); return 1;
        case RC_OPCODE_FLR: transform_FLR(c, inst); return 1;
@@ -506,6 +527,35 @@ static void transform_r300_vertex_ABS(struct radeon_compiler* c,
        inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
 }
 
+static void transform_r300_vertex_CMP(struct radeon_compiler* c,
+       struct rc_instruction* inst)
+{
+       /* There is no suitable native CMP here, so emulate it.
+        * CMP is dst = src0 < 0.0 ? src1 : src2, which is rebuilt as:
+        *
+        *     SLT tmp, src0, 0.0
+        *     LRP dst, tmp, src1, src2
+        *
+        * The LRP is lowered immediately by transform_LRP. Stated cost: two
+        * temps and two extra slots, one of each consumed by transform_LRP.
+        */
+       const int cond_tmp = rc_find_free_temporary(c);
+       struct rc_instruction* lrp;
+
+       /* cond_tmp = (src0 < 0.0) */
+       emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
+               dstreg(RC_FILE_TEMPORARY, cond_tmp), inst->U.I.SrcReg[0], builtin_zero);
+
+       /* dst = LRP(cond_tmp, src1, src2), expanded right away */
+       lrp = emit3(c, inst->Prev, RC_OPCODE_LRP, 0, inst->U.I.DstReg,
+               srcreg(RC_FILE_TEMPORARY, cond_tmp),
+               inst->U.I.SrcReg[1], inst->U.I.SrcReg[2]);
+       transform_LRP(c, lrp);
+
+       /* The CMP itself is no longer needed. */
+       rc_remove_instruction(inst);
+}
+
 /**
  * For use with radeonLocalTransform, this transforms non-native ALU
  * instructions of the r300 up to r500 vertex engine.
@@ -517,6 +567,8 @@ int r300_transform_vertex_alu(
 {
        switch(inst->U.I.Opcode) {
        case RC_OPCODE_ABS: transform_r300_vertex_ABS(c, inst); return 1;
+       case RC_OPCODE_CEIL: transform_CEIL(c, inst); return 1;
+       case RC_OPCODE_CMP: transform_r300_vertex_CMP(c, inst); return 1;
        case RC_OPCODE_DP3: transform_DP3(c, inst); return 1;
        case RC_OPCODE_DPH: transform_DPH(c, inst); return 1;
        case RC_OPCODE_FLR: transform_FLR(c, inst); return 1;
@@ -533,16 +585,16 @@ static void sincos_constants(struct radeon_compiler* c, unsigned int *constants)
 {
        static const float SinCosConsts[2][4] = {
                {
-                       1.273239545,            // 4/PI
-                       -0.405284735,           // -4/(PI*PI)
-                       3.141592654,            // PI
-                       0.2225                  // weight
+                       1.273239545,            /* 4/PI */
+                       -0.405284735,           /* -4/(PI*PI) */
+                       3.141592654,            /* PI */
+                       0.2225                  /* weight */
                },
                {
                        0.75,
                        0.5,
-                       0.159154943,            // 1/(2*PI)
-                       6.283185307             // 2*PI
+                       0.159154943,            /* 1/(2*PI) */
+                       6.283185307             /* 2*PI */
                }
        };
        int i;
@@ -602,9 +654,9 @@ int radeonTransformTrigSimple(struct radeon_compiler* c,
        sincos_constants(c, constants);
 
        if (inst->U.I.Opcode == RC_OPCODE_COS) {
-               // MAD tmp.x, src, 1/(2*PI), 0.75
-               // FRC tmp.x, tmp.x
-               // MAD tmp.z, tmp.x, 2*PI, -PI
+               /* MAD tmp.x, src, 1/(2*PI), 0.75 */
+               /* FRC tmp.x, tmp.x */
+               /* MAD tmp.z, tmp.x, 2*PI, -PI */
                emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),
                        swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_X, RC_SWIZZLE_X, RC_SWIZZLE_X, RC_SWIZZLE_X),
                        swizzle(srcreg(RC_FILE_CONSTANT, constants[1]), RC_SWIZZLE_Z, RC_SWIZZLE_Z, RC_SWIZZLE_Z, RC_SWIZZLE_Z),