gallivm: don't calculate square root of rho if we use accurate rho method

author Roland Scheidegger <sroland@vmware.com>

Thu, 29 Aug 2013 01:58:18 +0000 (03:58 +0200)

committer Roland Scheidegger <sroland@vmware.com>

Fri, 30 Aug 2013 00:16:45 +0000 (02:16 +0200)
author Roland Scheidegger <sroland@vmware.com>
Thu, 29 Aug 2013 01:58:18 +0000 (03:58 +0200)
committer Roland Scheidegger <sroland@vmware.com>
Fri, 30 Aug 2013 00:16:45 +0000 (02:16 +0200)
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c

index e1cfd78e885bca0cb493333f73cb09dda53996a8..9b0a92c9cb9564de20d34223601eefe48ca3f5b9 100644 (file)
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -232,6 +232,7 @@ lp_build_rho(struct lp_build_sample_context *bld,
     unsigned length = coord_bld->type.length;
     unsigned num_quads = length / 4;
     boolean rho_per_quad = rho_bld->type.length != length;
+   boolean no_rho_opt = (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) && (dims > 1);
     unsigned i;
     LLVMValueRef i32undef = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
     LLVMValueRef rho_xvec, rho_yvec;
@@ -264,12 +265,13 @@ lp_build_rho(struct lp_build_sample_context *bld,
        else {
           rho = lp_build_swizzle_scalar_aos(coord_bld, cube_rho, 0, 4);
        }
-      if (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) {
-         rho = lp_build_sqrt(rho_bld, rho);
-      }
        /* Could optimize this for single quad just skip the broadcast */
        cubesize = lp_build_extract_broadcast(gallivm, bld->float_size_in_type,
                                              rho_bld->type, float_size, index0);
+      if (no_rho_opt) {
+         /* skipping sqrt hence returning rho squared */
+         cubesize = lp_build_mul(rho_bld, cubesize, cubesize);
+      }
        rho = lp_build_mul(rho_bld, cubesize, rho);
     }
     else if (derivs && !(bld->static_texture_state->target == PIPE_TEXTURE_CUBE)) {
@@ -281,7 +283,11 @@ lp_build_rho(struct lp_build_sample_context *bld,
           floatdim = lp_build_extract_broadcast(gallivm, bld->float_size_in_type,
                                                 coord_bld->type, float_size, indexi);
  
-         if ((gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) && (dims > 1)) {
+         /*
+          * Note that the rho_per_quad case could use less math (at some shuffle
+          * cost), but for now it uses the same code as the per-pixel lod case.
+          */
+         if (no_rho_opt) {
              ddx[i] = lp_build_mul(coord_bld, floatdim, derivs->ddx[i]);
              ddy[i] = lp_build_mul(coord_bld, floatdim, derivs->ddy[i]);
              ddx[i] = lp_build_mul(coord_bld, ddx[i], ddx[i]);
@@ -295,7 +301,7 @@ lp_build_rho(struct lp_build_sample_context *bld,
              ddmax[i] = lp_build_mul(coord_bld, floatdim, ddmax[i]);
           }
        }
-      if ((gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) && (dims > 1)) {
+      if (no_rho_opt) {
           rho_xvec = lp_build_add(coord_bld, ddx[0], ddx[1]);
           rho_yvec = lp_build_add(coord_bld, ddy[0], ddy[1]);
           if (dims > 2) {
@@ -303,19 +309,8 @@ lp_build_rho(struct lp_build_sample_context *bld,
              rho_yvec = lp_build_add(coord_bld, rho_yvec, ddy[2]);
           }
           rho = lp_build_max(coord_bld, rho_xvec, rho_yvec);
-
-         if (rho_per_quad) {
-            /*
-             * note for this case without per-pixel lod could reduce math more
-             * (at some shuffle cost), but for now only do sqrt after packing,
-             * otherwise would also need different code to per-pixel lod case.
-             */
-            rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
-                                            rho_bld->type, rho, 0);
-         }
-         rho = lp_build_sqrt(rho_bld, rho);
-
-      }
+         /* skipping sqrt hence returning rho squared */
+     }
        else {
           rho = ddmax[0];
           if (dims > 1) {
@@ -324,13 +319,13 @@ lp_build_rho(struct lp_build_sample_context *bld,
                 rho = lp_build_max(coord_bld, rho, ddmax[2]);
              }
           }
-         if (rho_per_quad) {
-            /*
-             * rho_vec contains per-pixel rho, convert to scalar per quad.
-             */
-            rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
-                                            rho_bld->type, rho, 0);
-         }
+      }
+      if (rho_per_quad) {
+         /*
+          * rho_vec contains per-pixel rho, convert to scalar per quad.
+          */
+         rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
+                                         rho_bld->type, rho, 0);
        }
     }
     else {
@@ -362,7 +357,7 @@ lp_build_rho(struct lp_build_sample_context *bld,
           }
        }
  
-      if ((gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) && (dims > 1)) {
+      if (no_rho_opt) {
           static const unsigned char swizzle01[] = { /* no-op swizzle */
              0, 1,
              LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
@@ -407,16 +402,9 @@ lp_build_rho(struct lp_build_sample_context *bld,
                                              rho_bld->type, rho, 0);
           }
           else {
-            /*
-             * on some cpus with half-speed 8-wide sqrt (e.g. SNB but not IVB)
-             * doing pack/sqrt/unpack/swizzle might be better for 8-wide case,
-             * same is true for cpus having faster scalars than 4-wide vecs
-             * for 4-wide case (where pack/unpack would be no-ops anyway).
-             * (Same is true really for cube_rho case above.)
-             */
              rho = lp_build_swizzle_scalar_aos(coord_bld, rho, 0, 4);
           }
-         rho = lp_build_sqrt(rho_bld, rho);
+         /* skipping sqrt hence returning rho squared */
        }
        else {
           ddx_ddy[0] = lp_build_abs(coord_bld, ddx_ddy[0]);
@@ -636,7 +624,7 @@ lp_build_brilinear_rho(struct lp_build_context *bld,
  
     /*
      * The pre factor will make the intersections with the exact powers of two
-    * happen precisely where we want then to be, which means that the integer
+    * happen precisely where we want them to be, which means that the integer
      * part will not need any post adjustments.
      */
     rho = lp_build_mul(bld, rho,
@@ -665,6 +653,34 @@ lp_build_brilinear_rho(struct lp_build_context *bld,
  }
  
  
+/**
+ * Fast implementation of iround(log2(sqrt(x))), based on
+ * log2(x^n) == n*log2(x), i.e. log2(sqrt(x)) == 0.5*log2(x).
+ * \param x  must pass lp_check_value() for bld->type, a float type (asserted)
+ * Gives accurate results all the time.
+ * (Could be trivially extended to handle other power-of-two roots.)
+ */
+static LLVMValueRef
+lp_build_ilog2_sqrt(struct lp_build_context *bld,
+                    LLVMValueRef x)
+{
+   LLVMBuilderRef builder = bld->gallivm->builder;
+   LLVMValueRef ipart;
+   struct lp_type i_type = lp_int_type(bld->type);
+   LLVMValueRef one = lp_build_const_int_vec(bld->gallivm, i_type, 1);
+
+   assert(bld->type.floating);
+
+   assert(lp_check_value(bld->type, x));
+
+   /* ipart = log2(sqrt(x)) + 0.5 = 0.5*(log2(x) + 1.0); the shift halves it */
+   ipart = lp_build_extract_exponent(bld, x, 1);
+   ipart = LLVMBuildAShr(builder, ipart, one, "");
+
+   return ipart;
+}
+
+
  /**
   * Generate code to compute texture level of detail (lambda).
   * \param derivs  partial derivatives of (s, t, r, q) with respect to X and Y
@@ -740,6 +756,8 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
        }
        else {
           LLVMValueRef rho;
+         boolean rho_squared = (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) &&
+                               (bld->dims > 1);
  
           rho = lp_build_rho(bld, texture_unit, s, t, r, cube_rho, derivs);
  
@@ -760,16 +778,28 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
              if (mip_filter == PIPE_TEX_MIPFILTER_NONE ||
                  mip_filter == PIPE_TEX_MIPFILTER_NEAREST) {
                 /*
-                * Don't actually need both all the time, ipart is needed
-                * for nearest mipfilter, pos_or_zero if min != mag.
+                * Don't actually need both values all the time, lod_ipart is
+                * needed for nearest mipfilter, lod_positive if min != mag.
                  */
-               *out_lod_ipart = lp_build_ilog2(lodf_bld, rho);
+               if (rho_squared) {
+                  *out_lod_ipart = lp_build_ilog2_sqrt(lodf_bld, rho);
+               }
+               else {
+                  *out_lod_ipart = lp_build_ilog2(lodf_bld, rho);
+               }
                 *out_lod_positive = lp_build_cmp(lodf_bld, PIPE_FUNC_GREATER,
                                                  rho, lodf_bld->one);
                 return;
              }
              if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR &&
-                !(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR)) {
+                !(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR) &&
+                !rho_squared) {
+               /*
+                * This can't work if rho is squared. Not sure if it could be
+                * fixed while keeping it worthwhile, could also do sqrt here
+                * but brilinear and no_rho_opt seems like a combination not
+                * making much sense anyway so just use ordinary path below.
+                */
                 lp_build_brilinear_rho(lodf_bld, rho, BRILINEAR_FACTOR,
                                        out_lod_ipart, out_lod_fpart);
                 *out_lod_positive = lp_build_cmp(lodf_bld, PIPE_FUNC_GREATER,
@@ -784,6 +814,11 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
           else {
              lod = lp_build_fast_log2(lodf_bld, rho);
           }
+         if (rho_squared) {
+            /* rho is squared here: log2(x) == 0.5*log2(x^2) */
+            lod = lp_build_mul(lodf_bld, lod,
+                               lp_build_const_vec(bld->gallivm, lodf_bld->type, 0.5F));
+         }
  
           /* add shader lod bias */
           if (lod_bias) {
author	Roland Scheidegger <sroland@vmware.com>
	Thu, 29 Aug 2013 01:58:18 +0000 (03:58 +0200)
committer	Roland Scheidegger <sroland@vmware.com>
	Fri, 30 Aug 2013 00:16:45 +0000 (02:16 +0200)