gallivm: optimize lp_build_minify for sse
author Roland Scheidegger <sroland@vmware.com>
Tue, 5 Nov 2013 18:21:25 +0000 (19:21 +0100)
committer Roland Scheidegger <sroland@vmware.com>
Tue, 5 Nov 2013 22:32:24 +0000 (23:32 +0100)
SSE can't handle true vector shifts (with variable shift count),
so llvm is turning them into a mess of extracts, scalar shifts and inserts.
It is however possible to emulate them in lp_build_minify with float muls,
which should be way faster (saves over 20 instructions per 8-wide
lp_build_minify). This wouldn't work for "generic" 32bit shifts though
since we've got only 24bits of mantissa (actually for left shifts it would
work by using sse41 int mul instead of float mul but not for right shifts).
Note that this has very limited scope for now, since this is only used with
per-pixel lod (otherwise we're avoiding the non-constant shift count by doing
per-quad shifts manually), and only 1d textures even then (though the latter
should change).

Reviewed-by: Brian Paul <brianp@vmware.com>
Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
src/gallium/auxiliary/gallivm/lp_bld_sample.c
src/gallium/auxiliary/gallivm/lp_bld_sample.h
src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c

index a032d9d68954f8b9cd55ca4558d340c246157715..e60a035a8829a11ac45cdb14633172578f6c2574 100644 (file)
@@ -36,6 +36,7 @@
 #include "pipe/p_state.h"
 #include "util/u_format.h"
 #include "util/u_math.h"
+#include "util/u_cpu_detect.h"
 #include "lp_bld_arit.h"
 #include "lp_bld_const.h"
 #include "lp_bld_debug.h"
@@ -248,7 +249,7 @@ lp_build_rho(struct lp_build_sample_context *bld,
    first_level = bld->dynamic_state->first_level(bld->dynamic_state,
                                                  bld->gallivm, texture_unit);
    first_level_vec = lp_build_broadcast_scalar(int_size_bld, first_level);
-   int_size = lp_build_minify(int_size_bld, bld->int_size, first_level_vec);
+   int_size = lp_build_minify(int_size_bld, bld->int_size, first_level_vec, TRUE);
    float_size = lp_build_int_to_float(float_size_bld, int_size);
 
    if (cube_rho) {
@@ -1089,12 +1090,14 @@ lp_build_get_mip_offsets(struct lp_build_sample_context *bld,
 
 /**
  * Codegen equivalent for u_minify().
+ * @param lod_scalar  if lod is a (broadcasted) scalar
  * Return max(1, base_size >> level);
  */
 LLVMValueRef
 lp_build_minify(struct lp_build_context *bld,
                 LLVMValueRef base_size,
-                LLVMValueRef level)
+                LLVMValueRef level,
+                boolean lod_scalar)
 {
    LLVMBuilderRef builder = bld->gallivm->builder;
    assert(lp_check_value(bld->type, base_size));
@@ -1105,10 +1108,49 @@ lp_build_minify(struct lp_build_context *bld,
       return base_size;
    }
    else {
-      LLVMValueRef size =
-         LLVMBuildLShr(builder, base_size, level, "minify");
+      LLVMValueRef size;
       assert(bld->type.sign);
-      size = lp_build_max(bld, size, bld->one);
+      if (lod_scalar ||
+         (util_cpu_caps.has_avx2 || !util_cpu_caps.has_sse)) {
+         size = LLVMBuildLShr(builder, base_size, level, "minify");
+         size = lp_build_max(bld, size, bld->one);
+      }
+      else {
+         /*
+          * emulate shift with float mul, since intel "forgot" shifts with
+          * per-element shift count until avx2, which results in terrible
+          * scalar extraction (both count and value), scalar shift,
+          * vector reinsertion. Should not be an issue on any non-x86 cpu
+          * with a vector instruction set.
+          * On cpus with AMD's XOP this should also be unnecessary but I'm
+          * not sure if llvm would emit this with current flags.
+          */
+         LLVMValueRef const127, const23, lf;
+         struct lp_type ftype;
+         struct lp_build_context fbld;
+         ftype = lp_type_float_vec(32, bld->type.length * bld->type.width);
+         lp_build_context_init(&fbld, bld->gallivm, ftype);
+         const127 = lp_build_const_int_vec(bld->gallivm, bld->type, 127);
+         const23 = lp_build_const_int_vec(bld->gallivm, bld->type, 23);
+
+         /* calculate 2^(-level) float */
+         lf = lp_build_sub(bld, const127, level);
+         lf = lp_build_shl(bld, lf, const23);
+         lf = LLVMBuildBitCast(builder, lf, fbld.vec_type, "");
+
+         /* finish shift operation by doing float mul */
+         base_size = lp_build_int_to_float(&fbld, base_size);
+         size = lp_build_mul(&fbld, base_size, lf);
+         /*
+          * do the max also with floats because
+          * a) non-emulated int max requires sse41
+          *    (this is actually a lie as we could cast to 16bit values
+          *    as 16bit is sufficient and 16bit int max is sse2)
+          * b) with avx we can do int max 4-wide but float max 8-wide
+          */
+         size = lp_build_max(&fbld, size, fbld.one);
+         size = lp_build_itrunc(&fbld, size);
+      }
       return size;
    }
 }
@@ -1185,7 +1227,7 @@ lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
     */
    if (bld->num_mips == 1) {
       ilevel_vec = lp_build_broadcast_scalar(&bld->int_size_bld, ilevel);
-      *out_size = lp_build_minify(&bld->int_size_bld, bld->int_size, ilevel_vec);
+      *out_size = lp_build_minify(&bld->int_size_bld, bld->int_size, ilevel_vec, TRUE);
    }
    else {
       LLVMValueRef int_size_vec;
@@ -1229,7 +1271,7 @@ lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
                                                  bld4.type,
                                                  ilevel,
                                                  indexi);
-            tmp[i] = lp_build_minify(&bld4, int_size_vec, ileveli);
+            tmp[i] = lp_build_minify(&bld4, int_size_vec, ileveli, TRUE);
          }
          /*
           * out_size is [w0, h0, d0, _, w1, h1, d1, _, ...] vector for dims > 1,
@@ -1248,7 +1290,6 @@ lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
          * with 4-wide vector pack all elements into a 8xi16 vector
          * (on which we can still do useful math) instead of using a 16xi32
          * vector.
-         * FIXME: some callers can't handle this yet.
          * For dims == 1 this will create [w0, w1, w2, w3, ...] vector.
          * For dims > 1 this will create [w0, h0, d0, _, w1, h1, d1, _, ...] vector.
          */
@@ -1257,8 +1298,7 @@ lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
             assert(bld->int_size_in_bld.type.length == 1);
             int_size_vec = lp_build_broadcast_scalar(&bld->int_coord_bld,
                                                      bld->int_size);
-            /* vector shift with variable shift count alert... */
-            *out_size = lp_build_minify(&bld->int_coord_bld, int_size_vec, ilevel);
+            *out_size = lp_build_minify(&bld->int_coord_bld, int_size_vec, ilevel, FALSE);
          }
          else {
             LLVMValueRef ilevel1;
@@ -1267,7 +1307,7 @@ lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
                ilevel1 = lp_build_extract_broadcast(bld->gallivm, bld->int_coord_type,
                                                     bld->int_size_in_bld.type, ilevel, indexi);
                tmp[i] = bld->int_size;
-               tmp[i] = lp_build_minify(&bld->int_size_in_bld, tmp[i], ilevel1);
+               tmp[i] = lp_build_minify(&bld->int_size_in_bld, tmp[i], ilevel1, TRUE);
             }
             *out_size = lp_build_concat(bld->gallivm, tmp,
                                         bld->int_size_in_bld.type,
index 5039128a203e5af03148732ce25ba856bab65959..fd4e0532607dbfef0ab82d73b1a35a8a303e8d9d 100644 (file)
@@ -547,7 +547,8 @@ lp_build_sample_nop(struct gallivm_state *gallivm,
 LLVMValueRef
 lp_build_minify(struct lp_build_context *bld,
                 LLVMValueRef base_size,
-                LLVMValueRef level);
+                LLVMValueRef level,
+                boolean lod_scalar);
 
 
 #endif /* LP_BLD_SAMPLE_H */
index 2d833318aeee4f9b2f361cd19921b9eca275c16f..e8c04d1e6c53343a9a9503202b1dcd49b378dcf4 100644 (file)
@@ -2940,7 +2940,7 @@ lp_build_size_query_soa(struct gallivm_state *gallivm,
                                     lp_build_const_int32(gallivm, 2), "");
    }
 
-   size = lp_build_minify(&bld_int_vec4, size, lod);
+   size = lp_build_minify(&bld_int_vec4, size, lod, TRUE);
 
    if (has_array)
       size = LLVMBuildInsertElement(gallivm->builder, size,