From 5ae31d7e1d3d51c7843571c63aa228f8ca9b879f Mon Sep 17 00:00:00 2001
From: Roland Scheidegger <sroland@vmware.com>
Date: Tue, 5 Nov 2013 19:21:25 +0100
Subject: [PATCH] gallivm: optimize lp_build_minify for sse

SSE can't handle true vector shifts (with per-element shift counts),
so llvm turns them into a mess of extracts, scalar shifts and inserts.
It is however possible to emulate them in lp_build_minify with float muls,
which should be way faster (saves over 20 instructions per 8-wide
lp_build_minify). This wouldn't work for "generic" 32-bit shifts though,
since floats have only 24 bits of mantissa (left shifts could actually be
done with sse41 int mul instead of float mul, but right shifts could not).
Note that this has very limited scope for now, since it is only used with
per-pixel lod (otherwise we avoid the non-constant shift count by doing
per-quad shifts manually), and even then only for 1d textures (though the
latter should change).

Reviewed-by: Brian Paul <brianp@vmware.com>
Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
---
 src/gallium/auxiliary/gallivm/lp_bld_sample.c | 62 +++++++++++++++----
 src/gallium/auxiliary/gallivm/lp_bld_sample.h |  3 +-
 .../auxiliary/gallivm/lp_bld_sample_soa.c     |  2 +-
 3 files changed, 54 insertions(+), 13 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index a032d9d6895..e60a035a882 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -36,6 +36,7 @@
 #include "pipe/p_state.h"
 #include "util/u_format.h"
 #include "util/u_math.h"
+#include "util/u_cpu_detect.h"
 #include "lp_bld_arit.h"
 #include "lp_bld_const.h"
 #include "lp_bld_debug.h"
@@ -248,7 +249,7 @@ lp_build_rho(struct lp_build_sample_context *bld,
    first_level = bld->dynamic_state->first_level(bld->dynamic_state,
                                                  bld->gallivm, texture_unit);
    first_level_vec = lp_build_broadcast_scalar(int_size_bld, first_level);
-   int_size = lp_build_minify(int_size_bld, bld->int_size, first_level_vec);
+   int_size = lp_build_minify(int_size_bld, bld->int_size, first_level_vec, TRUE);
    float_size = lp_build_int_to_float(float_size_bld, int_size);
 
    if (cube_rho) {
@@ -1089,12 +1090,14 @@ lp_build_get_mip_offsets(struct lp_build_sample_context *bld,
 
 /**
  * Codegen equivalent for u_minify().
+ * @param lod_scalar  TRUE if level is a (broadcasted) scalar
  * Return max(1, base_size >> level);
  */
 LLVMValueRef
 lp_build_minify(struct lp_build_context *bld,
                 LLVMValueRef base_size,
-                LLVMValueRef level)
+                LLVMValueRef level,
+                boolean lod_scalar)
 {
    LLVMBuilderRef builder = bld->gallivm->builder;
    assert(lp_check_value(bld->type, base_size));
@@ -1105,10 +1108,49 @@ lp_build_minify(struct lp_build_context *bld,
       return base_size;
    }
    else {
-      LLVMValueRef size =
-         LLVMBuildLShr(builder, base_size, level, "minify");
+      LLVMValueRef size;
       assert(bld->type.sign);
-      size = lp_build_max(bld, size, bld->one);
+      if (lod_scalar ||
+         (util_cpu_caps.has_avx2 || !util_cpu_caps.has_sse)) {
+         size = LLVMBuildLShr(builder, base_size, level, "minify");
+         size = lp_build_max(bld, size, bld->one);
+      }
+      else {
+         /*
+          * emulate shift with float mul, since intel "forgot" shifts with
+          * per-element shift count until avx2, which results in terrible
+          * scalar extraction (both count and value), scalar shift,
+          * vector reinsertion. Should not be an issue on any non-x86 cpu
+          * with a vector instruction set.
+          * On cpus with AMD's XOP this should also be unnecessary but I'm
+          * not sure if llvm would emit this with current flags.
+          */
+         LLVMValueRef const127, const23, lf;
+         struct lp_type ftype;
+         struct lp_build_context fbld;
+         ftype = lp_type_float_vec(32, bld->type.length * bld->type.width);
+         lp_build_context_init(&fbld, bld->gallivm, ftype);
+         const127 = lp_build_const_int_vec(bld->gallivm, bld->type, 127);
+         const23 = lp_build_const_int_vec(bld->gallivm, bld->type, 23);
+
+         /* calculate 2^(-level) float */
+         lf = lp_build_sub(bld, const127, level);
+         lf = lp_build_shl(bld, lf, const23);
+         lf = LLVMBuildBitCast(builder, lf, fbld.vec_type, "");
+
+         /* finish shift operation by doing float mul */
+         base_size = lp_build_int_to_float(&fbld, base_size);
+         size = lp_build_mul(&fbld, base_size, lf);
+         /*
+          * do the max also with floats because
+          * a) non-emulated int max requires sse41
+          *    (this is actually a lie as we could cast to 16bit values
+          *    as 16bit is sufficient and 16bit int max is sse2)
+          * b) with avx we can do int max 4-wide but float max 8-wide
+          */
+         size = lp_build_max(&fbld, size, fbld.one);
+         size = lp_build_itrunc(&fbld, size);
+      }
       return size;
    }
 }
@@ -1185,7 +1227,7 @@ lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
     */
    if (bld->num_mips == 1) {
       ilevel_vec = lp_build_broadcast_scalar(&bld->int_size_bld, ilevel);
-      *out_size = lp_build_minify(&bld->int_size_bld, bld->int_size, ilevel_vec);
+      *out_size = lp_build_minify(&bld->int_size_bld, bld->int_size, ilevel_vec, TRUE);
    }
    else {
       LLVMValueRef int_size_vec;
@@ -1229,7 +1271,7 @@ lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
                                                  bld4.type,
                                                  ilevel,
                                                  indexi);
-            tmp[i] = lp_build_minify(&bld4, int_size_vec, ileveli);
+            tmp[i] = lp_build_minify(&bld4, int_size_vec, ileveli, TRUE);
          }
          /*
           * out_size is [w0, h0, d0, _, w1, h1, d1, _, ...] vector for dims > 1,
@@ -1248,7 +1290,6 @@ lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
          * with 4-wide vector pack all elements into a 8xi16 vector
          * (on which we can still do useful math) instead of using a 16xi32
          * vector.
-         * FIXME: some callers can't handle this yet.
          * For dims == 1 this will create [w0, w1, w2, w3, ...] vector.
          * For dims > 1 this will create [w0, h0, d0, _, w1, h1, d1, _, ...] vector.
          */
@@ -1257,8 +1298,7 @@ lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
             assert(bld->int_size_in_bld.type.length == 1);
             int_size_vec = lp_build_broadcast_scalar(&bld->int_coord_bld,
                                                      bld->int_size);
-            /* vector shift with variable shift count alert... */
-            *out_size = lp_build_minify(&bld->int_coord_bld, int_size_vec, ilevel);
+            *out_size = lp_build_minify(&bld->int_coord_bld, int_size_vec, ilevel, FALSE);
          }
          else {
             LLVMValueRef ilevel1;
@@ -1267,7 +1307,7 @@ lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
                ilevel1 = lp_build_extract_broadcast(bld->gallivm, bld->int_coord_type,
                                                     bld->int_size_in_bld.type, ilevel, indexi);
                tmp[i] = bld->int_size;
-               tmp[i] = lp_build_minify(&bld->int_size_in_bld, tmp[i], ilevel1);
+               tmp[i] = lp_build_minify(&bld->int_size_in_bld, tmp[i], ilevel1, TRUE);
             }
             *out_size = lp_build_concat(bld->gallivm, tmp,
                                         bld->int_size_in_bld.type,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
index 5039128a203..fd4e0532607 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
@@ -547,7 +547,8 @@ lp_build_sample_nop(struct gallivm_state *gallivm,
 LLVMValueRef
 lp_build_minify(struct lp_build_context *bld,
                 LLVMValueRef base_size,
-                LLVMValueRef level);
+                LLVMValueRef level,
+                boolean lod_scalar);
 
 
 #endif /* LP_BLD_SAMPLE_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
index 2d833318aee..e8c04d1e6c5 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -2940,7 +2940,7 @@ lp_build_size_query_soa(struct gallivm_state *gallivm,
                                     lp_build_const_int32(gallivm, 2), "");
    }
 
-   size = lp_build_minify(&bld_int_vec4, size, lod);
+   size = lp_build_minify(&bld_int_vec4, size, lod, TRUE);
 
    if (has_array)
       size = LLVMBuildInsertElement(gallivm->builder, size,
-- 
2.30.2