#include <float.h>
+#include <llvm/Config/llvm-config.h>
+
#include "util/u_memory.h"
#include "util/u_debug.h"
#include "util/u_math.h"
-#include "util/u_string.h"
#include "util/u_cpu_detect.h"
#include "lp_bld_type.h"
intrinsic = "llvm.ppc.altivec.vminfp";
intr_size = 128;
}
- } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
+ } else if (HAVE_LLVM < 0x0309 &&
+ util_cpu_caps.has_avx2 && type.length > 4) {
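+ /* The x86 pmin/pmax intrinsics are only usable with llvm < 3.9; newer
+ * llvm dropped them, so the generic path handles those versions. */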
+ intr_size = 256;
+ switch (type.width) {
+ case 8:
+ intrinsic = type.sign ? "llvm.x86.avx2.pmins.b" : "llvm.x86.avx2.pminu.b";
+ break;
+ case 16:
+ intrinsic = type.sign ? "llvm.x86.avx2.pmins.w" : "llvm.x86.avx2.pminu.w";
+ break;
+ case 32:
+ intrinsic = type.sign ? "llvm.x86.avx2.pmins.d" : "llvm.x86.avx2.pminu.d";
+ break;
+ }
+ } else if (HAVE_LLVM < 0x0309 &&
+ util_cpu_caps.has_sse2 && type.length >= 2) {
intr_size = 128;
if ((type.width == 8 || type.width == 16) &&
(type.width * type.length <= 64) &&
}
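+
+/**
+ * Compute a * b + c, preferring the llvm.fmuladd intrinsic so llvm may
+ * contract it into an fma on targets that support it.
+ */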
+LLVMValueRef
+lp_build_fmuladd(LLVMBuilderRef builder,
+ LLVMValueRef a,
+ LLVMValueRef b,
+ LLVMValueRef c)
+{
+ LLVMTypeRef type = LLVMTypeOf(a);
+ assert(type == LLVMTypeOf(b));
+ assert(type == LLVMTypeOf(c));
+ if (HAVE_LLVM < 0x0304) {
+ /* XXX: LLVM 3.3 does not break down llvm.fmuladd into mul+add when FMA is
+ * not supported, and instead falls back to a C function.
+ */
+ return LLVMBuildFAdd(builder, LLVMBuildFMul(builder, a, b, ""), c, "");
+ }
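+ /* lp_format_intrinsic yields an overloaded name such as "llvm.fmuladd.v4f32". */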
+ char intrinsic[32];
+ lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
+ LLVMValueRef args[] = { a, b, c };
+ return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0);
+}
+
+
/**
* Generate max(a, b)
* No checks for special case values of a or b = 1 or 0 are done.
intrinsic = "llvm.ppc.altivec.vmaxfp";
intr_size = 128;
}
- } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
+ } else if (HAVE_LLVM < 0x0309 &&
+ util_cpu_caps.has_avx2 && type.length > 4) {
+ intr_size = 256;
+ switch (type.width) {
+ case 8:
+ intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.b" : "llvm.x86.avx2.pmaxu.b";
+ break;
+ case 16:
+ intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.w" : "llvm.x86.avx2.pmaxu.w";
+ break;
+ case 32:
+ intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.d" : "llvm.x86.avx2.pmaxu.d";
+ break;
+ }
+ } else if (HAVE_LLVM < 0x0309 &&
+ util_cpu_caps.has_sse2 && type.length >= 2) {
intr_size = 128;
if ((type.width == 8 || type.width == 16) &&
(type.width * type.length <= 64) &&
assert(lp_check_value(type, a));
assert(lp_check_value(type, b));
- if(a == bld->zero)
+ if (a == bld->zero)
return b;
- if(b == bld->zero)
+ if (b == bld->zero)
return a;
- if(a == bld->undef || b == bld->undef)
+ if (a == bld->undef || b == bld->undef)
return bld->undef;
- if(bld->type.norm) {
+ if (type.norm) {
const char *intrinsic = NULL;
- if(a == bld->one || b == bld->one)
+ if (!type.sign && (a == bld->one || b == bld->one))
return bld->one;
- if (type.width * type.length == 128 &&
- !type.floating && !type.fixed) {
- if(util_cpu_caps.has_sse2) {
- if(type.width == 8)
- intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
- if(type.width == 16)
- intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
- } else if (util_cpu_caps.has_altivec) {
- if(type.width == 8)
- intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
- if(type.width == 16)
- intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
+ if (!type.floating && !type.fixed) {
+ if (LLVM_VERSION_MAJOR >= 9) {
+ char intrin[32];
+ intrinsic = type.sign ? "llvm.sadd.sat" : "llvm.uadd.sat";
+ lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
+ return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
+ }
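+ /*
+ * The x86 unsigned paddus intrinsics are not available with llvm >= 8,
+ * hence the NULL fallback below; llvm >= 9 was already handled above
+ * with the generic saturating intrinsics.
+ */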
+ if (type.width * type.length == 128) {
+ if (util_cpu_caps.has_sse2) {
+ if (type.width == 8)
+ intrinsic = type.sign ? "llvm.x86.sse2.padds.b" :
+ LLVM_VERSION_MAJOR < 8 ? "llvm.x86.sse2.paddus.b" : NULL;
+ if (type.width == 16)
+ intrinsic = type.sign ? "llvm.x86.sse2.padds.w" :
+ LLVM_VERSION_MAJOR < 8 ? "llvm.x86.sse2.paddus.w" : NULL;
+ } else if (util_cpu_caps.has_altivec) {
+ if (type.width == 8)
+ intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
+ if (type.width == 16)
+ intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
+ }
+ }
+ if (type.width * type.length == 256) {
+ if (util_cpu_caps.has_avx2) {
+ if (type.width == 8)
+ intrinsic = type.sign ? "llvm.x86.avx2.padds.b" :
+ LLVM_VERSION_MAJOR < 8 ? "llvm.x86.avx2.paddus.b" : NULL;
+ if (type.width == 16)
+ intrinsic = type.sign ? "llvm.x86.avx2.padds.w" :
+ LLVM_VERSION_MAJOR < 8 ? "llvm.x86.avx2.paddus.w" : NULL;
+ }
}
}
LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
- } else {
- a = lp_build_min_simple(bld, a, lp_build_comp(bld, b), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
}
}
if(bld->type.norm && (bld->type.floating || bld->type.fixed))
res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
+ if (type.norm && !type.floating && !type.fixed) {
+ if (!type.sign) {
+ /*
+ * Newer llvm versions no longer support the intrinsics, but recognize
+ * the pattern. Since auto-upgrade of intrinsics doesn't work for jit
+ * code, it is important that we match the pattern llvm uses (and pray
+ * llvm doesn't change it - and hope they decide on the same pattern
+ * for all backends supporting it...).
+ * NOTE: cmp/select does sext/trunc of the mask. This does not seem to
+ * interfere with llvm's ability to recognize the pattern, but it seems
+ * a bit brittle.
+ * NOTE: llvm 9+ always uses the (non arch specific) intrinsic.
+ */
+ LLVMValueRef overflowed = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, res);
+ res = lp_build_select(bld, overflowed,
+ LLVMConstAllOnes(bld->int_vec_type), res);
+ }
+ }
+
/* XXX clamp to floor of -1 or 0??? */
return res;
assert(lp_check_value(type, a));
assert(lp_check_value(type, b));
- if(b == bld->zero)
+ if (b == bld->zero)
return a;
- if(a == bld->undef || b == bld->undef)
+ if (a == bld->undef || b == bld->undef)
return bld->undef;
- if(a == b)
+ if (a == b)
return bld->zero;
- if(bld->type.norm) {
+ if (type.norm) {
const char *intrinsic = NULL;
- if(b == bld->one)
+ if (!type.sign && b == bld->one)
return bld->zero;
- if (type.width * type.length == 128 &&
- !type.floating && !type.fixed) {
- if (util_cpu_caps.has_sse2) {
- if(type.width == 8)
- intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
- if(type.width == 16)
- intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
- } else if (util_cpu_caps.has_altivec) {
- if(type.width == 8)
- intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
- if(type.width == 16)
- intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
+ if (!type.floating && !type.fixed) {
+ if (LLVM_VERSION_MAJOR >= 9) {
+ char intrin[32];
+ intrinsic = type.sign ? "llvm.ssub.sat" : "llvm.usub.sat";
+ lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
+ return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
+ }
+ if (type.width * type.length == 128) {
+ if (util_cpu_caps.has_sse2) {
+ if (type.width == 8)
+ intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" :
+ LLVM_VERSION_MAJOR < 8 ? "llvm.x86.sse2.psubus.b" : NULL;
+ if (type.width == 16)
+ intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" :
+ LLVM_VERSION_MAJOR < 8 ? "llvm.x86.sse2.psubus.w" : NULL;
+ } else if (util_cpu_caps.has_altivec) {
+ if (type.width == 8)
+ intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
+ if (type.width == 16)
+ intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
+ }
+ }
+ if (type.width * type.length == 256) {
+ if (util_cpu_caps.has_avx2) {
+ if (type.width == 8)
+ intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" :
+ LLVM_VERSION_MAJOR < 8 ? "llvm.x86.avx2.psubus.b" : NULL;
+ if (type.width == 16)
+ intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" :
+ LLVM_VERSION_MAJOR < 8 ? "llvm.x86.avx2.psubus.w" : NULL;
+ }
}
}
LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
} else {
- a = lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
+ /*
+ * This must match the llvm pattern for saturated unsigned sub.
+ * (lp_build_max_simple actually does the job with its current
+ * definition, but do it explicitly here.)
+ * NOTE: cmp/select does sext/trunc of the mask. This does not seem to
+ * interfere with llvm's ability to recognize the pattern, but it seems
+ * a bit brittle.
+ * NOTE: llvm 9+ always uses the (non arch specific) intrinsic.
+ */
+ LLVMValueRef no_ov = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
+ a = lp_build_select(bld, no_ov, a, b);
}
}
* @sa Michael Herf, The "double blend trick", May 2000,
* http://www.stereopsis.com/doubleblend.html
*/
-static LLVMValueRef
+LLVMValueRef
lp_build_mul_norm(struct gallivm_state *gallivm,
struct lp_type wide_type,
LLVMValueRef a, LLVMValueRef b)
struct lp_type wide_type = lp_wider_type(type);
LLVMValueRef al, ah, bl, bh, abl, abh, ab;
- lp_build_unpack2(bld->gallivm, type, wide_type, a, &al, &ah);
- lp_build_unpack2(bld->gallivm, type, wide_type, b, &bl, &bh);
+ lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah);
+ lp_build_unpack2_native(bld->gallivm, type, wide_type, b, &bl, &bh);
/* PMULLW, PSRLW, PADDW */
abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
- ab = lp_build_pack2(bld->gallivm, wide_type, type, abl, abh);
+ ab = lp_build_pack2_native(bld->gallivm, wide_type, type, abl, abh);
return ab;
}
return res;
}
+/*
+ * Widening mul, valid for 32x32 bit -> 64bit only.
+ * Result is low 32bits, high bits returned in res_hi.
+ *
+ * Emits code that is meant to be compiled for the host CPU.
+ */
+LLVMValueRef
+lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
+ LLVMValueRef a,
+ LLVMValueRef b,
+ LLVMValueRef *res_hi)
+{
+ struct gallivm_state *gallivm = bld->gallivm;
+ LLVMBuilderRef builder = gallivm->builder;
+
+ assert(bld->type.width == 32);
+ assert(bld->type.floating == 0);
+ assert(bld->type.fixed == 0);
+ assert(bld->type.norm == 0);
+
+ /*
+ * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces
+ * for x86 simd is atrocious (even if the high bits weren't required):
+ * it tries to handle real 64bit inputs, which of course can't happen
+ * since the 64bit umul only ever sees 32bit numbers zero-extended to
+ * 64bit, but apparently llvm does not recognize this widening mul.
+ * The result is 6 (instead of 2) pmuludq plus extra adds and shifts.
+ * The same story applies to signed mul, albeit fixing this requires sse41.
+ * https://llvm.org/bugs/show_bug.cgi?id=30845
+ * So, whip up our own code, albeit only for length 4 and 8 (which
+ * should be good enough)...
+ * FIXME: For llvm >= 7.0 we should match the autoupgrade pattern
+ * (bitcast/and/mul/shuffle for unsigned, bitcast/shl/ashr/mul/shuffle
+ * for signed), which the fallback code does not; without this, llvm
+ * will likely still produce atrocious code.
+ */
+ if (LLVM_VERSION_MAJOR < 7 &&
+ (bld->type.length == 4 || bld->type.length == 8) &&
+ ((util_cpu_caps.has_sse2 && (bld->type.sign == 0)) ||
+ util_cpu_caps.has_sse4_1)) {
+ const char *intrinsic = NULL;
+ LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd;
+ LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec;
+ struct lp_type type_wide = lp_wider_type(bld->type);
+ LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide);
+ unsigned i;
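+
+ /*
+ * pmuludq/pmuldq multiply the even 32bit lanes into full 64bit results,
+ * so shuffle the odd lanes down into even positions, do two multiplies
+ * (even and odd), and later interleave the halves back together.
+ */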
+ for (i = 0; i < bld->type.length; i += 2) {
+ shuf[i] = lp_build_const_int32(gallivm, i+1);
+ shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
+ }
+ shuf_vec = LLVMConstVector(shuf, bld->type.length);
+ aeven = a;
+ beven = b;
+ aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, "");
+ bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, "");
+
+ if (util_cpu_caps.has_avx2 && bld->type.length == 8) {
+ if (bld->type.sign) {
+ intrinsic = "llvm.x86.avx2.pmul.dq";
+ } else {
+ intrinsic = "llvm.x86.avx2.pmulu.dq";
+ }
+ muleven = lp_build_intrinsic_binary(builder, intrinsic,
+ wider_type, aeven, beven);
+ mulodd = lp_build_intrinsic_binary(builder, intrinsic,
+ wider_type, aodd, bodd);
+ }
+ else {
+ /* for consistent naming look elsewhere... */
+ if (bld->type.sign) {
+ intrinsic = "llvm.x86.sse41.pmuldq";
+ } else {
+ intrinsic = "llvm.x86.sse2.pmulu.dq";
+ }
+ /*
+ * XXX If we only have AVX but not AVX2 this is a pain.
+ * lp_build_intrinsic_binary_anylength() can't handle it
+ * (due to src and dst type not being identical).
+ */
+ if (bld->type.length == 8) {
+ LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi;
+ LLVMValueRef aoddlo, aoddhi, boddlo, boddhi;
+ LLVMValueRef muleven2[2], mulodd2[2];
+ struct lp_type type_wide_half = type_wide;
+ LLVMTypeRef wtype_half;
+ type_wide_half.length = 2;
+ wtype_half = lp_build_vec_type(gallivm, type_wide_half);
+ aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4);
+ aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4);
+ bevenlo = lp_build_extract_range(gallivm, beven, 0, 4);
+ bevenhi = lp_build_extract_range(gallivm, beven, 4, 4);
+ aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4);
+ aoddhi = lp_build_extract_range(gallivm, aodd, 4, 4);
+ boddlo = lp_build_extract_range(gallivm, bodd, 0, 4);
+ boddhi = lp_build_extract_range(gallivm, bodd, 4, 4);
+ muleven2[0] = lp_build_intrinsic_binary(builder, intrinsic,
+ wtype_half, aevenlo, bevenlo);
+ mulodd2[0] = lp_build_intrinsic_binary(builder, intrinsic,
+ wtype_half, aoddlo, boddlo);
+ muleven2[1] = lp_build_intrinsic_binary(builder, intrinsic,
+ wtype_half, aevenhi, bevenhi);
+ mulodd2[1] = lp_build_intrinsic_binary(builder, intrinsic,
+ wtype_half, aoddhi, boddhi);
+ muleven = lp_build_concat(gallivm, muleven2, type_wide_half, 2);
+ mulodd = lp_build_concat(gallivm, mulodd2, type_wide_half, 2);
+
+ }
+ else {
+ muleven = lp_build_intrinsic_binary(builder, intrinsic,
+ wider_type, aeven, beven);
+ mulodd = lp_build_intrinsic_binary(builder, intrinsic,
+ wider_type, aodd, bodd);
+ }
+ }
+ muleven = LLVMBuildBitCast(builder, muleven, bld->vec_type, "");
+ mulodd = LLVMBuildBitCast(builder, mulodd, bld->vec_type, "");
+
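+ /*
+ * The 64bit products now appear as dword pairs; interleave the high
+ * dwords of the even/odd products into res_hi, and the low dwords
+ * into the returned low result.
+ */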
+ for (i = 0; i < bld->type.length; i += 2) {
+ shuf[i] = lp_build_const_int32(gallivm, i + 1);
+ shuf[i+1] = lp_build_const_int32(gallivm, i + 1 + bld->type.length);
+ }
+ shuf_vec = LLVMConstVector(shuf, bld->type.length);
+ *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
+
+ for (i = 0; i < bld->type.length; i += 2) {
+ shuf[i] = lp_build_const_int32(gallivm, i);
+ shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length);
+ }
+ shuf_vec = LLVMConstVector(shuf, bld->type.length);
+ return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
+ }
+ else {
+ return lp_build_mul_32_lohi(bld, a, b, res_hi);
+ }
+}
+
+
+/*
+ * Widening mul, valid for 32x32 bit -> 64bit only.
+ * Result is low 32bits, high bits returned in res_hi.
+ *
+ * Emits generic code.
+ */
+LLVMValueRef
+lp_build_mul_32_lohi(struct lp_build_context *bld,
+ LLVMValueRef a,
+ LLVMValueRef b,
+ LLVMValueRef *res_hi)
+{
+ struct gallivm_state *gallivm = bld->gallivm;
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMValueRef tmp, shift, res_lo;
+ struct lp_type type_tmp;
+ LLVMTypeRef wide_type, narrow_type;
+
+ type_tmp = bld->type;
+ narrow_type = lp_build_vec_type(gallivm, type_tmp);
+ type_tmp.width *= 2;
+ wide_type = lp_build_vec_type(gallivm, type_tmp);
+ shift = lp_build_const_vec(gallivm, type_tmp, 32);
+
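+ /* Widen both operands to 64 bits, do a full multiply, then split the
+ * result into its low and high 32bit halves.
+ */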
+ if (bld->type.sign) {
+ a = LLVMBuildSExt(builder, a, wide_type, "");
+ b = LLVMBuildSExt(builder, b, wide_type, "");
+ } else {
+ a = LLVMBuildZExt(builder, a, wide_type, "");
+ b = LLVMBuildZExt(builder, b, wide_type, "");
+ }
+ tmp = LLVMBuildMul(builder, a, b, "");
+
+ res_lo = LLVMBuildTrunc(builder, tmp, narrow_type, "");
+
+ /* Since we truncate anyway, LShr and AShr are equivalent. */
+ tmp = LLVMBuildLShr(builder, tmp, shift, "");
+ *res_hi = LLVMBuildTrunc(builder, tmp, narrow_type, "");
+
+ return res_lo;
+}
+
+
+/* a * b + c */
+LLVMValueRef
+lp_build_mad(struct lp_build_context *bld,
+ LLVMValueRef a,
+ LLVMValueRef b,
+ LLVMValueRef c)
+{
+ const struct lp_type type = bld->type;
+ if (type.floating) {
+ return lp_build_fmuladd(bld->gallivm->builder, a, b, c);
+ } else {
+ return lp_build_add(bld, lp_build_mul(bld, a, b), c);
+ }
+}
+
/**
* Small vector x scale multiplication optimization.
if(b == 2 && bld->type.floating)
return lp_build_add(bld, a, a);
- if(util_is_power_of_two(b)) {
+ if(util_is_power_of_two_or_zero(b)) {
unsigned shift = ffs(b) - 1;
if(bld->type.floating) {
return LLVMConstUDiv(a, b);
}
- if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
+ /* fast rcp is disabled (it just uses div), so it makes no sense to try that */
+ if(FALSE &&
+ ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
(util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
type.floating)
return lp_build_mul(bld, a, lp_build_rcp(bld, b));
delta = lp_build_sub(bld, v1, v0);
+ if (bld->type.floating) {
+ assert(flags == 0);
+ return lp_build_mad(bld, x, delta, v0);
+ }
+
if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
if (!bld->type.sign) {
if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
res = lp_build_mul(bld, x, delta);
}
- res = lp_build_add(bld, v0, res);
+ if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
+ /*
+ * At this point both res and v0 only use the lower half of the bits,
+ * the rest is zero. Instead of add / mask, do add with half wide type.
+ */
+ struct lp_type narrow_type;
+ struct lp_build_context narrow_bld;
+
+ memset(&narrow_type, 0, sizeof narrow_type);
+ narrow_type.sign = bld->type.sign;
+ narrow_type.width = bld->type.width/2;
+ narrow_type.length = bld->type.length*2;
+
+ lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
+ res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
+ v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
+ res = lp_build_add(&narrow_bld, v0, res);
+ res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
+ } else {
+ res = lp_build_add(bld, v0, res);
- if (((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) ||
- bld->type.fixed) {
- /* We need to mask out the high order bits when lerping 8bit normalized colors stored on 16bits */
- /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
- * but it will be wrong for true fixed point use cases. Basically we need
- * a more powerful lp_type, capable of further distinguishing the values
- * interpretation from the value storage. */
- res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1), "");
+ if (bld->type.fixed) {
+ /*
+ * We need to mask out the high order bits when lerping 8bit
+ * normalized colors stored in 16 bits.
+ */
+ /* XXX: This step is necessary for lerping 8bit colors stored in
+ * 16 bits, but it will be wrong for true fixed point use cases.
+ * Basically we need a more powerful lp_type, capable of further
+ * distinguishing the value's interpretation from the value storage.
+ */
+ LLVMValueRef low_bits;
+ low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
+ res = LLVMBuildAnd(builder, res, low_bits, "");
+ }
}
return res;
lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
- lp_build_unpack2(bld->gallivm, type, wide_type, x, &xl, &xh);
- lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
- lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
+ lp_build_unpack2_native(bld->gallivm, type, wide_type, x, &xl, &xh);
+ lp_build_unpack2_native(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
+ lp_build_unpack2_native(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
/*
* Lerp both halves.
resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
- res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
+ res = lp_build_pack2_native(bld->gallivm, wide_type, type, resl, resh);
} else {
res = lp_build_lerp_simple(bld, x, v0, v1, flags);
}
}
}
- if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
+ if(type.width*type.length == 128 && util_cpu_caps.has_ssse3 && LLVM_VERSION_MAJOR < 6) {
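+ /* Only reach for the pabs intrinsics with older llvm; newer llvm gets
+ * the cmp/select sequence at the end of the function. */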
switch(type.width) {
case 8:
return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
}
}
- else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
- (gallivm_debug & GALLIVM_DEBUG_PERF) &&
- (type.width == 8 || type.width == 16 || type.width == 32)) {
- debug_printf("%s: inefficient code, should split vectors manually\n",
- __FUNCTION__);
+ else if (type.width*type.length == 256 && util_cpu_caps.has_avx2 && LLVM_VERSION_MAJOR < 6) {
+ switch(type.width) {
+ case 8:
+ return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a);
+ case 16:
+ return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.w", vec_type, a);
+ case 32:
+ return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.d", vec_type, a);
+ }
}
- return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
+ return lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero),
+ a, LLVMBuildNeg(builder, a, ""));
}
{
if ((util_cpu_caps.has_sse4_1 &&
(type.length == 1 || type.width*type.length == 128)) ||
- (util_cpu_caps.has_avx && type.width*type.length == 256))
+ (util_cpu_caps.has_avx && type.width*type.length == 256) ||
+ (util_cpu_caps.has_avx512f && type.width*type.length == 512))
return TRUE;
else if ((util_cpu_caps.has_altivec &&
(type.width == 32 && type.length == 4)))
return TRUE;
+ else if (util_cpu_caps.has_neon)
+ return TRUE;
return FALSE;
}
LLVMValueRef a,
enum lp_build_round_mode mode)
{
- if (util_cpu_caps.has_sse4_1) {
+ if (util_cpu_caps.has_sse4_1 || util_cpu_caps.has_neon) {
LLVMBuilderRef builder = bld->gallivm->builder;
const struct lp_type type = bld->type;
const char *intrinsic_root;
/**
- * Prevent returning a fractional part of 1.0 for very small negative values of
- * 'a' by clamping against 0.99999(9).
+ * Prevent returning 1.0 for very small negative values of 'a' by clamping
+ * against 0.99999(9). (Will also return that value for NaNs.)
*/
static inline LLVMValueRef
clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
/* this is the largest number smaller than 1.0 representable as float */
max = lp_build_const_vec(bld->gallivm, bld->type,
1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
- return lp_build_min(bld, fract, max);
+ return lp_build_min_ext(bld, fract, max,
+ GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
}
/**
* Same as lp_build_fract, but guarantees that the result is always smaller
- * than one.
+ * than one. Will also return the smaller-than-one value for infs, NaNs.
*/
LLVMValueRef
lp_build_fract_safe(struct lp_build_context *bld,
else {
LLVMValueRef half;
- half = lp_build_const_vec(bld->gallivm, type, 0.5);
+ half = lp_build_const_vec(bld->gallivm, type, nextafterf(0.5, 0.0));
if (type.sign) {
LLVMTypeRef vec_type = bld->vec_type;
/**
* Do one Newton-Raphson step to improve reciprocate precision:
*
- * x_{i+1} = x_i * (2 - a * x_i)
+ * x_{i+1} = x_i + x_i * (1 - a * x_i)
*
* XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
* +/-Inf, giving NaN instead. Certain applications rely on this behavior,
- * such as Google Earth, which does RCP(RSQRT(0.0) when drawing the Earth's
+ * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
* halo. It would be necessary to clamp the argument to prevent this.
*
* See also:
LLVMValueRef rcp_a)
{
LLVMBuilderRef builder = bld->gallivm->builder;
- LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
+ LLVMValueRef neg_a;
LLVMValueRef res;
- res = LLVMBuildFMul(builder, a, rcp_a, "");
- res = LLVMBuildFSub(builder, two, res, "");
- res = LLVMBuildFMul(builder, rcp_a, res, "");
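+ /* rcp_a + rcp_a * (1 - a * rcp_a), expressed as two fmuladds */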
+ neg_a = LLVMBuildFNeg(builder, a, "");
+ res = lp_build_fmuladd(builder, neg_a, rcp_a, bld->one);
+ res = lp_build_fmuladd(builder, res, rcp_a, rcp_a);
return res;
}
/*
* The magic pass: "Extended precision modular arithmetic"
* x = ((x - y * DP1) - y * DP2) - y * DP3;
- * xmm1 = _mm_mul_ps(y, xmm1);
- * xmm2 = _mm_mul_ps(y, xmm2);
- * xmm3 = _mm_mul_ps(y, xmm3);
- */
- LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
- LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
- LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
-
- /*
- * x = _mm_add_ps(x, xmm1);
- * x = _mm_add_ps(x, xmm2);
- * x = _mm_add_ps(x, xmm3);
*/
-
- LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
- LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
- LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
+ LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
+ LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
+ LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);
/*
* Evaluate the first polynom (0 <= x <= Pi/4)
* y = *(v4sf*)_ps_coscof_p0;
* y = _mm_mul_ps(y, z);
*/
- LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
- LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
- LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
- LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
+ LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
+ LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
* y2 = _mm_add_ps(y2, x);
*/
- LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
- LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
- LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
- LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
+ LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
+ LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
- LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
- LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
+ LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);
/*
* select the correct result from the two polynoms
if (i % 2 == 0) {
if (even)
- even = lp_build_add(bld, coeff, lp_build_mul(bld, x2, even));
+ even = lp_build_mad(bld, x2, even, coeff);
else
even = coeff;
} else {
if (odd)
- odd = lp_build_add(bld, coeff, lp_build_mul(bld, x2, odd));
+ odd = lp_build_mad(bld, x2, odd, coeff);
else
odd = coeff;
}
}
if (odd)
- return lp_build_add(bld, lp_build_mul(bld, odd, x), even);
+ return lp_build_mad(bld, odd, x, even);
else if (even)
return even;
else
expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
- Elements(lp_build_exp2_polynomial));
+ ARRAY_SIZE(lp_build_exp2_polynomial));
res = LLVMBuildFMul(builder, expipart, expfpart, "");
LLVMValueRef exp = NULL;
LLVMValueRef mant = NULL;
LLVMValueRef logexp = NULL;
- LLVMValueRef logmant = NULL;
+ LLVMValueRef p_z = NULL;
LLVMValueRef res = NULL;
assert(lp_check_value(bld->type, x));
z = lp_build_mul(bld, y, y);
/* compute P(z) */
- logmant = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
- Elements(lp_build_log2_polynomial));
-
- /* logmant = y * P(z) */
- logmant = lp_build_mul(bld, y, logmant);
+ p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
+ ARRAY_SIZE(lp_build_log2_polynomial));
- res = lp_build_add(bld, logmant, logexp);
+ /* y * P(z) + logexp */
+ res = lp_build_mad(bld, y, p_z, logexp);
if (type.floating && handle_edge_cases) {
LLVMValueRef negmask, infmask, zmask;