gallivm: add no-signed-zeros-fp-math option to lp_create_builder (v2)

[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_arit.c
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c

index 0c43617d5315c6c356d1c6409fe53c55caf600e5..04f86bef286ff6d1348cd43f03f80faf36ea74ab 100644 (file)
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -50,7 +50,6 @@
  #include "util/u_memory.h"
  #include "util/u_debug.h"
  #include "util/u_math.h"
-#include "util/u_string.h"
  #include "util/u_cpu_detect.h"
  
  #include "lp_bld_type.h"
@@ -143,7 +142,22 @@ lp_build_min_simple(struct lp_build_context *bld,
           intrinsic = "llvm.ppc.altivec.vminfp";
           intr_size = 128;
        }
-   } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
+   } else if (HAVE_LLVM < 0x0309 &&
+              util_cpu_caps.has_avx2 && type.length > 4) {
+      intr_size = 256;
+      switch (type.width) {
+      case 8:
+         intrinsic = type.sign ? "llvm.x86.avx2.pmins.b" : "llvm.x86.avx2.pminu.b";
+         break;
+      case 16:
+         intrinsic = type.sign ? "llvm.x86.avx2.pmins.w" : "llvm.x86.avx2.pminu.w";
+         break;
+      case 32:
+         intrinsic = type.sign ? "llvm.x86.avx2.pmins.d" : "llvm.x86.avx2.pminu.d";
+         break;
+      }
+   } else if (HAVE_LLVM < 0x0309 &&
+              util_cpu_caps.has_sse2 && type.length >= 2) {
        intr_size = 128;
        if ((type.width == 8 || type.width == 16) &&
            (type.width * type.length <= 64) &&
@@ -262,6 +276,28 @@ lp_build_min_simple(struct lp_build_context *bld,
  }
  
  
+LLVMValueRef
+lp_build_fmuladd(LLVMBuilderRef builder,
+                 LLVMValueRef a,
+                 LLVMValueRef b,
+                 LLVMValueRef c)
+{
+   LLVMTypeRef type = LLVMTypeOf(a);
+   assert(type == LLVMTypeOf(b));
+   assert(type == LLVMTypeOf(c));
+   if (HAVE_LLVM < 0x0304) {
+      /* XXX: LLVM 3.3 does not breakdown llvm.fmuladd into mul+add when FMA is
+       * not supported, and instead it falls-back to a C function.
+       */
+      return LLVMBuildFAdd(builder, LLVMBuildFMul(builder, a, b, ""), c, "");
+   }
+   char intrinsic[32];
+   lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
+   LLVMValueRef args[] = { a, b, c };
+   return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0);
+}
+
+
  /**
   * Generate max(a, b)
   * No checks for special case values of a or b = 1 or 0 are done.
@@ -324,7 +360,22 @@ lp_build_max_simple(struct lp_build_context *bld,
           intrinsic = "llvm.ppc.altivec.vmaxfp";
           intr_size = 128;
        }
-   } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
+   } else if (HAVE_LLVM < 0x0309 &&
+              util_cpu_caps.has_avx2 && type.length > 4) {
+      intr_size = 256;
+      switch (type.width) {
+      case 8:
+         intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.b" : "llvm.x86.avx2.pmaxu.b";
+         break;
+      case 16:
+         intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.w" : "llvm.x86.avx2.pmaxu.w";
+         break;
+      case 32:
+         intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.d" : "llvm.x86.avx2.pmaxu.d";
+         break;
+      }
+   } else if (HAVE_LLVM < 0x0309 &&
+              util_cpu_caps.has_sse2 && type.length >= 2) {
        intr_size = 128;
        if ((type.width == 8 || type.width == 16) &&
            (type.width * type.length <= 64) &&
@@ -503,18 +554,27 @@ lp_build_add(struct lp_build_context *bld,
        if(a == bld->one || b == bld->one)
          return bld->one;
  
-      if (type.width * type.length == 128 &&
-          !type.floating && !type.fixed) {
-         if(util_cpu_caps.has_sse2) {
-           if(type.width == 8)
-             intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
-           if(type.width == 16)
-             intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
-         } else if (util_cpu_caps.has_altivec) {
-           if(type.width == 8)
-              intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
-           if(type.width == 16)
-              intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
+      if (!type.floating && !type.fixed) {
+         if (type.width * type.length == 128) {
+            if(util_cpu_caps.has_sse2) {
+              if(type.width == 8)
+                intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
+              if(type.width == 16)
+                intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
+            } else if (util_cpu_caps.has_altivec) {
+              if(type.width == 8)
+                 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
+              if(type.width == 16)
+                 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
+            }
+         }
+         if (type.width * type.length == 256) {
+            if(util_cpu_caps.has_avx2) {
+              if(type.width == 8)
+                intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b";
+              if(type.width == 16)
+                intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : "llvm.x86.avx2.paddus.w";
+            }
           }
        }
     
@@ -795,18 +855,27 @@ lp_build_sub(struct lp_build_context *bld,
        if(b == bld->one)
          return bld->zero;
  
-      if (type.width * type.length == 128 &&
-          !type.floating && !type.fixed) {
-         if (util_cpu_caps.has_sse2) {
-           if(type.width == 8)
-              intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
-           if(type.width == 16)
-              intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
-         } else if (util_cpu_caps.has_altivec) {
-           if(type.width == 8)
-              intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
-           if(type.width == 16)
-              intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
+      if (!type.floating && !type.fixed) {
+         if (type.width * type.length == 128) {
+            if (util_cpu_caps.has_sse2) {
+              if(type.width == 8)
+                 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
+              if(type.width == 16)
+                 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
+            } else if (util_cpu_caps.has_altivec) {
+              if(type.width == 8)
+                 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
+              if(type.width == 16)
+                 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
+            }
+         }
+         if (type.width * type.length == 256) {
+            if (util_cpu_caps.has_avx2) {
+              if(type.width == 8)
+                 intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b";
+              if(type.width == 16)
+                 intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" : "llvm.x86.avx2.psubus.w";
+            }
           }
        }
     
@@ -977,14 +1046,14 @@ lp_build_mul(struct lp_build_context *bld,
        struct lp_type wide_type = lp_wider_type(type);
        LLVMValueRef al, ah, bl, bh, abl, abh, ab;
  
-      lp_build_unpack2(bld->gallivm, type, wide_type, a, &al, &ah);
-      lp_build_unpack2(bld->gallivm, type, wide_type, b, &bl, &bh);
+      lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah);
+      lp_build_unpack2_native(bld->gallivm, type, wide_type, b, &bl, &bh);
  
        /* PMULLW, PSRLW, PADDW */
        abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
        abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
  
-      ab = lp_build_pack2(bld->gallivm, wide_type, type, abl, abh);
+      ab = lp_build_pack2_native(bld->gallivm, wide_type, type, abl, abh);
  
        return ab;
     }
@@ -1022,6 +1091,196 @@ lp_build_mul(struct lp_build_context *bld,
     return res;
  }
  
+/*
+ * Widening mul, valid for 32x32 bit -> 64bit only.
+ * Result is low 32bits, high bits returned in res_hi.
+ *
+ * Emits code that is meant to be compiled for the host CPU.
+ */
+LLVMValueRef
+lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
+                         LLVMValueRef a,
+                         LLVMValueRef b,
+                         LLVMValueRef *res_hi)
+{
+   struct gallivm_state *gallivm = bld->gallivm;
+   LLVMBuilderRef builder = gallivm->builder;
+
+   assert(bld->type.width == 32);
+   assert(bld->type.floating == 0);
+   assert(bld->type.fixed == 0);
+   assert(bld->type.norm == 0);
+
+   /*
+    * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces
+    * for x86 simd is atrocious (even if the high bits weren't required),
+    * trying to handle real 64bit inputs (which of course can't happen due
+    * to using 64bit umul with 32bit numbers zero-extended to 64bit, but
+    * apparently llvm does not recognize this widening mul). This includes 6
+    * (instead of 2) pmuludq plus extra adds and shifts
+    * The same story applies to signed mul, albeit fixing this requires sse41.
+    * https://llvm.org/bugs/show_bug.cgi?id=30845
+    * So, whip up our own code, albeit only for length 4 and 8 (which
+    * should be good enough)...
+    */
+   if ((bld->type.length == 4 || bld->type.length == 8) &&
+       ((util_cpu_caps.has_sse2 && (bld->type.sign == 0)) ||
+        util_cpu_caps.has_sse4_1)) {
+      const char *intrinsic = NULL;
+      LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd;
+      LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec;
+      struct lp_type type_wide = lp_wider_type(bld->type);
+      LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide);
+      unsigned i;
+      for (i = 0; i < bld->type.length; i += 2) {
+         shuf[i] = lp_build_const_int32(gallivm, i+1);
+         shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
+      }
+      shuf_vec = LLVMConstVector(shuf, bld->type.length);
+      aeven = a;
+      beven = b;
+      aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, "");
+      bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, "");
+
+      if (util_cpu_caps.has_avx2 && bld->type.length == 8) {
+         if (bld->type.sign) {
+            intrinsic = "llvm.x86.avx2.pmul.dq";
+         } else {
+            intrinsic = "llvm.x86.avx2.pmulu.dq";
+         }
+         muleven = lp_build_intrinsic_binary(builder, intrinsic,
+                                             wider_type, aeven, beven);
+         mulodd = lp_build_intrinsic_binary(builder, intrinsic,
+                                            wider_type, aodd, bodd);
+      }
+      else {
+         /* for consistent naming look elsewhere... */
+         if (bld->type.sign) {
+            intrinsic = "llvm.x86.sse41.pmuldq";
+         } else {
+            intrinsic = "llvm.x86.sse2.pmulu.dq";
+         }
+         /*
+          * XXX If we only have AVX but not AVX2 this is a pain.
+          * lp_build_intrinsic_binary_anylength() can't handle it
+          * (due to src and dst type not being identical).
+          */
+         if (bld->type.length == 8) {
+            LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi;
+            LLVMValueRef aoddlo, aoddhi, boddlo, boddhi;
+            LLVMValueRef muleven2[2], mulodd2[2];
+            struct lp_type type_wide_half = type_wide;
+            LLVMTypeRef wtype_half;
+            type_wide_half.length = 2;
+            wtype_half = lp_build_vec_type(gallivm, type_wide_half);
+            aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4);
+            aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4);
+            bevenlo = lp_build_extract_range(gallivm, beven, 0, 4);
+            bevenhi = lp_build_extract_range(gallivm, beven, 4, 4);
+            aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4);
+            aoddhi = lp_build_extract_range(gallivm, aodd, 4, 4);
+            boddlo = lp_build_extract_range(gallivm, bodd, 0, 4);
+            boddhi = lp_build_extract_range(gallivm, bodd, 4, 4);
+            muleven2[0] = lp_build_intrinsic_binary(builder, intrinsic,
+                                                    wtype_half, aevenlo, bevenlo);
+            mulodd2[0] = lp_build_intrinsic_binary(builder, intrinsic,
+                                                   wtype_half, aoddlo, boddlo);
+            muleven2[1] = lp_build_intrinsic_binary(builder, intrinsic,
+                                                    wtype_half, aevenhi, bevenhi);
+            mulodd2[1] = lp_build_intrinsic_binary(builder, intrinsic,
+                                                   wtype_half, aoddhi, boddhi);
+            muleven = lp_build_concat(gallivm, muleven2, type_wide_half, 2);
+            mulodd = lp_build_concat(gallivm, mulodd2, type_wide_half, 2);
+
+         }
+         else {
+            muleven = lp_build_intrinsic_binary(builder, intrinsic,
+                                                wider_type, aeven, beven);
+            mulodd = lp_build_intrinsic_binary(builder, intrinsic,
+                                               wider_type, aodd, bodd);
+         }
+      }
+      muleven = LLVMBuildBitCast(builder, muleven, bld->vec_type, "");
+      mulodd = LLVMBuildBitCast(builder, mulodd, bld->vec_type, "");
+
+      for (i = 0; i < bld->type.length; i += 2) {
+         shuf[i] = lp_build_const_int32(gallivm, i + 1);
+         shuf[i+1] = lp_build_const_int32(gallivm, i + 1 + bld->type.length);
+      }
+      shuf_vec = LLVMConstVector(shuf, bld->type.length);
+      *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
+
+      for (i = 0; i < bld->type.length; i += 2) {
+         shuf[i] = lp_build_const_int32(gallivm, i);
+         shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length);
+      }
+      shuf_vec = LLVMConstVector(shuf, bld->type.length);
+      return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
+   }
+   else {
+      return lp_build_mul_32_lohi(bld, a, b, res_hi);
+   }
+}
+
+
+/*
+ * Widening mul, valid for 32x32 bit -> 64bit only.
+ * Result is low 32bits, high bits returned in res_hi.
+ *
+ * Emits generic code.
+ */
+LLVMValueRef
+lp_build_mul_32_lohi(struct lp_build_context *bld,
+                     LLVMValueRef a,
+                     LLVMValueRef b,
+                     LLVMValueRef *res_hi)
+{
+   struct gallivm_state *gallivm = bld->gallivm;
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMValueRef tmp, shift, res_lo;
+   struct lp_type type_tmp;
+   LLVMTypeRef wide_type, narrow_type;
+
+   type_tmp = bld->type;
+   narrow_type = lp_build_vec_type(gallivm, type_tmp);
+   type_tmp.width *= 2;
+   wide_type = lp_build_vec_type(gallivm, type_tmp);
+   shift = lp_build_const_vec(gallivm, type_tmp, 32);
+
+   if (bld->type.sign) {
+      a = LLVMBuildSExt(builder, a, wide_type, "");
+      b = LLVMBuildSExt(builder, b, wide_type, "");
+   } else {
+      a = LLVMBuildZExt(builder, a, wide_type, "");
+      b = LLVMBuildZExt(builder, b, wide_type, "");
+   }
+   tmp = LLVMBuildMul(builder, a, b, "");
+
+   res_lo = LLVMBuildTrunc(builder, tmp, narrow_type, "");
+
+   /* Since we truncate anyway, LShr and AShr are equivalent. */
+   tmp = LLVMBuildLShr(builder, tmp, shift, "");
+   *res_hi = LLVMBuildTrunc(builder, tmp, narrow_type, "");
+
+   return res_lo;
+}
+
+
+/* a * b + c */
+LLVMValueRef
+lp_build_mad(struct lp_build_context *bld,
+             LLVMValueRef a,
+             LLVMValueRef b,
+             LLVMValueRef c)
+{
+   const struct lp_type type = bld->type;
+   if (type.floating) {
+      return lp_build_fmuladd(bld->gallivm->builder, a, b, c);
+   } else {
+      return lp_build_add(bld, lp_build_mul(bld, a, b), c);
+   }
+}
+
  
  /**
   * Small vector x scale multiplication optimization.
@@ -1113,7 +1372,9 @@ lp_build_div(struct lp_build_context *bld,
           return LLVMConstUDiv(a, b);
     }
  
-   if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
+   /* fast rcp is disabled (just uses div), so makes no sense to try that */
+   if(FALSE &&
+      ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
         (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
        type.floating)
        return lp_build_mul(bld, a, lp_build_rcp(bld, b));
@@ -1153,6 +1414,11 @@ lp_build_lerp_simple(struct lp_build_context *bld,
  
     delta = lp_build_sub(bld, v1, v0);
  
+   if (bld->type.floating) {
+      assert(flags == 0);
+      return lp_build_mad(bld, x, delta, v0);
+   }
+
     if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
        if (!bld->type.sign) {
           if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
@@ -1182,16 +1448,41 @@ lp_build_lerp_simple(struct lp_build_context *bld,
        res = lp_build_mul(bld, x, delta);
     }
  
-   res = lp_build_add(bld, v0, res);
+   if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
+      /*
+       * At this point both res and v0 only use the lower half of the bits,
+       * the rest is zero. Instead of add / mask, do add with half wide type.
+       */
+      struct lp_type narrow_type;
+      struct lp_build_context narrow_bld;
+
+      memset(&narrow_type, 0, sizeof narrow_type);
+      narrow_type.sign   = bld->type.sign;
+      narrow_type.width  = bld->type.width/2;
+      narrow_type.length = bld->type.length*2;
+
+      lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
+      res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
+      v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
+      res = lp_build_add(&narrow_bld, v0, res);
+      res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
+   } else {
+      res = lp_build_add(bld, v0, res);
  
-   if (((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) ||
-       bld->type.fixed) {
-      /* We need to mask out the high order bits when lerping 8bit normalized colors stored on 16bits */
-      /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
-       * but it will be wrong for true fixed point use cases. Basically we need
-       * a more powerful lp_type, capable of further distinguishing the values
-       * interpretation from the value storage. */
-      res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1), "");
+      if (bld->type.fixed) {
+         /*
+          * We need to mask out the high order bits when lerping 8bit
+          * normalized colors stored on 16bits
+          */
+         /* XXX: This step is necessary for lerping 8bit colors stored on
+          * 16bits, but it will be wrong for true fixed point use cases.
+          * Basically we need a more powerful lp_type, capable of further
+          * distinguishing the values interpretation from the value storage.
+          */
+         LLVMValueRef low_bits;
+         low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
+         res = LLVMBuildAnd(builder, res, low_bits, "");
+      }
     }
  
     return res;
@@ -1235,9 +1526,9 @@ lp_build_lerp(struct lp_build_context *bld,
  
        lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
  
-      lp_build_unpack2(bld->gallivm, type, wide_type, x,  &xl,  &xh);
-      lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
-      lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
+      lp_build_unpack2_native(bld->gallivm, type, wide_type, x,  &xl,  &xh);
+      lp_build_unpack2_native(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
+      lp_build_unpack2_native(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
  
        /*
         * Lerp both halves.
@@ -1248,7 +1539,7 @@ lp_build_lerp(struct lp_build_context *bld,
        resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
        resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
  
-      res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
+      res = lp_build_pack2_native(bld->gallivm, wide_type, type, resl, resh);
     } else {
        res = lp_build_lerp_simple(bld, x, v0, v1, flags);
     }
@@ -1492,9 +1783,20 @@ lp_build_abs(struct lp_build_context *bld,
        return a;
  
     if(type.floating) {
-      char intrinsic[32];
-      lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
-      return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
+      if (0x0306 <= HAVE_LLVM && HAVE_LLVM < 0x0309) {
+         /* Workaround llvm.org/PR27332 */
+         LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
+         unsigned long long absMask = ~(1ULL << (type.width - 1));
+         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
+         a = LLVMBuildBitCast(builder, a, int_vec_type, "");
+         a = LLVMBuildAnd(builder, a, mask, "");
+         a = LLVMBuildBitCast(builder, a, vec_type, "");
+         return a;
+      } else {
+         char intrinsic[32];
+         lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
+         return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
+      }
     }
  
     if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
@@ -1507,6 +1809,16 @@ lp_build_abs(struct lp_build_context *bld,
           return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
        }
     }
+   else if (type.width*type.length == 256 && util_cpu_caps.has_avx2) {
+      switch(type.width) {
+      case 8:
+         return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a);
+      case 16:
+         return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.w", vec_type, a);
+      case 32:
+         return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.d", vec_type, a);
+      }
+   }
     else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
              (gallivm_debug & GALLIVM_DEBUG_PERF) &&
              (type.width == 8 || type.width == 16 || type.width == 32)) {
@@ -1663,99 +1975,6 @@ enum lp_build_round_mode
     LP_BUILD_ROUND_TRUNCATE = 3
  };
  
-/**
- * Helper for SSE4.1's ROUNDxx instructions.
- *
- * NOTE: In the SSE4.1's nearest mode, if two values are equally close, the
- * result is the even value.  That is, rounding 2.5 will be 2.0, and not 3.0.
- */
-static inline LLVMValueRef
-lp_build_nearest_sse41(struct lp_build_context *bld,
-                       LLVMValueRef a)
-{
-   LLVMBuilderRef builder = bld->gallivm->builder;
-   const struct lp_type type = bld->type;
-   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
-   LLVMValueRef mode = LLVMConstNull(i32t);
-   const char *intrinsic;
-   LLVMValueRef res;
-
-   assert(type.floating);
-
-   assert(lp_check_value(type, a));
-   assert(util_cpu_caps.has_sse4_1);
-
-   if (type.length == 1) {
-      LLVMTypeRef vec_type;
-      LLVMValueRef undef;
-      LLVMValueRef args[3];
-      LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
-
-      switch(type.width) {
-      case 32:
-         intrinsic = "llvm.x86.sse41.round.ss";
-         break;
-      case 64:
-         intrinsic = "llvm.x86.sse41.round.sd";
-         break;
-      default:
-         assert(0);
-         return bld->undef;
-      }
-
-      vec_type = LLVMVectorType(bld->elem_type, 4);
-
-      undef = LLVMGetUndef(vec_type);
-
-      args[0] = undef;
-      args[1] = LLVMBuildInsertElement(builder, undef, a, index0, "");
-      args[2] = mode;
-
-      res = lp_build_intrinsic(builder, intrinsic,
-                               vec_type, args, Elements(args), 0);
-
-      res = LLVMBuildExtractElement(builder, res, index0, "");
-   }
-   else {
-      if (type.width * type.length == 128) {
-         switch(type.width) {
-         case 32:
-            intrinsic = "llvm.x86.sse41.round.ps";
-            break;
-         case 64:
-            intrinsic = "llvm.x86.sse41.round.pd";
-            break;
-         default:
-            assert(0);
-            return bld->undef;
-         }
-      }
-      else {
-         assert(type.width * type.length == 256);
-         assert(util_cpu_caps.has_avx);
-
-         switch(type.width) {
-         case 32:
-            intrinsic = "llvm.x86.avx.round.ps.256";
-            break;
-         case 64:
-            intrinsic = "llvm.x86.avx.round.pd.256";
-            break;
-         default:
-            assert(0);
-            return bld->undef;
-         }
-      }
-
-      res = lp_build_intrinsic_binary(builder, intrinsic,
-                                      bld->vec_type, a,
-                                      mode);
-   }
-
-   return res;
-}
-
-
  static inline LLVMValueRef
  lp_build_iround_nearest_sse2(struct lp_build_context *bld,
                               LLVMValueRef a)
@@ -1863,11 +2082,7 @@ lp_build_round_arch(struct lp_build_context *bld,
  
        switch (mode) {
        case LP_BUILD_ROUND_NEAREST:
-         if (HAVE_LLVM >= 0x0304) {
-            intrinsic_root = "llvm.round";
-         } else {
-            return lp_build_nearest_sse41(bld, a);
-         }
+         intrinsic_root = "llvm.nearbyint";
           break;
        case LP_BUILD_ROUND_FLOOR:
           intrinsic_root = "llvm.floor";
@@ -2155,8 +2370,8 @@ lp_build_fract(struct lp_build_context *bld,
  
  
  /**
- * Prevent returning a fractional part of 1.0 for very small negative values of
- * 'a' by clamping against 0.99999(9).
+ * Prevent returning 1.0 for very small negative values of 'a' by clamping
+ * against 0.99999(9). (Will also return that value for NaNs.)
   */
  static inline LLVMValueRef
  clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
@@ -2166,13 +2381,14 @@ clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
     /* this is the largest number smaller than 1.0 representable as float */
     max = lp_build_const_vec(bld->gallivm, bld->type,
                              1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
-   return lp_build_min(bld, fract, max);
+   return lp_build_min_ext(bld, fract, max,
+                           GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
  }
  
  
  /**
   * Same as lp_build_fract, but guarantees that the result is always smaller
- * than one.
+ * than one. Will also return the smaller-than-one value for infs, NaNs.
   */
  LLVMValueRef
  lp_build_fract_safe(struct lp_build_context *bld,
@@ -2777,23 +2993,10 @@ lp_build_sin_or_cos(struct lp_build_context *bld,
     /*
      * The magic pass: "Extended precision modular arithmetic"
      * x = ((x - y * DP1) - y * DP2) - y * DP3;
-    * xmm1 = _mm_mul_ps(y, xmm1);
-    * xmm2 = _mm_mul_ps(y, xmm2);
-    * xmm3 = _mm_mul_ps(y, xmm3);
      */
-   LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
-   LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
-   LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
-
-   /*
-    * x = _mm_add_ps(x, xmm1);
-    * x = _mm_add_ps(x, xmm2);
-    * x = _mm_add_ps(x, xmm3);
-    */
-
-   LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
-   LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
-   LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
+   LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
+   LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
+   LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);
  
     /*
      * Evaluate the first polynom  (0 <= x <= Pi/4)
@@ -2815,10 +3018,8 @@ lp_build_sin_or_cos(struct lp_build_context *bld,
      * y = *(v4sf*)_ps_coscof_p0;
      * y = _mm_mul_ps(y, z);
      */
-   LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
-   LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
-   LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
-   LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
+   LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
+   LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
     LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
     LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
  
@@ -2856,13 +3057,10 @@ lp_build_sin_or_cos(struct lp_build_context *bld,
      * y2 = _mm_add_ps(y2, x);
      */
  
-   LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
-   LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
-   LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
-   LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
+   LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
+   LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
     LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
-   LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
-   LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
+   LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);
  
     /*
      * select the correct result from the two polynoms
@@ -3029,19 +3227,19 @@ lp_build_polynomial(struct lp_build_context *bld,
  
        if (i % 2 == 0) {
           if (even)
-            even = lp_build_add(bld, coeff, lp_build_mul(bld, x2, even));
+            even = lp_build_mad(bld, x2, even, coeff);
           else
              even = coeff;
        } else {
           if (odd)
-            odd = lp_build_add(bld, coeff, lp_build_mul(bld, x2, odd));
+            odd = lp_build_mad(bld, x2, odd, coeff);
           else
              odd = coeff;
        }
     }
  
     if (odd)
-      return lp_build_add(bld, lp_build_mul(bld, odd, x), even);
+      return lp_build_mad(bld, odd, x, even);
     else if (even)
        return even;
     else
@@ -3124,7 +3322,7 @@ lp_build_exp2(struct lp_build_context *bld,
     expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
  
     expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
-                                  Elements(lp_build_exp2_polynomial));
+                                  ARRAY_SIZE(lp_build_exp2_polynomial));
  
     res = LLVMBuildFMul(builder, expipart, expfpart, "");
  
@@ -3272,7 +3470,7 @@ lp_build_log2_approx(struct lp_build_context *bld,
     LLVMValueRef exp = NULL;
     LLVMValueRef mant = NULL;
     LLVMValueRef logexp = NULL;
-   LLVMValueRef logmant = NULL;
+   LLVMValueRef p_z = NULL;
     LLVMValueRef res = NULL;
  
     assert(lp_check_value(bld->type, x));
@@ -3321,13 +3519,11 @@ lp_build_log2_approx(struct lp_build_context *bld,
        z = lp_build_mul(bld, y, y);
  
        /* compute P(z) */
-      logmant = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
-                                    Elements(lp_build_log2_polynomial));
-
-      /* logmant = y * P(z) */
-      logmant = lp_build_mul(bld, y, logmant);
+      p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
+                                ARRAY_SIZE(lp_build_log2_polynomial));
  
-      res = lp_build_add(bld, logmant, logexp);
+      /* y * P(z) + logexp */
+      res = lp_build_mad(bld, y, p_z, logexp);
  
        if (type.floating && handle_edge_cases) {
           LLVMValueRef negmask, infmask,  zmask;