gallivm/nir: allow 64-bit arit ops

[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_arit.c
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c

index 3de46287f705691580c9637a149bb3d5ec1b4a72..dbd526d916103d4aea69410b92568b3976424817 100644 (file)
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -47,6 +47,8 @@
  
  #include <float.h>
  
+#include <llvm/Config/llvm-config.h>
+
  #include "util/u_memory.h"
  #include "util/u_debug.h"
  #include "util/u_math.h"
@@ -142,49 +144,6 @@ lp_build_min_simple(struct lp_build_context *bld,
           intrinsic = "llvm.ppc.altivec.vminfp";
           intr_size = 128;
        }
-   } else if (HAVE_LLVM < 0x0309 &&
-              util_cpu_caps.has_avx2 && type.length > 4) {
-      intr_size = 256;
-      switch (type.width) {
-      case 8:
-         intrinsic = type.sign ? "llvm.x86.avx2.pmins.b" : "llvm.x86.avx2.pminu.b";
-         break;
-      case 16:
-         intrinsic = type.sign ? "llvm.x86.avx2.pmins.w" : "llvm.x86.avx2.pminu.w";
-         break;
-      case 32:
-         intrinsic = type.sign ? "llvm.x86.avx2.pmins.d" : "llvm.x86.avx2.pminu.d";
-         break;
-      }
-   } else if (HAVE_LLVM < 0x0309 &&
-              util_cpu_caps.has_sse2 && type.length >= 2) {
-      intr_size = 128;
-      if ((type.width == 8 || type.width == 16) &&
-          (type.width * type.length <= 64) &&
-          (gallivm_debug & GALLIVM_DEBUG_PERF)) {
-         debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
-                      __FUNCTION__);
-      }
-      if (type.width == 8 && !type.sign) {
-         intrinsic = "llvm.x86.sse2.pminu.b";
-      }
-      else if (type.width == 16 && type.sign) {
-         intrinsic = "llvm.x86.sse2.pmins.w";
-      }
-      if (util_cpu_caps.has_sse4_1) {
-         if (type.width == 8 && type.sign) {
-            intrinsic = "llvm.x86.sse41.pminsb";
-         }
-         if (type.width == 16 && !type.sign) {
-            intrinsic = "llvm.x86.sse41.pminuw";
-         }
-         if (type.width == 32 && !type.sign) {
-            intrinsic = "llvm.x86.sse41.pminud";
-         }
-         if (type.width == 32 && type.sign) {
-            intrinsic = "llvm.x86.sse41.pminsd";
-         }
-      }
     } else if (util_cpu_caps.has_altivec) {
        intr_size = 128;
        if (type.width == 8) {
@@ -285,12 +244,7 @@ lp_build_fmuladd(LLVMBuilderRef builder,
     LLVMTypeRef type = LLVMTypeOf(a);
     assert(type == LLVMTypeOf(b));
     assert(type == LLVMTypeOf(c));
-   if (HAVE_LLVM < 0x0304) {
-      /* XXX: LLVM 3.3 does not breakdown llvm.fmuladd into mul+add when FMA is
-       * not supported, and instead it falls-back to a C function.
-       */
-      return LLVMBuildFAdd(builder, LLVMBuildFMul(builder, a, b, ""), c, "");
-   }
+
     char intrinsic[32];
     lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
     LLVMValueRef args[] = { a, b, c };
@@ -360,50 +314,6 @@ lp_build_max_simple(struct lp_build_context *bld,
           intrinsic = "llvm.ppc.altivec.vmaxfp";
           intr_size = 128;
        }
-   } else if (HAVE_LLVM < 0x0309 &&
-              util_cpu_caps.has_avx2 && type.length > 4) {
-      intr_size = 256;
-      switch (type.width) {
-      case 8:
-         intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.b" : "llvm.x86.avx2.pmaxu.b";
-         break;
-      case 16:
-         intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.w" : "llvm.x86.avx2.pmaxu.w";
-         break;
-      case 32:
-         intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.d" : "llvm.x86.avx2.pmaxu.d";
-         break;
-      }
-   } else if (HAVE_LLVM < 0x0309 &&
-              util_cpu_caps.has_sse2 && type.length >= 2) {
-      intr_size = 128;
-      if ((type.width == 8 || type.width == 16) &&
-          (type.width * type.length <= 64) &&
-          (gallivm_debug & GALLIVM_DEBUG_PERF)) {
-         debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
-                      __FUNCTION__);
-         }
-      if (type.width == 8 && !type.sign) {
-         intrinsic = "llvm.x86.sse2.pmaxu.b";
-         intr_size = 128;
-      }
-      else if (type.width == 16 && type.sign) {
-         intrinsic = "llvm.x86.sse2.pmaxs.w";
-      }
-      if (util_cpu_caps.has_sse4_1) {
-         if (type.width == 8 && type.sign) {
-            intrinsic = "llvm.x86.sse41.pmaxsb";
-         }
-         if (type.width == 16 && !type.sign) {
-            intrinsic = "llvm.x86.sse41.pmaxuw";
-         }
-         if (type.width == 32 && !type.sign) {
-            intrinsic = "llvm.x86.sse41.pmaxud";
-        }
-         if (type.width == 32 && type.sign) {
-            intrinsic = "llvm.x86.sse41.pmaxsd";
-         }
-      }
     } else if (util_cpu_caps.has_altivec) {
       intr_size = 128;
       if (type.width == 8) {
@@ -541,39 +451,45 @@ lp_build_add(struct lp_build_context *bld,
     assert(lp_check_value(type, a));
     assert(lp_check_value(type, b));
  
-   if(a == bld->zero)
+   if (a == bld->zero)
        return b;
-   if(b == bld->zero)
+   if (b == bld->zero)
        return a;
-   if(a == bld->undef || b == bld->undef)
+   if (a == bld->undef || b == bld->undef)
        return bld->undef;
  
-   if(bld->type.norm) {
+   if (type.norm) {
        const char *intrinsic = NULL;
  
-      if(a == bld->one || b == bld->one)
+      if (!type.sign && (a == bld->one || b == bld->one))
          return bld->one;
  
        if (!type.floating && !type.fixed) {
+         if (LLVM_VERSION_MAJOR >= 8) {
+            char intrin[32];
+            intrinsic = type.sign ? "llvm.sadd.sat" : "llvm.uadd.sat";
+            lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
+            return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
+         }
           if (type.width * type.length == 128) {
-            if(util_cpu_caps.has_sse2) {
-              if(type.width == 8)
-                intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
-              if(type.width == 16)
-                intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
+            if (util_cpu_caps.has_sse2) {
+               if (type.width == 8)
+                 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
+               if (type.width == 16)
+                 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
              } else if (util_cpu_caps.has_altivec) {
-              if(type.width == 8)
-                 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
-              if(type.width == 16)
-                 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
+               if (type.width == 8)
+                  intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
+               if (type.width == 16)
+                  intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
              }
           }
           if (type.width * type.length == 256) {
-            if(util_cpu_caps.has_avx2) {
-              if(type.width == 8)
-                intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b";
-              if(type.width == 16)
-                intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : "llvm.x86.avx2.paddus.w";
+            if (util_cpu_caps.has_avx2) {
+               if (type.width == 8)
+                  intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b";
+               if (type.width == 16)
+                  intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : "llvm.x86.avx2.paddus.w";
              }
           }
        }
@@ -592,8 +508,6 @@ lp_build_add(struct lp_build_context *bld,
           LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
           LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
           a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
-      } else {
-         a = lp_build_min_simple(bld, a, lp_build_comp(bld, b), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
        }
     }
  
@@ -612,6 +526,25 @@ lp_build_add(struct lp_build_context *bld,
     if(bld->type.norm && (bld->type.floating || bld->type.fixed))
        res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
  
+   if (type.norm && !type.floating && !type.fixed) {
+      if (!type.sign) {
+         /*
+          * newer llvm versions no longer support the intrinsics, but recognize
+          * the pattern. Since auto-upgrade of intrinsics doesn't work for jit
+          * code, it is important we match the pattern llvm uses (and pray llvm
+          * doesn't change it - and hope they decide on the same pattern for
+          * all backends supporting it...).
+          * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
+          * interfere with llvm's ability to recognize the pattern but seems
+          * a bit brittle.
+          * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
+          */
+         LLVMValueRef overflowed = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, res);
+         res = lp_build_select(bld, overflowed,
+                               LLVMConstAllOnes(bld->int_vec_type), res);
+      }
+   }
+
     /* XXX clamp to floor of -1 or 0??? */
  
     return res;
@@ -842,39 +775,45 @@ lp_build_sub(struct lp_build_context *bld,
     assert(lp_check_value(type, a));
     assert(lp_check_value(type, b));
  
-   if(b == bld->zero)
+   if (b == bld->zero)
        return a;
-   if(a == bld->undef || b == bld->undef)
+   if (a == bld->undef || b == bld->undef)
        return bld->undef;
-   if(a == b)
+   if (a == b)
        return bld->zero;
  
-   if(bld->type.norm) {
+   if (type.norm) {
        const char *intrinsic = NULL;
  
-      if(b == bld->one)
+      if (!type.sign && b == bld->one)
          return bld->zero;
  
        if (!type.floating && !type.fixed) {
+         if (LLVM_VERSION_MAJOR >= 8) {
+            char intrin[32];
+            intrinsic = type.sign ? "llvm.ssub.sat" : "llvm.usub.sat";
+            lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
+            return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
+         }
           if (type.width * type.length == 128) {
              if (util_cpu_caps.has_sse2) {
-              if(type.width == 8)
-                 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
-              if(type.width == 16)
-                 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
+               if (type.width == 8)
+                  intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
+               if (type.width == 16)
+                  intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
              } else if (util_cpu_caps.has_altivec) {
-              if(type.width == 8)
-                 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
-              if(type.width == 16)
-                 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
+               if (type.width == 8)
+                  intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
+               if (type.width == 16)
+                  intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
              }
           }
           if (type.width * type.length == 256) {
              if (util_cpu_caps.has_avx2) {
-              if(type.width == 8)
-                 intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b";
-              if(type.width == 16)
-                 intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" : "llvm.x86.avx2.psubus.w";
+               if (type.width == 8)
+                  intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b";
+               if (type.width == 16)
+                  intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" : "llvm.x86.avx2.psubus.w";
              }
           }
        }
@@ -894,7 +833,17 @@ lp_build_sub(struct lp_build_context *bld,
           LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
           a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
        } else {
-         a = lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
+         /*
+          * This must match llvm pattern for saturated unsigned sub.
+          * (lp_build_max_simple actually does the job with its current
+          * definition but do it explicitly here.)
+          * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
+          * interfere with llvm's ability to recognize the pattern but seems
+          * a bit brittle.
+          * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
+          */
+         LLVMValueRef no_ov = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
+         a = lp_build_select(bld, no_ov, a, b);
        }
     }
  
@@ -963,7 +912,7 @@ lp_build_sub(struct lp_build_context *bld,
   * @sa Michael Herf, The "double blend trick", May 2000, 
   *     http://www.stereopsis.com/doubleblend.html
   */
-static LLVMValueRef
+LLVMValueRef
  lp_build_mul_norm(struct gallivm_state *gallivm,
                    struct lp_type wide_type,
                    LLVMValueRef a, LLVMValueRef b)
@@ -1094,12 +1043,14 @@ lp_build_mul(struct lp_build_context *bld,
  /*
   * Widening mul, valid for 32x32 bit -> 64bit only.
   * Result is low 32bits, high bits returned in res_hi.
+ *
+ * Emits code that is meant to be compiled for the host CPU.
   */
  LLVMValueRef
-lp_build_mul_32_lohi(struct lp_build_context *bld,
-                     LLVMValueRef a,
-                     LLVMValueRef b,
-                     LLVMValueRef *res_hi)
+lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
+                         LLVMValueRef a,
+                         LLVMValueRef b,
+                         LLVMValueRef *res_hi)
  {
     struct gallivm_state *gallivm = bld->gallivm;
     LLVMBuilderRef builder = gallivm->builder;
@@ -1120,8 +1071,13 @@ lp_build_mul_32_lohi(struct lp_build_context *bld,
      * https://llvm.org/bugs/show_bug.cgi?id=30845
      * So, whip up our own code, albeit only for length 4 and 8 (which
      * should be good enough)...
+    * FIXME: For llvm >= 7.0 we should match the autoupgrade pattern
+    * (bitcast/and/mul/shuffle for unsigned, bitcast/shl/ashr/mul/shuffle
+    * for signed), which the fallback code does not, without this llvm
+    * will likely still produce atrocious code.
      */
-   if ((bld->type.length == 4 || bld->type.length == 8) &&
+   if (LLVM_VERSION_MAJOR < 7 &&
+       (bld->type.length == 4 || bld->type.length == 8) &&
         ((util_cpu_caps.has_sse2 && (bld->type.sign == 0)) ||
          util_cpu_caps.has_sse4_1)) {
        const char *intrinsic = NULL;
@@ -1216,29 +1172,51 @@ lp_build_mul_32_lohi(struct lp_build_context *bld,
        return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
     }
     else {
-      LLVMValueRef tmp;
-      struct lp_type type_tmp;
-      LLVMTypeRef wide_type, cast_type;
-
-      type_tmp = bld->type;
-      type_tmp.width *= 2;
-      wide_type = lp_build_vec_type(gallivm, type_tmp);
-      type_tmp = bld->type;
-      type_tmp.length *= 2;
-      cast_type = lp_build_vec_type(gallivm, type_tmp);
-
-      if (bld->type.sign) {
-         a = LLVMBuildSExt(builder, a, wide_type, "");
-         b = LLVMBuildSExt(builder, b, wide_type, "");
-      } else {
-         a = LLVMBuildZExt(builder, a, wide_type, "");
-         b = LLVMBuildZExt(builder, b, wide_type, "");
-      }
-      tmp = LLVMBuildMul(builder, a, b, "");
-      tmp = LLVMBuildBitCast(builder, tmp, cast_type, "");
-      *res_hi = lp_build_uninterleave1(gallivm, bld->type.length * 2, tmp, 1);
-      return lp_build_uninterleave1(gallivm, bld->type.length * 2, tmp, 0);
+      return lp_build_mul_32_lohi(bld, a, b, res_hi);
+   }
+}
+
+
+/*
+ * Widening mul, valid for 32x32 bit -> 64bit only.
+ * Result is low 32bits, high bits returned in res_hi.
+ *
+ * Emits generic code.
+ */
+LLVMValueRef
+lp_build_mul_32_lohi(struct lp_build_context *bld,
+                     LLVMValueRef a,
+                     LLVMValueRef b,
+                     LLVMValueRef *res_hi)
+{
+   struct gallivm_state *gallivm = bld->gallivm;
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMValueRef tmp, shift, res_lo;
+   struct lp_type type_tmp;
+   LLVMTypeRef wide_type, narrow_type;
+
+   type_tmp = bld->type;
+   narrow_type = lp_build_vec_type(gallivm, type_tmp);
+   type_tmp.width *= 2;
+   wide_type = lp_build_vec_type(gallivm, type_tmp);
+   shift = lp_build_const_vec(gallivm, type_tmp, 32);
+
+   if (bld->type.sign) {
+      a = LLVMBuildSExt(builder, a, wide_type, "");
+      b = LLVMBuildSExt(builder, b, wide_type, "");
+   } else {
+      a = LLVMBuildZExt(builder, a, wide_type, "");
+      b = LLVMBuildZExt(builder, b, wide_type, "");
     }
+   tmp = LLVMBuildMul(builder, a, b, "");
+
+   res_lo = LLVMBuildTrunc(builder, tmp, narrow_type, "");
+
+   /* Since we truncate anyway, LShr and AShr are equivalent. */
+   tmp = LLVMBuildLShr(builder, tmp, shift, "");
+   *res_hi = LLVMBuildTrunc(builder, tmp, narrow_type, "");
+
+   return res_lo;
  }
  
  
@@ -1283,7 +1261,7 @@ lp_build_mul_imm(struct lp_build_context *bld,
     if(b == 2 && bld->type.floating)
        return lp_build_add(bld, a, a);
  
-   if(util_is_power_of_two(b)) {
+   if(util_is_power_of_two_or_zero(b)) {
        unsigned shift = ffs(b) - 1;
  
        if(bld->type.floating) {
@@ -1348,7 +1326,9 @@ lp_build_div(struct lp_build_context *bld,
           return LLVMConstUDiv(a, b);
     }
  
-   if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
+   /* fast rcp is disabled (just uses div), so makes no sense to try that */
+   if(FALSE &&
+      ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
         (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
        type.floating)
        return lp_build_mul(bld, a, lp_build_rcp(bld, b));
@@ -1757,23 +1737,12 @@ lp_build_abs(struct lp_build_context *bld,
        return a;
  
     if(type.floating) {
-      if (0x0306 <= HAVE_LLVM && HAVE_LLVM < 0x0309) {
-         /* Workaround llvm.org/PR27332 */
-         LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
-         unsigned long long absMask = ~(1ULL << (type.width - 1));
-         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
-         a = LLVMBuildBitCast(builder, a, int_vec_type, "");
-         a = LLVMBuildAnd(builder, a, mask, "");
-         a = LLVMBuildBitCast(builder, a, vec_type, "");
-         return a;
-      } else {
-         char intrinsic[32];
-         lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
-         return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
-      }
+      char intrinsic[32];
+      lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
+      return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
     }
  
-   if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
+   if(type.width*type.length == 128 && util_cpu_caps.has_ssse3 && LLVM_VERSION_MAJOR < 6) {
        switch(type.width) {
        case 8:
           return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
@@ -1783,7 +1752,7 @@ lp_build_abs(struct lp_build_context *bld,
           return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
        }
     }
-   else if (type.width*type.length == 256 && util_cpu_caps.has_avx2) {
+   else if (type.width*type.length == 256 && util_cpu_caps.has_avx2 && LLVM_VERSION_MAJOR < 6) {
        switch(type.width) {
        case 8:
           return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a);
@@ -1793,14 +1762,9 @@ lp_build_abs(struct lp_build_context *bld,
           return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.d", vec_type, a);
        }
     }
-   else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
-            (gallivm_debug & GALLIVM_DEBUG_PERF) &&
-            (type.width == 8 || type.width == 16 || type.width == 32)) {
-      debug_printf("%s: inefficient code, should split vectors manually\n",
-                   __FUNCTION__);
-   }
  
-   return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
+   return lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero),
+                          a, LLVMBuildNeg(builder, a, ""));
  }
  
  
@@ -1932,11 +1896,14 @@ arch_rounding_available(const struct lp_type type)
  {
     if ((util_cpu_caps.has_sse4_1 &&
         (type.length == 1 || type.width*type.length == 128)) ||
-       (util_cpu_caps.has_avx && type.width*type.length == 256))
+       (util_cpu_caps.has_avx && type.width*type.length == 256) ||
+       (util_cpu_caps.has_avx512f && type.width*type.length == 512))
        return TRUE;
     else if ((util_cpu_caps.has_altivec &&
              (type.width == 32 && type.length == 4)))
        return TRUE;
+   else if (util_cpu_caps.has_neon)
+      return TRUE;
  
     return FALSE;
  }
@@ -2044,7 +2011,7 @@ lp_build_round_arch(struct lp_build_context *bld,
                      LLVMValueRef a,
                      enum lp_build_round_mode mode)
  {
-   if (util_cpu_caps.has_sse4_1) {
+   if (util_cpu_caps.has_sse4_1 || util_cpu_caps.has_neon) {
        LLVMBuilderRef builder = bld->gallivm->builder;
        const struct lp_type type = bld->type;
        const char *intrinsic_root;
@@ -2103,8 +2070,6 @@ lp_build_trunc(struct lp_build_context *bld,
        LLVMTypeRef int_vec_type = bld->int_vec_type;
        LLVMTypeRef vec_type = bld->vec_type;
  
-      assert(type.width == 32); /* might want to handle doubles at some point */
-
        inttype = type;
        inttype.floating = 0;
        lp_build_context_init(&intbld, bld->gallivm, inttype);
@@ -2158,8 +2123,6 @@ lp_build_round(struct lp_build_context *bld,
        LLVMTypeRef int_vec_type = bld->int_vec_type;
        LLVMTypeRef vec_type = bld->vec_type;
  
-      assert(type.width == 32); /* might want to handle doubles at some point */
-
        inttype = type;
        inttype.floating = 0;
        lp_build_context_init(&intbld, bld->gallivm, inttype);
@@ -2422,7 +2385,7 @@ lp_build_iround(struct lp_build_context *bld,
     else {
        LLVMValueRef half;
  
-      half = lp_build_const_vec(bld->gallivm, type, 0.5);
+      half = lp_build_const_vec(bld->gallivm, type, nextafterf(0.5, 0.0));
  
        if (type.sign) {
           LLVMTypeRef vec_type = bld->vec_type;
@@ -2636,11 +2599,11 @@ lp_build_sqrt(struct lp_build_context *bld,
  /**
   * Do one Newton-Raphson step to improve reciprocate precision:
   *
- *   x_{i+1} = x_i * (2 - a * x_i)
+ *   x_{i+1} = x_i + x_i * (1 - a * x_i)
   *
   * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
   * +/-Inf, giving NaN instead.  Certain applications rely on this behavior,
- * such as Google Earth, which does RCP(RSQRT(0.0) when drawing the Earth's
+ * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
   * halo. It would be necessary to clamp the argument to prevent this.
   *
   * See also:
@@ -2653,12 +2616,12 @@ lp_build_rcp_refine(struct lp_build_context *bld,
                      LLVMValueRef rcp_a)
  {
     LLVMBuilderRef builder = bld->gallivm->builder;
-   LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
+   LLVMValueRef neg_a;
     LLVMValueRef res;
  
-   res = LLVMBuildFMul(builder, a, rcp_a, "");
-   res = LLVMBuildFSub(builder, two, res, "");
-   res = LLVMBuildFMul(builder, rcp_a, res, "");
+   neg_a = LLVMBuildFNeg(builder, a, "");
+   res = lp_build_fmuladd(builder, neg_a, rcp_a, bld->one);
+   res = lp_build_fmuladd(builder, res, rcp_a, rcp_a);
  
     return res;
  }