src/gallium/auxiliary/gallivm/lp_bld_arit.c

   1 /**************************************************************************
   2  *
   3  * Copyright 2009-2010 VMware, Inc.
   4  * All Rights Reserved.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the
   8  * "Software"), to deal in the Software without restriction, including
   9  * without limitation the rights to use, copy, modify, merge, publish,
  10  * distribute, sub license, and/or sell copies of the Software, and to
  11  * permit persons to whom the Software is furnished to do so, subject to
  12  * the following conditions:
  13  *
  14  * The above copyright notice and this permission notice (including the
  15  * next paragraph) shall be included in all copies or substantial portions
  16  * of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  21  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
  22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25  *
  26  **************************************************************************/
  27
  28
  29 /**
  30  * @file
  31  * Helper
  32  *
  33  * LLVM IR doesn't support all basic arithmetic operations we care about (most
  34  * notably min/max and saturated operations), and it is often necessary to
  35  * resort to machine-specific intrinsics directly. The functions here hide all
  36  * these implementation details from the other modules.
  37  *
  38  * We also do simple expression simplification here. Reasons are:
  39  * - it is very easy given we have all necessary information readily available
  40  * - LLVM optimization passes fail to simplify several vector expressions
  41  * - We often know value constraints which the optimization passes have no way
  42  *   of knowing, such as when source arguments are known to be in [0, 1] range.
  43  *
  44  * @author Jose Fonseca <jfonseca@vmware.com>
  45  */
  46
  47
  48 #include <float.h>
  49
  50 #include <llvm/Config/llvm-config.h>
  51
  52 #include "util/u_memory.h"
  53 #include "util/u_debug.h"
  54 #include "util/u_math.h"
  55 #include "util/u_cpu_detect.h"
  56
  57 #include "lp_bld_type.h"
  58 #include "lp_bld_const.h"
  59 #include "lp_bld_init.h"
  60 #include "lp_bld_intr.h"
  61 #include "lp_bld_logic.h"
  62 #include "lp_bld_pack.h"
  63 #include "lp_bld_debug.h"
  64 #include "lp_bld_bitarit.h"
  65 #include "lp_bld_arit.h"
  66 #include "lp_bld_flow.h"
  67
  68 #if defined(PIPE_ARCH_SSE)
  69 #include <xmmintrin.h>
  70 #endif
  71
  72 #ifndef _MM_DENORMALS_ZERO_MASK
  73 #define _MM_DENORMALS_ZERO_MASK 0x0040
  74 #endif
  75
  76 #ifndef _MM_FLUSH_ZERO_MASK
  77 #define _MM_FLUSH_ZERO_MASK 0x8000
  78 #endif
  79
  80 #define EXP_POLY_DEGREE 5
  81
  82 #define LOG_POLY_DEGREE 4
  83
  84
  85 /**
  86  * Generate min(a, b)
  87  * No checks for special case values of a or b = 1 or 0 are done.
  88  * NaN's are handled according to the behavior specified by the
  89  * nan_behavior argument.
  90  */
  91 static LLVMValueRef
  92 lp_build_min_simple(struct lp_build_context *bld,
  93                     LLVMValueRef a,
  94                     LLVMValueRef b,
  95                     enum gallivm_nan_behavior nan_behavior)
  96 {
  97    const struct lp_type type = bld->type;
  98    const char *intrinsic = NULL;
  99    unsigned intr_size = 0;
 100    LLVMValueRef cond;
 101
 102    assert(lp_check_value(type, a));
 103    assert(lp_check_value(type, b));
 104
 105    /* TODO: optimize the constant case */
 106
 107    if (type.floating && util_cpu_caps.has_sse) {
 108       if (type.width == 32) {
 109          if (type.length == 1) {
 110             intrinsic = "llvm.x86.sse.min.ss";
 111             intr_size = 128;
 112          }
 113          else if (type.length <= 4 || !util_cpu_caps.has_avx) {
 114             intrinsic = "llvm.x86.sse.min.ps";
 115             intr_size = 128;
 116          }
 117          else {
 118             intrinsic = "llvm.x86.avx.min.ps.256";
 119             intr_size = 256;
 120          }
 121       }
 122       if (type.width == 64 && util_cpu_caps.has_sse2) {
 123          if (type.length == 1) {
 124             intrinsic = "llvm.x86.sse2.min.sd";
 125             intr_size = 128;
 126          }
 127          else if (type.length == 2 || !util_cpu_caps.has_avx) {
 128             intrinsic = "llvm.x86.sse2.min.pd";
 129             intr_size = 128;
 130          }
 131          else {
 132             intrinsic = "llvm.x86.avx.min.pd.256";
 133             intr_size = 256;
 134          }
 135       }
 136    }
 137    else if (type.floating && util_cpu_caps.has_altivec) {
 138       if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
 139           nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
 140          debug_printf("%s: altivec doesn't support nan return nan behavior\n",
 141                       __FUNCTION__);
 142       }
 143       if (type.width == 32 && type.length == 4) {
 144          intrinsic = "llvm.ppc.altivec.vminfp";
 145          intr_size = 128;
 146       }
 147    } else if (util_cpu_caps.has_altivec) {
 148       intr_size = 128;
 149       if (type.width == 8) {
 150          if (!type.sign) {
 151             intrinsic = "llvm.ppc.altivec.vminub";
 152          } else {
 153             intrinsic = "llvm.ppc.altivec.vminsb";
 154          }
 155       } else if (type.width == 16) {
 156          if (!type.sign) {
 157             intrinsic = "llvm.ppc.altivec.vminuh";
 158          } else {
 159             intrinsic = "llvm.ppc.altivec.vminsh";
 160          }
 161       } else if (type.width == 32) {
 162          if (!type.sign) {
 163             intrinsic = "llvm.ppc.altivec.vminuw";
 164          } else {
 165             intrinsic = "llvm.ppc.altivec.vminsw";
 166          }
 167       }
 168    }
 169
 170    if (intrinsic) {
 171       /* We need to handle nan's for floating point numbers. If one of the
 172        * inputs is nan the other should be returned (required by both D3D10+
 173        * and OpenCL).
 174        * The sse intrinsics return the second operator in case of nan by
 175        * default so we need to special code to handle those.
 176        */
 177       if (util_cpu_caps.has_sse && type.floating &&
 178           nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
 179           nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
 180           nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
 181          LLVMValueRef isnan, min;
 182          min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
 183                                                    type,
 184                                                    intr_size, a, b);
 185          if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
 186             isnan = lp_build_isnan(bld, b);
 187             return lp_build_select(bld, isnan, a, min);
 188          } else {
 189             assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
 190             isnan = lp_build_isnan(bld, a);
 191             return lp_build_select(bld, isnan, a, min);
 192          }
 193       } else {
 194          return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
 195                                                     type,
 196                                                     intr_size, a, b);
 197       }
 198    }
 199
 200    if (type.floating) {
 201       switch (nan_behavior) {
 202       case GALLIVM_NAN_RETURN_NAN: {
 203          LLVMValueRef isnan = lp_build_isnan(bld, b);
 204          cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
 205          cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
 206          return lp_build_select(bld, cond, a, b);
 207       }
 208          break;
 209       case GALLIVM_NAN_RETURN_OTHER: {
 210          LLVMValueRef isnan = lp_build_isnan(bld, a);
 211          cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
 212          cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
 213          return lp_build_select(bld, cond, a, b);
 214       }
 215          break;
 216       case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
 217          cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
 218          return lp_build_select(bld, cond, a, b);
 219       case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
 220          cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
 221          return lp_build_select(bld, cond, b, a);
 222       case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
 223          cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
 224          return lp_build_select(bld, cond, a, b);
 225          break;
 226       default:
 227          assert(0);
 228          cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
 229          return lp_build_select(bld, cond, a, b);
 230       }
 231    } else {
 232       cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
 233       return lp_build_select(bld, cond, a, b);
 234    }
 235 }
 236
 237
 238 LLVMValueRef
 239 lp_build_fmuladd(LLVMBuilderRef builder,
 240                  LLVMValueRef a,
 241                  LLVMValueRef b,
 242                  LLVMValueRef c)
 243 {
 244    LLVMTypeRef type = LLVMTypeOf(a);
 245    assert(type == LLVMTypeOf(b));
 246    assert(type == LLVMTypeOf(c));
 247
 248    char intrinsic[32];
 249    lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
 250    LLVMValueRef args[] = { a, b, c };
 251    return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0);
 252 }
 253
 254
 255 /**
 256  * Generate max(a, b)
 257  * No checks for special case values of a or b = 1 or 0 are done.
 258  * NaN's are handled according to the behavior specified by the
 259  * nan_behavior argument.
 260  */
 261 static LLVMValueRef
 262 lp_build_max_simple(struct lp_build_context *bld,
 263                     LLVMValueRef a,
 264                     LLVMValueRef b,
 265                     enum gallivm_nan_behavior nan_behavior)
 266 {
 267    const struct lp_type type = bld->type;
 268    const char *intrinsic = NULL;
 269    unsigned intr_size = 0;
 270    LLVMValueRef cond;
 271
 272    assert(lp_check_value(type, a));
 273    assert(lp_check_value(type, b));
 274
 275    /* TODO: optimize the constant case */
 276
 277    if (type.floating && util_cpu_caps.has_sse) {
 278       if (type.width == 32) {
 279          if (type.length == 1) {
 280             intrinsic = "llvm.x86.sse.max.ss";
 281             intr_size = 128;
 282          }
 283          else if (type.length <= 4 || !util_cpu_caps.has_avx) {
 284             intrinsic = "llvm.x86.sse.max.ps";
 285             intr_size = 128;
 286          }
 287          else {
 288             intrinsic = "llvm.x86.avx.max.ps.256";
 289             intr_size = 256;
 290          }
 291       }
 292       if (type.width == 64 && util_cpu_caps.has_sse2) {
 293          if (type.length == 1) {
 294             intrinsic = "llvm.x86.sse2.max.sd";
 295             intr_size = 128;
 296          }
 297          else if (type.length == 2 || !util_cpu_caps.has_avx) {
 298             intrinsic = "llvm.x86.sse2.max.pd";
 299             intr_size = 128;
 300          }
 301          else {
 302             intrinsic = "llvm.x86.avx.max.pd.256";
 303             intr_size = 256;
 304          }
 305       }
 306    }
 307    else if (type.floating && util_cpu_caps.has_altivec) {
 308       if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
 309           nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
 310          debug_printf("%s: altivec doesn't support nan return nan behavior\n",
 311                       __FUNCTION__);
 312       }
 313       if (type.width == 32 || type.length == 4) {
 314          intrinsic = "llvm.ppc.altivec.vmaxfp";
 315          intr_size = 128;
 316       }
 317    } else if (util_cpu_caps.has_altivec) {
 318      intr_size = 128;
 319      if (type.width == 8) {
 320        if (!type.sign) {
 321          intrinsic = "llvm.ppc.altivec.vmaxub";
 322        } else {
 323          intrinsic = "llvm.ppc.altivec.vmaxsb";
 324        }
 325      } else if (type.width == 16) {
 326        if (!type.sign) {
 327          intrinsic = "llvm.ppc.altivec.vmaxuh";
 328        } else {
 329          intrinsic = "llvm.ppc.altivec.vmaxsh";
 330        }
 331      } else if (type.width == 32) {
 332        if (!type.sign) {
 333          intrinsic = "llvm.ppc.altivec.vmaxuw";
 334        } else {
 335          intrinsic = "llvm.ppc.altivec.vmaxsw";
 336        }
 337      }
 338    }
 339
 340    if (intrinsic) {
 341       if (util_cpu_caps.has_sse && type.floating &&
 342           nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
 343           nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
 344           nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
 345          LLVMValueRef isnan, max;
 346          max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
 347                                                    type,
 348                                                    intr_size, a, b);
 349          if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
 350             isnan = lp_build_isnan(bld, b);
 351             return lp_build_select(bld, isnan, a, max);
 352          } else {
 353             assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
 354             isnan = lp_build_isnan(bld, a);
 355             return lp_build_select(bld, isnan, a, max);
 356          }
 357       } else {
 358          return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
 359                                                     type,
 360                                                     intr_size, a, b);
 361       }
 362    }
 363
 364    if (type.floating) {
 365       switch (nan_behavior) {
 366       case GALLIVM_NAN_RETURN_NAN: {
 367          LLVMValueRef isnan = lp_build_isnan(bld, b);
 368          cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
 369          cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
 370          return lp_build_select(bld, cond, a, b);
 371       }
 372          break;
 373       case GALLIVM_NAN_RETURN_OTHER: {
 374          LLVMValueRef isnan = lp_build_isnan(bld, a);
 375          cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
 376          cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
 377          return lp_build_select(bld, cond, a, b);
 378       }
 379          break;
 380       case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
 381          cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
 382          return lp_build_select(bld, cond, a, b);
 383       case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
 384          cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
 385          return lp_build_select(bld, cond, b, a);
 386       case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
 387          cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
 388          return lp_build_select(bld, cond, a, b);
 389          break;
 390       default:
 391          assert(0);
 392          cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
 393          return lp_build_select(bld, cond, a, b);
 394       }
 395    } else {
 396       cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
 397       return lp_build_select(bld, cond, a, b);
 398    }
 399 }
 400
 401
 402 /**
 403  * Generate 1 - a, or ~a depending on bld->type.
 404  */
 405 LLVMValueRef
 406 lp_build_comp(struct lp_build_context *bld,
 407               LLVMValueRef a)
 408 {
 409    LLVMBuilderRef builder = bld->gallivm->builder;
 410    const struct lp_type type = bld->type;
 411
 412    assert(lp_check_value(type, a));
 413
 414    if(a == bld->one)
 415       return bld->zero;
 416    if(a == bld->zero)
 417       return bld->one;
 418
 419    if(type.norm && !type.floating && !type.fixed && !type.sign) {
 420       if(LLVMIsConstant(a))
 421          return LLVMConstNot(a);
 422       else
 423          return LLVMBuildNot(builder, a, "");
 424    }
 425
 426    if(LLVMIsConstant(a))
 427       if (type.floating)
 428           return LLVMConstFSub(bld->one, a);
 429       else
 430           return LLVMConstSub(bld->one, a);
 431    else
 432       if (type.floating)
 433          return LLVMBuildFSub(builder, bld->one, a, "");
 434       else
 435          return LLVMBuildSub(builder, bld->one, a, "");
 436 }
 437
 438
 439 /**
 440  * Generate a + b
 441  */
 442 LLVMValueRef
 443 lp_build_add(struct lp_build_context *bld,
 444              LLVMValueRef a,
 445              LLVMValueRef b)
 446 {
 447    LLVMBuilderRef builder = bld->gallivm->builder;
 448    const struct lp_type type = bld->type;
 449    LLVMValueRef res;
 450
 451    assert(lp_check_value(type, a));
 452    assert(lp_check_value(type, b));
 453
 454    if (a == bld->zero)
 455       return b;
 456    if (b == bld->zero)
 457       return a;
 458    if (a == bld->undef || b == bld->undef)
 459       return bld->undef;
 460
 461    if (type.norm) {
 462       const char *intrinsic = NULL;
 463
 464       if (!type.sign && (a == bld->one || b == bld->one))
 465         return bld->one;
 466
 467       if (!type.floating && !type.fixed) {
 468          if (LLVM_VERSION_MAJOR >= 8) {
 469             char intrin[32];
 470             intrinsic = type.sign ? "llvm.sadd.sat" : "llvm.uadd.sat";
 471             lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
 472             return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
 473          }
 474          if (type.width * type.length == 128) {
 475             if (util_cpu_caps.has_sse2) {
 476                if (type.width == 8)
 477                  intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
 478                if (type.width == 16)
 479                  intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
 480             } else if (util_cpu_caps.has_altivec) {
 481                if (type.width == 8)
 482                   intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
 483                if (type.width == 16)
 484                   intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
 485             }
 486          }
 487          if (type.width * type.length == 256) {
 488             if (util_cpu_caps.has_avx2) {
 489                if (type.width == 8)
 490                   intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b";
 491                if (type.width == 16)
 492                   intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : "llvm.x86.avx2.paddus.w";
 493             }
 494          }
 495       }
 496
 497       if (intrinsic)
 498          return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
 499    }
 500
 501    if(type.norm && !type.floating && !type.fixed) {
 502       if (type.sign) {
 503          uint64_t sign = (uint64_t)1 << (type.width - 1);
 504          LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
 505          LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
 506          /* a_clamp_max is the maximum a for positive b,
 507             a_clamp_min is the minimum a for negative b. */
 508          LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
 509          LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
 510          a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
 511       }
 512    }
 513
 514    if(LLVMIsConstant(a) && LLVMIsConstant(b))
 515       if (type.floating)
 516          res = LLVMConstFAdd(a, b);
 517       else
 518          res = LLVMConstAdd(a, b);
 519    else
 520       if (type.floating)
 521          res = LLVMBuildFAdd(builder, a, b, "");
 522       else
 523          res = LLVMBuildAdd(builder, a, b, "");
 524
 525    /* clamp to ceiling of 1.0 */
 526    if(bld->type.norm && (bld->type.floating || bld->type.fixed))
 527       res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
 528
 529    if (type.norm && !type.floating && !type.fixed) {
 530       if (!type.sign) {
 531          /*
 532           * newer llvm versions no longer support the intrinsics, but recognize
 533           * the pattern. Since auto-upgrade of intrinsics doesn't work for jit
 534           * code, it is important we match the pattern llvm uses (and pray llvm
 535           * doesn't change it - and hope they decide on the same pattern for
 536           * all backends supporting it...).
 537           * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
 538           * interfere with llvm's ability to recognize the pattern but seems
 539           * a bit brittle.
 540           * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
 541           */
 542          LLVMValueRef overflowed = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, res);
 543          res = lp_build_select(bld, overflowed,
 544                                LLVMConstAllOnes(bld->int_vec_type), res);
 545       }
 546    }
 547
 548    /* XXX clamp to floor of -1 or 0??? */
 549
 550    return res;
 551 }
 552
 553
 554 /** Return the scalar sum of the elements of a.
 555  * Should avoid this operation whenever possible.
 556  */
 557 LLVMValueRef
 558 lp_build_horizontal_add(struct lp_build_context *bld,
 559                         LLVMValueRef a)
 560 {
 561    LLVMBuilderRef builder = bld->gallivm->builder;
 562    const struct lp_type type = bld->type;
 563    LLVMValueRef index, res;
 564    unsigned i, length;
 565    LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
 566    LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
 567    LLVMValueRef vecres, elem2;
 568
 569    assert(lp_check_value(type, a));
 570
 571    if (type.length == 1) {
 572       return a;
 573    }
 574
 575    assert(!bld->type.norm);
 576
 577    /*
 578     * for byte vectors can do much better with psadbw.
 579     * Using repeated shuffle/adds here. Note with multiple vectors
 580     * this can be done more efficiently as outlined in the intel
 581     * optimization manual.
 582     * Note: could cause data rearrangement if used with smaller element
 583     * sizes.
 584     */
 585
 586    vecres = a;
 587    length = type.length / 2;
 588    while (length > 1) {
 589       LLVMValueRef vec1, vec2;
 590       for (i = 0; i < length; i++) {
 591          shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
 592          shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
 593       }
 594       vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
 595                                     LLVMConstVector(shuffles1, length), "");
 596       vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
 597                                     LLVMConstVector(shuffles2, length), "");
 598       if (type.floating) {
 599          vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
 600       }
 601       else {
 602          vecres = LLVMBuildAdd(builder, vec1, vec2, "");
 603       }
 604       length = length >> 1;
 605    }
 606
 607    /* always have vector of size 2 here */
 608    assert(length == 1);
 609
 610    index = lp_build_const_int32(bld->gallivm, 0);
 611    res = LLVMBuildExtractElement(builder, vecres, index, "");
 612    index = lp_build_const_int32(bld->gallivm, 1);
 613    elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
 614
 615    if (type.floating)
 616       res = LLVMBuildFAdd(builder, res, elem2, "");
 617     else
 618       res = LLVMBuildAdd(builder, res, elem2, "");
 619
 620    return res;
 621 }
 622
 623 /**
 624  * Return the horizontal sums of 4 float vectors as a float4 vector.
 625  * This uses the technique as outlined in Intel Optimization Manual.
 626  */
 627 static LLVMValueRef
 628 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
 629                             LLVMValueRef src[4])
 630 {
 631    struct gallivm_state *gallivm = bld->gallivm;
 632    LLVMBuilderRef builder = gallivm->builder;
 633    LLVMValueRef shuffles[4];
 634    LLVMValueRef tmp[4];
 635    LLVMValueRef sumtmp[2], shuftmp[2];
 636
 637    /* lower half of regs */
 638    shuffles[0] = lp_build_const_int32(gallivm, 0);
 639    shuffles[1] = lp_build_const_int32(gallivm, 1);
 640    shuffles[2] = lp_build_const_int32(gallivm, 4);
 641    shuffles[3] = lp_build_const_int32(gallivm, 5);
 642    tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
 643                                    LLVMConstVector(shuffles, 4), "");
 644    tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
 645                                    LLVMConstVector(shuffles, 4), "");
 646
 647    /* upper half of regs */
 648    shuffles[0] = lp_build_const_int32(gallivm, 2);
 649    shuffles[1] = lp_build_const_int32(gallivm, 3);
 650    shuffles[2] = lp_build_const_int32(gallivm, 6);
 651    shuffles[3] = lp_build_const_int32(gallivm, 7);
 652    tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
 653                                    LLVMConstVector(shuffles, 4), "");
 654    tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
 655                                    LLVMConstVector(shuffles, 4), "");
 656
 657    sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
 658    sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
 659
 660    shuffles[0] = lp_build_const_int32(gallivm, 0);
 661    shuffles[1] = lp_build_const_int32(gallivm, 2);
 662    shuffles[2] = lp_build_const_int32(gallivm, 4);
 663    shuffles[3] = lp_build_const_int32(gallivm, 6);
 664    shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
 665                                        LLVMConstVector(shuffles, 4), "");
 666
 667    shuffles[0] = lp_build_const_int32(gallivm, 1);
 668    shuffles[1] = lp_build_const_int32(gallivm, 3);
 669    shuffles[2] = lp_build_const_int32(gallivm, 5);
 670    shuffles[3] = lp_build_const_int32(gallivm, 7);
 671    shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
 672                                        LLVMConstVector(shuffles, 4), "");
 673
 674    return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
 675 }
 676
 677
 678 /*
 679  * partially horizontally add 2-4 float vectors with length nx4,
 680  * i.e. only four adjacent values in each vector will be added,
 681  * assuming values are really grouped in 4 which also determines
 682  * output order.
 683  *
 684  * Return a vector of the same length as the initial vectors,
 685  * with the excess elements (if any) being undefined.
 686  * The element order is independent of number of input vectors.
 687  * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
 688  * the output order thus will be
 689  * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4z7,undef
 690  */
 691 LLVMValueRef
 692 lp_build_hadd_partial4(struct lp_build_context *bld,
 693                        LLVMValueRef vectors[],
 694                        unsigned num_vecs)
 695 {
 696    struct gallivm_state *gallivm = bld->gallivm;
 697    LLVMBuilderRef builder = gallivm->builder;
 698    LLVMValueRef ret_vec;
 699    LLVMValueRef tmp[4];
 700    const char *intrinsic = NULL;
 701
 702    assert(num_vecs >= 2 && num_vecs <= 4);
 703    assert(bld->type.floating);
 704
 705    /* only use this with at least 2 vectors, as it is sort of expensive
 706     * (depending on cpu) and we always need two horizontal adds anyway,
 707     * so a shuffle/add approach might be better.
 708     */
 709
 710    tmp[0] = vectors[0];
 711    tmp[1] = vectors[1];
 712
 713    tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
 714    tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
 715
 716    if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
 717        bld->type.length == 4) {
 718       intrinsic = "llvm.x86.sse3.hadd.ps";
 719    }
 720    else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
 721             bld->type.length == 8) {
 722       intrinsic = "llvm.x86.avx.hadd.ps.256";
 723    }
 724    if (intrinsic) {
 725       tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
 726                                        lp_build_vec_type(gallivm, bld->type),
 727                                        tmp[0], tmp[1]);
 728       if (num_vecs > 2) {
 729          tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
 730                                           lp_build_vec_type(gallivm, bld->type),
 731                                           tmp[2], tmp[3]);
 732       }
 733       else {
 734          tmp[1] = tmp[0];
 735       }
 736       return lp_build_intrinsic_binary(builder, intrinsic,
 737                                        lp_build_vec_type(gallivm, bld->type),
 738                                        tmp[0], tmp[1]);
 739    }
 740
 741    if (bld->type.length == 4) {
 742       ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
 743    }
 744    else {
 745       LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
 746       unsigned j;
 747       unsigned num_iter = bld->type.length / 4;
 748       struct lp_type parttype = bld->type;
 749       parttype.length = 4;
 750       for (j = 0; j < num_iter; j++) {
 751          LLVMValueRef partsrc[4];
 752          unsigned i;
 753          for (i = 0; i < 4; i++) {
 754             partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
 755          }
 756          partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
 757       }
 758       ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
 759    }
 760    return ret_vec;
 761 }
 762
 763 /**
 764  * Generate a - b
 765  */
 766 LLVMValueRef
 767 lp_build_sub(struct lp_build_context *bld,
 768              LLVMValueRef a,
 769              LLVMValueRef b)
 770 {
 771    LLVMBuilderRef builder = bld->gallivm->builder;
 772    const struct lp_type type = bld->type;
 773    LLVMValueRef res;
 774
 775    assert(lp_check_value(type, a));
 776    assert(lp_check_value(type, b));
 777
 778    if (b == bld->zero)
 779       return a;
 780    if (a == bld->undef || b == bld->undef)
 781       return bld->undef;
 782    if (a == b)
 783       return bld->zero;
 784
 785    if (type.norm) {
 786       const char *intrinsic = NULL;
 787
 788       if (!type.sign && b == bld->one)
 789         return bld->zero;
 790
 791       if (!type.floating && !type.fixed) {
 792          if (LLVM_VERSION_MAJOR >= 8) {
 793             char intrin[32];
 794             intrinsic = type.sign ? "llvm.ssub.sat" : "llvm.usub.sat";
 795             lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
 796             return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
 797          }
 798          if (type.width * type.length == 128) {
 799             if (util_cpu_caps.has_sse2) {
 800                if (type.width == 8)
 801                   intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
 802                if (type.width == 16)
 803                   intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
 804             } else if (util_cpu_caps.has_altivec) {
 805                if (type.width == 8)
 806                   intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
 807                if (type.width == 16)
 808                   intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
 809             }
 810          }
 811          if (type.width * type.length == 256) {
 812             if (util_cpu_caps.has_avx2) {
 813                if (type.width == 8)
 814                   intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b";
 815                if (type.width == 16)
 816                   intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" : "llvm.x86.avx2.psubus.w";
 817             }
 818          }
 819       }
 820
 821       if (intrinsic)
 822          return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
 823    }
 824
 825    if(type.norm && !type.floating && !type.fixed) {
 826       if (type.sign) {
 827          uint64_t sign = (uint64_t)1 << (type.width - 1);
 828          LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
 829          LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
 830          /* a_clamp_max is the maximum a for negative b,
 831             a_clamp_min is the minimum a for positive b. */
 832          LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
 833          LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
 834          a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
 835       } else {
 836          /*
 837           * This must match llvm pattern for saturated unsigned sub.
 838           * (lp_build_max_simple actually does the job with its current
 839           * definition but do it explicitly here.)
 840           * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
 841           * interfere with llvm's ability to recognize the pattern but seems
 842           * a bit brittle.
 843           * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
 844           */
 845          LLVMValueRef no_ov = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
 846          a = lp_build_select(bld, no_ov, a, b);
 847       }
 848    }
 849
 850    if(LLVMIsConstant(a) && LLVMIsConstant(b))
 851       if (type.floating)
 852          res = LLVMConstFSub(a, b);
 853       else
 854          res = LLVMConstSub(a, b);
 855    else
 856       if (type.floating)
 857          res = LLVMBuildFSub(builder, a, b, "");
 858       else
 859          res = LLVMBuildSub(builder, a, b, "");
 860
 861    if(bld->type.norm && (bld->type.floating || bld->type.fixed))
 862       res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
 863
 864    return res;
 865 }
 866
 867
 868
 869 /**
 870  * Normalized multiplication.
 871  *
 872  * There are several approaches for (using 8-bit normalized multiplication as
 873  * an example):
 874  *
 875  * - alpha plus one
 876  *
 877  *     makes the following approximation to the division (Sree)
 878  *
 *       a*b/255 ~= (a*(b + 1)) >> 8
 880  *
 881  *     which is the fastest method that satisfies the following OpenGL criteria of
 882  *
 883  *       0*0 = 0 and 255*255 = 255
 884  *
 885  * - geometric series
 886  *
 887  *     takes the geometric series approximation to the division
 888  *
 889  *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
 890  *
 891  *     in this case just the first two terms to fit in 16bit arithmetic
 892  *
 893  *       t/255 ~= (t + (t >> 8)) >> 8
 894  *
 *     note that just by itself it doesn't satisfy the OpenGL criteria, as
 896  *     255*255 = 254, so the special case b = 255 must be accounted or roundoff
 897  *     must be used.
 898  *
 899  * - geometric series plus rounding
 900  *
 901  *     when using a geometric series division instead of truncating the result
 902  *     use roundoff in the approximation (Jim Blinn)
 903  *
 904  *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
 905  *
 906  *     achieving the exact results.
 907  *
 908  *
 909  *
 910  * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
 911  *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
 912  * @sa Michael Herf, The "double blend trick", May 2000,
 913  *     http://www.stereopsis.com/doubleblend.html
 914  */
 915 LLVMValueRef
 916 lp_build_mul_norm(struct gallivm_state *gallivm,
 917                   struct lp_type wide_type,
 918                   LLVMValueRef a, LLVMValueRef b)
 919 {
 920    LLVMBuilderRef builder = gallivm->builder;
 921    struct lp_build_context bld;
 922    unsigned n;
 923    LLVMValueRef half;
 924    LLVMValueRef ab;
 925
 926    assert(!wide_type.floating);
 927    assert(lp_check_value(wide_type, a));
 928    assert(lp_check_value(wide_type, b));
 929
 930    lp_build_context_init(&bld, gallivm, wide_type);
 931
 932    n = wide_type.width / 2;
 933    if (wide_type.sign) {
 934       --n;
 935    }
 936
 937    /*
 938     * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
 939     * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
 940     */
 941
 942    /*
 943     * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
 944     */
 945
 946    ab = LLVMBuildMul(builder, a, b, "");
 947    ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
 948
 949    /*
 950     * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
 951     */
 952
 953    half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
 954    if (wide_type.sign) {
 955       LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
 956       LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
 957       half = lp_build_select(&bld, sign, minus_half, half);
 958    }
 959    ab = LLVMBuildAdd(builder, ab, half, "");
 960
 961    /* Final division */
 962    ab = lp_build_shr_imm(&bld, ab, n);
 963
 964    return ab;
 965 }
 966
 967 /**
 968  * Generate a * b
 969  */
 970 LLVMValueRef
 971 lp_build_mul(struct lp_build_context *bld,
 972              LLVMValueRef a,
 973              LLVMValueRef b)
 974 {
 975    LLVMBuilderRef builder = bld->gallivm->builder;
 976    const struct lp_type type = bld->type;
 977    LLVMValueRef shift;
 978    LLVMValueRef res;
 979
 980    assert(lp_check_value(type, a));
 981    assert(lp_check_value(type, b));
 982
 983    if(a == bld->zero)
 984       return bld->zero;
 985    if(a == bld->one)
 986       return b;
 987    if(b == bld->zero)
 988       return bld->zero;
 989    if(b == bld->one)
 990       return a;
 991    if(a == bld->undef || b == bld->undef)
 992       return bld->undef;
 993
 994    if (!type.floating && !type.fixed && type.norm) {
 995       struct lp_type wide_type = lp_wider_type(type);
 996       LLVMValueRef al, ah, bl, bh, abl, abh, ab;
 997
 998       lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah);
 999       lp_build_unpack2_native(bld->gallivm, type, wide_type, b, &bl, &bh);
1000
1001       /* PMULLW, PSRLW, PADDW */
1002       abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
1003       abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
1004
1005       ab = lp_build_pack2_native(bld->gallivm, wide_type, type, abl, abh);
1006
1007       return ab;
1008    }
1009
1010    if(type.fixed)
1011       shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
1012    else
1013       shift = NULL;
1014
1015    if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1016       if (type.floating)
1017          res = LLVMConstFMul(a, b);
1018       else
1019          res = LLVMConstMul(a, b);
1020       if(shift) {
1021          if(type.sign)
1022             res = LLVMConstAShr(res, shift);
1023          else
1024             res = LLVMConstLShr(res, shift);
1025       }
1026    }
1027    else {
1028       if (type.floating)
1029          res = LLVMBuildFMul(builder, a, b, "");
1030       else
1031          res = LLVMBuildMul(builder, a, b, "");
1032       if(shift) {
1033          if(type.sign)
1034             res = LLVMBuildAShr(builder, res, shift, "");
1035          else
1036             res = LLVMBuildLShr(builder, res, shift, "");
1037       }
1038    }
1039
1040    return res;
1041 }
1042
/*
 * Widening mul, valid for 32x32 bit -> 64bit only.
 * Result is low 32bits, high bits returned in res_hi.
 *
 * Emits code that is meant to be compiled for the host CPU.
 *
 * The hand-written x86 path below is only taken when building against
 * LLVM < 7, for vectors of 4 or 8 elements, and when util_cpu_caps reports
 * SSE2 (enough for unsigned types) or SSE4.1 (accepted for any signedness).
 * Every other case is forwarded to lp_build_mul_32_lohi().
 *
 * bld:    build context; asserted to be a 32 bit wide, non-floating,
 *         non-fixed, non-normalized type
 * a, b:   the two operands
 * res_hi: written with the vector holding the high 32 bits
 */
LLVMValueRef
lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
                         LLVMValueRef a,
                         LLVMValueRef b,
                         LLVMValueRef *res_hi)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;

   assert(bld->type.width == 32);
   assert(bld->type.floating == 0);
   assert(bld->type.fixed == 0);
   assert(bld->type.norm == 0);

   /*
    * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces
    * for x86 simd is atrocious (even if the high bits weren't required),
    * trying to handle real 64bit inputs (which of course can't happen due
    * to using 64bit umul with 32bit numbers zero-extended to 64bit, but
    * apparently llvm does not recognize this widening mul). This includes 6
    * (instead of 2) pmuludq plus extra adds and shifts
    * The same story applies to signed mul, albeit fixing this requires sse41.
    * https://llvm.org/bugs/show_bug.cgi?id=30845
    * So, whip up our own code, albeit only for length 4 and 8 (which
    * should be good enough)...
    * FIXME: For llvm >= 7.0 we should match the autoupgrade pattern
    * (bitcast/and/mul/shuffle for unsigned, bitcast/shl/ashr/mul/shuffle
    * for signed), which the fallback code does not, without this llvm
    * will likely still produce atrocious code.
    */
   if (LLVM_VERSION_MAJOR < 7 &&
       (bld->type.length == 4 || bld->type.length == 8) &&
       ((util_cpu_caps.has_sse2 && (bld->type.sign == 0)) ||
        util_cpu_caps.has_sse4_1)) {
      const char *intrinsic = NULL;
      LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd;
      LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec;
      struct lp_type type_wide = lp_wider_type(bld->type);
      LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide);
      unsigned i;
      /*
       * Shuffle mask that copies element i+1 into slot i for every even i;
       * the odd slots are left undefined.
       */
      for (i = 0; i < bld->type.length; i += 2) {
         shuf[i] = lp_build_const_int32(gallivm, i+1);
         shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
      }
      shuf_vec = LLVMConstVector(shuf, bld->type.length);
      /*
       * The "even" operands are the inputs unchanged; the "odd" operands
       * have the odd elements moved into the even slots.
       * NOTE(review): presumably the multiply intrinsics below only read the
       * even 32bit elements of each operand - confirm against the ISA docs.
       */
      aeven = a;
      beven = b;
      aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, "");
      bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, "");

      if (util_cpu_caps.has_avx2 && bld->type.length == 8) {
         /* 8 elements with AVX2: one intrinsic call per even/odd set */
         if (bld->type.sign) {
            intrinsic = "llvm.x86.avx2.pmul.dq";
         } else {
            intrinsic = "llvm.x86.avx2.pmulu.dq";
         }
         muleven = lp_build_intrinsic_binary(builder, intrinsic,
                                             wider_type, aeven, beven);
         mulodd = lp_build_intrinsic_binary(builder, intrinsic,
                                            wider_type, aodd, bodd);
      }
      else {
         /* for consistent naming look elsewhere... */
         if (bld->type.sign) {
            intrinsic = "llvm.x86.sse41.pmuldq";
         } else {
            intrinsic = "llvm.x86.sse2.pmulu.dq";
         }
         /*
          * XXX If we only have AVX but not AVX2 this is a pain.
          * lp_build_intrinsic_binary_anylength() can't handle it
          * (due to src and dst type not being identical).
          */
         if (bld->type.length == 8) {
            /*
             * Split every operand into two 4-element halves, multiply the
             * halves separately and concatenate the 2-element wide results.
             */
            LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi;
            LLVMValueRef aoddlo, aoddhi, boddlo, boddhi;
            LLVMValueRef muleven2[2], mulodd2[2];
            struct lp_type type_wide_half = type_wide;
            LLVMTypeRef wtype_half;
            type_wide_half.length = 2;
            wtype_half = lp_build_vec_type(gallivm, type_wide_half);
            aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4);
            aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4);
            bevenlo = lp_build_extract_range(gallivm, beven, 0, 4);
            bevenhi = lp_build_extract_range(gallivm, beven, 4, 4);
            aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4);
            aoddhi = lp_build_extract_range(gallivm, aodd, 4, 4);
            boddlo = lp_build_extract_range(gallivm, bodd, 0, 4);
            boddhi = lp_build_extract_range(gallivm, bodd, 4, 4);
            muleven2[0] = lp_build_intrinsic_binary(builder, intrinsic,
                                                    wtype_half, aevenlo, bevenlo);
            mulodd2[0] = lp_build_intrinsic_binary(builder, intrinsic,
                                                   wtype_half, aoddlo, boddlo);
            muleven2[1] = lp_build_intrinsic_binary(builder, intrinsic,
                                                    wtype_half, aevenhi, bevenhi);
            mulodd2[1] = lp_build_intrinsic_binary(builder, intrinsic,
                                                   wtype_half, aoddhi, boddhi);
            muleven = lp_build_concat(gallivm, muleven2, type_wide_half, 2);
            mulodd = lp_build_concat(gallivm, mulodd2, type_wide_half, 2);

         }
         else {
            /* 4 elements: the intrinsic covers the whole vector */
            muleven = lp_build_intrinsic_binary(builder, intrinsic,
                                                wider_type, aeven, beven);
            mulodd = lp_build_intrinsic_binary(builder, intrinsic,
                                               wider_type, aodd, bodd);
         }
      }
      /* View the wide products as vectors of the original 32bit type. */
      muleven = LLVMBuildBitCast(builder, muleven, bld->vec_type, "");
      mulodd = LLVMBuildBitCast(builder, mulodd, bld->vec_type, "");

      /*
       * High result: interleave element i+1 of muleven with element i+1 of
       * mulodd (indices >= length select from the second shuffle operand).
       * NOTE(review): this assumes the upper half of each wide product sits
       * in the odd 32bit element, i.e. little endian layout - confirm.
       */
      for (i = 0; i < bld->type.length; i += 2) {
         shuf[i] = lp_build_const_int32(gallivm, i + 1);
         shuf[i+1] = lp_build_const_int32(gallivm, i + 1 + bld->type.length);
      }
      shuf_vec = LLVMConstVector(shuf, bld->type.length);
      *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");

      /* Low result: same interleave, taking element i of each product. */
      for (i = 0; i < bld->type.length; i += 2) {
         shuf[i] = lp_build_const_int32(gallivm, i);
         shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length);
      }
      shuf_vec = LLVMConstVector(shuf, bld->type.length);
      return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
   }
   else {
      /* Anything else: emit the generic extend/mul/trunc sequence. */
      return lp_build_mul_32_lohi(bld, a, b, res_hi);
   }
}
1178
1179
1180 /*
1181  * Widening mul, valid for 32x32 bit -> 64bit only.
1182  * Result is low 32bits, high bits returned in res_hi.
1183  *
1184  * Emits generic code.
1185  */
1186 LLVMValueRef
1187 lp_build_mul_32_lohi(struct lp_build_context *bld,
1188                      LLVMValueRef a,
1189                      LLVMValueRef b,
1190                      LLVMValueRef *res_hi)
1191 {
1192    struct gallivm_state *gallivm = bld->gallivm;
1193    LLVMBuilderRef builder = gallivm->builder;
1194    LLVMValueRef tmp, shift, res_lo;
1195    struct lp_type type_tmp;
1196    LLVMTypeRef wide_type, narrow_type;
1197
1198    type_tmp = bld->type;
1199    narrow_type = lp_build_vec_type(gallivm, type_tmp);
1200    type_tmp.width *= 2;
1201    wide_type = lp_build_vec_type(gallivm, type_tmp);
1202    shift = lp_build_const_vec(gallivm, type_tmp, 32);
1203
1204    if (bld->type.sign) {
1205       a = LLVMBuildSExt(builder, a, wide_type, "");
1206       b = LLVMBuildSExt(builder, b, wide_type, "");
1207    } else {
1208       a = LLVMBuildZExt(builder, a, wide_type, "");
1209       b = LLVMBuildZExt(builder, b, wide_type, "");
1210    }
1211    tmp = LLVMBuildMul(builder, a, b, "");
1212
1213    res_lo = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1214
1215    /* Since we truncate anyway, LShr and AShr are equivalent. */
1216    tmp = LLVMBuildLShr(builder, tmp, shift, "");
1217    *res_hi = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1218
1219    return res_lo;
1220 }
1221
1222
1223 /* a * b + c */
1224 LLVMValueRef
1225 lp_build_mad(struct lp_build_context *bld,
1226              LLVMValueRef a,
1227              LLVMValueRef b,
1228              LLVMValueRef c)
1229 {
1230    const struct lp_type type = bld->type;
1231    if (type.floating) {
1232       return lp_build_fmuladd(bld->gallivm->builder, a, b, c);
1233    } else {
1234       return lp_build_add(bld, lp_build_mul(bld, a, b), c);
1235    }
1236 }
1237
1238
1239 /**
1240  * Small vector x scale multiplication optimization.
1241  */
1242 LLVMValueRef
1243 lp_build_mul_imm(struct lp_build_context *bld,
1244                  LLVMValueRef a,
1245                  int b)
1246 {
1247    LLVMBuilderRef builder = bld->gallivm->builder;
1248    LLVMValueRef factor;
1249
1250    assert(lp_check_value(bld->type, a));
1251
1252    if(b == 0)
1253       return bld->zero;
1254
1255    if(b == 1)
1256       return a;
1257
1258    if(b == -1)
1259       return lp_build_negate(bld, a);
1260
1261    if(b == 2 && bld->type.floating)
1262       return lp_build_add(bld, a, a);
1263
1264    if(util_is_power_of_two_or_zero(b)) {
1265       unsigned shift = ffs(b) - 1;
1266
1267       if(bld->type.floating) {
1268 #if 0
1269          /*
1270           * Power of two multiplication by directly manipulating the exponent.
1271           *
1272           * XXX: This might not be always faster, it will introduce a small error
1273           * for multiplication by zero, and it will produce wrong results
1274           * for Inf and NaN.
1275           */
1276          unsigned mantissa = lp_mantissa(bld->type);
1277          factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1278          a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1279          a = LLVMBuildAdd(builder, a, factor, "");
1280          a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1281          return a;
1282 #endif
1283       }
1284       else {
1285          factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1286          return LLVMBuildShl(builder, a, factor, "");
1287       }
1288    }
1289
1290    factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1291    return lp_build_mul(bld, a, factor);
1292 }
1293
1294
1295 /**
1296  * Generate a / b
1297  */
1298 LLVMValueRef
1299 lp_build_div(struct lp_build_context *bld,
1300              LLVMValueRef a,
1301              LLVMValueRef b)
1302 {
1303    LLVMBuilderRef builder = bld->gallivm->builder;
1304    const struct lp_type type = bld->type;
1305
1306    assert(lp_check_value(type, a));
1307    assert(lp_check_value(type, b));
1308
1309    if(a == bld->zero)
1310       return bld->zero;
1311    if(a == bld->one && type.floating)
1312       return lp_build_rcp(bld, b);
1313    if(b == bld->zero)
1314       return bld->undef;
1315    if(b == bld->one)
1316       return a;
1317    if(a == bld->undef || b == bld->undef)
1318       return bld->undef;
1319
1320    if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1321       if (type.floating)
1322          return LLVMConstFDiv(a, b);
1323       else if (type.sign)
1324          return LLVMConstSDiv(a, b);
1325       else
1326          return LLVMConstUDiv(a, b);
1327    }
1328
1329    /* fast rcp is disabled (just uses div), so makes no sense to try that */
1330    if(FALSE &&
1331       ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1332        (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
1333       type.floating)
1334       return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1335
1336    if (type.floating)
1337       return LLVMBuildFDiv(builder, a, b, "");
1338    else if (type.sign)
1339       return LLVMBuildSDiv(builder, a, b, "");
1340    else
1341       return LLVMBuildUDiv(builder, a, b, "");
1342 }
1343
1344
1345 /**
1346  * Linear interpolation helper.
1347  *
1348  * @param normalized whether we are interpolating normalized values,
1349  *        encoded in normalized integers, twice as wide.
1350  *
1351  * @sa http://www.stereopsis.com/doubleblend.html
1352  */
1353 static inline LLVMValueRef
1354 lp_build_lerp_simple(struct lp_build_context *bld,
1355                      LLVMValueRef x,
1356                      LLVMValueRef v0,
1357                      LLVMValueRef v1,
1358                      unsigned flags)
1359 {
1360    unsigned half_width = bld->type.width/2;
1361    LLVMBuilderRef builder = bld->gallivm->builder;
1362    LLVMValueRef delta;
1363    LLVMValueRef res;
1364
1365    assert(lp_check_value(bld->type, x));
1366    assert(lp_check_value(bld->type, v0));
1367    assert(lp_check_value(bld->type, v1));
1368
1369    delta = lp_build_sub(bld, v1, v0);
1370
1371    if (bld->type.floating) {
1372       assert(flags == 0);
1373       return lp_build_mad(bld, x, delta, v0);
1374    }
1375
1376    if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1377       if (!bld->type.sign) {
1378          if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1379             /*
1380              * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1381              * most-significant-bit to the lowest-significant-bit, so that
1382              * later we can just divide by 2**n instead of 2**n - 1.
1383              */
1384
1385             x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1386          }
1387
1388          /* (x * delta) >> n */
1389          res = lp_build_mul(bld, x, delta);
1390          res = lp_build_shr_imm(bld, res, half_width);
1391       } else {
1392          /*
1393           * The rescaling trick above doesn't work for signed numbers, so
1394           * use the 2**n - 1 divison approximation in lp_build_mul_norm
1395           * instead.
1396           */
1397          assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1398          res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1399       }
1400    } else {
1401       assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1402       res = lp_build_mul(bld, x, delta);
1403    }
1404
1405    if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
1406       /*
1407        * At this point both res and v0 only use the lower half of the bits,
1408        * the rest is zero. Instead of add / mask, do add with half wide type.
1409        */
1410       struct lp_type narrow_type;
1411       struct lp_build_context narrow_bld;
1412
1413       memset(&narrow_type, 0, sizeof narrow_type);
1414       narrow_type.sign   = bld->type.sign;
1415       narrow_type.width  = bld->type.width/2;
1416       narrow_type.length = bld->type.length*2;
1417
1418       lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
1419       res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
1420       v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
1421       res = lp_build_add(&narrow_bld, v0, res);
1422       res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
1423    } else {
1424       res = lp_build_add(bld, v0, res);
1425
1426       if (bld->type.fixed) {
1427          /*
1428           * We need to mask out the high order bits when lerping 8bit
1429           * normalized colors stored on 16bits
1430           */
1431          /* XXX: This step is necessary for lerping 8bit colors stored on
1432           * 16bits, but it will be wrong for true fixed point use cases.
1433           * Basically we need a more powerful lp_type, capable of further
1434           * distinguishing the values interpretation from the value storage.
1435           */
1436          LLVMValueRef low_bits;
1437          low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
1438          res = LLVMBuildAnd(builder, res, low_bits, "");
1439       }
1440    }
1441
1442    return res;
1443 }
1444
1445
1446 /**
1447  * Linear interpolation.
1448  */
1449 LLVMValueRef
1450 lp_build_lerp(struct lp_build_context *bld,
1451               LLVMValueRef x,
1452               LLVMValueRef v0,
1453               LLVMValueRef v1,
1454               unsigned flags)
1455 {
1456    const struct lp_type type = bld->type;
1457    LLVMValueRef res;
1458
1459    assert(lp_check_value(type, x));
1460    assert(lp_check_value(type, v0));
1461    assert(lp_check_value(type, v1));
1462
1463    assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1464
1465    if (type.norm) {
1466       struct lp_type wide_type;
1467       struct lp_build_context wide_bld;
1468       LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1469
1470       assert(type.length >= 2);
1471
1472       /*
1473        * Create a wider integer type, enough to hold the
1474        * intermediate result of the multiplication.
1475        */
1476       memset(&wide_type, 0, sizeof wide_type);
1477       wide_type.sign   = type.sign;
1478       wide_type.width  = type.width*2;
1479       wide_type.length = type.length/2;
1480
1481       lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1482
1483       lp_build_unpack2_native(bld->gallivm, type, wide_type, x,  &xl,  &xh);
1484       lp_build_unpack2_native(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1485       lp_build_unpack2_native(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1486
1487       /*
1488        * Lerp both halves.
1489        */
1490
1491       flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1492
1493       resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1494       resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1495
1496       res = lp_build_pack2_native(bld->gallivm, wide_type, type, resl, resh);
1497    } else {
1498       res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1499    }
1500
1501    return res;
1502 }
1503
1504
1505 /**
1506  * Bilinear interpolation.
1507  *
1508  * Values indices are in v_{yx}.
1509  */
1510 LLVMValueRef
1511 lp_build_lerp_2d(struct lp_build_context *bld,
1512                  LLVMValueRef x,
1513                  LLVMValueRef y,
1514                  LLVMValueRef v00,
1515                  LLVMValueRef v01,
1516                  LLVMValueRef v10,
1517                  LLVMValueRef v11,
1518                  unsigned flags)
1519 {
1520    LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1521    LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1522    return lp_build_lerp(bld, y, v0, v1, flags);
1523 }
1524
1525
1526 LLVMValueRef
1527 lp_build_lerp_3d(struct lp_build_context *bld,
1528                  LLVMValueRef x,
1529                  LLVMValueRef y,
1530                  LLVMValueRef z,
1531                  LLVMValueRef v000,
1532                  LLVMValueRef v001,
1533                  LLVMValueRef v010,
1534                  LLVMValueRef v011,
1535                  LLVMValueRef v100,
1536                  LLVMValueRef v101,
1537                  LLVMValueRef v110,
1538                  LLVMValueRef v111,
1539                  unsigned flags)
1540 {
1541    LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1542    LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1543    return lp_build_lerp(bld, z, v0, v1, flags);
1544 }
1545
1546
1547 /**
1548  * Generate min(a, b)
1549  * Do checks for special cases but not for nans.
1550  */
1551 LLVMValueRef
1552 lp_build_min(struct lp_build_context *bld,
1553              LLVMValueRef a,
1554              LLVMValueRef b)
1555 {
1556    assert(lp_check_value(bld->type, a));
1557    assert(lp_check_value(bld->type, b));
1558
1559    if(a == bld->undef || b == bld->undef)
1560       return bld->undef;
1561
1562    if(a == b)
1563       return a;
1564
1565    if (bld->type.norm) {
1566       if (!bld->type.sign) {
1567          if (a == bld->zero || b == bld->zero) {
1568             return bld->zero;
1569          }
1570       }
1571       if(a == bld->one)
1572          return b;
1573       if(b == bld->one)
1574          return a;
1575    }
1576
1577    return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1578 }
1579
1580
1581 /**
1582  * Generate min(a, b)
1583  * NaN's are handled according to the behavior specified by the
1584  * nan_behavior argument.
1585  */
1586 LLVMValueRef
1587 lp_build_min_ext(struct lp_build_context *bld,
1588                  LLVMValueRef a,
1589                  LLVMValueRef b,
1590                  enum gallivm_nan_behavior nan_behavior)
1591 {
1592    assert(lp_check_value(bld->type, a));
1593    assert(lp_check_value(bld->type, b));
1594
1595    if(a == bld->undef || b == bld->undef)
1596       return bld->undef;
1597
1598    if(a == b)
1599       return a;
1600
1601    if (bld->type.norm) {
1602       if (!bld->type.sign) {
1603          if (a == bld->zero || b == bld->zero) {
1604             return bld->zero;
1605          }
1606       }
1607       if(a == bld->one)
1608          return b;
1609       if(b == bld->one)
1610          return a;
1611    }
1612
1613    return lp_build_min_simple(bld, a, b, nan_behavior);
1614 }
1615
1616 /**
1617  * Generate max(a, b)
1618  * Do checks for special cases, but NaN behavior is undefined.
1619  */
1620 LLVMValueRef
1621 lp_build_max(struct lp_build_context *bld,
1622              LLVMValueRef a,
1623              LLVMValueRef b)
1624 {
1625    assert(lp_check_value(bld->type, a));
1626    assert(lp_check_value(bld->type, b));
1627
1628    if(a == bld->undef || b == bld->undef)
1629       return bld->undef;
1630
1631    if(a == b)
1632       return a;
1633
1634    if(bld->type.norm) {
1635       if(a == bld->one || b == bld->one)
1636          return bld->one;
1637       if (!bld->type.sign) {
1638          if (a == bld->zero) {
1639             return b;
1640          }
1641          if (b == bld->zero) {
1642             return a;
1643          }
1644       }
1645    }
1646
1647    return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1648 }
1649
1650
1651 /**
1652  * Generate max(a, b)
1653  * Checks for special cases.
1654  * NaN's are handled according to the behavior specified by the
1655  * nan_behavior argument.
1656  */
1657 LLVMValueRef
1658 lp_build_max_ext(struct lp_build_context *bld,
1659                   LLVMValueRef a,
1660                   LLVMValueRef b,
1661                   enum gallivm_nan_behavior nan_behavior)
1662 {
1663    assert(lp_check_value(bld->type, a));
1664    assert(lp_check_value(bld->type, b));
1665
1666    if(a == bld->undef || b == bld->undef)
1667       return bld->undef;
1668
1669    if(a == b)
1670       return a;
1671
1672    if(bld->type.norm) {
1673       if(a == bld->one || b == bld->one)
1674          return bld->one;
1675       if (!bld->type.sign) {
1676          if (a == bld->zero) {
1677             return b;
1678          }
1679          if (b == bld->zero) {
1680             return a;
1681          }
1682       }
1683    }
1684
1685    return lp_build_max_simple(bld, a, b, nan_behavior);
1686 }
1687
1688 /**
1689  * Generate clamp(a, min, max)
1690  * NaN behavior (for any of a, min, max) is undefined.
1691  * Do checks for special cases.
1692  */
1693 LLVMValueRef
1694 lp_build_clamp(struct lp_build_context *bld,
1695                LLVMValueRef a,
1696                LLVMValueRef min,
1697                LLVMValueRef max)
1698 {
1699    assert(lp_check_value(bld->type, a));
1700    assert(lp_check_value(bld->type, min));
1701    assert(lp_check_value(bld->type, max));
1702
1703    a = lp_build_min(bld, a, max);
1704    a = lp_build_max(bld, a, min);
1705    return a;
1706 }
1707
1708
1709 /**
1710  * Generate clamp(a, 0, 1)
1711  * A NaN will get converted to zero.
1712  */
1713 LLVMValueRef
1714 lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
1715                                 LLVMValueRef a)
1716 {
1717    a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1718    a = lp_build_min(bld, a, bld->one);
1719    return a;
1720 }
1721
1722
1723 /**
1724  * Generate abs(a)
1725  */
1726 LLVMValueRef
1727 lp_build_abs(struct lp_build_context *bld,
1728              LLVMValueRef a)
1729 {
1730    LLVMBuilderRef builder = bld->gallivm->builder;
1731    const struct lp_type type = bld->type;
1732    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1733
1734    assert(lp_check_value(type, a));
1735
1736    if(!type.sign)
1737       return a;
1738
1739    if(type.floating) {
1740       char intrinsic[32];
1741       lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
1742       return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
1743    }
1744
1745    if(type.width*type.length == 128 && util_cpu_caps.has_ssse3 && LLVM_VERSION_MAJOR < 6) {
1746       switch(type.width) {
1747       case 8:
1748          return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1749       case 16:
1750          return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1751       case 32:
1752          return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1753       }
1754    }
1755    else if (type.width*type.length == 256 && util_cpu_caps.has_avx2 && LLVM_VERSION_MAJOR < 6) {
1756       switch(type.width) {
1757       case 8:
1758          return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a);
1759       case 16:
1760          return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.w", vec_type, a);
1761       case 32:
1762          return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.d", vec_type, a);
1763       }
1764    }
1765
1766    return lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero),
1767                           a, LLVMBuildNeg(builder, a, ""));
1768 }
1769
1770
1771 LLVMValueRef
1772 lp_build_negate(struct lp_build_context *bld,
1773                 LLVMValueRef a)
1774 {
1775    LLVMBuilderRef builder = bld->gallivm->builder;
1776
1777    assert(lp_check_value(bld->type, a));
1778
1779    if (bld->type.floating)
1780       a = LLVMBuildFNeg(builder, a, "");
1781    else
1782       a = LLVMBuildNeg(builder, a, "");
1783
1784    return a;
1785 }
1786
1787
1788 /** Return -1, 0 or +1 depending on the sign of a */
1789 LLVMValueRef
1790 lp_build_sgn(struct lp_build_context *bld,
1791              LLVMValueRef a)
1792 {
1793    LLVMBuilderRef builder = bld->gallivm->builder;
1794    const struct lp_type type = bld->type;
1795    LLVMValueRef cond;
1796    LLVMValueRef res;
1797
1798    assert(lp_check_value(type, a));
1799
1800    /* Handle non-zero case */
1801    if(!type.sign) {
1802       /* if not zero then sign must be positive */
1803       res = bld->one;
1804    }
1805    else if(type.floating) {
1806       LLVMTypeRef vec_type;
1807       LLVMTypeRef int_type;
1808       LLVMValueRef mask;
1809       LLVMValueRef sign;
1810       LLVMValueRef one;
1811       unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1812
1813       int_type = lp_build_int_vec_type(bld->gallivm, type);
1814       vec_type = lp_build_vec_type(bld->gallivm, type);
1815       mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1816
1817       /* Take the sign bit and add it to 1 constant */
1818       sign = LLVMBuildBitCast(builder, a, int_type, "");
1819       sign = LLVMBuildAnd(builder, sign, mask, "");
1820       one = LLVMConstBitCast(bld->one, int_type);
1821       res = LLVMBuildOr(builder, sign, one, "");
1822       res = LLVMBuildBitCast(builder, res, vec_type, "");
1823    }
1824    else
1825    {
1826       /* signed int/norm/fixed point */
1827       /* could use psign with sse3 and appropriate vectors here */
1828       LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1829       cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1830       res = lp_build_select(bld, cond, bld->one, minus_one);
1831    }
1832
1833    /* Handle zero */
1834    cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1835    res = lp_build_select(bld, cond, bld->zero, res);
1836
1837    return res;
1838 }
1839
1840
1841 /**
1842  * Set the sign of float vector 'a' according to 'sign'.
1843  * If sign==0, return abs(a).
1844  * If sign==1, return -abs(a);
1845  * Other values for sign produce undefined results.
1846  */
1847 LLVMValueRef
1848 lp_build_set_sign(struct lp_build_context *bld,
1849                   LLVMValueRef a, LLVMValueRef sign)
1850 {
1851    LLVMBuilderRef builder = bld->gallivm->builder;
1852    const struct lp_type type = bld->type;
1853    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1854    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1855    LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1856    LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1857                              ~((unsigned long long) 1 << (type.width - 1)));
1858    LLVMValueRef val, res;
1859
1860    assert(type.floating);
1861    assert(lp_check_value(type, a));
1862
1863    /* val = reinterpret_cast<int>(a) */
1864    val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1865    /* val = val & mask */
1866    val = LLVMBuildAnd(builder, val, mask, "");
1867    /* sign = sign << shift */
1868    sign = LLVMBuildShl(builder, sign, shift, "");
1869    /* res = val | sign */
1870    res = LLVMBuildOr(builder, val, sign, "");
1871    /* res = reinterpret_cast<float>(res) */
1872    res = LLVMBuildBitCast(builder, res, vec_type, "");
1873
1874    return res;
1875 }
1876
1877
1878 /**
1879  * Convert vector of (or scalar) int to vector of (or scalar) float.
1880  */
1881 LLVMValueRef
1882 lp_build_int_to_float(struct lp_build_context *bld,
1883                       LLVMValueRef a)
1884 {
1885    LLVMBuilderRef builder = bld->gallivm->builder;
1886    const struct lp_type type = bld->type;
1887    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1888
1889    assert(type.floating);
1890
1891    return LLVMBuildSIToFP(builder, a, vec_type, "");
1892 }
1893
1894 static boolean
1895 arch_rounding_available(const struct lp_type type)
1896 {
1897    if ((util_cpu_caps.has_sse4_1 &&
1898        (type.length == 1 || type.width*type.length == 128)) ||
1899        (util_cpu_caps.has_avx && type.width*type.length == 256) ||
1900        (util_cpu_caps.has_avx512f && type.width*type.length == 512))
1901       return TRUE;
1902    else if ((util_cpu_caps.has_altivec &&
1903             (type.width == 32 && type.length == 4)))
1904       return TRUE;
1905    else if (util_cpu_caps.has_neon)
1906       return TRUE;
1907
1908    return FALSE;
1909 }
1910
/**
 * Rounding modes understood by the lp_build_round_*() helpers.
 */
enum lp_build_round_mode {
   LP_BUILD_ROUND_NEAREST  = 0,  /* round to nearest integer */
   LP_BUILD_ROUND_FLOOR    = 1,  /* round down */
   LP_BUILD_ROUND_CEIL     = 2,  /* round up */
   LP_BUILD_ROUND_TRUNCATE = 3   /* round toward zero */
};
1918
1919 static inline LLVMValueRef
1920 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1921                              LLVMValueRef a)
1922 {
1923    LLVMBuilderRef builder = bld->gallivm->builder;
1924    const struct lp_type type = bld->type;
1925    LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1926    LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1927    const char *intrinsic;
1928    LLVMValueRef res;
1929
1930    assert(type.floating);
1931    /* using the double precision conversions is a bit more complicated */
1932    assert(type.width == 32);
1933
1934    assert(lp_check_value(type, a));
1935    assert(util_cpu_caps.has_sse2);
1936
1937    /* This is relying on MXCSR rounding mode, which should always be nearest. */
1938    if (type.length == 1) {
1939       LLVMTypeRef vec_type;
1940       LLVMValueRef undef;
1941       LLVMValueRef arg;
1942       LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1943
1944       vec_type = LLVMVectorType(bld->elem_type, 4);
1945
1946       intrinsic = "llvm.x86.sse.cvtss2si";
1947
1948       undef = LLVMGetUndef(vec_type);
1949
1950       arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
1951
1952       res = lp_build_intrinsic_unary(builder, intrinsic,
1953                                      ret_type, arg);
1954    }
1955    else {
1956       if (type.width* type.length == 128) {
1957          intrinsic = "llvm.x86.sse2.cvtps2dq";
1958       }
1959       else {
1960          assert(type.width*type.length == 256);
1961          assert(util_cpu_caps.has_avx);
1962
1963          intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
1964       }
1965       res = lp_build_intrinsic_unary(builder, intrinsic,
1966                                      ret_type, a);
1967    }
1968
1969    return res;
1970 }
1971
1972
1973 /*
1974  */
1975 static inline LLVMValueRef
1976 lp_build_round_altivec(struct lp_build_context *bld,
1977                        LLVMValueRef a,
1978                        enum lp_build_round_mode mode)
1979 {
1980    LLVMBuilderRef builder = bld->gallivm->builder;
1981    const struct lp_type type = bld->type;
1982    const char *intrinsic = NULL;
1983
1984    assert(type.floating);
1985
1986    assert(lp_check_value(type, a));
1987    assert(util_cpu_caps.has_altivec);
1988
1989    (void)type;
1990
1991    switch (mode) {
1992    case LP_BUILD_ROUND_NEAREST:
1993       intrinsic = "llvm.ppc.altivec.vrfin";
1994       break;
1995    case LP_BUILD_ROUND_FLOOR:
1996       intrinsic = "llvm.ppc.altivec.vrfim";
1997       break;
1998    case LP_BUILD_ROUND_CEIL:
1999       intrinsic = "llvm.ppc.altivec.vrfip";
2000       break;
2001    case LP_BUILD_ROUND_TRUNCATE:
2002       intrinsic = "llvm.ppc.altivec.vrfiz";
2003       break;
2004    }
2005
2006    return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2007 }
2008
2009 static inline LLVMValueRef
2010 lp_build_round_arch(struct lp_build_context *bld,
2011                     LLVMValueRef a,
2012                     enum lp_build_round_mode mode)
2013 {
2014    if (util_cpu_caps.has_sse4_1 || util_cpu_caps.has_neon) {
2015       LLVMBuilderRef builder = bld->gallivm->builder;
2016       const struct lp_type type = bld->type;
2017       const char *intrinsic_root;
2018       char intrinsic[32];
2019
2020       assert(type.floating);
2021       assert(lp_check_value(type, a));
2022       (void)type;
2023
2024       switch (mode) {
2025       case LP_BUILD_ROUND_NEAREST:
2026          intrinsic_root = "llvm.nearbyint";
2027          break;
2028       case LP_BUILD_ROUND_FLOOR:
2029          intrinsic_root = "llvm.floor";
2030          break;
2031       case LP_BUILD_ROUND_CEIL:
2032          intrinsic_root = "llvm.ceil";
2033          break;
2034       case LP_BUILD_ROUND_TRUNCATE:
2035          intrinsic_root = "llvm.trunc";
2036          break;
2037       }
2038
2039       lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type);
2040       return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2041    }
2042    else /* (util_cpu_caps.has_altivec) */
2043      return lp_build_round_altivec(bld, a, mode);
2044 }
2045
2046 /**
2047  * Return the integer part of a float (vector) value (== round toward zero).
2048  * The returned value is a float (vector).
2049  * Ex: trunc(-1.5) = -1.0
2050  */
2051 LLVMValueRef
2052 lp_build_trunc(struct lp_build_context *bld,
2053                LLVMValueRef a)
2054 {
2055    LLVMBuilderRef builder = bld->gallivm->builder;
2056    const struct lp_type type = bld->type;
2057
2058    assert(type.floating);
2059    assert(lp_check_value(type, a));
2060
2061    if (arch_rounding_available(type)) {
2062       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
2063    }
2064    else {
2065       const struct lp_type type = bld->type;
2066       struct lp_type inttype;
2067       struct lp_build_context intbld;
2068       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2069       LLVMValueRef trunc, res, anosign, mask;
2070       LLVMTypeRef int_vec_type = bld->int_vec_type;
2071       LLVMTypeRef vec_type = bld->vec_type;
2072
2073       inttype = type;
2074       inttype.floating = 0;
2075       lp_build_context_init(&intbld, bld->gallivm, inttype);
2076
2077       /* round by truncation */
2078       trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2079       res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2080
2081       /* mask out sign bit */
2082       anosign = lp_build_abs(bld, a);
2083       /*
2084        * mask out all values if anosign > 2^24
2085        * This should work both for large ints (all rounding is no-op for them
2086        * because such floats are always exact) as well as special cases like
2087        * NaNs, Infs (taking advantage of the fact they use max exponent).
2088        * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
2089        */
2090       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2091       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2092       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2093       return lp_build_select(bld, mask, a, res);
2094    }
2095 }
2096
2097
2098 /**
2099  * Return float (vector) rounded to nearest integer (vector).  The returned
2100  * value is a float (vector).
2101  * Ex: round(0.9) = 1.0
2102  * Ex: round(-1.5) = -2.0
2103  */
2104 LLVMValueRef
2105 lp_build_round(struct lp_build_context *bld,
2106                LLVMValueRef a)
2107 {
2108    LLVMBuilderRef builder = bld->gallivm->builder;
2109    const struct lp_type type = bld->type;
2110
2111    assert(type.floating);
2112    assert(lp_check_value(type, a));
2113
2114    if (arch_rounding_available(type)) {
2115       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2116    }
2117    else {
2118       const struct lp_type type = bld->type;
2119       struct lp_type inttype;
2120       struct lp_build_context intbld;
2121       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2122       LLVMValueRef res, anosign, mask;
2123       LLVMTypeRef int_vec_type = bld->int_vec_type;
2124       LLVMTypeRef vec_type = bld->vec_type;
2125
2126       inttype = type;
2127       inttype.floating = 0;
2128       lp_build_context_init(&intbld, bld->gallivm, inttype);
2129
2130       res = lp_build_iround(bld, a);
2131       res = LLVMBuildSIToFP(builder, res, vec_type, "");
2132
2133       /* mask out sign bit */
2134       anosign = lp_build_abs(bld, a);
2135       /*
2136        * mask out all values if anosign > 2^24
2137        * This should work both for large ints (all rounding is no-op for them
2138        * because such floats are always exact) as well as special cases like
2139        * NaNs, Infs (taking advantage of the fact they use max exponent).
2140        * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
2141        */
2142       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2143       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2144       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2145       return lp_build_select(bld, mask, a, res);
2146    }
2147 }
2148
2149
2150 /**
2151  * Return floor of float (vector), result is a float (vector)
2152  * Ex: floor(1.1) = 1.0
2153  * Ex: floor(-1.1) = -2.0
2154  */
2155 LLVMValueRef
2156 lp_build_floor(struct lp_build_context *bld,
2157                LLVMValueRef a)
2158 {
2159    LLVMBuilderRef builder = bld->gallivm->builder;
2160    const struct lp_type type = bld->type;
2161
2162    assert(type.floating);
2163    assert(lp_check_value(type, a));
2164
2165    if (arch_rounding_available(type)) {
2166       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2167    }
2168    else {
2169       const struct lp_type type = bld->type;
2170       struct lp_type inttype;
2171       struct lp_build_context intbld;
2172       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2173       LLVMValueRef trunc, res, anosign, mask;
2174       LLVMTypeRef int_vec_type = bld->int_vec_type;
2175       LLVMTypeRef vec_type = bld->vec_type;
2176
2177       if (type.width != 32) {
2178          char intrinsic[32];
2179          lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type);
2180          return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2181       }
2182
2183       assert(type.width == 32); /* might want to handle doubles at some point */
2184
2185       inttype = type;
2186       inttype.floating = 0;
2187       lp_build_context_init(&intbld, bld->gallivm, inttype);
2188
2189       /* round by truncation */
2190       trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2191       res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2192
2193       if (type.sign) {
2194          LLVMValueRef tmp;
2195
2196          /*
2197           * fix values if rounding is wrong (for non-special cases)
2198           * - this is the case if trunc > a
2199           */
2200          mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
2201          /* tmp = trunc > a ? 1.0 : 0.0 */
2202          tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2203          tmp = lp_build_and(&intbld, mask, tmp);
2204          tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2205          res = lp_build_sub(bld, res, tmp);
2206       }
2207
2208       /* mask out sign bit */
2209       anosign = lp_build_abs(bld, a);
2210       /*
2211        * mask out all values if anosign > 2^24
2212        * This should work both for large ints (all rounding is no-op for them
2213        * because such floats are always exact) as well as special cases like
2214        * NaNs, Infs (taking advantage of the fact they use max exponent).
2215        * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
2216        */
2217       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2218       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2219       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2220       return lp_build_select(bld, mask, a, res);
2221    }
2222 }
2223
2224
2225 /**
2226  * Return ceiling of float (vector), returning float (vector).
2227  * Ex: ceil( 1.1) = 2.0
2228  * Ex: ceil(-1.1) = -1.0
2229  */
2230 LLVMValueRef
2231 lp_build_ceil(struct lp_build_context *bld,
2232               LLVMValueRef a)
2233 {
2234    LLVMBuilderRef builder = bld->gallivm->builder;
2235    const struct lp_type type = bld->type;
2236
2237    assert(type.floating);
2238    assert(lp_check_value(type, a));
2239
2240    if (arch_rounding_available(type)) {
2241       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2242    }
2243    else {
2244       const struct lp_type type = bld->type;
2245       struct lp_type inttype;
2246       struct lp_build_context intbld;
2247       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2248       LLVMValueRef trunc, res, anosign, mask, tmp;
2249       LLVMTypeRef int_vec_type = bld->int_vec_type;
2250       LLVMTypeRef vec_type = bld->vec_type;
2251
2252       if (type.width != 32) {
2253          char intrinsic[32];
2254          lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type);
2255          return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2256       }
2257
2258       assert(type.width == 32); /* might want to handle doubles at some point */
2259
2260       inttype = type;
2261       inttype.floating = 0;
2262       lp_build_context_init(&intbld, bld->gallivm, inttype);
2263
2264       /* round by truncation */
2265       trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2266       trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2267
2268       /*
2269        * fix values if rounding is wrong (for non-special cases)
2270        * - this is the case if trunc < a
2271        */
2272       mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2273       /* tmp = trunc < a ? 1.0 : 0.0 */
2274       tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2275       tmp = lp_build_and(&intbld, mask, tmp);
2276       tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2277       res = lp_build_add(bld, trunc, tmp);
2278
2279       /* mask out sign bit */
2280       anosign = lp_build_abs(bld, a);
2281       /*
2282        * mask out all values if anosign > 2^24
2283        * This should work both for large ints (all rounding is no-op for them
2284        * because such floats are always exact) as well as special cases like
2285        * NaNs, Infs (taking advantage of the fact they use max exponent).
2286        * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
2287        */
2288       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2289       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2290       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2291       return lp_build_select(bld, mask, a, res);
2292    }
2293 }
2294
2295
2296 /**
2297  * Return fractional part of 'a' computed as a - floor(a)
2298  * Typically used in texture coord arithmetic.
2299  */
2300 LLVMValueRef
2301 lp_build_fract(struct lp_build_context *bld,
2302                LLVMValueRef a)
2303 {
2304    assert(bld->type.floating);
2305    return lp_build_sub(bld, a, lp_build_floor(bld, a));
2306 }
2307
2308
2309 /**
2310  * Prevent returning 1.0 for very small negative values of 'a' by clamping
2311  * against 0.99999(9). (Will also return that value for NaNs.)
2312  */
2313 static inline LLVMValueRef
2314 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2315 {
2316    LLVMValueRef max;
2317
2318    /* this is the largest number smaller than 1.0 representable as float */
2319    max = lp_build_const_vec(bld->gallivm, bld->type,
2320                             1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2321    return lp_build_min_ext(bld, fract, max,
2322                            GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
2323 }
2324
2325
2326 /**
2327  * Same as lp_build_fract, but guarantees that the result is always smaller
2328  * than one. Will also return the smaller-than-one value for infs, NaNs.
2329  */
2330 LLVMValueRef
2331 lp_build_fract_safe(struct lp_build_context *bld,
2332                     LLVMValueRef a)
2333 {
2334    return clamp_fract(bld, lp_build_fract(bld, a));
2335 }
2336
2337
2338 /**
2339  * Return the integer part of a float (vector) value (== round toward zero).
2340  * The returned value is an integer (vector).
2341  * Ex: itrunc(-1.5) = -1
2342  */
2343 LLVMValueRef
2344 lp_build_itrunc(struct lp_build_context *bld,
2345                 LLVMValueRef a)
2346 {
2347    LLVMBuilderRef builder = bld->gallivm->builder;
2348    const struct lp_type type = bld->type;
2349    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2350
2351    assert(type.floating);
2352    assert(lp_check_value(type, a));
2353
2354    return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2355 }
2356
2357
2358 /**
2359  * Return float (vector) rounded to nearest integer (vector).  The returned
2360  * value is an integer (vector).
2361  * Ex: iround(0.9) = 1
2362  * Ex: iround(-1.5) = -2
2363  */
2364 LLVMValueRef
2365 lp_build_iround(struct lp_build_context *bld,
2366                 LLVMValueRef a)
2367 {
2368    LLVMBuilderRef builder = bld->gallivm->builder;
2369    const struct lp_type type = bld->type;
2370    LLVMTypeRef int_vec_type = bld->int_vec_type;
2371    LLVMValueRef res;
2372
2373    assert(type.floating);
2374
2375    assert(lp_check_value(type, a));
2376
2377    if ((util_cpu_caps.has_sse2 &&
2378        ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2379        (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2380       return lp_build_iround_nearest_sse2(bld, a);
2381    }
2382    if (arch_rounding_available(type)) {
2383       res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2384    }
2385    else {
2386       LLVMValueRef half;
2387
2388       half = lp_build_const_vec(bld->gallivm, type, nextafterf(0.5, 0.0));
2389
2390       if (type.sign) {
2391          LLVMTypeRef vec_type = bld->vec_type;
2392          LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2393                                     (unsigned long long)1 << (type.width - 1));
2394          LLVMValueRef sign;
2395
2396          /* get sign bit */
2397          sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2398          sign = LLVMBuildAnd(builder, sign, mask, "");
2399
2400          /* sign * 0.5 */
2401          half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2402          half = LLVMBuildOr(builder, sign, half, "");
2403          half = LLVMBuildBitCast(builder, half, vec_type, "");
2404       }
2405
2406       res = LLVMBuildFAdd(builder, a, half, "");
2407    }
2408
2409    res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2410
2411    return res;
2412 }
2413
2414
2415 /**
2416  * Return floor of float (vector), result is an int (vector)
2417  * Ex: ifloor(1.1) = 1.0
2418  * Ex: ifloor(-1.1) = -2.0
2419  */
2420 LLVMValueRef
2421 lp_build_ifloor(struct lp_build_context *bld,
2422                 LLVMValueRef a)
2423 {
2424    LLVMBuilderRef builder = bld->gallivm->builder;
2425    const struct lp_type type = bld->type;
2426    LLVMTypeRef int_vec_type = bld->int_vec_type;
2427    LLVMValueRef res;
2428
2429    assert(type.floating);
2430    assert(lp_check_value(type, a));
2431
2432    res = a;
2433    if (type.sign) {
2434       if (arch_rounding_available(type)) {
2435          res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2436       }
2437       else {
2438          struct lp_type inttype;
2439          struct lp_build_context intbld;
2440          LLVMValueRef trunc, itrunc, mask;
2441
2442          assert(type.floating);
2443          assert(lp_check_value(type, a));
2444
2445          inttype = type;
2446          inttype.floating = 0;
2447          lp_build_context_init(&intbld, bld->gallivm, inttype);
2448
2449          /* round by truncation */
2450          itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2451          trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2452
2453          /*
2454           * fix values if rounding is wrong (for non-special cases)
2455           * - this is the case if trunc > a
2456           * The results of doing this with NaNs, very large values etc.
2457           * are undefined but this seems to be the case anyway.
2458           */
2459          mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2460          /* cheapie minus one with mask since the mask is minus one / zero */
2461          return lp_build_add(&intbld, itrunc, mask);
2462       }
2463    }
2464
2465    /* round to nearest (toward zero) */
2466    res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2467
2468    return res;
2469 }
2470
2471
2472 /**
2473  * Return ceiling of float (vector), returning int (vector).
2474  * Ex: iceil( 1.1) = 2
2475  * Ex: iceil(-1.1) = -1
2476  */
2477 LLVMValueRef
2478 lp_build_iceil(struct lp_build_context *bld,
2479                LLVMValueRef a)
2480 {
2481    LLVMBuilderRef builder = bld->gallivm->builder;
2482    const struct lp_type type = bld->type;
2483    LLVMTypeRef int_vec_type = bld->int_vec_type;
2484    LLVMValueRef res;
2485
2486    assert(type.floating);
2487    assert(lp_check_value(type, a));
2488
2489    if (arch_rounding_available(type)) {
2490       res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2491    }
2492    else {
2493       struct lp_type inttype;
2494       struct lp_build_context intbld;
2495       LLVMValueRef trunc, itrunc, mask;
2496
2497       assert(type.floating);
2498       assert(lp_check_value(type, a));
2499
2500       inttype = type;
2501       inttype.floating = 0;
2502       lp_build_context_init(&intbld, bld->gallivm, inttype);
2503
2504       /* round by truncation */
2505       itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2506       trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2507
2508       /*
2509        * fix values if rounding is wrong (for non-special cases)
2510        * - this is the case if trunc < a
2511        * The results of doing this with NaNs, very large values etc.
2512        * are undefined but this seems to be the case anyway.
2513        */
2514       mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2515       /* cheapie plus one with mask since the mask is minus one / zero */
2516       return lp_build_sub(&intbld, itrunc, mask);
2517    }
2518
2519    /* round to nearest (toward zero) */
2520    res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2521
2522    return res;
2523 }
2524
2525
2526 /**
2527  * Combined ifloor() & fract().
2528  *
2529  * Preferred to calling the functions separately, as it will ensure that the
2530  * strategy (floor() vs ifloor()) that results in less redundant work is used.
2531  */
2532 void
2533 lp_build_ifloor_fract(struct lp_build_context *bld,
2534                       LLVMValueRef a,
2535                       LLVMValueRef *out_ipart,
2536                       LLVMValueRef *out_fpart)
2537 {
2538    LLVMBuilderRef builder = bld->gallivm->builder;
2539    const struct lp_type type = bld->type;
2540    LLVMValueRef ipart;
2541
2542    assert(type.floating);
2543    assert(lp_check_value(type, a));
2544
2545    if (arch_rounding_available(type)) {
2546       /*
2547        * floor() is easier.
2548        */
2549
2550       ipart = lp_build_floor(bld, a);
2551       *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2552       *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2553    }
2554    else {
2555       /*
2556        * ifloor() is easier.
2557        */
2558
2559       *out_ipart = lp_build_ifloor(bld, a);
2560       ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2561       *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2562    }
2563 }
2564
2565
2566 /**
2567  * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2568  * always smaller than one.
2569  */
2570 void
2571 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2572                            LLVMValueRef a,
2573                            LLVMValueRef *out_ipart,
2574                            LLVMValueRef *out_fpart)
2575 {
2576    lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2577    *out_fpart = clamp_fract(bld, *out_fpart);
2578 }
2579
2580
2581 LLVMValueRef
2582 lp_build_sqrt(struct lp_build_context *bld,
2583               LLVMValueRef a)
2584 {
2585    LLVMBuilderRef builder = bld->gallivm->builder;
2586    const struct lp_type type = bld->type;
2587    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2588    char intrinsic[32];
2589
2590    assert(lp_check_value(type, a));
2591
2592    assert(type.floating);
2593    lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);
2594
2595    return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2596 }
2597
2598
2599 /**
2600  * Do one Newton-Raphson step to improve reciprocate precision:
2601  *
2602  *   x_{i+1} = x_i + x_i * (1 - a * x_i)
2603  *
2604  * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2605  * +/-Inf, giving NaN instead.  Certain applications rely on this behavior,
2606  * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
2607  * halo. It would be necessary to clamp the argument to prevent this.
2608  *
2609  * See also:
2610  * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2611  * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2612  */
2613 static inline LLVMValueRef
2614 lp_build_rcp_refine(struct lp_build_context *bld,
2615                     LLVMValueRef a,
2616                     LLVMValueRef rcp_a)
2617 {
2618    LLVMBuilderRef builder = bld->gallivm->builder;
2619    LLVMValueRef neg_a;
2620    LLVMValueRef res;
2621
2622    neg_a = LLVMBuildFNeg(builder, a, "");
2623    res = lp_build_fmuladd(builder, neg_a, rcp_a, bld->one);
2624    res = lp_build_fmuladd(builder, res, rcp_a, rcp_a);
2625
2626    return res;
2627 }
2628
2629
2630 LLVMValueRef
2631 lp_build_rcp(struct lp_build_context *bld,
2632              LLVMValueRef a)
2633 {
2634    LLVMBuilderRef builder = bld->gallivm->builder;
2635    const struct lp_type type = bld->type;
2636
2637    assert(lp_check_value(type, a));
2638
2639    if(a == bld->zero)
2640       return bld->undef;
2641    if(a == bld->one)
2642       return bld->one;
2643    if(a == bld->undef)
2644       return bld->undef;
2645
2646    assert(type.floating);
2647
2648    if(LLVMIsConstant(a))
2649       return LLVMConstFDiv(bld->one, a);
2650
2651    /*
2652     * We don't use RCPPS because:
2653     * - it only has 10bits of precision
2654     * - it doesn't even get the reciprocate of 1.0 exactly
2655     * - doing Newton-Rapshon steps yields wrong (NaN) values for 0.0 or Inf
2656     * - for recent processors the benefit over DIVPS is marginal, a case
2657     *   dependent
2658     *
2659     * We could still use it on certain processors if benchmarks show that the
2660     * RCPPS plus necessary workarounds are still preferrable to DIVPS; or for
2661     * particular uses that require less workarounds.
2662     */
2663
2664    if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2665          (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2666       const unsigned num_iterations = 0;
2667       LLVMValueRef res;
2668       unsigned i;
2669       const char *intrinsic = NULL;
2670
2671       if (type.length == 4) {
2672          intrinsic = "llvm.x86.sse.rcp.ps";
2673       }
2674       else {
2675          intrinsic = "llvm.x86.avx.rcp.ps.256";
2676       }
2677
2678       res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2679
2680       for (i = 0; i < num_iterations; ++i) {
2681          res = lp_build_rcp_refine(bld, a, res);
2682       }
2683
2684       return res;
2685    }
2686
2687    return LLVMBuildFDiv(builder, bld->one, a, "");
2688 }
2689
2690
2691 /**
2692  * Do one Newton-Raphson step to improve rsqrt precision:
2693  *
2694  *   x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2695  *
2696  * See also Intel 64 and IA-32 Architectures Optimization Manual.
2697  */
2698 static inline LLVMValueRef
2699 lp_build_rsqrt_refine(struct lp_build_context *bld,
2700                       LLVMValueRef a,
2701                       LLVMValueRef rsqrt_a)
2702 {
2703    LLVMBuilderRef builder = bld->gallivm->builder;
2704    LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2705    LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2706    LLVMValueRef res;
2707
2708    res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2709    res = LLVMBuildFMul(builder, a, res, "");
2710    res = LLVMBuildFSub(builder, three, res, "");
2711    res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2712    res = LLVMBuildFMul(builder, half, res, "");
2713
2714    return res;
2715 }
2716
2717
2718 /**
2719  * Generate 1/sqrt(a).
2720  * Result is undefined for values < 0, infinity for +0.
2721  */
2722 LLVMValueRef
2723 lp_build_rsqrt(struct lp_build_context *bld,
2724                LLVMValueRef a)
2725 {
2726    const struct lp_type type = bld->type;
2727
2728    assert(lp_check_value(type, a));
2729
2730    assert(type.floating);
2731
2732    /*
2733     * This should be faster but all denormals will end up as infinity.
2734     */
2735    if (0 && lp_build_fast_rsqrt_available(type)) {
2736       const unsigned num_iterations = 1;
2737       LLVMValueRef res;
2738       unsigned i;
2739
2740       /* rsqrt(1.0) != 1.0 here */
2741       res = lp_build_fast_rsqrt(bld, a);
2742
2743       if (num_iterations) {
2744          /*
2745           * Newton-Raphson will result in NaN instead of infinity for zero,
2746           * and NaN instead of zero for infinity.
2747           * Also, need to ensure rsqrt(1.0) == 1.0.
2748           * All numbers smaller than FLT_MIN will result in +infinity
2749           * (rsqrtps treats all denormals as zero).
2750           */
2751          LLVMValueRef cmp;
2752          LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2753          LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);
2754
2755          for (i = 0; i < num_iterations; ++i) {
2756             res = lp_build_rsqrt_refine(bld, a, res);
2757          }
2758          cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2759          res = lp_build_select(bld, cmp, inf, res);
2760          cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2761          res = lp_build_select(bld, cmp, bld->zero, res);
2762          cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2763          res = lp_build_select(bld, cmp, bld->one, res);
2764       }
2765
2766       return res;
2767    }
2768
2769    return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2770 }
2771
2772 /**
2773  * If there's a fast (inaccurate) rsqrt instruction available
2774  * (caller may want to avoid to call rsqrt_fast if it's not available,
2775  * i.e. for calculating x^0.5 it may do rsqrt_fast(x) * x but if
2776  * unavailable it would result in sqrt/div/mul so obviously
2777  * much better to just call sqrt, skipping both div and mul).
2778  */
2779 boolean
2780 lp_build_fast_rsqrt_available(struct lp_type type)
2781 {
2782    assert(type.floating);
2783
2784    if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2785        (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2786       return true;
2787    }
2788    return false;
2789 }
2790
2791
2792 /**
2793  * Generate 1/sqrt(a).
2794  * Result is undefined for values < 0, infinity for +0.
2795  * Precision is limited, only ~10 bits guaranteed
2796  * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2797  */
2798 LLVMValueRef
2799 lp_build_fast_rsqrt(struct lp_build_context *bld,
2800                     LLVMValueRef a)
2801 {
2802    LLVMBuilderRef builder = bld->gallivm->builder;
2803    const struct lp_type type = bld->type;
2804
2805    assert(lp_check_value(type, a));
2806
2807    if (lp_build_fast_rsqrt_available(type)) {
2808       const char *intrinsic = NULL;
2809
2810       if (type.length == 4) {
2811          intrinsic = "llvm.x86.sse.rsqrt.ps";
2812       }
2813       else {
2814          intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2815       }
2816       return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2817    }
2818    else {
2819       debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2820    }
2821    return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2822 }
2823
2824
2825 /**
2826  * Generate sin(a) or cos(a) using polynomial approximation.
2827  * TODO: it might be worth recognizing sin and cos using same source
2828  * (i.e. d3d10 sincos opcode). Obviously doing both at the same time
2829  * would be way cheaper than calculating (nearly) everything twice...
2830  * Not sure it's common enough to be worth bothering however, scs
2831  * opcode could also benefit from calculating both though.
2832  */
2833 static LLVMValueRef
2834 lp_build_sin_or_cos(struct lp_build_context *bld,
2835                     LLVMValueRef a,
2836                     boolean cos)
2837 {
2838    struct gallivm_state *gallivm = bld->gallivm;
2839    LLVMBuilderRef b = gallivm->builder;
2840    struct lp_type int_type = lp_int_type(bld->type);
2841
2842    /*
2843     *  take the absolute value,
2844     *  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2845     */
2846
2847    LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2848    LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2849
2850    LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2851    LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2852
2853    /*
2854     * scale by 4/Pi
2855     * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2856     */
2857
2858    LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2859    LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2860
2861    /*
2862     * store the integer part of y in mm0
2863     * emm2 = _mm_cvttps_epi32(y);
2864     */
2865
2866    LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2867
2868    /*
2869     * j=(j+1) & (~1) (see the cephes sources)
2870     * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2871     */
2872
2873    LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2874    LLVMValueRef emm2_add =  LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2875    /*
2876     * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2877     */
2878    LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2879    LLVMValueRef emm2_and =  LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2880
2881    /*
2882     * y = _mm_cvtepi32_ps(emm2);
2883     */
2884    LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2885
2886    LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2887    LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2888    LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2889    LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2890
2891    /*
2892     * Argument used for poly selection and sign bit determination
2893     * is different for sin vs. cos.
2894     */
2895    LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
2896                                emm2_and;
2897
2898    LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
2899                                                               LLVMBuildNot(b, emm2_2, ""), ""),
2900                                               const_29, "sign_bit") :
2901                                  LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
2902                                                               LLVMBuildShl(b, emm2_add,
2903                                                                            const_29, ""), ""),
2904                                               sign_mask, "sign_bit");
2905
2906    /*
2907     * get the polynom selection mask
2908     * there is one polynom for 0 <= x <= Pi/4
2909     * and another one for Pi/4<x<=Pi/2
2910     * Both branches will be computed.
2911     *
2912     * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2913     * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2914     */
2915
2916    LLVMValueRef emm2_3 =  LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
2917    LLVMValueRef poly_mask = lp_build_compare(gallivm,
2918                                              int_type, PIPE_FUNC_EQUAL,
2919                                              emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2920
2921    /*
2922     * _PS_CONST(minus_cephes_DP1, -0.78515625);
2923     * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2924     * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2925     */
2926    LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2927    LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2928    LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2929
2930    /*
2931     * The magic pass: "Extended precision modular arithmetic"
2932     * x = ((x - y * DP1) - y * DP2) - y * DP3;
2933     */
2934    LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
2935    LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
2936    LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);
2937
2938    /*
2939     * Evaluate the first polynom  (0 <= x <= Pi/4)
2940     *
2941     * z = _mm_mul_ps(x,x);
2942     */
2943    LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2944
2945    /*
2946     * _PS_CONST(coscof_p0,  2.443315711809948E-005);
2947     * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2948     * _PS_CONST(coscof_p2,  4.166664568298827E-002);
2949     */
2950    LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2951    LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2952    LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2953
2954    /*
2955     * y = *(v4sf*)_ps_coscof_p0;
2956     * y = _mm_mul_ps(y, z);
2957     */
2958    LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
2959    LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
2960    LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2961    LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2962
2963
2964    /*
2965     * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2966     * y = _mm_sub_ps(y, tmp);
2967     * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2968     */
2969    LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2970    LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2971    LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2972    LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2973    LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2974
2975    /*
2976     * _PS_CONST(sincof_p0, -1.9515295891E-4);
2977     * _PS_CONST(sincof_p1,  8.3321608736E-3);
2978     * _PS_CONST(sincof_p2, -1.6666654611E-1);
2979     */
2980    LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2981    LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2982    LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2983
2984    /*
2985     * Evaluate the second polynom  (Pi/4 <= x <= 0)
2986     *
2987     * y2 = *(v4sf*)_ps_sincof_p0;
2988     * y2 = _mm_mul_ps(y2, z);
2989     * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2990     * y2 = _mm_mul_ps(y2, z);
2991     * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2992     * y2 = _mm_mul_ps(y2, z);
2993     * y2 = _mm_mul_ps(y2, x);
2994     * y2 = _mm_add_ps(y2, x);
2995     */
2996
2997    LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
2998    LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
2999    LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
3000    LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);
3001
3002    /*
3003     * select the correct result from the two polynoms
3004     * xmm3 = poly_mask;
3005     * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
3006     * y = _mm_andnot_ps(xmm3, y);
3007     * y = _mm_or_ps(y,y2);
3008     */
3009    LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
3010    LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
3011    LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
3012    LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
3013    LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
3014    LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
3015
3016    /*
3017     * update the sign
3018     * y = _mm_xor_ps(y, sign_bit);
3019     */
3020    LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
3021    LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
3022
3023    LLVMValueRef isfinite = lp_build_isfinite(bld, a);
3024
3025    /* clamp output to be within [-1, 1] */
3026    y_result = lp_build_clamp(bld, y_result,
3027                              lp_build_const_vec(bld->gallivm, bld->type,  -1.f),
3028                              lp_build_const_vec(bld->gallivm, bld->type,  1.f));
3029    /* If a is -inf, inf or NaN then return NaN */
3030    y_result = lp_build_select(bld, isfinite, y_result,
3031                               lp_build_const_vec(bld->gallivm, bld->type,  NAN));
3032    return y_result;
3033 }
3034
3035
3036 /**
3037  * Generate sin(a)
3038  */
3039 LLVMValueRef
3040 lp_build_sin(struct lp_build_context *bld,
3041              LLVMValueRef a)
3042 {
3043    return lp_build_sin_or_cos(bld, a, FALSE);
3044 }
3045
3046
3047 /**
3048  * Generate cos(a)
3049  */
3050 LLVMValueRef
3051 lp_build_cos(struct lp_build_context *bld,
3052              LLVMValueRef a)
3053 {
3054    return lp_build_sin_or_cos(bld, a, TRUE);
3055 }
3056
3057
3058 /**
3059  * Generate pow(x, y)
3060  */
3061 LLVMValueRef
3062 lp_build_pow(struct lp_build_context *bld,
3063              LLVMValueRef x,
3064              LLVMValueRef y)
3065 {
3066    /* TODO: optimize the constant case */
3067    if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3068        LLVMIsConstant(x) && LLVMIsConstant(y)) {
3069       debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3070                    __FUNCTION__);
3071    }
3072
3073    return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
3074 }
3075
3076
3077 /**
3078  * Generate exp(x)
3079  */
3080 LLVMValueRef
3081 lp_build_exp(struct lp_build_context *bld,
3082              LLVMValueRef x)
3083 {
3084    /* log2(e) = 1/log(2) */
3085    LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
3086                                            1.4426950408889634);
3087
3088    assert(lp_check_value(bld->type, x));
3089
3090    return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
3091 }
3092
3093
3094 /**
3095  * Generate log(x)
3096  * Behavior is undefined with infs, 0s and nans
3097  */
3098 LLVMValueRef
3099 lp_build_log(struct lp_build_context *bld,
3100              LLVMValueRef x)
3101 {
3102    /* log(2) */
3103    LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3104                                           0.69314718055994529);
3105
3106    assert(lp_check_value(bld->type, x));
3107
3108    return lp_build_mul(bld, log2, lp_build_log2(bld, x));
3109 }
3110
3111 /**
3112  * Generate log(x) that handles edge cases (infs, 0s and nans)
3113  */
3114 LLVMValueRef
3115 lp_build_log_safe(struct lp_build_context *bld,
3116                   LLVMValueRef x)
3117 {
3118    /* log(2) */
3119    LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3120                                           0.69314718055994529);
3121
3122    assert(lp_check_value(bld->type, x));
3123
3124    return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
3125 }
3126
3127
3128 /**
3129  * Generate polynomial.
3130  * Ex:  coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
3131  */
3132 LLVMValueRef
3133 lp_build_polynomial(struct lp_build_context *bld,
3134                     LLVMValueRef x,
3135                     const double *coeffs,
3136                     unsigned num_coeffs)
3137 {
3138    const struct lp_type type = bld->type;
3139    LLVMValueRef even = NULL, odd = NULL;
3140    LLVMValueRef x2;
3141    unsigned i;
3142
3143    assert(lp_check_value(bld->type, x));
3144
3145    /* TODO: optimize the constant case */
3146    if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3147        LLVMIsConstant(x)) {
3148       debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3149                    __FUNCTION__);
3150    }
3151
3152    /*
3153     * Calculate odd and even terms seperately to decrease data dependency
3154     * Ex:
3155     *     c[0] + x^2 * c[2] + x^4 * c[4] ...
3156     *     + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
3157     */
3158    x2 = lp_build_mul(bld, x, x);
3159
3160    for (i = num_coeffs; i--; ) {
3161       LLVMValueRef coeff;
3162
3163       coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
3164
3165       if (i % 2 == 0) {
3166          if (even)
3167             even = lp_build_mad(bld, x2, even, coeff);
3168          else
3169             even = coeff;
3170       } else {
3171          if (odd)
3172             odd = lp_build_mad(bld, x2, odd, coeff);
3173          else
3174             odd = coeff;
3175       }
3176    }
3177
3178    if (odd)
3179       return lp_build_mad(bld, odd, x, even);
3180    else if (even)
3181       return even;
3182    else
3183       return bld->undef;
3184 }
3185
3186
3187 /**
3188  * Minimax polynomial fit of 2**x, in range [0, 1[
3189  */
3190 const double lp_build_exp2_polynomial[] = {
3191 #if EXP_POLY_DEGREE == 5
3192    1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
3193    0.693153073200168932794,
3194    0.240153617044375388211,
3195    0.0558263180532956664775,
3196    0.00898934009049466391101,
3197    0.00187757667519147912699
3198 #elif EXP_POLY_DEGREE == 4
3199    1.00000259337069434683,
3200    0.693003834469974940458,
3201    0.24144275689150793076,
3202    0.0520114606103070150235,
3203    0.0135341679161270268764
3204 #elif EXP_POLY_DEGREE == 3
3205    0.999925218562710312959,
3206    0.695833540494823811697,
3207    0.226067155427249155588,
3208    0.0780245226406372992967
3209 #elif EXP_POLY_DEGREE == 2
3210    1.00172476321474503578,
3211    0.657636275736077639316,
3212    0.33718943461968720704
3213 #else
3214 #error
3215 #endif
3216 };
3217
3218
3219 LLVMValueRef
3220 lp_build_exp2(struct lp_build_context *bld,
3221               LLVMValueRef x)
3222 {
3223    LLVMBuilderRef builder = bld->gallivm->builder;
3224    const struct lp_type type = bld->type;
3225    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3226    LLVMValueRef ipart = NULL;
3227    LLVMValueRef fpart = NULL;
3228    LLVMValueRef expipart = NULL;
3229    LLVMValueRef expfpart = NULL;
3230    LLVMValueRef res = NULL;
3231
3232    assert(lp_check_value(bld->type, x));
3233
3234    /* TODO: optimize the constant case */
3235    if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3236        LLVMIsConstant(x)) {
3237       debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3238                    __FUNCTION__);
3239    }
3240
3241    assert(type.floating && type.width == 32);
3242
3243    /* We want to preserve NaN and make sure than for exp2 if x > 128,
3244     * the result is INF  and if it's smaller than -126.9 the result is 0 */
3245    x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type,  128.0), x,
3246                         GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3247    x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
3248                         x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3249
3250    /* ipart = floor(x) */
3251    /* fpart = x - ipart */
3252    lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3253
3254    /* expipart = (float) (1 << ipart) */
3255    expipart = LLVMBuildAdd(builder, ipart,
3256                            lp_build_const_int_vec(bld->gallivm, type, 127), "");
3257    expipart = LLVMBuildShl(builder, expipart,
3258                            lp_build_const_int_vec(bld->gallivm, type, 23), "");
3259    expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
3260
3261    expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3262                                   ARRAY_SIZE(lp_build_exp2_polynomial));
3263
3264    res = LLVMBuildFMul(builder, expipart, expfpart, "");
3265
3266    return res;
3267 }
3268
3269
3270
3271 /**
3272  * Extract the exponent of a IEEE-754 floating point value.
3273  *
3274  * Optionally apply an integer bias.
3275  *
3276  * Result is an integer value with
3277  *
3278  *   ifloor(log2(x)) + bias
3279  */
3280 LLVMValueRef
3281 lp_build_extract_exponent(struct lp_build_context *bld,
3282                           LLVMValueRef x,
3283                           int bias)
3284 {
3285    LLVMBuilderRef builder = bld->gallivm->builder;
3286    const struct lp_type type = bld->type;
3287    unsigned mantissa = lp_mantissa(type);
3288    LLVMValueRef res;
3289
3290    assert(type.floating);
3291
3292    assert(lp_check_value(bld->type, x));
3293
3294    x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3295
3296    res = LLVMBuildLShr(builder, x,
3297                        lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3298    res = LLVMBuildAnd(builder, res,
3299                       lp_build_const_int_vec(bld->gallivm, type, 255), "");
3300    res = LLVMBuildSub(builder, res,
3301                       lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3302
3303    return res;
3304 }
3305
3306
3307 /**
3308  * Extract the mantissa of the a floating.
3309  *
3310  * Result is a floating point value with
3311  *
3312  *   x / floor(log2(x))
3313  */
3314 LLVMValueRef
3315 lp_build_extract_mantissa(struct lp_build_context *bld,
3316                           LLVMValueRef x)
3317 {
3318    LLVMBuilderRef builder = bld->gallivm->builder;
3319    const struct lp_type type = bld->type;
3320    unsigned mantissa = lp_mantissa(type);
3321    LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3322                                                   (1ULL << mantissa) - 1);
3323    LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3324    LLVMValueRef res;
3325
3326    assert(lp_check_value(bld->type, x));
3327
3328    assert(type.floating);
3329
3330    x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3331
3332    /* res = x / 2**ipart */
3333    res = LLVMBuildAnd(builder, x, mantmask, "");
3334    res = LLVMBuildOr(builder, res, one, "");
3335    res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3336
3337    return res;
3338 }
3339
3340
3341
3342 /**
3343  * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x) ,for x in range of [0, 1/9[
3344  * These coefficients can be generate with
3345  * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3346  */
3347 const double lp_build_log2_polynomial[] = {
3348 #if LOG_POLY_DEGREE == 5
3349    2.88539008148777786488L,
3350    0.961796878841293367824L,
3351    0.577058946784739859012L,
3352    0.412914355135828735411L,
3353    0.308591899232910175289L,
3354    0.352376952300281371868L,
3355 #elif LOG_POLY_DEGREE == 4
3356    2.88539009343309178325L,
3357    0.961791550404184197881L,
3358    0.577440339438736392009L,
3359    0.403343858251329912514L,
3360    0.406718052498846252698L,
3361 #elif LOG_POLY_DEGREE == 3
3362    2.88538959748872753838L,
3363    0.961932915889597772928L,
3364    0.571118517972136195241L,
3365    0.493997535084709500285L,
3366 #else
3367 #error
3368 #endif
3369 };
3370
3371 /**
3372  * See http://www.devmaster.net/forums/showthread.php?p=43580
3373  * http://en.wikipedia.org/wiki/Logarithm#Calculation
3374  * http://www.nezumi.demon.co.uk/consult/logx.htm
3375  *
3376  * If handle_edge_cases is true the function will perform computations
3377  * to match the required D3D10+ behavior for each of the edge cases.
3378  * That means that if input is:
3379  * - less than zero (to and including -inf) then NaN will be returned
3380  * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3381  * - +infinity, then +infinity will be returned
3382  * - NaN, then NaN will be returned
3383  *
3384  * Those checks are fairly expensive so if you don't need them make sure
3385  * handle_edge_cases is false.
3386  */
3387 void
3388 lp_build_log2_approx(struct lp_build_context *bld,
3389                      LLVMValueRef x,
3390                      LLVMValueRef *p_exp,
3391                      LLVMValueRef *p_floor_log2,
3392                      LLVMValueRef *p_log2,
3393                      boolean handle_edge_cases)
3394 {
3395    LLVMBuilderRef builder = bld->gallivm->builder;
3396    const struct lp_type type = bld->type;
3397    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3398    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3399
3400    LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3401    LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3402    LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3403
3404    LLVMValueRef i = NULL;
3405    LLVMValueRef y = NULL;
3406    LLVMValueRef z = NULL;
3407    LLVMValueRef exp = NULL;
3408    LLVMValueRef mant = NULL;
3409    LLVMValueRef logexp = NULL;
3410    LLVMValueRef p_z = NULL;
3411    LLVMValueRef res = NULL;
3412
3413    assert(lp_check_value(bld->type, x));
3414
3415    if(p_exp || p_floor_log2 || p_log2) {
3416       /* TODO: optimize the constant case */
3417       if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3418           LLVMIsConstant(x)) {
3419          debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3420                       __FUNCTION__);
3421       }
3422
3423       assert(type.floating && type.width == 32);
3424
3425       /*
3426        * We don't explicitly handle denormalized numbers. They will yield a
3427        * result in the neighbourhood of -127, which appears to be adequate
3428        * enough.
3429        */
3430
3431       i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3432
3433       /* exp = (float) exponent(x) */
3434       exp = LLVMBuildAnd(builder, i, expmask, "");
3435    }
3436
3437    if(p_floor_log2 || p_log2) {
3438       logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3439       logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3440       logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3441    }
3442
3443    if (p_log2) {
3444       /* mant = 1 + (float) mantissa(x) */
3445       mant = LLVMBuildAnd(builder, i, mantmask, "");
3446       mant = LLVMBuildOr(builder, mant, one, "");
3447       mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3448
3449       /* y = (mant - 1) / (mant + 1) */
3450       y = lp_build_div(bld,
3451          lp_build_sub(bld, mant, bld->one),
3452          lp_build_add(bld, mant, bld->one)
3453       );
3454
3455       /* z = y^2 */
3456       z = lp_build_mul(bld, y, y);
3457
3458       /* compute P(z) */
3459       p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3460                                 ARRAY_SIZE(lp_build_log2_polynomial));
3461
3462       /* y * P(z) + logexp */
3463       res = lp_build_mad(bld, y, p_z, logexp);
3464
3465       if (type.floating && handle_edge_cases) {
3466          LLVMValueRef negmask, infmask,  zmask;
3467          negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3468                                 lp_build_const_vec(bld->gallivm, type,  0.0f));
3469          zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3470                               lp_build_const_vec(bld->gallivm, type,  0.0f));
3471          infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3472                                 lp_build_const_vec(bld->gallivm, type,  INFINITY));
3473
3474          /* If x is qual to inf make sure we return inf */
3475          res = lp_build_select(bld, infmask,
3476                                lp_build_const_vec(bld->gallivm, type,  INFINITY),
3477                                res);
3478          /* If x is qual to 0, return -inf */
3479          res = lp_build_select(bld, zmask,
3480                                lp_build_const_vec(bld->gallivm, type,  -INFINITY),
3481                                res);
3482          /* If x is nan or less than 0, return nan */
3483          res = lp_build_select(bld, negmask,
3484                                lp_build_const_vec(bld->gallivm, type,  NAN),
3485                                res);
3486       }
3487    }
3488
3489    if (p_exp) {
3490       exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3491       *p_exp = exp;
3492    }
3493
3494    if (p_floor_log2)
3495       *p_floor_log2 = logexp;
3496
3497    if (p_log2)
3498       *p_log2 = res;
3499 }
3500
3501
3502 /*
3503  * log2 implementation which doesn't have special code to
3504  * handle edge cases (-inf, 0, inf, NaN). It's faster but
3505  * the results for those cases are undefined.
3506  */
3507 LLVMValueRef
3508 lp_build_log2(struct lp_build_context *bld,
3509               LLVMValueRef x)
3510 {
3511    LLVMValueRef res;
3512    lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
3513    return res;
3514 }
3515
3516 /*
3517  * Version of log2 which handles all edge cases.
3518  * Look at documentation of lp_build_log2_approx for
3519  * description of the behavior for each of the edge cases.
3520  */
3521 LLVMValueRef
3522 lp_build_log2_safe(struct lp_build_context *bld,
3523                    LLVMValueRef x)
3524 {
3525    LLVMValueRef res;
3526    lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
3527    return res;
3528 }
3529
3530
3531 /**
3532  * Faster (and less accurate) log2.
3533  *
3534  *    log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3535  *
3536  * Piece-wise linear approximation, with exact results when x is a
3537  * power of two.
3538  *
3539  * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3540  */
3541 LLVMValueRef
3542 lp_build_fast_log2(struct lp_build_context *bld,
3543                    LLVMValueRef x)
3544 {
3545    LLVMBuilderRef builder = bld->gallivm->builder;
3546    LLVMValueRef ipart;
3547    LLVMValueRef fpart;
3548
3549    assert(lp_check_value(bld->type, x));
3550
3551    assert(bld->type.floating);
3552
3553    /* ipart = floor(log2(x)) - 1 */
3554    ipart = lp_build_extract_exponent(bld, x, -1);
3555    ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3556
3557    /* fpart = x / 2**ipart */
3558    fpart = lp_build_extract_mantissa(bld, x);
3559
3560    /* ipart + fpart */
3561    return LLVMBuildFAdd(builder, ipart, fpart, "");
3562 }
3563
3564
3565 /**
3566  * Fast implementation of iround(log2(x)).
3567  *
3568  * Not an approximation -- it should give accurate results all the time.
3569  */
3570 LLVMValueRef
3571 lp_build_ilog2(struct lp_build_context *bld,
3572                LLVMValueRef x)
3573 {
3574    LLVMBuilderRef builder = bld->gallivm->builder;
3575    LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3576    LLVMValueRef ipart;
3577
3578    assert(bld->type.floating);
3579
3580    assert(lp_check_value(bld->type, x));
3581
3582    /* x * 2^(0.5)   i.e., add 0.5 to the log2(x) */
3583    x = LLVMBuildFMul(builder, x, sqrt2, "");
3584
3585    /* ipart = floor(log2(x) + 0.5)  */
3586    ipart = lp_build_extract_exponent(bld, x, 0);
3587
3588    return ipart;
3589 }
3590
3591 LLVMValueRef
3592 lp_build_mod(struct lp_build_context *bld,
3593              LLVMValueRef x,
3594              LLVMValueRef y)
3595 {
3596    LLVMBuilderRef builder = bld->gallivm->builder;
3597    LLVMValueRef res;
3598    const struct lp_type type = bld->type;
3599
3600    assert(lp_check_value(type, x));
3601    assert(lp_check_value(type, y));
3602
3603    if (type.floating)
3604       res = LLVMBuildFRem(builder, x, y, "");
3605    else if (type.sign)
3606       res = LLVMBuildSRem(builder, x, y, "");
3607    else
3608       res = LLVMBuildURem(builder, x, y, "");
3609    return res;
3610 }
3611
3612
3613 /*
3614  * For floating inputs it creates and returns a mask
3615  * which is all 1's for channels which are NaN.
3616  * Channels inside x which are not NaN will be 0.
3617  */
3618 LLVMValueRef
3619 lp_build_isnan(struct lp_build_context *bld,
3620                LLVMValueRef x)
3621 {
3622    LLVMValueRef mask;
3623    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3624
3625    assert(bld->type.floating);
3626    assert(lp_check_value(bld->type, x));
3627
3628    mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3629                         "isnotnan");
3630    mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3631    mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3632    return mask;
3633 }
3634
3635 /* Returns all 1's for floating point numbers that are
3636  * finite numbers and returns all zeros for -inf,
3637  * inf and nan's */
3638 LLVMValueRef
3639 lp_build_isfinite(struct lp_build_context *bld,
3640                   LLVMValueRef x)
3641 {
3642    LLVMBuilderRef builder = bld->gallivm->builder;
3643    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3644    struct lp_type int_type = lp_int_type(bld->type);
3645    LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3646    LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3647                                                     0x7f800000);
3648
3649    if (!bld->type.floating) {
3650       return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3651    }
3652    assert(bld->type.floating);
3653    assert(lp_check_value(bld->type, x));
3654    assert(bld->type.width == 32);
3655
3656    intx = LLVMBuildAnd(builder, intx, infornan32, "");
3657    return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3658                            intx, infornan32);
3659 }
3660
3661 /*
3662  * Returns true if the number is nan or inf and false otherwise.
3663  * The input has to be a floating point vector.
3664  */
3665 LLVMValueRef
3666 lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
3667                        const struct lp_type type,
3668                        LLVMValueRef x)
3669 {
3670    LLVMBuilderRef builder = gallivm->builder;
3671    struct lp_type int_type = lp_int_type(type);
3672    LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
3673                                                 0x7f800000);
3674    LLVMValueRef ret;
3675
3676    assert(type.floating);
3677
3678    ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
3679    ret = LLVMBuildAnd(builder, ret, const0, "");
3680    ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
3681                           ret, const0);
3682
3683    return ret;
3684 }
3685
3686
3687 LLVMValueRef
3688 lp_build_fpstate_get(struct gallivm_state *gallivm)
3689 {
3690    if (util_cpu_caps.has_sse) {
3691       LLVMBuilderRef builder = gallivm->builder;
3692       LLVMValueRef mxcsr_ptr = lp_build_alloca(
3693          gallivm,
3694          LLVMInt32TypeInContext(gallivm->context),
3695          "mxcsr_ptr");
3696       LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
3697           LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3698       lp_build_intrinsic(builder,
3699                          "llvm.x86.sse.stmxcsr",
3700                          LLVMVoidTypeInContext(gallivm->context),
3701                          &mxcsr_ptr8, 1, 0);
3702       return mxcsr_ptr;
3703    }
3704    return 0;
3705 }
3706
3707 void
3708 lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
3709                                   boolean zero)
3710 {
3711    if (util_cpu_caps.has_sse) {
3712       /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
3713       int daz_ftz = _MM_FLUSH_ZERO_MASK;
3714
3715       LLVMBuilderRef builder = gallivm->builder;
3716       LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
3717       LLVMValueRef mxcsr =
3718          LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");
3719
3720       if (util_cpu_caps.has_daz) {
3721          /* Enable denormals are zero mode */
3722          daz_ftz |= _MM_DENORMALS_ZERO_MASK;
3723       }
3724       if (zero) {
3725          mxcsr = LLVMBuildOr(builder, mxcsr,
3726                              LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
3727       } else {
3728          mxcsr = LLVMBuildAnd(builder, mxcsr,
3729                               LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
3730       }
3731
3732       LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
3733       lp_build_fpstate_set(gallivm, mxcsr_ptr);
3734    }
3735 }
3736
3737 void
3738 lp_build_fpstate_set(struct gallivm_state *gallivm,
3739                      LLVMValueRef mxcsr_ptr)
3740 {
3741    if (util_cpu_caps.has_sse) {
3742       LLVMBuilderRef builder = gallivm->builder;
3743       mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
3744                      LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3745       lp_build_intrinsic(builder,
3746                          "llvm.x86.sse.ldmxcsr",
3747                          LLVMVoidTypeInContext(gallivm->context),
3748                          &mxcsr_ptr, 1, 0);
3749    }
3750 }