gallivm: Fix saturated signed psub/padd intrinsics on llvm 8
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in the [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include <float.h>
49
50 #include <llvm/Config/llvm-config.h>
51
52 #include "util/u_memory.h"
53 #include "util/u_debug.h"
54 #include "util/u_math.h"
55 #include "util/u_cpu_detect.h"
56
57 #include "lp_bld_type.h"
58 #include "lp_bld_const.h"
59 #include "lp_bld_init.h"
60 #include "lp_bld_intr.h"
61 #include "lp_bld_logic.h"
62 #include "lp_bld_pack.h"
63 #include "lp_bld_debug.h"
64 #include "lp_bld_bitarit.h"
65 #include "lp_bld_arit.h"
66 #include "lp_bld_flow.h"
67
68 #if defined(PIPE_ARCH_SSE)
69 #include <xmmintrin.h>
70 #endif
71
72 #ifndef _MM_DENORMALS_ZERO_MASK
73 #define _MM_DENORMALS_ZERO_MASK 0x0040
74 #endif
75
76 #ifndef _MM_FLUSH_ZERO_MASK
77 #define _MM_FLUSH_ZERO_MASK 0x8000
78 #endif
79
80 #define EXP_POLY_DEGREE 5
81
82 #define LOG_POLY_DEGREE 4
83
84
85 /**
86 * Generate min(a, b)
87 * No checks are done for the special-case values of a or b being 0 or 1.
88 * NaNs are handled according to the behavior specified by the
89 * nan_behavior argument.
90 */
91 static LLVMValueRef
92 lp_build_min_simple(struct lp_build_context *bld,
93 LLVMValueRef a,
94 LLVMValueRef b,
95 enum gallivm_nan_behavior nan_behavior)
96 {
97 const struct lp_type type = bld->type;
98 const char *intrinsic = NULL;
99 unsigned intr_size = 0;
100 LLVMValueRef cond;
101
102 assert(lp_check_value(type, a));
103 assert(lp_check_value(type, b));
104
105 /* TODO: optimize the constant case */
106
107 if (type.floating && util_cpu_caps.has_sse) {
108 if (type.width == 32) {
109 if (type.length == 1) {
110 intrinsic = "llvm.x86.sse.min.ss";
111 intr_size = 128;
112 }
113 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
114 intrinsic = "llvm.x86.sse.min.ps";
115 intr_size = 128;
116 }
117 else {
118 intrinsic = "llvm.x86.avx.min.ps.256";
119 intr_size = 256;
120 }
121 }
122 if (type.width == 64 && util_cpu_caps.has_sse2) {
123 if (type.length == 1) {
124 intrinsic = "llvm.x86.sse2.min.sd";
125 intr_size = 128;
126 }
127 else if (type.length == 2 || !util_cpu_caps.has_avx) {
128 intrinsic = "llvm.x86.sse2.min.pd";
129 intr_size = 128;
130 }
131 else {
132 intrinsic = "llvm.x86.avx.min.pd.256";
133 intr_size = 256;
134 }
135 }
136 }
137 else if (type.floating && util_cpu_caps.has_altivec) {
138 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
139 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
140 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
141 __FUNCTION__);
142 }
143 if (type.width == 32 && type.length == 4) {
144 intrinsic = "llvm.ppc.altivec.vminfp";
145 intr_size = 128;
146 }
147 } else if (util_cpu_caps.has_altivec) {
148 intr_size = 128;
149 if (type.width == 8) {
150 if (!type.sign) {
151 intrinsic = "llvm.ppc.altivec.vminub";
152 } else {
153 intrinsic = "llvm.ppc.altivec.vminsb";
154 }
155 } else if (type.width == 16) {
156 if (!type.sign) {
157 intrinsic = "llvm.ppc.altivec.vminuh";
158 } else {
159 intrinsic = "llvm.ppc.altivec.vminsh";
160 }
161 } else if (type.width == 32) {
162 if (!type.sign) {
163 intrinsic = "llvm.ppc.altivec.vminuw";
164 } else {
165 intrinsic = "llvm.ppc.altivec.vminsw";
166 }
167 }
168 }
169
170 if (intrinsic) {
171 /* We need to handle NaNs for floating point numbers. If one of the
172 * inputs is NaN the other should be returned (required by both D3D10+
173 * and OpenCL).
174 * The SSE intrinsics return the second operand when either input is NaN,
175 * so we need special code to handle those cases.
176 */
177 if (util_cpu_caps.has_sse && type.floating &&
178 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
179 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
180 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
181 LLVMValueRef isnan, min;
182 min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
183 type,
184 intr_size, a, b);
185 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
186 isnan = lp_build_isnan(bld, b);
187 return lp_build_select(bld, isnan, a, min);
188 } else {
189 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
190 isnan = lp_build_isnan(bld, a);
191 return lp_build_select(bld, isnan, a, min);
192 }
193 } else {
194 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
195 type,
196 intr_size, a, b);
197 }
198 }
199
200 if (type.floating) {
201 switch (nan_behavior) {
202 case GALLIVM_NAN_RETURN_NAN: {
203 LLVMValueRef isnan = lp_build_isnan(bld, b);
204 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
205 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
206 return lp_build_select(bld, cond, a, b);
207 }
208 break;
209 case GALLIVM_NAN_RETURN_OTHER: {
210 LLVMValueRef isnan = lp_build_isnan(bld, a);
211 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
212 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
213 return lp_build_select(bld, cond, a, b);
214 }
215 break;
216 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
217 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
218 return lp_build_select(bld, cond, a, b);
219 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
220 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
221 return lp_build_select(bld, cond, b, a);
222 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
223 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
224 return lp_build_select(bld, cond, a, b);
225 break;
226 default:
227 assert(0);
228 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
229 return lp_build_select(bld, cond, a, b);
230 }
231 } else {
232 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
233 return lp_build_select(bld, cond, a, b);
234 }
235 }
236
237
238 LLVMValueRef
239 lp_build_fmuladd(LLVMBuilderRef builder,
240 LLVMValueRef a,
241 LLVMValueRef b,
242 LLVMValueRef c)
243 {
244 LLVMTypeRef type = LLVMTypeOf(a);
245 assert(type == LLVMTypeOf(b));
246 assert(type == LLVMTypeOf(c));
247
248 char intrinsic[32];
249 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
250 LLVMValueRef args[] = { a, b, c };
251 return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0);
252 }
253
254
255 /**
256 * Generate max(a, b)
257 * No checks are done for the special-case values of a or b being 0 or 1.
258 * NaNs are handled according to the behavior specified by the
259 * nan_behavior argument.
260 */
261 static LLVMValueRef
262 lp_build_max_simple(struct lp_build_context *bld,
263 LLVMValueRef a,
264 LLVMValueRef b,
265 enum gallivm_nan_behavior nan_behavior)
266 {
267 const struct lp_type type = bld->type;
268 const char *intrinsic = NULL;
269 unsigned intr_size = 0;
270 LLVMValueRef cond;
271
272 assert(lp_check_value(type, a));
273 assert(lp_check_value(type, b));
274
275 /* TODO: optimize the constant case */
276
277 if (type.floating && util_cpu_caps.has_sse) {
278 if (type.width == 32) {
279 if (type.length == 1) {
280 intrinsic = "llvm.x86.sse.max.ss";
281 intr_size = 128;
282 }
283 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
284 intrinsic = "llvm.x86.sse.max.ps";
285 intr_size = 128;
286 }
287 else {
288 intrinsic = "llvm.x86.avx.max.ps.256";
289 intr_size = 256;
290 }
291 }
292 if (type.width == 64 && util_cpu_caps.has_sse2) {
293 if (type.length == 1) {
294 intrinsic = "llvm.x86.sse2.max.sd";
295 intr_size = 128;
296 }
297 else if (type.length == 2 || !util_cpu_caps.has_avx) {
298 intrinsic = "llvm.x86.sse2.max.pd";
299 intr_size = 128;
300 }
301 else {
302 intrinsic = "llvm.x86.avx.max.pd.256";
303 intr_size = 256;
304 }
305 }
306 }
307 else if (type.floating && util_cpu_caps.has_altivec) {
308 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
309 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
310 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
311 __FUNCTION__);
312 }
313 if (type.width == 32 && type.length == 4) {
314 intrinsic = "llvm.ppc.altivec.vmaxfp";
315 intr_size = 128;
316 }
317 } else if (util_cpu_caps.has_altivec) {
318 intr_size = 128;
319 if (type.width == 8) {
320 if (!type.sign) {
321 intrinsic = "llvm.ppc.altivec.vmaxub";
322 } else {
323 intrinsic = "llvm.ppc.altivec.vmaxsb";
324 }
325 } else if (type.width == 16) {
326 if (!type.sign) {
327 intrinsic = "llvm.ppc.altivec.vmaxuh";
328 } else {
329 intrinsic = "llvm.ppc.altivec.vmaxsh";
330 }
331 } else if (type.width == 32) {
332 if (!type.sign) {
333 intrinsic = "llvm.ppc.altivec.vmaxuw";
334 } else {
335 intrinsic = "llvm.ppc.altivec.vmaxsw";
336 }
337 }
338 }
339
340 if (intrinsic) {
341 if (util_cpu_caps.has_sse && type.floating &&
342 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
343 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
344 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
345 LLVMValueRef isnan, max;
346 max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
347 type,
348 intr_size, a, b);
349 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
350 isnan = lp_build_isnan(bld, b);
351 return lp_build_select(bld, isnan, a, max);
352 } else {
353 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
354 isnan = lp_build_isnan(bld, a);
355 return lp_build_select(bld, isnan, a, max);
356 }
357 } else {
358 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
359 type,
360 intr_size, a, b);
361 }
362 }
363
364 if (type.floating) {
365 switch (nan_behavior) {
366 case GALLIVM_NAN_RETURN_NAN: {
367 LLVMValueRef isnan = lp_build_isnan(bld, b);
368 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
369 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
370 return lp_build_select(bld, cond, a, b);
371 }
372 break;
373 case GALLIVM_NAN_RETURN_OTHER: {
374 LLVMValueRef isnan = lp_build_isnan(bld, a);
375 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
376 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
377 return lp_build_select(bld, cond, a, b);
378 }
379 break;
380 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
381 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
382 return lp_build_select(bld, cond, a, b);
383 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
384 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
385 return lp_build_select(bld, cond, b, a);
386 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
387 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
388 return lp_build_select(bld, cond, a, b);
389 break;
390 default:
391 assert(0);
392 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
393 return lp_build_select(bld, cond, a, b);
394 }
395 } else {
396 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
397 return lp_build_select(bld, cond, a, b);
398 }
399 }
400
401
402 /**
403 * Generate 1 - a, or ~a depending on bld->type.
404 */
405 LLVMValueRef
406 lp_build_comp(struct lp_build_context *bld,
407 LLVMValueRef a)
408 {
409 LLVMBuilderRef builder = bld->gallivm->builder;
410 const struct lp_type type = bld->type;
411
412 assert(lp_check_value(type, a));
413
414 if(a == bld->one)
415 return bld->zero;
416 if(a == bld->zero)
417 return bld->one;
418
419 if(type.norm && !type.floating && !type.fixed && !type.sign) {
420 if(LLVMIsConstant(a))
421 return LLVMConstNot(a);
422 else
423 return LLVMBuildNot(builder, a, "");
424 }
425
426 if(LLVMIsConstant(a))
427 if (type.floating)
428 return LLVMConstFSub(bld->one, a);
429 else
430 return LLVMConstSub(bld->one, a);
431 else
432 if (type.floating)
433 return LLVMBuildFSub(builder, bld->one, a, "");
434 else
435 return LLVMBuildSub(builder, bld->one, a, "");
436 }
437
438
439 /**
440 * Generate a + b
441 */
442 LLVMValueRef
443 lp_build_add(struct lp_build_context *bld,
444 LLVMValueRef a,
445 LLVMValueRef b)
446 {
447 LLVMBuilderRef builder = bld->gallivm->builder;
448 const struct lp_type type = bld->type;
449 LLVMValueRef res;
450
451 assert(lp_check_value(type, a));
452 assert(lp_check_value(type, b));
453
454 if (a == bld->zero)
455 return b;
456 if (b == bld->zero)
457 return a;
458 if (a == bld->undef || b == bld->undef)
459 return bld->undef;
460
461 if (type.norm) {
462 const char *intrinsic = NULL;
463
464 if (!type.sign && (a == bld->one || b == bld->one))
465 return bld->one;
466
467 if (!type.floating && !type.fixed) {
468 if (LLVM_VERSION_MAJOR >= 8) {
469 char intrin[32];
470 intrinsic = type.sign ? "llvm.sadd.sat" : "llvm.uadd.sat";
471 lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
472 return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
473 }
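/*
 * Illustrative note: lp_format_intrinsic appends the vector type suffix to
 * the generic name above, so a signed 16 x i8 normalized type would emit a
 * call to something like "llvm.sadd.sat.v16i8" (example name, shown here
 * only to illustrate the mangling).
 */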
474 if (type.width * type.length == 128) {
475 if (util_cpu_caps.has_sse2) {
476 if (type.width == 8)
477 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
478 if (type.width == 16)
479 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
480 } else if (util_cpu_caps.has_altivec) {
481 if (type.width == 8)
482 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
483 if (type.width == 16)
484 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
485 }
486 }
487 if (type.width * type.length == 256) {
488 if (util_cpu_caps.has_avx2) {
489 if (type.width == 8)
490 intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b";
491 if (type.width == 16)
492 intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : "llvm.x86.avx2.paddus.w";
493 }
494 }
495 }
496
497 if (intrinsic)
498 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
499 }
500
501 if(type.norm && !type.floating && !type.fixed) {
502 if (type.sign) {
503 uint64_t sign = (uint64_t)1 << (type.width - 1);
504 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
505 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
506 /* a_clamp_max is the maximum a for positive b,
507 a_clamp_min is the minimum a for negative b. */
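/*
 * Illustrative example (values chosen for illustration only): for an 8-bit
 * signed type, max_val = 127 and min_val = -128, so with b = 100 the clamp
 * below restricts a to at most 127 - 100 = 27, which guarantees the plain
 * add further down cannot overflow past 127.
 */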
508 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
509 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
510 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
511 }
512 }
513
514 if(LLVMIsConstant(a) && LLVMIsConstant(b))
515 if (type.floating)
516 res = LLVMConstFAdd(a, b);
517 else
518 res = LLVMConstAdd(a, b);
519 else
520 if (type.floating)
521 res = LLVMBuildFAdd(builder, a, b, "");
522 else
523 res = LLVMBuildAdd(builder, a, b, "");
524
525 /* clamp to ceiling of 1.0 */
526 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
527 res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
528
529 if (type.norm && !type.floating && !type.fixed) {
530 if (!type.sign) {
531 /*
532 * newer llvm versions no longer support the intrinsics, but recognize
533 * the pattern. Since auto-upgrade of intrinsics doesn't work for jit
534 * code, it is important we match the pattern llvm uses (and pray llvm
535 * doesn't change it - and hope they decide on the same pattern for
536 * all backends supporting it...).
537 * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
538 * interfere with llvm's ability to recognize the pattern but seems
539 * a bit brittle.
540 * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
541 */
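/*
 * Illustrative scalar equivalent of the pattern (sketch, not taken from the
 * original comments): given the wrapping sum res = a + b computed above,
 *    res = (res < a) ? ~0 : res;
 * i.e. clamp to all-ones on overflow, which is the idiom llvm recognizes as
 * an unsigned saturating add.
 */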
542 LLVMValueRef overflowed = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, res);
543 res = lp_build_select(bld, overflowed,
544 LLVMConstAllOnes(bld->int_vec_type), res);
545 }
546 }
547
548 /* XXX clamp to floor of -1 or 0??? */
549
550 return res;
551 }
552
553
554 /** Return the scalar sum of the elements of a.
555 * Callers should avoid this operation whenever possible.
556 */
557 LLVMValueRef
558 lp_build_horizontal_add(struct lp_build_context *bld,
559 LLVMValueRef a)
560 {
561 LLVMBuilderRef builder = bld->gallivm->builder;
562 const struct lp_type type = bld->type;
563 LLVMValueRef index, res;
564 unsigned i, length;
565 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
566 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
567 LLVMValueRef vecres, elem2;
568
569 assert(lp_check_value(type, a));
570
571 if (type.length == 1) {
572 return a;
573 }
574
575 assert(!bld->type.norm);
576
577 /*
578 * For byte vectors we could do much better with psadbw.
579 * Using repeated shuffle/adds here. Note that with multiple vectors
580 * this can be done more efficiently as outlined in the Intel
581 * Optimization Manual.
582 * Note: could cause data rearrangement if used with smaller element
583 * sizes.
584 */
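/*
 * Illustrative trace for a length-4 vector {a0, a1, a2, a3}: the loop below
 * first forms {a0, a1} + {a2, a3} = {a0+a2, a1+a3}, then the two remaining
 * elements are extracted and added to give a0+a1+a2+a3.
 */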
585
586 vecres = a;
587 length = type.length / 2;
588 while (length > 1) {
589 LLVMValueRef vec1, vec2;
590 for (i = 0; i < length; i++) {
591 shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
592 shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
593 }
594 vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
595 LLVMConstVector(shuffles1, length), "");
596 vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
597 LLVMConstVector(shuffles2, length), "");
598 if (type.floating) {
599 vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
600 }
601 else {
602 vecres = LLVMBuildAdd(builder, vec1, vec2, "");
603 }
604 length = length >> 1;
605 }
606
607 /* always have vector of size 2 here */
608 assert(length == 1);
609
610 index = lp_build_const_int32(bld->gallivm, 0);
611 res = LLVMBuildExtractElement(builder, vecres, index, "");
612 index = lp_build_const_int32(bld->gallivm, 1);
613 elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
614
615 if (type.floating)
616 res = LLVMBuildFAdd(builder, res, elem2, "");
617 else
618 res = LLVMBuildAdd(builder, res, elem2, "");
619
620 return res;
621 }
622
623 /**
624 * Return the horizontal sums of 4 float vectors as a float4 vector.
625 * This uses the technique outlined in the Intel Optimization Manual.
626 */
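/*
 * Illustrative data flow for inputs x, y, z, w (element-by-element sketch):
 *   tmp[0] = {x0,x1,y0,y1}   tmp[1] = {x2,x3,y2,y3}
 *   tmp[2] = {z0,z1,w0,w1}   tmp[3] = {z2,z3,w2,w3}
 *   sumtmp[0] = {x0+x2, x1+x3, y0+y2, y1+y3}
 *   sumtmp[1] = {z0+z2, z1+z3, w0+w2, w1+w3}
 * and the final shuffle/add pairs these up into {sum x, sum y, sum z, sum w}.
 */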
627 static LLVMValueRef
628 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
629 LLVMValueRef src[4])
630 {
631 struct gallivm_state *gallivm = bld->gallivm;
632 LLVMBuilderRef builder = gallivm->builder;
633 LLVMValueRef shuffles[4];
634 LLVMValueRef tmp[4];
635 LLVMValueRef sumtmp[2], shuftmp[2];
636
637 /* lower half of regs */
638 shuffles[0] = lp_build_const_int32(gallivm, 0);
639 shuffles[1] = lp_build_const_int32(gallivm, 1);
640 shuffles[2] = lp_build_const_int32(gallivm, 4);
641 shuffles[3] = lp_build_const_int32(gallivm, 5);
642 tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
643 LLVMConstVector(shuffles, 4), "");
644 tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
645 LLVMConstVector(shuffles, 4), "");
646
647 /* upper half of regs */
648 shuffles[0] = lp_build_const_int32(gallivm, 2);
649 shuffles[1] = lp_build_const_int32(gallivm, 3);
650 shuffles[2] = lp_build_const_int32(gallivm, 6);
651 shuffles[3] = lp_build_const_int32(gallivm, 7);
652 tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
653 LLVMConstVector(shuffles, 4), "");
654 tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
655 LLVMConstVector(shuffles, 4), "");
656
657 sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
658 sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
659
660 shuffles[0] = lp_build_const_int32(gallivm, 0);
661 shuffles[1] = lp_build_const_int32(gallivm, 2);
662 shuffles[2] = lp_build_const_int32(gallivm, 4);
663 shuffles[3] = lp_build_const_int32(gallivm, 6);
664 shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
665 LLVMConstVector(shuffles, 4), "");
666
667 shuffles[0] = lp_build_const_int32(gallivm, 1);
668 shuffles[1] = lp_build_const_int32(gallivm, 3);
669 shuffles[2] = lp_build_const_int32(gallivm, 5);
670 shuffles[3] = lp_build_const_int32(gallivm, 7);
671 shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
672 LLVMConstVector(shuffles, 4), "");
673
674 return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
675 }
676
677
678 /*
679 * partially horizontally add 2-4 float vectors with length nx4,
680 * i.e. only four adjacent values in each vector will be added,
681 * assuming values are really grouped in 4 which also determines
682 * output order.
683 *
684 * Return a vector of the same length as the initial vectors,
685 * with the excess elements (if any) being undefined.
686 * The element order is independent of number of input vectors.
687 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
688 * the output order thus will be
689 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
690 */
691 LLVMValueRef
692 lp_build_hadd_partial4(struct lp_build_context *bld,
693 LLVMValueRef vectors[],
694 unsigned num_vecs)
695 {
696 struct gallivm_state *gallivm = bld->gallivm;
697 LLVMBuilderRef builder = gallivm->builder;
698 LLVMValueRef ret_vec;
699 LLVMValueRef tmp[4];
700 const char *intrinsic = NULL;
701
702 assert(num_vecs >= 2 && num_vecs <= 4);
703 assert(bld->type.floating);
704
705 /* only use this with at least 2 vectors, as it is sort of expensive
706 * (depending on cpu) and we always need two horizontal adds anyway,
707 * so a shuffle/add approach might be better.
708 */
709
710 tmp[0] = vectors[0];
711 tmp[1] = vectors[1];
712
713 tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
714 tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
715
716 if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
717 bld->type.length == 4) {
718 intrinsic = "llvm.x86.sse3.hadd.ps";
719 }
720 else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
721 bld->type.length == 8) {
722 intrinsic = "llvm.x86.avx.hadd.ps.256";
723 }
724 if (intrinsic) {
725 tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
726 lp_build_vec_type(gallivm, bld->type),
727 tmp[0], tmp[1]);
728 if (num_vecs > 2) {
729 tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
730 lp_build_vec_type(gallivm, bld->type),
731 tmp[2], tmp[3]);
732 }
733 else {
734 tmp[1] = tmp[0];
735 }
736 return lp_build_intrinsic_binary(builder, intrinsic,
737 lp_build_vec_type(gallivm, bld->type),
738 tmp[0], tmp[1]);
739 }
740
741 if (bld->type.length == 4) {
742 ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
743 }
744 else {
745 LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
746 unsigned j;
747 unsigned num_iter = bld->type.length / 4;
748 struct lp_type parttype = bld->type;
749 parttype.length = 4;
750 for (j = 0; j < num_iter; j++) {
751 LLVMValueRef partsrc[4];
752 unsigned i;
753 for (i = 0; i < 4; i++) {
754 partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
755 }
756 partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
757 }
758 ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
759 }
760 return ret_vec;
761 }
762
763 /**
764 * Generate a - b
765 */
766 LLVMValueRef
767 lp_build_sub(struct lp_build_context *bld,
768 LLVMValueRef a,
769 LLVMValueRef b)
770 {
771 LLVMBuilderRef builder = bld->gallivm->builder;
772 const struct lp_type type = bld->type;
773 LLVMValueRef res;
774
775 assert(lp_check_value(type, a));
776 assert(lp_check_value(type, b));
777
778 if (b == bld->zero)
779 return a;
780 if (a == bld->undef || b == bld->undef)
781 return bld->undef;
782 if (a == b)
783 return bld->zero;
784
785 if (type.norm) {
786 const char *intrinsic = NULL;
787
788 if (!type.sign && b == bld->one)
789 return bld->zero;
790
791 if (!type.floating && !type.fixed) {
792 if (LLVM_VERSION_MAJOR >= 8) {
793 char intrin[32];
794 intrinsic = type.sign ? "llvm.ssub.sat" : "llvm.usub.sat";
795 lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
796 return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
797 }
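/*
 * Illustrative note: as in lp_build_add, the generic intrinsic name above
 * gets the vector type suffix appended, e.g. something like
 * "llvm.usub.sat.v8i16" for an unsigned 8 x i16 type (example name only).
 */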
798 if (type.width * type.length == 128) {
799 if (util_cpu_caps.has_sse2) {
800 if (type.width == 8)
801 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
802 if (type.width == 16)
803 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
804 } else if (util_cpu_caps.has_altivec) {
805 if (type.width == 8)
806 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
807 if (type.width == 16)
808 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
809 }
810 }
811 if (type.width * type.length == 256) {
812 if (util_cpu_caps.has_avx2) {
813 if (type.width == 8)
814 intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b";
815 if (type.width == 16)
816 intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" : "llvm.x86.avx2.psubus.w";
817 }
818 }
819 }
820
821 if (intrinsic)
822 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
823 }
824
825 if(type.norm && !type.floating && !type.fixed) {
826 if (type.sign) {
827 uint64_t sign = (uint64_t)1 << (type.width - 1);
828 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
829 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
830 /* a_clamp_max is the maximum a for negative b,
831 a_clamp_min is the minimum a for positive b. */
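/*
 * Illustrative example (values chosen for illustration only): for an 8-bit
 * signed type with b = 100, the clamp below keeps a at no less than
 * -128 + 100 = -28, so a - b cannot underflow past -128.
 */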
832 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
833 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
834 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
835 } else {
836 /*
837 * This must match llvm pattern for saturated unsigned sub.
838 * (lp_build_max_simple actually does the job with its current
839 * definition but do it explicitly here.)
840 * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
841 * interfere with llvm's ability to recognize the pattern but seems
842 * a bit brittle.
843 * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
844 */
845 LLVMValueRef no_ov = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
846 a = lp_build_select(bld, no_ov, a, b);
847 }
848 }
849
850 if(LLVMIsConstant(a) && LLVMIsConstant(b))
851 if (type.floating)
852 res = LLVMConstFSub(a, b);
853 else
854 res = LLVMConstSub(a, b);
855 else
856 if (type.floating)
857 res = LLVMBuildFSub(builder, a, b, "");
858 else
859 res = LLVMBuildSub(builder, a, b, "");
860
861 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
862 res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
863
864 return res;
865 }
866
867
868
869 /**
870 * Normalized multiplication.
871 *
872 * There are several approaches for (using 8-bit normalized multiplication as
873 * an example):
874 *
875 * - alpha plus one
876 *
877 * makes the following approximation to the division (Sree)
878 *
879 * a*b/255 ~= (a*(b + 1)) >> 8
880 *
881 * which is the fastest method that satisfies the following OpenGL criteria of
882 *
883 * 0*0 = 0 and 255*255 = 255
884 *
885 * - geometric series
886 *
887 * takes the geometric series approximation to the division
888 *
889 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
890 *
891 * in this case just the first two terms to fit in 16bit arithmetic
892 *
893 * t/255 ~= (t + (t >> 8)) >> 8
894 *
895 * note that just by itself it doesn't satisfy the OpenGL criteria, as it
896 * gives 255*255 = 254, so the special case b = 255 must be accounted for
897 * or roundoff must be used.
898 *
899 * - geometric series plus rounding
900 *
901 * when using a geometric series division instead of truncating the result
902 * use roundoff in the approximation (Jim Blinn)
903 *
904 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
905 *
906 * achieving exact results.
907 *
908 *
909 *
910 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
911 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
912 * @sa Michael Herf, The "double blend trick", May 2000,
913 * http://www.stereopsis.com/doubleblend.html
914 */
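/*
 * Worked example of the rounded approximation above (illustration only):
 * for a = b = 255, t = 65025 and t >> 8 = 254, so
 * (65025 + 254 + 0x80) >> 8 = 65407 >> 8 = 255, matching the exact result.
 */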
915 LLVMValueRef
916 lp_build_mul_norm(struct gallivm_state *gallivm,
917 struct lp_type wide_type,
918 LLVMValueRef a, LLVMValueRef b)
919 {
920 LLVMBuilderRef builder = gallivm->builder;
921 struct lp_build_context bld;
922 unsigned n;
923 LLVMValueRef half;
924 LLVMValueRef ab;
925
926 assert(!wide_type.floating);
927 assert(lp_check_value(wide_type, a));
928 assert(lp_check_value(wide_type, b));
929
930 lp_build_context_init(&bld, gallivm, wide_type);
931
932 n = wide_type.width / 2;
933 if (wide_type.sign) {
934 --n;
935 }
936
937 /*
938 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
939 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
940 */
941
942 /*
943 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
944 */
945
946 ab = LLVMBuildMul(builder, a, b, "");
947 ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
948
949 /*
950 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
951 */
952
953 half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
954 if (wide_type.sign) {
955 LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
956 LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
957 half = lp_build_select(&bld, sign, minus_half, half);
958 }
959 ab = LLVMBuildAdd(builder, ab, half, "");
960
961 /* Final division */
962 ab = lp_build_shr_imm(&bld, ab, n);
963
964 return ab;
965 }
966
967 /**
968 * Generate a * b
969 */
970 LLVMValueRef
971 lp_build_mul(struct lp_build_context *bld,
972 LLVMValueRef a,
973 LLVMValueRef b)
974 {
975 LLVMBuilderRef builder = bld->gallivm->builder;
976 const struct lp_type type = bld->type;
977 LLVMValueRef shift;
978 LLVMValueRef res;
979
980 assert(lp_check_value(type, a));
981 assert(lp_check_value(type, b));
982
983 if(a == bld->zero)
984 return bld->zero;
985 if(a == bld->one)
986 return b;
987 if(b == bld->zero)
988 return bld->zero;
989 if(b == bld->one)
990 return a;
991 if(a == bld->undef || b == bld->undef)
992 return bld->undef;
993
994 if (!type.floating && !type.fixed && type.norm) {
995 struct lp_type wide_type = lp_wider_type(type);
996 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
997
998 lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah);
999 lp_build_unpack2_native(bld->gallivm, type, wide_type, b, &bl, &bh);
1000
1001 /* PMULLW, PSRLW, PADDW */
1002 abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
1003 abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
1004
1005 ab = lp_build_pack2_native(bld->gallivm, wide_type, type, abl, abh);
1006
1007 return ab;
1008 }
1009
1010 if(type.fixed)
1011 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
1012 else
1013 shift = NULL;
1014
1015 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1016 if (type.floating)
1017 res = LLVMConstFMul(a, b);
1018 else
1019 res = LLVMConstMul(a, b);
1020 if(shift) {
1021 if(type.sign)
1022 res = LLVMConstAShr(res, shift);
1023 else
1024 res = LLVMConstLShr(res, shift);
1025 }
1026 }
1027 else {
1028 if (type.floating)
1029 res = LLVMBuildFMul(builder, a, b, "");
1030 else
1031 res = LLVMBuildMul(builder, a, b, "");
1032 if(shift) {
1033 if(type.sign)
1034 res = LLVMBuildAShr(builder, res, shift, "");
1035 else
1036 res = LLVMBuildLShr(builder, res, shift, "");
1037 }
1038 }
1039
1040 return res;
1041 }
1042
1043 /*
1044 * Widening mul, valid for 32x32 bit -> 64bit only.
1045 * Result is low 32bits, high bits returned in res_hi.
1046 *
1047 * Emits code that is meant to be compiled for the host CPU.
1048 */
1049 LLVMValueRef
1050 lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
1051 LLVMValueRef a,
1052 LLVMValueRef b,
1053 LLVMValueRef *res_hi)
1054 {
1055 struct gallivm_state *gallivm = bld->gallivm;
1056 LLVMBuilderRef builder = gallivm->builder;
1057
1058 assert(bld->type.width == 32);
1059 assert(bld->type.floating == 0);
1060 assert(bld->type.fixed == 0);
1061 assert(bld->type.norm == 0);
1062
1063 /*
1064 * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces
1065 * for x86 simd is atrocious (even if the high bits weren't required),
1066 * trying to handle real 64bit inputs (which of course can't happen due
1067 * to using 64bit umul with 32bit numbers zero-extended to 64bit, but
1068 * apparently llvm does not recognize this widening mul). This includes 6
1069 * (instead of 2) pmuludq plus extra adds and shifts
1070 * The same story applies to signed mul, albeit fixing this requires sse41.
1071 * https://llvm.org/bugs/show_bug.cgi?id=30845
1072 * So, whip up our own code, albeit only for length 4 and 8 (which
1073 * should be good enough)...
1074 * FIXME: For llvm >= 7.0 we should match the autoupgrade pattern
1075 * (bitcast/and/mul/shuffle for unsigned, bitcast/shl/ashr/mul/shuffle
1076 * for signed), which the fallback code does not, without this llvm
1077 * will likely still produce atrocious code.
1078 */
1079 if (LLVM_VERSION_MAJOR < 7 &&
1080 (bld->type.length == 4 || bld->type.length == 8) &&
1081 ((util_cpu_caps.has_sse2 && (bld->type.sign == 0)) ||
1082 util_cpu_caps.has_sse4_1)) {
1083 const char *intrinsic = NULL;
1084 LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd;
1085 LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec;
1086 struct lp_type type_wide = lp_wider_type(bld->type);
1087 LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide);
1088 unsigned i;
1089 for (i = 0; i < bld->type.length; i += 2) {
1090 shuf[i] = lp_build_const_int32(gallivm, i+1);
1091 shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
1092 }
1093 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1094 aeven = a;
1095 beven = b;
1096 aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, "");
1097 bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, "");
1098
1099 if (util_cpu_caps.has_avx2 && bld->type.length == 8) {
1100 if (bld->type.sign) {
1101 intrinsic = "llvm.x86.avx2.pmul.dq";
1102 } else {
1103 intrinsic = "llvm.x86.avx2.pmulu.dq";
1104 }
1105 muleven = lp_build_intrinsic_binary(builder, intrinsic,
1106 wider_type, aeven, beven);
1107 mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1108 wider_type, aodd, bodd);
1109 }
1110 else {
1111 /* for consistent naming look elsewhere... */
1112 if (bld->type.sign) {
1113 intrinsic = "llvm.x86.sse41.pmuldq";
1114 } else {
1115 intrinsic = "llvm.x86.sse2.pmulu.dq";
1116 }
1117 /*
1118 * XXX If we only have AVX but not AVX2 this is a pain.
1119 * lp_build_intrinsic_binary_anylength() can't handle it
1120 * (due to src and dst type not being identical).
1121 */
1122 if (bld->type.length == 8) {
1123 LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi;
1124 LLVMValueRef aoddlo, aoddhi, boddlo, boddhi;
1125 LLVMValueRef muleven2[2], mulodd2[2];
1126 struct lp_type type_wide_half = type_wide;
1127 LLVMTypeRef wtype_half;
1128 type_wide_half.length = 2;
1129 wtype_half = lp_build_vec_type(gallivm, type_wide_half);
1130 aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4);
1131 aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4);
1132 bevenlo = lp_build_extract_range(gallivm, beven, 0, 4);
1133 bevenhi = lp_build_extract_range(gallivm, beven, 4, 4);
1134 aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4);
1135 aoddhi = lp_build_extract_range(gallivm, aodd, 4, 4);
1136 boddlo = lp_build_extract_range(gallivm, bodd, 0, 4);
1137 boddhi = lp_build_extract_range(gallivm, bodd, 4, 4);
1138 muleven2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1139 wtype_half, aevenlo, bevenlo);
1140 mulodd2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1141 wtype_half, aoddlo, boddlo);
1142 muleven2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1143 wtype_half, aevenhi, bevenhi);
1144 mulodd2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1145 wtype_half, aoddhi, boddhi);
1146 muleven = lp_build_concat(gallivm, muleven2, type_wide_half, 2);
1147 mulodd = lp_build_concat(gallivm, mulodd2, type_wide_half, 2);
1148
1149 }
1150 else {
1151 muleven = lp_build_intrinsic_binary(builder, intrinsic,
1152 wider_type, aeven, beven);
1153 mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1154 wider_type, aodd, bodd);
1155 }
1156 }
1157 muleven = LLVMBuildBitCast(builder, muleven, bld->vec_type, "");
1158 mulodd = LLVMBuildBitCast(builder, mulodd, bld->vec_type, "");
1159
1160 for (i = 0; i < bld->type.length; i += 2) {
1161 shuf[i] = lp_build_const_int32(gallivm, i + 1);
1162 shuf[i+1] = lp_build_const_int32(gallivm, i + 1 + bld->type.length);
1163 }
1164 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1165 *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1166
1167 for (i = 0; i < bld->type.length; i += 2) {
1168 shuf[i] = lp_build_const_int32(gallivm, i);
1169 shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length);
1170 }
1171 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1172 return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1173 }
1174 else {
1175 return lp_build_mul_32_lohi(bld, a, b, res_hi);
1176 }
1177 }
1178
1179
1180 /*
1181 * Widening mul, valid for 32x32 bit -> 64bit only.
1182 * Result is low 32bits, high bits returned in res_hi.
1183 *
1184 * Emits generic code.
1185 */
1186 LLVMValueRef
1187 lp_build_mul_32_lohi(struct lp_build_context *bld,
1188 LLVMValueRef a,
1189 LLVMValueRef b,
1190 LLVMValueRef *res_hi)
1191 {
1192 struct gallivm_state *gallivm = bld->gallivm;
1193 LLVMBuilderRef builder = gallivm->builder;
1194 LLVMValueRef tmp, shift, res_lo;
1195 struct lp_type type_tmp;
1196 LLVMTypeRef wide_type, narrow_type;
1197
1198 type_tmp = bld->type;
1199 narrow_type = lp_build_vec_type(gallivm, type_tmp);
1200 type_tmp.width *= 2;
1201 wide_type = lp_build_vec_type(gallivm, type_tmp);
1202 shift = lp_build_const_vec(gallivm, type_tmp, 32);
1203
1204 if (bld->type.sign) {
1205 a = LLVMBuildSExt(builder, a, wide_type, "");
1206 b = LLVMBuildSExt(builder, b, wide_type, "");
1207 } else {
1208 a = LLVMBuildZExt(builder, a, wide_type, "");
1209 b = LLVMBuildZExt(builder, b, wide_type, "");
1210 }
1211 tmp = LLVMBuildMul(builder, a, b, "");
1212
1213 res_lo = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1214
1215 /* Since we truncate anyway, LShr and AShr are equivalent. */
1216 tmp = LLVMBuildLShr(builder, tmp, shift, "");
1217 *res_hi = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1218
1219 return res_lo;
1220 }
1221
1222
1223 /* a * b + c */
1224 LLVMValueRef
1225 lp_build_mad(struct lp_build_context *bld,
1226 LLVMValueRef a,
1227 LLVMValueRef b,
1228 LLVMValueRef c)
1229 {
1230 const struct lp_type type = bld->type;
1231 if (type.floating) {
1232 return lp_build_fmuladd(bld->gallivm->builder, a, b, c);
1233 } else {
1234 return lp_build_add(bld, lp_build_mul(bld, a, b), c);
1235 }
1236 }
1237
1238
1239 /**
1240 * Small vector x scale multiplication optimization.
1241 */
1242 LLVMValueRef
1243 lp_build_mul_imm(struct lp_build_context *bld,
1244 LLVMValueRef a,
1245 int b)
1246 {
1247 LLVMBuilderRef builder = bld->gallivm->builder;
1248 LLVMValueRef factor;
1249
1250 assert(lp_check_value(bld->type, a));
1251
1252 if(b == 0)
1253 return bld->zero;
1254
1255 if(b == 1)
1256 return a;
1257
1258 if(b == -1)
1259 return lp_build_negate(bld, a);
1260
1261 if(b == 2 && bld->type.floating)
1262 return lp_build_add(bld, a, a);
1263
1264 if(util_is_power_of_two_or_zero(b)) {
1265 unsigned shift = ffs(b) - 1;
1266
1267 if(bld->type.floating) {
1268 #if 0
1269 /*
1270 * Power of two multiplication by directly manipulating the exponent.
1271 *
1272 * XXX: This might not be always faster, it will introduce a small error
1273 * for multiplication by zero, and it will produce wrong results
1274 * for Inf and NaN.
1275 */
1276 unsigned mantissa = lp_mantissa(bld->type);
1277 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1278 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1279 a = LLVMBuildAdd(builder, a, factor, "");
1280 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1281 return a;
1282 #endif
1283 }
1284 else {
1285 factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1286 return LLVMBuildShl(builder, a, factor, "");
1287 }
1288 }
1289
1290 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1291 return lp_build_mul(bld, a, factor);
1292 }
1293
1294
1295 /**
1296 * Generate a / b
1297 */
1298 LLVMValueRef
1299 lp_build_div(struct lp_build_context *bld,
1300 LLVMValueRef a,
1301 LLVMValueRef b)
1302 {
1303 LLVMBuilderRef builder = bld->gallivm->builder;
1304 const struct lp_type type = bld->type;
1305
1306 assert(lp_check_value(type, a));
1307 assert(lp_check_value(type, b));
1308
1309 if(a == bld->zero)
1310 return bld->zero;
1311 if(a == bld->one && type.floating)
1312 return lp_build_rcp(bld, b);
1313 if(b == bld->zero)
1314 return bld->undef;
1315 if(b == bld->one)
1316 return a;
1317 if(a == bld->undef || b == bld->undef)
1318 return bld->undef;
1319
1320 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1321 if (type.floating)
1322 return LLVMConstFDiv(a, b);
1323 else if (type.sign)
1324 return LLVMConstSDiv(a, b);
1325 else
1326 return LLVMConstUDiv(a, b);
1327 }
1328
1329 /* fast rcp is disabled (just uses div), so makes no sense to try that */
1330 if(FALSE &&
1331 ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1332 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
1333 type.floating)
1334 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1335
1336 if (type.floating)
1337 return LLVMBuildFDiv(builder, a, b, "");
1338 else if (type.sign)
1339 return LLVMBuildSDiv(builder, a, b, "");
1340 else
1341 return LLVMBuildUDiv(builder, a, b, "");
1342 }
1343
1344
1345 /**
1346 * Linear interpolation helper.
1347 *
1348 * @param flags LP_BLD_LERP_WIDE_NORMALIZED indicates we are interpolating
1349 * normalized values, encoded in integers twice as wide as the original type.
1350 *
1351 * @sa http://www.stereopsis.com/doubleblend.html
1352 */
1353 static inline LLVMValueRef
1354 lp_build_lerp_simple(struct lp_build_context *bld,
1355 LLVMValueRef x,
1356 LLVMValueRef v0,
1357 LLVMValueRef v1,
1358 unsigned flags)
1359 {
1360 unsigned half_width = bld->type.width/2;
1361 LLVMBuilderRef builder = bld->gallivm->builder;
1362 LLVMValueRef delta;
1363 LLVMValueRef res;
1364
1365 assert(lp_check_value(bld->type, x));
1366 assert(lp_check_value(bld->type, v0));
1367 assert(lp_check_value(bld->type, v1));
1368
1369 delta = lp_build_sub(bld, v1, v0);
1370
1371 if (bld->type.floating) {
1372 assert(flags == 0);
1373 return lp_build_mad(bld, x, delta, v0);
1374 }
1375
1376 if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1377 if (!bld->type.sign) {
1378 if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1379 /*
1380 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1381 * most significant bit to the least significant bit, so that
1382 * later we can just divide by 2**n instead of 2**n - 1.
1383 */
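/*
 * Illustrative example (numbers for illustration only): with 8-bit weights
 * unpacked to 16 bits, half_width is 8; for x = 255 the line below yields
 * 255 + (255 >> 7) = 256, so the later (x * delta) >> 8 returns delta
 * exactly at full weight.
 */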
1384
1385 x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1386 }
1387
1388 /* (x * delta) >> n */
1389 res = lp_build_mul(bld, x, delta);
1390 res = lp_build_shr_imm(bld, res, half_width);
1391 } else {
1392 /*
1393 * The rescaling trick above doesn't work for signed numbers, so
1394 * use the 2**n - 1 division approximation in lp_build_mul_norm
1395 * instead.
1396 */
1397 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1398 res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1399 }
1400 } else {
1401 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1402 res = lp_build_mul(bld, x, delta);
1403 }
1404
1405 if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
1406 /*
1407 * At this point both res and v0 only use the lower half of the bits,
1408 * the rest is zero. Instead of add / mask, do add with half wide type.
1409 */
1410 struct lp_type narrow_type;
1411 struct lp_build_context narrow_bld;
1412
1413 memset(&narrow_type, 0, sizeof narrow_type);
1414 narrow_type.sign = bld->type.sign;
1415 narrow_type.width = bld->type.width/2;
1416 narrow_type.length = bld->type.length*2;
1417
1418 lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
1419 res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
1420 v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
1421 res = lp_build_add(&narrow_bld, v0, res);
1422 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
1423 } else {
1424 res = lp_build_add(bld, v0, res);
1425
1426 if (bld->type.fixed) {
1427 /*
1428 * We need to mask out the high order bits when lerping 8bit
1429 * normalized colors stored in 16 bits.
1430 */
1431 /* XXX: This step is necessary for lerping 8bit colors stored in
1432 * 16 bits, but it will be wrong for true fixed point use cases.
1433 * Basically we need a more powerful lp_type, capable of further
1434 * distinguishing the values interpretation from the value storage.
1435 */
1436 LLVMValueRef low_bits;
1437 low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
1438 res = LLVMBuildAnd(builder, res, low_bits, "");
1439 }
1440 }
1441
1442 return res;
1443 }
1444
1445
1446 /**
1447 * Linear interpolation.
1448 */
1449 LLVMValueRef
1450 lp_build_lerp(struct lp_build_context *bld,
1451 LLVMValueRef x,
1452 LLVMValueRef v0,
1453 LLVMValueRef v1,
1454 unsigned flags)
1455 {
1456 const struct lp_type type = bld->type;
1457 LLVMValueRef res;
1458
1459 assert(lp_check_value(type, x));
1460 assert(lp_check_value(type, v0));
1461 assert(lp_check_value(type, v1));
1462
1463 assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1464
1465 if (type.norm) {
1466 struct lp_type wide_type;
1467 struct lp_build_context wide_bld;
1468 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1469
1470 assert(type.length >= 2);
1471
1472 /*
1473 * Create a wider integer type, enough to hold the
1474 * intermediate result of the multiplication.
1475 */
1476 memset(&wide_type, 0, sizeof wide_type);
1477 wide_type.sign = type.sign;
1478 wide_type.width = type.width*2;
1479 wide_type.length = type.length/2;
1480
1481 lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1482
1483 lp_build_unpack2_native(bld->gallivm, type, wide_type, x, &xl, &xh);
1484 lp_build_unpack2_native(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1485 lp_build_unpack2_native(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1486
1487 /*
1488 * Lerp both halves.
1489 */
1490
1491 flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1492
1493 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1494 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1495
1496 res = lp_build_pack2_native(bld->gallivm, wide_type, type, resl, resh);
1497 } else {
1498 res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1499 }
1500
1501 return res;
1502 }
1503
1504
1505 /**
1506 * Bilinear interpolation.
1507 *
1508 * Value indices are in v_{yx}.
1509 */
1510 LLVMValueRef
1511 lp_build_lerp_2d(struct lp_build_context *bld,
1512 LLVMValueRef x,
1513 LLVMValueRef y,
1514 LLVMValueRef v00,
1515 LLVMValueRef v01,
1516 LLVMValueRef v10,
1517 LLVMValueRef v11,
1518 unsigned flags)
1519 {
1520 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1521 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1522 return lp_build_lerp(bld, y, v0, v1, flags);
1523 }
1524
1525
1526 LLVMValueRef
1527 lp_build_lerp_3d(struct lp_build_context *bld,
1528 LLVMValueRef x,
1529 LLVMValueRef y,
1530 LLVMValueRef z,
1531 LLVMValueRef v000,
1532 LLVMValueRef v001,
1533 LLVMValueRef v010,
1534 LLVMValueRef v011,
1535 LLVMValueRef v100,
1536 LLVMValueRef v101,
1537 LLVMValueRef v110,
1538 LLVMValueRef v111,
1539 unsigned flags)
1540 {
1541 LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1542 LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1543 return lp_build_lerp(bld, z, v0, v1, flags);
1544 }
1545
1546
1547 /**
1548 * Generate min(a, b)
1549 * Do checks for special cases but not for NaNs.
1550 */
1551 LLVMValueRef
1552 lp_build_min(struct lp_build_context *bld,
1553 LLVMValueRef a,
1554 LLVMValueRef b)
1555 {
1556 assert(lp_check_value(bld->type, a));
1557 assert(lp_check_value(bld->type, b));
1558
1559 if(a == bld->undef || b == bld->undef)
1560 return bld->undef;
1561
1562 if(a == b)
1563 return a;
1564
1565 if (bld->type.norm) {
1566 if (!bld->type.sign) {
1567 if (a == bld->zero || b == bld->zero) {
1568 return bld->zero;
1569 }
1570 }
1571 if(a == bld->one)
1572 return b;
1573 if(b == bld->one)
1574 return a;
1575 }
1576
1577 return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1578 }
1579
1580
1581 /**
1582 * Generate min(a, b)
1583 * NaNs are handled according to the behavior specified by the
1584 * nan_behavior argument.
1585 */
1586 LLVMValueRef
1587 lp_build_min_ext(struct lp_build_context *bld,
1588 LLVMValueRef a,
1589 LLVMValueRef b,
1590 enum gallivm_nan_behavior nan_behavior)
1591 {
1592 assert(lp_check_value(bld->type, a));
1593 assert(lp_check_value(bld->type, b));
1594
1595 if(a == bld->undef || b == bld->undef)
1596 return bld->undef;
1597
1598 if(a == b)
1599 return a;
1600
1601 if (bld->type.norm) {
1602 if (!bld->type.sign) {
1603 if (a == bld->zero || b == bld->zero) {
1604 return bld->zero;
1605 }
1606 }
1607 if(a == bld->one)
1608 return b;
1609 if(b == bld->one)
1610 return a;
1611 }
1612
1613 return lp_build_min_simple(bld, a, b, nan_behavior);
1614 }
1615
1616 /**
1617 * Generate max(a, b)
1618 * Do checks for special cases, but NaN behavior is undefined.
1619 */
1620 LLVMValueRef
1621 lp_build_max(struct lp_build_context *bld,
1622 LLVMValueRef a,
1623 LLVMValueRef b)
1624 {
1625 assert(lp_check_value(bld->type, a));
1626 assert(lp_check_value(bld->type, b));
1627
1628 if(a == bld->undef || b == bld->undef)
1629 return bld->undef;
1630
1631 if(a == b)
1632 return a;
1633
1634 if(bld->type.norm) {
1635 if(a == bld->one || b == bld->one)
1636 return bld->one;
1637 if (!bld->type.sign) {
1638 if (a == bld->zero) {
1639 return b;
1640 }
1641 if (b == bld->zero) {
1642 return a;
1643 }
1644 }
1645 }
1646
1647 return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1648 }
1649
1650
1651 /**
1652 * Generate max(a, b)
1653 * Checks for special cases.
1654 * NaNs are handled according to the behavior specified by the
1655 * nan_behavior argument.
1656 */
1657 LLVMValueRef
1658 lp_build_max_ext(struct lp_build_context *bld,
1659 LLVMValueRef a,
1660 LLVMValueRef b,
1661 enum gallivm_nan_behavior nan_behavior)
1662 {
1663 assert(lp_check_value(bld->type, a));
1664 assert(lp_check_value(bld->type, b));
1665
1666 if(a == bld->undef || b == bld->undef)
1667 return bld->undef;
1668
1669 if(a == b)
1670 return a;
1671
1672 if(bld->type.norm) {
1673 if(a == bld->one || b == bld->one)
1674 return bld->one;
1675 if (!bld->type.sign) {
1676 if (a == bld->zero) {
1677 return b;
1678 }
1679 if (b == bld->zero) {
1680 return a;
1681 }
1682 }
1683 }
1684
1685 return lp_build_max_simple(bld, a, b, nan_behavior);
1686 }
1687
1688 /**
1689 * Generate clamp(a, min, max)
1690 * NaN behavior (for any of a, min, max) is undefined.
1691 * Do checks for special cases.
1692 */
1693 LLVMValueRef
1694 lp_build_clamp(struct lp_build_context *bld,
1695 LLVMValueRef a,
1696 LLVMValueRef min,
1697 LLVMValueRef max)
1698 {
1699 assert(lp_check_value(bld->type, a));
1700 assert(lp_check_value(bld->type, min));
1701 assert(lp_check_value(bld->type, max));
1702
1703 a = lp_build_min(bld, a, max);
1704 a = lp_build_max(bld, a, min);
1705 return a;
1706 }
1707
1708
1709 /**
1710 * Generate clamp(a, 0, 1)
1711 * A NaN will get converted to zero.
1712 */
1713 LLVMValueRef
1714 lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
1715 LLVMValueRef a)
1716 {
1717 a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1718 a = lp_build_min(bld, a, bld->one);
1719 return a;
1720 }
1721
1722
1723 /**
1724 * Generate abs(a)
1725 */
1726 LLVMValueRef
1727 lp_build_abs(struct lp_build_context *bld,
1728 LLVMValueRef a)
1729 {
1730 LLVMBuilderRef builder = bld->gallivm->builder;
1731 const struct lp_type type = bld->type;
1732 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1733
1734 assert(lp_check_value(type, a));
1735
1736 if(!type.sign)
1737 return a;
1738
1739 if(type.floating) {
1740 char intrinsic[32];
1741 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
1742 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
1743 }
1744
1745 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3 && LLVM_VERSION_MAJOR < 6) {
1746 switch(type.width) {
1747 case 8:
1748 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1749 case 16:
1750 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1751 case 32:
1752 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1753 }
1754 }
1755 else if (type.width*type.length == 256 && util_cpu_caps.has_avx2 && LLVM_VERSION_MAJOR < 6) {
1756 switch(type.width) {
1757 case 8:
1758 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a);
1759 case 16:
1760 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.w", vec_type, a);
1761 case 32:
1762 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.d", vec_type, a);
1763 }
1764 }
1765
1766 return lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero),
1767 a, LLVMBuildNeg(builder, a, ""));
1768 }
1769
1770
1771 LLVMValueRef
1772 lp_build_negate(struct lp_build_context *bld,
1773 LLVMValueRef a)
1774 {
1775 LLVMBuilderRef builder = bld->gallivm->builder;
1776
1777 assert(lp_check_value(bld->type, a));
1778
1779 if (bld->type.floating)
1780 a = LLVMBuildFNeg(builder, a, "");
1781 else
1782 a = LLVMBuildNeg(builder, a, "");
1783
1784 return a;
1785 }
1786
1787
1788 /** Return -1, 0 or +1 depending on the sign of a */
1789 LLVMValueRef
1790 lp_build_sgn(struct lp_build_context *bld,
1791 LLVMValueRef a)
1792 {
1793 LLVMBuilderRef builder = bld->gallivm->builder;
1794 const struct lp_type type = bld->type;
1795 LLVMValueRef cond;
1796 LLVMValueRef res;
1797
1798 assert(lp_check_value(type, a));
1799
1800 /* Handle non-zero case */
1801 if(!type.sign) {
1802 /* if not zero then sign must be positive */
1803 res = bld->one;
1804 }
1805 else if(type.floating) {
1806 LLVMTypeRef vec_type;
1807 LLVMTypeRef int_type;
1808 LLVMValueRef mask;
1809 LLVMValueRef sign;
1810 LLVMValueRef one;
1811 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1812
1813 int_type = lp_build_int_vec_type(bld->gallivm, type);
1814 vec_type = lp_build_vec_type(bld->gallivm, type);
1815 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1816
1817 /* Take the sign bit and add it to 1 constant */
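/*
 * Illustrative example (values for illustration only): for 32-bit floats
 * bld->one has the bit pattern 0x3f800000; OR-ing in a sign bit of
 * 0x80000000 (e.g. from a = -3.5f) gives 0xbf800000, i.e. -1.0f.
 */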
1818 sign = LLVMBuildBitCast(builder, a, int_type, "");
1819 sign = LLVMBuildAnd(builder, sign, mask, "");
1820 one = LLVMConstBitCast(bld->one, int_type);
1821 res = LLVMBuildOr(builder, sign, one, "");
1822 res = LLVMBuildBitCast(builder, res, vec_type, "");
1823 }
1824 else
1825 {
1826 /* signed int/norm/fixed point */
1827 /* could use psign with ssse3 and appropriate vectors here */
1828 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1829 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1830 res = lp_build_select(bld, cond, bld->one, minus_one);
1831 }
1832
1833 /* Handle zero */
1834 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1835 res = lp_build_select(bld, cond, bld->zero, res);
1836
1837 return res;
1838 }
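
/*
 * Worked example for the floating point branch above (illustrative only):
 * for a = -3.5f the extracted sign bit, OR'ed into the bits of 1.0f, yields
 * -1.0f; the final equality test with zero then forces sgn(0.0) (and
 * sgn(-0.0)) to 0.
 */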
1839
1840
1841 /**
1842 * Set the sign of float vector 'a' according to 'sign'.
1843 * If sign==0, return abs(a).
1844 * If sign==1, return -abs(a);
1845 * Other values for sign produce undefined results.
1846 */
1847 LLVMValueRef
1848 lp_build_set_sign(struct lp_build_context *bld,
1849 LLVMValueRef a, LLVMValueRef sign)
1850 {
1851 LLVMBuilderRef builder = bld->gallivm->builder;
1852 const struct lp_type type = bld->type;
1853 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1854 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1855 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1856 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1857 ~((unsigned long long) 1 << (type.width - 1)));
1858 LLVMValueRef val, res;
1859
1860 assert(type.floating);
1861 assert(lp_check_value(type, a));
1862
1863 /* val = reinterpret_cast<int>(a) */
1864 val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1865 /* val = val & mask */
1866 val = LLVMBuildAnd(builder, val, mask, "");
1867 /* sign = sign << shift */
1868 sign = LLVMBuildShl(builder, sign, shift, "");
1869 /* res = val | sign */
1870 res = LLVMBuildOr(builder, val, sign, "");
1871 /* res = reinterpret_cast<float>(res) */
1872 res = LLVMBuildBitCast(builder, res, vec_type, "");
1873
1874 return res;
1875 }
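
/*
 * Worked example (illustrative only): lp_build_set_sign(bld, a, sign) with
 * a = 2.5f and sign = 1 clears the sign bit of a, shifts the 1 up into the
 * sign bit position and ORs it back in, giving -2.5f; with sign = 0 it
 * simply returns abs(a), as described in the comment above.
 */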
1876
1877
1878 /**
1879 * Convert vector of (or scalar) int to vector of (or scalar) float.
1880 */
1881 LLVMValueRef
1882 lp_build_int_to_float(struct lp_build_context *bld,
1883 LLVMValueRef a)
1884 {
1885 LLVMBuilderRef builder = bld->gallivm->builder;
1886 const struct lp_type type = bld->type;
1887 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1888
1889 assert(type.floating);
1890
1891 return LLVMBuildSIToFP(builder, a, vec_type, "");
1892 }
1893
1894 static boolean
1895 arch_rounding_available(const struct lp_type type)
1896 {
1897 if ((util_cpu_caps.has_sse4_1 &&
1898 (type.length == 1 || type.width*type.length == 128)) ||
1899 (util_cpu_caps.has_avx && type.width*type.length == 256) ||
1900 (util_cpu_caps.has_avx512f && type.width*type.length == 512))
1901 return TRUE;
1902 else if ((util_cpu_caps.has_altivec &&
1903 (type.width == 32 && type.length == 4)))
1904 return TRUE;
1905 else if (util_cpu_caps.has_neon)
1906 return TRUE;
1907
1908 return FALSE;
1909 }
1910
1911 enum lp_build_round_mode
1912 {
1913 LP_BUILD_ROUND_NEAREST = 0,
1914 LP_BUILD_ROUND_FLOOR = 1,
1915 LP_BUILD_ROUND_CEIL = 2,
1916 LP_BUILD_ROUND_TRUNCATE = 3
1917 };
1918
1919 static inline LLVMValueRef
1920 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1921 LLVMValueRef a)
1922 {
1923 LLVMBuilderRef builder = bld->gallivm->builder;
1924 const struct lp_type type = bld->type;
1925 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1926 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1927 const char *intrinsic;
1928 LLVMValueRef res;
1929
1930 assert(type.floating);
1931 /* using the double precision conversions is a bit more complicated */
1932 assert(type.width == 32);
1933
1934 assert(lp_check_value(type, a));
1935 assert(util_cpu_caps.has_sse2);
1936
1937 /* This is relying on MXCSR rounding mode, which should always be nearest. */
1938 if (type.length == 1) {
1939 LLVMTypeRef vec_type;
1940 LLVMValueRef undef;
1941 LLVMValueRef arg;
1942 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1943
1944 vec_type = LLVMVectorType(bld->elem_type, 4);
1945
1946 intrinsic = "llvm.x86.sse.cvtss2si";
1947
1948 undef = LLVMGetUndef(vec_type);
1949
1950 arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
1951
1952 res = lp_build_intrinsic_unary(builder, intrinsic,
1953 ret_type, arg);
1954 }
1955 else {
1956 if (type.width* type.length == 128) {
1957 intrinsic = "llvm.x86.sse2.cvtps2dq";
1958 }
1959 else {
1960 assert(type.width*type.length == 256);
1961 assert(util_cpu_caps.has_avx);
1962
1963 intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
1964 }
1965 res = lp_build_intrinsic_unary(builder, intrinsic,
1966 ret_type, a);
1967 }
1968
1969 return res;
1970 }
1971
1972
1973 /* Round the given vector of floats according to the requested mode,
1974  * using the AltiVec vrfi* round-to-integral intrinsics. */
1975 static inline LLVMValueRef
1976 lp_build_round_altivec(struct lp_build_context *bld,
1977 LLVMValueRef a,
1978 enum lp_build_round_mode mode)
1979 {
1980 LLVMBuilderRef builder = bld->gallivm->builder;
1981 const struct lp_type type = bld->type;
1982 const char *intrinsic = NULL;
1983
1984 assert(type.floating);
1985
1986 assert(lp_check_value(type, a));
1987 assert(util_cpu_caps.has_altivec);
1988
1989 (void)type;
1990
1991 switch (mode) {
1992 case LP_BUILD_ROUND_NEAREST:
1993 intrinsic = "llvm.ppc.altivec.vrfin";
1994 break;
1995 case LP_BUILD_ROUND_FLOOR:
1996 intrinsic = "llvm.ppc.altivec.vrfim";
1997 break;
1998 case LP_BUILD_ROUND_CEIL:
1999 intrinsic = "llvm.ppc.altivec.vrfip";
2000 break;
2001 case LP_BUILD_ROUND_TRUNCATE:
2002 intrinsic = "llvm.ppc.altivec.vrfiz";
2003 break;
2004 }
2005
2006 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2007 }
2008
2009 static inline LLVMValueRef
2010 lp_build_round_arch(struct lp_build_context *bld,
2011 LLVMValueRef a,
2012 enum lp_build_round_mode mode)
2013 {
2014 if (util_cpu_caps.has_sse4_1 || util_cpu_caps.has_neon) {
2015 LLVMBuilderRef builder = bld->gallivm->builder;
2016 const struct lp_type type = bld->type;
2017 const char *intrinsic_root;
2018 char intrinsic[32];
2019
2020 assert(type.floating);
2021 assert(lp_check_value(type, a));
2022 (void)type;
2023
2024 switch (mode) {
2025 case LP_BUILD_ROUND_NEAREST:
2026 intrinsic_root = "llvm.nearbyint";
2027 break;
2028 case LP_BUILD_ROUND_FLOOR:
2029 intrinsic_root = "llvm.floor";
2030 break;
2031 case LP_BUILD_ROUND_CEIL:
2032 intrinsic_root = "llvm.ceil";
2033 break;
2034 case LP_BUILD_ROUND_TRUNCATE:
2035 intrinsic_root = "llvm.trunc";
2036 break;
2037 }
2038
2039 lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type);
2040 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2041 }
2042 else /* (util_cpu_caps.has_altivec) */
2043 return lp_build_round_altivec(bld, a, mode);
2044 }
2045
2046 /**
2047 * Return the integer part of a float (vector) value (== round toward zero).
2048 * The returned value is a float (vector).
2049 * Ex: trunc(-1.5) = -1.0
2050 */
2051 LLVMValueRef
2052 lp_build_trunc(struct lp_build_context *bld,
2053 LLVMValueRef a)
2054 {
2055 LLVMBuilderRef builder = bld->gallivm->builder;
2056 const struct lp_type type = bld->type;
2057
2058 assert(type.floating);
2059 assert(lp_check_value(type, a));
2060
2061 if (arch_rounding_available(type)) {
2062 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
2063 }
2064 else {
2065 const struct lp_type type = bld->type;
2066 struct lp_type inttype;
2067 struct lp_build_context intbld;
2068 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2069 LLVMValueRef trunc, res, anosign, mask;
2070 LLVMTypeRef int_vec_type = bld->int_vec_type;
2071 LLVMTypeRef vec_type = bld->vec_type;
2072
2073 assert(type.width == 32); /* might want to handle doubles at some point */
2074
2075 inttype = type;
2076 inttype.floating = 0;
2077 lp_build_context_init(&intbld, bld->gallivm, inttype);
2078
2079 /* round by truncation */
2080 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2081 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2082
2083 /* mask out sign bit */
2084 anosign = lp_build_abs(bld, a);
2085 /*
2086 * mask out all values if anosign > 2^24
2087 * This should work both for large ints (all rounding is no-op for them
2088 * because such floats are always exact) as well as special cases like
2089 * NaNs, Infs (taking advantage of the fact they use max exponent).
2090 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2091 */
2092 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2093 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2094 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2095 return lp_build_select(bld, mask, a, res);
2096 }
2097 }
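
/*
 * For reference, a scalar sketch of the fallback path above (illustrative
 * only; "trunc_ref" is a hypothetical helper, assuming <math.h>):
 *
 *    float trunc_ref(float a)
 *    {
 *       if (isnan(a) || fabsf(a) > 16777216.0f) // 2^24: already integral,
 *          return a;                            // or NaN/Inf, pass through
 *       return (float)(int)a;                   // fptosi/sitofp truncates
 *    }
 *
 * The vector code gets the same effect branchlessly with the compare/select
 * on the sign-masked bit pattern.
 */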
2098
2099
2100 /**
2101 * Return float (vector) rounded to nearest integer (vector). The returned
2102 * value is a float (vector).
2103 * Ex: round(0.9) = 1.0
2104 * Ex: round(-1.5) = -2.0
2105 */
2106 LLVMValueRef
2107 lp_build_round(struct lp_build_context *bld,
2108 LLVMValueRef a)
2109 {
2110 LLVMBuilderRef builder = bld->gallivm->builder;
2111 const struct lp_type type = bld->type;
2112
2113 assert(type.floating);
2114 assert(lp_check_value(type, a));
2115
2116 if (arch_rounding_available(type)) {
2117 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2118 }
2119 else {
2120 const struct lp_type type = bld->type;
2121 struct lp_type inttype;
2122 struct lp_build_context intbld;
2123 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2124 LLVMValueRef res, anosign, mask;
2125 LLVMTypeRef int_vec_type = bld->int_vec_type;
2126 LLVMTypeRef vec_type = bld->vec_type;
2127
2128 assert(type.width == 32); /* might want to handle doubles at some point */
2129
2130 inttype = type;
2131 inttype.floating = 0;
2132 lp_build_context_init(&intbld, bld->gallivm, inttype);
2133
2134 res = lp_build_iround(bld, a);
2135 res = LLVMBuildSIToFP(builder, res, vec_type, "");
2136
2137 /* mask out sign bit */
2138 anosign = lp_build_abs(bld, a);
2139 /*
2140 * mask out all values if anosign > 2^24
2141 * This should work both for large ints (all rounding is no-op for them
2142 * because such floats are always exact) as well as special cases like
2143 * NaNs, Infs (taking advantage of the fact they use max exponent).
2144 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2145 */
2146 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2147 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2148 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2149 return lp_build_select(bld, mask, a, res);
2150 }
2151 }
2152
2153
2154 /**
2155 * Return floor of float (vector), result is a float (vector)
2156 * Ex: floor(1.1) = 1.0
2157 * Ex: floor(-1.1) = -2.0
2158 */
2159 LLVMValueRef
2160 lp_build_floor(struct lp_build_context *bld,
2161 LLVMValueRef a)
2162 {
2163 LLVMBuilderRef builder = bld->gallivm->builder;
2164 const struct lp_type type = bld->type;
2165
2166 assert(type.floating);
2167 assert(lp_check_value(type, a));
2168
2169 if (arch_rounding_available(type)) {
2170 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2171 }
2172 else {
2173 const struct lp_type type = bld->type;
2174 struct lp_type inttype;
2175 struct lp_build_context intbld;
2176 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2177 LLVMValueRef trunc, res, anosign, mask;
2178 LLVMTypeRef int_vec_type = bld->int_vec_type;
2179 LLVMTypeRef vec_type = bld->vec_type;
2180
2181 if (type.width != 32) {
2182 char intrinsic[32];
2183 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type);
2184 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2185 }
2186
2187 assert(type.width == 32); /* might want to handle doubles at some point */
2188
2189 inttype = type;
2190 inttype.floating = 0;
2191 lp_build_context_init(&intbld, bld->gallivm, inttype);
2192
2193 /* round by truncation */
2194 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2195 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2196
2197 if (type.sign) {
2198 LLVMValueRef tmp;
2199
2200 /*
2201 * fix values if rounding is wrong (for non-special cases)
2202 * - this is the case if trunc > a
2203 */
2204 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
2205 /* tmp = trunc > a ? 1.0 : 0.0 */
2206 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2207 tmp = lp_build_and(&intbld, mask, tmp);
2208 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2209 res = lp_build_sub(bld, res, tmp);
2210 }
2211
2212 /* mask out sign bit */
2213 anosign = lp_build_abs(bld, a);
2214 /*
2215 * mask out all values if anosign > 2^24
2216 * This should work both for large ints (all rounding is no-op for them
2217 * because such floats are always exact) as well as special cases like
2218 * NaNs, Infs (taking advantage of the fact they use max exponent).
2219 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2220 */
2221 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2222 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2223 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2224 return lp_build_select(bld, mask, a, res);
2225 }
2226 }
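
/*
 * Worked example for the sign fixup above (illustrative only): for a = -1.2
 * the truncation gives trunc = -1.0, which is greater than a, so 1.0 is
 * subtracted (via the all-ones compare mask AND'ed with the bits of 1.0),
 * giving floor(-1.2) = -2.0.  For a = 1.2 the truncation already equals the
 * floor, the mask is zero and nothing is subtracted.
 */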
2227
2228
2229 /**
2230 * Return ceiling of float (vector), returning float (vector).
2231 * Ex: ceil( 1.1) = 2.0
2232 * Ex: ceil(-1.1) = -1.0
2233 */
2234 LLVMValueRef
2235 lp_build_ceil(struct lp_build_context *bld,
2236 LLVMValueRef a)
2237 {
2238 LLVMBuilderRef builder = bld->gallivm->builder;
2239 const struct lp_type type = bld->type;
2240
2241 assert(type.floating);
2242 assert(lp_check_value(type, a));
2243
2244 if (arch_rounding_available(type)) {
2245 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2246 }
2247 else {
2248 const struct lp_type type = bld->type;
2249 struct lp_type inttype;
2250 struct lp_build_context intbld;
2251 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2252 LLVMValueRef trunc, res, anosign, mask, tmp;
2253 LLVMTypeRef int_vec_type = bld->int_vec_type;
2254 LLVMTypeRef vec_type = bld->vec_type;
2255
2256 if (type.width != 32) {
2257 char intrinsic[32];
2258 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type);
2259 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2260 }
2261
2262 assert(type.width == 32); /* might want to handle doubles at some point */
2263
2264 inttype = type;
2265 inttype.floating = 0;
2266 lp_build_context_init(&intbld, bld->gallivm, inttype);
2267
2268 /* round by truncation */
2269 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2270 trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2271
2272 /*
2273 * fix values if rounding is wrong (for non-special cases)
2274 * - this is the case if trunc < a
2275 */
2276 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2277 /* tmp = trunc < a ? 1.0 : 0.0 */
2278 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2279 tmp = lp_build_and(&intbld, mask, tmp);
2280 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2281 res = lp_build_add(bld, trunc, tmp);
2282
2283 /* mask out sign bit */
2284 anosign = lp_build_abs(bld, a);
2285 /*
2286 * mask out all values if anosign > 2^24
2287 * This should work both for large ints (all rounding is no-op for them
2288 * because such floats are always exact) as well as special cases like
2289 * NaNs, Infs (taking advantage of the fact they use max exponent).
2290 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2291 */
2292 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2293 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2294 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2295 return lp_build_select(bld, mask, a, res);
2296 }
2297 }
2298
2299
2300 /**
2301 * Return fractional part of 'a' computed as a - floor(a)
2302 * Typically used in texture coord arithmetic.
2303 */
2304 LLVMValueRef
2305 lp_build_fract(struct lp_build_context *bld,
2306 LLVMValueRef a)
2307 {
2308 assert(bld->type.floating);
2309 return lp_build_sub(bld, a, lp_build_floor(bld, a));
2310 }
2311
2312
2313 /**
2314 * Prevent returning 1.0 for very small negative values of 'a' by clamping
2315 * against 0.99999(9). (Will also return that value for NaNs.)
2316 */
2317 static inline LLVMValueRef
2318 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2319 {
2320 LLVMValueRef max;
2321
2322 /* this is the largest number smaller than 1.0 representable as float */
2323 max = lp_build_const_vec(bld->gallivm, bld->type,
2324 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2325 return lp_build_min_ext(bld, fract, max,
2326 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
2327 }
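
/*
 * Illustrative note: for 32-bit floats lp_mantissa() is 23, so the constant
 * above evaluates to 1.0 - 2^-24 = 0.99999994 (0x3f7fffff), i.e. exactly the
 * largest float below 1.0, and the min() can therefore never round the
 * fractional part back up to 1.0.
 */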
2328
2329
2330 /**
2331 * Same as lp_build_fract, but guarantees that the result is always smaller
2332 * than one. Will also return the smaller-than-one value for infs, NaNs.
2333 */
2334 LLVMValueRef
2335 lp_build_fract_safe(struct lp_build_context *bld,
2336 LLVMValueRef a)
2337 {
2338 return clamp_fract(bld, lp_build_fract(bld, a));
2339 }
2340
2341
2342 /**
2343 * Return the integer part of a float (vector) value (== round toward zero).
2344 * The returned value is an integer (vector).
2345 * Ex: itrunc(-1.5) = -1
2346 */
2347 LLVMValueRef
2348 lp_build_itrunc(struct lp_build_context *bld,
2349 LLVMValueRef a)
2350 {
2351 LLVMBuilderRef builder = bld->gallivm->builder;
2352 const struct lp_type type = bld->type;
2353 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2354
2355 assert(type.floating);
2356 assert(lp_check_value(type, a));
2357
2358 return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2359 }
2360
2361
2362 /**
2363 * Return float (vector) rounded to nearest integer (vector). The returned
2364 * value is an integer (vector).
2365 * Ex: iround(0.9) = 1
2366 * Ex: iround(-1.5) = -2
2367 */
2368 LLVMValueRef
2369 lp_build_iround(struct lp_build_context *bld,
2370 LLVMValueRef a)
2371 {
2372 LLVMBuilderRef builder = bld->gallivm->builder;
2373 const struct lp_type type = bld->type;
2374 LLVMTypeRef int_vec_type = bld->int_vec_type;
2375 LLVMValueRef res;
2376
2377 assert(type.floating);
2378
2379 assert(lp_check_value(type, a));
2380
2381 if ((util_cpu_caps.has_sse2 &&
2382 ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2383 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2384 return lp_build_iround_nearest_sse2(bld, a);
2385 }
2386 if (arch_rounding_available(type)) {
2387 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2388 }
2389 else {
2390 LLVMValueRef half;
2391
2392 half = lp_build_const_vec(bld->gallivm, type, nextafterf(0.5, 0.0));
2393
2394 if (type.sign) {
2395 LLVMTypeRef vec_type = bld->vec_type;
2396 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2397 (unsigned long long)1 << (type.width - 1));
2398 LLVMValueRef sign;
2399
2400 /* get sign bit */
2401 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2402 sign = LLVMBuildAnd(builder, sign, mask, "");
2403
2404 /* sign * 0.5 */
2405 half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2406 half = LLVMBuildOr(builder, sign, half, "");
2407 half = LLVMBuildBitCast(builder, half, vec_type, "");
2408 }
2409
2410 res = LLVMBuildFAdd(builder, a, half, "");
2411 }
2412
2413 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2414
2415 return res;
2416 }
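
/*
 * Scalar sketch of the add-half fallback above (illustrative only;
 * "iround_ref" is a hypothetical helper, assuming <math.h>):
 *
 *    int iround_ref(float a)
 *    {
 *       float half = nextafterf(0.5f, 0.0f);   // largest float below 0.5
 *       return (int)(a + copysignf(half, a));  // add ~+/-0.5, then truncate
 *    }
 *
 * Using the largest float below 0.5 rather than 0.5 itself avoids inputs
 * just under 0.5 being pushed over the next integer by the rounding of the
 * addition.
 */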
2417
2418
2419 /**
2420 * Return floor of float (vector), result is an int (vector)
2421 * Ex: ifloor(1.1) = 1
2422 * Ex: ifloor(-1.1) = -2
2423 */
2424 LLVMValueRef
2425 lp_build_ifloor(struct lp_build_context *bld,
2426 LLVMValueRef a)
2427 {
2428 LLVMBuilderRef builder = bld->gallivm->builder;
2429 const struct lp_type type = bld->type;
2430 LLVMTypeRef int_vec_type = bld->int_vec_type;
2431 LLVMValueRef res;
2432
2433 assert(type.floating);
2434 assert(lp_check_value(type, a));
2435
2436 res = a;
2437 if (type.sign) {
2438 if (arch_rounding_available(type)) {
2439 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2440 }
2441 else {
2442 struct lp_type inttype;
2443 struct lp_build_context intbld;
2444 LLVMValueRef trunc, itrunc, mask;
2445
2446 assert(type.floating);
2447 assert(lp_check_value(type, a));
2448
2449 inttype = type;
2450 inttype.floating = 0;
2451 lp_build_context_init(&intbld, bld->gallivm, inttype);
2452
2453 /* round by truncation */
2454 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2455 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2456
2457 /*
2458 * fix values if rounding is wrong (for non-special cases)
2459 * - this is the case if trunc > a
2460 * The results of doing this with NaNs, very large values etc.
2461 * are undefined, but that is already the case anyway.
2462 */
2463 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2464 /* cheapie minus one with mask since the mask is minus one / zero */
2465 return lp_build_add(&intbld, itrunc, mask);
2466 }
2467 }
2468
2469 /* round toward zero (truncate) */
2470 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2471
2472 return res;
2473 }
2474
2475
2476 /**
2477 * Return ceiling of float (vector), returning int (vector).
2478 * Ex: iceil( 1.1) = 2
2479 * Ex: iceil(-1.1) = -1
2480 */
2481 LLVMValueRef
2482 lp_build_iceil(struct lp_build_context *bld,
2483 LLVMValueRef a)
2484 {
2485 LLVMBuilderRef builder = bld->gallivm->builder;
2486 const struct lp_type type = bld->type;
2487 LLVMTypeRef int_vec_type = bld->int_vec_type;
2488 LLVMValueRef res;
2489
2490 assert(type.floating);
2491 assert(lp_check_value(type, a));
2492
2493 if (arch_rounding_available(type)) {
2494 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2495 }
2496 else {
2497 struct lp_type inttype;
2498 struct lp_build_context intbld;
2499 LLVMValueRef trunc, itrunc, mask;
2500
2501 assert(type.floating);
2502 assert(lp_check_value(type, a));
2503
2504 inttype = type;
2505 inttype.floating = 0;
2506 lp_build_context_init(&intbld, bld->gallivm, inttype);
2507
2508 /* round by truncation */
2509 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2510 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2511
2512 /*
2513 * fix values if rounding is wrong (for non-special cases)
2514 * - this is the case if trunc < a
2515 * The results of doing this with NaNs, very large values etc.
2516 * are undefined, but that is already the case anyway.
2517 */
2518 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2519 /* cheapie plus one with mask since the mask is minus one / zero */
2520 return lp_build_sub(&intbld, itrunc, mask);
2521 }
2522
2523 /* round toward zero (truncate) */
2524 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2525
2526 return res;
2527 }
2528
2529
2530 /**
2531 * Combined ifloor() & fract().
2532 *
2533 * Preferred to calling the functions separately, as it will ensure that the
2534 * strategy (floor() vs ifloor()) that results in less redundant work is used.
2535 */
2536 void
2537 lp_build_ifloor_fract(struct lp_build_context *bld,
2538 LLVMValueRef a,
2539 LLVMValueRef *out_ipart,
2540 LLVMValueRef *out_fpart)
2541 {
2542 LLVMBuilderRef builder = bld->gallivm->builder;
2543 const struct lp_type type = bld->type;
2544 LLVMValueRef ipart;
2545
2546 assert(type.floating);
2547 assert(lp_check_value(type, a));
2548
2549 if (arch_rounding_available(type)) {
2550 /*
2551 * floor() is easier.
2552 */
2553
2554 ipart = lp_build_floor(bld, a);
2555 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2556 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2557 }
2558 else {
2559 /*
2560 * ifloor() is easier.
2561 */
2562
2563 *out_ipart = lp_build_ifloor(bld, a);
2564 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2565 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2566 }
2567 }
2568
2569
2570 /**
2571 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2572 * always smaller than one.
2573 */
2574 void
2575 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2576 LLVMValueRef a,
2577 LLVMValueRef *out_ipart,
2578 LLVMValueRef *out_fpart)
2579 {
2580 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2581 *out_fpart = clamp_fract(bld, *out_fpart);
2582 }
2583
2584
2585 LLVMValueRef
2586 lp_build_sqrt(struct lp_build_context *bld,
2587 LLVMValueRef a)
2588 {
2589 LLVMBuilderRef builder = bld->gallivm->builder;
2590 const struct lp_type type = bld->type;
2591 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2592 char intrinsic[32];
2593
2594 assert(lp_check_value(type, a));
2595
2596 assert(type.floating);
2597 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);
2598
2599 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2600 }
2601
2602
2603 /**
2604 * Do one Newton-Raphson step to improve the reciprocal's precision:
2605 *
2606 * x_{i+1} = x_i + x_i * (1 - a * x_i)
2607 *
2608 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2609 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
2610 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
2611 * halo. It would be necessary to clamp the argument to prevent this.
2612 *
2613 * See also:
2614 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2615 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2616 */
2617 static inline LLVMValueRef
2618 lp_build_rcp_refine(struct lp_build_context *bld,
2619 LLVMValueRef a,
2620 LLVMValueRef rcp_a)
2621 {
2622 LLVMBuilderRef builder = bld->gallivm->builder;
2623 LLVMValueRef neg_a;
2624 LLVMValueRef res;
2625
2626 neg_a = LLVMBuildFNeg(builder, a, "");
2627 res = lp_build_fmuladd(builder, neg_a, rcp_a, bld->one);
2628 res = lp_build_fmuladd(builder, res, rcp_a, rcp_a);
2629
2630 return res;
2631 }
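
/*
 * For reference, an illustrative expansion of the two fused multiply-adds
 * above (the same formula as in the comment, just written out):
 *
 *    res = 1 - a * rcp_a                          // first fmuladd
 *    res = res * rcp_a + rcp_a                    // second fmuladd
 *        = rcp_a + rcp_a * (1 - a * rcp_a)
 *        = rcp_a * (2 - a * rcp_a)
 */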
2632
2633
2634 LLVMValueRef
2635 lp_build_rcp(struct lp_build_context *bld,
2636 LLVMValueRef a)
2637 {
2638 LLVMBuilderRef builder = bld->gallivm->builder;
2639 const struct lp_type type = bld->type;
2640
2641 assert(lp_check_value(type, a));
2642
2643 if(a == bld->zero)
2644 return bld->undef;
2645 if(a == bld->one)
2646 return bld->one;
2647 if(a == bld->undef)
2648 return bld->undef;
2649
2650 assert(type.floating);
2651
2652 if(LLVMIsConstant(a))
2653 return LLVMConstFDiv(bld->one, a);
2654
2655 /*
2656 * We don't use RCPPS because:
2657 * - it only has 10 bits of precision
2658 * - it doesn't even get the reciprocal of 1.0 exactly
2659 * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2660 * - for recent processors the benefit over DIVPS is marginal and case
2661 * dependent
2662 *
2663 * We could still use it on certain processors if benchmarks show that the
2664 * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
2665 * particular uses that require fewer workarounds.
2666 */
2667
2668 if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2669 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2670 const unsigned num_iterations = 0;
2671 LLVMValueRef res;
2672 unsigned i;
2673 const char *intrinsic = NULL;
2674
2675 if (type.length == 4) {
2676 intrinsic = "llvm.x86.sse.rcp.ps";
2677 }
2678 else {
2679 intrinsic = "llvm.x86.avx.rcp.ps.256";
2680 }
2681
2682 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2683
2684 for (i = 0; i < num_iterations; ++i) {
2685 res = lp_build_rcp_refine(bld, a, res);
2686 }
2687
2688 return res;
2689 }
2690
2691 return LLVMBuildFDiv(builder, bld->one, a, "");
2692 }
2693
2694
2695 /**
2696 * Do one Newton-Raphson step to improve rsqrt precision:
2697 *
2698 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2699 *
2700 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2701 */
2702 static inline LLVMValueRef
2703 lp_build_rsqrt_refine(struct lp_build_context *bld,
2704 LLVMValueRef a,
2705 LLVMValueRef rsqrt_a)
2706 {
2707 LLVMBuilderRef builder = bld->gallivm->builder;
2708 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2709 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2710 LLVMValueRef res;
2711
2712 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2713 res = LLVMBuildFMul(builder, a, res, "");
2714 res = LLVMBuildFSub(builder, three, res, "");
2715 res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2716 res = LLVMBuildFMul(builder, half, res, "");
2717
2718 return res;
2719 }
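
/*
 * Illustrative derivation: applying Newton-Raphson to f(x) = 1/x^2 - a
 * (whose positive root is 1/sqrt(a)) gives
 *
 *    x_{i+1} = x_i - f(x_i)/f'(x_i)
 *            = x_i + (x_i - a * x_i^3) / 2
 *            = 0.5 * x_i * (3.0 - a * x_i * x_i)
 *
 * which is exactly the sequence of multiplies and the subtract built above.
 */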
2720
2721
2722 /**
2723 * Generate 1/sqrt(a).
2724 * Result is undefined for values < 0, infinity for +0.
2725 */
2726 LLVMValueRef
2727 lp_build_rsqrt(struct lp_build_context *bld,
2728 LLVMValueRef a)
2729 {
2730 const struct lp_type type = bld->type;
2731
2732 assert(lp_check_value(type, a));
2733
2734 assert(type.floating);
2735
2736 /*
2737 * This should be faster but all denormals will end up as infinity.
2738 */
2739 if (0 && lp_build_fast_rsqrt_available(type)) {
2740 const unsigned num_iterations = 1;
2741 LLVMValueRef res;
2742 unsigned i;
2743
2744 /* rsqrt(1.0) != 1.0 here */
2745 res = lp_build_fast_rsqrt(bld, a);
2746
2747 if (num_iterations) {
2748 /*
2749 * Newton-Raphson will result in NaN instead of infinity for zero,
2750 * and NaN instead of zero for infinity.
2751 * Also, need to ensure rsqrt(1.0) == 1.0.
2752 * All numbers smaller than FLT_MIN will result in +infinity
2753 * (rsqrtps treats all denormals as zero).
2754 */
2755 LLVMValueRef cmp;
2756 LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2757 LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);
2758
2759 for (i = 0; i < num_iterations; ++i) {
2760 res = lp_build_rsqrt_refine(bld, a, res);
2761 }
2762 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2763 res = lp_build_select(bld, cmp, inf, res);
2764 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2765 res = lp_build_select(bld, cmp, bld->zero, res);
2766 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2767 res = lp_build_select(bld, cmp, bld->one, res);
2768 }
2769
2770 return res;
2771 }
2772
2773 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2774 }
2775
2776 /**
2777 * Report whether a fast (inaccurate) rsqrt instruction is available.
2778 * (The caller may want to avoid calling rsqrt_fast if it's not available;
2779 * e.g. for calculating x^0.5 it may do rsqrt_fast(x) * x, but if that is
2780 * unavailable this would result in sqrt/div/mul, so it is obviously
2781 * much better to just call sqrt, skipping both div and mul).
2782 */
2783 boolean
2784 lp_build_fast_rsqrt_available(struct lp_type type)
2785 {
2786 assert(type.floating);
2787
2788 if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2789 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2790 return true;
2791 }
2792 return false;
2793 }
2794
2795
2796 /**
2797 * Generate 1/sqrt(a).
2798 * Result is undefined for values < 0, infinity for +0.
2799 * Precision is limited, only ~10 bits guaranteed
2800 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2801 */
2802 LLVMValueRef
2803 lp_build_fast_rsqrt(struct lp_build_context *bld,
2804 LLVMValueRef a)
2805 {
2806 LLVMBuilderRef builder = bld->gallivm->builder;
2807 const struct lp_type type = bld->type;
2808
2809 assert(lp_check_value(type, a));
2810
2811 if (lp_build_fast_rsqrt_available(type)) {
2812 const char *intrinsic = NULL;
2813
2814 if (type.length == 4) {
2815 intrinsic = "llvm.x86.sse.rsqrt.ps";
2816 }
2817 else {
2818 intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2819 }
2820 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2821 }
2822 else {
2823 debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2824 }
2825 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2826 }
2827
2828
2829 /**
2830 * Generate sin(a) or cos(a) using polynomial approximation.
2831 * TODO: it might be worth recognizing sin and cos with the same source
2832 * (i.e. the d3d10 sincos opcode). Obviously doing both at the same time
2833 * would be way cheaper than calculating (nearly) everything twice...
2834 * Not sure it's common enough to be worth bothering with; however, the
2835 * scs opcode could also benefit from calculating both.
2836 */
2837 static LLVMValueRef
2838 lp_build_sin_or_cos(struct lp_build_context *bld,
2839 LLVMValueRef a,
2840 boolean cos)
2841 {
2842 struct gallivm_state *gallivm = bld->gallivm;
2843 LLVMBuilderRef b = gallivm->builder;
2844 struct lp_type int_type = lp_int_type(bld->type);
2845
2846 /*
2847 * take the absolute value,
2848 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2849 */
2850
2851 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2852 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2853
2854 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2855 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2856
2857 /*
2858 * scale by 4/Pi
2859 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2860 */
2861
2862 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2863 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2864
2865 /*
2866 * store the integer part of y in mm0
2867 * emm2 = _mm_cvttps_epi32(y);
2868 */
2869
2870 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2871
2872 /*
2873 * j=(j+1) & (~1) (see the cephes sources)
2874 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2875 */
2876
2877 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2878 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2879 /*
2880 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2881 */
2882 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2883 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2884
2885 /*
2886 * y = _mm_cvtepi32_ps(emm2);
2887 */
2888 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2889
2890 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2891 LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2892 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2893 LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2894
2895 /*
2896 * Argument used for poly selection and sign bit determination
2897 * is different for sin vs. cos.
2898 */
2899 LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
2900 emm2_and;
2901
2902 LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
2903 LLVMBuildNot(b, emm2_2, ""), ""),
2904 const_29, "sign_bit") :
2905 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
2906 LLVMBuildShl(b, emm2_add,
2907 const_29, ""), ""),
2908 sign_mask, "sign_bit");
2909
2910 /*
2911 * get the polynomial selection mask
2912 * there is one polynomial for 0 <= x <= Pi/4
2913 * and another one for Pi/4 < x <= Pi/2
2914 * Both branches will be computed.
2915 *
2916 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2917 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2918 */
2919
2920 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
2921 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2922 int_type, PIPE_FUNC_EQUAL,
2923 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2924
2925 /*
2926 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2927 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2928 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2929 */
2930 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2931 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2932 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2933
2934 /*
2935 * The magic pass: "Extended precision modular arithmetic"
2936 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2937 */
2938 LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
2939 LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
2940 LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);
2941
2942 /*
2943 * Evaluate the first polynomial (0 <= x <= Pi/4)
2944 *
2945 * z = _mm_mul_ps(x,x);
2946 */
2947 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2948
2949 /*
2950 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2951 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2952 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2953 */
2954 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2955 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2956 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2957
2958 /*
2959 * y = *(v4sf*)_ps_coscof_p0;
2960 * y = _mm_mul_ps(y, z);
2961 */
2962 LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
2963 LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
2964 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2965 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2966
2967
2968 /*
2969 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2970 * y = _mm_sub_ps(y, tmp);
2971 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2972 */
2973 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2974 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2975 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2976 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2977 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2978
2979 /*
2980 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2981 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2982 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2983 */
2984 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2985 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2986 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2987
2988 /*
2989 * Evaluate the second polynomial (Pi/4 < x <= Pi/2)
2990 *
2991 * y2 = *(v4sf*)_ps_sincof_p0;
2992 * y2 = _mm_mul_ps(y2, z);
2993 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2994 * y2 = _mm_mul_ps(y2, z);
2995 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2996 * y2 = _mm_mul_ps(y2, z);
2997 * y2 = _mm_mul_ps(y2, x);
2998 * y2 = _mm_add_ps(y2, x);
2999 */
3000
3001 LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
3002 LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
3003 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
3004 LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);
3005
3006 /*
3007 * select the correct result from the two polynomials
3008 * xmm3 = poly_mask;
3009 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
3010 * y = _mm_andnot_ps(xmm3, y);
3011 * y = _mm_or_ps(y,y2);
3012 */
3013 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
3014 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
3015 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
3016 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
3017 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
3018 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
3019
3020 /*
3021 * update the sign
3022 * y = _mm_xor_ps(y, sign_bit);
3023 */
3024 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
3025 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
3026
3027 LLVMValueRef isfinite = lp_build_isfinite(bld, a);
3028
3029 /* clamp output to be within [-1, 1] */
3030 y_result = lp_build_clamp(bld, y_result,
3031 lp_build_const_vec(bld->gallivm, bld->type, -1.f),
3032 lp_build_const_vec(bld->gallivm, bld->type, 1.f));
3033 /* If a is -inf, inf or NaN then return NaN */
3034 y_result = lp_build_select(bld, isfinite, y_result,
3035 lp_build_const_vec(bld->gallivm, bld->type, NAN));
3036 return y_result;
3037 }
3038
3039
3040 /**
3041 * Generate sin(a)
3042 */
3043 LLVMValueRef
3044 lp_build_sin(struct lp_build_context *bld,
3045 LLVMValueRef a)
3046 {
3047 return lp_build_sin_or_cos(bld, a, FALSE);
3048 }
3049
3050
3051 /**
3052 * Generate cos(a)
3053 */
3054 LLVMValueRef
3055 lp_build_cos(struct lp_build_context *bld,
3056 LLVMValueRef a)
3057 {
3058 return lp_build_sin_or_cos(bld, a, TRUE);
3059 }
3060
3061
3062 /**
3063 * Generate pow(x, y)
3064 */
3065 LLVMValueRef
3066 lp_build_pow(struct lp_build_context *bld,
3067 LLVMValueRef x,
3068 LLVMValueRef y)
3069 {
3070 /* TODO: optimize the constant case */
3071 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3072 LLVMIsConstant(x) && LLVMIsConstant(y)) {
3073 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3074 __FUNCTION__);
3075 }
3076
3077 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
3078 }
3079
3080
3081 /**
3082 * Generate exp(x)
3083 */
3084 LLVMValueRef
3085 lp_build_exp(struct lp_build_context *bld,
3086 LLVMValueRef x)
3087 {
3088 /* log2(e) = 1/log(2) */
3089 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
3090 1.4426950408889634);
3091
3092 assert(lp_check_value(bld->type, x));
3093
3094 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
3095 }
3096
3097
3098 /**
3099 * Generate log(x)
3100 * Behavior is undefined with infs, 0s and nans
3101 */
3102 LLVMValueRef
3103 lp_build_log(struct lp_build_context *bld,
3104 LLVMValueRef x)
3105 {
3106 /* log(2) */
3107 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3108 0.69314718055994529);
3109
3110 assert(lp_check_value(bld->type, x));
3111
3112 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
3113 }
3114
3115 /**
3116 * Generate log(x) that handles edge cases (infs, 0s and nans)
3117 */
3118 LLVMValueRef
3119 lp_build_log_safe(struct lp_build_context *bld,
3120 LLVMValueRef x)
3121 {
3122 /* log(2) */
3123 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3124 0.69314718055994529);
3125
3126 assert(lp_check_value(bld->type, x));
3127
3128 return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
3129 }
3130
3131
3132 /**
3133 * Generate polynomial.
3134 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
3135 */
3136 LLVMValueRef
3137 lp_build_polynomial(struct lp_build_context *bld,
3138 LLVMValueRef x,
3139 const double *coeffs,
3140 unsigned num_coeffs)
3141 {
3142 const struct lp_type type = bld->type;
3143 LLVMValueRef even = NULL, odd = NULL;
3144 LLVMValueRef x2;
3145 unsigned i;
3146
3147 assert(lp_check_value(bld->type, x));
3148
3149 /* TODO: optimize the constant case */
3150 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3151 LLVMIsConstant(x)) {
3152 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3153 __FUNCTION__);
3154 }
3155
3156 /*
3157 * Calculate odd and even terms separately to decrease data dependency
3158 * Ex:
3159 * c[0] + x^2 * c[2] + x^4 * c[4] ...
3160 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
3161 */
3162 x2 = lp_build_mul(bld, x, x);
3163
3164 for (i = num_coeffs; i--; ) {
3165 LLVMValueRef coeff;
3166
3167 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
3168
3169 if (i % 2 == 0) {
3170 if (even)
3171 even = lp_build_mad(bld, x2, even, coeff);
3172 else
3173 even = coeff;
3174 } else {
3175 if (odd)
3176 odd = lp_build_mad(bld, x2, odd, coeff);
3177 else
3178 odd = coeff;
3179 }
3180 }
3181
3182 if (odd)
3183 return lp_build_mad(bld, odd, x, even);
3184 else if (even)
3185 return even;
3186 else
3187 return bld->undef;
3188 }
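
/*
 * Worked example of the even/odd split above (illustrative only), for
 * num_coeffs == 4, i.e. coeffs = {c0, c1, c2, c3}:
 *
 *    even = c0 + x^2 * c2
 *    odd  = c1 + x^2 * c3
 *    res  = odd * x + even
 *         = c0 + c1*x + c2*x^2 + c3*x^3
 *
 * The even and odd chains only depend on x^2 and can be evaluated in
 * parallel; they are joined by the single final multiply-add.
 */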
3189
3190
3191 /**
3192 * Minimax polynomial fit of 2**x, in range [0, 1[
3193 */
3194 const double lp_build_exp2_polynomial[] = {
3195 #if EXP_POLY_DEGREE == 5
3196 1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
3197 0.693153073200168932794,
3198 0.240153617044375388211,
3199 0.0558263180532956664775,
3200 0.00898934009049466391101,
3201 0.00187757667519147912699
3202 #elif EXP_POLY_DEGREE == 4
3203 1.00000259337069434683,
3204 0.693003834469974940458,
3205 0.24144275689150793076,
3206 0.0520114606103070150235,
3207 0.0135341679161270268764
3208 #elif EXP_POLY_DEGREE == 3
3209 0.999925218562710312959,
3210 0.695833540494823811697,
3211 0.226067155427249155588,
3212 0.0780245226406372992967
3213 #elif EXP_POLY_DEGREE == 2
3214 1.00172476321474503578,
3215 0.657636275736077639316,
3216 0.33718943461968720704
3217 #else
3218 #error
3219 #endif
3220 };
3221
3222
3223 LLVMValueRef
3224 lp_build_exp2(struct lp_build_context *bld,
3225 LLVMValueRef x)
3226 {
3227 LLVMBuilderRef builder = bld->gallivm->builder;
3228 const struct lp_type type = bld->type;
3229 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3230 LLVMValueRef ipart = NULL;
3231 LLVMValueRef fpart = NULL;
3232 LLVMValueRef expipart = NULL;
3233 LLVMValueRef expfpart = NULL;
3234 LLVMValueRef res = NULL;
3235
3236 assert(lp_check_value(bld->type, x));
3237
3238 /* TODO: optimize the constant case */
3239 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3240 LLVMIsConstant(x)) {
3241 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3242 __FUNCTION__);
3243 }
3244
3245 assert(type.floating && type.width == 32);
3246
3247 /* We want to preserve NaN and make sure that for exp2 if x > 128,
3248 * the result is INF and if it's smaller than -126.9 the result is 0 */
3249 x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
3250 GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3251 x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
3252 x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3253
3254 /* ipart = floor(x) */
3255 /* fpart = x - ipart */
3256 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3257
3258 /* expipart = (float) (1 << ipart) */
3259 expipart = LLVMBuildAdd(builder, ipart,
3260 lp_build_const_int_vec(bld->gallivm, type, 127), "");
3261 expipart = LLVMBuildShl(builder, expipart,
3262 lp_build_const_int_vec(bld->gallivm, type, 23), "");
3263 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
3264
3265 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3266 ARRAY_SIZE(lp_build_exp2_polynomial));
3267
3268 res = LLVMBuildFMul(builder, expipart, expfpart, "");
3269
3270 return res;
3271 }
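
/*
 * Scalar sketch of the decomposition used above (illustrative only; "poly"
 * stands for lp_build_polynomial evaluated on lp_build_exp2_polynomial):
 *
 *    i = floor(x);  f = x - i;                 // 2^x = 2^i * 2^f, f in [0, 1)
 *    two_i = bitcast_float((i + 127) << 23);   // exponent bits give 2^i directly
 *    2^x  ~= two_i * poly(f);                  // poly(f) ~= 2^f on [0, 1)
 */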
3272
3273
3274
3275 /**
3276 * Extract the exponent of an IEEE-754 floating point value.
3277 *
3278 * Optionally apply an integer bias.
3279 *
3280 * Result is an integer value with
3281 *
3282 * ifloor(log2(x)) + bias
3283 */
3284 LLVMValueRef
3285 lp_build_extract_exponent(struct lp_build_context *bld,
3286 LLVMValueRef x,
3287 int bias)
3288 {
3289 LLVMBuilderRef builder = bld->gallivm->builder;
3290 const struct lp_type type = bld->type;
3291 unsigned mantissa = lp_mantissa(type);
3292 LLVMValueRef res;
3293
3294 assert(type.floating);
3295
3296 assert(lp_check_value(bld->type, x));
3297
3298 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3299
3300 res = LLVMBuildLShr(builder, x,
3301 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3302 res = LLVMBuildAnd(builder, res,
3303 lp_build_const_int_vec(bld->gallivm, type, 255), "");
3304 res = LLVMBuildSub(builder, res,
3305 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3306
3307 return res;
3308 }
3309
3310
3311 /**
3312 * Extract the mantissa of an IEEE-754 floating point value.
3313 *
3314 * Result is a floating point value with
3315 *
3316 * x / 2**floor(log2(x))
3317 */
3318 LLVMValueRef
3319 lp_build_extract_mantissa(struct lp_build_context *bld,
3320 LLVMValueRef x)
3321 {
3322 LLVMBuilderRef builder = bld->gallivm->builder;
3323 const struct lp_type type = bld->type;
3324 unsigned mantissa = lp_mantissa(type);
3325 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3326 (1ULL << mantissa) - 1);
3327 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3328 LLVMValueRef res;
3329
3330 assert(lp_check_value(bld->type, x));
3331
3332 assert(type.floating);
3333
3334 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3335
3336 /* res = x / 2**ipart */
3337 res = LLVMBuildAnd(builder, x, mantmask, "");
3338 res = LLVMBuildOr(builder, res, one, "");
3339 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3340
3341 return res;
3342 }
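
/*
 * Worked example for the two helpers above (illustrative only): for
 * x = 6.0 = 1.5 * 2^2, lp_build_extract_exponent(bld, x, 0) yields 2
 * (biased exponent 129 minus 127) and lp_build_extract_mantissa(bld, x)
 * yields 1.5 (the mantissa bits of x combined with the exponent field of
 * 1.0), so that 6.0 == 1.5 * 2^2 is recovered.
 */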
3343
3344
3345
3346 /**
3347 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in range [0, 1/9[
3348 * These coefficients can be generated with
3349 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3350 */
3351 const double lp_build_log2_polynomial[] = {
3352 #if LOG_POLY_DEGREE == 5
3353 2.88539008148777786488L,
3354 0.961796878841293367824L,
3355 0.577058946784739859012L,
3356 0.412914355135828735411L,
3357 0.308591899232910175289L,
3358 0.352376952300281371868L,
3359 #elif LOG_POLY_DEGREE == 4
3360 2.88539009343309178325L,
3361 0.961791550404184197881L,
3362 0.577440339438736392009L,
3363 0.403343858251329912514L,
3364 0.406718052498846252698L,
3365 #elif LOG_POLY_DEGREE == 3
3366 2.88538959748872753838L,
3367 0.961932915889597772928L,
3368 0.571118517972136195241L,
3369 0.493997535084709500285L,
3370 #else
3371 #error
3372 #endif
3373 };
3374
3375 /**
3376 * See http://www.devmaster.net/forums/showthread.php?p=43580
3377 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3378 * http://www.nezumi.demon.co.uk/consult/logx.htm
3379 *
3380 * If handle_edge_cases is true the function will perform computations
3381 * to match the required D3D10+ behavior for each of the edge cases.
3382 * That means that if input is:
3383 * - less than zero (to and including -inf) then NaN will be returned
3384 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3385 * - +infinity, then +infinity will be returned
3386 * - NaN, then NaN will be returned
3387 *
3388 * Those checks are fairly expensive so if you don't need them make sure
3389 * handle_edge_cases is false.
3390 */
3391 void
3392 lp_build_log2_approx(struct lp_build_context *bld,
3393 LLVMValueRef x,
3394 LLVMValueRef *p_exp,
3395 LLVMValueRef *p_floor_log2,
3396 LLVMValueRef *p_log2,
3397 boolean handle_edge_cases)
3398 {
3399 LLVMBuilderRef builder = bld->gallivm->builder;
3400 const struct lp_type type = bld->type;
3401 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3402 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3403
3404 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3405 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3406 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3407
3408 LLVMValueRef i = NULL;
3409 LLVMValueRef y = NULL;
3410 LLVMValueRef z = NULL;
3411 LLVMValueRef exp = NULL;
3412 LLVMValueRef mant = NULL;
3413 LLVMValueRef logexp = NULL;
3414 LLVMValueRef p_z = NULL;
3415 LLVMValueRef res = NULL;
3416
3417 assert(lp_check_value(bld->type, x));
3418
3419 if(p_exp || p_floor_log2 || p_log2) {
3420 /* TODO: optimize the constant case */
3421 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3422 LLVMIsConstant(x)) {
3423 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3424 __FUNCTION__);
3425 }
3426
3427 assert(type.floating && type.width == 32);
3428
3429 /*
3430 * We don't explicitly handle denormalized numbers. They will yield a
3431 * result in the neighbourhood of -127, which appears to be
3432 * adequate.
3433 */
3434
3435 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3436
3437 /* exp = (float) exponent(x) */
3438 exp = LLVMBuildAnd(builder, i, expmask, "");
3439 }
3440
3441 if(p_floor_log2 || p_log2) {
3442 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3443 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3444 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3445 }
3446
3447 if (p_log2) {
3448 /* mant = 1 + (float) mantissa(x) */
3449 mant = LLVMBuildAnd(builder, i, mantmask, "");
3450 mant = LLVMBuildOr(builder, mant, one, "");
3451 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3452
3453 /* y = (mant - 1) / (mant + 1) */
3454 y = lp_build_div(bld,
3455 lp_build_sub(bld, mant, bld->one),
3456 lp_build_add(bld, mant, bld->one)
3457 );
3458
3459 /* z = y^2 */
3460 z = lp_build_mul(bld, y, y);
3461
3462 /* compute P(z) */
3463 p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3464 ARRAY_SIZE(lp_build_log2_polynomial));
3465
3466 /* y * P(z) + logexp */
3467 res = lp_build_mad(bld, y, p_z, logexp);
3468
3469 if (type.floating && handle_edge_cases) {
3470 LLVMValueRef negmask, infmask, zmask;
3471 negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3472 lp_build_const_vec(bld->gallivm, type, 0.0f));
3473 zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3474 lp_build_const_vec(bld->gallivm, type, 0.0f));
3475 infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3476 lp_build_const_vec(bld->gallivm, type, INFINITY));
3477
3478 /* If x is equal to inf, make sure we return inf */
3479 res = lp_build_select(bld, infmask,
3480 lp_build_const_vec(bld->gallivm, type, INFINITY),
3481 res);
3482 /* If x is equal to 0, return -inf */
3483 res = lp_build_select(bld, zmask,
3484 lp_build_const_vec(bld->gallivm, type, -INFINITY),
3485 res);
3486 /* If x is nan or less than 0, return nan */
3487 res = lp_build_select(bld, negmask,
3488 lp_build_const_vec(bld->gallivm, type, NAN),
3489 res);
3490 }
3491 }
3492
3493 if (p_exp) {
3494 exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3495 *p_exp = exp;
3496 }
3497
3498 if (p_floor_log2)
3499 *p_floor_log2 = logexp;
3500
3501 if (p_log2)
3502 *p_log2 = res;
3503 }
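
/*
 * Scalar sketch of the approximation above (illustrative only; P() is the
 * minimax polynomial lp_build_log2_polynomial):
 *
 *    x = 2^e * m,   m in [1, 2)        // e, m taken from the float bit fields
 *    y = (m - 1) / (m + 1)             // so y^2 < 1/9, the fit's input range
 *    log2(m) ~= y * P(y^2)
 *    log2(x) ~= y * P(y^2) + e
 */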
3504
3505
3506 /*
3507 * log2 implementation which doesn't have special code to
3508 * handle edge cases (-inf, 0, inf, NaN). It's faster but
3509 * the results for those cases are undefined.
3510 */
3511 LLVMValueRef
3512 lp_build_log2(struct lp_build_context *bld,
3513 LLVMValueRef x)
3514 {
3515 LLVMValueRef res;
3516 lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
3517 return res;
3518 }
3519
3520 /*
3521 * Version of log2 which handles all edge cases.
3522 * Look at documentation of lp_build_log2_approx for
3523 * description of the behavior for each of the edge cases.
3524 */
3525 LLVMValueRef
3526 lp_build_log2_safe(struct lp_build_context *bld,
3527 LLVMValueRef x)
3528 {
3529 LLVMValueRef res;
3530 lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
3531 return res;
3532 }
3533
3534
3535 /**
3536 * Faster (and less accurate) log2.
3537 *
3538 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3539 *
3540 * Piece-wise linear approximation, with exact results when x is a
3541 * power of two.
3542 *
3543 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3544 */
3545 LLVMValueRef
3546 lp_build_fast_log2(struct lp_build_context *bld,
3547 LLVMValueRef x)
3548 {
3549 LLVMBuilderRef builder = bld->gallivm->builder;
3550 LLVMValueRef ipart;
3551 LLVMValueRef fpart;
3552
3553 assert(lp_check_value(bld->type, x));
3554
3555 assert(bld->type.floating);
3556
3557 /* ipart = floor(log2(x)) - 1 */
3558 ipart = lp_build_extract_exponent(bld, x, -1);
3559 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3560
3561 /* fpart = x / 2**ipart */
3562 fpart = lp_build_extract_mantissa(bld, x);
3563
3564 /* ipart + fpart */
3565 return LLVMBuildFAdd(builder, ipart, fpart, "");
3566 }
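
/*
 * Worked example (illustrative only): for x = 8.0 this yields
 * (3 - 1) + 8/2^3 = 3.0 exactly, while for x = 6.0 it yields
 * (2 - 1) + 6/2^2 = 2.5 against the true log2(6) ~= 2.585; the
 * approximation is exact at powers of two and within roughly 0.09
 * everywhere else.
 */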
3567
3568
3569 /**
3570 * Fast implementation of iround(log2(x)).
3571 *
3572 * Not an approximation -- it should give accurate results all the time.
3573 */
3574 LLVMValueRef
3575 lp_build_ilog2(struct lp_build_context *bld,
3576 LLVMValueRef x)
3577 {
3578 LLVMBuilderRef builder = bld->gallivm->builder;
3579 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3580 LLVMValueRef ipart;
3581
3582 assert(bld->type.floating);
3583
3584 assert(lp_check_value(bld->type, x));
3585
3586 /* x * 2^(0.5) i.e., add 0.5 to the log2(x) */
3587 x = LLVMBuildFMul(builder, x, sqrt2, "");
3588
3589 /* ipart = floor(log2(x) + 0.5) */
3590 ipart = lp_build_extract_exponent(bld, x, 0);
3591
3592 return ipart;
3593 }
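
/*
 * Worked example (illustrative only): for x = 5.9, log2(x) ~= 2.56 should
 * round to 3; after multiplying by sqrt(2) the value is ~8.34, whose
 * exponent is 3.  For x = 5.0 (log2 ~= 2.32) the product is ~7.07, whose
 * exponent is 2.  The sqrt(2) factor thus turns the floor done by
 * lp_build_extract_exponent into a round-to-nearest of log2(x).
 */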
3594
3595 LLVMValueRef
3596 lp_build_mod(struct lp_build_context *bld,
3597 LLVMValueRef x,
3598 LLVMValueRef y)
3599 {
3600 LLVMBuilderRef builder = bld->gallivm->builder;
3601 LLVMValueRef res;
3602 const struct lp_type type = bld->type;
3603
3604 assert(lp_check_value(type, x));
3605 assert(lp_check_value(type, y));
3606
3607 if (type.floating)
3608 res = LLVMBuildFRem(builder, x, y, "");
3609 else if (type.sign)
3610 res = LLVMBuildSRem(builder, x, y, "");
3611 else
3612 res = LLVMBuildURem(builder, x, y, "");
3613 return res;
3614 }
3615
3616
3617 /*
3618 * For floating inputs it creates and returns a mask
3619 * which is all 1's for channels which are NaN.
3620 * Channels inside x which are not NaN will be 0.
3621 */
3622 LLVMValueRef
3623 lp_build_isnan(struct lp_build_context *bld,
3624 LLVMValueRef x)
3625 {
3626 LLVMValueRef mask;
3627 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3628
3629 assert(bld->type.floating);
3630 assert(lp_check_value(bld->type, x));
3631
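   /* IEEE-754 guarantees NaN != NaN, so an ordered equality compare of x
    * with itself is false only in NaN channels; negating and sign-extending
    * the i1 results yields a full-width all-1's/all-0's mask.
    */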
3632 mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3633 "isnotnan");
3634 mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3635 mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3636 return mask;
3637 }
3638
3639 /* Returns all 1's for channels holding finite floating point
3640  * numbers and all 0's for channels holding -inf, +inf or NaN.
3641  */
3642 LLVMValueRef
3643 lp_build_isfinite(struct lp_build_context *bld,
3644 LLVMValueRef x)
3645 {
3646 LLVMBuilderRef builder = bld->gallivm->builder;
3647 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3648 struct lp_type int_type = lp_int_type(bld->type);
3649 LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3650 LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3651 0x7f800000);
3652
3653 if (!bld->type.floating) {
3654 return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3655 }
3656 assert(bld->type.floating);
3657 assert(lp_check_value(bld->type, x));
3658 assert(bld->type.width == 32);
3659
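   /* In IEEE-754 single precision, +/-inf and NaN are exactly the values
    * whose exponent field is all ones, so a value is finite iff its bits
    * ANDed with 0x7f800000 differ from 0x7f800000.
    */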
3660 intx = LLVMBuildAnd(builder, intx, infornan32, "");
3661 return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3662 intx, infornan32);
3663 }
3664
3665 /*
3666  * Returns an all-1's mask for channels that are NaN or +/-inf and
3667  * an all-0's mask otherwise.  The input has to be a floating point vector.
3668 */
3669 LLVMValueRef
3670 lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
3671 const struct lp_type type,
3672 LLVMValueRef x)
3673 {
3674 LLVMBuilderRef builder = gallivm->builder;
3675 struct lp_type int_type = lp_int_type(type);
3676 LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
3677 0x7f800000);
3678 LLVMValueRef ret;
3679
3680 assert(type.floating);
3681
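   /* Complementary test to lp_build_isfinite(): the exponent field is all
    * ones exactly for +/-inf and NaN, so compare the masked exponent bits
    * for equality with 0x7f800000.
    */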
3682 ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
3683 ret = LLVMBuildAnd(builder, ret, const0, "");
3684 ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
3685 ret, const0);
3686
3687 return ret;
3688 }
3689
3690
3691 LLVMValueRef
3692 lp_build_fpstate_get(struct gallivm_state *gallivm)
3693 {
3694 if (util_cpu_caps.has_sse) {
3695 LLVMBuilderRef builder = gallivm->builder;
3696 LLVMValueRef mxcsr_ptr = lp_build_alloca(
3697 gallivm,
3698 LLVMInt32TypeInContext(gallivm->context),
3699 "mxcsr_ptr");
3700 LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
3701 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3702 lp_build_intrinsic(builder,
3703 "llvm.x86.sse.stmxcsr",
3704 LLVMVoidTypeInContext(gallivm->context),
3705 &mxcsr_ptr8, 1, 0);
3706 return mxcsr_ptr;
3707 }
3708 return 0;
3709 }
3710
3711 void
3712 lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
3713 boolean zero)
3714 {
3715 if (util_cpu_caps.has_sse) {
3716 /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
3717 int daz_ftz = _MM_FLUSH_ZERO_MASK;
3718
3719 LLVMBuilderRef builder = gallivm->builder;
3720 LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
3721 LLVMValueRef mxcsr =
3722 LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");
3723
3724 if (util_cpu_caps.has_daz) {
3725          /* Enable denormals-are-zero (DAZ) mode */
3726 daz_ftz |= _MM_DENORMALS_ZERO_MASK;
3727 }
3728 if (zero) {
3729 mxcsr = LLVMBuildOr(builder, mxcsr,
3730 LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
3731 } else {
3732 mxcsr = LLVMBuildAnd(builder, mxcsr,
3733 LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
3734 }
3735
3736 LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
3737 lp_build_fpstate_set(gallivm, mxcsr_ptr);
3738 }
3739 }
3740
3741 void
3742 lp_build_fpstate_set(struct gallivm_state *gallivm,
3743 LLVMValueRef mxcsr_ptr)
3744 {
3745 if (util_cpu_caps.has_sse) {
3746 LLVMBuilderRef builder = gallivm->builder;
3747 mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
3748 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3749 lp_build_intrinsic(builder,
3750 "llvm.x86.sse.ldmxcsr",
3751 LLVMVoidTypeInContext(gallivm->context),
3752 &mxcsr_ptr, 1, 0);
3753 }
3754 }
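/*
 * Illustrative usage of the fpstate helpers above (a sketch, not taken from
 * a specific caller): save the current MXCSR, force flush-to-zero /
 * denormals-are-zero around generated code that is sensitive to denormal
 * handling, then restore the original state:
 *
 *    LLVMValueRef fpstate = lp_build_fpstate_get(gallivm);
 *    lp_build_fpstate_set_denorms_zero(gallivm, TRUE);
 *    ... emit the denormal-sensitive code ...
 *    lp_build_fpstate_set(gallivm, fpstate);
 */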