gallivm: Basic AVX2 support.
src/gallium/auxiliary/gallivm/lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include <float.h>
49
50 #include "util/u_memory.h"
51 #include "util/u_debug.h"
52 #include "util/u_math.h"
53 #include "util/u_cpu_detect.h"
54
55 #include "lp_bld_type.h"
56 #include "lp_bld_const.h"
57 #include "lp_bld_init.h"
58 #include "lp_bld_intr.h"
59 #include "lp_bld_logic.h"
60 #include "lp_bld_pack.h"
61 #include "lp_bld_debug.h"
62 #include "lp_bld_bitarit.h"
63 #include "lp_bld_arit.h"
64 #include "lp_bld_flow.h"
65
66 #if defined(PIPE_ARCH_SSE)
67 #include <xmmintrin.h>
68 #endif
69
70 #ifndef _MM_DENORMALS_ZERO_MASK
71 #define _MM_DENORMALS_ZERO_MASK 0x0040
72 #endif
73
74 #ifndef _MM_FLUSH_ZERO_MASK
75 #define _MM_FLUSH_ZERO_MASK 0x8000
76 #endif
77
78 #define EXP_POLY_DEGREE 5
79
80 #define LOG_POLY_DEGREE 4
81
82
83 /**
84 * Generate min(a, b)
85 * No checks are done for the special-case values a or b = 0 or 1.
86 * NaNs are handled according to the behavior specified by the
87 * nan_behavior argument.
88 */
89 static LLVMValueRef
90 lp_build_min_simple(struct lp_build_context *bld,
91 LLVMValueRef a,
92 LLVMValueRef b,
93 enum gallivm_nan_behavior nan_behavior)
94 {
95 const struct lp_type type = bld->type;
96 const char *intrinsic = NULL;
97 unsigned intr_size = 0;
98 LLVMValueRef cond;
99
100 assert(lp_check_value(type, a));
101 assert(lp_check_value(type, b));
102
103 /* TODO: optimize the constant case */
104
105 if (type.floating && util_cpu_caps.has_sse) {
106 if (type.width == 32) {
107 if (type.length == 1) {
108 intrinsic = "llvm.x86.sse.min.ss";
109 intr_size = 128;
110 }
111 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
112 intrinsic = "llvm.x86.sse.min.ps";
113 intr_size = 128;
114 }
115 else {
116 intrinsic = "llvm.x86.avx.min.ps.256";
117 intr_size = 256;
118 }
119 }
120 if (type.width == 64 && util_cpu_caps.has_sse2) {
121 if (type.length == 1) {
122 intrinsic = "llvm.x86.sse2.min.sd";
123 intr_size = 128;
124 }
125 else if (type.length == 2 || !util_cpu_caps.has_avx) {
126 intrinsic = "llvm.x86.sse2.min.pd";
127 intr_size = 128;
128 }
129 else {
130 intrinsic = "llvm.x86.avx.min.pd.256";
131 intr_size = 256;
132 }
133 }
134 }
135 else if (type.floating && util_cpu_caps.has_altivec) {
136 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
137 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
138 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
139 __FUNCTION__);
140 }
141 if (type.width == 32 && type.length == 4) {
142 intrinsic = "llvm.ppc.altivec.vminfp";
143 intr_size = 128;
144 }
145 } else if (HAVE_LLVM < 0x0309 &&
146 util_cpu_caps.has_avx2 && type.length > 4) {
147 intr_size = 256;
148 switch (type.width) {
149 case 8:
150 intrinsic = type.sign ? "llvm.x86.avx2.pmins.b" : "llvm.x86.avx2.pminu.b";
151 break;
152 case 16:
153 intrinsic = type.sign ? "llvm.x86.avx2.pmins.w" : "llvm.x86.avx2.pminu.w";
154 break;
155 case 32:
156 intrinsic = type.sign ? "llvm.x86.avx2.pmins.d" : "llvm.x86.avx2.pminu.d";
157 break;
158 }
159 } else if (HAVE_LLVM < 0x0309 &&
160 util_cpu_caps.has_sse2 && type.length >= 2) {
161 intr_size = 128;
162 if ((type.width == 8 || type.width == 16) &&
163 (type.width * type.length <= 64) &&
164 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
165 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
166 __FUNCTION__);
167 }
168 if (type.width == 8 && !type.sign) {
169 intrinsic = "llvm.x86.sse2.pminu.b";
170 }
171 else if (type.width == 16 && type.sign) {
172 intrinsic = "llvm.x86.sse2.pmins.w";
173 }
174 if (util_cpu_caps.has_sse4_1) {
175 if (type.width == 8 && type.sign) {
176 intrinsic = "llvm.x86.sse41.pminsb";
177 }
178 if (type.width == 16 && !type.sign) {
179 intrinsic = "llvm.x86.sse41.pminuw";
180 }
181 if (type.width == 32 && !type.sign) {
182 intrinsic = "llvm.x86.sse41.pminud";
183 }
184 if (type.width == 32 && type.sign) {
185 intrinsic = "llvm.x86.sse41.pminsd";
186 }
187 }
188 } else if (util_cpu_caps.has_altivec) {
189 intr_size = 128;
190 if (type.width == 8) {
191 if (!type.sign) {
192 intrinsic = "llvm.ppc.altivec.vminub";
193 } else {
194 intrinsic = "llvm.ppc.altivec.vminsb";
195 }
196 } else if (type.width == 16) {
197 if (!type.sign) {
198 intrinsic = "llvm.ppc.altivec.vminuh";
199 } else {
200 intrinsic = "llvm.ppc.altivec.vminsh";
201 }
202 } else if (type.width == 32) {
203 if (!type.sign) {
204 intrinsic = "llvm.ppc.altivec.vminuw";
205 } else {
206 intrinsic = "llvm.ppc.altivec.vminsw";
207 }
208 }
209 }
210
211 if (intrinsic) {
212 /* We need to handle NaNs for floating point numbers. If one of the
213 * inputs is a NaN the other input should be returned (required by both
214 * D3D10+ and OpenCL).
215 * The SSE intrinsics return the second operand in case of a NaN by
216 * default, so we need special code to handle those cases.
217 */
218 if (util_cpu_caps.has_sse && type.floating &&
219 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
220 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
221 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
222 LLVMValueRef isnan, min;
223 min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
224 type,
225 intr_size, a, b);
226 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
227 isnan = lp_build_isnan(bld, b);
228 return lp_build_select(bld, isnan, a, min);
229 } else {
230 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
231 isnan = lp_build_isnan(bld, a);
232 return lp_build_select(bld, isnan, a, min);
233 }
234 } else {
235 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
236 type,
237 intr_size, a, b);
238 }
239 }
240
241 if (type.floating) {
242 switch (nan_behavior) {
243 case GALLIVM_NAN_RETURN_NAN: {
244 LLVMValueRef isnan = lp_build_isnan(bld, b);
245 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
246 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
247 return lp_build_select(bld, cond, a, b);
248 }
249 break;
250 case GALLIVM_NAN_RETURN_OTHER: {
251 LLVMValueRef isnan = lp_build_isnan(bld, a);
252 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
253 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
254 return lp_build_select(bld, cond, a, b);
255 }
256 break;
257 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
258 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
259 return lp_build_select(bld, cond, a, b);
260 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
261 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
262 return lp_build_select(bld, cond, b, a);
263 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
264 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
265 return lp_build_select(bld, cond, a, b);
266 break;
267 default:
268 assert(0);
269 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
270 return lp_build_select(bld, cond, a, b);
271 }
272 } else {
273 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
274 return lp_build_select(bld, cond, a, b);
275 }
276 }
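
/*
 * Illustration only (hypothetical scalar sketch, not part of gallivm) of the
 * NaN fixup applied around the SSE min intrinsics above: minps/minss return
 * their second operand whenever either input is a NaN, so for
 * GALLIVM_NAN_RETURN_OTHER the result is patched up with an explicit isnan
 * test on b.
 */
static inline float
lp_min_return_other_sketch(float a, float b)
{
   float m = a < b ? a : b;   /* like minps: yields b when a or b is NaN */
   return b != b ? a : m;     /* if b is NaN, return the other operand */
}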
277
278
279 LLVMValueRef
280 lp_build_fmuladd(LLVMBuilderRef builder,
281 LLVMValueRef a,
282 LLVMValueRef b,
283 LLVMValueRef c)
284 {
285 LLVMTypeRef type = LLVMTypeOf(a);
286 assert(type == LLVMTypeOf(b));
287 assert(type == LLVMTypeOf(c));
288 if (HAVE_LLVM < 0x0304) {
289 /* XXX: LLVM 3.3 does not break down llvm.fmuladd into mul+add when FMA is
290 * not supported, and instead falls back to a C function.
291 */
292 return LLVMBuildFAdd(builder, LLVMBuildFMul(builder, a, b, ""), c, "");
293 }
294 char intrinsic[32];
295 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
296 LLVMValueRef args[] = { a, b, c };
297 return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0);
298 }
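
/*
 * Usage sketch (illustration only): a*b + c on matching float values/vectors,
 *
 *    LLVMValueRef r = lp_build_fmuladd(builder, a, b, c);
 *
 * maps to the llvm.fmuladd.* intrinsic (which the backend is free to fuse
 * into a single FMA) on LLVM >= 3.4, and to an explicit mul + add on older
 * LLVM versions, as noted above.
 */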
299
300
301 /**
302 * Generate max(a, b)
303 * No checks are done for the special-case values a or b = 0 or 1.
304 * NaNs are handled according to the behavior specified by the
305 * nan_behavior argument.
306 */
307 static LLVMValueRef
308 lp_build_max_simple(struct lp_build_context *bld,
309 LLVMValueRef a,
310 LLVMValueRef b,
311 enum gallivm_nan_behavior nan_behavior)
312 {
313 const struct lp_type type = bld->type;
314 const char *intrinsic = NULL;
315 unsigned intr_size = 0;
316 LLVMValueRef cond;
317
318 assert(lp_check_value(type, a));
319 assert(lp_check_value(type, b));
320
321 /* TODO: optimize the constant case */
322
323 if (type.floating && util_cpu_caps.has_sse) {
324 if (type.width == 32) {
325 if (type.length == 1) {
326 intrinsic = "llvm.x86.sse.max.ss";
327 intr_size = 128;
328 }
329 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
330 intrinsic = "llvm.x86.sse.max.ps";
331 intr_size = 128;
332 }
333 else {
334 intrinsic = "llvm.x86.avx.max.ps.256";
335 intr_size = 256;
336 }
337 }
338 if (type.width == 64 && util_cpu_caps.has_sse2) {
339 if (type.length == 1) {
340 intrinsic = "llvm.x86.sse2.max.sd";
341 intr_size = 128;
342 }
343 else if (type.length == 2 || !util_cpu_caps.has_avx) {
344 intrinsic = "llvm.x86.sse2.max.pd";
345 intr_size = 128;
346 }
347 else {
348 intrinsic = "llvm.x86.avx.max.pd.256";
349 intr_size = 256;
350 }
351 }
352 }
353 else if (type.floating && util_cpu_caps.has_altivec) {
354 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
355 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
356 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
357 __FUNCTION__);
358 }
359 if (type.width == 32 && type.length == 4) {
360 intrinsic = "llvm.ppc.altivec.vmaxfp";
361 intr_size = 128;
362 }
363 } else if (HAVE_LLVM < 0x0309 &&
364 util_cpu_caps.has_avx2 && type.length > 4) {
365 intr_size = 256;
366 switch (type.width) {
367 case 8:
368 intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.b" : "llvm.x86.avx2.pmaxu.b";
369 break;
370 case 16:
371 intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.w" : "llvm.x86.avx2.pmaxu.w";
372 break;
373 case 32:
374 intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.d" : "llvm.x86.avx2.pmaxu.d";
375 break;
376 }
377 } else if (HAVE_LLVM < 0x0309 &&
378 util_cpu_caps.has_sse2 && type.length >= 2) {
379 intr_size = 128;
380 if ((type.width == 8 || type.width == 16) &&
381 (type.width * type.length <= 64) &&
382 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
383 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
384 __FUNCTION__);
385 }
386 if (type.width == 8 && !type.sign) {
387 intrinsic = "llvm.x86.sse2.pmaxu.b";
388 intr_size = 128;
389 }
390 else if (type.width == 16 && type.sign) {
391 intrinsic = "llvm.x86.sse2.pmaxs.w";
392 }
393 if (util_cpu_caps.has_sse4_1) {
394 if (type.width == 8 && type.sign) {
395 intrinsic = "llvm.x86.sse41.pmaxsb";
396 }
397 if (type.width == 16 && !type.sign) {
398 intrinsic = "llvm.x86.sse41.pmaxuw";
399 }
400 if (type.width == 32 && !type.sign) {
401 intrinsic = "llvm.x86.sse41.pmaxud";
402 }
403 if (type.width == 32 && type.sign) {
404 intrinsic = "llvm.x86.sse41.pmaxsd";
405 }
406 }
407 } else if (util_cpu_caps.has_altivec) {
408 intr_size = 128;
409 if (type.width == 8) {
410 if (!type.sign) {
411 intrinsic = "llvm.ppc.altivec.vmaxub";
412 } else {
413 intrinsic = "llvm.ppc.altivec.vmaxsb";
414 }
415 } else if (type.width == 16) {
416 if (!type.sign) {
417 intrinsic = "llvm.ppc.altivec.vmaxuh";
418 } else {
419 intrinsic = "llvm.ppc.altivec.vmaxsh";
420 }
421 } else if (type.width == 32) {
422 if (!type.sign) {
423 intrinsic = "llvm.ppc.altivec.vmaxuw";
424 } else {
425 intrinsic = "llvm.ppc.altivec.vmaxsw";
426 }
427 }
428 }
429
430 if (intrinsic) {
431 if (util_cpu_caps.has_sse && type.floating &&
432 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
433 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
434 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
435 LLVMValueRef isnan, max;
436 max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
437 type,
438 intr_size, a, b);
439 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
440 isnan = lp_build_isnan(bld, b);
441 return lp_build_select(bld, isnan, a, max);
442 } else {
443 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
444 isnan = lp_build_isnan(bld, a);
445 return lp_build_select(bld, isnan, a, max);
446 }
447 } else {
448 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
449 type,
450 intr_size, a, b);
451 }
452 }
453
454 if (type.floating) {
455 switch (nan_behavior) {
456 case GALLIVM_NAN_RETURN_NAN: {
457 LLVMValueRef isnan = lp_build_isnan(bld, b);
458 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
459 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
460 return lp_build_select(bld, cond, a, b);
461 }
462 break;
463 case GALLIVM_NAN_RETURN_OTHER: {
464 LLVMValueRef isnan = lp_build_isnan(bld, a);
465 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
466 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
467 return lp_build_select(bld, cond, a, b);
468 }
469 break;
470 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
471 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
472 return lp_build_select(bld, cond, a, b);
473 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
474 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
475 return lp_build_select(bld, cond, b, a);
476 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
477 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
478 return lp_build_select(bld, cond, a, b);
479 break;
480 default:
481 assert(0);
482 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
483 return lp_build_select(bld, cond, a, b);
484 }
485 } else {
486 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
487 return lp_build_select(bld, cond, a, b);
488 }
489 }
490
491
492 /**
493 * Generate 1 - a, or ~a depending on bld->type.
494 */
495 LLVMValueRef
496 lp_build_comp(struct lp_build_context *bld,
497 LLVMValueRef a)
498 {
499 LLVMBuilderRef builder = bld->gallivm->builder;
500 const struct lp_type type = bld->type;
501
502 assert(lp_check_value(type, a));
503
504 if(a == bld->one)
505 return bld->zero;
506 if(a == bld->zero)
507 return bld->one;
508
509 if(type.norm && !type.floating && !type.fixed && !type.sign) {
510 if(LLVMIsConstant(a))
511 return LLVMConstNot(a);
512 else
513 return LLVMBuildNot(builder, a, "");
514 }
515
516 if(LLVMIsConstant(a))
517 if (type.floating)
518 return LLVMConstFSub(bld->one, a);
519 else
520 return LLVMConstSub(bld->one, a);
521 else
522 if (type.floating)
523 return LLVMBuildFSub(builder, bld->one, a, "");
524 else
525 return LLVMBuildSub(builder, bld->one, a, "");
526 }
527
528
529 /**
530 * Generate a + b
531 */
532 LLVMValueRef
533 lp_build_add(struct lp_build_context *bld,
534 LLVMValueRef a,
535 LLVMValueRef b)
536 {
537 LLVMBuilderRef builder = bld->gallivm->builder;
538 const struct lp_type type = bld->type;
539 LLVMValueRef res;
540
541 assert(lp_check_value(type, a));
542 assert(lp_check_value(type, b));
543
544 if(a == bld->zero)
545 return b;
546 if(b == bld->zero)
547 return a;
548 if(a == bld->undef || b == bld->undef)
549 return bld->undef;
550
551 if(bld->type.norm) {
552 const char *intrinsic = NULL;
553
554 if(a == bld->one || b == bld->one)
555 return bld->one;
556
557 if (!type.floating && !type.fixed) {
558 if (type.width * type.length == 128) {
559 if(util_cpu_caps.has_sse2) {
560 if(type.width == 8)
561 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
562 if(type.width == 16)
563 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
564 } else if (util_cpu_caps.has_altivec) {
565 if(type.width == 8)
566 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
567 if(type.width == 16)
568 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
569 }
570 }
571 if (type.width * type.length == 256) {
572 if(util_cpu_caps.has_avx2) {
573 if(type.width == 8)
574 intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b";
575 if(type.width == 16)
576 intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : "llvm.x86.avx2.paddus.w";
577 }
578 }
579 }
580
581 if (intrinsic)
582 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
583 }
584
585 if(type.norm && !type.floating && !type.fixed) {
586 if (type.sign) {
587 uint64_t sign = (uint64_t)1 << (type.width - 1);
588 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
589 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
590 /* a_clamp_max is the maximum a for positive b,
591 a_clamp_min is the minimum a for negative b. */
592 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
593 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
594 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
595 } else {
596 a = lp_build_min_simple(bld, a, lp_build_comp(bld, b), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
597 }
598 }
599
600 if(LLVMIsConstant(a) && LLVMIsConstant(b))
601 if (type.floating)
602 res = LLVMConstFAdd(a, b);
603 else
604 res = LLVMConstAdd(a, b);
605 else
606 if (type.floating)
607 res = LLVMBuildFAdd(builder, a, b, "");
608 else
609 res = LLVMBuildAdd(builder, a, b, "");
610
611 /* clamp to ceiling of 1.0 */
612 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
613 res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
614
615 /* XXX clamp to floor of -1 or 0??? */
616
617 return res;
618 }
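
/*
 * Illustration only (hypothetical helper, not part of gallivm): scalar
 * equivalent of the unsigned saturated add emulation above, where a is first
 * clamped to min(a, ~b) -- i.e. min(a, 255 - b) for 8 bits -- so that a + b
 * can never wrap around.
 */
static inline unsigned
lp_addus_u8_sketch(unsigned a, unsigned b)
{
   /* assumes a and b are in [0, 255] */
   a = MIN2(a, 255 - b);   /* same as min(a, ~b) in 8-bit arithmetic */
   return a + b;
}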
619
620
621 /** Return the scalar sum of the elements of a.
622 * This operation should be avoided whenever possible.
623 */
624 LLVMValueRef
625 lp_build_horizontal_add(struct lp_build_context *bld,
626 LLVMValueRef a)
627 {
628 LLVMBuilderRef builder = bld->gallivm->builder;
629 const struct lp_type type = bld->type;
630 LLVMValueRef index, res;
631 unsigned i, length;
632 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
633 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
634 LLVMValueRef vecres, elem2;
635
636 assert(lp_check_value(type, a));
637
638 if (type.length == 1) {
639 return a;
640 }
641
642 assert(!bld->type.norm);
643
644 /*
645 * For byte vectors we could do much better with psadbw.
646 * Using repeated shuffle/adds here. Note that with multiple vectors
647 * this can be done more efficiently, as outlined in the Intel
648 * optimization manual.
649 * Note: could cause data rearrangement if used with smaller element
650 * sizes.
651 */
652
653 vecres = a;
654 length = type.length / 2;
655 while (length > 1) {
656 LLVMValueRef vec1, vec2;
657 for (i = 0; i < length; i++) {
658 shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
659 shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
660 }
661 vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
662 LLVMConstVector(shuffles1, length), "");
663 vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
664 LLVMConstVector(shuffles2, length), "");
665 if (type.floating) {
666 vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
667 }
668 else {
669 vecres = LLVMBuildAdd(builder, vec1, vec2, "");
670 }
671 length = length >> 1;
672 }
673
674 /* always have vector of size 2 here */
675 assert(length == 1);
676
677 index = lp_build_const_int32(bld->gallivm, 0);
678 res = LLVMBuildExtractElement(builder, vecres, index, "");
679 index = lp_build_const_int32(bld->gallivm, 1);
680 elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
681
682 if (type.floating)
683 res = LLVMBuildFAdd(builder, res, elem2, "");
684 else
685 res = LLVMBuildAdd(builder, res, elem2, "");
686
687 return res;
688 }
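
/*
 * For illustration of the reduction loop above: with a length 8 vector
 * {a,b,c,d,e,f,g,h} the shuffles and adds proceed as
 *
 *    {a,b,c,d} + {e,f,g,h}    -> {a+e, b+f, c+g, d+h}
 *    {a+e, b+f} + {c+g, d+h}  -> {a+e+c+g, b+f+d+h}
 *
 * and the final two elements are extracted and added as scalars.
 */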
689
690 /**
691 * Return the horizontal sums of 4 float vectors as a float4 vector.
692 * This uses the technique as outlined in Intel Optimization Manual.
693 */
694 static LLVMValueRef
695 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
696 LLVMValueRef src[4])
697 {
698 struct gallivm_state *gallivm = bld->gallivm;
699 LLVMBuilderRef builder = gallivm->builder;
700 LLVMValueRef shuffles[4];
701 LLVMValueRef tmp[4];
702 LLVMValueRef sumtmp[2], shuftmp[2];
703
704 /* lower half of regs */
705 shuffles[0] = lp_build_const_int32(gallivm, 0);
706 shuffles[1] = lp_build_const_int32(gallivm, 1);
707 shuffles[2] = lp_build_const_int32(gallivm, 4);
708 shuffles[3] = lp_build_const_int32(gallivm, 5);
709 tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
710 LLVMConstVector(shuffles, 4), "");
711 tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
712 LLVMConstVector(shuffles, 4), "");
713
714 /* upper half of regs */
715 shuffles[0] = lp_build_const_int32(gallivm, 2);
716 shuffles[1] = lp_build_const_int32(gallivm, 3);
717 shuffles[2] = lp_build_const_int32(gallivm, 6);
718 shuffles[3] = lp_build_const_int32(gallivm, 7);
719 tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
720 LLVMConstVector(shuffles, 4), "");
721 tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
722 LLVMConstVector(shuffles, 4), "");
723
724 sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
725 sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
726
727 shuffles[0] = lp_build_const_int32(gallivm, 0);
728 shuffles[1] = lp_build_const_int32(gallivm, 2);
729 shuffles[2] = lp_build_const_int32(gallivm, 4);
730 shuffles[3] = lp_build_const_int32(gallivm, 6);
731 shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
732 LLVMConstVector(shuffles, 4), "");
733
734 shuffles[0] = lp_build_const_int32(gallivm, 1);
735 shuffles[1] = lp_build_const_int32(gallivm, 3);
736 shuffles[2] = lp_build_const_int32(gallivm, 5);
737 shuffles[3] = lp_build_const_int32(gallivm, 7);
738 shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
739 LLVMConstVector(shuffles, 4), "");
740
741 return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
742 }
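
/*
 * Data flow of the shuffles above, for src = {x0..x3}, {y0..y3}, {z0..z3},
 * {w0..w3}:
 *
 *    tmp[0] = x0 x1 y0 y1        tmp[1] = x2 x3 y2 y3
 *    tmp[2] = z0 z1 w0 w1        tmp[3] = z2 z3 w2 w3
 *    sumtmp[0] = x0+x2 x1+x3 y0+y2 y1+y3
 *    sumtmp[1] = z0+z2 z1+z3 w0+w2 w1+w3
 *    shuftmp[0] = x0+x2 y0+y2 z0+z2 w0+w2
 *    shuftmp[1] = x1+x3 y1+y3 z1+z3 w1+w3
 *
 * so the final add yields {sum(x), sum(y), sum(z), sum(w)}.
 */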
743
744
745 /*
746 * Partially horizontally add 2-4 float vectors with length n*4,
747 * i.e. only four adjacent values in each vector will be added,
748 * assuming the values are really grouped in fours, which also determines
749 * the output order.
750 *
751 * Return a vector of the same length as the initial vectors,
752 * with the excess elements (if any) being undefined.
753 * The element order is independent of number of input vectors.
754 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
755 * the output order thus will be
756 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
757 */
758 LLVMValueRef
759 lp_build_hadd_partial4(struct lp_build_context *bld,
760 LLVMValueRef vectors[],
761 unsigned num_vecs)
762 {
763 struct gallivm_state *gallivm = bld->gallivm;
764 LLVMBuilderRef builder = gallivm->builder;
765 LLVMValueRef ret_vec;
766 LLVMValueRef tmp[4];
767 const char *intrinsic = NULL;
768
769 assert(num_vecs >= 2 && num_vecs <= 4);
770 assert(bld->type.floating);
771
772 /* only use this with at least 2 vectors, as it is sort of expensive
773 * (depending on cpu) and we always need two horizontal adds anyway,
774 * so a shuffle/add approach might be better.
775 */
776
777 tmp[0] = vectors[0];
778 tmp[1] = vectors[1];
779
780 tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
781 tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
782
783 if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
784 bld->type.length == 4) {
785 intrinsic = "llvm.x86.sse3.hadd.ps";
786 }
787 else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
788 bld->type.length == 8) {
789 intrinsic = "llvm.x86.avx.hadd.ps.256";
790 }
791 if (intrinsic) {
792 tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
793 lp_build_vec_type(gallivm, bld->type),
794 tmp[0], tmp[1]);
795 if (num_vecs > 2) {
796 tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
797 lp_build_vec_type(gallivm, bld->type),
798 tmp[2], tmp[3]);
799 }
800 else {
801 tmp[1] = tmp[0];
802 }
803 return lp_build_intrinsic_binary(builder, intrinsic,
804 lp_build_vec_type(gallivm, bld->type),
805 tmp[0], tmp[1]);
806 }
807
808 if (bld->type.length == 4) {
809 ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
810 }
811 else {
812 LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
813 unsigned j;
814 unsigned num_iter = bld->type.length / 4;
815 struct lp_type parttype = bld->type;
816 parttype.length = 4;
817 for (j = 0; j < num_iter; j++) {
818 LLVMValueRef partsrc[4];
819 unsigned i;
820 for (i = 0; i < 4; i++) {
821 partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
822 }
823 partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
824 }
825 ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
826 }
827 return ret_vec;
828 }
829
830 /**
831 * Generate a - b
832 */
833 LLVMValueRef
834 lp_build_sub(struct lp_build_context *bld,
835 LLVMValueRef a,
836 LLVMValueRef b)
837 {
838 LLVMBuilderRef builder = bld->gallivm->builder;
839 const struct lp_type type = bld->type;
840 LLVMValueRef res;
841
842 assert(lp_check_value(type, a));
843 assert(lp_check_value(type, b));
844
845 if(b == bld->zero)
846 return a;
847 if(a == bld->undef || b == bld->undef)
848 return bld->undef;
849 if(a == b)
850 return bld->zero;
851
852 if(bld->type.norm) {
853 const char *intrinsic = NULL;
854
855 if(b == bld->one)
856 return bld->zero;
857
858 if (!type.floating && !type.fixed) {
859 if (type.width * type.length == 128) {
860 if (util_cpu_caps.has_sse2) {
861 if(type.width == 8)
862 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
863 if(type.width == 16)
864 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
865 } else if (util_cpu_caps.has_altivec) {
866 if(type.width == 8)
867 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
868 if(type.width == 16)
869 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
870 }
871 }
872 if (type.width * type.length == 256) {
873 if (util_cpu_caps.has_avx2) {
874 if(type.width == 8)
875 intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b";
876 if(type.width == 16)
877 intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" : "llvm.x86.avx2.psubus.w";
878 }
879 }
880 }
881
882 if (intrinsic)
883 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
884 }
885
886 if(type.norm && !type.floating && !type.fixed) {
887 if (type.sign) {
888 uint64_t sign = (uint64_t)1 << (type.width - 1);
889 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
890 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
891 /* a_clamp_max is the maximum a for negative b,
892 a_clamp_min is the minimum a for positive b. */
893 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
894 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
895 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
896 } else {
897 a = lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
898 }
899 }
900
901 if(LLVMIsConstant(a) && LLVMIsConstant(b))
902 if (type.floating)
903 res = LLVMConstFSub(a, b);
904 else
905 res = LLVMConstSub(a, b);
906 else
907 if (type.floating)
908 res = LLVMBuildFSub(builder, a, b, "");
909 else
910 res = LLVMBuildSub(builder, a, b, "");
911
912 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
913 res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
914
915 return res;
916 }
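
/*
 * Illustration only (hypothetical helper, not part of gallivm): scalar
 * equivalent of the unsigned saturated subtract emulation above; clamping a
 * to max(a, b) first guarantees that a - b can never wrap below zero.
 */
static inline unsigned
lp_subus_u8_sketch(unsigned a, unsigned b)
{
   /* assumes a and b are in [0, 255] */
   a = MAX2(a, b);
   return a - b;
}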
917
918
919
920 /**
921 * Normalized multiplication.
922 *
923 * There are several approaches (using 8-bit normalized multiplication as
924 * an example):
925 *
926 * - alpha plus one
927 *
928 * makes the following approximation to the division (Sree)
929 *
930 * a*b/255 ~= (a*(b + 1)) >> 8
931 *
932 * which is the fastest method that satisfies the following OpenGL criteria:
933 *
934 * 0*0 = 0 and 255*255 = 255
935 *
936 * - geometric series
937 *
938 * takes the geometric series approximation to the division
939 *
940 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
941 *
942 * in this case taking just the first two terms so it fits in 16-bit arithmetic
943 *
944 * t/255 ~= (t + (t >> 8)) >> 8
945 *
946 * note that just by itself it doesn't satisfy the OpenGL criteria, as
947 * 255*255 = 254, so the special case b = 255 must be accounted for or
948 * rounding must be used.
949 *
950 * - geometric series plus rounding
951 *
952 * when using the geometric series division, instead of truncating the
953 * result, use rounding in the approximation (Jim Blinn)
954 *
955 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
956 *
957 * which achieves exact results.
958 *
959 *
960 *
961 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
962 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
963 * @sa Michael Herf, The "double blend trick", May 2000,
964 * http://www.stereopsis.com/doubleblend.html
965 */
966 static LLVMValueRef
967 lp_build_mul_norm(struct gallivm_state *gallivm,
968 struct lp_type wide_type,
969 LLVMValueRef a, LLVMValueRef b)
970 {
971 LLVMBuilderRef builder = gallivm->builder;
972 struct lp_build_context bld;
973 unsigned n;
974 LLVMValueRef half;
975 LLVMValueRef ab;
976
977 assert(!wide_type.floating);
978 assert(lp_check_value(wide_type, a));
979 assert(lp_check_value(wide_type, b));
980
981 lp_build_context_init(&bld, gallivm, wide_type);
982
983 n = wide_type.width / 2;
984 if (wide_type.sign) {
985 --n;
986 }
987
988 /*
989 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
990 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
991 */
992
993 /*
994 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
995 */
996
997 ab = LLVMBuildMul(builder, a, b, "");
998 ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
999
1000 /*
1001 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
1002 */
1003
1004 half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
1005 if (wide_type.sign) {
1006 LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
1007 LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
1008 half = lp_build_select(&bld, sign, minus_half, half);
1009 }
1010 ab = LLVMBuildAdd(builder, ab, half, "");
1011
1012 /* Final division */
1013 ab = lp_build_shr_imm(&bld, ab, n);
1014
1015 return ab;
1016 }
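
/*
 * Illustration only (hypothetical helper, not part of gallivm): scalar 8-bit
 * unsigned version of the formula used above,
 *
 *    a*b / 255 ~= (a*b + (a*b >> 8) + 0x80) >> 8
 *
 * which is exact at the endpoints (0*0 = 0 and 255*255 = 255).
 */
static inline unsigned
lp_mul_norm_u8_sketch(unsigned a, unsigned b)
{
   unsigned ab = a * b;   /* full 16-bit product, assumes a, b in [0, 255] */
   ab += ab >> 8;         /* geometric series correction term */
   ab += 0x80;            /* round to nearest instead of truncating */
   return ab >> 8;        /* approximate division by 255 */
}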
1017
1018 /**
1019 * Generate a * b
1020 */
1021 LLVMValueRef
1022 lp_build_mul(struct lp_build_context *bld,
1023 LLVMValueRef a,
1024 LLVMValueRef b)
1025 {
1026 LLVMBuilderRef builder = bld->gallivm->builder;
1027 const struct lp_type type = bld->type;
1028 LLVMValueRef shift;
1029 LLVMValueRef res;
1030
1031 assert(lp_check_value(type, a));
1032 assert(lp_check_value(type, b));
1033
1034 if(a == bld->zero)
1035 return bld->zero;
1036 if(a == bld->one)
1037 return b;
1038 if(b == bld->zero)
1039 return bld->zero;
1040 if(b == bld->one)
1041 return a;
1042 if(a == bld->undef || b == bld->undef)
1043 return bld->undef;
1044
1045 if (!type.floating && !type.fixed && type.norm) {
1046 struct lp_type wide_type = lp_wider_type(type);
1047 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
1048
1049 lp_build_unpack2(bld->gallivm, type, wide_type, a, &al, &ah);
1050 lp_build_unpack2(bld->gallivm, type, wide_type, b, &bl, &bh);
1051
1052 /* PMULLW, PSRLW, PADDW */
1053 abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
1054 abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
1055
1056 ab = lp_build_pack2(bld->gallivm, wide_type, type, abl, abh);
1057
1058 return ab;
1059 }
1060
1061 if(type.fixed)
1062 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
1063 else
1064 shift = NULL;
1065
1066 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1067 if (type.floating)
1068 res = LLVMConstFMul(a, b);
1069 else
1070 res = LLVMConstMul(a, b);
1071 if(shift) {
1072 if(type.sign)
1073 res = LLVMConstAShr(res, shift);
1074 else
1075 res = LLVMConstLShr(res, shift);
1076 }
1077 }
1078 else {
1079 if (type.floating)
1080 res = LLVMBuildFMul(builder, a, b, "");
1081 else
1082 res = LLVMBuildMul(builder, a, b, "");
1083 if(shift) {
1084 if(type.sign)
1085 res = LLVMBuildAShr(builder, res, shift, "");
1086 else
1087 res = LLVMBuildLShr(builder, res, shift, "");
1088 }
1089 }
1090
1091 return res;
1092 }
1093
1094
1095 /* a * b + c */
1096 LLVMValueRef
1097 lp_build_mad(struct lp_build_context *bld,
1098 LLVMValueRef a,
1099 LLVMValueRef b,
1100 LLVMValueRef c)
1101 {
1102 const struct lp_type type = bld->type;
1103 if (type.floating) {
1104 return lp_build_fmuladd(bld->gallivm->builder, a, b, c);
1105 } else {
1106 return lp_build_add(bld, lp_build_mul(bld, a, b), c);
1107 }
1108 }
1109
1110
1111 /**
1112 * Optimized multiplication of a vector by a small integer constant.
1113 */
1114 LLVMValueRef
1115 lp_build_mul_imm(struct lp_build_context *bld,
1116 LLVMValueRef a,
1117 int b)
1118 {
1119 LLVMBuilderRef builder = bld->gallivm->builder;
1120 LLVMValueRef factor;
1121
1122 assert(lp_check_value(bld->type, a));
1123
1124 if(b == 0)
1125 return bld->zero;
1126
1127 if(b == 1)
1128 return a;
1129
1130 if(b == -1)
1131 return lp_build_negate(bld, a);
1132
1133 if(b == 2 && bld->type.floating)
1134 return lp_build_add(bld, a, a);
1135
1136 if(util_is_power_of_two(b)) {
1137 unsigned shift = ffs(b) - 1;
1138
1139 if(bld->type.floating) {
1140 #if 0
1141 /*
1142 * Power of two multiplication by directly manipulating the exponent.
1143 *
1144 * XXX: This might not always be faster, and it will introduce a small error
1145 * for multiplication by zero, and it will produce wrong results
1146 * for Inf and NaN.
1147 */
1148 unsigned mantissa = lp_mantissa(bld->type);
1149 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1150 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1151 a = LLVMBuildAdd(builder, a, factor, "");
1152 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1153 return a;
1154 #endif
1155 }
1156 else {
1157 factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1158 return LLVMBuildShl(builder, a, factor, "");
1159 }
1160 }
1161
1162 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1163 return lp_build_mul(bld, a, factor);
1164 }
1165
1166
1167 /**
1168 * Generate a / b
1169 */
1170 LLVMValueRef
1171 lp_build_div(struct lp_build_context *bld,
1172 LLVMValueRef a,
1173 LLVMValueRef b)
1174 {
1175 LLVMBuilderRef builder = bld->gallivm->builder;
1176 const struct lp_type type = bld->type;
1177
1178 assert(lp_check_value(type, a));
1179 assert(lp_check_value(type, b));
1180
1181 if(a == bld->zero)
1182 return bld->zero;
1183 if(a == bld->one && type.floating)
1184 return lp_build_rcp(bld, b);
1185 if(b == bld->zero)
1186 return bld->undef;
1187 if(b == bld->one)
1188 return a;
1189 if(a == bld->undef || b == bld->undef)
1190 return bld->undef;
1191
1192 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1193 if (type.floating)
1194 return LLVMConstFDiv(a, b);
1195 else if (type.sign)
1196 return LLVMConstSDiv(a, b);
1197 else
1198 return LLVMConstUDiv(a, b);
1199 }
1200
1201 if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1202 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
1203 type.floating)
1204 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1205
1206 if (type.floating)
1207 return LLVMBuildFDiv(builder, a, b, "");
1208 else if (type.sign)
1209 return LLVMBuildSDiv(builder, a, b, "");
1210 else
1211 return LLVMBuildUDiv(builder, a, b, "");
1212 }
1213
1214
1215 /**
1216 * Linear interpolation helper.
1217 *
1218 * @param flags   LP_BLD_LERP_WIDE_NORMALIZED if we are interpolating
1219 * normalized values encoded in integers twice as wide.
1220 *
1221 * @sa http://www.stereopsis.com/doubleblend.html
1222 */
1223 static inline LLVMValueRef
1224 lp_build_lerp_simple(struct lp_build_context *bld,
1225 LLVMValueRef x,
1226 LLVMValueRef v0,
1227 LLVMValueRef v1,
1228 unsigned flags)
1229 {
1230 unsigned half_width = bld->type.width/2;
1231 LLVMBuilderRef builder = bld->gallivm->builder;
1232 LLVMValueRef delta;
1233 LLVMValueRef res;
1234
1235 assert(lp_check_value(bld->type, x));
1236 assert(lp_check_value(bld->type, v0));
1237 assert(lp_check_value(bld->type, v1));
1238
1239 delta = lp_build_sub(bld, v1, v0);
1240
1241 if (bld->type.floating) {
1242 assert(flags == 0);
1243 return lp_build_mad(bld, x, delta, v0);
1244 }
1245
1246 if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1247 if (!bld->type.sign) {
1248 if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1249 /*
1250 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1251 * most significant bit to the least significant bit, so that
1252 * later we can just divide by 2**n instead of 2**n - 1.
1253 */
1254
1255 x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1256 }
1257
1258 /* (x * delta) >> n */
1259 res = lp_build_mul(bld, x, delta);
1260 res = lp_build_shr_imm(bld, res, half_width);
1261 } else {
1262 /*
1263 * The rescaling trick above doesn't work for signed numbers, so
1264 * use the 2**n - 1 division approximation in lp_build_mul_norm
1265 * instead.
1266 */
1267 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1268 res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1269 }
1270 } else {
1271 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1272 res = lp_build_mul(bld, x, delta);
1273 }
1274
1275 if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
1276 /*
1277 * At this point both res and v0 only use the lower half of the bits,
1278 * the rest is zero. Instead of add / mask, do add with half wide type.
1279 */
1280 struct lp_type narrow_type;
1281 struct lp_build_context narrow_bld;
1282
1283 memset(&narrow_type, 0, sizeof narrow_type);
1284 narrow_type.sign = bld->type.sign;
1285 narrow_type.width = bld->type.width/2;
1286 narrow_type.length = bld->type.length*2;
1287
1288 lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
1289 res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
1290 v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
1291 res = lp_build_add(&narrow_bld, v0, res);
1292 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
1293 } else {
1294 res = lp_build_add(bld, v0, res);
1295
1296 if (bld->type.fixed) {
1297 /*
1298 * We need to mask out the high order bits when lerping 8-bit
1299 * normalized colors stored in 16 bits
1300 */
1301 /* XXX: This step is necessary for lerping 8-bit colors stored in
1302 * 16 bits, but it will be wrong for true fixed point use cases.
1303 * Basically we need a more powerful lp_type, capable of further
1304 * distinguishing the values interpretation from the value storage.
1305 */
1306 LLVMValueRef low_bits;
1307 low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
1308 res = LLVMBuildAnd(builder, res, low_bits, "");
1309 }
1310 }
1311
1312 return res;
1313 }
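
/*
 * Illustration only (hypothetical helper, not part of gallivm): scalar sketch
 * of the unsigned, non-prescaled LP_BLD_LERP_WIDE_NORMALIZED path above for
 * 8-bit values widened to 16 bits. Assumes the compiler implements >> on
 * negative values as an arithmetic shift, a subtlety the vector code does not
 * have to worry about.
 */
static inline unsigned
lp_lerp_norm_u8_sketch(unsigned x, unsigned v0, unsigned v1)
{
   /* assumes x, v0 and v1 are in [0, 255] */
   int delta = (int)v1 - (int)v0;
   int w = (int)(x + (x >> 7));        /* rescale x from [0,255] to [0,256] */
   return (unsigned)((int)v0 + ((w * delta) >> 8));
}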
1314
1315
1316 /**
1317 * Linear interpolation.
1318 */
1319 LLVMValueRef
1320 lp_build_lerp(struct lp_build_context *bld,
1321 LLVMValueRef x,
1322 LLVMValueRef v0,
1323 LLVMValueRef v1,
1324 unsigned flags)
1325 {
1326 const struct lp_type type = bld->type;
1327 LLVMValueRef res;
1328
1329 assert(lp_check_value(type, x));
1330 assert(lp_check_value(type, v0));
1331 assert(lp_check_value(type, v1));
1332
1333 assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1334
1335 if (type.norm) {
1336 struct lp_type wide_type;
1337 struct lp_build_context wide_bld;
1338 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1339
1340 assert(type.length >= 2);
1341
1342 /*
1343 * Create a wider integer type, enough to hold the
1344 * intermediate result of the multiplication.
1345 */
1346 memset(&wide_type, 0, sizeof wide_type);
1347 wide_type.sign = type.sign;
1348 wide_type.width = type.width*2;
1349 wide_type.length = type.length/2;
1350
1351 lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1352
1353 lp_build_unpack2(bld->gallivm, type, wide_type, x, &xl, &xh);
1354 lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1355 lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1356
1357 /*
1358 * Lerp both halves.
1359 */
1360
1361 flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1362
1363 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1364 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1365
1366 res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
1367 } else {
1368 res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1369 }
1370
1371 return res;
1372 }
1373
1374
1375 /**
1376 * Bilinear interpolation.
1377 *
1378 * Value indices are in v_{yx}.
1379 */
1380 LLVMValueRef
1381 lp_build_lerp_2d(struct lp_build_context *bld,
1382 LLVMValueRef x,
1383 LLVMValueRef y,
1384 LLVMValueRef v00,
1385 LLVMValueRef v01,
1386 LLVMValueRef v10,
1387 LLVMValueRef v11,
1388 unsigned flags)
1389 {
1390 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1391 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1392 return lp_build_lerp(bld, y, v0, v1, flags);
1393 }
1394
1395
1396 LLVMValueRef
1397 lp_build_lerp_3d(struct lp_build_context *bld,
1398 LLVMValueRef x,
1399 LLVMValueRef y,
1400 LLVMValueRef z,
1401 LLVMValueRef v000,
1402 LLVMValueRef v001,
1403 LLVMValueRef v010,
1404 LLVMValueRef v011,
1405 LLVMValueRef v100,
1406 LLVMValueRef v101,
1407 LLVMValueRef v110,
1408 LLVMValueRef v111,
1409 unsigned flags)
1410 {
1411 LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1412 LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1413 return lp_build_lerp(bld, z, v0, v1, flags);
1414 }
1415
1416
1417 /**
1418 * Generate min(a, b)
1419 * Do checks for special cases; NaN behavior is undefined.
1420 */
1421 LLVMValueRef
1422 lp_build_min(struct lp_build_context *bld,
1423 LLVMValueRef a,
1424 LLVMValueRef b)
1425 {
1426 assert(lp_check_value(bld->type, a));
1427 assert(lp_check_value(bld->type, b));
1428
1429 if(a == bld->undef || b == bld->undef)
1430 return bld->undef;
1431
1432 if(a == b)
1433 return a;
1434
1435 if (bld->type.norm) {
1436 if (!bld->type.sign) {
1437 if (a == bld->zero || b == bld->zero) {
1438 return bld->zero;
1439 }
1440 }
1441 if(a == bld->one)
1442 return b;
1443 if(b == bld->one)
1444 return a;
1445 }
1446
1447 return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1448 }
1449
1450
1451 /**
1452 * Generate min(a, b)
1453 * NaNs are handled according to the behavior specified by the
1454 * nan_behavior argument.
1455 */
1456 LLVMValueRef
1457 lp_build_min_ext(struct lp_build_context *bld,
1458 LLVMValueRef a,
1459 LLVMValueRef b,
1460 enum gallivm_nan_behavior nan_behavior)
1461 {
1462 assert(lp_check_value(bld->type, a));
1463 assert(lp_check_value(bld->type, b));
1464
1465 if(a == bld->undef || b == bld->undef)
1466 return bld->undef;
1467
1468 if(a == b)
1469 return a;
1470
1471 if (bld->type.norm) {
1472 if (!bld->type.sign) {
1473 if (a == bld->zero || b == bld->zero) {
1474 return bld->zero;
1475 }
1476 }
1477 if(a == bld->one)
1478 return b;
1479 if(b == bld->one)
1480 return a;
1481 }
1482
1483 return lp_build_min_simple(bld, a, b, nan_behavior);
1484 }
1485
1486 /**
1487 * Generate max(a, b)
1488 * Do checks for special cases, but NaN behavior is undefined.
1489 */
1490 LLVMValueRef
1491 lp_build_max(struct lp_build_context *bld,
1492 LLVMValueRef a,
1493 LLVMValueRef b)
1494 {
1495 assert(lp_check_value(bld->type, a));
1496 assert(lp_check_value(bld->type, b));
1497
1498 if(a == bld->undef || b == bld->undef)
1499 return bld->undef;
1500
1501 if(a == b)
1502 return a;
1503
1504 if(bld->type.norm) {
1505 if(a == bld->one || b == bld->one)
1506 return bld->one;
1507 if (!bld->type.sign) {
1508 if (a == bld->zero) {
1509 return b;
1510 }
1511 if (b == bld->zero) {
1512 return a;
1513 }
1514 }
1515 }
1516
1517 return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1518 }
1519
1520
1521 /**
1522 * Generate max(a, b)
1523 * Checks for special cases.
1524 * NaNs are handled according to the behavior specified by the
1525 * nan_behavior argument.
1526 */
1527 LLVMValueRef
1528 lp_build_max_ext(struct lp_build_context *bld,
1529 LLVMValueRef a,
1530 LLVMValueRef b,
1531 enum gallivm_nan_behavior nan_behavior)
1532 {
1533 assert(lp_check_value(bld->type, a));
1534 assert(lp_check_value(bld->type, b));
1535
1536 if(a == bld->undef || b == bld->undef)
1537 return bld->undef;
1538
1539 if(a == b)
1540 return a;
1541
1542 if(bld->type.norm) {
1543 if(a == bld->one || b == bld->one)
1544 return bld->one;
1545 if (!bld->type.sign) {
1546 if (a == bld->zero) {
1547 return b;
1548 }
1549 if (b == bld->zero) {
1550 return a;
1551 }
1552 }
1553 }
1554
1555 return lp_build_max_simple(bld, a, b, nan_behavior);
1556 }
1557
1558 /**
1559 * Generate clamp(a, min, max)
1560 * NaN behavior (for any of a, min, max) is undefined.
1561 * Do checks for special cases.
1562 */
1563 LLVMValueRef
1564 lp_build_clamp(struct lp_build_context *bld,
1565 LLVMValueRef a,
1566 LLVMValueRef min,
1567 LLVMValueRef max)
1568 {
1569 assert(lp_check_value(bld->type, a));
1570 assert(lp_check_value(bld->type, min));
1571 assert(lp_check_value(bld->type, max));
1572
1573 a = lp_build_min(bld, a, max);
1574 a = lp_build_max(bld, a, min);
1575 return a;
1576 }
1577
1578
1579 /**
1580 * Generate clamp(a, 0, 1)
1581 * A NaN will get converted to zero.
1582 */
1583 LLVMValueRef
1584 lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
1585 LLVMValueRef a)
1586 {
1587 a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1588 a = lp_build_min(bld, a, bld->one);
1589 return a;
1590 }
1591
1592
1593 /**
1594 * Generate abs(a)
1595 */
1596 LLVMValueRef
1597 lp_build_abs(struct lp_build_context *bld,
1598 LLVMValueRef a)
1599 {
1600 LLVMBuilderRef builder = bld->gallivm->builder;
1601 const struct lp_type type = bld->type;
1602 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1603
1604 assert(lp_check_value(type, a));
1605
1606 if(!type.sign)
1607 return a;
1608
1609 if(type.floating) {
1610 if (0x0306 <= HAVE_LLVM && HAVE_LLVM < 0x0309) {
1611 /* Workaround llvm.org/PR27332 */
1612 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1613 unsigned long long absMask = ~(1ULL << (type.width - 1));
1614 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
1615 a = LLVMBuildBitCast(builder, a, int_vec_type, "");
1616 a = LLVMBuildAnd(builder, a, mask, "");
1617 a = LLVMBuildBitCast(builder, a, vec_type, "");
1618 return a;
1619 } else {
1620 char intrinsic[32];
1621 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
1622 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
1623 }
1624 }
1625
1626 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
1627 switch(type.width) {
1628 case 8:
1629 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1630 case 16:
1631 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1632 case 32:
1633 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1634 }
1635 }
1636 else if (type.width*type.length == 256 && util_cpu_caps.has_avx2) {
1637 switch(type.width) {
1638 case 8:
1639 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a);
1640 case 16:
1641 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.w", vec_type, a);
1642 case 32:
1643 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.d", vec_type, a);
1644 }
1645 }
1646 else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
1647 (gallivm_debug & GALLIVM_DEBUG_PERF) &&
1648 (type.width == 8 || type.width == 16 || type.width == 32)) {
1649 debug_printf("%s: inefficient code, should split vectors manually\n",
1650 __FUNCTION__);
1651 }
1652
1653 return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
1654 }
1655
1656
1657 LLVMValueRef
1658 lp_build_negate(struct lp_build_context *bld,
1659 LLVMValueRef a)
1660 {
1661 LLVMBuilderRef builder = bld->gallivm->builder;
1662
1663 assert(lp_check_value(bld->type, a));
1664
1665 if (bld->type.floating)
1666 a = LLVMBuildFNeg(builder, a, "");
1667 else
1668 a = LLVMBuildNeg(builder, a, "");
1669
1670 return a;
1671 }
1672
1673
1674 /** Return -1, 0 or +1 depending on the sign of a */
1675 LLVMValueRef
1676 lp_build_sgn(struct lp_build_context *bld,
1677 LLVMValueRef a)
1678 {
1679 LLVMBuilderRef builder = bld->gallivm->builder;
1680 const struct lp_type type = bld->type;
1681 LLVMValueRef cond;
1682 LLVMValueRef res;
1683
1684 assert(lp_check_value(type, a));
1685
1686 /* Handle non-zero case */
1687 if(!type.sign) {
1688 /* if not zero then sign must be positive */
1689 res = bld->one;
1690 }
1691 else if(type.floating) {
1692 LLVMTypeRef vec_type;
1693 LLVMTypeRef int_type;
1694 LLVMValueRef mask;
1695 LLVMValueRef sign;
1696 LLVMValueRef one;
1697 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1698
1699 int_type = lp_build_int_vec_type(bld->gallivm, type);
1700 vec_type = lp_build_vec_type(bld->gallivm, type);
1701 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1702
1703 /* Take the sign bit and OR it into the constant 1.0 */
1704 sign = LLVMBuildBitCast(builder, a, int_type, "");
1705 sign = LLVMBuildAnd(builder, sign, mask, "");
1706 one = LLVMConstBitCast(bld->one, int_type);
1707 res = LLVMBuildOr(builder, sign, one, "");
1708 res = LLVMBuildBitCast(builder, res, vec_type, "");
1709 }
1710 else
1711 {
1712 /* signed int/norm/fixed point */
1713 /* could use psign with sse3 and appropriate vectors here */
1714 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1715 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1716 res = lp_build_select(bld, cond, bld->one, minus_one);
1717 }
1718
1719 /* Handle zero */
1720 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1721 res = lp_build_select(bld, cond, bld->zero, res);
1722
1723 return res;
1724 }
1725
1726
1727 /**
1728 * Set the sign of float vector 'a' according to 'sign'.
1729 * If sign==0, return abs(a).
1730 * If sign==1, return -abs(a);
1731 * Other values for sign produce undefined results.
1732 */
1733 LLVMValueRef
1734 lp_build_set_sign(struct lp_build_context *bld,
1735 LLVMValueRef a, LLVMValueRef sign)
1736 {
1737 LLVMBuilderRef builder = bld->gallivm->builder;
1738 const struct lp_type type = bld->type;
1739 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1740 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1741 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1742 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1743 ~((unsigned long long) 1 << (type.width - 1)));
1744 LLVMValueRef val, res;
1745
1746 assert(type.floating);
1747 assert(lp_check_value(type, a));
1748
1749 /* val = reinterpret_cast<int>(a) */
1750 val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1751 /* val = val & mask */
1752 val = LLVMBuildAnd(builder, val, mask, "");
1753 /* sign = sign << shift */
1754 sign = LLVMBuildShl(builder, sign, shift, "");
1755 /* res = val | sign */
1756 res = LLVMBuildOr(builder, val, sign, "");
1757 /* res = reinterpret_cast<float>(res) */
1758 res = LLVMBuildBitCast(builder, res, vec_type, "");
1759
1760 return res;
1761 }
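
/*
 * Illustration only (hypothetical helper, not part of gallivm): scalar
 * equivalent of the bit manipulation above for 32-bit floats, with 'sign'
 * being 0 or 1.
 */
static inline float
lp_set_sign_f32_sketch(float a, unsigned sign)
{
   union { float f; uint32_t u; } v;
   v.f = a;
   v.u = (v.u & 0x7fffffffu) | ((uint32_t)sign << 31);   /* clear, then set, the sign bit */
   return v.f;
}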
1762
1763
1764 /**
1765 * Convert vector of (or scalar) int to vector of (or scalar) float.
1766 */
1767 LLVMValueRef
1768 lp_build_int_to_float(struct lp_build_context *bld,
1769 LLVMValueRef a)
1770 {
1771 LLVMBuilderRef builder = bld->gallivm->builder;
1772 const struct lp_type type = bld->type;
1773 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1774
1775 assert(type.floating);
1776
1777 return LLVMBuildSIToFP(builder, a, vec_type, "");
1778 }
1779
1780 static boolean
1781 arch_rounding_available(const struct lp_type type)
1782 {
1783 if ((util_cpu_caps.has_sse4_1 &&
1784 (type.length == 1 || type.width*type.length == 128)) ||
1785 (util_cpu_caps.has_avx && type.width*type.length == 256))
1786 return TRUE;
1787 else if ((util_cpu_caps.has_altivec &&
1788 (type.width == 32 && type.length == 4)))
1789 return TRUE;
1790
1791 return FALSE;
1792 }
1793
1794 enum lp_build_round_mode
1795 {
1796 LP_BUILD_ROUND_NEAREST = 0,
1797 LP_BUILD_ROUND_FLOOR = 1,
1798 LP_BUILD_ROUND_CEIL = 2,
1799 LP_BUILD_ROUND_TRUNCATE = 3
1800 };
1801
1802 static inline LLVMValueRef
1803 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1804 LLVMValueRef a)
1805 {
1806 LLVMBuilderRef builder = bld->gallivm->builder;
1807 const struct lp_type type = bld->type;
1808 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1809 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1810 const char *intrinsic;
1811 LLVMValueRef res;
1812
1813 assert(type.floating);
1814 /* using the double precision conversions is a bit more complicated */
1815 assert(type.width == 32);
1816
1817 assert(lp_check_value(type, a));
1818 assert(util_cpu_caps.has_sse2);
1819
1820 /* This is relying on MXCSR rounding mode, which should always be nearest. */
1821 if (type.length == 1) {
1822 LLVMTypeRef vec_type;
1823 LLVMValueRef undef;
1824 LLVMValueRef arg;
1825 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1826
1827 vec_type = LLVMVectorType(bld->elem_type, 4);
1828
1829 intrinsic = "llvm.x86.sse.cvtss2si";
1830
1831 undef = LLVMGetUndef(vec_type);
1832
1833 arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
1834
1835 res = lp_build_intrinsic_unary(builder, intrinsic,
1836 ret_type, arg);
1837 }
1838 else {
1839 if (type.width* type.length == 128) {
1840 intrinsic = "llvm.x86.sse2.cvtps2dq";
1841 }
1842 else {
1843 assert(type.width*type.length == 256);
1844 assert(util_cpu_caps.has_avx);
1845
1846 intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
1847 }
1848 res = lp_build_intrinsic_unary(builder, intrinsic,
1849 ret_type, a);
1850 }
1851
1852 return res;
1853 }
1854
1855
1856 /* Round to an integral float value using the AltiVec vrfi* instructions.
1857 */
1858 static inline LLVMValueRef
1859 lp_build_round_altivec(struct lp_build_context *bld,
1860 LLVMValueRef a,
1861 enum lp_build_round_mode mode)
1862 {
1863 LLVMBuilderRef builder = bld->gallivm->builder;
1864 const struct lp_type type = bld->type;
1865 const char *intrinsic = NULL;
1866
1867 assert(type.floating);
1868
1869 assert(lp_check_value(type, a));
1870 assert(util_cpu_caps.has_altivec);
1871
1872 (void)type;
1873
1874 switch (mode) {
1875 case LP_BUILD_ROUND_NEAREST:
1876 intrinsic = "llvm.ppc.altivec.vrfin";
1877 break;
1878 case LP_BUILD_ROUND_FLOOR:
1879 intrinsic = "llvm.ppc.altivec.vrfim";
1880 break;
1881 case LP_BUILD_ROUND_CEIL:
1882 intrinsic = "llvm.ppc.altivec.vrfip";
1883 break;
1884 case LP_BUILD_ROUND_TRUNCATE:
1885 intrinsic = "llvm.ppc.altivec.vrfiz";
1886 break;
1887 }
1888
1889 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1890 }
1891
1892 static inline LLVMValueRef
1893 lp_build_round_arch(struct lp_build_context *bld,
1894 LLVMValueRef a,
1895 enum lp_build_round_mode mode)
1896 {
1897 if (util_cpu_caps.has_sse4_1) {
1898 LLVMBuilderRef builder = bld->gallivm->builder;
1899 const struct lp_type type = bld->type;
1900 const char *intrinsic_root;
1901 char intrinsic[32];
1902
1903 assert(type.floating);
1904 assert(lp_check_value(type, a));
1905 (void)type;
1906
1907 switch (mode) {
1908 case LP_BUILD_ROUND_NEAREST:
1909 intrinsic_root = "llvm.nearbyint";
1910 break;
1911 case LP_BUILD_ROUND_FLOOR:
1912 intrinsic_root = "llvm.floor";
1913 break;
1914 case LP_BUILD_ROUND_CEIL:
1915 intrinsic_root = "llvm.ceil";
1916 break;
1917 case LP_BUILD_ROUND_TRUNCATE:
1918 intrinsic_root = "llvm.trunc";
1919 break;
1920 }
1921
1922 lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type);
1923 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1924 }
1925 else /* (util_cpu_caps.has_altivec) */
1926 return lp_build_round_altivec(bld, a, mode);
1927 }
1928
1929 /**
1930 * Return the integer part of a float (vector) value (== round toward zero).
1931 * The returned value is a float (vector).
1932 * Ex: trunc(-1.5) = -1.0
1933 */
1934 LLVMValueRef
1935 lp_build_trunc(struct lp_build_context *bld,
1936 LLVMValueRef a)
1937 {
1938 LLVMBuilderRef builder = bld->gallivm->builder;
1939 const struct lp_type type = bld->type;
1940
1941 assert(type.floating);
1942 assert(lp_check_value(type, a));
1943
1944 if (arch_rounding_available(type)) {
1945 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
1946 }
1947 else {
1948 const struct lp_type type = bld->type;
1949 struct lp_type inttype;
1950 struct lp_build_context intbld;
1951 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
1952 LLVMValueRef trunc, res, anosign, mask;
1953 LLVMTypeRef int_vec_type = bld->int_vec_type;
1954 LLVMTypeRef vec_type = bld->vec_type;
1955
1956 assert(type.width == 32); /* might want to handle doubles at some point */
1957
1958 inttype = type;
1959 inttype.floating = 0;
1960 lp_build_context_init(&intbld, bld->gallivm, inttype);
1961
1962 /* round by truncation */
1963 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1964 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
1965
1966 /* mask out sign bit */
1967 anosign = lp_build_abs(bld, a);
1968 /*
1969 * mask out all values if anosign > 2^24
1970 * This should work both for large ints (all rounding is no-op for them
1971 * because such floats are always exact) as well as special cases like
1972 * NaNs, Infs (taking advantage of the fact they use max exponent).
1973        * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
1974 */
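      /*
       * Worked example: 2^24 = 16777216.0f and the next representable float is
       * 16777218.0f, so any float above the threshold is already an integer and
       * truncation would be a no-op; selecting 'a' itself also keeps NaN/Inf and
       * huge values intact, where the FPToSI result above is not meaningful.
       */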
1975 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1976 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1977 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1978 return lp_build_select(bld, mask, a, res);
1979 }
1980 }
1981
1982
1983 /**
1984 * Return float (vector) rounded to nearest integer (vector). The returned
1985 * value is a float (vector).
1986 * Ex: round(0.9) = 1.0
1987 * Ex: round(-1.5) = -2.0
1988 */
1989 LLVMValueRef
1990 lp_build_round(struct lp_build_context *bld,
1991 LLVMValueRef a)
1992 {
1993 LLVMBuilderRef builder = bld->gallivm->builder;
1994 const struct lp_type type = bld->type;
1995
1996 assert(type.floating);
1997 assert(lp_check_value(type, a));
1998
1999 if (arch_rounding_available(type)) {
2000 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2001 }
2002 else {
2003 const struct lp_type type = bld->type;
2004 struct lp_type inttype;
2005 struct lp_build_context intbld;
2006 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2007 LLVMValueRef res, anosign, mask;
2008 LLVMTypeRef int_vec_type = bld->int_vec_type;
2009 LLVMTypeRef vec_type = bld->vec_type;
2010
2011 assert(type.width == 32); /* might want to handle doubles at some point */
2012
2013 inttype = type;
2014 inttype.floating = 0;
2015 lp_build_context_init(&intbld, bld->gallivm, inttype);
2016
2017 res = lp_build_iround(bld, a);
2018 res = LLVMBuildSIToFP(builder, res, vec_type, "");
2019
2020 /* mask out sign bit */
2021 anosign = lp_build_abs(bld, a);
2022 /*
2023 * mask out all values if anosign > 2^24
2024 * This should work both for large ints (all rounding is no-op for them
2025 * because such floats are always exact) as well as special cases like
2026 * NaNs, Infs (taking advantage of the fact they use max exponent).
2027        * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2028 */
2029 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2030 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2031 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2032 return lp_build_select(bld, mask, a, res);
2033 }
2034 }
2035
2036
2037 /**
2038 * Return floor of float (vector), result is a float (vector)
2039 * Ex: floor(1.1) = 1.0
2040 * Ex: floor(-1.1) = -2.0
2041 */
2042 LLVMValueRef
2043 lp_build_floor(struct lp_build_context *bld,
2044 LLVMValueRef a)
2045 {
2046 LLVMBuilderRef builder = bld->gallivm->builder;
2047 const struct lp_type type = bld->type;
2048
2049 assert(type.floating);
2050 assert(lp_check_value(type, a));
2051
2052 if (arch_rounding_available(type)) {
2053 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2054 }
2055 else {
2056 const struct lp_type type = bld->type;
2057 struct lp_type inttype;
2058 struct lp_build_context intbld;
2059 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2060 LLVMValueRef trunc, res, anosign, mask;
2061 LLVMTypeRef int_vec_type = bld->int_vec_type;
2062 LLVMTypeRef vec_type = bld->vec_type;
2063
2064 if (type.width != 32) {
2065 char intrinsic[32];
2066 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type);
2067 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2068 }
2069
2070 assert(type.width == 32); /* might want to handle doubles at some point */
2071
2072 inttype = type;
2073 inttype.floating = 0;
2074 lp_build_context_init(&intbld, bld->gallivm, inttype);
2075
2076 /* round by truncation */
2077 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2078 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2079
2080 if (type.sign) {
2081 LLVMValueRef tmp;
2082
2083 /*
2084 * fix values if rounding is wrong (for non-special cases)
2085 * - this is the case if trunc > a
2086 */
2087 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
2088 /* tmp = trunc > a ? 1.0 : 0.0 */
2089 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2090 tmp = lp_build_and(&intbld, mask, tmp);
2091 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2092 res = lp_build_sub(bld, res, tmp);
2093 }
2094
2095 /* mask out sign bit */
2096 anosign = lp_build_abs(bld, a);
2097 /*
2098 * mask out all values if anosign > 2^24
2099 * This should work both for large ints (all rounding is no-op for them
2100 * because such floats are always exact) as well as special cases like
2101 * NaNs, Infs (taking advantage of the fact they use max exponent).
2102        * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2103 */
2104 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2105 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2106 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2107 return lp_build_select(bld, mask, a, res);
2108 }
2109 }
2110
2111
2112 /**
2113 * Return ceiling of float (vector), returning float (vector).
2114 * Ex: ceil( 1.1) = 2.0
2115 * Ex: ceil(-1.1) = -1.0
2116 */
2117 LLVMValueRef
2118 lp_build_ceil(struct lp_build_context *bld,
2119 LLVMValueRef a)
2120 {
2121 LLVMBuilderRef builder = bld->gallivm->builder;
2122 const struct lp_type type = bld->type;
2123
2124 assert(type.floating);
2125 assert(lp_check_value(type, a));
2126
2127 if (arch_rounding_available(type)) {
2128 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2129 }
2130 else {
2131 const struct lp_type type = bld->type;
2132 struct lp_type inttype;
2133 struct lp_build_context intbld;
2134 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2135 LLVMValueRef trunc, res, anosign, mask, tmp;
2136 LLVMTypeRef int_vec_type = bld->int_vec_type;
2137 LLVMTypeRef vec_type = bld->vec_type;
2138
2139 if (type.width != 32) {
2140 char intrinsic[32];
2141 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type);
2142 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2143 }
2144
2145 assert(type.width == 32); /* might want to handle doubles at some point */
2146
2147 inttype = type;
2148 inttype.floating = 0;
2149 lp_build_context_init(&intbld, bld->gallivm, inttype);
2150
2151 /* round by truncation */
2152 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2153 trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2154
2155 /*
2156 * fix values if rounding is wrong (for non-special cases)
2157 * - this is the case if trunc < a
2158 */
2159 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2160 /* tmp = trunc < a ? 1.0 : 0.0 */
2161 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2162 tmp = lp_build_and(&intbld, mask, tmp);
2163 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2164 res = lp_build_add(bld, trunc, tmp);
2165
2166 /* mask out sign bit */
2167 anosign = lp_build_abs(bld, a);
2168 /*
2169 * mask out all values if anosign > 2^24
2170 * This should work both for large ints (all rounding is no-op for them
2171 * because such floats are always exact) as well as special cases like
2172 * NaNs, Infs (taking advantage of the fact they use max exponent).
2173        * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2174 */
2175 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2176 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2177 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2178 return lp_build_select(bld, mask, a, res);
2179 }
2180 }
2181
2182
2183 /**
2184 * Return fractional part of 'a' computed as a - floor(a)
2185 * Typically used in texture coord arithmetic.
2186 */
2187 LLVMValueRef
2188 lp_build_fract(struct lp_build_context *bld,
2189 LLVMValueRef a)
2190 {
2191 assert(bld->type.floating);
2192 return lp_build_sub(bld, a, lp_build_floor(bld, a));
2193 }
2194
2195
2196 /**
2197 * Prevent returning 1.0 for very small negative values of 'a' by clamping
2198 * against 0.99999(9). (Will also return that value for NaNs.)
2199 */
2200 static inline LLVMValueRef
2201 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2202 {
2203 LLVMValueRef max;
2204
2205 /* this is the largest number smaller than 1.0 representable as float */
2206 max = lp_build_const_vec(bld->gallivm, bld->type,
2207 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
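   /*
    * E.g. for 32-bit floats lp_mantissa() is 23, so max = 1.0 - 2^-24 =
    * 0.99999994..., the largest float strictly below 1.0.
    */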
2208 return lp_build_min_ext(bld, fract, max,
2209 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
2210 }
2211
2212
2213 /**
2214 * Same as lp_build_fract, but guarantees that the result is always smaller
2215 * than one. Will also return the smaller-than-one value for infs, NaNs.
2216 */
2217 LLVMValueRef
2218 lp_build_fract_safe(struct lp_build_context *bld,
2219 LLVMValueRef a)
2220 {
2221 return clamp_fract(bld, lp_build_fract(bld, a));
2222 }
2223
2224
2225 /**
2226 * Return the integer part of a float (vector) value (== round toward zero).
2227 * The returned value is an integer (vector).
2228 * Ex: itrunc(-1.5) = -1
2229 */
2230 LLVMValueRef
2231 lp_build_itrunc(struct lp_build_context *bld,
2232 LLVMValueRef a)
2233 {
2234 LLVMBuilderRef builder = bld->gallivm->builder;
2235 const struct lp_type type = bld->type;
2236 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2237
2238 assert(type.floating);
2239 assert(lp_check_value(type, a));
2240
2241 return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2242 }
2243
2244
2245 /**
2246 * Return float (vector) rounded to nearest integer (vector). The returned
2247 * value is an integer (vector).
2248 * Ex: iround(0.9) = 1
2249 * Ex: iround(-1.5) = -2
2250 */
2251 LLVMValueRef
2252 lp_build_iround(struct lp_build_context *bld,
2253 LLVMValueRef a)
2254 {
2255 LLVMBuilderRef builder = bld->gallivm->builder;
2256 const struct lp_type type = bld->type;
2257 LLVMTypeRef int_vec_type = bld->int_vec_type;
2258 LLVMValueRef res;
2259
2260 assert(type.floating);
2261
2262 assert(lp_check_value(type, a));
2263
2264 if ((util_cpu_caps.has_sse2 &&
2265 ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2266 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2267 return lp_build_iround_nearest_sse2(bld, a);
2268 }
2269 if (arch_rounding_available(type)) {
2270 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2271 }
2272 else {
2273 LLVMValueRef half;
2274
2275 half = lp_build_const_vec(bld->gallivm, type, 0.5);
2276
2277 if (type.sign) {
2278 LLVMTypeRef vec_type = bld->vec_type;
2279 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2280 (unsigned long long)1 << (type.width - 1));
2281 LLVMValueRef sign;
2282
2283 /* get sign bit */
2284 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2285 sign = LLVMBuildAnd(builder, sign, mask, "");
2286
2287 /* sign * 0.5 */
2288 half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2289 half = LLVMBuildOr(builder, sign, half, "");
2290 half = LLVMBuildBitCast(builder, half, vec_type, "");
2291 }
2292
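      /*
       * Add +/-0.5 matching the sign of 'a' and truncate, i.e. round half away
       * from zero.  Worked example: a = 2.3 -> 2.8 -> 2; a = -1.5 -> -2.0 -> -2.
       */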
2293 res = LLVMBuildFAdd(builder, a, half, "");
2294 }
2295
2296 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2297
2298 return res;
2299 }
2300
2301
2302 /**
2303 * Return floor of float (vector), result is an int (vector)
2304  * Ex: ifloor(1.1) = 1
2305  * Ex: ifloor(-1.1) = -2
2306 */
2307 LLVMValueRef
2308 lp_build_ifloor(struct lp_build_context *bld,
2309 LLVMValueRef a)
2310 {
2311 LLVMBuilderRef builder = bld->gallivm->builder;
2312 const struct lp_type type = bld->type;
2313 LLVMTypeRef int_vec_type = bld->int_vec_type;
2314 LLVMValueRef res;
2315
2316 assert(type.floating);
2317 assert(lp_check_value(type, a));
2318
2319 res = a;
2320 if (type.sign) {
2321 if (arch_rounding_available(type)) {
2322 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2323 }
2324 else {
2325 struct lp_type inttype;
2326 struct lp_build_context intbld;
2327 LLVMValueRef trunc, itrunc, mask;
2328
2329 assert(type.floating);
2330 assert(lp_check_value(type, a));
2331
2332 inttype = type;
2333 inttype.floating = 0;
2334 lp_build_context_init(&intbld, bld->gallivm, inttype);
2335
2336 /* round by truncation */
2337 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2338 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2339
2340 /*
2341 * fix values if rounding is wrong (for non-special cases)
2342 * - this is the case if trunc > a
2343 * The results of doing this with NaNs, very large values etc.
2344 * are undefined but this seems to be the case anyway.
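          * Worked example: a = -1.1 -> itrunc = -1, trunc = -1.0 > a, so the
          * all-ones mask (-1) gets added below, giving -2 == ifloor(-1.1).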
2345 */
2346 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2347 /* cheapie minus one with mask since the mask is minus one / zero */
2348 return lp_build_add(&intbld, itrunc, mask);
2349 }
2350 }
2351
2352    /* round towards zero (truncate) */
2353 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2354
2355 return res;
2356 }
2357
2358
2359 /**
2360 * Return ceiling of float (vector), returning int (vector).
2361 * Ex: iceil( 1.1) = 2
2362 * Ex: iceil(-1.1) = -1
2363 */
2364 LLVMValueRef
2365 lp_build_iceil(struct lp_build_context *bld,
2366 LLVMValueRef a)
2367 {
2368 LLVMBuilderRef builder = bld->gallivm->builder;
2369 const struct lp_type type = bld->type;
2370 LLVMTypeRef int_vec_type = bld->int_vec_type;
2371 LLVMValueRef res;
2372
2373 assert(type.floating);
2374 assert(lp_check_value(type, a));
2375
2376 if (arch_rounding_available(type)) {
2377 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2378 }
2379 else {
2380 struct lp_type inttype;
2381 struct lp_build_context intbld;
2382 LLVMValueRef trunc, itrunc, mask;
2383
2384 assert(type.floating);
2385 assert(lp_check_value(type, a));
2386
2387 inttype = type;
2388 inttype.floating = 0;
2389 lp_build_context_init(&intbld, bld->gallivm, inttype);
2390
2391 /* round by truncation */
2392 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2393 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2394
2395 /*
2396 * fix values if rounding is wrong (for non-special cases)
2397 * - this is the case if trunc < a
2398 * The results of doing this with NaNs, very large values etc.
2399 * are undefined but this seems to be the case anyway.
2400 */
2401 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2402 /* cheapie plus one with mask since the mask is minus one / zero */
2403 return lp_build_sub(&intbld, itrunc, mask);
2404 }
2405
2406    /* round towards zero (truncate) */
2407 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2408
2409 return res;
2410 }
2411
2412
2413 /**
2414 * Combined ifloor() & fract().
2415 *
2416 * Preferred to calling the functions separately, as it will ensure that the
2417 * strategy (floor() vs ifloor()) that results in less redundant work is used.
2418 */
2419 void
2420 lp_build_ifloor_fract(struct lp_build_context *bld,
2421 LLVMValueRef a,
2422 LLVMValueRef *out_ipart,
2423 LLVMValueRef *out_fpart)
2424 {
2425 LLVMBuilderRef builder = bld->gallivm->builder;
2426 const struct lp_type type = bld->type;
2427 LLVMValueRef ipart;
2428
2429 assert(type.floating);
2430 assert(lp_check_value(type, a));
2431
2432 if (arch_rounding_available(type)) {
2433 /*
2434 * floor() is easier.
2435 */
2436
2437 ipart = lp_build_floor(bld, a);
2438 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2439 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2440 }
2441 else {
2442 /*
2443 * ifloor() is easier.
2444 */
2445
2446 *out_ipart = lp_build_ifloor(bld, a);
2447 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2448 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2449 }
2450 }
2451
2452
2453 /**
2454 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2455 * always smaller than one.
2456 */
2457 void
2458 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2459 LLVMValueRef a,
2460 LLVMValueRef *out_ipart,
2461 LLVMValueRef *out_fpart)
2462 {
2463 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2464 *out_fpart = clamp_fract(bld, *out_fpart);
2465 }
2466
2467
2468 LLVMValueRef
2469 lp_build_sqrt(struct lp_build_context *bld,
2470 LLVMValueRef a)
2471 {
2472 LLVMBuilderRef builder = bld->gallivm->builder;
2473 const struct lp_type type = bld->type;
2474 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2475 char intrinsic[32];
2476
2477 assert(lp_check_value(type, a));
2478
2479 assert(type.floating);
2480 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);
2481
2482 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2483 }
2484
2485
2486 /**
2487  * Do one Newton-Raphson step to improve reciprocal precision:
2488 *
2489 * x_{i+1} = x_i * (2 - a * x_i)
2490 *
2491  * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2492  * +/-Inf, giving NaN instead.  Certain applications rely on the conformant
2493  * behavior, such as Google Earth, which does RCP(RSQRT(0.0)) when drawing
2494  * the Earth's halo.  It would be necessary to clamp the argument to prevent this.
2495 *
2496 * See also:
2497 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2498 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
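 *
 * Worked example (illustrative numbers only): with a = 4.0 and a seed
 * x_0 = 0.2, one step gives x_1 = 0.2 * (2 - 4*0.2) = 0.24 and a second
 * gives x_2 = 0.24 * (2 - 4*0.24) = 0.2496, converging quadratically
 * towards 1/4 = 0.25.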
2499 */
2500 static inline LLVMValueRef
2501 lp_build_rcp_refine(struct lp_build_context *bld,
2502 LLVMValueRef a,
2503 LLVMValueRef rcp_a)
2504 {
2505 LLVMBuilderRef builder = bld->gallivm->builder;
2506 LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
2507 LLVMValueRef res;
2508
2509 res = LLVMBuildFMul(builder, a, rcp_a, "");
2510 res = LLVMBuildFSub(builder, two, res, "");
2511 res = LLVMBuildFMul(builder, rcp_a, res, "");
2512
2513 return res;
2514 }
2515
2516
2517 LLVMValueRef
2518 lp_build_rcp(struct lp_build_context *bld,
2519 LLVMValueRef a)
2520 {
2521 LLVMBuilderRef builder = bld->gallivm->builder;
2522 const struct lp_type type = bld->type;
2523
2524 assert(lp_check_value(type, a));
2525
2526 if(a == bld->zero)
2527 return bld->undef;
2528 if(a == bld->one)
2529 return bld->one;
2530 if(a == bld->undef)
2531 return bld->undef;
2532
2533 assert(type.floating);
2534
2535 if(LLVMIsConstant(a))
2536 return LLVMConstFDiv(bld->one, a);
2537
2538 /*
2539 * We don't use RCPPS because:
2540     * - it only has 10 bits of precision
2541     * - it doesn't even get the reciprocal of 1.0 exactly
2542     * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2543     * - for recent processors the benefit over DIVPS is marginal and case
2544     *   dependent
2545 *
2546 * We could still use it on certain processors if benchmarks show that the
2547     * RCPPS plus the necessary workarounds is still preferable to DIVPS; or for
2548     * particular uses that require fewer workarounds.
2549 */
2550
2551 if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2552 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2553 const unsigned num_iterations = 0;
2554 LLVMValueRef res;
2555 unsigned i;
2556 const char *intrinsic = NULL;
2557
2558 if (type.length == 4) {
2559 intrinsic = "llvm.x86.sse.rcp.ps";
2560 }
2561 else {
2562 intrinsic = "llvm.x86.avx.rcp.ps.256";
2563 }
2564
2565 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2566
2567 for (i = 0; i < num_iterations; ++i) {
2568 res = lp_build_rcp_refine(bld, a, res);
2569 }
2570
2571 return res;
2572 }
2573
2574 return LLVMBuildFDiv(builder, bld->one, a, "");
2575 }
2576
2577
2578 /**
2579 * Do one Newton-Raphson step to improve rsqrt precision:
2580 *
2581 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2582 *
2583 * See also Intel 64 and IA-32 Architectures Optimization Manual.
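 *
 * Worked example (illustrative numbers only): with a = 4.0 and a seed
 * x_0 = 0.4, one step gives x_1 = 0.5 * 0.4 * (3 - 4*0.16) = 0.472 and a
 * second gives x_2 ~= 0.4977, converging towards 1/sqrt(4) = 0.5.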
2584 */
2585 static inline LLVMValueRef
2586 lp_build_rsqrt_refine(struct lp_build_context *bld,
2587 LLVMValueRef a,
2588 LLVMValueRef rsqrt_a)
2589 {
2590 LLVMBuilderRef builder = bld->gallivm->builder;
2591 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2592 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2593 LLVMValueRef res;
2594
2595 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2596 res = LLVMBuildFMul(builder, a, res, "");
2597 res = LLVMBuildFSub(builder, three, res, "");
2598 res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2599 res = LLVMBuildFMul(builder, half, res, "");
2600
2601 return res;
2602 }
2603
2604
2605 /**
2606 * Generate 1/sqrt(a).
2607 * Result is undefined for values < 0, infinity for +0.
2608 */
2609 LLVMValueRef
2610 lp_build_rsqrt(struct lp_build_context *bld,
2611 LLVMValueRef a)
2612 {
2613 const struct lp_type type = bld->type;
2614
2615 assert(lp_check_value(type, a));
2616
2617 assert(type.floating);
2618
2619 /*
2620 * This should be faster but all denormals will end up as infinity.
2621 */
2622 if (0 && lp_build_fast_rsqrt_available(type)) {
2623 const unsigned num_iterations = 1;
2624 LLVMValueRef res;
2625 unsigned i;
2626
2627 /* rsqrt(1.0) != 1.0 here */
2628 res = lp_build_fast_rsqrt(bld, a);
2629
2630 if (num_iterations) {
2631 /*
2632 * Newton-Raphson will result in NaN instead of infinity for zero,
2633 * and NaN instead of zero for infinity.
2634 * Also, need to ensure rsqrt(1.0) == 1.0.
2635 * All numbers smaller than FLT_MIN will result in +infinity
2636 * (rsqrtps treats all denormals as zero).
2637 */
2638 LLVMValueRef cmp;
2639 LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2640 LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);
2641
2642 for (i = 0; i < num_iterations; ++i) {
2643 res = lp_build_rsqrt_refine(bld, a, res);
2644 }
2645 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2646 res = lp_build_select(bld, cmp, inf, res);
2647 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2648 res = lp_build_select(bld, cmp, bld->zero, res);
2649 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2650 res = lp_build_select(bld, cmp, bld->one, res);
2651 }
2652
2653 return res;
2654 }
2655
2656 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2657 }
2658
2659 /**
2660  * Returns whether a fast (but inaccurate) rsqrt instruction is available.
2661  * Callers may want to avoid lp_build_fast_rsqrt() when it isn't: e.g.
2662  * x^0.5 can be computed as rsqrt_fast(x) * x, but if no fast rsqrt exists
2663  * that falls back to sqrt/div/mul, so it is obviously better to just call
2664  * sqrt directly, skipping both the div and the mul.
2665 */
2666 boolean
2667 lp_build_fast_rsqrt_available(struct lp_type type)
2668 {
2669 assert(type.floating);
2670
2671 if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2672 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2673 return true;
2674 }
2675 return false;
2676 }
2677
2678
2679 /**
2680 * Generate 1/sqrt(a).
2681 * Result is undefined for values < 0, infinity for +0.
2682 * Precision is limited, only ~10 bits guaranteed
2683 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2684 */
2685 LLVMValueRef
2686 lp_build_fast_rsqrt(struct lp_build_context *bld,
2687 LLVMValueRef a)
2688 {
2689 LLVMBuilderRef builder = bld->gallivm->builder;
2690 const struct lp_type type = bld->type;
2691
2692 assert(lp_check_value(type, a));
2693
2694 if (lp_build_fast_rsqrt_available(type)) {
2695 const char *intrinsic = NULL;
2696
2697 if (type.length == 4) {
2698 intrinsic = "llvm.x86.sse.rsqrt.ps";
2699 }
2700 else {
2701 intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2702 }
2703 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2704 }
2705 else {
2706 debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2707 }
2708 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2709 }
2710
2711
2712 /**
2713 * Generate sin(a) or cos(a) using polynomial approximation.
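 *
 * The argument is range-reduced to roughly [-Pi/4, Pi/4] by subtracting
 * multiples of Pi/2 with extended-precision constants, one of two minimax
 * polynomials is evaluated on the reduced value, and the sign is patched up
 * from the quadrant bits; see the step-by-step comments in the body.
 *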
2714  * TODO: it might be worth recognizing sin and cos with the same source
2715  * (i.e. the d3d10 sincos opcode); doing both at the same time would be way
2716  * cheaper than calculating (nearly) everything twice.  Not sure it's common
2717  * enough to be worth bothering, however; the scs opcode could also benefit
2718  * from calculating both.
2719 */
2720 static LLVMValueRef
2721 lp_build_sin_or_cos(struct lp_build_context *bld,
2722 LLVMValueRef a,
2723 boolean cos)
2724 {
2725 struct gallivm_state *gallivm = bld->gallivm;
2726 LLVMBuilderRef b = gallivm->builder;
2727 struct lp_type int_type = lp_int_type(bld->type);
2728
2729 /*
2730 * take the absolute value,
2731 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2732 */
2733
2734 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2735 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2736
2737 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2738 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2739
2740 /*
2741 * scale by 4/Pi
2742 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2743 */
2744
2745 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2746 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2747
2748 /*
2749 * store the integer part of y in mm0
2750 * emm2 = _mm_cvttps_epi32(y);
2751 */
2752
2753 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2754
2755 /*
2756 * j=(j+1) & (~1) (see the cephes sources)
2757 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2758 */
2759
2760 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2761 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2762 /*
2763 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2764 */
2765 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2766 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2767
2768 /*
2769 * y = _mm_cvtepi32_ps(emm2);
2770 */
2771 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2772
2773 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2774 LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2775 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2776 LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2777
2778 /*
2779 * Argument used for poly selection and sign bit determination
2780 * is different for sin vs. cos.
2781 */
2782 LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
2783 emm2_and;
2784
2785 LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
2786 LLVMBuildNot(b, emm2_2, ""), ""),
2787 const_29, "sign_bit") :
2788 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
2789 LLVMBuildShl(b, emm2_add,
2790 const_29, ""), ""),
2791 sign_mask, "sign_bit");
2792
2793 /*
2794     * get the polynomial selection mask
2795     * there is one polynomial for 0 <= x <= Pi/4
2796     * and another one for Pi/4 < x <= Pi/2
2797 * Both branches will be computed.
2798 *
2799 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2800 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2801 */
2802
2803 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
2804 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2805 int_type, PIPE_FUNC_EQUAL,
2806 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2807
2808 /*
2809 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2810 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2811 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2812 */
2813 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2814 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2815 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2816
2817 /*
2818 * The magic pass: "Extended precision modular arithmetic"
2819 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2820 */
2821 LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
2822 LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
2823 LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);
2824
2825 /*
2826     * Evaluate the first polynomial (0 <= x <= Pi/4)
2827 *
2828 * z = _mm_mul_ps(x,x);
2829 */
2830 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2831
2832 /*
2833 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2834 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2835 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2836 */
2837 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2838 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2839 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2840
2841 /*
2842 * y = *(v4sf*)_ps_coscof_p0;
2843 * y = _mm_mul_ps(y, z);
2844 */
2845 LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
2846 LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
2847 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2848 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2849
2850
2851 /*
2852 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2853 * y = _mm_sub_ps(y, tmp);
2854 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2855 */
2856 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2857 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2858    LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_9");
2859 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2860    LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_10");
2861
2862 /*
2863 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2864 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2865 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2866 */
2867 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2868 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2869 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2870
2871 /*
2872     * Evaluate the second polynomial (Pi/4 < x <= Pi/2)
2873 *
2874 * y2 = *(v4sf*)_ps_sincof_p0;
2875 * y2 = _mm_mul_ps(y2, z);
2876 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2877 * y2 = _mm_mul_ps(y2, z);
2878 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2879 * y2 = _mm_mul_ps(y2, z);
2880 * y2 = _mm_mul_ps(y2, x);
2881 * y2 = _mm_add_ps(y2, x);
2882 */
2883
2884 LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
2885 LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
2886 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2887 LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);
2888
2889 /*
2890     * select the correct result from the two polynomials
2891 * xmm3 = poly_mask;
2892 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2893 * y = _mm_andnot_ps(xmm3, y);
2894 * y = _mm_or_ps(y,y2);
2895 */
2896 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2897 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2898 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2899 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
2900 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2901 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
2902
2903 /*
2904 * update the sign
2905 * y = _mm_xor_ps(y, sign_bit);
2906 */
2907 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
2908 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
2909
2910 LLVMValueRef isfinite = lp_build_isfinite(bld, a);
2911
2912 /* clamp output to be within [-1, 1] */
2913 y_result = lp_build_clamp(bld, y_result,
2914 lp_build_const_vec(bld->gallivm, bld->type, -1.f),
2915 lp_build_const_vec(bld->gallivm, bld->type, 1.f));
2916 /* If a is -inf, inf or NaN then return NaN */
2917 y_result = lp_build_select(bld, isfinite, y_result,
2918 lp_build_const_vec(bld->gallivm, bld->type, NAN));
2919 return y_result;
2920 }
2921
2922
2923 /**
2924 * Generate sin(a)
2925 */
2926 LLVMValueRef
2927 lp_build_sin(struct lp_build_context *bld,
2928 LLVMValueRef a)
2929 {
2930 return lp_build_sin_or_cos(bld, a, FALSE);
2931 }
2932
2933
2934 /**
2935 * Generate cos(a)
2936 */
2937 LLVMValueRef
2938 lp_build_cos(struct lp_build_context *bld,
2939 LLVMValueRef a)
2940 {
2941 return lp_build_sin_or_cos(bld, a, TRUE);
2942 }
2943
2944
2945 /**
2946 * Generate pow(x, y)
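 * Computed as exp2(y * log2(x)); e.g. pow(2.0, 10.0) = exp2(10.0 * 1.0) = 1024.0.
 * Note this inherits lp_build_log2()'s undefined behaviour for x <= 0.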
2947 */
2948 LLVMValueRef
2949 lp_build_pow(struct lp_build_context *bld,
2950 LLVMValueRef x,
2951 LLVMValueRef y)
2952 {
2953 /* TODO: optimize the constant case */
2954 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2955 LLVMIsConstant(x) && LLVMIsConstant(y)) {
2956 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2957 __FUNCTION__);
2958 }
2959
2960 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
2961 }
2962
2963
2964 /**
2965 * Generate exp(x)
2966 */
2967 LLVMValueRef
2968 lp_build_exp(struct lp_build_context *bld,
2969 LLVMValueRef x)
2970 {
2971 /* log2(e) = 1/log(2) */
2972 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
2973 1.4426950408889634);
2974
2975 assert(lp_check_value(bld->type, x));
2976
2977 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
2978 }
2979
2980
2981 /**
2982 * Generate log(x)
2983 * Behavior is undefined with infs, 0s and nans
2984 */
2985 LLVMValueRef
2986 lp_build_log(struct lp_build_context *bld,
2987 LLVMValueRef x)
2988 {
2989 /* log(2) */
2990 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2991 0.69314718055994529);
2992
2993 assert(lp_check_value(bld->type, x));
2994
2995 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
2996 }
2997
2998 /**
2999 * Generate log(x) that handles edge cases (infs, 0s and nans)
3000 */
3001 LLVMValueRef
3002 lp_build_log_safe(struct lp_build_context *bld,
3003 LLVMValueRef x)
3004 {
3005 /* log(2) */
3006 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3007 0.69314718055994529);
3008
3009 assert(lp_check_value(bld->type, x));
3010
3011 return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
3012 }
3013
3014
3015 /**
3016 * Generate polynomial.
3017 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
3018 */
3019 LLVMValueRef
3020 lp_build_polynomial(struct lp_build_context *bld,
3021 LLVMValueRef x,
3022 const double *coeffs,
3023 unsigned num_coeffs)
3024 {
3025 const struct lp_type type = bld->type;
3026 LLVMValueRef even = NULL, odd = NULL;
3027 LLVMValueRef x2;
3028 unsigned i;
3029
3030 assert(lp_check_value(bld->type, x));
3031
3032 /* TODO: optimize the constant case */
3033 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3034 LLVMIsConstant(x)) {
3035 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3036 __FUNCTION__);
3037 }
3038
3039 /*
3040     * Calculate odd and even terms separately to decrease data dependency
3041 * Ex:
3042 * c[0] + x^2 * c[2] + x^4 * c[4] ...
3043 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
3044 */
3045 x2 = lp_build_mul(bld, x, x);
3046
3047 for (i = num_coeffs; i--; ) {
3048 LLVMValueRef coeff;
3049
3050 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
3051
3052 if (i % 2 == 0) {
3053 if (even)
3054 even = lp_build_mad(bld, x2, even, coeff);
3055 else
3056 even = coeff;
3057 } else {
3058 if (odd)
3059 odd = lp_build_mad(bld, x2, odd, coeff);
3060 else
3061 odd = coeff;
3062 }
3063 }
3064
3065 if (odd)
3066 return lp_build_mad(bld, odd, x, even);
3067 else if (even)
3068 return even;
3069 else
3070 return bld->undef;
3071 }
3072
3073
3074 /**
3075 * Minimax polynomial fit of 2**x, in range [0, 1[
3076 */
3077 const double lp_build_exp2_polynomial[] = {
3078 #if EXP_POLY_DEGREE == 5
3079 1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
3080 0.693153073200168932794,
3081 0.240153617044375388211,
3082 0.0558263180532956664775,
3083 0.00898934009049466391101,
3084 0.00187757667519147912699
3085 #elif EXP_POLY_DEGREE == 4
3086 1.00000259337069434683,
3087 0.693003834469974940458,
3088 0.24144275689150793076,
3089 0.0520114606103070150235,
3090 0.0135341679161270268764
3091 #elif EXP_POLY_DEGREE == 3
3092 0.999925218562710312959,
3093 0.695833540494823811697,
3094 0.226067155427249155588,
3095 0.0780245226406372992967
3096 #elif EXP_POLY_DEGREE == 2
3097 1.00172476321474503578,
3098 0.657636275736077639316,
3099 0.33718943461968720704
3100 #else
3101 #error
3102 #endif
3103 };
3104
3105
3106 LLVMValueRef
3107 lp_build_exp2(struct lp_build_context *bld,
3108 LLVMValueRef x)
3109 {
3110 LLVMBuilderRef builder = bld->gallivm->builder;
3111 const struct lp_type type = bld->type;
3112 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3113 LLVMValueRef ipart = NULL;
3114 LLVMValueRef fpart = NULL;
3115 LLVMValueRef expipart = NULL;
3116 LLVMValueRef expfpart = NULL;
3117 LLVMValueRef res = NULL;
3118
3119 assert(lp_check_value(bld->type, x));
3120
3121 /* TODO: optimize the constant case */
3122 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3123 LLVMIsConstant(x)) {
3124 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3125 __FUNCTION__);
3126 }
3127
3128 assert(type.floating && type.width == 32);
3129
3130    /* We want to preserve NaN, and make sure that for exp2 if x > 128
3131     * the result is INF and if x is smaller than -126.9 the result is 0 */
3132 x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
3133 GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3134 x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
3135 x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3136
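   /*
    * Split x so that 2^x = 2^ipart * 2^fpart with integral ipart and fpart
    * in [0, 1): 2^ipart is assembled by writing (ipart + 127) into the
    * exponent field of a float, 2^fpart is approximated by the polynomial.
    * Worked example: x = 3.5 -> ipart = 3, fpart = 0.5, expipart = 8.0,
    * expfpart ~= 1.41421, so the result is ~11.3137 (= 2^3.5).
    */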
3137 /* ipart = floor(x) */
3138 /* fpart = x - ipart */
3139 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3140
3141 /* expipart = (float) (1 << ipart) */
3142 expipart = LLVMBuildAdd(builder, ipart,
3143 lp_build_const_int_vec(bld->gallivm, type, 127), "");
3144 expipart = LLVMBuildShl(builder, expipart,
3145 lp_build_const_int_vec(bld->gallivm, type, 23), "");
3146 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
3147
3148 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3149 ARRAY_SIZE(lp_build_exp2_polynomial));
3150
3151 res = LLVMBuildFMul(builder, expipart, expfpart, "");
3152
3153 return res;
3154 }
3155
3156
3157
3158 /**
3159  * Extract the exponent of an IEEE-754 floating point value.
3160 *
3161 * Optionally apply an integer bias.
3162 *
3163 * Result is an integer value with
3164 *
3165 * ifloor(log2(x)) + bias
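 *
 * Worked example: x = 6.0f is 0x40C00000, whose biased exponent field is
 * 0x81 = 129, so with bias = 0 the result is 129 - 127 = 2 = ifloor(log2(6)).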
3166 */
3167 LLVMValueRef
3168 lp_build_extract_exponent(struct lp_build_context *bld,
3169 LLVMValueRef x,
3170 int bias)
3171 {
3172 LLVMBuilderRef builder = bld->gallivm->builder;
3173 const struct lp_type type = bld->type;
3174 unsigned mantissa = lp_mantissa(type);
3175 LLVMValueRef res;
3176
3177 assert(type.floating);
3178
3179 assert(lp_check_value(bld->type, x));
3180
3181 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3182
3183 res = LLVMBuildLShr(builder, x,
3184 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3185 res = LLVMBuildAnd(builder, res,
3186 lp_build_const_int_vec(bld->gallivm, type, 255), "");
3187 res = LLVMBuildSub(builder, res,
3188 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3189
3190 return res;
3191 }
3192
3193
3194 /**
3195  * Extract the mantissa of a floating point value.
3196 *
3197 * Result is a floating point value with
3198 *
3199  *   x / 2**floor(log2(x))
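 *
 * Worked example: x = 6.0 -> 6.0 / 2^2 = 1.5 (the result is always in [1, 2)).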
3200 */
3201 LLVMValueRef
3202 lp_build_extract_mantissa(struct lp_build_context *bld,
3203 LLVMValueRef x)
3204 {
3205 LLVMBuilderRef builder = bld->gallivm->builder;
3206 const struct lp_type type = bld->type;
3207 unsigned mantissa = lp_mantissa(type);
3208 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3209 (1ULL << mantissa) - 1);
3210 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3211 LLVMValueRef res;
3212
3213 assert(lp_check_value(bld->type, x));
3214
3215 assert(type.floating);
3216
3217 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3218
3219 /* res = x / 2**ipart */
3220 res = LLVMBuildAnd(builder, x, mantmask, "");
3221 res = LLVMBuildOr(builder, res, one, "");
3222 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3223
3224 return res;
3225 }
3226
3227
3228
3229 /**
3230  * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in the range [0, 1/9[
3231  * These coefficients can be generated with
3232 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3233 */
3234 const double lp_build_log2_polynomial[] = {
3235 #if LOG_POLY_DEGREE == 5
3236 2.88539008148777786488L,
3237 0.961796878841293367824L,
3238 0.577058946784739859012L,
3239 0.412914355135828735411L,
3240 0.308591899232910175289L,
3241 0.352376952300281371868L,
3242 #elif LOG_POLY_DEGREE == 4
3243 2.88539009343309178325L,
3244 0.961791550404184197881L,
3245 0.577440339438736392009L,
3246 0.403343858251329912514L,
3247 0.406718052498846252698L,
3248 #elif LOG_POLY_DEGREE == 3
3249 2.88538959748872753838L,
3250 0.961932915889597772928L,
3251 0.571118517972136195241L,
3252 0.493997535084709500285L,
3253 #else
3254 #error
3255 #endif
3256 };
3257
3258 /**
3259 * See http://www.devmaster.net/forums/showthread.php?p=43580
3260 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3261 * http://www.nezumi.demon.co.uk/consult/logx.htm
3262 *
3263 * If handle_edge_cases is true the function will perform computations
3264 * to match the required D3D10+ behavior for each of the edge cases.
3265 * That means that if input is:
3266  * - less than zero (down to and including -inf) then NaN will be returned
3267 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3268 * - +infinity, then +infinity will be returned
3269 * - NaN, then NaN will be returned
3270 *
3271 * Those checks are fairly expensive so if you don't need them make sure
3272 * handle_edge_cases is false.
3273 */
3274 void
3275 lp_build_log2_approx(struct lp_build_context *bld,
3276 LLVMValueRef x,
3277 LLVMValueRef *p_exp,
3278 LLVMValueRef *p_floor_log2,
3279 LLVMValueRef *p_log2,
3280 boolean handle_edge_cases)
3281 {
3282 LLVMBuilderRef builder = bld->gallivm->builder;
3283 const struct lp_type type = bld->type;
3284 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3285 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3286
3287 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3288 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3289 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3290
3291 LLVMValueRef i = NULL;
3292 LLVMValueRef y = NULL;
3293 LLVMValueRef z = NULL;
3294 LLVMValueRef exp = NULL;
3295 LLVMValueRef mant = NULL;
3296 LLVMValueRef logexp = NULL;
3297 LLVMValueRef p_z = NULL;
3298 LLVMValueRef res = NULL;
3299
3300 assert(lp_check_value(bld->type, x));
3301
3302 if(p_exp || p_floor_log2 || p_log2) {
3303 /* TODO: optimize the constant case */
3304 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3305 LLVMIsConstant(x)) {
3306 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3307 __FUNCTION__);
3308 }
3309
3310 assert(type.floating && type.width == 32);
3311
3312 /*
3313 * We don't explicitly handle denormalized numbers. They will yield a
3314        * result in the neighbourhood of -127, which appears to be
3315        * adequate.
3316 */
3317
3318 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3319
3320 /* exp = (float) exponent(x) */
3321 exp = LLVMBuildAnd(builder, i, expmask, "");
3322 }
3323
3324 if(p_floor_log2 || p_log2) {
3325 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3326 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3327 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3328 }
3329
3330 if (p_log2) {
3331 /* mant = 1 + (float) mantissa(x) */
3332 mant = LLVMBuildAnd(builder, i, mantmask, "");
3333 mant = LLVMBuildOr(builder, mant, one, "");
3334 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3335
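      /*
       * log2(x) = logexp + log2(mant).  With y = (mant - 1) / (mant + 1) we
       * have mant = (1 + y) / (1 - y), hence
       * log2(mant) = (2/ln(2)) * (y + y^3/3 + y^5/5 + ...), which is what the
       * minimax polynomial P(z), z = y^2, approximates below as y * P(z).
       */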
3336 /* y = (mant - 1) / (mant + 1) */
3337 y = lp_build_div(bld,
3338 lp_build_sub(bld, mant, bld->one),
3339 lp_build_add(bld, mant, bld->one)
3340 );
3341
3342 /* z = y^2 */
3343 z = lp_build_mul(bld, y, y);
3344
3345 /* compute P(z) */
3346 p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3347 ARRAY_SIZE(lp_build_log2_polynomial));
3348
3349 /* y * P(z) + logexp */
3350 res = lp_build_mad(bld, y, p_z, logexp);
3351
3352 if (type.floating && handle_edge_cases) {
3353 LLVMValueRef negmask, infmask, zmask;
3354 negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3355 lp_build_const_vec(bld->gallivm, type, 0.0f));
3356 zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3357 lp_build_const_vec(bld->gallivm, type, 0.0f));
3358 infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3359 lp_build_const_vec(bld->gallivm, type, INFINITY));
3360
3361          /* If x is equal to inf make sure we return inf */
3362 res = lp_build_select(bld, infmask,
3363 lp_build_const_vec(bld->gallivm, type, INFINITY),
3364 res);
3365          /* If x is equal to 0, return -inf */
3366 res = lp_build_select(bld, zmask,
3367 lp_build_const_vec(bld->gallivm, type, -INFINITY),
3368 res);
3369 /* If x is nan or less than 0, return nan */
3370 res = lp_build_select(bld, negmask,
3371 lp_build_const_vec(bld->gallivm, type, NAN),
3372 res);
3373 }
3374 }
3375
3376 if (p_exp) {
3377 exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3378 *p_exp = exp;
3379 }
3380
3381 if (p_floor_log2)
3382 *p_floor_log2 = logexp;
3383
3384 if (p_log2)
3385 *p_log2 = res;
3386 }
3387
3388
3389 /*
3390 * log2 implementation which doesn't have special code to
3391 * handle edge cases (-inf, 0, inf, NaN). It's faster but
3392 * the results for those cases are undefined.
3393 */
3394 LLVMValueRef
3395 lp_build_log2(struct lp_build_context *bld,
3396 LLVMValueRef x)
3397 {
3398 LLVMValueRef res;
3399 lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
3400 return res;
3401 }
3402
3403 /*
3404 * Version of log2 which handles all edge cases.
3405 * Look at documentation of lp_build_log2_approx for
3406 * description of the behavior for each of the edge cases.
3407 */
3408 LLVMValueRef
3409 lp_build_log2_safe(struct lp_build_context *bld,
3410 LLVMValueRef x)
3411 {
3412 LLVMValueRef res;
3413 lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
3414 return res;
3415 }
3416
3417
3418 /**
3419 * Faster (and less accurate) log2.
3420 *
3421 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3422 *
3423 * Piece-wise linear approximation, with exact results when x is a
3424 * power of two.
3425 *
3426 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
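 *
 * Worked example: x = 8.0 gives 2 + 8/8 = 3.0 (exact at powers of two);
 * x = 6.0 gives 1 + 6/4 = 2.5 versus the true log2(6) ~= 2.585.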
3427 */
3428 LLVMValueRef
3429 lp_build_fast_log2(struct lp_build_context *bld,
3430 LLVMValueRef x)
3431 {
3432 LLVMBuilderRef builder = bld->gallivm->builder;
3433 LLVMValueRef ipart;
3434 LLVMValueRef fpart;
3435
3436 assert(lp_check_value(bld->type, x));
3437
3438 assert(bld->type.floating);
3439
3440 /* ipart = floor(log2(x)) - 1 */
3441 ipart = lp_build_extract_exponent(bld, x, -1);
3442 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3443
3444 /* fpart = x / 2**ipart */
3445 fpart = lp_build_extract_mantissa(bld, x);
3446
3447 /* ipart + fpart */
3448 return LLVMBuildFAdd(builder, ipart, fpart, "");
3449 }
3450
3451
3452 /**
3453 * Fast implementation of iround(log2(x)).
3454 *
3455 * Not an approximation -- it should give accurate results all the time.
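 *
 * Multiplying by sqrt(2) before extracting the exponent turns floor(log2(x))
 * into round(log2(x)).  Worked example: x = 6.0 -> log2(6) ~= 2.585, so
 * iround gives 3; 6.0 * sqrt(2) ~= 8.49 and floor(log2(8.49)) = 3.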
3456 */
3457 LLVMValueRef
3458 lp_build_ilog2(struct lp_build_context *bld,
3459 LLVMValueRef x)
3460 {
3461 LLVMBuilderRef builder = bld->gallivm->builder;
3462 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3463 LLVMValueRef ipart;
3464
3465 assert(bld->type.floating);
3466
3467 assert(lp_check_value(bld->type, x));
3468
3469    /* x * 2^0.5, i.e. add 0.5 to log2(x) */
3470 x = LLVMBuildFMul(builder, x, sqrt2, "");
3471
3472 /* ipart = floor(log2(x) + 0.5) */
3473 ipart = lp_build_extract_exponent(bld, x, 0);
3474
3475 return ipart;
3476 }
3477
3478 LLVMValueRef
3479 lp_build_mod(struct lp_build_context *bld,
3480 LLVMValueRef x,
3481 LLVMValueRef y)
3482 {
3483 LLVMBuilderRef builder = bld->gallivm->builder;
3484 LLVMValueRef res;
3485 const struct lp_type type = bld->type;
3486
3487 assert(lp_check_value(type, x));
3488 assert(lp_check_value(type, y));
3489
3490 if (type.floating)
3491 res = LLVMBuildFRem(builder, x, y, "");
3492 else if (type.sign)
3493 res = LLVMBuildSRem(builder, x, y, "");
3494 else
3495 res = LLVMBuildURem(builder, x, y, "");
3496 return res;
3497 }
3498
3499
3500 /*
3501 * For floating inputs it creates and returns a mask
3502 * which is all 1's for channels which are NaN.
3503 * Channels inside x which are not NaN will be 0.
3504 */
3505 LLVMValueRef
3506 lp_build_isnan(struct lp_build_context *bld,
3507 LLVMValueRef x)
3508 {
3509 LLVMValueRef mask;
3510 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3511
3512 assert(bld->type.floating);
3513 assert(lp_check_value(bld->type, x));
3514
3515 mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3516 "isnotnan");
3517 mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3518 mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3519 return mask;
3520 }
3521
3522 /* Returns all 1's for floating point numbers that are
3523  * finite, and returns all zeros for -inf, +inf
3524  * and NaNs. */
3525 LLVMValueRef
3526 lp_build_isfinite(struct lp_build_context *bld,
3527 LLVMValueRef x)
3528 {
3529 LLVMBuilderRef builder = bld->gallivm->builder;
3530 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3531 struct lp_type int_type = lp_int_type(bld->type);
3532 LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3533 LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3534 0x7f800000);
3535
3536 if (!bld->type.floating) {
3537 return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3538 }
3539 assert(bld->type.floating);
3540 assert(lp_check_value(bld->type, x));
3541 assert(bld->type.width == 32);
3542
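   /*
    * Inf and NaN are exactly the encodings whose exponent field is all ones,
    * so x is finite iff (x & 0x7f800000) != 0x7f800000.
    */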
3543 intx = LLVMBuildAnd(builder, intx, infornan32, "");
3544 return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3545 intx, infornan32);
3546 }
3547
3548 /*
3549 * Returns true if the number is nan or inf and false otherwise.
3550 * The input has to be a floating point vector.
3551 */
3552 LLVMValueRef
3553 lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
3554 const struct lp_type type,
3555 LLVMValueRef x)
3556 {
3557 LLVMBuilderRef builder = gallivm->builder;
3558 struct lp_type int_type = lp_int_type(type);
3559 LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
3560 0x7f800000);
3561 LLVMValueRef ret;
3562
3563 assert(type.floating);
3564
3565 ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
3566 ret = LLVMBuildAnd(builder, ret, const0, "");
3567 ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
3568 ret, const0);
3569
3570 return ret;
3571 }
3572
3573
3574 LLVMValueRef
3575 lp_build_fpstate_get(struct gallivm_state *gallivm)
3576 {
3577 if (util_cpu_caps.has_sse) {
3578 LLVMBuilderRef builder = gallivm->builder;
3579 LLVMValueRef mxcsr_ptr = lp_build_alloca(
3580 gallivm,
3581 LLVMInt32TypeInContext(gallivm->context),
3582 "mxcsr_ptr");
3583 LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
3584 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3585 lp_build_intrinsic(builder,
3586 "llvm.x86.sse.stmxcsr",
3587 LLVMVoidTypeInContext(gallivm->context),
3588 &mxcsr_ptr8, 1, 0);
3589 return mxcsr_ptr;
3590 }
3591 return 0;
3592 }
3593
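/*
 * Typical usage sketch (illustrative only): save the FP state, force
 * flush-to-zero / denormals-are-zero around a denormal-sensitive section,
 * then restore it:
 *
 *    LLVMValueRef fpstate = lp_build_fpstate_get(gallivm);
 *    lp_build_fpstate_set_denorms_zero(gallivm, TRUE);
 *    ... emit code that must not see denormals ...
 *    lp_build_fpstate_set(gallivm, fpstate);
 */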
3594 void
3595 lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
3596 boolean zero)
3597 {
3598 if (util_cpu_caps.has_sse) {
3599 /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
3600 int daz_ftz = _MM_FLUSH_ZERO_MASK;
3601
3602 LLVMBuilderRef builder = gallivm->builder;
3603 LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
3604 LLVMValueRef mxcsr =
3605 LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");
3606
3607 if (util_cpu_caps.has_daz) {
3608          /* Enable denormals-are-zero mode */
3609 daz_ftz |= _MM_DENORMALS_ZERO_MASK;
3610 }
3611 if (zero) {
3612 mxcsr = LLVMBuildOr(builder, mxcsr,
3613 LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
3614 } else {
3615 mxcsr = LLVMBuildAnd(builder, mxcsr,
3616 LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
3617 }
3618
3619 LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
3620 lp_build_fpstate_set(gallivm, mxcsr_ptr);
3621 }
3622 }
3623
3624 void
3625 lp_build_fpstate_set(struct gallivm_state *gallivm,
3626 LLVMValueRef mxcsr_ptr)
3627 {
3628 if (util_cpu_caps.has_sse) {
3629 LLVMBuilderRef builder = gallivm->builder;
3630 mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
3631 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3632 lp_build_intrinsic(builder,
3633 "llvm.x86.sse.ldmxcsr",
3634 LLVMVoidTypeInContext(gallivm->context),
3635 &mxcsr_ptr, 1, 0);
3636 }
3637 }