src/gallium/auxiliary/gallivm/lp_bld_arit.c

   1 /**************************************************************************
   2  *
   3  * Copyright 2009-2010 VMware, Inc.
   4  * All Rights Reserved.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the
   8  * "Software"), to deal in the Software without restriction, including
   9  * without limitation the rights to use, copy, modify, merge, publish,
  10  * distribute, sub license, and/or sell copies of the Software, and to
  11  * permit persons to whom the Software is furnished to do so, subject to
  12  * the following conditions:
  13  *
  14  * The above copyright notice and this permission notice (including the
  15  * next paragraph) shall be included in all copies or substantial portions
  16  * of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  21  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
  22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25  *
  26  **************************************************************************/
  27
  28
/**
 * @file
 * Helper arithmetic functions.
 *
 * LLVM IR doesn't support all basic arithmetic operations we care about (most
 * notably min/max and saturated operations), and it is often necessary to
 * resort to machine-specific intrinsics directly. The functions here hide all
 * these implementation details from the other modules.
 *
 * We also do simple expression simplification here. Reasons are:
 * - it is very easy given we have all necessary information readily available
 * - LLVM optimization passes fail to simplify several vector expressions
 * - We often know value constraints which the optimization passes have no way
 *   of knowing, such as when source arguments are known to be in [0, 1] range.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */
  46
  47
  48 #include <float.h>
  49
  50 #include "util/u_memory.h"
  51 #include "util/u_debug.h"
  52 #include "util/u_math.h"
  53 #include "util/u_cpu_detect.h"
  54
  55 #include "lp_bld_type.h"
  56 #include "lp_bld_const.h"
  57 #include "lp_bld_init.h"
  58 #include "lp_bld_intr.h"
  59 #include "lp_bld_logic.h"
  60 #include "lp_bld_pack.h"
  61 #include "lp_bld_debug.h"
  62 #include "lp_bld_bitarit.h"
  63 #include "lp_bld_arit.h"
  64 #include "lp_bld_flow.h"
  65
  66 #if defined(PIPE_ARCH_SSE)
  67 #include <xmmintrin.h>
  68 #endif
  69
/*
 * Fallback definition, used only when no included header already provides
 * _MM_DENORMALS_ZERO_MASK.
 * NOTE(review): 0x0040 is presumably the "denormals are zero" bit of the SSE
 * MXCSR control register -- confirm against the Intel SDM.
 */
#ifndef _MM_DENORMALS_ZERO_MASK
#define _MM_DENORMALS_ZERO_MASK 0x0040
#endif

/*
 * Fallback definition, used only when no included header already provides
 * _MM_FLUSH_ZERO_MASK.
 * NOTE(review): 0x8000 is presumably the "flush to zero" bit of the SSE
 * MXCSR control register -- confirm against the Intel SDM.
 */
#ifndef _MM_FLUSH_ZERO_MASK
#define _MM_FLUSH_ZERO_MASK 0x8000
#endif

/*
 * NOTE(review): not referenced in this part of the file; presumably the
 * degree of the polynomial used to approximate exp -- confirm at the use site.
 */
#define EXP_POLY_DEGREE 5

/*
 * NOTE(review): not referenced in this part of the file; presumably the
 * degree of the polynomial used to approximate log -- confirm at the use site.
 */
#define LOG_POLY_DEGREE 4
  81
  82
  83 /**
  84  * Generate min(a, b)
  85  * No checks for special case values of a or b = 1 or 0 are done.
  86  * NaN's are handled according to the behavior specified by the
  87  * nan_behavior argument.
  88  */
  89 static LLVMValueRef
  90 lp_build_min_simple(struct lp_build_context *bld,
  91                     LLVMValueRef a,
  92                     LLVMValueRef b,
  93                     enum gallivm_nan_behavior nan_behavior)
  94 {
  95    const struct lp_type type = bld->type;
  96    const char *intrinsic = NULL;
  97    unsigned intr_size = 0;
  98    LLVMValueRef cond;
  99
 100    assert(lp_check_value(type, a));
 101    assert(lp_check_value(type, b));
 102
 103    /* TODO: optimize the constant case */
 104
 105    if (type.floating && util_cpu_caps.has_sse) {
 106       if (type.width == 32) {
 107          if (type.length == 1) {
 108             intrinsic = "llvm.x86.sse.min.ss";
 109             intr_size = 128;
 110          }
 111          else if (type.length <= 4 || !util_cpu_caps.has_avx) {
 112             intrinsic = "llvm.x86.sse.min.ps";
 113             intr_size = 128;
 114          }
 115          else {
 116             intrinsic = "llvm.x86.avx.min.ps.256";
 117             intr_size = 256;
 118          }
 119       }
 120       if (type.width == 64 && util_cpu_caps.has_sse2) {
 121          if (type.length == 1) {
 122             intrinsic = "llvm.x86.sse2.min.sd";
 123             intr_size = 128;
 124          }
 125          else if (type.length == 2 || !util_cpu_caps.has_avx) {
 126             intrinsic = "llvm.x86.sse2.min.pd";
 127             intr_size = 128;
 128          }
 129          else {
 130             intrinsic = "llvm.x86.avx.min.pd.256";
 131             intr_size = 256;
 132          }
 133       }
 134    }
 135    else if (type.floating && util_cpu_caps.has_altivec) {
 136       if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
 137           nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
 138          debug_printf("%s: altivec doesn't support nan return nan behavior\n",
 139                       __FUNCTION__);
 140       }
 141       if (type.width == 32 && type.length == 4) {
 142          intrinsic = "llvm.ppc.altivec.vminfp";
 143          intr_size = 128;
 144       }
 145    } else if (HAVE_LLVM < 0x0309 &&
 146               util_cpu_caps.has_avx2 && type.length > 4) {
 147       intr_size = 256;
 148       switch (type.width) {
 149       case 8:
 150          intrinsic = type.sign ? "llvm.x86.avx2.pmins.b" : "llvm.x86.avx2.pminu.b";
 151          break;
 152       case 16:
 153          intrinsic = type.sign ? "llvm.x86.avx2.pmins.w" : "llvm.x86.avx2.pminu.w";
 154          break;
 155       case 32:
 156          intrinsic = type.sign ? "llvm.x86.avx2.pmins.d" : "llvm.x86.avx2.pminu.d";
 157          break;
 158       }
 159    } else if (HAVE_LLVM < 0x0309 &&
 160               util_cpu_caps.has_sse2 && type.length >= 2) {
 161       intr_size = 128;
 162       if ((type.width == 8 || type.width == 16) &&
 163           (type.width * type.length <= 64) &&
 164           (gallivm_debug & GALLIVM_DEBUG_PERF)) {
 165          debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
 166                       __FUNCTION__);
 167       }
 168       if (type.width == 8 && !type.sign) {
 169          intrinsic = "llvm.x86.sse2.pminu.b";
 170       }
 171       else if (type.width == 16 && type.sign) {
 172          intrinsic = "llvm.x86.sse2.pmins.w";
 173       }
 174       if (util_cpu_caps.has_sse4_1) {
 175          if (type.width == 8 && type.sign) {
 176             intrinsic = "llvm.x86.sse41.pminsb";
 177          }
 178          if (type.width == 16 && !type.sign) {
 179             intrinsic = "llvm.x86.sse41.pminuw";
 180          }
 181          if (type.width == 32 && !type.sign) {
 182             intrinsic = "llvm.x86.sse41.pminud";
 183          }
 184          if (type.width == 32 && type.sign) {
 185             intrinsic = "llvm.x86.sse41.pminsd";
 186          }
 187       }
 188    } else if (util_cpu_caps.has_altivec) {
 189       intr_size = 128;
 190       if (type.width == 8) {
 191          if (!type.sign) {
 192             intrinsic = "llvm.ppc.altivec.vminub";
 193          } else {
 194             intrinsic = "llvm.ppc.altivec.vminsb";
 195          }
 196       } else if (type.width == 16) {
 197          if (!type.sign) {
 198             intrinsic = "llvm.ppc.altivec.vminuh";
 199          } else {
 200             intrinsic = "llvm.ppc.altivec.vminsh";
 201          }
 202       } else if (type.width == 32) {
 203          if (!type.sign) {
 204             intrinsic = "llvm.ppc.altivec.vminuw";
 205          } else {
 206             intrinsic = "llvm.ppc.altivec.vminsw";
 207          }
 208       }
 209    }
 210
 211    if (intrinsic) {
 212       /* We need to handle nan's for floating point numbers. If one of the
 213        * inputs is nan the other should be returned (required by both D3D10+
 214        * and OpenCL).
 215        * The sse intrinsics return the second operator in case of nan by
 216        * default so we need to special code to handle those.
 217        */
 218       if (util_cpu_caps.has_sse && type.floating &&
 219           nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
 220           nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
 221           nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
 222          LLVMValueRef isnan, min;
 223          min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
 224                                                    type,
 225                                                    intr_size, a, b);
 226          if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
 227             isnan = lp_build_isnan(bld, b);
 228             return lp_build_select(bld, isnan, a, min);
 229          } else {
 230             assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
 231             isnan = lp_build_isnan(bld, a);
 232             return lp_build_select(bld, isnan, a, min);
 233          }
 234       } else {
 235          return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
 236                                                     type,
 237                                                     intr_size, a, b);
 238       }
 239    }
 240
 241    if (type.floating) {
 242       switch (nan_behavior) {
 243       case GALLIVM_NAN_RETURN_NAN: {
 244          LLVMValueRef isnan = lp_build_isnan(bld, b);
 245          cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
 246          cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
 247          return lp_build_select(bld, cond, a, b);
 248       }
 249          break;
 250       case GALLIVM_NAN_RETURN_OTHER: {
 251          LLVMValueRef isnan = lp_build_isnan(bld, a);
 252          cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
 253          cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
 254          return lp_build_select(bld, cond, a, b);
 255       }
 256          break;
 257       case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
 258          cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
 259          return lp_build_select(bld, cond, a, b);
 260       case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
 261          cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
 262          return lp_build_select(bld, cond, b, a);
 263       case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
 264          cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
 265          return lp_build_select(bld, cond, a, b);
 266          break;
 267       default:
 268          assert(0);
 269          cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
 270          return lp_build_select(bld, cond, a, b);
 271       }
 272    } else {
 273       cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
 274       return lp_build_select(bld, cond, a, b);
 275    }
 276 }
 277
 278
 279 LLVMValueRef
 280 lp_build_fmuladd(LLVMBuilderRef builder,
 281                  LLVMValueRef a,
 282                  LLVMValueRef b,
 283                  LLVMValueRef c)
 284 {
 285    LLVMTypeRef type = LLVMTypeOf(a);
 286    assert(type == LLVMTypeOf(b));
 287    assert(type == LLVMTypeOf(c));
 288    if (HAVE_LLVM < 0x0304) {
 289       /* XXX: LLVM 3.3 does not breakdown llvm.fmuladd into mul+add when FMA is
 290        * not supported, and instead it falls-back to a C function.
 291        */
 292       return LLVMBuildFAdd(builder, LLVMBuildFMul(builder, a, b, ""), c, "");
 293    }
 294    char intrinsic[32];
 295    lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
 296    LLVMValueRef args[] = { a, b, c };
 297    return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0);
 298 }
 299
 300
 301 /**
 302  * Generate max(a, b)
 303  * No checks for special case values of a or b = 1 or 0 are done.
 304  * NaN's are handled according to the behavior specified by the
 305  * nan_behavior argument.
 306  */
 307 static LLVMValueRef
 308 lp_build_max_simple(struct lp_build_context *bld,
 309                     LLVMValueRef a,
 310                     LLVMValueRef b,
 311                     enum gallivm_nan_behavior nan_behavior)
 312 {
 313    const struct lp_type type = bld->type;
 314    const char *intrinsic = NULL;
 315    unsigned intr_size = 0;
 316    LLVMValueRef cond;
 317
 318    assert(lp_check_value(type, a));
 319    assert(lp_check_value(type, b));
 320
 321    /* TODO: optimize the constant case */
 322
 323    if (type.floating && util_cpu_caps.has_sse) {
 324       if (type.width == 32) {
 325          if (type.length == 1) {
 326             intrinsic = "llvm.x86.sse.max.ss";
 327             intr_size = 128;
 328          }
 329          else if (type.length <= 4 || !util_cpu_caps.has_avx) {
 330             intrinsic = "llvm.x86.sse.max.ps";
 331             intr_size = 128;
 332          }
 333          else {
 334             intrinsic = "llvm.x86.avx.max.ps.256";
 335             intr_size = 256;
 336          }
 337       }
 338       if (type.width == 64 && util_cpu_caps.has_sse2) {
 339          if (type.length == 1) {
 340             intrinsic = "llvm.x86.sse2.max.sd";
 341             intr_size = 128;
 342          }
 343          else if (type.length == 2 || !util_cpu_caps.has_avx) {
 344             intrinsic = "llvm.x86.sse2.max.pd";
 345             intr_size = 128;
 346          }
 347          else {
 348             intrinsic = "llvm.x86.avx.max.pd.256";
 349             intr_size = 256;
 350          }
 351       }
 352    }
 353    else if (type.floating && util_cpu_caps.has_altivec) {
 354       if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
 355           nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
 356          debug_printf("%s: altivec doesn't support nan return nan behavior\n",
 357                       __FUNCTION__);
 358       }
 359       if (type.width == 32 || type.length == 4) {
 360          intrinsic = "llvm.ppc.altivec.vmaxfp";
 361          intr_size = 128;
 362       }
 363    } else if (HAVE_LLVM < 0x0309 &&
 364               util_cpu_caps.has_avx2 && type.length > 4) {
 365       intr_size = 256;
 366       switch (type.width) {
 367       case 8:
 368          intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.b" : "llvm.x86.avx2.pmaxu.b";
 369          break;
 370       case 16:
 371          intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.w" : "llvm.x86.avx2.pmaxu.w";
 372          break;
 373       case 32:
 374          intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.d" : "llvm.x86.avx2.pmaxu.d";
 375          break;
 376       }
 377    } else if (HAVE_LLVM < 0x0309 &&
 378               util_cpu_caps.has_sse2 && type.length >= 2) {
 379       intr_size = 128;
 380       if ((type.width == 8 || type.width == 16) &&
 381           (type.width * type.length <= 64) &&
 382           (gallivm_debug & GALLIVM_DEBUG_PERF)) {
 383          debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
 384                       __FUNCTION__);
 385          }
 386       if (type.width == 8 && !type.sign) {
 387          intrinsic = "llvm.x86.sse2.pmaxu.b";
 388          intr_size = 128;
 389       }
 390       else if (type.width == 16 && type.sign) {
 391          intrinsic = "llvm.x86.sse2.pmaxs.w";
 392       }
 393       if (util_cpu_caps.has_sse4_1) {
 394          if (type.width == 8 && type.sign) {
 395             intrinsic = "llvm.x86.sse41.pmaxsb";
 396          }
 397          if (type.width == 16 && !type.sign) {
 398             intrinsic = "llvm.x86.sse41.pmaxuw";
 399          }
 400          if (type.width == 32 && !type.sign) {
 401             intrinsic = "llvm.x86.sse41.pmaxud";
 402         }
 403          if (type.width == 32 && type.sign) {
 404             intrinsic = "llvm.x86.sse41.pmaxsd";
 405          }
 406       }
 407    } else if (util_cpu_caps.has_altivec) {
 408      intr_size = 128;
 409      if (type.width == 8) {
 410        if (!type.sign) {
 411          intrinsic = "llvm.ppc.altivec.vmaxub";
 412        } else {
 413          intrinsic = "llvm.ppc.altivec.vmaxsb";
 414        }
 415      } else if (type.width == 16) {
 416        if (!type.sign) {
 417          intrinsic = "llvm.ppc.altivec.vmaxuh";
 418        } else {
 419          intrinsic = "llvm.ppc.altivec.vmaxsh";
 420        }
 421      } else if (type.width == 32) {
 422        if (!type.sign) {
 423          intrinsic = "llvm.ppc.altivec.vmaxuw";
 424        } else {
 425          intrinsic = "llvm.ppc.altivec.vmaxsw";
 426        }
 427      }
 428    }
 429
 430    if (intrinsic) {
 431       if (util_cpu_caps.has_sse && type.floating &&
 432           nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
 433           nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
 434           nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
 435          LLVMValueRef isnan, max;
 436          max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
 437                                                    type,
 438                                                    intr_size, a, b);
 439          if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
 440             isnan = lp_build_isnan(bld, b);
 441             return lp_build_select(bld, isnan, a, max);
 442          } else {
 443             assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
 444             isnan = lp_build_isnan(bld, a);
 445             return lp_build_select(bld, isnan, a, max);
 446          }
 447       } else {
 448          return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
 449                                                     type,
 450                                                     intr_size, a, b);
 451       }
 452    }
 453
 454    if (type.floating) {
 455       switch (nan_behavior) {
 456       case GALLIVM_NAN_RETURN_NAN: {
 457          LLVMValueRef isnan = lp_build_isnan(bld, b);
 458          cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
 459          cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
 460          return lp_build_select(bld, cond, a, b);
 461       }
 462          break;
 463       case GALLIVM_NAN_RETURN_OTHER: {
 464          LLVMValueRef isnan = lp_build_isnan(bld, a);
 465          cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
 466          cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
 467          return lp_build_select(bld, cond, a, b);
 468       }
 469          break;
 470       case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
 471          cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
 472          return lp_build_select(bld, cond, a, b);
 473       case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
 474          cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
 475          return lp_build_select(bld, cond, b, a);
 476       case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
 477          cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
 478          return lp_build_select(bld, cond, a, b);
 479          break;
 480       default:
 481          assert(0);
 482          cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
 483          return lp_build_select(bld, cond, a, b);
 484       }
 485    } else {
 486       cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
 487       return lp_build_select(bld, cond, a, b);
 488    }
 489 }
 490
 491
 492 /**
 493  * Generate 1 - a, or ~a depending on bld->type.
 494  */
 495 LLVMValueRef
 496 lp_build_comp(struct lp_build_context *bld,
 497               LLVMValueRef a)
 498 {
 499    LLVMBuilderRef builder = bld->gallivm->builder;
 500    const struct lp_type type = bld->type;
 501
 502    assert(lp_check_value(type, a));
 503
 504    if(a == bld->one)
 505       return bld->zero;
 506    if(a == bld->zero)
 507       return bld->one;
 508
 509    if(type.norm && !type.floating && !type.fixed && !type.sign) {
 510       if(LLVMIsConstant(a))
 511          return LLVMConstNot(a);
 512       else
 513          return LLVMBuildNot(builder, a, "");
 514    }
 515
 516    if(LLVMIsConstant(a))
 517       if (type.floating)
 518           return LLVMConstFSub(bld->one, a);
 519       else
 520           return LLVMConstSub(bld->one, a);
 521    else
 522       if (type.floating)
 523          return LLVMBuildFSub(builder, bld->one, a, "");
 524       else
 525          return LLVMBuildSub(builder, bld->one, a, "");
 526 }
 527
 528
 529 /**
 530  * Generate a + b
 531  */
 532 LLVMValueRef
 533 lp_build_add(struct lp_build_context *bld,
 534              LLVMValueRef a,
 535              LLVMValueRef b)
 536 {
 537    LLVMBuilderRef builder = bld->gallivm->builder;
 538    const struct lp_type type = bld->type;
 539    LLVMValueRef res;
 540
 541    assert(lp_check_value(type, a));
 542    assert(lp_check_value(type, b));
 543
 544    if (a == bld->zero)
 545       return b;
 546    if (b == bld->zero)
 547       return a;
 548    if (a == bld->undef || b == bld->undef)
 549       return bld->undef;
 550
 551    if (type.norm) {
 552       const char *intrinsic = NULL;
 553
 554       if (!type.sign && (a == bld->one || b == bld->one))
 555         return bld->one;
 556
 557       if (!type.floating && !type.fixed) {
 558          if (type.width * type.length == 128) {
 559             if (util_cpu_caps.has_sse2) {
 560                if (type.width == 8)
 561                  intrinsic = type.sign ? "llvm.x86.sse2.padds.b" :
 562                                          HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.paddus.b" : NULL;
 563                if (type.width == 16)
 564                  intrinsic = type.sign ? "llvm.x86.sse2.padds.w" :
 565                                          HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.paddus.w" : NULL;
 566             } else if (util_cpu_caps.has_altivec) {
 567                if (type.width == 8)
 568                   intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
 569                if (type.width == 16)
 570                   intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
 571             }
 572          }
 573          if (type.width * type.length == 256) {
 574             if (util_cpu_caps.has_avx2) {
 575                if (type.width == 8)
 576                   intrinsic = type.sign ? "llvm.x86.avx2.padds.b" :
 577                                           HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.paddus.b" : NULL;
 578                if (type.width == 16)
 579                   intrinsic = type.sign ? "llvm.x86.avx2.padds.w" :
 580                                           HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.paddus.w" : NULL;
 581             }
 582          }
 583       }
 584
 585       if (intrinsic)
 586          return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
 587    }
 588
 589    if(type.norm && !type.floating && !type.fixed) {
 590       if (type.sign) {
 591          uint64_t sign = (uint64_t)1 << (type.width - 1);
 592          LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
 593          LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
 594          /* a_clamp_max is the maximum a for positive b,
 595             a_clamp_min is the minimum a for negative b. */
 596          LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
 597          LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
 598          a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
 599       }
 600    }
 601
 602    if(LLVMIsConstant(a) && LLVMIsConstant(b))
 603       if (type.floating)
 604          res = LLVMConstFAdd(a, b);
 605       else
 606          res = LLVMConstAdd(a, b);
 607    else
 608       if (type.floating)
 609          res = LLVMBuildFAdd(builder, a, b, "");
 610       else
 611          res = LLVMBuildAdd(builder, a, b, "");
 612
 613    /* clamp to ceiling of 1.0 */
 614    if(bld->type.norm && (bld->type.floating || bld->type.fixed))
 615       res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
 616
 617    if (type.norm && !type.floating && !type.fixed) {
 618       if (!type.sign) {
 619          /*
 620           * newer llvm versions no longer support the intrinsics, but recognize
 621           * the pattern. Since auto-upgrade of intrinsics doesn't work for jit
 622           * code, it is important we match the pattern llvm uses (and pray llvm
 623           * doesn't change it - and hope they decide on the same pattern for
 624           * all backends supporting it...).
 625           * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
 626           * interfere with llvm's ability to recognize the pattern but seems
 627           * a bit brittle.
 628           */
 629          LLVMValueRef overflowed = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, res);
 630          res = lp_build_select(bld, overflowed,
 631                                LLVMConstAllOnes(bld->int_vec_type), res);
 632       }
 633    }
 634
 635    /* XXX clamp to floor of -1 or 0??? */
 636
 637    return res;
 638 }
 639
 640
 641 /** Return the scalar sum of the elements of a.
 642  * Should avoid this operation whenever possible.
 643  */
 644 LLVMValueRef
 645 lp_build_horizontal_add(struct lp_build_context *bld,
 646                         LLVMValueRef a)
 647 {
 648    LLVMBuilderRef builder = bld->gallivm->builder;
 649    const struct lp_type type = bld->type;
 650    LLVMValueRef index, res;
 651    unsigned i, length;
 652    LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
 653    LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
 654    LLVMValueRef vecres, elem2;
 655
 656    assert(lp_check_value(type, a));
 657
 658    if (type.length == 1) {
 659       return a;
 660    }
 661
 662    assert(!bld->type.norm);
 663
 664    /*
 665     * for byte vectors can do much better with psadbw.
 666     * Using repeated shuffle/adds here. Note with multiple vectors
 667     * this can be done more efficiently as outlined in the intel
 668     * optimization manual.
 669     * Note: could cause data rearrangement if used with smaller element
 670     * sizes.
 671     */
 672
 673    vecres = a;
 674    length = type.length / 2;
 675    while (length > 1) {
 676       LLVMValueRef vec1, vec2;
 677       for (i = 0; i < length; i++) {
 678          shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
 679          shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
 680       }
 681       vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
 682                                     LLVMConstVector(shuffles1, length), "");
 683       vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
 684                                     LLVMConstVector(shuffles2, length), "");
 685       if (type.floating) {
 686          vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
 687       }
 688       else {
 689          vecres = LLVMBuildAdd(builder, vec1, vec2, "");
 690       }
 691       length = length >> 1;
 692    }
 693
 694    /* always have vector of size 2 here */
 695    assert(length == 1);
 696
 697    index = lp_build_const_int32(bld->gallivm, 0);
 698    res = LLVMBuildExtractElement(builder, vecres, index, "");
 699    index = lp_build_const_int32(bld->gallivm, 1);
 700    elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
 701
 702    if (type.floating)
 703       res = LLVMBuildFAdd(builder, res, elem2, "");
 704     else
 705       res = LLVMBuildAdd(builder, res, elem2, "");
 706
 707    return res;
 708 }
 709
 710 /**
 711  * Return the horizontal sums of 4 float vectors as a float4 vector.
 712  * This uses the technique as outlined in Intel Optimization Manual.
 713  */
 714 static LLVMValueRef
 715 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
 716                             LLVMValueRef src[4])
 717 {
 718    struct gallivm_state *gallivm = bld->gallivm;
 719    LLVMBuilderRef builder = gallivm->builder;
 720    LLVMValueRef shuffles[4];
 721    LLVMValueRef tmp[4];
 722    LLVMValueRef sumtmp[2], shuftmp[2];
 723
 724    /* lower half of regs */
 725    shuffles[0] = lp_build_const_int32(gallivm, 0);
 726    shuffles[1] = lp_build_const_int32(gallivm, 1);
 727    shuffles[2] = lp_build_const_int32(gallivm, 4);
 728    shuffles[3] = lp_build_const_int32(gallivm, 5);
 729    tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
 730                                    LLVMConstVector(shuffles, 4), "");
 731    tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
 732                                    LLVMConstVector(shuffles, 4), "");
 733
 734    /* upper half of regs */
 735    shuffles[0] = lp_build_const_int32(gallivm, 2);
 736    shuffles[1] = lp_build_const_int32(gallivm, 3);
 737    shuffles[2] = lp_build_const_int32(gallivm, 6);
 738    shuffles[3] = lp_build_const_int32(gallivm, 7);
 739    tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
 740                                    LLVMConstVector(shuffles, 4), "");
 741    tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
 742                                    LLVMConstVector(shuffles, 4), "");
 743
 744    sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
 745    sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
 746
 747    shuffles[0] = lp_build_const_int32(gallivm, 0);
 748    shuffles[1] = lp_build_const_int32(gallivm, 2);
 749    shuffles[2] = lp_build_const_int32(gallivm, 4);
 750    shuffles[3] = lp_build_const_int32(gallivm, 6);
 751    shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
 752                                        LLVMConstVector(shuffles, 4), "");
 753
 754    shuffles[0] = lp_build_const_int32(gallivm, 1);
 755    shuffles[1] = lp_build_const_int32(gallivm, 3);
 756    shuffles[2] = lp_build_const_int32(gallivm, 5);
 757    shuffles[3] = lp_build_const_int32(gallivm, 7);
 758    shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
 759                                        LLVMConstVector(shuffles, 4), "");
 760
 761    return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
 762 }
 763
 764
 765 /*
 766  * partially horizontally add 2-4 float vectors with length nx4,
 767  * i.e. only four adjacent values in each vector will be added,
 768  * assuming values are really grouped in 4 which also determines
 769  * output order.
 770  *
 771  * Return a vector of the same length as the initial vectors,
 772  * with the excess elements (if any) being undefined.
 773  * The element order is independent of number of input vectors.
 774  * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
 775  * the output order thus will be
 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
 777  */
 778 LLVMValueRef
 779 lp_build_hadd_partial4(struct lp_build_context *bld,
 780                        LLVMValueRef vectors[],
 781                        unsigned num_vecs)
 782 {
 783    struct gallivm_state *gallivm = bld->gallivm;
 784    LLVMBuilderRef builder = gallivm->builder;
 785    LLVMValueRef ret_vec;
 786    LLVMValueRef tmp[4];
 787    const char *intrinsic = NULL;
 788
 789    assert(num_vecs >= 2 && num_vecs <= 4);
 790    assert(bld->type.floating);
 791
 792    /* only use this with at least 2 vectors, as it is sort of expensive
 793     * (depending on cpu) and we always need two horizontal adds anyway,
 794     * so a shuffle/add approach might be better.
 795     */
 796
 797    tmp[0] = vectors[0];
 798    tmp[1] = vectors[1];
 799
 800    tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
 801    tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
 802
 803    if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
 804        bld->type.length == 4) {
 805       intrinsic = "llvm.x86.sse3.hadd.ps";
 806    }
 807    else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
 808             bld->type.length == 8) {
 809       intrinsic = "llvm.x86.avx.hadd.ps.256";
 810    }
 811    if (intrinsic) {
 812       tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
 813                                        lp_build_vec_type(gallivm, bld->type),
 814                                        tmp[0], tmp[1]);
 815       if (num_vecs > 2) {
 816          tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
 817                                           lp_build_vec_type(gallivm, bld->type),
 818                                           tmp[2], tmp[3]);
 819       }
 820       else {
 821          tmp[1] = tmp[0];
 822       }
 823       return lp_build_intrinsic_binary(builder, intrinsic,
 824                                        lp_build_vec_type(gallivm, bld->type),
 825                                        tmp[0], tmp[1]);
 826    }
 827
 828    if (bld->type.length == 4) {
 829       ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
 830    }
 831    else {
 832       LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
 833       unsigned j;
 834       unsigned num_iter = bld->type.length / 4;
 835       struct lp_type parttype = bld->type;
 836       parttype.length = 4;
 837       for (j = 0; j < num_iter; j++) {
 838          LLVMValueRef partsrc[4];
 839          unsigned i;
 840          for (i = 0; i < 4; i++) {
 841             partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
 842          }
 843          partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
 844       }
 845       ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
 846    }
 847    return ret_vec;
 848 }
 849
 850 /**
 851  * Generate a - b
 852  */
 853 LLVMValueRef
 854 lp_build_sub(struct lp_build_context *bld,
 855              LLVMValueRef a,
 856              LLVMValueRef b)
 857 {
 858    LLVMBuilderRef builder = bld->gallivm->builder;
 859    const struct lp_type type = bld->type;
 860    LLVMValueRef res;
 861
 862    assert(lp_check_value(type, a));
 863    assert(lp_check_value(type, b));
 864
 865    if (b == bld->zero)
 866       return a;
 867    if (a == bld->undef || b == bld->undef)
 868       return bld->undef;
 869    if (a == b)
 870       return bld->zero;
 871
 872    if (type.norm) {
 873       const char *intrinsic = NULL;
 874
 875       if (!type.sign && b == bld->one)
 876         return bld->zero;
 877
 878       if (!type.floating && !type.fixed) {
 879          if (type.width * type.length == 128) {
 880             if (util_cpu_caps.has_sse2) {
 881                if (type.width == 8)
 882                   intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" :
 883                                           HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.psubus.b" : NULL;
 884                if (type.width == 16)
 885                   intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" :
 886                                           HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.psubus.w" : NULL;
 887             } else if (util_cpu_caps.has_altivec) {
 888                if (type.width == 8)
 889                   intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
 890                if (type.width == 16)
 891                   intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
 892             }
 893          }
 894          if (type.width * type.length == 256) {
 895             if (util_cpu_caps.has_avx2) {
 896                if (type.width == 8)
 897                   intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" :
 898                                           HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.psubus.b" : NULL;
 899                if (type.width == 16)
 900                   intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" :
 901                                           HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.psubus.w" : NULL;
 902             }
 903          }
 904       }
 905
 906       if (intrinsic)
 907          return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
 908    }
 909
 910    if(type.norm && !type.floating && !type.fixed) {
 911       if (type.sign) {
 912          uint64_t sign = (uint64_t)1 << (type.width - 1);
 913          LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
 914          LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
 915          /* a_clamp_max is the maximum a for negative b,
 916             a_clamp_min is the minimum a for positive b. */
 917          LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
 918          LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
 919          a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
 920       } else {
 921          /*
 922           * This must match llvm pattern for saturated unsigned sub.
 923           * (lp_build_max_simple actually does the job with its current
 924           * definition but do it explicitly here.)
 925           * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
 926           * interfere with llvm's ability to recognize the pattern but seems
 927           * a bit brittle.
 928           */
 929          LLVMValueRef no_ov = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
 930          a = lp_build_select(bld, no_ov, a, b);
 931       }
 932    }
 933
 934    if(LLVMIsConstant(a) && LLVMIsConstant(b))
 935       if (type.floating)
 936          res = LLVMConstFSub(a, b);
 937       else
 938          res = LLVMConstSub(a, b);
 939    else
 940       if (type.floating)
 941          res = LLVMBuildFSub(builder, a, b, "");
 942       else
 943          res = LLVMBuildSub(builder, a, b, "");
 944
 945    if(bld->type.norm && (bld->type.floating || bld->type.fixed))
 946       res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
 947
 948    return res;
 949 }
 950
 951
 952
 953 /**
 954  * Normalized multiplication.
 955  *
 956  * There are several approaches for (using 8-bit normalized multiplication as
 957  * an example):
 958  *
 959  * - alpha plus one
 960  *
 961  *     makes the following approximation to the division (Sree)
 962  *
 *       a*b/255 ~= (a*(b + 1)) >> 8
 964  *
 965  *     which is the fastest method that satisfies the following OpenGL criteria of
 966  *
 967  *       0*0 = 0 and 255*255 = 255
 968  *
 969  * - geometric series
 970  *
 971  *     takes the geometric series approximation to the division
 972  *
 973  *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
 974  *
 975  *     in this case just the first two terms to fit in 16bit arithmetic
 976  *
 977  *       t/255 ~= (t + (t >> 8)) >> 8
 978  *
 *     note that just by itself it doesn't satisfy the OpenGL criteria, as
 *     255*255 = 254, so the special case b = 255 must be accounted for or
 *     roundoff must be used.
 982  *
 983  * - geometric series plus rounding
 984  *
 985  *     when using a geometric series division instead of truncating the result
 986  *     use roundoff in the approximation (Jim Blinn)
 987  *
 988  *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
 989  *
 990  *     achieving the exact results.
 991  *
 992  *
 993  *
 994  * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
 995  *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
 996  * @sa Michael Herf, The "double blend trick", May 2000,
 997  *     http://www.stereopsis.com/doubleblend.html
 998  */
 999 LLVMValueRef
1000 lp_build_mul_norm(struct gallivm_state *gallivm,
1001                   struct lp_type wide_type,
1002                   LLVMValueRef a, LLVMValueRef b)
1003 {
1004    LLVMBuilderRef builder = gallivm->builder;
1005    struct lp_build_context bld;
1006    unsigned n;
1007    LLVMValueRef half;
1008    LLVMValueRef ab;
1009
1010    assert(!wide_type.floating);
1011    assert(lp_check_value(wide_type, a));
1012    assert(lp_check_value(wide_type, b));
1013
1014    lp_build_context_init(&bld, gallivm, wide_type);
1015
1016    n = wide_type.width / 2;
1017    if (wide_type.sign) {
1018       --n;
1019    }
1020
1021    /*
1022     * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
1023     * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
1024     */
1025
1026    /*
1027     * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
1028     */
1029
1030    ab = LLVMBuildMul(builder, a, b, "");
1031    ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
1032
1033    /*
1034     * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
1035     */
1036
1037    half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
1038    if (wide_type.sign) {
1039       LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
1040       LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
1041       half = lp_build_select(&bld, sign, minus_half, half);
1042    }
1043    ab = LLVMBuildAdd(builder, ab, half, "");
1044
1045    /* Final division */
1046    ab = lp_build_shr_imm(&bld, ab, n);
1047
1048    return ab;
1049 }
1050
1051 /**
1052  * Generate a * b
1053  */
1054 LLVMValueRef
1055 lp_build_mul(struct lp_build_context *bld,
1056              LLVMValueRef a,
1057              LLVMValueRef b)
1058 {
1059    LLVMBuilderRef builder = bld->gallivm->builder;
1060    const struct lp_type type = bld->type;
1061    LLVMValueRef shift;
1062    LLVMValueRef res;
1063
1064    assert(lp_check_value(type, a));
1065    assert(lp_check_value(type, b));
1066
1067    if(a == bld->zero)
1068       return bld->zero;
1069    if(a == bld->one)
1070       return b;
1071    if(b == bld->zero)
1072       return bld->zero;
1073    if(b == bld->one)
1074       return a;
1075    if(a == bld->undef || b == bld->undef)
1076       return bld->undef;
1077
1078    if (!type.floating && !type.fixed && type.norm) {
1079       struct lp_type wide_type = lp_wider_type(type);
1080       LLVMValueRef al, ah, bl, bh, abl, abh, ab;
1081
1082       lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah);
1083       lp_build_unpack2_native(bld->gallivm, type, wide_type, b, &bl, &bh);
1084
1085       /* PMULLW, PSRLW, PADDW */
1086       abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
1087       abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
1088
1089       ab = lp_build_pack2_native(bld->gallivm, wide_type, type, abl, abh);
1090
1091       return ab;
1092    }
1093
1094    if(type.fixed)
1095       shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
1096    else
1097       shift = NULL;
1098
1099    if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1100       if (type.floating)
1101          res = LLVMConstFMul(a, b);
1102       else
1103          res = LLVMConstMul(a, b);
1104       if(shift) {
1105          if(type.sign)
1106             res = LLVMConstAShr(res, shift);
1107          else
1108             res = LLVMConstLShr(res, shift);
1109       }
1110    }
1111    else {
1112       if (type.floating)
1113          res = LLVMBuildFMul(builder, a, b, "");
1114       else
1115          res = LLVMBuildMul(builder, a, b, "");
1116       if(shift) {
1117          if(type.sign)
1118             res = LLVMBuildAShr(builder, res, shift, "");
1119          else
1120             res = LLVMBuildLShr(builder, res, shift, "");
1121       }
1122    }
1123
1124    return res;
1125 }
1126
1127 /*
1128  * Widening mul, valid for 32x32 bit -> 64bit only.
1129  * Result is low 32bits, high bits returned in res_hi.
1130  *
1131  * Emits code that is meant to be compiled for the host CPU.
1132  */
1133 LLVMValueRef
1134 lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
1135                          LLVMValueRef a,
1136                          LLVMValueRef b,
1137                          LLVMValueRef *res_hi)
1138 {
1139    struct gallivm_state *gallivm = bld->gallivm;
1140    LLVMBuilderRef builder = gallivm->builder;
1141
1142    assert(bld->type.width == 32);
1143    assert(bld->type.floating == 0);
1144    assert(bld->type.fixed == 0);
1145    assert(bld->type.norm == 0);
1146
1147    /*
1148     * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces
1149     * for x86 simd is atrocious (even if the high bits weren't required),
1150     * trying to handle real 64bit inputs (which of course can't happen due
1151     * to using 64bit umul with 32bit numbers zero-extended to 64bit, but
1152     * apparently llvm does not recognize this widening mul). This includes 6
1153     * (instead of 2) pmuludq plus extra adds and shifts
1154     * The same story applies to signed mul, albeit fixing this requires sse41.
1155     * https://llvm.org/bugs/show_bug.cgi?id=30845
1156     * So, whip up our own code, albeit only for length 4 and 8 (which
1157     * should be good enough)...
1158     */
1159    if ((bld->type.length == 4 || bld->type.length == 8) &&
1160        ((util_cpu_caps.has_sse2 && (bld->type.sign == 0)) ||
1161         util_cpu_caps.has_sse4_1)) {
1162       const char *intrinsic = NULL;
1163       LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd;
1164       LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec;
1165       struct lp_type type_wide = lp_wider_type(bld->type);
1166       LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide);
1167       unsigned i;
1168       for (i = 0; i < bld->type.length; i += 2) {
1169          shuf[i] = lp_build_const_int32(gallivm, i+1);
1170          shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
1171       }
1172       shuf_vec = LLVMConstVector(shuf, bld->type.length);
1173       aeven = a;
1174       beven = b;
1175       aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, "");
1176       bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, "");
1177
1178       if (util_cpu_caps.has_avx2 && bld->type.length == 8) {
1179          if (bld->type.sign) {
1180             intrinsic = "llvm.x86.avx2.pmul.dq";
1181          } else {
1182             intrinsic = "llvm.x86.avx2.pmulu.dq";
1183          }
1184          muleven = lp_build_intrinsic_binary(builder, intrinsic,
1185                                              wider_type, aeven, beven);
1186          mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1187                                             wider_type, aodd, bodd);
1188       }
1189       else {
1190          /* for consistent naming look elsewhere... */
1191          if (bld->type.sign) {
1192             intrinsic = "llvm.x86.sse41.pmuldq";
1193          } else {
1194             intrinsic = "llvm.x86.sse2.pmulu.dq";
1195          }
1196          /*
1197           * XXX If we only have AVX but not AVX2 this is a pain.
1198           * lp_build_intrinsic_binary_anylength() can't handle it
1199           * (due to src and dst type not being identical).
1200           */
1201          if (bld->type.length == 8) {
1202             LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi;
1203             LLVMValueRef aoddlo, aoddhi, boddlo, boddhi;
1204             LLVMValueRef muleven2[2], mulodd2[2];
1205             struct lp_type type_wide_half = type_wide;
1206             LLVMTypeRef wtype_half;
1207             type_wide_half.length = 2;
1208             wtype_half = lp_build_vec_type(gallivm, type_wide_half);
1209             aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4);
1210             aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4);
1211             bevenlo = lp_build_extract_range(gallivm, beven, 0, 4);
1212             bevenhi = lp_build_extract_range(gallivm, beven, 4, 4);
1213             aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4);
1214             aoddhi = lp_build_extract_range(gallivm, aodd, 4, 4);
1215             boddlo = lp_build_extract_range(gallivm, bodd, 0, 4);
1216             boddhi = lp_build_extract_range(gallivm, bodd, 4, 4);
1217             muleven2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1218                                                     wtype_half, aevenlo, bevenlo);
1219             mulodd2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1220                                                    wtype_half, aoddlo, boddlo);
1221             muleven2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1222                                                     wtype_half, aevenhi, bevenhi);
1223             mulodd2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1224                                                    wtype_half, aoddhi, boddhi);
1225             muleven = lp_build_concat(gallivm, muleven2, type_wide_half, 2);
1226             mulodd = lp_build_concat(gallivm, mulodd2, type_wide_half, 2);
1227
1228          }
1229          else {
1230             muleven = lp_build_intrinsic_binary(builder, intrinsic,
1231                                                 wider_type, aeven, beven);
1232             mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1233                                                wider_type, aodd, bodd);
1234          }
1235       }
1236       muleven = LLVMBuildBitCast(builder, muleven, bld->vec_type, "");
1237       mulodd = LLVMBuildBitCast(builder, mulodd, bld->vec_type, "");
1238
1239       for (i = 0; i < bld->type.length; i += 2) {
1240          shuf[i] = lp_build_const_int32(gallivm, i + 1);
1241          shuf[i+1] = lp_build_const_int32(gallivm, i + 1 + bld->type.length);
1242       }
1243       shuf_vec = LLVMConstVector(shuf, bld->type.length);
1244       *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1245
1246       for (i = 0; i < bld->type.length; i += 2) {
1247          shuf[i] = lp_build_const_int32(gallivm, i);
1248          shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length);
1249       }
1250       shuf_vec = LLVMConstVector(shuf, bld->type.length);
1251       return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1252    }
1253    else {
1254       return lp_build_mul_32_lohi(bld, a, b, res_hi);
1255    }
1256 }
1257
1258
1259 /*
1260  * Widening mul, valid for 32x32 bit -> 64bit only.
1261  * Result is low 32bits, high bits returned in res_hi.
1262  *
1263  * Emits generic code.
1264  */
1265 LLVMValueRef
1266 lp_build_mul_32_lohi(struct lp_build_context *bld,
1267                      LLVMValueRef a,
1268                      LLVMValueRef b,
1269                      LLVMValueRef *res_hi)
1270 {
1271    struct gallivm_state *gallivm = bld->gallivm;
1272    LLVMBuilderRef builder = gallivm->builder;
1273    LLVMValueRef tmp, shift, res_lo;
1274    struct lp_type type_tmp;
1275    LLVMTypeRef wide_type, narrow_type;
1276
1277    type_tmp = bld->type;
1278    narrow_type = lp_build_vec_type(gallivm, type_tmp);
1279    type_tmp.width *= 2;
1280    wide_type = lp_build_vec_type(gallivm, type_tmp);
1281    shift = lp_build_const_vec(gallivm, type_tmp, 32);
1282
1283    if (bld->type.sign) {
1284       a = LLVMBuildSExt(builder, a, wide_type, "");
1285       b = LLVMBuildSExt(builder, b, wide_type, "");
1286    } else {
1287       a = LLVMBuildZExt(builder, a, wide_type, "");
1288       b = LLVMBuildZExt(builder, b, wide_type, "");
1289    }
1290    tmp = LLVMBuildMul(builder, a, b, "");
1291
1292    res_lo = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1293
1294    /* Since we truncate anyway, LShr and AShr are equivalent. */
1295    tmp = LLVMBuildLShr(builder, tmp, shift, "");
1296    *res_hi = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1297
1298    return res_lo;
1299 }
1300
1301
1302 /* a * b + c */
1303 LLVMValueRef
1304 lp_build_mad(struct lp_build_context *bld,
1305              LLVMValueRef a,
1306              LLVMValueRef b,
1307              LLVMValueRef c)
1308 {
1309    const struct lp_type type = bld->type;
1310    if (type.floating) {
1311       return lp_build_fmuladd(bld->gallivm->builder, a, b, c);
1312    } else {
1313       return lp_build_add(bld, lp_build_mul(bld, a, b), c);
1314    }
1315 }
1316
1317
1318 /**
1319  * Small vector x scale multiplication optimization.
1320  */
1321 LLVMValueRef
1322 lp_build_mul_imm(struct lp_build_context *bld,
1323                  LLVMValueRef a,
1324                  int b)
1325 {
1326    LLVMBuilderRef builder = bld->gallivm->builder;
1327    LLVMValueRef factor;
1328
1329    assert(lp_check_value(bld->type, a));
1330
1331    if(b == 0)
1332       return bld->zero;
1333
1334    if(b == 1)
1335       return a;
1336
1337    if(b == -1)
1338       return lp_build_negate(bld, a);
1339
1340    if(b == 2 && bld->type.floating)
1341       return lp_build_add(bld, a, a);
1342
1343    if(util_is_power_of_two_or_zero(b)) {
1344       unsigned shift = ffs(b) - 1;
1345
1346       if(bld->type.floating) {
1347 #if 0
1348          /*
1349           * Power of two multiplication by directly manipulating the exponent.
1350           *
1351           * XXX: This might not be always faster, it will introduce a small error
1352           * for multiplication by zero, and it will produce wrong results
1353           * for Inf and NaN.
1354           */
1355          unsigned mantissa = lp_mantissa(bld->type);
1356          factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1357          a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1358          a = LLVMBuildAdd(builder, a, factor, "");
1359          a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1360          return a;
1361 #endif
1362       }
1363       else {
1364          factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1365          return LLVMBuildShl(builder, a, factor, "");
1366       }
1367    }
1368
1369    factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1370    return lp_build_mul(bld, a, factor);
1371 }
1372
1373
1374 /**
1375  * Generate a / b
1376  */
1377 LLVMValueRef
1378 lp_build_div(struct lp_build_context *bld,
1379              LLVMValueRef a,
1380              LLVMValueRef b)
1381 {
1382    LLVMBuilderRef builder = bld->gallivm->builder;
1383    const struct lp_type type = bld->type;
1384
1385    assert(lp_check_value(type, a));
1386    assert(lp_check_value(type, b));
1387
1388    if(a == bld->zero)
1389       return bld->zero;
1390    if(a == bld->one && type.floating)
1391       return lp_build_rcp(bld, b);
1392    if(b == bld->zero)
1393       return bld->undef;
1394    if(b == bld->one)
1395       return a;
1396    if(a == bld->undef || b == bld->undef)
1397       return bld->undef;
1398
1399    if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1400       if (type.floating)
1401          return LLVMConstFDiv(a, b);
1402       else if (type.sign)
1403          return LLVMConstSDiv(a, b);
1404       else
1405          return LLVMConstUDiv(a, b);
1406    }
1407
1408    /* fast rcp is disabled (just uses div), so makes no sense to try that */
1409    if(FALSE &&
1410       ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1411        (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
1412       type.floating)
1413       return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1414
1415    if (type.floating)
1416       return LLVMBuildFDiv(builder, a, b, "");
1417    else if (type.sign)
1418       return LLVMBuildSDiv(builder, a, b, "");
1419    else
1420       return LLVMBuildUDiv(builder, a, b, "");
1421 }
1422
1423
1424 /**
1425  * Linear interpolation helper.
1426  *
1427  * @param normalized whether we are interpolating normalized values,
1428  *        encoded in normalized integers, twice as wide.
1429  *
1430  * @sa http://www.stereopsis.com/doubleblend.html
1431  */
1432 static inline LLVMValueRef
1433 lp_build_lerp_simple(struct lp_build_context *bld,
1434                      LLVMValueRef x,
1435                      LLVMValueRef v0,
1436                      LLVMValueRef v1,
1437                      unsigned flags)
1438 {
1439    unsigned half_width = bld->type.width/2;
1440    LLVMBuilderRef builder = bld->gallivm->builder;
1441    LLVMValueRef delta;
1442    LLVMValueRef res;
1443
1444    assert(lp_check_value(bld->type, x));
1445    assert(lp_check_value(bld->type, v0));
1446    assert(lp_check_value(bld->type, v1));
1447
1448    delta = lp_build_sub(bld, v1, v0);
1449
1450    if (bld->type.floating) {
1451       assert(flags == 0);
1452       return lp_build_mad(bld, x, delta, v0);
1453    }
1454
1455    if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1456       if (!bld->type.sign) {
1457          if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1458             /*
1459              * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1460              * most-significant-bit to the lowest-significant-bit, so that
1461              * later we can just divide by 2**n instead of 2**n - 1.
1462              */
1463
1464             x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1465          }
1466
1467          /* (x * delta) >> n */
1468          res = lp_build_mul(bld, x, delta);
1469          res = lp_build_shr_imm(bld, res, half_width);
1470       } else {
1471          /*
1472           * The rescaling trick above doesn't work for signed numbers, so
1473           * use the 2**n - 1 divison approximation in lp_build_mul_norm
1474           * instead.
1475           */
1476          assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1477          res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1478       }
1479    } else {
1480       assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1481       res = lp_build_mul(bld, x, delta);
1482    }
1483
1484    if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
1485       /*
1486        * At this point both res and v0 only use the lower half of the bits,
1487        * the rest is zero. Instead of add / mask, do add with half wide type.
1488        */
1489       struct lp_type narrow_type;
1490       struct lp_build_context narrow_bld;
1491
1492       memset(&narrow_type, 0, sizeof narrow_type);
1493       narrow_type.sign   = bld->type.sign;
1494       narrow_type.width  = bld->type.width/2;
1495       narrow_type.length = bld->type.length*2;
1496
1497       lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
1498       res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
1499       v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
1500       res = lp_build_add(&narrow_bld, v0, res);
1501       res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
1502    } else {
1503       res = lp_build_add(bld, v0, res);
1504
1505       if (bld->type.fixed) {
1506          /*
1507           * We need to mask out the high order bits when lerping 8bit
1508           * normalized colors stored on 16bits
1509           */
1510          /* XXX: This step is necessary for lerping 8bit colors stored on
1511           * 16bits, but it will be wrong for true fixed point use cases.
1512           * Basically we need a more powerful lp_type, capable of further
1513           * distinguishing the values interpretation from the value storage.
1514           */
1515          LLVMValueRef low_bits;
1516          low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
1517          res = LLVMBuildAnd(builder, res, low_bits, "");
1518       }
1519    }
1520
1521    return res;
1522 }
1523
1524
1525 /**
1526  * Linear interpolation.
1527  */
1528 LLVMValueRef
1529 lp_build_lerp(struct lp_build_context *bld,
1530               LLVMValueRef x,
1531               LLVMValueRef v0,
1532               LLVMValueRef v1,
1533               unsigned flags)
1534 {
1535    const struct lp_type type = bld->type;
1536    LLVMValueRef res;
1537
1538    assert(lp_check_value(type, x));
1539    assert(lp_check_value(type, v0));
1540    assert(lp_check_value(type, v1));
1541
1542    assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1543
1544    if (type.norm) {
1545       struct lp_type wide_type;
1546       struct lp_build_context wide_bld;
1547       LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1548
1549       assert(type.length >= 2);
1550
1551       /*
1552        * Create a wider integer type, enough to hold the
1553        * intermediate result of the multiplication.
1554        */
1555       memset(&wide_type, 0, sizeof wide_type);
1556       wide_type.sign   = type.sign;
1557       wide_type.width  = type.width*2;
1558       wide_type.length = type.length/2;
1559
1560       lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1561
1562       lp_build_unpack2_native(bld->gallivm, type, wide_type, x,  &xl,  &xh);
1563       lp_build_unpack2_native(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1564       lp_build_unpack2_native(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1565
1566       /*
1567        * Lerp both halves.
1568        */
1569
1570       flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1571
1572       resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1573       resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1574
1575       res = lp_build_pack2_native(bld->gallivm, wide_type, type, resl, resh);
1576    } else {
1577       res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1578    }
1579
1580    return res;
1581 }
1582
1583
1584 /**
1585  * Bilinear interpolation.
1586  *
1587  * Values indices are in v_{yx}.
1588  */
1589 LLVMValueRef
1590 lp_build_lerp_2d(struct lp_build_context *bld,
1591                  LLVMValueRef x,
1592                  LLVMValueRef y,
1593                  LLVMValueRef v00,
1594                  LLVMValueRef v01,
1595                  LLVMValueRef v10,
1596                  LLVMValueRef v11,
1597                  unsigned flags)
1598 {
1599    LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1600    LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1601    return lp_build_lerp(bld, y, v0, v1, flags);
1602 }
1603
1604
1605 LLVMValueRef
1606 lp_build_lerp_3d(struct lp_build_context *bld,
1607                  LLVMValueRef x,
1608                  LLVMValueRef y,
1609                  LLVMValueRef z,
1610                  LLVMValueRef v000,
1611                  LLVMValueRef v001,
1612                  LLVMValueRef v010,
1613                  LLVMValueRef v011,
1614                  LLVMValueRef v100,
1615                  LLVMValueRef v101,
1616                  LLVMValueRef v110,
1617                  LLVMValueRef v111,
1618                  unsigned flags)
1619 {
1620    LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1621    LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1622    return lp_build_lerp(bld, z, v0, v1, flags);
1623 }
1624
1625
1626 /**
1627  * Generate min(a, b)
1628  * Do checks for special cases but not for nans.
1629  */
1630 LLVMValueRef
1631 lp_build_min(struct lp_build_context *bld,
1632              LLVMValueRef a,
1633              LLVMValueRef b)
1634 {
1635    assert(lp_check_value(bld->type, a));
1636    assert(lp_check_value(bld->type, b));
1637
1638    if(a == bld->undef || b == bld->undef)
1639       return bld->undef;
1640
1641    if(a == b)
1642       return a;
1643
1644    if (bld->type.norm) {
1645       if (!bld->type.sign) {
1646          if (a == bld->zero || b == bld->zero) {
1647             return bld->zero;
1648          }
1649       }
1650       if(a == bld->one)
1651          return b;
1652       if(b == bld->one)
1653          return a;
1654    }
1655
1656    return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1657 }
1658
1659
1660 /**
1661  * Generate min(a, b)
1662  * NaN's are handled according to the behavior specified by the
1663  * nan_behavior argument.
1664  */
1665 LLVMValueRef
1666 lp_build_min_ext(struct lp_build_context *bld,
1667                  LLVMValueRef a,
1668                  LLVMValueRef b,
1669                  enum gallivm_nan_behavior nan_behavior)
1670 {
1671    assert(lp_check_value(bld->type, a));
1672    assert(lp_check_value(bld->type, b));
1673
1674    if(a == bld->undef || b == bld->undef)
1675       return bld->undef;
1676
1677    if(a == b)
1678       return a;
1679
1680    if (bld->type.norm) {
1681       if (!bld->type.sign) {
1682          if (a == bld->zero || b == bld->zero) {
1683             return bld->zero;
1684          }
1685       }
1686       if(a == bld->one)
1687          return b;
1688       if(b == bld->one)
1689          return a;
1690    }
1691
1692    return lp_build_min_simple(bld, a, b, nan_behavior);
1693 }
1694
1695 /**
1696  * Generate max(a, b)
1697  * Do checks for special cases, but NaN behavior is undefined.
1698  */
1699 LLVMValueRef
1700 lp_build_max(struct lp_build_context *bld,
1701              LLVMValueRef a,
1702              LLVMValueRef b)
1703 {
1704    assert(lp_check_value(bld->type, a));
1705    assert(lp_check_value(bld->type, b));
1706
1707    if(a == bld->undef || b == bld->undef)
1708       return bld->undef;
1709
1710    if(a == b)
1711       return a;
1712
1713    if(bld->type.norm) {
1714       if(a == bld->one || b == bld->one)
1715          return bld->one;
1716       if (!bld->type.sign) {
1717          if (a == bld->zero) {
1718             return b;
1719          }
1720          if (b == bld->zero) {
1721             return a;
1722          }
1723       }
1724    }
1725
1726    return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1727 }
1728
1729
1730 /**
1731  * Generate max(a, b)
1732  * Checks for special cases.
1733  * NaN's are handled according to the behavior specified by the
1734  * nan_behavior argument.
1735  */
1736 LLVMValueRef
1737 lp_build_max_ext(struct lp_build_context *bld,
1738                   LLVMValueRef a,
1739                   LLVMValueRef b,
1740                   enum gallivm_nan_behavior nan_behavior)
1741 {
1742    assert(lp_check_value(bld->type, a));
1743    assert(lp_check_value(bld->type, b));
1744
1745    if(a == bld->undef || b == bld->undef)
1746       return bld->undef;
1747
1748    if(a == b)
1749       return a;
1750
1751    if(bld->type.norm) {
1752       if(a == bld->one || b == bld->one)
1753          return bld->one;
1754       if (!bld->type.sign) {
1755          if (a == bld->zero) {
1756             return b;
1757          }
1758          if (b == bld->zero) {
1759             return a;
1760          }
1761       }
1762    }
1763
1764    return lp_build_max_simple(bld, a, b, nan_behavior);
1765 }
1766
1767 /**
1768  * Generate clamp(a, min, max)
1769  * NaN behavior (for any of a, min, max) is undefined.
1770  * Do checks for special cases.
1771  */
1772 LLVMValueRef
1773 lp_build_clamp(struct lp_build_context *bld,
1774                LLVMValueRef a,
1775                LLVMValueRef min,
1776                LLVMValueRef max)
1777 {
1778    assert(lp_check_value(bld->type, a));
1779    assert(lp_check_value(bld->type, min));
1780    assert(lp_check_value(bld->type, max));
1781
1782    a = lp_build_min(bld, a, max);
1783    a = lp_build_max(bld, a, min);
1784    return a;
1785 }
1786
1787
1788 /**
1789  * Generate clamp(a, 0, 1)
1790  * A NaN will get converted to zero.
1791  */
1792 LLVMValueRef
1793 lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
1794                                 LLVMValueRef a)
1795 {
1796    a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1797    a = lp_build_min(bld, a, bld->one);
1798    return a;
1799 }
1800
1801
1802 /**
1803  * Generate abs(a)
1804  */
1805 LLVMValueRef
1806 lp_build_abs(struct lp_build_context *bld,
1807              LLVMValueRef a)
1808 {
1809    LLVMBuilderRef builder = bld->gallivm->builder;
1810    const struct lp_type type = bld->type;
1811    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1812
1813    assert(lp_check_value(type, a));
1814
1815    if(!type.sign)
1816       return a;
1817
1818    if(type.floating) {
1819       if (0x0306 <= HAVE_LLVM && HAVE_LLVM < 0x0309) {
1820          /* Workaround llvm.org/PR27332 */
1821          LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1822          unsigned long long absMask = ~(1ULL << (type.width - 1));
1823          LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
1824          a = LLVMBuildBitCast(builder, a, int_vec_type, "");
1825          a = LLVMBuildAnd(builder, a, mask, "");
1826          a = LLVMBuildBitCast(builder, a, vec_type, "");
1827          return a;
1828       } else {
1829          char intrinsic[32];
1830          lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
1831          return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
1832       }
1833    }
1834
1835    if(type.width*type.length == 128 && util_cpu_caps.has_ssse3 && HAVE_LLVM < 0x0600) {
1836       switch(type.width) {
1837       case 8:
1838          return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1839       case 16:
1840          return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1841       case 32:
1842          return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1843       }
1844    }
1845    else if (type.width*type.length == 256 && util_cpu_caps.has_avx2 && HAVE_LLVM < 0x0600) {
1846       switch(type.width) {
1847       case 8:
1848          return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a);
1849       case 16:
1850          return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.w", vec_type, a);
1851       case 32:
1852          return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.d", vec_type, a);
1853       }
1854    }
1855
1856    return lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero),
1857                           a, LLVMBuildNeg(builder, a, ""));
1858 }
1859
1860
1861 LLVMValueRef
1862 lp_build_negate(struct lp_build_context *bld,
1863                 LLVMValueRef a)
1864 {
1865    LLVMBuilderRef builder = bld->gallivm->builder;
1866
1867    assert(lp_check_value(bld->type, a));
1868
1869    if (bld->type.floating)
1870       a = LLVMBuildFNeg(builder, a, "");
1871    else
1872       a = LLVMBuildNeg(builder, a, "");
1873
1874    return a;
1875 }
1876
1877
1878 /** Return -1, 0 or +1 depending on the sign of a */
1879 LLVMValueRef
1880 lp_build_sgn(struct lp_build_context *bld,
1881              LLVMValueRef a)
1882 {
1883    LLVMBuilderRef builder = bld->gallivm->builder;
1884    const struct lp_type type = bld->type;
1885    LLVMValueRef cond;
1886    LLVMValueRef res;
1887
1888    assert(lp_check_value(type, a));
1889
1890    /* Handle non-zero case */
1891    if(!type.sign) {
1892       /* if not zero then sign must be positive */
1893       res = bld->one;
1894    }
1895    else if(type.floating) {
1896       LLVMTypeRef vec_type;
1897       LLVMTypeRef int_type;
1898       LLVMValueRef mask;
1899       LLVMValueRef sign;
1900       LLVMValueRef one;
1901       unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1902
1903       int_type = lp_build_int_vec_type(bld->gallivm, type);
1904       vec_type = lp_build_vec_type(bld->gallivm, type);
1905       mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1906
1907       /* Take the sign bit and add it to 1 constant */
1908       sign = LLVMBuildBitCast(builder, a, int_type, "");
1909       sign = LLVMBuildAnd(builder, sign, mask, "");
1910       one = LLVMConstBitCast(bld->one, int_type);
1911       res = LLVMBuildOr(builder, sign, one, "");
1912       res = LLVMBuildBitCast(builder, res, vec_type, "");
1913    }
1914    else
1915    {
1916       /* signed int/norm/fixed point */
1917       /* could use psign with sse3 and appropriate vectors here */
1918       LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1919       cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1920       res = lp_build_select(bld, cond, bld->one, minus_one);
1921    }
1922
1923    /* Handle zero */
1924    cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1925    res = lp_build_select(bld, cond, bld->zero, res);
1926
1927    return res;
1928 }
1929
1930
1931 /**
1932  * Set the sign of float vector 'a' according to 'sign'.
1933  * If sign==0, return abs(a).
1934  * If sign==1, return -abs(a);
1935  * Other values for sign produce undefined results.
1936  */
1937 LLVMValueRef
1938 lp_build_set_sign(struct lp_build_context *bld,
1939                   LLVMValueRef a, LLVMValueRef sign)
1940 {
1941    LLVMBuilderRef builder = bld->gallivm->builder;
1942    const struct lp_type type = bld->type;
1943    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1944    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1945    LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1946    LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1947                              ~((unsigned long long) 1 << (type.width - 1)));
1948    LLVMValueRef val, res;
1949
1950    assert(type.floating);
1951    assert(lp_check_value(type, a));
1952
1953    /* val = reinterpret_cast<int>(a) */
1954    val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1955    /* val = val & mask */
1956    val = LLVMBuildAnd(builder, val, mask, "");
1957    /* sign = sign << shift */
1958    sign = LLVMBuildShl(builder, sign, shift, "");
1959    /* res = val | sign */
1960    res = LLVMBuildOr(builder, val, sign, "");
1961    /* res = reinterpret_cast<float>(res) */
1962    res = LLVMBuildBitCast(builder, res, vec_type, "");
1963
1964    return res;
1965 }
1966
1967
1968 /**
1969  * Convert vector of (or scalar) int to vector of (or scalar) float.
1970  */
1971 LLVMValueRef
1972 lp_build_int_to_float(struct lp_build_context *bld,
1973                       LLVMValueRef a)
1974 {
1975    LLVMBuilderRef builder = bld->gallivm->builder;
1976    const struct lp_type type = bld->type;
1977    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1978
1979    assert(type.floating);
1980
1981    return LLVMBuildSIToFP(builder, a, vec_type, "");
1982 }
1983
1984 static boolean
1985 arch_rounding_available(const struct lp_type type)
1986 {
1987    if ((util_cpu_caps.has_sse4_1 &&
1988        (type.length == 1 || type.width*type.length == 128)) ||
1989        (util_cpu_caps.has_avx && type.width*type.length == 256) ||
1990        (util_cpu_caps.has_avx512f && type.width*type.length == 512))
1991       return TRUE;
1992    else if ((util_cpu_caps.has_altivec &&
1993             (type.width == 32 && type.length == 4)))
1994       return TRUE;
1995    else if (util_cpu_caps.has_neon)
1996       return TRUE;
1997
1998    return FALSE;
1999 }
2000
/**
 * Rounding modes accepted by lp_build_round_arch() and
 * lp_build_round_altivec().
 */
enum lp_build_round_mode {
   LP_BUILD_ROUND_NEAREST = 0,   /* maps to llvm.nearbyint / vrfin */
   LP_BUILD_ROUND_FLOOR = 1,     /* maps to llvm.floor / vrfim */
   LP_BUILD_ROUND_CEIL = 2,      /* maps to llvm.ceil / vrfip */
   LP_BUILD_ROUND_TRUNCATE = 3   /* maps to llvm.trunc / vrfiz */
};
2008
2009 static inline LLVMValueRef
2010 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
2011                              LLVMValueRef a)
2012 {
2013    LLVMBuilderRef builder = bld->gallivm->builder;
2014    const struct lp_type type = bld->type;
2015    LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
2016    LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
2017    const char *intrinsic;
2018    LLVMValueRef res;
2019
2020    assert(type.floating);
2021    /* using the double precision conversions is a bit more complicated */
2022    assert(type.width == 32);
2023
2024    assert(lp_check_value(type, a));
2025    assert(util_cpu_caps.has_sse2);
2026
2027    /* This is relying on MXCSR rounding mode, which should always be nearest. */
2028    if (type.length == 1) {
2029       LLVMTypeRef vec_type;
2030       LLVMValueRef undef;
2031       LLVMValueRef arg;
2032       LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
2033
2034       vec_type = LLVMVectorType(bld->elem_type, 4);
2035
2036       intrinsic = "llvm.x86.sse.cvtss2si";
2037
2038       undef = LLVMGetUndef(vec_type);
2039
2040       arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
2041
2042       res = lp_build_intrinsic_unary(builder, intrinsic,
2043                                      ret_type, arg);
2044    }
2045    else {
2046       if (type.width* type.length == 128) {
2047          intrinsic = "llvm.x86.sse2.cvtps2dq";
2048       }
2049       else {
2050          assert(type.width*type.length == 256);
2051          assert(util_cpu_caps.has_avx);
2052
2053          intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
2054       }
2055       res = lp_build_intrinsic_unary(builder, intrinsic,
2056                                      ret_type, a);
2057    }
2058
2059    return res;
2060 }
2061
2062
2063 /*
2064  */
2065 static inline LLVMValueRef
2066 lp_build_round_altivec(struct lp_build_context *bld,
2067                        LLVMValueRef a,
2068                        enum lp_build_round_mode mode)
2069 {
2070    LLVMBuilderRef builder = bld->gallivm->builder;
2071    const struct lp_type type = bld->type;
2072    const char *intrinsic = NULL;
2073
2074    assert(type.floating);
2075
2076    assert(lp_check_value(type, a));
2077    assert(util_cpu_caps.has_altivec);
2078
2079    (void)type;
2080
2081    switch (mode) {
2082    case LP_BUILD_ROUND_NEAREST:
2083       intrinsic = "llvm.ppc.altivec.vrfin";
2084       break;
2085    case LP_BUILD_ROUND_FLOOR:
2086       intrinsic = "llvm.ppc.altivec.vrfim";
2087       break;
2088    case LP_BUILD_ROUND_CEIL:
2089       intrinsic = "llvm.ppc.altivec.vrfip";
2090       break;
2091    case LP_BUILD_ROUND_TRUNCATE:
2092       intrinsic = "llvm.ppc.altivec.vrfiz";
2093       break;
2094    }
2095
2096    return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2097 }
2098
2099 static inline LLVMValueRef
2100 lp_build_round_arch(struct lp_build_context *bld,
2101                     LLVMValueRef a,
2102                     enum lp_build_round_mode mode)
2103 {
2104    if (util_cpu_caps.has_sse4_1 || util_cpu_caps.has_neon) {
2105       LLVMBuilderRef builder = bld->gallivm->builder;
2106       const struct lp_type type = bld->type;
2107       const char *intrinsic_root;
2108       char intrinsic[32];
2109
2110       assert(type.floating);
2111       assert(lp_check_value(type, a));
2112       (void)type;
2113
2114       switch (mode) {
2115       case LP_BUILD_ROUND_NEAREST:
2116          intrinsic_root = "llvm.nearbyint";
2117          break;
2118       case LP_BUILD_ROUND_FLOOR:
2119          intrinsic_root = "llvm.floor";
2120          break;
2121       case LP_BUILD_ROUND_CEIL:
2122          intrinsic_root = "llvm.ceil";
2123          break;
2124       case LP_BUILD_ROUND_TRUNCATE:
2125          intrinsic_root = "llvm.trunc";
2126          break;
2127       }
2128
2129       lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type);
2130       return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2131    }
2132    else /* (util_cpu_caps.has_altivec) */
2133      return lp_build_round_altivec(bld, a, mode);
2134 }
2135
2136 /**
2137  * Return the integer part of a float (vector) value (== round toward zero).
2138  * The returned value is a float (vector).
2139  * Ex: trunc(-1.5) = -1.0
2140  */
2141 LLVMValueRef
2142 lp_build_trunc(struct lp_build_context *bld,
2143                LLVMValueRef a)
2144 {
2145    LLVMBuilderRef builder = bld->gallivm->builder;
2146    const struct lp_type type = bld->type;
2147
2148    assert(type.floating);
2149    assert(lp_check_value(type, a));
2150
2151    if (arch_rounding_available(type)) {
2152       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
2153    }
2154    else {
2155       const struct lp_type type = bld->type;
2156       struct lp_type inttype;
2157       struct lp_build_context intbld;
2158       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2159       LLVMValueRef trunc, res, anosign, mask;
2160       LLVMTypeRef int_vec_type = bld->int_vec_type;
2161       LLVMTypeRef vec_type = bld->vec_type;
2162
2163       assert(type.width == 32); /* might want to handle doubles at some point */
2164
2165       inttype = type;
2166       inttype.floating = 0;
2167       lp_build_context_init(&intbld, bld->gallivm, inttype);
2168
2169       /* round by truncation */
2170       trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2171       res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2172
2173       /* mask out sign bit */
2174       anosign = lp_build_abs(bld, a);
2175       /*
2176        * mask out all values if anosign > 2^24
2177        * This should work both for large ints (all rounding is no-op for them
2178        * because such floats are always exact) as well as special cases like
2179        * NaNs, Infs (taking advantage of the fact they use max exponent).
2180        * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
2181        */
2182       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2183       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2184       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2185       return lp_build_select(bld, mask, a, res);
2186    }
2187 }
2188
2189
2190 /**
2191  * Return float (vector) rounded to nearest integer (vector).  The returned
2192  * value is a float (vector).
2193  * Ex: round(0.9) = 1.0
2194  * Ex: round(-1.5) = -2.0
2195  */
2196 LLVMValueRef
2197 lp_build_round(struct lp_build_context *bld,
2198                LLVMValueRef a)
2199 {
2200    LLVMBuilderRef builder = bld->gallivm->builder;
2201    const struct lp_type type = bld->type;
2202
2203    assert(type.floating);
2204    assert(lp_check_value(type, a));
2205
2206    if (arch_rounding_available(type)) {
2207       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2208    }
2209    else {
2210       const struct lp_type type = bld->type;
2211       struct lp_type inttype;
2212       struct lp_build_context intbld;
2213       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2214       LLVMValueRef res, anosign, mask;
2215       LLVMTypeRef int_vec_type = bld->int_vec_type;
2216       LLVMTypeRef vec_type = bld->vec_type;
2217
2218       assert(type.width == 32); /* might want to handle doubles at some point */
2219
2220       inttype = type;
2221       inttype.floating = 0;
2222       lp_build_context_init(&intbld, bld->gallivm, inttype);
2223
2224       res = lp_build_iround(bld, a);
2225       res = LLVMBuildSIToFP(builder, res, vec_type, "");
2226
2227       /* mask out sign bit */
2228       anosign = lp_build_abs(bld, a);
2229       /*
2230        * mask out all values if anosign > 2^24
2231        * This should work both for large ints (all rounding is no-op for them
2232        * because such floats are always exact) as well as special cases like
2233        * NaNs, Infs (taking advantage of the fact they use max exponent).
2234        * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
2235        */
2236       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2237       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2238       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2239       return lp_build_select(bld, mask, a, res);
2240    }
2241 }
2242
2243
2244 /**
2245  * Return floor of float (vector), result is a float (vector)
2246  * Ex: floor(1.1) = 1.0
2247  * Ex: floor(-1.1) = -2.0
2248  */
2249 LLVMValueRef
2250 lp_build_floor(struct lp_build_context *bld,
2251                LLVMValueRef a)
2252 {
2253    LLVMBuilderRef builder = bld->gallivm->builder;
2254    const struct lp_type type = bld->type;
2255
2256    assert(type.floating);
2257    assert(lp_check_value(type, a));
2258
2259    if (arch_rounding_available(type)) {
2260       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2261    }
2262    else {
2263       const struct lp_type type = bld->type;
2264       struct lp_type inttype;
2265       struct lp_build_context intbld;
2266       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2267       LLVMValueRef trunc, res, anosign, mask;
2268       LLVMTypeRef int_vec_type = bld->int_vec_type;
2269       LLVMTypeRef vec_type = bld->vec_type;
2270
2271       if (type.width != 32) {
2272          char intrinsic[32];
2273          lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type);
2274          return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2275       }
2276
2277       assert(type.width == 32); /* might want to handle doubles at some point */
2278
2279       inttype = type;
2280       inttype.floating = 0;
2281       lp_build_context_init(&intbld, bld->gallivm, inttype);
2282
2283       /* round by truncation */
2284       trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2285       res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2286
2287       if (type.sign) {
2288          LLVMValueRef tmp;
2289
2290          /*
2291           * fix values if rounding is wrong (for non-special cases)
2292           * - this is the case if trunc > a
2293           */
2294          mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
2295          /* tmp = trunc > a ? 1.0 : 0.0 */
2296          tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2297          tmp = lp_build_and(&intbld, mask, tmp);
2298          tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2299          res = lp_build_sub(bld, res, tmp);
2300       }
2301
2302       /* mask out sign bit */
2303       anosign = lp_build_abs(bld, a);
2304       /*
2305        * mask out all values if anosign > 2^24
2306        * This should work both for large ints (all rounding is no-op for them
2307        * because such floats are always exact) as well as special cases like
2308        * NaNs, Infs (taking advantage of the fact they use max exponent).
2309        * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
2310        */
2311       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2312       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2313       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2314       return lp_build_select(bld, mask, a, res);
2315    }
2316 }
2317
2318
2319 /**
2320  * Return ceiling of float (vector), returning float (vector).
2321  * Ex: ceil( 1.1) = 2.0
2322  * Ex: ceil(-1.1) = -1.0
2323  */
2324 LLVMValueRef
2325 lp_build_ceil(struct lp_build_context *bld,
2326               LLVMValueRef a)
2327 {
2328    LLVMBuilderRef builder = bld->gallivm->builder;
2329    const struct lp_type type = bld->type;
2330
2331    assert(type.floating);
2332    assert(lp_check_value(type, a));
2333
2334    if (arch_rounding_available(type)) {
2335       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2336    }
2337    else {
2338       const struct lp_type type = bld->type;
2339       struct lp_type inttype;
2340       struct lp_build_context intbld;
2341       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2342       LLVMValueRef trunc, res, anosign, mask, tmp;
2343       LLVMTypeRef int_vec_type = bld->int_vec_type;
2344       LLVMTypeRef vec_type = bld->vec_type;
2345
2346       if (type.width != 32) {
2347          char intrinsic[32];
2348          lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type);
2349          return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2350       }
2351
2352       assert(type.width == 32); /* might want to handle doubles at some point */
2353
2354       inttype = type;
2355       inttype.floating = 0;
2356       lp_build_context_init(&intbld, bld->gallivm, inttype);
2357
2358       /* round by truncation */
2359       trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2360       trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2361
2362       /*
2363        * fix values if rounding is wrong (for non-special cases)
2364        * - this is the case if trunc < a
2365        */
2366       mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2367       /* tmp = trunc < a ? 1.0 : 0.0 */
2368       tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2369       tmp = lp_build_and(&intbld, mask, tmp);
2370       tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2371       res = lp_build_add(bld, trunc, tmp);
2372
2373       /* mask out sign bit */
2374       anosign = lp_build_abs(bld, a);
2375       /*
2376        * mask out all values if anosign > 2^24
2377        * This should work both for large ints (all rounding is no-op for them
2378        * because such floats are always exact) as well as special cases like
2379        * NaNs, Infs (taking advantage of the fact they use max exponent).
2380        * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
2381        */
2382       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2383       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2384       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2385       return lp_build_select(bld, mask, a, res);
2386    }
2387 }
2388
2389
2390 /**
2391  * Return fractional part of 'a' computed as a - floor(a)
2392  * Typically used in texture coord arithmetic.
2393  */
2394 LLVMValueRef
2395 lp_build_fract(struct lp_build_context *bld,
2396                LLVMValueRef a)
2397 {
2398    assert(bld->type.floating);
2399    return lp_build_sub(bld, a, lp_build_floor(bld, a));
2400 }
2401
2402
2403 /**
2404  * Prevent returning 1.0 for very small negative values of 'a' by clamping
2405  * against 0.99999(9). (Will also return that value for NaNs.)
2406  */
2407 static inline LLVMValueRef
2408 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2409 {
2410    LLVMValueRef max;
2411
2412    /* this is the largest number smaller than 1.0 representable as float */
2413    max = lp_build_const_vec(bld->gallivm, bld->type,
2414                             1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2415    return lp_build_min_ext(bld, fract, max,
2416                            GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
2417 }
2418
2419
2420 /**
2421  * Same as lp_build_fract, but guarantees that the result is always smaller
2422  * than one. Will also return the smaller-than-one value for infs, NaNs.
2423  */
2424 LLVMValueRef
2425 lp_build_fract_safe(struct lp_build_context *bld,
2426                     LLVMValueRef a)
2427 {
2428    return clamp_fract(bld, lp_build_fract(bld, a));
2429 }
2430
2431
2432 /**
2433  * Return the integer part of a float (vector) value (== round toward zero).
2434  * The returned value is an integer (vector).
2435  * Ex: itrunc(-1.5) = -1
2436  */
2437 LLVMValueRef
2438 lp_build_itrunc(struct lp_build_context *bld,
2439                 LLVMValueRef a)
2440 {
2441    LLVMBuilderRef builder = bld->gallivm->builder;
2442    const struct lp_type type = bld->type;
2443    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2444
2445    assert(type.floating);
2446    assert(lp_check_value(type, a));
2447
2448    return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2449 }
2450
2451
2452 /**
2453  * Return float (vector) rounded to nearest integer (vector).  The returned
2454  * value is an integer (vector).
2455  * Ex: iround(0.9) = 1
2456  * Ex: iround(-1.5) = -2
2457  */
2458 LLVMValueRef
2459 lp_build_iround(struct lp_build_context *bld,
2460                 LLVMValueRef a)
2461 {
2462    LLVMBuilderRef builder = bld->gallivm->builder;
2463    const struct lp_type type = bld->type;
2464    LLVMTypeRef int_vec_type = bld->int_vec_type;
2465    LLVMValueRef res;
2466
2467    assert(type.floating);
2468
2469    assert(lp_check_value(type, a));
2470
2471    if ((util_cpu_caps.has_sse2 &&
2472        ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2473        (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2474       return lp_build_iround_nearest_sse2(bld, a);
2475    }
2476    if (arch_rounding_available(type)) {
2477       res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2478    }
2479    else {
2480       LLVMValueRef half;
2481
2482       half = lp_build_const_vec(bld->gallivm, type, nextafterf(0.5, 0.0));
2483
2484       if (type.sign) {
2485          LLVMTypeRef vec_type = bld->vec_type;
2486          LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2487                                     (unsigned long long)1 << (type.width - 1));
2488          LLVMValueRef sign;
2489
2490          /* get sign bit */
2491          sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2492          sign = LLVMBuildAnd(builder, sign, mask, "");
2493
2494          /* sign * 0.5 */
2495          half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2496          half = LLVMBuildOr(builder, sign, half, "");
2497          half = LLVMBuildBitCast(builder, half, vec_type, "");
2498       }
2499
2500       res = LLVMBuildFAdd(builder, a, half, "");
2501    }
2502
2503    res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2504
2505    return res;
2506 }
2507
2508
2509 /**
2510  * Return floor of float (vector), result is an int (vector)
2511  * Ex: ifloor(1.1) = 1.0
2512  * Ex: ifloor(-1.1) = -2.0
2513  */
2514 LLVMValueRef
2515 lp_build_ifloor(struct lp_build_context *bld,
2516                 LLVMValueRef a)
2517 {
2518    LLVMBuilderRef builder = bld->gallivm->builder;
2519    const struct lp_type type = bld->type;
2520    LLVMTypeRef int_vec_type = bld->int_vec_type;
2521    LLVMValueRef res;
2522
2523    assert(type.floating);
2524    assert(lp_check_value(type, a));
2525
2526    res = a;
2527    if (type.sign) {
2528       if (arch_rounding_available(type)) {
2529          res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2530       }
2531       else {
2532          struct lp_type inttype;
2533          struct lp_build_context intbld;
2534          LLVMValueRef trunc, itrunc, mask;
2535
2536          assert(type.floating);
2537          assert(lp_check_value(type, a));
2538
2539          inttype = type;
2540          inttype.floating = 0;
2541          lp_build_context_init(&intbld, bld->gallivm, inttype);
2542
2543          /* round by truncation */
2544          itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2545          trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2546
2547          /*
2548           * fix values if rounding is wrong (for non-special cases)
2549           * - this is the case if trunc > a
2550           * The results of doing this with NaNs, very large values etc.
2551           * are undefined but this seems to be the case anyway.
2552           */
2553          mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2554          /* cheapie minus one with mask since the mask is minus one / zero */
2555          return lp_build_add(&intbld, itrunc, mask);
2556       }
2557    }
2558
2559    /* round to nearest (toward zero) */
2560    res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2561
2562    return res;
2563 }
2564
2565
2566 /**
2567  * Return ceiling of float (vector), returning int (vector).
2568  * Ex: iceil( 1.1) = 2
2569  * Ex: iceil(-1.1) = -1
2570  */
2571 LLVMValueRef
2572 lp_build_iceil(struct lp_build_context *bld,
2573                LLVMValueRef a)
2574 {
2575    LLVMBuilderRef builder = bld->gallivm->builder;
2576    const struct lp_type type = bld->type;
2577    LLVMTypeRef int_vec_type = bld->int_vec_type;
2578    LLVMValueRef res;
2579
2580    assert(type.floating);
2581    assert(lp_check_value(type, a));
2582
2583    if (arch_rounding_available(type)) {
2584       res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2585    }
2586    else {
2587       struct lp_type inttype;
2588       struct lp_build_context intbld;
2589       LLVMValueRef trunc, itrunc, mask;
2590
2591       assert(type.floating);
2592       assert(lp_check_value(type, a));
2593
2594       inttype = type;
2595       inttype.floating = 0;
2596       lp_build_context_init(&intbld, bld->gallivm, inttype);
2597
2598       /* round by truncation */
2599       itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2600       trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2601
2602       /*
2603        * fix values if rounding is wrong (for non-special cases)
2604        * - this is the case if trunc < a
2605        * The results of doing this with NaNs, very large values etc.
2606        * are undefined but this seems to be the case anyway.
2607        */
2608       mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2609       /* cheapie plus one with mask since the mask is minus one / zero */
2610       return lp_build_sub(&intbld, itrunc, mask);
2611    }
2612
2613    /* round to nearest (toward zero) */
2614    res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2615
2616    return res;
2617 }
2618
2619
2620 /**
2621  * Combined ifloor() & fract().
2622  *
2623  * Preferred to calling the functions separately, as it will ensure that the
2624  * strategy (floor() vs ifloor()) that results in less redundant work is used.
2625  */
2626 void
2627 lp_build_ifloor_fract(struct lp_build_context *bld,
2628                       LLVMValueRef a,
2629                       LLVMValueRef *out_ipart,
2630                       LLVMValueRef *out_fpart)
2631 {
2632    LLVMBuilderRef builder = bld->gallivm->builder;
2633    const struct lp_type type = bld->type;
2634    LLVMValueRef ipart;
2635
2636    assert(type.floating);
2637    assert(lp_check_value(type, a));
2638
2639    if (arch_rounding_available(type)) {
2640       /*
2641        * floor() is easier.
2642        */
2643
2644       ipart = lp_build_floor(bld, a);
2645       *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2646       *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2647    }
2648    else {
2649       /*
2650        * ifloor() is easier.
2651        */
2652
2653       *out_ipart = lp_build_ifloor(bld, a);
2654       ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2655       *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2656    }
2657 }
2658
2659
2660 /**
2661  * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2662  * always smaller than one.
2663  */
2664 void
2665 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2666                            LLVMValueRef a,
2667                            LLVMValueRef *out_ipart,
2668                            LLVMValueRef *out_fpart)
2669 {
2670    lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2671    *out_fpart = clamp_fract(bld, *out_fpart);
2672 }
2673
2674
2675 LLVMValueRef
2676 lp_build_sqrt(struct lp_build_context *bld,
2677               LLVMValueRef a)
2678 {
2679    LLVMBuilderRef builder = bld->gallivm->builder;
2680    const struct lp_type type = bld->type;
2681    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2682    char intrinsic[32];
2683
2684    assert(lp_check_value(type, a));
2685
2686    assert(type.floating);
2687    lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);
2688
2689    return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2690 }
2691
2692
2693 /**
2694  * Do one Newton-Raphson step to improve reciprocate precision:
2695  *
2696  *   x_{i+1} = x_i * (2 - a * x_i)
2697  *
2698  * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2699  * +/-Inf, giving NaN instead.  Certain applications rely on this behavior,
2700  * such as Google Earth, which does RCP(RSQRT(0.0) when drawing the Earth's
2701  * halo. It would be necessary to clamp the argument to prevent this.
2702  *
2703  * See also:
2704  * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2705  * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2706  */
2707 static inline LLVMValueRef
2708 lp_build_rcp_refine(struct lp_build_context *bld,
2709                     LLVMValueRef a,
2710                     LLVMValueRef rcp_a)
2711 {
2712    LLVMBuilderRef builder = bld->gallivm->builder;
2713    LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
2714    LLVMValueRef res;
2715
2716    res = LLVMBuildFMul(builder, a, rcp_a, "");
2717    res = LLVMBuildFSub(builder, two, res, "");
2718    res = LLVMBuildFMul(builder, rcp_a, res, "");
2719
2720    return res;
2721 }
2722
2723
2724 LLVMValueRef
2725 lp_build_rcp(struct lp_build_context *bld,
2726              LLVMValueRef a)
2727 {
2728    LLVMBuilderRef builder = bld->gallivm->builder;
2729    const struct lp_type type = bld->type;
2730
2731    assert(lp_check_value(type, a));
2732
2733    if(a == bld->zero)
2734       return bld->undef;
2735    if(a == bld->one)
2736       return bld->one;
2737    if(a == bld->undef)
2738       return bld->undef;
2739
2740    assert(type.floating);
2741
2742    if(LLVMIsConstant(a))
2743       return LLVMConstFDiv(bld->one, a);
2744
2745    /*
2746     * We don't use RCPPS because:
2747     * - it only has 10bits of precision
2748     * - it doesn't even get the reciprocate of 1.0 exactly
2749     * - doing Newton-Rapshon steps yields wrong (NaN) values for 0.0 or Inf
2750     * - for recent processors the benefit over DIVPS is marginal, a case
2751     *   dependent
2752     *
2753     * We could still use it on certain processors if benchmarks show that the
2754     * RCPPS plus necessary workarounds are still preferrable to DIVPS; or for
2755     * particular uses that require less workarounds.
2756     */
2757
2758    if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2759          (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2760       const unsigned num_iterations = 0;
2761       LLVMValueRef res;
2762       unsigned i;
2763       const char *intrinsic = NULL;
2764
2765       if (type.length == 4) {
2766          intrinsic = "llvm.x86.sse.rcp.ps";
2767       }
2768       else {
2769          intrinsic = "llvm.x86.avx.rcp.ps.256";
2770       }
2771
2772       res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2773
2774       for (i = 0; i < num_iterations; ++i) {
2775          res = lp_build_rcp_refine(bld, a, res);
2776       }
2777
2778       return res;
2779    }
2780
2781    return LLVMBuildFDiv(builder, bld->one, a, "");
2782 }
2783
2784
2785 /**
2786  * Do one Newton-Raphson step to improve rsqrt precision:
2787  *
2788  *   x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2789  *
2790  * See also Intel 64 and IA-32 Architectures Optimization Manual.
2791  */
2792 static inline LLVMValueRef
2793 lp_build_rsqrt_refine(struct lp_build_context *bld,
2794                       LLVMValueRef a,
2795                       LLVMValueRef rsqrt_a)
2796 {
2797    LLVMBuilderRef builder = bld->gallivm->builder;
2798    LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2799    LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2800    LLVMValueRef res;
2801
2802    res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2803    res = LLVMBuildFMul(builder, a, res, "");
2804    res = LLVMBuildFSub(builder, three, res, "");
2805    res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2806    res = LLVMBuildFMul(builder, half, res, "");
2807
2808    return res;
2809 }
2810
2811
2812 /**
2813  * Generate 1/sqrt(a).
2814  * Result is undefined for values < 0, infinity for +0.
2815  */
2816 LLVMValueRef
2817 lp_build_rsqrt(struct lp_build_context *bld,
2818                LLVMValueRef a)
2819 {
2820    const struct lp_type type = bld->type;
2821
2822    assert(lp_check_value(type, a));
2823
2824    assert(type.floating);
2825
2826    /*
2827     * This should be faster but all denormals will end up as infinity.
2828     */
2829    if (0 && lp_build_fast_rsqrt_available(type)) {
2830       const unsigned num_iterations = 1;
2831       LLVMValueRef res;
2832       unsigned i;
2833
2834       /* rsqrt(1.0) != 1.0 here */
2835       res = lp_build_fast_rsqrt(bld, a);
2836
2837       if (num_iterations) {
2838          /*
2839           * Newton-Raphson will result in NaN instead of infinity for zero,
2840           * and NaN instead of zero for infinity.
2841           * Also, need to ensure rsqrt(1.0) == 1.0.
2842           * All numbers smaller than FLT_MIN will result in +infinity
2843           * (rsqrtps treats all denormals as zero).
2844           */
2845          LLVMValueRef cmp;
2846          LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2847          LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);
2848
2849          for (i = 0; i < num_iterations; ++i) {
2850             res = lp_build_rsqrt_refine(bld, a, res);
2851          }
2852          cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2853          res = lp_build_select(bld, cmp, inf, res);
2854          cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2855          res = lp_build_select(bld, cmp, bld->zero, res);
2856          cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2857          res = lp_build_select(bld, cmp, bld->one, res);
2858       }
2859
2860       return res;
2861    }
2862
2863    return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2864 }
2865
2866 /**
2867  * If there's a fast (inaccurate) rsqrt instruction available
2868  * (caller may want to avoid to call rsqrt_fast if it's not available,
2869  * i.e. for calculating x^0.5 it may do rsqrt_fast(x) * x but if
2870  * unavailable it would result in sqrt/div/mul so obviously
2871  * much better to just call sqrt, skipping both div and mul).
2872  */
2873 boolean
2874 lp_build_fast_rsqrt_available(struct lp_type type)
2875 {
2876    assert(type.floating);
2877
2878    if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2879        (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2880       return true;
2881    }
2882    return false;
2883 }
2884
2885
2886 /**
2887  * Generate 1/sqrt(a).
2888  * Result is undefined for values < 0, infinity for +0.
2889  * Precision is limited, only ~10 bits guaranteed
2890  * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2891  */
2892 LLVMValueRef
2893 lp_build_fast_rsqrt(struct lp_build_context *bld,
2894                     LLVMValueRef a)
2895 {
2896    LLVMBuilderRef builder = bld->gallivm->builder;
2897    const struct lp_type type = bld->type;
2898
2899    assert(lp_check_value(type, a));
2900
2901    if (lp_build_fast_rsqrt_available(type)) {
2902       const char *intrinsic = NULL;
2903
2904       if (type.length == 4) {
2905          intrinsic = "llvm.x86.sse.rsqrt.ps";
2906       }
2907       else {
2908          intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2909       }
2910       return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2911    }
2912    else {
2913       debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2914    }
2915    return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2916 }
2917
2918
2919 /**
2920  * Generate sin(a) or cos(a) using polynomial approximation.
2921  * TODO: it might be worth recognizing sin and cos using same source
2922  * (i.e. d3d10 sincos opcode). Obviously doing both at the same time
2923  * would be way cheaper than calculating (nearly) everything twice...
2924  * Not sure it's common enough to be worth bothering however, scs
2925  * opcode could also benefit from calculating both though.
2926  */
2927 static LLVMValueRef
2928 lp_build_sin_or_cos(struct lp_build_context *bld,
2929                     LLVMValueRef a,
2930                     boolean cos)
2931 {
2932    struct gallivm_state *gallivm = bld->gallivm;
2933    LLVMBuilderRef b = gallivm->builder;
2934    struct lp_type int_type = lp_int_type(bld->type);
2935
2936    /*
2937     *  take the absolute value,
2938     *  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2939     */
2940
2941    LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2942    LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2943
2944    LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2945    LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2946
2947    /*
2948     * scale by 4/Pi
2949     * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2950     */
2951
2952    LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2953    LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2954
2955    /*
2956     * store the integer part of y in mm0
2957     * emm2 = _mm_cvttps_epi32(y);
2958     */
2959
2960    LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2961
2962    /*
2963     * j=(j+1) & (~1) (see the cephes sources)
2964     * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2965     */
2966
2967    LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2968    LLVMValueRef emm2_add =  LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2969    /*
2970     * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2971     */
2972    LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2973    LLVMValueRef emm2_and =  LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2974
2975    /*
2976     * y = _mm_cvtepi32_ps(emm2);
2977     */
2978    LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2979
2980    LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2981    LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2982    LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2983    LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2984
2985    /*
2986     * Argument used for poly selection and sign bit determination
2987     * is different for sin vs. cos.
2988     */
2989    LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
2990                                emm2_and;
2991
2992    LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
2993                                                               LLVMBuildNot(b, emm2_2, ""), ""),
2994                                               const_29, "sign_bit") :
2995                                  LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
2996                                                               LLVMBuildShl(b, emm2_add,
2997                                                                            const_29, ""), ""),
2998                                               sign_mask, "sign_bit");
2999
3000    /*
3001     * get the polynom selection mask
3002     * there is one polynom for 0 <= x <= Pi/4
3003     * and another one for Pi/4<x<=Pi/2
3004     * Both branches will be computed.
3005     *
3006     * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
3007     * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
3008     */
3009
3010    LLVMValueRef emm2_3 =  LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
3011    LLVMValueRef poly_mask = lp_build_compare(gallivm,
3012                                              int_type, PIPE_FUNC_EQUAL,
3013                                              emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
3014
3015    /*
3016     * _PS_CONST(minus_cephes_DP1, -0.78515625);
3017     * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
3018     * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
3019     */
3020    LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
3021    LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
3022    LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
3023
3024    /*
3025     * The magic pass: "Extended precision modular arithmetic"
3026     * x = ((x - y * DP1) - y * DP2) - y * DP3;
3027     */
3028    LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
3029    LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
3030    LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);
3031
3032    /*
3033     * Evaluate the first polynom  (0 <= x <= Pi/4)
3034     *
3035     * z = _mm_mul_ps(x,x);
3036     */
3037    LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
3038
3039    /*
3040     * _PS_CONST(coscof_p0,  2.443315711809948E-005);
3041     * _PS_CONST(coscof_p1, -1.388731625493765E-003);
3042     * _PS_CONST(coscof_p2,  4.166664568298827E-002);
3043     */
3044    LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
3045    LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
3046    LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
3047
3048    /*
3049     * y = *(v4sf*)_ps_coscof_p0;
3050     * y = _mm_mul_ps(y, z);
3051     */
3052    LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
3053    LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
3054    LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
3055    LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
3056
3057
3058    /*
3059     * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
3060     * y = _mm_sub_ps(y, tmp);
3061     * y = _mm_add_ps(y, *(v4sf*)_ps_1);
3062     */
3063    LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
3064    LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
3065    LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
3066    LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
3067    LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
3068
3069    /*
3070     * _PS_CONST(sincof_p0, -1.9515295891E-4);
3071     * _PS_CONST(sincof_p1,  8.3321608736E-3);
3072     * _PS_CONST(sincof_p2, -1.6666654611E-1);
3073     */
3074    LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
3075    LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
3076    LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
3077
3078    /*
3079     * Evaluate the second polynom  (Pi/4 <= x <= 0)
3080     *
3081     * y2 = *(v4sf*)_ps_sincof_p0;
3082     * y2 = _mm_mul_ps(y2, z);
3083     * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
3084     * y2 = _mm_mul_ps(y2, z);
3085     * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
3086     * y2 = _mm_mul_ps(y2, z);
3087     * y2 = _mm_mul_ps(y2, x);
3088     * y2 = _mm_add_ps(y2, x);
3089     */
3090
3091    LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
3092    LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
3093    LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
3094    LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);
3095
3096    /*
3097     * select the correct result from the two polynoms
3098     * xmm3 = poly_mask;
3099     * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
3100     * y = _mm_andnot_ps(xmm3, y);
3101     * y = _mm_or_ps(y,y2);
3102     */
3103    LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
3104    LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
3105    LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
3106    LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
3107    LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
3108    LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
3109
3110    /*
3111     * update the sign
3112     * y = _mm_xor_ps(y, sign_bit);
3113     */
3114    LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
3115    LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
3116
3117    LLVMValueRef isfinite = lp_build_isfinite(bld, a);
3118
3119    /* clamp output to be within [-1, 1] */
3120    y_result = lp_build_clamp(bld, y_result,
3121                              lp_build_const_vec(bld->gallivm, bld->type,  -1.f),
3122                              lp_build_const_vec(bld->gallivm, bld->type,  1.f));
3123    /* If a is -inf, inf or NaN then return NaN */
3124    y_result = lp_build_select(bld, isfinite, y_result,
3125                               lp_build_const_vec(bld->gallivm, bld->type,  NAN));
3126    return y_result;
3127 }
3128
3129
3130 /**
3131  * Generate sin(a)
3132  */
3133 LLVMValueRef
3134 lp_build_sin(struct lp_build_context *bld,
3135              LLVMValueRef a)
3136 {
3137    return lp_build_sin_or_cos(bld, a, FALSE);
3138 }
3139
3140
3141 /**
3142  * Generate cos(a)
3143  */
3144 LLVMValueRef
3145 lp_build_cos(struct lp_build_context *bld,
3146              LLVMValueRef a)
3147 {
3148    return lp_build_sin_or_cos(bld, a, TRUE);
3149 }
3150
3151
3152 /**
3153  * Generate pow(x, y)
3154  */
3155 LLVMValueRef
3156 lp_build_pow(struct lp_build_context *bld,
3157              LLVMValueRef x,
3158              LLVMValueRef y)
3159 {
3160    /* TODO: optimize the constant case */
3161    if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3162        LLVMIsConstant(x) && LLVMIsConstant(y)) {
3163       debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3164                    __FUNCTION__);
3165    }
3166
3167    return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
3168 }
3169
3170
3171 /**
3172  * Generate exp(x)
3173  */
3174 LLVMValueRef
3175 lp_build_exp(struct lp_build_context *bld,
3176              LLVMValueRef x)
3177 {
3178    /* log2(e) = 1/log(2) */
3179    LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
3180                                            1.4426950408889634);
3181
3182    assert(lp_check_value(bld->type, x));
3183
3184    return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
3185 }
3186
3187
3188 /**
3189  * Generate log(x)
3190  * Behavior is undefined with infs, 0s and nans
3191  */
3192 LLVMValueRef
3193 lp_build_log(struct lp_build_context *bld,
3194              LLVMValueRef x)
3195 {
3196    /* log(2) */
3197    LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3198                                           0.69314718055994529);
3199
3200    assert(lp_check_value(bld->type, x));
3201
3202    return lp_build_mul(bld, log2, lp_build_log2(bld, x));
3203 }
3204
3205 /**
3206  * Generate log(x) that handles edge cases (infs, 0s and nans)
3207  */
3208 LLVMValueRef
3209 lp_build_log_safe(struct lp_build_context *bld,
3210                   LLVMValueRef x)
3211 {
3212    /* log(2) */
3213    LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3214                                           0.69314718055994529);
3215
3216    assert(lp_check_value(bld->type, x));
3217
3218    return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
3219 }
3220
3221
3222 /**
3223  * Generate polynomial.
3224  * Ex:  coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
3225  */
3226 LLVMValueRef
3227 lp_build_polynomial(struct lp_build_context *bld,
3228                     LLVMValueRef x,
3229                     const double *coeffs,
3230                     unsigned num_coeffs)
3231 {
3232    const struct lp_type type = bld->type;
3233    LLVMValueRef even = NULL, odd = NULL;
3234    LLVMValueRef x2;
3235    unsigned i;
3236
3237    assert(lp_check_value(bld->type, x));
3238
3239    /* TODO: optimize the constant case */
3240    if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3241        LLVMIsConstant(x)) {
3242       debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3243                    __FUNCTION__);
3244    }
3245
3246    /*
3247     * Calculate odd and even terms seperately to decrease data dependency
3248     * Ex:
3249     *     c[0] + x^2 * c[2] + x^4 * c[4] ...
3250     *     + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
3251     */
3252    x2 = lp_build_mul(bld, x, x);
3253
3254    for (i = num_coeffs; i--; ) {
3255       LLVMValueRef coeff;
3256
3257       coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
3258
3259       if (i % 2 == 0) {
3260          if (even)
3261             even = lp_build_mad(bld, x2, even, coeff);
3262          else
3263             even = coeff;
3264       } else {
3265          if (odd)
3266             odd = lp_build_mad(bld, x2, odd, coeff);
3267          else
3268             odd = coeff;
3269       }
3270    }
3271
3272    if (odd)
3273       return lp_build_mad(bld, odd, x, even);
3274    else if (even)
3275       return even;
3276    else
3277       return bld->undef;
3278 }
3279
3280
3281 /**
3282  * Minimax polynomial fit of 2**x, in range [0, 1[
3283  */
3284 const double lp_build_exp2_polynomial[] = {
3285 #if EXP_POLY_DEGREE == 5
3286    1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
3287    0.693153073200168932794,
3288    0.240153617044375388211,
3289    0.0558263180532956664775,
3290    0.00898934009049466391101,
3291    0.00187757667519147912699
3292 #elif EXP_POLY_DEGREE == 4
3293    1.00000259337069434683,
3294    0.693003834469974940458,
3295    0.24144275689150793076,
3296    0.0520114606103070150235,
3297    0.0135341679161270268764
3298 #elif EXP_POLY_DEGREE == 3
3299    0.999925218562710312959,
3300    0.695833540494823811697,
3301    0.226067155427249155588,
3302    0.0780245226406372992967
3303 #elif EXP_POLY_DEGREE == 2
3304    1.00172476321474503578,
3305    0.657636275736077639316,
3306    0.33718943461968720704
3307 #else
3308 #error
3309 #endif
3310 };
3311
3312
3313 LLVMValueRef
3314 lp_build_exp2(struct lp_build_context *bld,
3315               LLVMValueRef x)
3316 {
3317    LLVMBuilderRef builder = bld->gallivm->builder;
3318    const struct lp_type type = bld->type;
3319    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3320    LLVMValueRef ipart = NULL;
3321    LLVMValueRef fpart = NULL;
3322    LLVMValueRef expipart = NULL;
3323    LLVMValueRef expfpart = NULL;
3324    LLVMValueRef res = NULL;
3325
3326    assert(lp_check_value(bld->type, x));
3327
3328    /* TODO: optimize the constant case */
3329    if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3330        LLVMIsConstant(x)) {
3331       debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3332                    __FUNCTION__);
3333    }
3334
3335    assert(type.floating && type.width == 32);
3336
3337    /* We want to preserve NaN and make sure than for exp2 if x > 128,
3338     * the result is INF  and if it's smaller than -126.9 the result is 0 */
3339    x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type,  128.0), x,
3340                         GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3341    x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
3342                         x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3343
3344    /* ipart = floor(x) */
3345    /* fpart = x - ipart */
3346    lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3347
3348    /* expipart = (float) (1 << ipart) */
3349    expipart = LLVMBuildAdd(builder, ipart,
3350                            lp_build_const_int_vec(bld->gallivm, type, 127), "");
3351    expipart = LLVMBuildShl(builder, expipart,
3352                            lp_build_const_int_vec(bld->gallivm, type, 23), "");
3353    expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
3354
3355    expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3356                                   ARRAY_SIZE(lp_build_exp2_polynomial));
3357
3358    res = LLVMBuildFMul(builder, expipart, expfpart, "");
3359
3360    return res;
3361 }
3362
3363
3364
3365 /**
3366  * Extract the exponent of a IEEE-754 floating point value.
3367  *
3368  * Optionally apply an integer bias.
3369  *
3370  * Result is an integer value with
3371  *
3372  *   ifloor(log2(x)) + bias
3373  */
3374 LLVMValueRef
3375 lp_build_extract_exponent(struct lp_build_context *bld,
3376                           LLVMValueRef x,
3377                           int bias)
3378 {
3379    LLVMBuilderRef builder = bld->gallivm->builder;
3380    const struct lp_type type = bld->type;
3381    unsigned mantissa = lp_mantissa(type);
3382    LLVMValueRef res;
3383
3384    assert(type.floating);
3385
3386    assert(lp_check_value(bld->type, x));
3387
3388    x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3389
3390    res = LLVMBuildLShr(builder, x,
3391                        lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3392    res = LLVMBuildAnd(builder, res,
3393                       lp_build_const_int_vec(bld->gallivm, type, 255), "");
3394    res = LLVMBuildSub(builder, res,
3395                       lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3396
3397    return res;
3398 }
3399
3400
3401 /**
3402  * Extract the mantissa of the a floating.
3403  *
3404  * Result is a floating point value with
3405  *
3406  *   x / floor(log2(x))
3407  */
3408 LLVMValueRef
3409 lp_build_extract_mantissa(struct lp_build_context *bld,
3410                           LLVMValueRef x)
3411 {
3412    LLVMBuilderRef builder = bld->gallivm->builder;
3413    const struct lp_type type = bld->type;
3414    unsigned mantissa = lp_mantissa(type);
3415    LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3416                                                   (1ULL << mantissa) - 1);
3417    LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3418    LLVMValueRef res;
3419
3420    assert(lp_check_value(bld->type, x));
3421
3422    assert(type.floating);
3423
3424    x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3425
3426    /* res = x / 2**ipart */
3427    res = LLVMBuildAnd(builder, x, mantmask, "");
3428    res = LLVMBuildOr(builder, res, one, "");
3429    res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3430
3431    return res;
3432 }
3433
3434
3435
3436 /**
3437  * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x) ,for x in range of [0, 1/9[
3438  * These coefficients can be generate with
3439  * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3440  */
3441 const double lp_build_log2_polynomial[] = {
3442 #if LOG_POLY_DEGREE == 5
3443    2.88539008148777786488L,
3444    0.961796878841293367824L,
3445    0.577058946784739859012L,
3446    0.412914355135828735411L,
3447    0.308591899232910175289L,
3448    0.352376952300281371868L,
3449 #elif LOG_POLY_DEGREE == 4
3450    2.88539009343309178325L,
3451    0.961791550404184197881L,
3452    0.577440339438736392009L,
3453    0.403343858251329912514L,
3454    0.406718052498846252698L,
3455 #elif LOG_POLY_DEGREE == 3
3456    2.88538959748872753838L,
3457    0.961932915889597772928L,
3458    0.571118517972136195241L,
3459    0.493997535084709500285L,
3460 #else
3461 #error
3462 #endif
3463 };
3464
3465 /**
3466  * See http://www.devmaster.net/forums/showthread.php?p=43580
3467  * http://en.wikipedia.org/wiki/Logarithm#Calculation
3468  * http://www.nezumi.demon.co.uk/consult/logx.htm
3469  *
3470  * If handle_edge_cases is true the function will perform computations
3471  * to match the required D3D10+ behavior for each of the edge cases.
3472  * That means that if input is:
3473  * - less than zero (to and including -inf) then NaN will be returned
3474  * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3475  * - +infinity, then +infinity will be returned
3476  * - NaN, then NaN will be returned
3477  *
3478  * Those checks are fairly expensive so if you don't need them make sure
3479  * handle_edge_cases is false.
3480  */
3481 void
3482 lp_build_log2_approx(struct lp_build_context *bld,
3483                      LLVMValueRef x,
3484                      LLVMValueRef *p_exp,
3485                      LLVMValueRef *p_floor_log2,
3486                      LLVMValueRef *p_log2,
3487                      boolean handle_edge_cases)
3488 {
3489    LLVMBuilderRef builder = bld->gallivm->builder;
3490    const struct lp_type type = bld->type;
3491    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3492    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3493
3494    LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3495    LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3496    LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3497
3498    LLVMValueRef i = NULL;
3499    LLVMValueRef y = NULL;
3500    LLVMValueRef z = NULL;
3501    LLVMValueRef exp = NULL;
3502    LLVMValueRef mant = NULL;
3503    LLVMValueRef logexp = NULL;
3504    LLVMValueRef p_z = NULL;
3505    LLVMValueRef res = NULL;
3506
3507    assert(lp_check_value(bld->type, x));
3508
3509    if(p_exp || p_floor_log2 || p_log2) {
3510       /* TODO: optimize the constant case */
3511       if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3512           LLVMIsConstant(x)) {
3513          debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3514                       __FUNCTION__);
3515       }
3516
3517       assert(type.floating && type.width == 32);
3518
3519       /*
3520        * We don't explicitly handle denormalized numbers. They will yield a
3521        * result in the neighbourhood of -127, which appears to be adequate
3522        * enough.
3523        */
3524
3525       i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3526
3527       /* exp = (float) exponent(x) */
3528       exp = LLVMBuildAnd(builder, i, expmask, "");
3529    }
3530
3531    if(p_floor_log2 || p_log2) {
3532       logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3533       logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3534       logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3535    }
3536
3537    if (p_log2) {
3538       /* mant = 1 + (float) mantissa(x) */
3539       mant = LLVMBuildAnd(builder, i, mantmask, "");
3540       mant = LLVMBuildOr(builder, mant, one, "");
3541       mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3542
3543       /* y = (mant - 1) / (mant + 1) */
3544       y = lp_build_div(bld,
3545          lp_build_sub(bld, mant, bld->one),
3546          lp_build_add(bld, mant, bld->one)
3547       );
3548
3549       /* z = y^2 */
3550       z = lp_build_mul(bld, y, y);
3551
3552       /* compute P(z) */
3553       p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3554                                 ARRAY_SIZE(lp_build_log2_polynomial));
3555
3556       /* y * P(z) + logexp */
3557       res = lp_build_mad(bld, y, p_z, logexp);
3558
3559       if (type.floating && handle_edge_cases) {
3560          LLVMValueRef negmask, infmask,  zmask;
3561          negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3562                                 lp_build_const_vec(bld->gallivm, type,  0.0f));
3563          zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3564                               lp_build_const_vec(bld->gallivm, type,  0.0f));
3565          infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3566                                 lp_build_const_vec(bld->gallivm, type,  INFINITY));
3567
3568          /* If x is qual to inf make sure we return inf */
3569          res = lp_build_select(bld, infmask,
3570                                lp_build_const_vec(bld->gallivm, type,  INFINITY),
3571                                res);
3572          /* If x is qual to 0, return -inf */
3573          res = lp_build_select(bld, zmask,
3574                                lp_build_const_vec(bld->gallivm, type,  -INFINITY),
3575                                res);
3576          /* If x is nan or less than 0, return nan */
3577          res = lp_build_select(bld, negmask,
3578                                lp_build_const_vec(bld->gallivm, type,  NAN),
3579                                res);
3580       }
3581    }
3582
3583    if (p_exp) {
3584       exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3585       *p_exp = exp;
3586    }
3587
3588    if (p_floor_log2)
3589       *p_floor_log2 = logexp;
3590
3591    if (p_log2)
3592       *p_log2 = res;
3593 }
3594
3595
3596 /*
3597  * log2 implementation which doesn't have special code to
3598  * handle edge cases (-inf, 0, inf, NaN). It's faster but
3599  * the results for those cases are undefined.
3600  */
3601 LLVMValueRef
3602 lp_build_log2(struct lp_build_context *bld,
3603               LLVMValueRef x)
3604 {
3605    LLVMValueRef res;
3606    lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
3607    return res;
3608 }
3609
3610 /*
3611  * Version of log2 which handles all edge cases.
3612  * Look at documentation of lp_build_log2_approx for
3613  * description of the behavior for each of the edge cases.
3614  */
3615 LLVMValueRef
3616 lp_build_log2_safe(struct lp_build_context *bld,
3617                    LLVMValueRef x)
3618 {
3619    LLVMValueRef res;
3620    lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
3621    return res;
3622 }
3623
3624
3625 /**
3626  * Faster (and less accurate) log2.
3627  *
3628  *    log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3629  *
3630  * Piece-wise linear approximation, with exact results when x is a
3631  * power of two.
3632  *
3633  * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3634  */
3635 LLVMValueRef
3636 lp_build_fast_log2(struct lp_build_context *bld,
3637                    LLVMValueRef x)
3638 {
3639    LLVMBuilderRef builder = bld->gallivm->builder;
3640    LLVMValueRef ipart;
3641    LLVMValueRef fpart;
3642
3643    assert(lp_check_value(bld->type, x));
3644
3645    assert(bld->type.floating);
3646
3647    /* ipart = floor(log2(x)) - 1 */
3648    ipart = lp_build_extract_exponent(bld, x, -1);
3649    ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3650
3651    /* fpart = x / 2**ipart */
3652    fpart = lp_build_extract_mantissa(bld, x);
3653
3654    /* ipart + fpart */
3655    return LLVMBuildFAdd(builder, ipart, fpart, "");
3656 }
3657
3658
3659 /**
3660  * Fast implementation of iround(log2(x)).
3661  *
3662  * Not an approximation -- it should give accurate results all the time.
3663  */
3664 LLVMValueRef
3665 lp_build_ilog2(struct lp_build_context *bld,
3666                LLVMValueRef x)
3667 {
3668    LLVMBuilderRef builder = bld->gallivm->builder;
3669    LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3670    LLVMValueRef ipart;
3671
3672    assert(bld->type.floating);
3673
3674    assert(lp_check_value(bld->type, x));
3675
3676    /* x * 2^(0.5)   i.e., add 0.5 to the log2(x) */
3677    x = LLVMBuildFMul(builder, x, sqrt2, "");
3678
3679    /* ipart = floor(log2(x) + 0.5)  */
3680    ipart = lp_build_extract_exponent(bld, x, 0);
3681
3682    return ipart;
3683 }
3684
3685 LLVMValueRef
3686 lp_build_mod(struct lp_build_context *bld,
3687              LLVMValueRef x,
3688              LLVMValueRef y)
3689 {
3690    LLVMBuilderRef builder = bld->gallivm->builder;
3691    LLVMValueRef res;
3692    const struct lp_type type = bld->type;
3693
3694    assert(lp_check_value(type, x));
3695    assert(lp_check_value(type, y));
3696
3697    if (type.floating)
3698       res = LLVMBuildFRem(builder, x, y, "");
3699    else if (type.sign)
3700       res = LLVMBuildSRem(builder, x, y, "");
3701    else
3702       res = LLVMBuildURem(builder, x, y, "");
3703    return res;
3704 }
3705
3706
3707 /*
3708  * For floating inputs it creates and returns a mask
3709  * which is all 1's for channels which are NaN.
3710  * Channels inside x which are not NaN will be 0.
3711  */
3712 LLVMValueRef
3713 lp_build_isnan(struct lp_build_context *bld,
3714                LLVMValueRef x)
3715 {
3716    LLVMValueRef mask;
3717    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3718
3719    assert(bld->type.floating);
3720    assert(lp_check_value(bld->type, x));
3721
3722    mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3723                         "isnotnan");
3724    mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3725    mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3726    return mask;
3727 }
3728
3729 /* Returns all 1's for floating point numbers that are
3730  * finite numbers and returns all zeros for -inf,
3731  * inf and nan's */
3732 LLVMValueRef
3733 lp_build_isfinite(struct lp_build_context *bld,
3734                   LLVMValueRef x)
3735 {
3736    LLVMBuilderRef builder = bld->gallivm->builder;
3737    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3738    struct lp_type int_type = lp_int_type(bld->type);
3739    LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3740    LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3741                                                     0x7f800000);
3742
3743    if (!bld->type.floating) {
3744       return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3745    }
3746    assert(bld->type.floating);
3747    assert(lp_check_value(bld->type, x));
3748    assert(bld->type.width == 32);
3749
3750    intx = LLVMBuildAnd(builder, intx, infornan32, "");
3751    return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3752                            intx, infornan32);
3753 }
3754
3755 /*
3756  * Returns true if the number is nan or inf and false otherwise.
3757  * The input has to be a floating point vector.
3758  */
3759 LLVMValueRef
3760 lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
3761                        const struct lp_type type,
3762                        LLVMValueRef x)
3763 {
3764    LLVMBuilderRef builder = gallivm->builder;
3765    struct lp_type int_type = lp_int_type(type);
3766    LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
3767                                                 0x7f800000);
3768    LLVMValueRef ret;
3769
3770    assert(type.floating);
3771
3772    ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
3773    ret = LLVMBuildAnd(builder, ret, const0, "");
3774    ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
3775                           ret, const0);
3776
3777    return ret;
3778 }
3779
3780
3781 LLVMValueRef
3782 lp_build_fpstate_get(struct gallivm_state *gallivm)
3783 {
3784    if (util_cpu_caps.has_sse) {
3785       LLVMBuilderRef builder = gallivm->builder;
3786       LLVMValueRef mxcsr_ptr = lp_build_alloca(
3787          gallivm,
3788          LLVMInt32TypeInContext(gallivm->context),
3789          "mxcsr_ptr");
3790       LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
3791           LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3792       lp_build_intrinsic(builder,
3793                          "llvm.x86.sse.stmxcsr",
3794                          LLVMVoidTypeInContext(gallivm->context),
3795                          &mxcsr_ptr8, 1, 0);
3796       return mxcsr_ptr;
3797    }
3798    return 0;
3799 }
3800
3801 void
3802 lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
3803                                   boolean zero)
3804 {
3805    if (util_cpu_caps.has_sse) {
3806       /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
3807       int daz_ftz = _MM_FLUSH_ZERO_MASK;
3808
3809       LLVMBuilderRef builder = gallivm->builder;
3810       LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
3811       LLVMValueRef mxcsr =
3812          LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");
3813
3814       if (util_cpu_caps.has_daz) {
3815          /* Enable denormals are zero mode */
3816          daz_ftz |= _MM_DENORMALS_ZERO_MASK;
3817       }
3818       if (zero) {
3819          mxcsr = LLVMBuildOr(builder, mxcsr,
3820                              LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
3821       } else {
3822          mxcsr = LLVMBuildAnd(builder, mxcsr,
3823                               LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
3824       }
3825
3826       LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
3827       lp_build_fpstate_set(gallivm, mxcsr_ptr);
3828    }
3829 }
3830
3831 void
3832 lp_build_fpstate_set(struct gallivm_state *gallivm,
3833                      LLVMValueRef mxcsr_ptr)
3834 {
3835    if (util_cpu_caps.has_sse) {
3836       LLVMBuilderRef builder = gallivm->builder;
3837       mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
3838                      LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3839       lp_build_intrinsic(builder,
3840                          "llvm.x86.sse.ldmxcsr",
3841                          LLVMVoidTypeInContext(gallivm->context),
3842                          &mxcsr_ptr, 1, 0);
3843    }
3844 }