util: Move util_is_power_of_two to bitscan.h and rename to util_is_power_of_two_or_zero
[mesa.git] src/gallium/auxiliary/gallivm/lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in the [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include <float.h>
49
50 #include "util/u_memory.h"
51 #include "util/u_debug.h"
52 #include "util/u_math.h"
53 #include "util/u_cpu_detect.h"
54
55 #include "lp_bld_type.h"
56 #include "lp_bld_const.h"
57 #include "lp_bld_init.h"
58 #include "lp_bld_intr.h"
59 #include "lp_bld_logic.h"
60 #include "lp_bld_pack.h"
61 #include "lp_bld_debug.h"
62 #include "lp_bld_bitarit.h"
63 #include "lp_bld_arit.h"
64 #include "lp_bld_flow.h"
65
66 #if defined(PIPE_ARCH_SSE)
67 #include <xmmintrin.h>
68 #endif
69
70 #ifndef _MM_DENORMALS_ZERO_MASK
71 #define _MM_DENORMALS_ZERO_MASK 0x0040
72 #endif
73
74 #ifndef _MM_FLUSH_ZERO_MASK
75 #define _MM_FLUSH_ZERO_MASK 0x8000
76 #endif
77
78 #define EXP_POLY_DEGREE 5
79
80 #define LOG_POLY_DEGREE 4
81
82
83 /**
84 * Generate min(a, b)
85 * No checks for special-case values of a or b (such as 0 or 1) are done.
86 * NaNs are handled according to the behavior specified by the
87 * nan_behavior argument.
88 */
89 static LLVMValueRef
90 lp_build_min_simple(struct lp_build_context *bld,
91 LLVMValueRef a,
92 LLVMValueRef b,
93 enum gallivm_nan_behavior nan_behavior)
94 {
95 const struct lp_type type = bld->type;
96 const char *intrinsic = NULL;
97 unsigned intr_size = 0;
98 LLVMValueRef cond;
99
100 assert(lp_check_value(type, a));
101 assert(lp_check_value(type, b));
102
103 /* TODO: optimize the constant case */
104
105 if (type.floating && util_cpu_caps.has_sse) {
106 if (type.width == 32) {
107 if (type.length == 1) {
108 intrinsic = "llvm.x86.sse.min.ss";
109 intr_size = 128;
110 }
111 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
112 intrinsic = "llvm.x86.sse.min.ps";
113 intr_size = 128;
114 }
115 else {
116 intrinsic = "llvm.x86.avx.min.ps.256";
117 intr_size = 256;
118 }
119 }
120 if (type.width == 64 && util_cpu_caps.has_sse2) {
121 if (type.length == 1) {
122 intrinsic = "llvm.x86.sse2.min.sd";
123 intr_size = 128;
124 }
125 else if (type.length == 2 || !util_cpu_caps.has_avx) {
126 intrinsic = "llvm.x86.sse2.min.pd";
127 intr_size = 128;
128 }
129 else {
130 intrinsic = "llvm.x86.avx.min.pd.256";
131 intr_size = 256;
132 }
133 }
134 }
135 else if (type.floating && util_cpu_caps.has_altivec) {
136 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
137 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
138 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
139 __FUNCTION__);
140 }
141 if (type.width == 32 && type.length == 4) {
142 intrinsic = "llvm.ppc.altivec.vminfp";
143 intr_size = 128;
144 }
145 } else if (HAVE_LLVM < 0x0309 &&
146 util_cpu_caps.has_avx2 && type.length > 4) {
147 intr_size = 256;
148 switch (type.width) {
149 case 8:
150 intrinsic = type.sign ? "llvm.x86.avx2.pmins.b" : "llvm.x86.avx2.pminu.b";
151 break;
152 case 16:
153 intrinsic = type.sign ? "llvm.x86.avx2.pmins.w" : "llvm.x86.avx2.pminu.w";
154 break;
155 case 32:
156 intrinsic = type.sign ? "llvm.x86.avx2.pmins.d" : "llvm.x86.avx2.pminu.d";
157 break;
158 }
159 } else if (HAVE_LLVM < 0x0309 &&
160 util_cpu_caps.has_sse2 && type.length >= 2) {
161 intr_size = 128;
162 if ((type.width == 8 || type.width == 16) &&
163 (type.width * type.length <= 64) &&
164 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
165 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
166 __FUNCTION__);
167 }
168 if (type.width == 8 && !type.sign) {
169 intrinsic = "llvm.x86.sse2.pminu.b";
170 }
171 else if (type.width == 16 && type.sign) {
172 intrinsic = "llvm.x86.sse2.pmins.w";
173 }
174 if (util_cpu_caps.has_sse4_1) {
175 if (type.width == 8 && type.sign) {
176 intrinsic = "llvm.x86.sse41.pminsb";
177 }
178 if (type.width == 16 && !type.sign) {
179 intrinsic = "llvm.x86.sse41.pminuw";
180 }
181 if (type.width == 32 && !type.sign) {
182 intrinsic = "llvm.x86.sse41.pminud";
183 }
184 if (type.width == 32 && type.sign) {
185 intrinsic = "llvm.x86.sse41.pminsd";
186 }
187 }
188 } else if (util_cpu_caps.has_altivec) {
189 intr_size = 128;
190 if (type.width == 8) {
191 if (!type.sign) {
192 intrinsic = "llvm.ppc.altivec.vminub";
193 } else {
194 intrinsic = "llvm.ppc.altivec.vminsb";
195 }
196 } else if (type.width == 16) {
197 if (!type.sign) {
198 intrinsic = "llvm.ppc.altivec.vminuh";
199 } else {
200 intrinsic = "llvm.ppc.altivec.vminsh";
201 }
202 } else if (type.width == 32) {
203 if (!type.sign) {
204 intrinsic = "llvm.ppc.altivec.vminuw";
205 } else {
206 intrinsic = "llvm.ppc.altivec.vminsw";
207 }
208 }
209 }
210
211 if (intrinsic) {
212 /* We need to handle NaNs for floating point numbers. If one of the
213 * inputs is NaN the other should be returned (required by both D3D10+
214 * and OpenCL).
215 * The SSE intrinsics return the second operand in case of NaN by
216 * default, so we need special code to handle those.
217 */
218 if (util_cpu_caps.has_sse && type.floating &&
219 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
220 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
221 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
222 LLVMValueRef isnan, min;
223 min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
224 type,
225 intr_size, a, b);
226 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
227 isnan = lp_build_isnan(bld, b);
228 return lp_build_select(bld, isnan, a, min);
229 } else {
230 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
231 isnan = lp_build_isnan(bld, a);
232 return lp_build_select(bld, isnan, a, min);
233 }
234 } else {
235 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
236 type,
237 intr_size, a, b);
238 }
239 }
240
241 if (type.floating) {
242 switch (nan_behavior) {
243 case GALLIVM_NAN_RETURN_NAN: {
244 LLVMValueRef isnan = lp_build_isnan(bld, b);
245 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
246 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
247 return lp_build_select(bld, cond, a, b);
248 }
249 break;
250 case GALLIVM_NAN_RETURN_OTHER: {
251 LLVMValueRef isnan = lp_build_isnan(bld, a);
252 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
253 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
254 return lp_build_select(bld, cond, a, b);
255 }
256 break;
257 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
258 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
259 return lp_build_select(bld, cond, a, b);
260 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
261 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
262 return lp_build_select(bld, cond, b, a);
263 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
264 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
265 return lp_build_select(bld, cond, a, b);
266 break;
267 default:
268 assert(0);
269 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
270 return lp_build_select(bld, cond, a, b);
271 }
272 } else {
273 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
274 return lp_build_select(bld, cond, a, b);
275 }
276 }
277
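/*
 * Illustrative scalar sketch (not part of gallivm; names are hypothetical):
 * how the select in lp_build_min_simple above maps the SSE semantics (MINPS
 * returns the second operand when either input is NaN) onto the
 * GALLIVM_NAN_RETURN_OTHER behavior required by D3D10 and OpenCL.
 */
static inline float
example_min_return_other(float a, float b)
{
   /* mimics MINPS: any comparison involving NaN is false, so b wins */
   float sse_min = (a < b) ? a : b;
   /* if b is NaN, override the intrinsic result with the other operand */
   return (b != b) ? a : sse_min;
}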
278
279 LLVMValueRef
280 lp_build_fmuladd(LLVMBuilderRef builder,
281 LLVMValueRef a,
282 LLVMValueRef b,
283 LLVMValueRef c)
284 {
285 LLVMTypeRef type = LLVMTypeOf(a);
286 assert(type == LLVMTypeOf(b));
287 assert(type == LLVMTypeOf(c));
288 if (HAVE_LLVM < 0x0304) {
289 /* XXX: LLVM 3.3 does not break down llvm.fmuladd into mul+add when FMA is
290 * not supported, and instead falls back to a C function.
291 */
292 return LLVMBuildFAdd(builder, LLVMBuildFMul(builder, a, b, ""), c, "");
293 }
294 char intrinsic[32];
295 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
296 LLVMValueRef args[] = { a, b, c };
297 return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0);
298 }
299
300
301 /**
302 * Generate max(a, b)
303 * No checks for special-case values of a or b (such as 0 or 1) are done.
304 * NaNs are handled according to the behavior specified by the
305 * nan_behavior argument.
306 */
307 static LLVMValueRef
308 lp_build_max_simple(struct lp_build_context *bld,
309 LLVMValueRef a,
310 LLVMValueRef b,
311 enum gallivm_nan_behavior nan_behavior)
312 {
313 const struct lp_type type = bld->type;
314 const char *intrinsic = NULL;
315 unsigned intr_size = 0;
316 LLVMValueRef cond;
317
318 assert(lp_check_value(type, a));
319 assert(lp_check_value(type, b));
320
321 /* TODO: optimize the constant case */
322
323 if (type.floating && util_cpu_caps.has_sse) {
324 if (type.width == 32) {
325 if (type.length == 1) {
326 intrinsic = "llvm.x86.sse.max.ss";
327 intr_size = 128;
328 }
329 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
330 intrinsic = "llvm.x86.sse.max.ps";
331 intr_size = 128;
332 }
333 else {
334 intrinsic = "llvm.x86.avx.max.ps.256";
335 intr_size = 256;
336 }
337 }
338 if (type.width == 64 && util_cpu_caps.has_sse2) {
339 if (type.length == 1) {
340 intrinsic = "llvm.x86.sse2.max.sd";
341 intr_size = 128;
342 }
343 else if (type.length == 2 || !util_cpu_caps.has_avx) {
344 intrinsic = "llvm.x86.sse2.max.pd";
345 intr_size = 128;
346 }
347 else {
348 intrinsic = "llvm.x86.avx.max.pd.256";
349 intr_size = 256;
350 }
351 }
352 }
353 else if (type.floating && util_cpu_caps.has_altivec) {
354 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
355 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
356 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
357 __FUNCTION__);
358 }
359 if (type.width == 32 && type.length == 4) {
360 intrinsic = "llvm.ppc.altivec.vmaxfp";
361 intr_size = 128;
362 }
363 } else if (HAVE_LLVM < 0x0309 &&
364 util_cpu_caps.has_avx2 && type.length > 4) {
365 intr_size = 256;
366 switch (type.width) {
367 case 8:
368 intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.b" : "llvm.x86.avx2.pmaxu.b";
369 break;
370 case 16:
371 intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.w" : "llvm.x86.avx2.pmaxu.w";
372 break;
373 case 32:
374 intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.d" : "llvm.x86.avx2.pmaxu.d";
375 break;
376 }
377 } else if (HAVE_LLVM < 0x0309 &&
378 util_cpu_caps.has_sse2 && type.length >= 2) {
379 intr_size = 128;
380 if ((type.width == 8 || type.width == 16) &&
381 (type.width * type.length <= 64) &&
382 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
383 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
384 __FUNCTION__);
385 }
386 if (type.width == 8 && !type.sign) {
387 intrinsic = "llvm.x86.sse2.pmaxu.b";
388 intr_size = 128;
389 }
390 else if (type.width == 16 && type.sign) {
391 intrinsic = "llvm.x86.sse2.pmaxs.w";
392 }
393 if (util_cpu_caps.has_sse4_1) {
394 if (type.width == 8 && type.sign) {
395 intrinsic = "llvm.x86.sse41.pmaxsb";
396 }
397 if (type.width == 16 && !type.sign) {
398 intrinsic = "llvm.x86.sse41.pmaxuw";
399 }
400 if (type.width == 32 && !type.sign) {
401 intrinsic = "llvm.x86.sse41.pmaxud";
402 }
403 if (type.width == 32 && type.sign) {
404 intrinsic = "llvm.x86.sse41.pmaxsd";
405 }
406 }
407 } else if (util_cpu_caps.has_altivec) {
408 intr_size = 128;
409 if (type.width == 8) {
410 if (!type.sign) {
411 intrinsic = "llvm.ppc.altivec.vmaxub";
412 } else {
413 intrinsic = "llvm.ppc.altivec.vmaxsb";
414 }
415 } else if (type.width == 16) {
416 if (!type.sign) {
417 intrinsic = "llvm.ppc.altivec.vmaxuh";
418 } else {
419 intrinsic = "llvm.ppc.altivec.vmaxsh";
420 }
421 } else if (type.width == 32) {
422 if (!type.sign) {
423 intrinsic = "llvm.ppc.altivec.vmaxuw";
424 } else {
425 intrinsic = "llvm.ppc.altivec.vmaxsw";
426 }
427 }
428 }
429
430 if (intrinsic) {
431 if (util_cpu_caps.has_sse && type.floating &&
432 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
433 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
434 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
435 LLVMValueRef isnan, max;
436 max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
437 type,
438 intr_size, a, b);
439 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
440 isnan = lp_build_isnan(bld, b);
441 return lp_build_select(bld, isnan, a, max);
442 } else {
443 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
444 isnan = lp_build_isnan(bld, a);
445 return lp_build_select(bld, isnan, a, max);
446 }
447 } else {
448 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
449 type,
450 intr_size, a, b);
451 }
452 }
453
454 if (type.floating) {
455 switch (nan_behavior) {
456 case GALLIVM_NAN_RETURN_NAN: {
457 LLVMValueRef isnan = lp_build_isnan(bld, b);
458 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
459 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
460 return lp_build_select(bld, cond, a, b);
461 }
462 break;
463 case GALLIVM_NAN_RETURN_OTHER: {
464 LLVMValueRef isnan = lp_build_isnan(bld, a);
465 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
466 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
467 return lp_build_select(bld, cond, a, b);
468 }
469 break;
470 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
471 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
472 return lp_build_select(bld, cond, a, b);
473 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
474 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
475 return lp_build_select(bld, cond, b, a);
476 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
477 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
478 return lp_build_select(bld, cond, a, b);
479 break;
480 default:
481 assert(0);
482 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
483 return lp_build_select(bld, cond, a, b);
484 }
485 } else {
486 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
487 return lp_build_select(bld, cond, a, b);
488 }
489 }
490
491
492 /**
493 * Generate 1 - a, or ~a depending on bld->type.
494 */
495 LLVMValueRef
496 lp_build_comp(struct lp_build_context *bld,
497 LLVMValueRef a)
498 {
499 LLVMBuilderRef builder = bld->gallivm->builder;
500 const struct lp_type type = bld->type;
501
502 assert(lp_check_value(type, a));
503
504 if(a == bld->one)
505 return bld->zero;
506 if(a == bld->zero)
507 return bld->one;
508
509 if(type.norm && !type.floating && !type.fixed && !type.sign) {
510 if(LLVMIsConstant(a))
511 return LLVMConstNot(a);
512 else
513 return LLVMBuildNot(builder, a, "");
514 }
515
516 if(LLVMIsConstant(a))
517 if (type.floating)
518 return LLVMConstFSub(bld->one, a);
519 else
520 return LLVMConstSub(bld->one, a);
521 else
522 if (type.floating)
523 return LLVMBuildFSub(builder, bld->one, a, "");
524 else
525 return LLVMBuildSub(builder, bld->one, a, "");
526 }
527
528
529 /**
530 * Generate a + b
531 */
532 LLVMValueRef
533 lp_build_add(struct lp_build_context *bld,
534 LLVMValueRef a,
535 LLVMValueRef b)
536 {
537 LLVMBuilderRef builder = bld->gallivm->builder;
538 const struct lp_type type = bld->type;
539 LLVMValueRef res;
540
541 assert(lp_check_value(type, a));
542 assert(lp_check_value(type, b));
543
544 if (a == bld->zero)
545 return b;
546 if (b == bld->zero)
547 return a;
548 if (a == bld->undef || b == bld->undef)
549 return bld->undef;
550
551 if (type.norm) {
552 const char *intrinsic = NULL;
553
554 if (!type.sign && (a == bld->one || b == bld->one))
555 return bld->one;
556
557 if (!type.floating && !type.fixed) {
558 if (type.width * type.length == 128) {
559 if (util_cpu_caps.has_sse2) {
560 if (type.width == 8)
561 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
562 if (type.width == 16)
563 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
564 } else if (util_cpu_caps.has_altivec) {
565 if (type.width == 8)
566 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
567 if (type.width == 16)
568 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
569 }
570 }
571 if (type.width * type.length == 256) {
572 if (util_cpu_caps.has_avx2) {
573 if (type.width == 8)
574 intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b";
575 if (type.width == 16)
576 intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : "llvm.x86.avx2.paddus.w";
577 }
578 }
579 }
580
581 if (intrinsic)
582 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
583 }
584
585 if(type.norm && !type.floating && !type.fixed) {
586 if (type.sign) {
587 uint64_t sign = (uint64_t)1 << (type.width - 1);
588 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
589 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
590 /* a_clamp_max is the maximum a for positive b,
591 a_clamp_min is the minimum a for negative b. */
592 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
593 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
594 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
595 } else {
596 a = lp_build_min_simple(bld, a, lp_build_comp(bld, b), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
597 }
598 }
599
600 if(LLVMIsConstant(a) && LLVMIsConstant(b))
601 if (type.floating)
602 res = LLVMConstFAdd(a, b);
603 else
604 res = LLVMConstAdd(a, b);
605 else
606 if (type.floating)
607 res = LLVMBuildFAdd(builder, a, b, "");
608 else
609 res = LLVMBuildAdd(builder, a, b, "");
610
611 /* clamp to ceiling of 1.0 */
612 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
613 res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
614
615 /* XXX clamp to floor of -1 or 0??? */
616
617 return res;
618 }
619
620
621 /** Return the scalar sum of the elements of a.
622 * Callers should avoid this operation whenever possible.
623 */
624 LLVMValueRef
625 lp_build_horizontal_add(struct lp_build_context *bld,
626 LLVMValueRef a)
627 {
628 LLVMBuilderRef builder = bld->gallivm->builder;
629 const struct lp_type type = bld->type;
630 LLVMValueRef index, res;
631 unsigned i, length;
632 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
633 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
634 LLVMValueRef vecres, elem2;
635
636 assert(lp_check_value(type, a));
637
638 if (type.length == 1) {
639 return a;
640 }
641
642 assert(!bld->type.norm);
643
644 /*
645 * For byte vectors this could do much better with psadbw.
646 * Using repeated shuffle/adds here. Note that with multiple vectors
647 * this can be done more efficiently as outlined in the Intel
648 * Optimization Manual.
649 * Note: could cause data rearrangement if used with smaller element
650 * sizes.
651 */
652
653 vecres = a;
654 length = type.length / 2;
655 while (length > 1) {
656 LLVMValueRef vec1, vec2;
657 for (i = 0; i < length; i++) {
658 shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
659 shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
660 }
661 vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
662 LLVMConstVector(shuffles1, length), "");
663 vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
664 LLVMConstVector(shuffles2, length), "");
665 if (type.floating) {
666 vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
667 }
668 else {
669 vecres = LLVMBuildAdd(builder, vec1, vec2, "");
670 }
671 length = length >> 1;
672 }
673
674 /* always have vector of size 2 here */
675 assert(length == 1);
676
677 index = lp_build_const_int32(bld->gallivm, 0);
678 res = LLVMBuildExtractElement(builder, vecres, index, "");
679 index = lp_build_const_int32(bld->gallivm, 1);
680 elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
681
682 if (type.floating)
683 res = LLVMBuildFAdd(builder, res, elem2, "");
684 else
685 res = LLVMBuildAdd(builder, res, elem2, "");
686
687 return res;
688 }
689
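/*
 * Illustrative scalar sketch (not part of gallivm; names are hypothetical):
 * the shuffle/add loop above is a log2(length) pairwise reduction, i.e. it
 * repeatedly adds the upper half of the vector onto the lower half.
 * Assumes length is a power of two; destroys its input.
 */
static inline float
example_horizontal_add(float v[], unsigned length)
{
   for (unsigned n = length / 2; n >= 1; n /= 2) {
      for (unsigned i = 0; i < n; i++)
         v[i] += v[i + n];      /* add upper half onto lower half */
   }
   return v[0];                 /* the scalar sum ends up in element 0 */
}
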
690 /**
691 * Return the horizontal sums of 4 float vectors as a float4 vector.
692 * This uses the technique outlined in the Intel Optimization Manual.
693 */
694 static LLVMValueRef
695 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
696 LLVMValueRef src[4])
697 {
698 struct gallivm_state *gallivm = bld->gallivm;
699 LLVMBuilderRef builder = gallivm->builder;
700 LLVMValueRef shuffles[4];
701 LLVMValueRef tmp[4];
702 LLVMValueRef sumtmp[2], shuftmp[2];
703
704 /* lower half of regs */
705 shuffles[0] = lp_build_const_int32(gallivm, 0);
706 shuffles[1] = lp_build_const_int32(gallivm, 1);
707 shuffles[2] = lp_build_const_int32(gallivm, 4);
708 shuffles[3] = lp_build_const_int32(gallivm, 5);
709 tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
710 LLVMConstVector(shuffles, 4), "");
711 tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
712 LLVMConstVector(shuffles, 4), "");
713
714 /* upper half of regs */
715 shuffles[0] = lp_build_const_int32(gallivm, 2);
716 shuffles[1] = lp_build_const_int32(gallivm, 3);
717 shuffles[2] = lp_build_const_int32(gallivm, 6);
718 shuffles[3] = lp_build_const_int32(gallivm, 7);
719 tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
720 LLVMConstVector(shuffles, 4), "");
721 tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
722 LLVMConstVector(shuffles, 4), "");
723
724 sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
725 sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
726
727 shuffles[0] = lp_build_const_int32(gallivm, 0);
728 shuffles[1] = lp_build_const_int32(gallivm, 2);
729 shuffles[2] = lp_build_const_int32(gallivm, 4);
730 shuffles[3] = lp_build_const_int32(gallivm, 6);
731 shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
732 LLVMConstVector(shuffles, 4), "");
733
734 shuffles[0] = lp_build_const_int32(gallivm, 1);
735 shuffles[1] = lp_build_const_int32(gallivm, 3);
736 shuffles[2] = lp_build_const_int32(gallivm, 5);
737 shuffles[3] = lp_build_const_int32(gallivm, 7);
738 shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
739 LLVMConstVector(shuffles, 4), "");
740
741 return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
742 }
743
744
745 /*
746 * Partially horizontally add 2-4 float vectors of length nx4,
747 * i.e. only four adjacent values in each vector will be added,
748 * assuming values are really grouped in 4 which also determines
749 * output order.
750 *
751 * Return a vector of the same length as the initial vectors,
752 * with the excess elements (if any) being undefined.
753 * The element order is independent of number of input vectors.
754 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
755 * the output order thus will be
756 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
757 */
758 LLVMValueRef
759 lp_build_hadd_partial4(struct lp_build_context *bld,
760 LLVMValueRef vectors[],
761 unsigned num_vecs)
762 {
763 struct gallivm_state *gallivm = bld->gallivm;
764 LLVMBuilderRef builder = gallivm->builder;
765 LLVMValueRef ret_vec;
766 LLVMValueRef tmp[4];
767 const char *intrinsic = NULL;
768
769 assert(num_vecs >= 2 && num_vecs <= 4);
770 assert(bld->type.floating);
771
772 /* only use this with at least 2 vectors, as it is sort of expensive
773 * (depending on cpu) and we always need two horizontal adds anyway,
774 * so a shuffle/add approach might be better.
775 */
776
777 tmp[0] = vectors[0];
778 tmp[1] = vectors[1];
779
780 tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
781 tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
782
783 if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
784 bld->type.length == 4) {
785 intrinsic = "llvm.x86.sse3.hadd.ps";
786 }
787 else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
788 bld->type.length == 8) {
789 intrinsic = "llvm.x86.avx.hadd.ps.256";
790 }
791 if (intrinsic) {
792 tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
793 lp_build_vec_type(gallivm, bld->type),
794 tmp[0], tmp[1]);
795 if (num_vecs > 2) {
796 tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
797 lp_build_vec_type(gallivm, bld->type),
798 tmp[2], tmp[3]);
799 }
800 else {
801 tmp[1] = tmp[0];
802 }
803 return lp_build_intrinsic_binary(builder, intrinsic,
804 lp_build_vec_type(gallivm, bld->type),
805 tmp[0], tmp[1]);
806 }
807
808 if (bld->type.length == 4) {
809 ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
810 }
811 else {
812 LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
813 unsigned j;
814 unsigned num_iter = bld->type.length / 4;
815 struct lp_type parttype = bld->type;
816 parttype.length = 4;
817 for (j = 0; j < num_iter; j++) {
818 LLVMValueRef partsrc[4];
819 unsigned i;
820 for (i = 0; i < 4; i++) {
821 partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
822 }
823 partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
824 }
825 ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
826 }
827 return ret_vec;
828 }
829
830 /**
831 * Generate a - b
832 */
833 LLVMValueRef
834 lp_build_sub(struct lp_build_context *bld,
835 LLVMValueRef a,
836 LLVMValueRef b)
837 {
838 LLVMBuilderRef builder = bld->gallivm->builder;
839 const struct lp_type type = bld->type;
840 LLVMValueRef res;
841
842 assert(lp_check_value(type, a));
843 assert(lp_check_value(type, b));
844
845 if (b == bld->zero)
846 return a;
847 if (a == bld->undef || b == bld->undef)
848 return bld->undef;
849 if (a == b)
850 return bld->zero;
851
852 if (type.norm) {
853 const char *intrinsic = NULL;
854
855 if (!type.sign && b == bld->one)
856 return bld->zero;
857
858 if (!type.floating && !type.fixed) {
859 if (type.width * type.length == 128) {
860 if (util_cpu_caps.has_sse2) {
861 if (type.width == 8)
862 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
863 if (type.width == 16)
864 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
865 } else if (util_cpu_caps.has_altivec) {
866 if (type.width == 8)
867 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
868 if (type.width == 16)
869 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
870 }
871 }
872 if (type.width * type.length == 256) {
873 if (util_cpu_caps.has_avx2) {
874 if (type.width == 8)
875 intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b";
876 if (type.width == 16)
877 intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" : "llvm.x86.avx2.psubus.w";
878 }
879 }
880 }
881
882 if (intrinsic)
883 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
884 }
885
886 if(type.norm && !type.floating && !type.fixed) {
887 if (type.sign) {
888 uint64_t sign = (uint64_t)1 << (type.width - 1);
889 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
890 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
891 /* a_clamp_max is the maximum a for negative b,
892 a_clamp_min is the minimum a for positive b. */
893 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
894 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
895 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
896 } else {
897 a = lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
898 }
899 }
900
901 if(LLVMIsConstant(a) && LLVMIsConstant(b))
902 if (type.floating)
903 res = LLVMConstFSub(a, b);
904 else
905 res = LLVMConstSub(a, b);
906 else
907 if (type.floating)
908 res = LLVMBuildFSub(builder, a, b, "");
909 else
910 res = LLVMBuildSub(builder, a, b, "");
911
912 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
913 res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
914
915 return res;
916 }
917
918
919
920 /**
921 * Normalized multiplication.
922 *
923 * There are several approaches (using 8-bit normalized multiplication as
924 * an example):
925 *
926 * - alpha plus one
927 *
928 * makes the following approximation to the division (Sree)
929 *
930 * a*b/255 ~= (a*(b + 1)) >> 8
931 *
932 * which is the fastest method that satisfies the following OpenGL criteria of
933 *
934 * 0*0 = 0 and 255*255 = 255
935 *
936 * - geometric series
937 *
938 * takes the geometric series approximation to the division
939 *
940 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
941 *
942 * in this case just the first two terms are used, to fit in 16-bit arithmetic
943 *
944 * t/255 ~= (t + (t >> 8)) >> 8
945 *
946 * note that by itself it doesn't satisfy the OpenGL criteria, as it gives
947 * 255*255 = 254, so the special case b = 255 must be accounted for, or
948 * rounding must be used.
949 *
950 * - geometric series plus rounding
951 *
952 * when using the geometric series division, instead of truncating the result,
953 * use rounding in the approximation (Jim Blinn)
954 *
955 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
956 *
957 * achieving exact results.
958 *
959 *
960 *
961 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
962 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
963 * @sa Michael Herf, The "double blend trick", May 2000,
964 * http://www.stereopsis.com/doubleblend.html
965 */
966 LLVMValueRef
967 lp_build_mul_norm(struct gallivm_state *gallivm,
968 struct lp_type wide_type,
969 LLVMValueRef a, LLVMValueRef b)
970 {
971 LLVMBuilderRef builder = gallivm->builder;
972 struct lp_build_context bld;
973 unsigned n;
974 LLVMValueRef half;
975 LLVMValueRef ab;
976
977 assert(!wide_type.floating);
978 assert(lp_check_value(wide_type, a));
979 assert(lp_check_value(wide_type, b));
980
981 lp_build_context_init(&bld, gallivm, wide_type);
982
983 n = wide_type.width / 2;
984 if (wide_type.sign) {
985 --n;
986 }
987
988 /*
989 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
990 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
991 */
992
993 /*
994 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
995 */
996
997 ab = LLVMBuildMul(builder, a, b, "");
998 ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
999
1000 /*
1001 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
1002 */
1003
1004 half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
1005 if (wide_type.sign) {
1006 LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
1007 LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
1008 half = lp_build_select(&bld, sign, minus_half, half);
1009 }
1010 ab = LLVMBuildAdd(builder, ab, half, "");
1011
1012 /* Final division */
1013 ab = lp_build_shr_imm(&bld, ab, n);
1014
1015 return ab;
1016 }
1017
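/*
 * Illustrative scalar sketch (not part of gallivm; names are hypothetical):
 * the unsigned 8-bit case of lp_build_mul_norm above, i.e.
 * a*b/255 ~= (t + (t >> 8) + 0x80) >> 8 with t = a*b, which meets the
 * 0*0 = 0 and 255*255 = 255 requirements.
 */
static inline unsigned
example_mul_norm_u8(unsigned a, unsigned b)
{
   unsigned t = a * b;       /* 16-bit intermediate product */
   t += t >> 8;              /* first geometric series term */
   t += 0x80;                /* rounding */
   return t >> 8;            /* final division by 2**8 */
}
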
1018 /**
1019 * Generate a * b
1020 */
1021 LLVMValueRef
1022 lp_build_mul(struct lp_build_context *bld,
1023 LLVMValueRef a,
1024 LLVMValueRef b)
1025 {
1026 LLVMBuilderRef builder = bld->gallivm->builder;
1027 const struct lp_type type = bld->type;
1028 LLVMValueRef shift;
1029 LLVMValueRef res;
1030
1031 assert(lp_check_value(type, a));
1032 assert(lp_check_value(type, b));
1033
1034 if(a == bld->zero)
1035 return bld->zero;
1036 if(a == bld->one)
1037 return b;
1038 if(b == bld->zero)
1039 return bld->zero;
1040 if(b == bld->one)
1041 return a;
1042 if(a == bld->undef || b == bld->undef)
1043 return bld->undef;
1044
1045 if (!type.floating && !type.fixed && type.norm) {
1046 struct lp_type wide_type = lp_wider_type(type);
1047 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
1048
1049 lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah);
1050 lp_build_unpack2_native(bld->gallivm, type, wide_type, b, &bl, &bh);
1051
1052 /* PMULLW, PSRLW, PADDW */
1053 abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
1054 abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
1055
1056 ab = lp_build_pack2_native(bld->gallivm, wide_type, type, abl, abh);
1057
1058 return ab;
1059 }
1060
1061 if(type.fixed)
1062 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
1063 else
1064 shift = NULL;
1065
1066 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1067 if (type.floating)
1068 res = LLVMConstFMul(a, b);
1069 else
1070 res = LLVMConstMul(a, b);
1071 if(shift) {
1072 if(type.sign)
1073 res = LLVMConstAShr(res, shift);
1074 else
1075 res = LLVMConstLShr(res, shift);
1076 }
1077 }
1078 else {
1079 if (type.floating)
1080 res = LLVMBuildFMul(builder, a, b, "");
1081 else
1082 res = LLVMBuildMul(builder, a, b, "");
1083 if(shift) {
1084 if(type.sign)
1085 res = LLVMBuildAShr(builder, res, shift, "");
1086 else
1087 res = LLVMBuildLShr(builder, res, shift, "");
1088 }
1089 }
1090
1091 return res;
1092 }
1093
1094 /*
1095 * Widening mul, valid for 32x32 bit -> 64bit only.
1096 * Result is low 32bits, high bits returned in res_hi.
1097 *
1098 * Emits code that is meant to be compiled for the host CPU.
1099 */
1100 LLVMValueRef
1101 lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
1102 LLVMValueRef a,
1103 LLVMValueRef b,
1104 LLVMValueRef *res_hi)
1105 {
1106 struct gallivm_state *gallivm = bld->gallivm;
1107 LLVMBuilderRef builder = gallivm->builder;
1108
1109 assert(bld->type.width == 32);
1110 assert(bld->type.floating == 0);
1111 assert(bld->type.fixed == 0);
1112 assert(bld->type.norm == 0);
1113
1114 /*
1115 * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces
1116 * for x86 simd is atrocious (even if the high bits weren't required),
1117 * trying to handle real 64bit inputs (which of course can't happen due
1118 * to using 64bit umul with 32bit numbers zero-extended to 64bit, but
1119 * apparently llvm does not recognize this widening mul). This includes 6
1120 * (instead of 2) pmuludq plus extra adds and shifts.
1121 * The same story applies to signed mul, albeit fixing this requires sse41.
1122 * https://llvm.org/bugs/show_bug.cgi?id=30845
1123 * So, whip up our own code, albeit only for length 4 and 8 (which
1124 * should be good enough)...
1125 */
1126 if ((bld->type.length == 4 || bld->type.length == 8) &&
1127 ((util_cpu_caps.has_sse2 && (bld->type.sign == 0)) ||
1128 util_cpu_caps.has_sse4_1)) {
1129 const char *intrinsic = NULL;
1130 LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd;
1131 LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec;
1132 struct lp_type type_wide = lp_wider_type(bld->type);
1133 LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide);
1134 unsigned i;
1135 for (i = 0; i < bld->type.length; i += 2) {
1136 shuf[i] = lp_build_const_int32(gallivm, i+1);
1137 shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
1138 }
1139 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1140 aeven = a;
1141 beven = b;
1142 aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, "");
1143 bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, "");
1144
1145 if (util_cpu_caps.has_avx2 && bld->type.length == 8) {
1146 if (bld->type.sign) {
1147 intrinsic = "llvm.x86.avx2.pmul.dq";
1148 } else {
1149 intrinsic = "llvm.x86.avx2.pmulu.dq";
1150 }
1151 muleven = lp_build_intrinsic_binary(builder, intrinsic,
1152 wider_type, aeven, beven);
1153 mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1154 wider_type, aodd, bodd);
1155 }
1156 else {
1157 /* for consistent naming look elsewhere... */
1158 if (bld->type.sign) {
1159 intrinsic = "llvm.x86.sse41.pmuldq";
1160 } else {
1161 intrinsic = "llvm.x86.sse2.pmulu.dq";
1162 }
1163 /*
1164 * XXX If we only have AVX but not AVX2 this is a pain.
1165 * lp_build_intrinsic_binary_anylength() can't handle it
1166 * (due to src and dst type not being identical).
1167 */
1168 if (bld->type.length == 8) {
1169 LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi;
1170 LLVMValueRef aoddlo, aoddhi, boddlo, boddhi;
1171 LLVMValueRef muleven2[2], mulodd2[2];
1172 struct lp_type type_wide_half = type_wide;
1173 LLVMTypeRef wtype_half;
1174 type_wide_half.length = 2;
1175 wtype_half = lp_build_vec_type(gallivm, type_wide_half);
1176 aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4);
1177 aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4);
1178 bevenlo = lp_build_extract_range(gallivm, beven, 0, 4);
1179 bevenhi = lp_build_extract_range(gallivm, beven, 4, 4);
1180 aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4);
1181 aoddhi = lp_build_extract_range(gallivm, aodd, 4, 4);
1182 boddlo = lp_build_extract_range(gallivm, bodd, 0, 4);
1183 boddhi = lp_build_extract_range(gallivm, bodd, 4, 4);
1184 muleven2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1185 wtype_half, aevenlo, bevenlo);
1186 mulodd2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1187 wtype_half, aoddlo, boddlo);
1188 muleven2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1189 wtype_half, aevenhi, bevenhi);
1190 mulodd2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1191 wtype_half, aoddhi, boddhi);
1192 muleven = lp_build_concat(gallivm, muleven2, type_wide_half, 2);
1193 mulodd = lp_build_concat(gallivm, mulodd2, type_wide_half, 2);
1194
1195 }
1196 else {
1197 muleven = lp_build_intrinsic_binary(builder, intrinsic,
1198 wider_type, aeven, beven);
1199 mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1200 wider_type, aodd, bodd);
1201 }
1202 }
1203 muleven = LLVMBuildBitCast(builder, muleven, bld->vec_type, "");
1204 mulodd = LLVMBuildBitCast(builder, mulodd, bld->vec_type, "");
1205
1206 for (i = 0; i < bld->type.length; i += 2) {
1207 shuf[i] = lp_build_const_int32(gallivm, i + 1);
1208 shuf[i+1] = lp_build_const_int32(gallivm, i + 1 + bld->type.length);
1209 }
1210 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1211 *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1212
1213 for (i = 0; i < bld->type.length; i += 2) {
1214 shuf[i] = lp_build_const_int32(gallivm, i);
1215 shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length);
1216 }
1217 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1218 return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1219 }
1220 else {
1221 return lp_build_mul_32_lohi(bld, a, b, res_hi);
1222 }
1223 }
1224
1225
1226 /*
1227 * Widening mul, valid for 32x32 bit -> 64bit only.
1228 * Result is low 32bits, high bits returned in res_hi.
1229 *
1230 * Emits generic code.
1231 */
1232 LLVMValueRef
1233 lp_build_mul_32_lohi(struct lp_build_context *bld,
1234 LLVMValueRef a,
1235 LLVMValueRef b,
1236 LLVMValueRef *res_hi)
1237 {
1238 struct gallivm_state *gallivm = bld->gallivm;
1239 LLVMBuilderRef builder = gallivm->builder;
1240 LLVMValueRef tmp, shift, res_lo;
1241 struct lp_type type_tmp;
1242 LLVMTypeRef wide_type, narrow_type;
1243
1244 type_tmp = bld->type;
1245 narrow_type = lp_build_vec_type(gallivm, type_tmp);
1246 type_tmp.width *= 2;
1247 wide_type = lp_build_vec_type(gallivm, type_tmp);
1248 shift = lp_build_const_vec(gallivm, type_tmp, 32);
1249
1250 if (bld->type.sign) {
1251 a = LLVMBuildSExt(builder, a, wide_type, "");
1252 b = LLVMBuildSExt(builder, b, wide_type, "");
1253 } else {
1254 a = LLVMBuildZExt(builder, a, wide_type, "");
1255 b = LLVMBuildZExt(builder, b, wide_type, "");
1256 }
1257 tmp = LLVMBuildMul(builder, a, b, "");
1258
1259 res_lo = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1260
1261 /* Since we truncate anyway, LShr and AShr are equivalent. */
1262 tmp = LLVMBuildLShr(builder, tmp, shift, "");
1263 *res_hi = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1264
1265 return res_lo;
1266 }
1267
1268
1269 /* a * b + c */
1270 LLVMValueRef
1271 lp_build_mad(struct lp_build_context *bld,
1272 LLVMValueRef a,
1273 LLVMValueRef b,
1274 LLVMValueRef c)
1275 {
1276 const struct lp_type type = bld->type;
1277 if (type.floating) {
1278 return lp_build_fmuladd(bld->gallivm->builder, a, b, c);
1279 } else {
1280 return lp_build_add(bld, lp_build_mul(bld, a, b), c);
1281 }
1282 }
1283
1284
1285 /**
1286 * Small vector x scale multiplication optimization.
1287 */
1288 LLVMValueRef
1289 lp_build_mul_imm(struct lp_build_context *bld,
1290 LLVMValueRef a,
1291 int b)
1292 {
1293 LLVMBuilderRef builder = bld->gallivm->builder;
1294 LLVMValueRef factor;
1295
1296 assert(lp_check_value(bld->type, a));
1297
1298 if(b == 0)
1299 return bld->zero;
1300
1301 if(b == 1)
1302 return a;
1303
1304 if(b == -1)
1305 return lp_build_negate(bld, a);
1306
1307 if(b == 2 && bld->type.floating)
1308 return lp_build_add(bld, a, a);
1309
1310 if(util_is_power_of_two_or_zero(b)) {
1311 unsigned shift = ffs(b) - 1;
1312
1313 if(bld->type.floating) {
1314 #if 0
1315 /*
1316 * Power of two multiplication by directly manipulating the exponent.
1317 *
1318 * XXX: This might not always be faster; it will introduce a small error
1319 * for multiplication by zero, and it will produce wrong results
1320 * for Inf and NaN.
1321 */
1322 unsigned mantissa = lp_mantissa(bld->type);
1323 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1324 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1325 a = LLVMBuildAdd(builder, a, factor, "");
1326 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1327 return a;
1328 #endif
1329 }
1330 else {
1331 factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1332 return LLVMBuildShl(builder, a, factor, "");
1333 }
1334 }
1335
1336 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1337 return lp_build_mul(bld, a, factor);
1338 }
1339
1340
1341 /**
1342 * Generate a / b
1343 */
1344 LLVMValueRef
1345 lp_build_div(struct lp_build_context *bld,
1346 LLVMValueRef a,
1347 LLVMValueRef b)
1348 {
1349 LLVMBuilderRef builder = bld->gallivm->builder;
1350 const struct lp_type type = bld->type;
1351
1352 assert(lp_check_value(type, a));
1353 assert(lp_check_value(type, b));
1354
1355 if(a == bld->zero)
1356 return bld->zero;
1357 if(a == bld->one && type.floating)
1358 return lp_build_rcp(bld, b);
1359 if(b == bld->zero)
1360 return bld->undef;
1361 if(b == bld->one)
1362 return a;
1363 if(a == bld->undef || b == bld->undef)
1364 return bld->undef;
1365
1366 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1367 if (type.floating)
1368 return LLVMConstFDiv(a, b);
1369 else if (type.sign)
1370 return LLVMConstSDiv(a, b);
1371 else
1372 return LLVMConstUDiv(a, b);
1373 }
1374
1375 /* fast rcp is disabled (just uses div), so makes no sense to try that */
1376 if(FALSE &&
1377 ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1378 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
1379 type.floating)
1380 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1381
1382 if (type.floating)
1383 return LLVMBuildFDiv(builder, a, b, "");
1384 else if (type.sign)
1385 return LLVMBuildSDiv(builder, a, b, "");
1386 else
1387 return LLVMBuildUDiv(builder, a, b, "");
1388 }
1389
1390
1391 /**
1392 * Linear interpolation helper.
1393 *
1394 * @param flags if LP_BLD_LERP_WIDE_NORMALIZED is set, we are interpolating
1395 * normalized values, encoded in normalized integers twice as wide.
1396 *
1397 * @sa http://www.stereopsis.com/doubleblend.html
1398 */
1399 static inline LLVMValueRef
1400 lp_build_lerp_simple(struct lp_build_context *bld,
1401 LLVMValueRef x,
1402 LLVMValueRef v0,
1403 LLVMValueRef v1,
1404 unsigned flags)
1405 {
1406 unsigned half_width = bld->type.width/2;
1407 LLVMBuilderRef builder = bld->gallivm->builder;
1408 LLVMValueRef delta;
1409 LLVMValueRef res;
1410
1411 assert(lp_check_value(bld->type, x));
1412 assert(lp_check_value(bld->type, v0));
1413 assert(lp_check_value(bld->type, v1));
1414
1415 delta = lp_build_sub(bld, v1, v0);
1416
1417 if (bld->type.floating) {
1418 assert(flags == 0);
1419 return lp_build_mad(bld, x, delta, v0);
1420 }
1421
1422 if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1423 if (!bld->type.sign) {
1424 if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1425 /*
1426 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1427 * most significant bit to the least significant bit, so that
1428 * later we can just divide by 2**n instead of 2**n - 1.
1429 */
1430
1431 x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1432 }
1433
1434 /* (x * delta) >> n */
1435 res = lp_build_mul(bld, x, delta);
1436 res = lp_build_shr_imm(bld, res, half_width);
1437 } else {
1438 /*
1439 * The rescaling trick above doesn't work for signed numbers, so
1440 * use the 2**n - 1 division approximation in lp_build_mul_norm
1441 * instead.
1442 */
1443 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1444 res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1445 }
1446 } else {
1447 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1448 res = lp_build_mul(bld, x, delta);
1449 }
1450
1451 if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
1452 /*
1453 * At this point both res and v0 only use the lower half of the bits,
1454 * the rest is zero. Instead of add / mask, do add with half wide type.
1455 */
1456 struct lp_type narrow_type;
1457 struct lp_build_context narrow_bld;
1458
1459 memset(&narrow_type, 0, sizeof narrow_type);
1460 narrow_type.sign = bld->type.sign;
1461 narrow_type.width = bld->type.width/2;
1462 narrow_type.length = bld->type.length*2;
1463
1464 lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
1465 res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
1466 v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
1467 res = lp_build_add(&narrow_bld, v0, res);
1468 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
1469 } else {
1470 res = lp_build_add(bld, v0, res);
1471
1472 if (bld->type.fixed) {
1473 /*
1474 * We need to mask out the high-order bits when lerping 8-bit
1475 * normalized colors stored in 16 bits
1476 */
1477 /* XXX: This step is necessary for lerping 8bit colors stored on
1478 * 16 bits, but it will be wrong for true fixed-point use cases.
1479 * Basically we need a more powerful lp_type, capable of further
1480 * distinguishing the values interpretation from the value storage.
1481 */
1482 LLVMValueRef low_bits;
1483 low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
1484 res = LLVMBuildAnd(builder, res, low_bits, "");
1485 }
1486 }
1487
1488 return res;
1489 }
1490
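/*
 * Illustrative scalar sketch (not part of gallivm; names are hypothetical):
 * the unsigned LP_BLD_LERP_WIDE_NORMALIZED path above for 8-bit values
 * held in 16-bit lanes, including the MSB-to-LSB rescaling of the weight
 * and the wrap-around 16-bit arithmetic for the delta.
 */
static inline unsigned
example_lerp_norm_u8(unsigned x, unsigned v0, unsigned v1)
{
   /* scale x from [0, 255] to [0, 256] so the divide by 255 becomes >> 8 */
   x += x >> 7;
   /* delta and the product wrap modulo 2**16, like the wide lanes do */
   unsigned delta = (v1 - v0) & 0xffff;
   unsigned res = ((x * delta) & 0xffff) >> 8;
   /* the final add is done in the narrow 8-bit type */
   return (v0 + res) & 0xff;
}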
1491
1492 /**
1493 * Linear interpolation.
1494 */
1495 LLVMValueRef
1496 lp_build_lerp(struct lp_build_context *bld,
1497 LLVMValueRef x,
1498 LLVMValueRef v0,
1499 LLVMValueRef v1,
1500 unsigned flags)
1501 {
1502 const struct lp_type type = bld->type;
1503 LLVMValueRef res;
1504
1505 assert(lp_check_value(type, x));
1506 assert(lp_check_value(type, v0));
1507 assert(lp_check_value(type, v1));
1508
1509 assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1510
1511 if (type.norm) {
1512 struct lp_type wide_type;
1513 struct lp_build_context wide_bld;
1514 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1515
1516 assert(type.length >= 2);
1517
1518 /*
1519 * Create a wider integer type, enough to hold the
1520 * intermediate result of the multiplication.
1521 */
1522 memset(&wide_type, 0, sizeof wide_type);
1523 wide_type.sign = type.sign;
1524 wide_type.width = type.width*2;
1525 wide_type.length = type.length/2;
1526
1527 lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1528
1529 lp_build_unpack2_native(bld->gallivm, type, wide_type, x, &xl, &xh);
1530 lp_build_unpack2_native(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1531 lp_build_unpack2_native(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1532
1533 /*
1534 * Lerp both halves.
1535 */
1536
1537 flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1538
1539 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1540 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1541
1542 res = lp_build_pack2_native(bld->gallivm, wide_type, type, resl, resh);
1543 } else {
1544 res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1545 }
1546
1547 return res;
1548 }
1549
1550
1551 /**
1552 * Bilinear interpolation.
1553 *
1554 * Value indices are in v_{yx}.
1555 */
1556 LLVMValueRef
1557 lp_build_lerp_2d(struct lp_build_context *bld,
1558 LLVMValueRef x,
1559 LLVMValueRef y,
1560 LLVMValueRef v00,
1561 LLVMValueRef v01,
1562 LLVMValueRef v10,
1563 LLVMValueRef v11,
1564 unsigned flags)
1565 {
1566 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1567 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1568 return lp_build_lerp(bld, y, v0, v1, flags);
1569 }
1570
1571
1572 LLVMValueRef
1573 lp_build_lerp_3d(struct lp_build_context *bld,
1574 LLVMValueRef x,
1575 LLVMValueRef y,
1576 LLVMValueRef z,
1577 LLVMValueRef v000,
1578 LLVMValueRef v001,
1579 LLVMValueRef v010,
1580 LLVMValueRef v011,
1581 LLVMValueRef v100,
1582 LLVMValueRef v101,
1583 LLVMValueRef v110,
1584 LLVMValueRef v111,
1585 unsigned flags)
1586 {
1587 LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1588 LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1589 return lp_build_lerp(bld, z, v0, v1, flags);
1590 }
1591
1592
1593 /**
1594 * Generate min(a, b)
1595 * Do checks for special cases, but not for NaNs.
1596 */
1597 LLVMValueRef
1598 lp_build_min(struct lp_build_context *bld,
1599 LLVMValueRef a,
1600 LLVMValueRef b)
1601 {
1602 assert(lp_check_value(bld->type, a));
1603 assert(lp_check_value(bld->type, b));
1604
1605 if(a == bld->undef || b == bld->undef)
1606 return bld->undef;
1607
1608 if(a == b)
1609 return a;
1610
1611 if (bld->type.norm) {
1612 if (!bld->type.sign) {
1613 if (a == bld->zero || b == bld->zero) {
1614 return bld->zero;
1615 }
1616 }
1617 if(a == bld->one)
1618 return b;
1619 if(b == bld->one)
1620 return a;
1621 }
1622
1623 return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1624 }
1625
1626
1627 /**
1628 * Generate min(a, b)
1629 * NaNs are handled according to the behavior specified by the
1630 * nan_behavior argument.
1631 */
1632 LLVMValueRef
1633 lp_build_min_ext(struct lp_build_context *bld,
1634 LLVMValueRef a,
1635 LLVMValueRef b,
1636 enum gallivm_nan_behavior nan_behavior)
1637 {
1638 assert(lp_check_value(bld->type, a));
1639 assert(lp_check_value(bld->type, b));
1640
1641 if(a == bld->undef || b == bld->undef)
1642 return bld->undef;
1643
1644 if(a == b)
1645 return a;
1646
1647 if (bld->type.norm) {
1648 if (!bld->type.sign) {
1649 if (a == bld->zero || b == bld->zero) {
1650 return bld->zero;
1651 }
1652 }
1653 if(a == bld->one)
1654 return b;
1655 if(b == bld->one)
1656 return a;
1657 }
1658
1659 return lp_build_min_simple(bld, a, b, nan_behavior);
1660 }
1661
1662 /**
1663 * Generate max(a, b)
1664 * Do checks for special cases, but NaN behavior is undefined.
1665 */
1666 LLVMValueRef
1667 lp_build_max(struct lp_build_context *bld,
1668 LLVMValueRef a,
1669 LLVMValueRef b)
1670 {
1671 assert(lp_check_value(bld->type, a));
1672 assert(lp_check_value(bld->type, b));
1673
1674 if(a == bld->undef || b == bld->undef)
1675 return bld->undef;
1676
1677 if(a == b)
1678 return a;
1679
1680 if(bld->type.norm) {
1681 if(a == bld->one || b == bld->one)
1682 return bld->one;
1683 if (!bld->type.sign) {
1684 if (a == bld->zero) {
1685 return b;
1686 }
1687 if (b == bld->zero) {
1688 return a;
1689 }
1690 }
1691 }
1692
1693 return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1694 }
1695
1696
1697 /**
1698 * Generate max(a, b)
1699 * Checks for special cases.
1700 * NaNs are handled according to the behavior specified by the
1701 * nan_behavior argument.
1702 */
1703 LLVMValueRef
1704 lp_build_max_ext(struct lp_build_context *bld,
1705 LLVMValueRef a,
1706 LLVMValueRef b,
1707 enum gallivm_nan_behavior nan_behavior)
1708 {
1709 assert(lp_check_value(bld->type, a));
1710 assert(lp_check_value(bld->type, b));
1711
1712 if(a == bld->undef || b == bld->undef)
1713 return bld->undef;
1714
1715 if(a == b)
1716 return a;
1717
1718 if(bld->type.norm) {
1719 if(a == bld->one || b == bld->one)
1720 return bld->one;
1721 if (!bld->type.sign) {
1722 if (a == bld->zero) {
1723 return b;
1724 }
1725 if (b == bld->zero) {
1726 return a;
1727 }
1728 }
1729 }
1730
1731 return lp_build_max_simple(bld, a, b, nan_behavior);
1732 }
1733
1734 /**
1735 * Generate clamp(a, min, max)
1736 * NaN behavior (for any of a, min, max) is undefined.
1737 * Do checks for special cases.
1738 */
1739 LLVMValueRef
1740 lp_build_clamp(struct lp_build_context *bld,
1741 LLVMValueRef a,
1742 LLVMValueRef min,
1743 LLVMValueRef max)
1744 {
1745 assert(lp_check_value(bld->type, a));
1746 assert(lp_check_value(bld->type, min));
1747 assert(lp_check_value(bld->type, max));
1748
1749 a = lp_build_min(bld, a, max);
1750 a = lp_build_max(bld, a, min);
1751 return a;
1752 }
1753
1754
1755 /**
1756 * Generate clamp(a, 0, 1)
1757 * A NaN will get converted to zero.
1758 */
1759 LLVMValueRef
1760 lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
1761 LLVMValueRef a)
1762 {
1763 a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1764 a = lp_build_min(bld, a, bld->one);
1765 return a;
1766 }
1767
1768
1769 /**
1770 * Generate abs(a)
1771 */
1772 LLVMValueRef
1773 lp_build_abs(struct lp_build_context *bld,
1774 LLVMValueRef a)
1775 {
1776 LLVMBuilderRef builder = bld->gallivm->builder;
1777 const struct lp_type type = bld->type;
1778 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1779
1780 assert(lp_check_value(type, a));
1781
1782 if(!type.sign)
1783 return a;
1784
1785 if(type.floating) {
1786 if (0x0306 <= HAVE_LLVM && HAVE_LLVM < 0x0309) {
1787 /* Workaround llvm.org/PR27332 */
1788 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1789 unsigned long long absMask = ~(1ULL << (type.width - 1));
1790 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
1791 a = LLVMBuildBitCast(builder, a, int_vec_type, "");
1792 a = LLVMBuildAnd(builder, a, mask, "");
1793 a = LLVMBuildBitCast(builder, a, vec_type, "");
1794 return a;
1795 } else {
1796 char intrinsic[32];
1797 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
1798 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
1799 }
1800 }
1801
1802 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3 && HAVE_LLVM < 0x0600) {
1803 switch(type.width) {
1804 case 8:
1805 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1806 case 16:
1807 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1808 case 32:
1809 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1810 }
1811 }
1812 else if (type.width*type.length == 256 && util_cpu_caps.has_avx2 && HAVE_LLVM < 0x0600) {
1813 switch(type.width) {
1814 case 8:
1815 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a);
1816 case 16:
1817 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.w", vec_type, a);
1818 case 32:
1819 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.d", vec_type, a);
1820 }
1821 }
1822
1823 return lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero),
1824 a, LLVMBuildNeg(builder, a, ""));
1825 }
1826
1827
1828 LLVMValueRef
1829 lp_build_negate(struct lp_build_context *bld,
1830 LLVMValueRef a)
1831 {
1832 LLVMBuilderRef builder = bld->gallivm->builder;
1833
1834 assert(lp_check_value(bld->type, a));
1835
1836 if (bld->type.floating)
1837 a = LLVMBuildFNeg(builder, a, "");
1838 else
1839 a = LLVMBuildNeg(builder, a, "");
1840
1841 return a;
1842 }
1843
1844
1845 /** Return -1, 0 or +1 depending on the sign of a */
1846 LLVMValueRef
1847 lp_build_sgn(struct lp_build_context *bld,
1848 LLVMValueRef a)
1849 {
1850 LLVMBuilderRef builder = bld->gallivm->builder;
1851 const struct lp_type type = bld->type;
1852 LLVMValueRef cond;
1853 LLVMValueRef res;
1854
1855 assert(lp_check_value(type, a));
1856
1857 /* Handle non-zero case */
1858 if(!type.sign) {
1859 /* if not zero then sign must be positive */
1860 res = bld->one;
1861 }
1862 else if(type.floating) {
1863 LLVMTypeRef vec_type;
1864 LLVMTypeRef int_type;
1865 LLVMValueRef mask;
1866 LLVMValueRef sign;
1867 LLVMValueRef one;
1868 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1869
1870 int_type = lp_build_int_vec_type(bld->gallivm, type);
1871 vec_type = lp_build_vec_type(bld->gallivm, type);
1872 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1873
1874 /* OR the sign bit of 'a' into the bit pattern of the constant 1.0 */
1875 sign = LLVMBuildBitCast(builder, a, int_type, "");
1876 sign = LLVMBuildAnd(builder, sign, mask, "");
1877 one = LLVMConstBitCast(bld->one, int_type);
1878 res = LLVMBuildOr(builder, sign, one, "");
1879 res = LLVMBuildBitCast(builder, res, vec_type, "");
1880 }
1881 else
1882 {
1883 /* signed int/norm/fixed point */
1884 /* could use psign with sse3 and appropriate vectors here */
1885 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1886 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1887 res = lp_build_select(bld, cond, bld->one, minus_one);
1888 }
1889
1890 /* Handle zero */
1891 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1892 res = lp_build_select(bld, cond, bld->zero, res);
1893
1894 return res;
1895 }
1896
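/*
 * A minimal scalar sketch of the floating point path in lp_build_sgn above
 * (non-zero case only): OR the sign bit of 'a' into the bit pattern of the
 * constant 1.0. Illustrative only; assumes 32-bit IEEE-754 floats, and
 * sgn_nonzero is a hypothetical name.
 *
 *    float sgn_nonzero(float a)
 *    {
 *       uint32_t bits, one = 0x3f800000u;       // bit pattern of 1.0f
 *       memcpy(&bits, &a, sizeof bits);         // reinterpret float as int
 *       bits = (bits & 0x80000000u) | one;      // sign(a) | 1.0f
 *       float res;
 *       memcpy(&res, &bits, sizeof res);
 *       return res;                             // +1.0f or -1.0f
 *    }
 */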
1897
1898 /**
1899 * Set the sign of float vector 'a' according to 'sign'.
1900 * If sign==0, return abs(a).
1901 * If sign==1, return -abs(a);
1902 * Other values for sign produce undefined results.
1903 */
1904 LLVMValueRef
1905 lp_build_set_sign(struct lp_build_context *bld,
1906 LLVMValueRef a, LLVMValueRef sign)
1907 {
1908 LLVMBuilderRef builder = bld->gallivm->builder;
1909 const struct lp_type type = bld->type;
1910 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1911 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1912 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1913 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1914 ~((unsigned long long) 1 << (type.width - 1)));
1915 LLVMValueRef val, res;
1916
1917 assert(type.floating);
1918 assert(lp_check_value(type, a));
1919
1920 /* val = reinterpret_cast<int>(a) */
1921 val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1922 /* val = val & mask */
1923 val = LLVMBuildAnd(builder, val, mask, "");
1924 /* sign = sign << shift */
1925 sign = LLVMBuildShl(builder, sign, shift, "");
1926 /* res = val | sign */
1927 res = LLVMBuildOr(builder, val, sign, "");
1928 /* res = reinterpret_cast<float>(res) */
1929 res = LLVMBuildBitCast(builder, res, vec_type, "");
1930
1931 return res;
1932 }
1933
1934
1935 /**
1936 * Convert vector of (or scalar) int to vector of (or scalar) float.
1937 */
1938 LLVMValueRef
1939 lp_build_int_to_float(struct lp_build_context *bld,
1940 LLVMValueRef a)
1941 {
1942 LLVMBuilderRef builder = bld->gallivm->builder;
1943 const struct lp_type type = bld->type;
1944 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1945
1946 assert(type.floating);
1947
1948 return LLVMBuildSIToFP(builder, a, vec_type, "");
1949 }
1950
1951 static boolean
1952 arch_rounding_available(const struct lp_type type)
1953 {
1954 if ((util_cpu_caps.has_sse4_1 &&
1955 (type.length == 1 || type.width*type.length == 128)) ||
1956 (util_cpu_caps.has_avx && type.width*type.length == 256) ||
1957 (util_cpu_caps.has_avx512f && type.width*type.length == 512))
1958 return TRUE;
1959 else if ((util_cpu_caps.has_altivec &&
1960 (type.width == 32 && type.length == 4)))
1961 return TRUE;
1962
1963 return FALSE;
1964 }
1965
1966 enum lp_build_round_mode
1967 {
1968 LP_BUILD_ROUND_NEAREST = 0,
1969 LP_BUILD_ROUND_FLOOR = 1,
1970 LP_BUILD_ROUND_CEIL = 2,
1971 LP_BUILD_ROUND_TRUNCATE = 3
1972 };
1973
1974 static inline LLVMValueRef
1975 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1976 LLVMValueRef a)
1977 {
1978 LLVMBuilderRef builder = bld->gallivm->builder;
1979 const struct lp_type type = bld->type;
1980 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1981 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1982 const char *intrinsic;
1983 LLVMValueRef res;
1984
1985 assert(type.floating);
1986 /* using the double precision conversions is a bit more complicated */
1987 assert(type.width == 32);
1988
1989 assert(lp_check_value(type, a));
1990 assert(util_cpu_caps.has_sse2);
1991
1992 /* This is relying on MXCSR rounding mode, which should always be nearest. */
1993 if (type.length == 1) {
1994 LLVMTypeRef vec_type;
1995 LLVMValueRef undef;
1996 LLVMValueRef arg;
1997 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1998
1999 vec_type = LLVMVectorType(bld->elem_type, 4);
2000
2001 intrinsic = "llvm.x86.sse.cvtss2si";
2002
2003 undef = LLVMGetUndef(vec_type);
2004
2005 arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
2006
2007 res = lp_build_intrinsic_unary(builder, intrinsic,
2008 ret_type, arg);
2009 }
2010 else {
2011 if (type.width* type.length == 128) {
2012 intrinsic = "llvm.x86.sse2.cvtps2dq";
2013 }
2014 else {
2015 assert(type.width*type.length == 256);
2016 assert(util_cpu_caps.has_avx);
2017
2018 intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
2019 }
2020 res = lp_build_intrinsic_unary(builder, intrinsic,
2021 ret_type, a);
2022 }
2023
2024 return res;
2025 }
2026
2027
2028 /* Round a float (vector) with the given rounding mode using the
2029  * AltiVec vrfi{n,m,p,z} instructions. */
2030 static inline LLVMValueRef
2031 lp_build_round_altivec(struct lp_build_context *bld,
2032 LLVMValueRef a,
2033 enum lp_build_round_mode mode)
2034 {
2035 LLVMBuilderRef builder = bld->gallivm->builder;
2036 const struct lp_type type = bld->type;
2037 const char *intrinsic = NULL;
2038
2039 assert(type.floating);
2040
2041 assert(lp_check_value(type, a));
2042 assert(util_cpu_caps.has_altivec);
2043
2044 (void)type;
2045
2046 switch (mode) {
2047 case LP_BUILD_ROUND_NEAREST:
2048 intrinsic = "llvm.ppc.altivec.vrfin";
2049 break;
2050 case LP_BUILD_ROUND_FLOOR:
2051 intrinsic = "llvm.ppc.altivec.vrfim";
2052 break;
2053 case LP_BUILD_ROUND_CEIL:
2054 intrinsic = "llvm.ppc.altivec.vrfip";
2055 break;
2056 case LP_BUILD_ROUND_TRUNCATE:
2057 intrinsic = "llvm.ppc.altivec.vrfiz";
2058 break;
2059 }
2060
2061 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2062 }
2063
2064 static inline LLVMValueRef
2065 lp_build_round_arch(struct lp_build_context *bld,
2066 LLVMValueRef a,
2067 enum lp_build_round_mode mode)
2068 {
2069 if (util_cpu_caps.has_sse4_1) {
2070 LLVMBuilderRef builder = bld->gallivm->builder;
2071 const struct lp_type type = bld->type;
2072 const char *intrinsic_root;
2073 char intrinsic[32];
2074
2075 assert(type.floating);
2076 assert(lp_check_value(type, a));
2077 (void)type;
2078
2079 switch (mode) {
2080 case LP_BUILD_ROUND_NEAREST:
2081 intrinsic_root = "llvm.nearbyint";
2082 break;
2083 case LP_BUILD_ROUND_FLOOR:
2084 intrinsic_root = "llvm.floor";
2085 break;
2086 case LP_BUILD_ROUND_CEIL:
2087 intrinsic_root = "llvm.ceil";
2088 break;
2089 case LP_BUILD_ROUND_TRUNCATE:
2090 intrinsic_root = "llvm.trunc";
2091 break;
2092 }
2093
2094 lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type);
2095 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2096 }
2097 else /* (util_cpu_caps.has_altivec) */
2098 return lp_build_round_altivec(bld, a, mode);
2099 }
2100
2101 /**
2102 * Return the integer part of a float (vector) value (== round toward zero).
2103 * The returned value is a float (vector).
2104 * Ex: trunc(-1.5) = -1.0
2105 */
2106 LLVMValueRef
2107 lp_build_trunc(struct lp_build_context *bld,
2108 LLVMValueRef a)
2109 {
2110 LLVMBuilderRef builder = bld->gallivm->builder;
2111 const struct lp_type type = bld->type;
2112
2113 assert(type.floating);
2114 assert(lp_check_value(type, a));
2115
2116 if (arch_rounding_available(type)) {
2117 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
2118 }
2119 else {
2120 const struct lp_type type = bld->type;
2121 struct lp_type inttype;
2122 struct lp_build_context intbld;
2123 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2124 LLVMValueRef trunc, res, anosign, mask;
2125 LLVMTypeRef int_vec_type = bld->int_vec_type;
2126 LLVMTypeRef vec_type = bld->vec_type;
2127
2128 assert(type.width == 32); /* might want to handle doubles at some point */
2129
2130 inttype = type;
2131 inttype.floating = 0;
2132 lp_build_context_init(&intbld, bld->gallivm, inttype);
2133
2134 /* round by truncation */
2135 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2136 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2137
2138 /* mask out sign bit */
2139 anosign = lp_build_abs(bld, a);
2140 /*
2141 * mask out all values if anosign > 2^24
2142 * This should work both for large ints (all rounding is no-op for them
2143 * because such floats are always exact) as well as special cases like
2144 * NaNs, Infs (taking advantage of the fact they use max exponent).
2145 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2146 */
2147 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2148 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2149 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2150 return lp_build_select(bld, mask, a, res);
2151 }
2152 }
2153
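/*
 * A scalar sketch of the fallback path in lp_build_trunc above (illustrative
 * only; trunc_fallback is a hypothetical name and 32-bit floats are assumed):
 * truncate via an int round-trip, but keep the original value whenever
 * |a| exceeds 2^24 (already integral) or is NaN/Inf.
 *
 *    float trunc_fallback(float a)
 *    {
 *       uint32_t abs_bits;
 *       float abs_a = fabsf(a);
 *       memcpy(&abs_bits, &abs_a, sizeof abs_bits);
 *       if (abs_bits > 0x4b800000u)      // |a| > 2^24, or NaN/Inf
 *          return a;                     // already integral / special, pass through
 *       return (float)(int)a;            // round toward zero
 *    }
 */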
2154
2155 /**
2156 * Return float (vector) rounded to nearest integer (vector). The returned
2157 * value is a float (vector).
2158 * Ex: round(0.9) = 1.0
2159 * Ex: round(-1.5) = -2.0
2160 */
2161 LLVMValueRef
2162 lp_build_round(struct lp_build_context *bld,
2163 LLVMValueRef a)
2164 {
2165 LLVMBuilderRef builder = bld->gallivm->builder;
2166 const struct lp_type type = bld->type;
2167
2168 assert(type.floating);
2169 assert(lp_check_value(type, a));
2170
2171 if (arch_rounding_available(type)) {
2172 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2173 }
2174 else {
2175 const struct lp_type type = bld->type;
2176 struct lp_type inttype;
2177 struct lp_build_context intbld;
2178 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2179 LLVMValueRef res, anosign, mask;
2180 LLVMTypeRef int_vec_type = bld->int_vec_type;
2181 LLVMTypeRef vec_type = bld->vec_type;
2182
2183 assert(type.width == 32); /* might want to handle doubles at some point */
2184
2185 inttype = type;
2186 inttype.floating = 0;
2187 lp_build_context_init(&intbld, bld->gallivm, inttype);
2188
2189 res = lp_build_iround(bld, a);
2190 res = LLVMBuildSIToFP(builder, res, vec_type, "");
2191
2192 /* mask out sign bit */
2193 anosign = lp_build_abs(bld, a);
2194 /*
2195 * mask out all values if anosign > 2^24
2196 * This should work both for large ints (all rounding is no-op for them
2197 * because such floats are always exact) as well as special cases like
2198 * NaNs, Infs (taking advantage of the fact they use max exponent).
2199 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2200 */
2201 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2202 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2203 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2204 return lp_build_select(bld, mask, a, res);
2205 }
2206 }
2207
2208
2209 /**
2210 * Return floor of float (vector), result is a float (vector)
2211 * Ex: floor(1.1) = 1.0
2212 * Ex: floor(-1.1) = -2.0
2213 */
2214 LLVMValueRef
2215 lp_build_floor(struct lp_build_context *bld,
2216 LLVMValueRef a)
2217 {
2218 LLVMBuilderRef builder = bld->gallivm->builder;
2219 const struct lp_type type = bld->type;
2220
2221 assert(type.floating);
2222 assert(lp_check_value(type, a));
2223
2224 if (arch_rounding_available(type)) {
2225 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2226 }
2227 else {
2228 const struct lp_type type = bld->type;
2229 struct lp_type inttype;
2230 struct lp_build_context intbld;
2231 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2232 LLVMValueRef trunc, res, anosign, mask;
2233 LLVMTypeRef int_vec_type = bld->int_vec_type;
2234 LLVMTypeRef vec_type = bld->vec_type;
2235
2236 if (type.width != 32) {
2237 char intrinsic[32];
2238 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type);
2239 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2240 }
2241
2242 assert(type.width == 32); /* might want to handle doubles at some point */
2243
2244 inttype = type;
2245 inttype.floating = 0;
2246 lp_build_context_init(&intbld, bld->gallivm, inttype);
2247
2248 /* round by truncation */
2249 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2250 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2251
2252 if (type.sign) {
2253 LLVMValueRef tmp;
2254
2255 /*
2256 * fix values if rounding is wrong (for non-special cases)
2257 * - this is the case if trunc > a
2258 */
2259 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
2260 /* tmp = trunc > a ? 1.0 : 0.0 */
2261 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2262 tmp = lp_build_and(&intbld, mask, tmp);
2263 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2264 res = lp_build_sub(bld, res, tmp);
2265 }
2266
2267 /* mask out sign bit */
2268 anosign = lp_build_abs(bld, a);
2269 /*
2270 * mask out all values if anosign > 2^24
2271 * This should work both for large ints (all rounding is no-op for them
2272 * because such floats are always exact) as well as special cases like
2273 * NaNs, Infs (taking advantage of the fact they use max exponent).
2274 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2275 */
2276 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2277 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2278 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2279 return lp_build_select(bld, mask, a, res);
2280 }
2281 }
2282
2283
2284 /**
2285 * Return ceiling of float (vector), returning float (vector).
2286 * Ex: ceil( 1.1) = 2.0
2287 * Ex: ceil(-1.1) = -1.0
2288 */
2289 LLVMValueRef
2290 lp_build_ceil(struct lp_build_context *bld,
2291 LLVMValueRef a)
2292 {
2293 LLVMBuilderRef builder = bld->gallivm->builder;
2294 const struct lp_type type = bld->type;
2295
2296 assert(type.floating);
2297 assert(lp_check_value(type, a));
2298
2299 if (arch_rounding_available(type)) {
2300 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2301 }
2302 else {
2303 const struct lp_type type = bld->type;
2304 struct lp_type inttype;
2305 struct lp_build_context intbld;
2306 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2307 LLVMValueRef trunc, res, anosign, mask, tmp;
2308 LLVMTypeRef int_vec_type = bld->int_vec_type;
2309 LLVMTypeRef vec_type = bld->vec_type;
2310
2311 if (type.width != 32) {
2312 char intrinsic[32];
2313 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type);
2314 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2315 }
2316
2317 assert(type.width == 32); /* might want to handle doubles at some point */
2318
2319 inttype = type;
2320 inttype.floating = 0;
2321 lp_build_context_init(&intbld, bld->gallivm, inttype);
2322
2323 /* round by truncation */
2324 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2325 trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2326
2327 /*
2328 * fix values if rounding is wrong (for non-special cases)
2329 * - this is the case if trunc < a
2330 */
2331 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2332 /* tmp = trunc < a ? 1.0 : 0.0 */
2333 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2334 tmp = lp_build_and(&intbld, mask, tmp);
2335 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2336 res = lp_build_add(bld, trunc, tmp);
2337
2338 /* mask out sign bit */
2339 anosign = lp_build_abs(bld, a);
2340 /*
2341 * mask out all values if anosign > 2^24
2342 * This should work both for large ints (all rounding is no-op for them
2343 * because such floats are always exact) as well as special cases like
2344 * NaNs, Infs (taking advantage of the fact they use max exponent).
2345 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2346 */
2347 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2348 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2349 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2350 return lp_build_select(bld, mask, a, res);
2351 }
2352 }
2353
2354
2355 /**
2356 * Return fractional part of 'a' computed as a - floor(a)
2357 * Typically used in texture coord arithmetic.
2358 */
2359 LLVMValueRef
2360 lp_build_fract(struct lp_build_context *bld,
2361 LLVMValueRef a)
2362 {
2363 assert(bld->type.floating);
2364 return lp_build_sub(bld, a, lp_build_floor(bld, a));
2365 }
2366
2367
2368 /**
2369 * Prevent returning 1.0 for very small negative values of 'a' by clamping
2370 * against 0.99999(9). (Will also return that value for NaNs.)
2371 */
2372 static inline LLVMValueRef
2373 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2374 {
2375 LLVMValueRef max;
2376
2377 /* this is the largest number smaller than 1.0 representable as float */
2378 max = lp_build_const_vec(bld->gallivm, bld->type,
2379 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2380 return lp_build_min_ext(bld, fract, max,
2381 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
2382 }
2383
2384
2385 /**
2386 * Same as lp_build_fract, but guarantees that the result is always smaller
2387 * than one. Will also return the smaller-than-one value for infs, NaNs.
2388 */
2389 LLVMValueRef
2390 lp_build_fract_safe(struct lp_build_context *bld,
2391 LLVMValueRef a)
2392 {
2393 return clamp_fract(bld, lp_build_fract(bld, a));
2394 }
2395
2396
2397 /**
2398 * Return the integer part of a float (vector) value (== round toward zero).
2399 * The returned value is an integer (vector).
2400 * Ex: itrunc(-1.5) = -1
2401 */
2402 LLVMValueRef
2403 lp_build_itrunc(struct lp_build_context *bld,
2404 LLVMValueRef a)
2405 {
2406 LLVMBuilderRef builder = bld->gallivm->builder;
2407 const struct lp_type type = bld->type;
2408 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2409
2410 assert(type.floating);
2411 assert(lp_check_value(type, a));
2412
2413 return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2414 }
2415
2416
2417 /**
2418 * Return float (vector) rounded to nearest integer (vector). The returned
2419 * value is an integer (vector).
2420 * Ex: iround(0.9) = 1
2421 * Ex: iround(-1.5) = -2
2422 */
2423 LLVMValueRef
2424 lp_build_iround(struct lp_build_context *bld,
2425 LLVMValueRef a)
2426 {
2427 LLVMBuilderRef builder = bld->gallivm->builder;
2428 const struct lp_type type = bld->type;
2429 LLVMTypeRef int_vec_type = bld->int_vec_type;
2430 LLVMValueRef res;
2431
2432 assert(type.floating);
2433
2434 assert(lp_check_value(type, a));
2435
2436 if ((util_cpu_caps.has_sse2 &&
2437 ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2438 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2439 return lp_build_iround_nearest_sse2(bld, a);
2440 }
2441 if (arch_rounding_available(type)) {
2442 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2443 }
2444 else {
2445 LLVMValueRef half;
2446
2447 half = lp_build_const_vec(bld->gallivm, type, 0.5);
2448
2449 if (type.sign) {
2450 LLVMTypeRef vec_type = bld->vec_type;
2451 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2452 (unsigned long long)1 << (type.width - 1));
2453 LLVMValueRef sign;
2454
2455 /* get sign bit */
2456 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2457 sign = LLVMBuildAnd(builder, sign, mask, "");
2458
2459 /* sign * 0.5 */
2460 half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2461 half = LLVMBuildOr(builder, sign, half, "");
2462 half = LLVMBuildBitCast(builder, half, vec_type, "");
2463 }
2464
2465 res = LLVMBuildFAdd(builder, a, half, "");
2466 }
2467
2468 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2469
2470 return res;
2471 }
2472
2473
2474 /**
2475 * Return floor of float (vector), result is an int (vector)
2476 * Ex: ifloor(1.1) = 1
2477 * Ex: ifloor(-1.1) = -2
2478 */
2479 LLVMValueRef
2480 lp_build_ifloor(struct lp_build_context *bld,
2481 LLVMValueRef a)
2482 {
2483 LLVMBuilderRef builder = bld->gallivm->builder;
2484 const struct lp_type type = bld->type;
2485 LLVMTypeRef int_vec_type = bld->int_vec_type;
2486 LLVMValueRef res;
2487
2488 assert(type.floating);
2489 assert(lp_check_value(type, a));
2490
2491 res = a;
2492 if (type.sign) {
2493 if (arch_rounding_available(type)) {
2494 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2495 }
2496 else {
2497 struct lp_type inttype;
2498 struct lp_build_context intbld;
2499 LLVMValueRef trunc, itrunc, mask;
2500
2501 assert(type.floating);
2502 assert(lp_check_value(type, a));
2503
2504 inttype = type;
2505 inttype.floating = 0;
2506 lp_build_context_init(&intbld, bld->gallivm, inttype);
2507
2508 /* round by truncation */
2509 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2510 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2511
2512 /*
2513 * fix values if rounding is wrong (for non-special cases)
2514 * - this is the case if trunc > a
2515 * The results of doing this with NaNs, very large values etc.
2516 * are undefined but this seems to be the case anyway.
2517 */
2518 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2519 /* cheapie minus one with mask since the mask is minus one / zero */
2520 return lp_build_add(&intbld, itrunc, mask);
2521 }
2522 }
2523
2524 /* round to nearest (toward zero) */
2525 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2526
2527 return res;
2528 }
2529
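/*
 * A scalar sketch of the non-SSE4.1 signed path in lp_build_ifloor above
 * (illustrative only; ifloor_fallback is a hypothetical name, and values
 * outside the int range or NaN are undefined just as in the vector code):
 *
 *    int ifloor_fallback(float a)
 *    {
 *       int itrunc = (int)a;                      // round toward zero
 *       // truncation rounded up for negative non-integers: fix by
 *       // subtracting one (the vector code adds the all-ones == -1 mask)
 *       return ((float)itrunc > a) ? itrunc - 1 : itrunc;
 *    }
 */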
2530
2531 /**
2532 * Return ceiling of float (vector), returning int (vector).
2533 * Ex: iceil( 1.1) = 2
2534 * Ex: iceil(-1.1) = -1
2535 */
2536 LLVMValueRef
2537 lp_build_iceil(struct lp_build_context *bld,
2538 LLVMValueRef a)
2539 {
2540 LLVMBuilderRef builder = bld->gallivm->builder;
2541 const struct lp_type type = bld->type;
2542 LLVMTypeRef int_vec_type = bld->int_vec_type;
2543 LLVMValueRef res;
2544
2545 assert(type.floating);
2546 assert(lp_check_value(type, a));
2547
2548 if (arch_rounding_available(type)) {
2549 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2550 }
2551 else {
2552 struct lp_type inttype;
2553 struct lp_build_context intbld;
2554 LLVMValueRef trunc, itrunc, mask;
2555
2556 assert(type.floating);
2557 assert(lp_check_value(type, a));
2558
2559 inttype = type;
2560 inttype.floating = 0;
2561 lp_build_context_init(&intbld, bld->gallivm, inttype);
2562
2563 /* round by truncation */
2564 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2565 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2566
2567 /*
2568 * fix values if rounding is wrong (for non-special cases)
2569 * - this is the case if trunc < a
2570 * The results of doing this with NaNs, very large values etc.
2571 * are undefined but this seems to be the case anyway.
2572 */
2573 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2574 /* cheapie plus one with mask since the mask is minus one / zero */
2575 return lp_build_sub(&intbld, itrunc, mask);
2576 }
2577
2578 /* round to nearest (toward zero) */
2579 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2580
2581 return res;
2582 }
2583
2584
2585 /**
2586 * Combined ifloor() & fract().
2587 *
2588 * Preferred to calling the functions separately, as it will ensure that the
2589 * strategy (floor() vs ifloor()) that results in less redundant work is used.
2590 */
2591 void
2592 lp_build_ifloor_fract(struct lp_build_context *bld,
2593 LLVMValueRef a,
2594 LLVMValueRef *out_ipart,
2595 LLVMValueRef *out_fpart)
2596 {
2597 LLVMBuilderRef builder = bld->gallivm->builder;
2598 const struct lp_type type = bld->type;
2599 LLVMValueRef ipart;
2600
2601 assert(type.floating);
2602 assert(lp_check_value(type, a));
2603
2604 if (arch_rounding_available(type)) {
2605 /*
2606 * floor() is easier.
2607 */
2608
2609 ipart = lp_build_floor(bld, a);
2610 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2611 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2612 }
2613 else {
2614 /*
2615 * ifloor() is easier.
2616 */
2617
2618 *out_ipart = lp_build_ifloor(bld, a);
2619 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2620 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2621 }
2622 }
2623
2624
2625 /**
2626 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2627 * always smaller than one.
2628 */
2629 void
2630 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2631 LLVMValueRef a,
2632 LLVMValueRef *out_ipart,
2633 LLVMValueRef *out_fpart)
2634 {
2635 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2636 *out_fpart = clamp_fract(bld, *out_fpart);
2637 }
2638
2639
2640 LLVMValueRef
2641 lp_build_sqrt(struct lp_build_context *bld,
2642 LLVMValueRef a)
2643 {
2644 LLVMBuilderRef builder = bld->gallivm->builder;
2645 const struct lp_type type = bld->type;
2646 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2647 char intrinsic[32];
2648
2649 assert(lp_check_value(type, a));
2650
2651 assert(type.floating);
2652 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);
2653
2654 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2655 }
2656
2657
2658 /**
2659 * Do one Newton-Raphson step to improve reciprocal precision:
2660 *
2661 * x_{i+1} = x_i * (2 - a * x_i)
2662 *
2663 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2664 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
2665 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
2666 * halo. It would be necessary to clamp the argument to prevent this.
2667 *
2668 * See also:
2669 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2670 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2671 */
2672 static inline LLVMValueRef
2673 lp_build_rcp_refine(struct lp_build_context *bld,
2674 LLVMValueRef a,
2675 LLVMValueRef rcp_a)
2676 {
2677 LLVMBuilderRef builder = bld->gallivm->builder;
2678 LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
2679 LLVMValueRef res;
2680
2681 res = LLVMBuildFMul(builder, a, rcp_a, "");
2682 res = LLVMBuildFSub(builder, two, res, "");
2683 res = LLVMBuildFMul(builder, rcp_a, res, "");
2684
2685 return res;
2686 }
2687
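/*
 * Scalar equivalent of the refinement step above (illustrative only;
 * rcp_refine_scalar is a hypothetical name). If rcp_a = (1/a)*(1 + e),
 * one step yields (1/a)*(1 - e^2), so the relative error is roughly
 * squared per iteration.
 *
 *    float rcp_refine_scalar(float a, float rcp_a)
 *    {
 *       return rcp_a * (2.0f - a * rcp_a);   // one Newton-Raphson step
 *    }
 */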
2688
2689 LLVMValueRef
2690 lp_build_rcp(struct lp_build_context *bld,
2691 LLVMValueRef a)
2692 {
2693 LLVMBuilderRef builder = bld->gallivm->builder;
2694 const struct lp_type type = bld->type;
2695
2696 assert(lp_check_value(type, a));
2697
2698 if(a == bld->zero)
2699 return bld->undef;
2700 if(a == bld->one)
2701 return bld->one;
2702 if(a == bld->undef)
2703 return bld->undef;
2704
2705 assert(type.floating);
2706
2707 if(LLVMIsConstant(a))
2708 return LLVMConstFDiv(bld->one, a);
2709
2710 /*
2711 * We don't use RCPPS because:
2712 * - it only has 10 bits of precision
2713 * - it doesn't even get the reciprocal of 1.0 exactly
2714 * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2715 * - for recent processors the benefit over DIVPS is marginal, and case
2716 * dependent
2717 *
2718 * We could still use it on certain processors if benchmarks show that the
2719 * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
2720 * particular uses that require fewer workarounds.
2721 */
2722
2723 if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2724 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2725 const unsigned num_iterations = 0;
2726 LLVMValueRef res;
2727 unsigned i;
2728 const char *intrinsic = NULL;
2729
2730 if (type.length == 4) {
2731 intrinsic = "llvm.x86.sse.rcp.ps";
2732 }
2733 else {
2734 intrinsic = "llvm.x86.avx.rcp.ps.256";
2735 }
2736
2737 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2738
2739 for (i = 0; i < num_iterations; ++i) {
2740 res = lp_build_rcp_refine(bld, a, res);
2741 }
2742
2743 return res;
2744 }
2745
2746 return LLVMBuildFDiv(builder, bld->one, a, "");
2747 }
2748
2749
2750 /**
2751 * Do one Newton-Raphson step to improve rsqrt precision:
2752 *
2753 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2754 *
2755 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2756 */
2757 static inline LLVMValueRef
2758 lp_build_rsqrt_refine(struct lp_build_context *bld,
2759 LLVMValueRef a,
2760 LLVMValueRef rsqrt_a)
2761 {
2762 LLVMBuilderRef builder = bld->gallivm->builder;
2763 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2764 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2765 LLVMValueRef res;
2766
2767 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2768 res = LLVMBuildFMul(builder, a, res, "");
2769 res = LLVMBuildFSub(builder, three, res, "");
2770 res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2771 res = LLVMBuildFMul(builder, half, res, "");
2772
2773 return res;
2774 }
2775
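/*
 * Scalar equivalent of the refinement step above (illustrative only;
 * rsqrt_refine_scalar is a hypothetical name):
 *
 *    float rsqrt_refine_scalar(float a, float rsqrt_a)
 *    {
 *       return 0.5f * rsqrt_a * (3.0f - a * rsqrt_a * rsqrt_a);
 *    }
 */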
2776
2777 /**
2778 * Generate 1/sqrt(a).
2779 * Result is undefined for values < 0, infinity for +0.
2780 */
2781 LLVMValueRef
2782 lp_build_rsqrt(struct lp_build_context *bld,
2783 LLVMValueRef a)
2784 {
2785 const struct lp_type type = bld->type;
2786
2787 assert(lp_check_value(type, a));
2788
2789 assert(type.floating);
2790
2791 /*
2792 * This should be faster but all denormals will end up as infinity.
2793 */
2794 if (0 && lp_build_fast_rsqrt_available(type)) {
2795 const unsigned num_iterations = 1;
2796 LLVMValueRef res;
2797 unsigned i;
2798
2799 /* rsqrt(1.0) != 1.0 here */
2800 res = lp_build_fast_rsqrt(bld, a);
2801
2802 if (num_iterations) {
2803 /*
2804 * Newton-Raphson will result in NaN instead of infinity for zero,
2805 * and NaN instead of zero for infinity.
2806 * Also, need to ensure rsqrt(1.0) == 1.0.
2807 * All numbers smaller than FLT_MIN will result in +infinity
2808 * (rsqrtps treats all denormals as zero).
2809 */
2810 LLVMValueRef cmp;
2811 LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2812 LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);
2813
2814 for (i = 0; i < num_iterations; ++i) {
2815 res = lp_build_rsqrt_refine(bld, a, res);
2816 }
2817 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2818 res = lp_build_select(bld, cmp, inf, res);
2819 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2820 res = lp_build_select(bld, cmp, bld->zero, res);
2821 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2822 res = lp_build_select(bld, cmp, bld->one, res);
2823 }
2824
2825 return res;
2826 }
2827
2828 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2829 }
2830
2831 /**
2832 * Report whether a fast (but inaccurate) rsqrt instruction is available.
2833 * Callers may want to avoid rsqrt_fast when it is not available: e.g. for
2834 * x^0.5 it can be worthwhile to compute rsqrt_fast(x) * x, but when the
2835 * instruction is missing that expands to sqrt/div/mul, so it is clearly
2836 * better to just call sqrt and skip both the div and the mul.
2837 */
2838 boolean
2839 lp_build_fast_rsqrt_available(struct lp_type type)
2840 {
2841 assert(type.floating);
2842
2843 if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2844 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2845 return true;
2846 }
2847 return false;
2848 }
2849
2850
2851 /**
2852 * Generate 1/sqrt(a).
2853 * Result is undefined for values < 0, infinity for +0.
2854 * Precision is limited, only ~10 bits guaranteed
2855 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2856 */
2857 LLVMValueRef
2858 lp_build_fast_rsqrt(struct lp_build_context *bld,
2859 LLVMValueRef a)
2860 {
2861 LLVMBuilderRef builder = bld->gallivm->builder;
2862 const struct lp_type type = bld->type;
2863
2864 assert(lp_check_value(type, a));
2865
2866 if (lp_build_fast_rsqrt_available(type)) {
2867 const char *intrinsic = NULL;
2868
2869 if (type.length == 4) {
2870 intrinsic = "llvm.x86.sse.rsqrt.ps";
2871 }
2872 else {
2873 intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2874 }
2875 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2876 }
2877 else {
2878 debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2879 }
2880 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2881 }
2882
2883
2884 /**
2885 * Generate sin(a) or cos(a) using polynomial approximation.
2886 * TODO: it might be worth recognizing sin and cos with the same source
2887 * (i.e. the d3d10 sincos opcode), since doing both at once would be far
2888 * cheaper than calculating (nearly) everything twice.
2889 * It is not clear that this is common enough to be worth bothering with;
2890 * the scs opcode could also benefit from calculating both, though.
2891 */
2892 static LLVMValueRef
2893 lp_build_sin_or_cos(struct lp_build_context *bld,
2894 LLVMValueRef a,
2895 boolean cos)
2896 {
2897 struct gallivm_state *gallivm = bld->gallivm;
2898 LLVMBuilderRef b = gallivm->builder;
2899 struct lp_type int_type = lp_int_type(bld->type);
2900
2901 /*
2902 * take the absolute value,
2903 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2904 */
2905
2906 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2907 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2908
2909 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2910 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2911
2912 /*
2913 * scale by 4/Pi
2914 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2915 */
2916
2917 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2918 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2919
2920 /*
2921 * store the integer part of y in mm0
2922 * emm2 = _mm_cvttps_epi32(y);
2923 */
2924
2925 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2926
2927 /*
2928 * j=(j+1) & (~1) (see the cephes sources)
2929 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2930 */
2931
2932 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2933 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2934 /*
2935 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2936 */
2937 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2938 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2939
2940 /*
2941 * y = _mm_cvtepi32_ps(emm2);
2942 */
2943 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2944
2945 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2946 LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2947 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2948 LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2949
2950 /*
2951 * Argument used for poly selection and sign bit determination
2952 * is different for sin vs. cos.
2953 */
2954 LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
2955 emm2_and;
2956
2957 LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
2958 LLVMBuildNot(b, emm2_2, ""), ""),
2959 const_29, "sign_bit") :
2960 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
2961 LLVMBuildShl(b, emm2_add,
2962 const_29, ""), ""),
2963 sign_mask, "sign_bit");
2964
2965 /*
2966 * get the polynomial selection mask:
2967 * there is one polynomial for 0 <= x <= Pi/4
2968 * and another one for Pi/4 < x <= Pi/2.
2969 * Both branches will be computed.
2970 *
2971 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2972 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2973 */
2974
2975 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
2976 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2977 int_type, PIPE_FUNC_EQUAL,
2978 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2979
2980 /*
2981 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2982 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2983 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2984 */
2985 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2986 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2987 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2988
2989 /*
2990 * The magic pass: "Extended precision modular arithmetic"
2991 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2992 */
2993 LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
2994 LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
2995 LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);
2996
2997 /*
2998 * Evaluate the first polynomial (0 <= x <= Pi/4)
2999 *
3000 * z = _mm_mul_ps(x,x);
3001 */
3002 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
3003
3004 /*
3005 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
3006 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
3007 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
3008 */
3009 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
3010 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
3011 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
3012
3013 /*
3014 * y = *(v4sf*)_ps_coscof_p0;
3015 * y = _mm_mul_ps(y, z);
3016 */
3017 LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
3018 LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
3019 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
3020 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
3021
3022
3023 /*
3024 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
3025 * y = _mm_sub_ps(y, tmp);
3026 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
3027 */
3028 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
3029 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
3030 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
3031 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
3032 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
3033
3034 /*
3035 * _PS_CONST(sincof_p0, -1.9515295891E-4);
3036 * _PS_CONST(sincof_p1, 8.3321608736E-3);
3037 * _PS_CONST(sincof_p2, -1.6666654611E-1);
3038 */
3039 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
3040 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
3041 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
3042
3043 /*
3044 * Evaluate the second polynomial (Pi/4 < x <= Pi/2)
3045 *
3046 * y2 = *(v4sf*)_ps_sincof_p0;
3047 * y2 = _mm_mul_ps(y2, z);
3048 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
3049 * y2 = _mm_mul_ps(y2, z);
3050 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
3051 * y2 = _mm_mul_ps(y2, z);
3052 * y2 = _mm_mul_ps(y2, x);
3053 * y2 = _mm_add_ps(y2, x);
3054 */
3055
3056 LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
3057 LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
3058 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
3059 LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);
3060
3061 /*
3062 * select the correct result from the two polynomials
3063 * xmm3 = poly_mask;
3064 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
3065 * y = _mm_andnot_ps(xmm3, y);
3066 * y = _mm_or_ps(y,y2);
3067 */
3068 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
3069 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
3070 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
3071 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
3072 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
3073 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
3074
3075 /*
3076 * update the sign
3077 * y = _mm_xor_ps(y, sign_bit);
3078 */
3079 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
3080 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
3081
3082 LLVMValueRef isfinite = lp_build_isfinite(bld, a);
3083
3084 /* clamp output to be within [-1, 1] */
3085 y_result = lp_build_clamp(bld, y_result,
3086 lp_build_const_vec(bld->gallivm, bld->type, -1.f),
3087 lp_build_const_vec(bld->gallivm, bld->type, 1.f));
3088 /* If a is -inf, inf or NaN then return NaN */
3089 y_result = lp_build_select(bld, isfinite, y_result,
3090 lp_build_const_vec(bld->gallivm, bld->type, NAN));
3091 return y_result;
3092 }
3093
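/*
 * A scalar sketch of the argument reduction performed above (illustrative
 * only; sincos_reduce is a hypothetical name): map |a| to an even octant
 * index j and a reduced argument x in roughly [-Pi/4, Pi/4], subtracting
 * j*Pi/4 in three pieces (DP1+DP2+DP3 == Pi/4) for extra precision.
 *
 *    float sincos_reduce(float a, int *octant)
 *    {
 *       float xa = fabsf(a);
 *       int j = (int)(xa * 1.27323954473516f);   // scale by 4/Pi
 *       j = (j + 1) & ~1;                        // round j up to even
 *       *octant = j;
 *       float y = (float)j;
 *       return ((xa - y * 0.78515625f)
 *                  - y * 2.4187564849853515625e-4f)
 *                  - y * 3.77489497744594108e-8f;
 *    }
 *
 * The polynomial selection and the sign handling then follow as in the
 * code above.
 */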
3094
3095 /**
3096 * Generate sin(a)
3097 */
3098 LLVMValueRef
3099 lp_build_sin(struct lp_build_context *bld,
3100 LLVMValueRef a)
3101 {
3102 return lp_build_sin_or_cos(bld, a, FALSE);
3103 }
3104
3105
3106 /**
3107 * Generate cos(a)
3108 */
3109 LLVMValueRef
3110 lp_build_cos(struct lp_build_context *bld,
3111 LLVMValueRef a)
3112 {
3113 return lp_build_sin_or_cos(bld, a, TRUE);
3114 }
3115
3116
3117 /**
3118 * Generate pow(x, y)
3119 */
3120 LLVMValueRef
3121 lp_build_pow(struct lp_build_context *bld,
3122 LLVMValueRef x,
3123 LLVMValueRef y)
3124 {
3125 /* TODO: optimize the constant case */
3126 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3127 LLVMIsConstant(x) && LLVMIsConstant(y)) {
3128 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3129 __FUNCTION__);
3130 }
3131
3132 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
3133 }
3134
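/*
 * The identity used above, in scalar form (illustrative only; pow_sketch is
 * a hypothetical name, and as with the code above x <= 0 and other edge
 * cases are not handled):
 *
 *    float pow_sketch(float x, float y)
 *    {
 *       return exp2f(y * log2f(x));      // x^y == 2^(y * log2(x))
 *    }
 */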
3135
3136 /**
3137 * Generate exp(x)
3138 */
3139 LLVMValueRef
3140 lp_build_exp(struct lp_build_context *bld,
3141 LLVMValueRef x)
3142 {
3143 /* log2(e) = 1/log(2) */
3144 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
3145 1.4426950408889634);
3146
3147 assert(lp_check_value(bld->type, x));
3148
3149 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
3150 }
3151
3152
3153 /**
3154 * Generate log(x)
3155 * Behavior is undefined with infs, 0s and nans
3156 */
3157 LLVMValueRef
3158 lp_build_log(struct lp_build_context *bld,
3159 LLVMValueRef x)
3160 {
3161 /* log(2) */
3162 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3163 0.69314718055994529);
3164
3165 assert(lp_check_value(bld->type, x));
3166
3167 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
3168 }
3169
3170 /**
3171 * Generate log(x) that handles edge cases (infs, 0s and nans)
3172 */
3173 LLVMValueRef
3174 lp_build_log_safe(struct lp_build_context *bld,
3175 LLVMValueRef x)
3176 {
3177 /* log(2) */
3178 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3179 0.69314718055994529);
3180
3181 assert(lp_check_value(bld->type, x));
3182
3183 return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
3184 }
3185
3186
3187 /**
3188 * Generate polynomial.
3189 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
3190 */
3191 LLVMValueRef
3192 lp_build_polynomial(struct lp_build_context *bld,
3193 LLVMValueRef x,
3194 const double *coeffs,
3195 unsigned num_coeffs)
3196 {
3197 const struct lp_type type = bld->type;
3198 LLVMValueRef even = NULL, odd = NULL;
3199 LLVMValueRef x2;
3200 unsigned i;
3201
3202 assert(lp_check_value(bld->type, x));
3203
3204 /* TODO: optimize the constant case */
3205 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3206 LLVMIsConstant(x)) {
3207 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3208 __FUNCTION__);
3209 }
3210
3211 /*
3212 * Calculate odd and even terms separately to decrease data dependency
3213 * Ex:
3214 * c[0] + x^2 * c[2] + x^4 * c[4] ...
3215 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
3216 */
3217 x2 = lp_build_mul(bld, x, x);
3218
3219 for (i = num_coeffs; i--; ) {
3220 LLVMValueRef coeff;
3221
3222 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
3223
3224 if (i % 2 == 0) {
3225 if (even)
3226 even = lp_build_mad(bld, x2, even, coeff);
3227 else
3228 even = coeff;
3229 } else {
3230 if (odd)
3231 odd = lp_build_mad(bld, x2, odd, coeff);
3232 else
3233 odd = coeff;
3234 }
3235 }
3236
3237 if (odd)
3238 return lp_build_mad(bld, odd, x, even);
3239 else if (even)
3240 return even;
3241 else
3242 return bld->undef;
3243 }
3244
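/*
 * A scalar sketch of the evaluation scheme above (illustrative only;
 * poly_eval is a hypothetical name, and num_coeffs is assumed > 0):
 * Horner's rule is applied to the even and odd coefficients independently
 * in x^2, halving the length of the dependency chain, and the two halves
 * are combined at the end.
 *
 *    float poly_eval(const double *coeffs, unsigned num_coeffs, float x)
 *    {
 *       float x2 = x * x, even = 0.0f, odd = 0.0f;
 *       for (int i = (int)num_coeffs - 1; i >= 0; i--) {
 *          if (i % 2 == 0)
 *             even = even * x2 + (float)coeffs[i];
 *          else
 *             odd = odd * x2 + (float)coeffs[i];
 *       }
 *       return odd * x + even;
 *    }
 */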
3245
3246 /**
3247 * Minimax polynomial fit of 2**x, in range [0, 1[
3248 */
3249 const double lp_build_exp2_polynomial[] = {
3250 #if EXP_POLY_DEGREE == 5
3251 1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
3252 0.693153073200168932794,
3253 0.240153617044375388211,
3254 0.0558263180532956664775,
3255 0.00898934009049466391101,
3256 0.00187757667519147912699
3257 #elif EXP_POLY_DEGREE == 4
3258 1.00000259337069434683,
3259 0.693003834469974940458,
3260 0.24144275689150793076,
3261 0.0520114606103070150235,
3262 0.0135341679161270268764
3263 #elif EXP_POLY_DEGREE == 3
3264 0.999925218562710312959,
3265 0.695833540494823811697,
3266 0.226067155427249155588,
3267 0.0780245226406372992967
3268 #elif EXP_POLY_DEGREE == 2
3269 1.00172476321474503578,
3270 0.657636275736077639316,
3271 0.33718943461968720704
3272 #else
3273 #error
3274 #endif
3275 };
3276
3277
3278 LLVMValueRef
3279 lp_build_exp2(struct lp_build_context *bld,
3280 LLVMValueRef x)
3281 {
3282 LLVMBuilderRef builder = bld->gallivm->builder;
3283 const struct lp_type type = bld->type;
3284 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3285 LLVMValueRef ipart = NULL;
3286 LLVMValueRef fpart = NULL;
3287 LLVMValueRef expipart = NULL;
3288 LLVMValueRef expfpart = NULL;
3289 LLVMValueRef res = NULL;
3290
3291 assert(lp_check_value(bld->type, x));
3292
3293 /* TODO: optimize the constant case */
3294 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3295 LLVMIsConstant(x)) {
3296 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3297 __FUNCTION__);
3298 }
3299
3300 assert(type.floating && type.width == 32);
3301
3302 /* We want to preserve NaN, and make sure that for exp2 the result is INF
3303 * if x > 128 and 0 if x is smaller than -126.9 */
3304 x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
3305 GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3306 x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
3307 x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3308
3309 /* ipart = floor(x) */
3310 /* fpart = x - ipart */
3311 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3312
3313 /* expipart = (float) (1 << ipart) */
3314 expipart = LLVMBuildAdd(builder, ipart,
3315 lp_build_const_int_vec(bld->gallivm, type, 127), "");
3316 expipart = LLVMBuildShl(builder, expipart,
3317 lp_build_const_int_vec(bld->gallivm, type, 23), "");
3318 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
3319
3320 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3321 ARRAY_SIZE(lp_build_exp2_polynomial));
3322
3323 res = LLVMBuildFMul(builder, expipart, expfpart, "");
3324
3325 return res;
3326 }
3327
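/*
 * A scalar sketch of the split used above (illustrative only; exp2_sketch
 * is a hypothetical name, and x is assumed to be already clamped to roughly
 * [-127, 128] as the code above does): 2^ipart is built directly in the
 * IEEE-754 exponent field, and 2^fpart is approximated by a polynomial.
 *
 *    float exp2_sketch(float x)
 *    {
 *       int   ipart = (int)floorf(x);
 *       float fpart = x - (float)ipart;                 // in [0, 1)
 *       uint32_t bits = (uint32_t)(ipart + 127) << 23;  // 2^ipart
 *       float expipart;
 *       memcpy(&expipart, &bits, sizeof expipart);
 *       // the real code evaluates lp_build_exp2_polynomial on fpart;
 *       // exp2f merely stands in for that here
 *       return expipart * exp2f(fpart);
 *    }
 */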
3328
3329
3330 /**
3331 * Extract the exponent of an IEEE-754 floating point value.
3332 *
3333 * Optionally apply an integer bias.
3334 *
3335 * Result is an integer value with
3336 *
3337 * ifloor(log2(x)) + bias
3338 */
3339 LLVMValueRef
3340 lp_build_extract_exponent(struct lp_build_context *bld,
3341 LLVMValueRef x,
3342 int bias)
3343 {
3344 LLVMBuilderRef builder = bld->gallivm->builder;
3345 const struct lp_type type = bld->type;
3346 unsigned mantissa = lp_mantissa(type);
3347 LLVMValueRef res;
3348
3349 assert(type.floating);
3350
3351 assert(lp_check_value(bld->type, x));
3352
3353 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3354
3355 res = LLVMBuildLShr(builder, x,
3356 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3357 res = LLVMBuildAnd(builder, res,
3358 lp_build_const_int_vec(bld->gallivm, type, 255), "");
3359 res = LLVMBuildSub(builder, res,
3360 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3361
3362 return res;
3363 }
3364
3365
3366 /**
3367 * Extract the mantissa of a floating point value.
3368 *
3369 * Result is a floating point value with
3370 *
3371 * x / 2**floor(log2(x))
3372 */
3373 LLVMValueRef
3374 lp_build_extract_mantissa(struct lp_build_context *bld,
3375 LLVMValueRef x)
3376 {
3377 LLVMBuilderRef builder = bld->gallivm->builder;
3378 const struct lp_type type = bld->type;
3379 unsigned mantissa = lp_mantissa(type);
3380 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3381 (1ULL << mantissa) - 1);
3382 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3383 LLVMValueRef res;
3384
3385 assert(lp_check_value(bld->type, x));
3386
3387 assert(type.floating);
3388
3389 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3390
3391 /* res = x / 2**ipart */
3392 res = LLVMBuildAnd(builder, x, mantmask, "");
3393 res = LLVMBuildOr(builder, res, one, "");
3394 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3395
3396 return res;
3397 }
3398
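/*
 * Scalar equivalents of the two helpers above (illustrative only; the names
 * are hypothetical, and finite, normalized x > 0 is assumed):
 *
 *    int extract_exponent_scalar(float x, int bias)
 *    {
 *       uint32_t bits;
 *       memcpy(&bits, &x, sizeof bits);
 *       return (int)((bits >> 23) & 255) - 127 + bias;
 *    }
 *
 *    float extract_mantissa_scalar(float x)
 *    {
 *       uint32_t bits;
 *       memcpy(&bits, &x, sizeof bits);
 *       bits = (bits & 0x007fffffu) | 0x3f800000u;   // force exponent to 0
 *       float m;
 *       memcpy(&m, &bits, sizeof m);
 *       return m;                                    // in [1, 2)
 *    }
 */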
3399
3400
3401 /**
3402 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in the range [0, 1/9[
3403 * These coefficients can be generated with
3404 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3405 */
3406 const double lp_build_log2_polynomial[] = {
3407 #if LOG_POLY_DEGREE == 5
3408 2.88539008148777786488L,
3409 0.961796878841293367824L,
3410 0.577058946784739859012L,
3411 0.412914355135828735411L,
3412 0.308591899232910175289L,
3413 0.352376952300281371868L,
3414 #elif LOG_POLY_DEGREE == 4
3415 2.88539009343309178325L,
3416 0.961791550404184197881L,
3417 0.577440339438736392009L,
3418 0.403343858251329912514L,
3419 0.406718052498846252698L,
3420 #elif LOG_POLY_DEGREE == 3
3421 2.88538959748872753838L,
3422 0.961932915889597772928L,
3423 0.571118517972136195241L,
3424 0.493997535084709500285L,
3425 #else
3426 #error
3427 #endif
3428 };
3429
3430 /**
3431 * See http://www.devmaster.net/forums/showthread.php?p=43580
3432 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3433 * http://www.nezumi.demon.co.uk/consult/logx.htm
3434 *
3435 * If handle_edge_cases is true the function will perform computations
3436 * to match the required D3D10+ behavior for each of the edge cases.
3437 * That means that if input is:
3438 * - less than zero (to and including -inf) then NaN will be returned
3439 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3440 * - +infinity, then +infinity will be returned
3441 * - NaN, then NaN will be returned
3442 *
3443 * Those checks are fairly expensive so if you don't need them make sure
3444 * handle_edge_cases is false.
3445 */
3446 void
3447 lp_build_log2_approx(struct lp_build_context *bld,
3448 LLVMValueRef x,
3449 LLVMValueRef *p_exp,
3450 LLVMValueRef *p_floor_log2,
3451 LLVMValueRef *p_log2,
3452 boolean handle_edge_cases)
3453 {
3454 LLVMBuilderRef builder = bld->gallivm->builder;
3455 const struct lp_type type = bld->type;
3456 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3457 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3458
3459 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3460 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3461 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3462
3463 LLVMValueRef i = NULL;
3464 LLVMValueRef y = NULL;
3465 LLVMValueRef z = NULL;
3466 LLVMValueRef exp = NULL;
3467 LLVMValueRef mant = NULL;
3468 LLVMValueRef logexp = NULL;
3469 LLVMValueRef p_z = NULL;
3470 LLVMValueRef res = NULL;
3471
3472 assert(lp_check_value(bld->type, x));
3473
3474 if(p_exp || p_floor_log2 || p_log2) {
3475 /* TODO: optimize the constant case */
3476 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3477 LLVMIsConstant(x)) {
3478 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3479 __FUNCTION__);
3480 }
3481
3482 assert(type.floating && type.width == 32);
3483
3484 /*
3485 * We don't explicitly handle denormalized numbers. They will yield a
3486 * result in the neighbourhood of -127, which appears to be adequate.
3488 */
3489
3490 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3491
3492 /* exp = (float) exponent(x) */
3493 exp = LLVMBuildAnd(builder, i, expmask, "");
3494 }
3495
3496 if(p_floor_log2 || p_log2) {
3497 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3498 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3499 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3500 }
3501
3502 if (p_log2) {
3503 /* mant = 1 + (float) mantissa(x) */
3504 mant = LLVMBuildAnd(builder, i, mantmask, "");
3505 mant = LLVMBuildOr(builder, mant, one, "");
3506 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3507
3508 /* y = (mant - 1) / (mant + 1) */
3509 y = lp_build_div(bld,
3510 lp_build_sub(bld, mant, bld->one),
3511 lp_build_add(bld, mant, bld->one)
3512 );
3513
3514 /* z = y^2 */
3515 z = lp_build_mul(bld, y, y);
3516
3517 /* compute P(z) */
3518 p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3519 ARRAY_SIZE(lp_build_log2_polynomial));
3520
3521 /* y * P(z) + logexp */
3522 res = lp_build_mad(bld, y, p_z, logexp);
3523
3524 if (type.floating && handle_edge_cases) {
3525 LLVMValueRef negmask, infmask, zmask;
3526 negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3527 lp_build_const_vec(bld->gallivm, type, 0.0f));
3528 zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3529 lp_build_const_vec(bld->gallivm, type, 0.0f));
3530 infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3531 lp_build_const_vec(bld->gallivm, type, INFINITY));
3532
3533 /* If x is equal to inf, make sure we return inf */
3534 res = lp_build_select(bld, infmask,
3535 lp_build_const_vec(bld->gallivm, type, INFINITY),
3536 res);
3537 /* If x is equal to 0, return -inf */
3538 res = lp_build_select(bld, zmask,
3539 lp_build_const_vec(bld->gallivm, type, -INFINITY),
3540 res);
3541 /* If x is nan or less than 0, return nan */
3542 res = lp_build_select(bld, negmask,
3543 lp_build_const_vec(bld->gallivm, type, NAN),
3544 res);
3545 }
3546 }
3547
3548 if (p_exp) {
3549 exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3550 *p_exp = exp;
3551 }
3552
3553 if (p_floor_log2)
3554 *p_floor_log2 = logexp;
3555
3556 if (p_log2)
3557 *p_log2 = res;
3558 }
3559
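/*
 * A scalar sketch of the p_log2 path above (illustrative only; log2_sketch
 * is a hypothetical name, finite x > 0 is assumed, and edge cases are not
 * handled). With m in [1, 2) and y = (m - 1)/(m + 1), log2(m) equals
 * (2/ln 2) * atanh(y); the first terms of that series stand in here for
 * the minimax fit lp_build_log2_polynomial used by the real code.
 *
 *    float log2_sketch(float x)
 *    {
 *       uint32_t bits;
 *       memcpy(&bits, &x, sizeof bits);
 *       float e = (float)((int)((bits >> 23) & 255) - 127);  // exponent
 *       uint32_t mbits = (bits & 0x007fffffu) | 0x3f800000u;
 *       float m;
 *       memcpy(&m, &mbits, sizeof m);                        // mantissa in [1, 2)
 *       float y = (m - 1.0f) / (m + 1.0f);
 *       float z = y * y;
 *       float p = 2.885390082f + z * (0.961796694f + z * 0.577078016f);
 *       return y * p + e;                                    // y*P(z) + logexp
 *    }
 */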
3560
3561 /*
3562 * log2 implementation which doesn't have special code to
3563 * handle edge cases (-inf, 0, inf, NaN). It's faster but
3564 * the results for those cases are undefined.
3565 */
3566 LLVMValueRef
3567 lp_build_log2(struct lp_build_context *bld,
3568 LLVMValueRef x)
3569 {
3570 LLVMValueRef res;
3571 lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
3572 return res;
3573 }
3574
3575 /*
3576 * Version of log2 which handles all edge cases.
3577 * Look at documentation of lp_build_log2_approx for
3578 * description of the behavior for each of the edge cases.
3579 */
3580 LLVMValueRef
3581 lp_build_log2_safe(struct lp_build_context *bld,
3582 LLVMValueRef x)
3583 {
3584 LLVMValueRef res;
3585 lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
3586 return res;
3587 }
3588
3589
3590 /**
3591 * Faster (and less accurate) log2.
3592 *
3593 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3594 *
3595 * Piece-wise linear approximation, with exact results when x is a
3596 * power of two.
3597 *
3598 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3599 */
3600 LLVMValueRef
3601 lp_build_fast_log2(struct lp_build_context *bld,
3602 LLVMValueRef x)
3603 {
3604 LLVMBuilderRef builder = bld->gallivm->builder;
3605 LLVMValueRef ipart;
3606 LLVMValueRef fpart;
3607
3608 assert(lp_check_value(bld->type, x));
3609
3610 assert(bld->type.floating);
3611
3612 /* ipart = floor(log2(x)) - 1 */
3613 ipart = lp_build_extract_exponent(bld, x, -1);
3614 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3615
3616 /* fpart = x / 2**ipart */
3617 fpart = lp_build_extract_mantissa(bld, x);
3618
3619 /* ipart + fpart */
3620 return LLVMBuildFAdd(builder, ipart, fpart, "");
3621 }
3622
3623
3624 /**
3625 * Fast implementation of iround(log2(x)).
3626 *
3627 * Not an approximation -- it should give accurate results all the time.
3628 */
3629 LLVMValueRef
3630 lp_build_ilog2(struct lp_build_context *bld,
3631 LLVMValueRef x)
3632 {
3633 LLVMBuilderRef builder = bld->gallivm->builder;
3634 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3635 LLVMValueRef ipart;
3636
3637 assert(bld->type.floating);
3638
3639 assert(lp_check_value(bld->type, x));
3640
3641 /* x * 2^0.5, i.e., add 0.5 to log2(x) */
3642 x = LLVMBuildFMul(builder, x, sqrt2, "");
3643
3644 /* ipart = floor(log2(x) + 0.5) */
3645 ipart = lp_build_extract_exponent(bld, x, 0);
3646
3647 return ipart;
3648 }
3649
3650 LLVMValueRef
3651 lp_build_mod(struct lp_build_context *bld,
3652 LLVMValueRef x,
3653 LLVMValueRef y)
3654 {
3655 LLVMBuilderRef builder = bld->gallivm->builder;
3656 LLVMValueRef res;
3657 const struct lp_type type = bld->type;
3658
3659 assert(lp_check_value(type, x));
3660 assert(lp_check_value(type, y));
3661
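/*
 * Note: like C's %, LLVM's srem/frem return a remainder whose sign follows
 * the dividend (e.g. srem(-7, 3) == -1, frem(-7.5, 2.0) == -1.5), while
 * urem is a plain unsigned remainder.
 */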
3662 if (type.floating)
3663 res = LLVMBuildFRem(builder, x, y, "");
3664 else if (type.sign)
3665 res = LLVMBuildSRem(builder, x, y, "");
3666 else
3667 res = LLVMBuildURem(builder, x, y, "");
3668 return res;
3669 }
3670
3671
3672 /*
3673 * For floating-point inputs, returns a mask which is all 1's
3674 * for the channels of x which are NaN, and all 0's for the
3675 * channels which are not NaN.
3676 */
3677 LLVMValueRef
3678 lp_build_isnan(struct lp_build_context *bld,
3679 LLVMValueRef x)
3680 {
3681 LLVMValueRef mask;
3682 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3683
3684 assert(bld->type.floating);
3685 assert(lp_check_value(bld->type, x));
3686
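/* NaN is the only value which does not compare equal to itself, so the
 * ordered equality of x with itself is false exactly in the NaN channels;
 * negating and sign-extending that yields the all-1's mask. */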
3687 mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3688 "isnotnan");
3689 mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3690 mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3691 return mask;
3692 }
3693
3694 /* Returns all 1's for channels which are finite floating
3695 * point numbers and all 0's for channels which are -inf,
3696 * +inf or NaN. */
3697 LLVMValueRef
3698 lp_build_isfinite(struct lp_build_context *bld,
3699 LLVMValueRef x)
3700 {
3701 LLVMBuilderRef builder = bld->gallivm->builder;
3702 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3703 struct lp_type int_type = lp_int_type(bld->type);
3704 LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3705 LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3706 0x7f800000);
3707
3708 if (!bld->type.floating) {
3709 return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3710 }
3711 assert(bld->type.floating);
3712 assert(lp_check_value(bld->type, x));
3713 assert(bld->type.width == 32);
3714
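/* A 32-bit float is finite iff its exponent field is not all 1's:
 * e.g. 1.0f = 0x3f800000 masks to 0x3f800000 != 0x7f800000 (finite),
 * while +inf = 0x7f800000 and NaN = 0x7fc00000 both mask to 0x7f800000. */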
3715 intx = LLVMBuildAnd(builder, intx, infornan32, "");
3716 return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3717 intx, infornan32);
3718 }
3719
3720 /*
3721 * Returns a mask which is all 1's for channels which are NaN or +/-inf,
3722 * and all 0's otherwise. The input has to be a floating point vector.
3723 */
3724 LLVMValueRef
3725 lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
3726 const struct lp_type type,
3727 LLVMValueRef x)
3728 {
3729 LLVMBuilderRef builder = gallivm->builder;
3730 struct lp_type int_type = lp_int_type(type);
3731 LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
3732 0x7f800000);
3733 LLVMValueRef ret;
3734
3735 assert(type.floating);
3736
3737 ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
3738 ret = LLVMBuildAnd(builder, ret, const0, "");
3739 ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
3740 ret, const0);
3741
3742 return ret;
3743 }
3744
3745
3746 LLVMValueRef
3747 lp_build_fpstate_get(struct gallivm_state *gallivm)
3748 {
3749 if (util_cpu_caps.has_sse) {
3750 LLVMBuilderRef builder = gallivm->builder;
3751 LLVMValueRef mxcsr_ptr = lp_build_alloca(
3752 gallivm,
3753 LLVMInt32TypeInContext(gallivm->context),
3754 "mxcsr_ptr");
3755 LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
3756 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
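/* llvm.x86.sse.stmxcsr takes an i8 pointer and stores the 32-bit MXCSR
 * register to it, hence the i32 stack slot and the cast to i8*. */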
3757 lp_build_intrinsic(builder,
3758 "llvm.x86.sse.stmxcsr",
3759 LLVMVoidTypeInContext(gallivm->context),
3760 &mxcsr_ptr8, 1, 0);
3761 return mxcsr_ptr;
3762 }
3763 return 0;
3764 }
3765
3766 void
3767 lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
3768 boolean zero)
3769 {
3770 if (util_cpu_caps.has_sse) {
3771 /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
3772 int daz_ftz = _MM_FLUSH_ZERO_MASK;
3773
3774 LLVMBuilderRef builder = gallivm->builder;
3775 LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
3776 LLVMValueRef mxcsr =
3777 LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");
3778
3779 if (util_cpu_caps.has_daz) {
3780 /* Enable the denormals-are-zero mode */
3781 daz_ftz |= _MM_DENORMALS_ZERO_MASK;
3782 }
3783 if (zero) {
3784 mxcsr = LLVMBuildOr(builder, mxcsr,
3785 LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
3786 } else {
3787 mxcsr = LLVMBuildAnd(builder, mxcsr,
3788 LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
3789 }
3790
3791 LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
3792 lp_build_fpstate_set(gallivm, mxcsr_ptr);
3793 }
3794 }
3795
3796 void
3797 lp_build_fpstate_set(struct gallivm_state *gallivm,
3798 LLVMValueRef mxcsr_ptr)
3799 {
3800 if (util_cpu_caps.has_sse) {
3801 LLVMBuilderRef builder = gallivm->builder;
3802 mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
3803 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3804 lp_build_intrinsic(builder,
3805 "llvm.x86.sse.ldmxcsr",
3806 LLVMVoidTypeInContext(gallivm->context),
3807 &mxcsr_ptr, 1, 0);
3808 }
3809 }
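
/*
 * Typical usage sketch (illustrative): save the current MXCSR, force
 * flush-to-zero/denormals-are-zero while emitting code that does not
 * need denormals, then restore the previous state:
 *
 *    LLVMValueRef fpstate = lp_build_fpstate_get(gallivm);
 *    lp_build_fpstate_set_denorms_zero(gallivm, TRUE);
 *    ... emit the denormal-sensitive code ...
 *    lp_build_fpstate_set(gallivm, fpstate);
 */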