gallivm: Use llvm.fmuladd.*.
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
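/*
 * Typical usage (an illustrative sketch, not part of the original file):
 * callers initialize an lp_build_context for the lp_type they operate on and
 * then use the lp_build_*() helpers below instead of raw LLVMBuild* calls.
 * The values `gallivm', `a', `b' and `c' are assumed to be provided by the
 * caller.
 *
 *    struct lp_type type;
 *    struct lp_build_context bld;
 *
 *    memset(&type, 0, sizeof type);
 *    type.floating = TRUE;
 *    type.width = 32;
 *    type.length = 4;
 *    lp_build_context_init(&bld, gallivm, type);
 *
 *    LLVMValueRef ab  = lp_build_mul(&bld, a, b);                  // a * b
 *    LLVMValueRef mad = lp_build_mad(&bld, a, b, c);               // a * b + c
 *    LLVMValueRef sat = lp_build_clamp(&bld, mad, bld.zero, bld.one);
 */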
47
48 #include <float.h>
49
50 #include "util/u_memory.h"
51 #include "util/u_debug.h"
52 #include "util/u_math.h"
53 #include "util/u_cpu_detect.h"
54
55 #include "lp_bld_type.h"
56 #include "lp_bld_const.h"
57 #include "lp_bld_init.h"
58 #include "lp_bld_intr.h"
59 #include "lp_bld_logic.h"
60 #include "lp_bld_pack.h"
61 #include "lp_bld_debug.h"
62 #include "lp_bld_bitarit.h"
63 #include "lp_bld_arit.h"
64 #include "lp_bld_flow.h"
65
66 #if defined(PIPE_ARCH_SSE)
67 #include <xmmintrin.h>
68 #endif
69
70 #ifndef _MM_DENORMALS_ZERO_MASK
71 #define _MM_DENORMALS_ZERO_MASK 0x0040
72 #endif
73
74 #ifndef _MM_FLUSH_ZERO_MASK
75 #define _MM_FLUSH_ZERO_MASK 0x8000
76 #endif
77
78 #define EXP_POLY_DEGREE 5
79
80 #define LOG_POLY_DEGREE 4
81
82
83 /**
84 * Generate min(a, b)
85 * No checks for special case values of a or b (such as 0 or 1) are done.
86 * NaNs are handled according to the behavior specified by the
87 * nan_behavior argument.
88 */
89 static LLVMValueRef
90 lp_build_min_simple(struct lp_build_context *bld,
91 LLVMValueRef a,
92 LLVMValueRef b,
93 enum gallivm_nan_behavior nan_behavior)
94 {
95 const struct lp_type type = bld->type;
96 const char *intrinsic = NULL;
97 unsigned intr_size = 0;
98 LLVMValueRef cond;
99
100 assert(lp_check_value(type, a));
101 assert(lp_check_value(type, b));
102
103 /* TODO: optimize the constant case */
104
105 if (type.floating && util_cpu_caps.has_sse) {
106 if (type.width == 32) {
107 if (type.length == 1) {
108 intrinsic = "llvm.x86.sse.min.ss";
109 intr_size = 128;
110 }
111 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
112 intrinsic = "llvm.x86.sse.min.ps";
113 intr_size = 128;
114 }
115 else {
116 intrinsic = "llvm.x86.avx.min.ps.256";
117 intr_size = 256;
118 }
119 }
120 if (type.width == 64 && util_cpu_caps.has_sse2) {
121 if (type.length == 1) {
122 intrinsic = "llvm.x86.sse2.min.sd";
123 intr_size = 128;
124 }
125 else if (type.length == 2 || !util_cpu_caps.has_avx) {
126 intrinsic = "llvm.x86.sse2.min.pd";
127 intr_size = 128;
128 }
129 else {
130 intrinsic = "llvm.x86.avx.min.pd.256";
131 intr_size = 256;
132 }
133 }
134 }
135 else if (type.floating && util_cpu_caps.has_altivec) {
136 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
137 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
138 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
139 __FUNCTION__);
140 }
141 if (type.width == 32 && type.length == 4) {
142 intrinsic = "llvm.ppc.altivec.vminfp";
143 intr_size = 128;
144 }
145 } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
146 intr_size = 128;
147 if ((type.width == 8 || type.width == 16) &&
148 (type.width * type.length <= 64) &&
149 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
150 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
151 __FUNCTION__);
152 }
153 if (type.width == 8 && !type.sign) {
154 intrinsic = "llvm.x86.sse2.pminu.b";
155 }
156 else if (type.width == 16 && type.sign) {
157 intrinsic = "llvm.x86.sse2.pmins.w";
158 }
159 if (util_cpu_caps.has_sse4_1) {
160 if (type.width == 8 && type.sign) {
161 intrinsic = "llvm.x86.sse41.pminsb";
162 }
163 if (type.width == 16 && !type.sign) {
164 intrinsic = "llvm.x86.sse41.pminuw";
165 }
166 if (type.width == 32 && !type.sign) {
167 intrinsic = "llvm.x86.sse41.pminud";
168 }
169 if (type.width == 32 && type.sign) {
170 intrinsic = "llvm.x86.sse41.pminsd";
171 }
172 }
173 } else if (util_cpu_caps.has_altivec) {
174 intr_size = 128;
175 if (type.width == 8) {
176 if (!type.sign) {
177 intrinsic = "llvm.ppc.altivec.vminub";
178 } else {
179 intrinsic = "llvm.ppc.altivec.vminsb";
180 }
181 } else if (type.width == 16) {
182 if (!type.sign) {
183 intrinsic = "llvm.ppc.altivec.vminuh";
184 } else {
185 intrinsic = "llvm.ppc.altivec.vminsh";
186 }
187 } else if (type.width == 32) {
188 if (!type.sign) {
189 intrinsic = "llvm.ppc.altivec.vminuw";
190 } else {
191 intrinsic = "llvm.ppc.altivec.vminsw";
192 }
193 }
194 }
195
196 if (intrinsic) {
197 /* We need to handle NaNs for floating point numbers. If one of the
198 * inputs is NaN the other should be returned (required by both D3D10+
199 * and OpenCL).
200 * The SSE intrinsics return the second operand in case of NaN by
201 * default, so we need special code to handle those.
202 */
203 if (util_cpu_caps.has_sse && type.floating &&
204 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
205 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
206 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
207 LLVMValueRef isnan, min;
208 min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
209 type,
210 intr_size, a, b);
211 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
212 isnan = lp_build_isnan(bld, b);
213 return lp_build_select(bld, isnan, a, min);
214 } else {
215 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
216 isnan = lp_build_isnan(bld, a);
217 return lp_build_select(bld, isnan, a, min);
218 }
219 } else {
220 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
221 type,
222 intr_size, a, b);
223 }
224 }
225
226 if (type.floating) {
227 switch (nan_behavior) {
228 case GALLIVM_NAN_RETURN_NAN: {
229 LLVMValueRef isnan = lp_build_isnan(bld, b);
230 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
231 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
232 return lp_build_select(bld, cond, a, b);
233 }
234 break;
235 case GALLIVM_NAN_RETURN_OTHER: {
236 LLVMValueRef isnan = lp_build_isnan(bld, a);
237 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
238 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
239 return lp_build_select(bld, cond, a, b);
240 }
241 break;
242 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
243 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
244 return lp_build_select(bld, cond, a, b);
245 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
246 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
247 return lp_build_select(bld, cond, b, a);
248 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
249 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
250 return lp_build_select(bld, cond, a, b);
251 break;
252 default:
253 assert(0);
254 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
255 return lp_build_select(bld, cond, a, b);
256 }
257 } else {
258 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
259 return lp_build_select(bld, cond, a, b);
260 }
261 }
262
263
264 LLVMValueRef
265 lp_build_fmuladd(LLVMBuilderRef builder,
266 LLVMValueRef a,
267 LLVMValueRef b,
268 LLVMValueRef c)
269 {
270 LLVMTypeRef type = LLVMTypeOf(a);
271 assert(type == LLVMTypeOf(b));
272 assert(type == LLVMTypeOf(c));
273 char intrinsic[32];
274 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
275 LLVMValueRef args[] = { a, b, c };
276 return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0);
277 }
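
/*
 * Example (an illustrative sketch, not part of the original file): how a
 * caller might use lp_build_fmuladd() to accumulate a 4-component dot
 * product.  The `coef' and `src' arrays are hypothetical values assumed to
 * be provided by the caller and to have matching float (vector) types.
 */
#if 0
static LLVMValueRef
example_dot4(struct lp_build_context *bld,
             LLVMValueRef coef[4],
             LLVMValueRef src[4])
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef sum;
   unsigned i;

   /* first term as a plain multiply ... */
   sum = LLVMBuildFMul(builder, coef[0], src[0], "");
   /* ... remaining terms emitted as llvm.fmuladd.* so the backend may fuse them */
   for (i = 1; i < 4; i++) {
      sum = lp_build_fmuladd(builder, coef[i], src[i], sum);
   }
   return sum;
}
#endif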
278
279
280 /**
281 * Generate max(a, b)
282 * No checks for special case values of a or b (such as 0 or 1) are done.
283 * NaNs are handled according to the behavior specified by the
284 * nan_behavior argument.
285 */
286 static LLVMValueRef
287 lp_build_max_simple(struct lp_build_context *bld,
288 LLVMValueRef a,
289 LLVMValueRef b,
290 enum gallivm_nan_behavior nan_behavior)
291 {
292 const struct lp_type type = bld->type;
293 const char *intrinsic = NULL;
294 unsigned intr_size = 0;
295 LLVMValueRef cond;
296
297 assert(lp_check_value(type, a));
298 assert(lp_check_value(type, b));
299
300 /* TODO: optimize the constant case */
301
302 if (type.floating && util_cpu_caps.has_sse) {
303 if (type.width == 32) {
304 if (type.length == 1) {
305 intrinsic = "llvm.x86.sse.max.ss";
306 intr_size = 128;
307 }
308 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
309 intrinsic = "llvm.x86.sse.max.ps";
310 intr_size = 128;
311 }
312 else {
313 intrinsic = "llvm.x86.avx.max.ps.256";
314 intr_size = 256;
315 }
316 }
317 if (type.width == 64 && util_cpu_caps.has_sse2) {
318 if (type.length == 1) {
319 intrinsic = "llvm.x86.sse2.max.sd";
320 intr_size = 128;
321 }
322 else if (type.length == 2 || !util_cpu_caps.has_avx) {
323 intrinsic = "llvm.x86.sse2.max.pd";
324 intr_size = 128;
325 }
326 else {
327 intrinsic = "llvm.x86.avx.max.pd.256";
328 intr_size = 256;
329 }
330 }
331 }
332 else if (type.floating && util_cpu_caps.has_altivec) {
333 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
334 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
335 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
336 __FUNCTION__);
337 }
338 if (type.width == 32 && type.length == 4) {
339 intrinsic = "llvm.ppc.altivec.vmaxfp";
340 intr_size = 128;
341 }
342 } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
343 intr_size = 128;
344 if ((type.width == 8 || type.width == 16) &&
345 (type.width * type.length <= 64) &&
346 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
347 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
348 __FUNCTION__);
349 }
350 if (type.width == 8 && !type.sign) {
351 intrinsic = "llvm.x86.sse2.pmaxu.b";
352 intr_size = 128;
353 }
354 else if (type.width == 16 && type.sign) {
355 intrinsic = "llvm.x86.sse2.pmaxs.w";
356 }
357 if (util_cpu_caps.has_sse4_1) {
358 if (type.width == 8 && type.sign) {
359 intrinsic = "llvm.x86.sse41.pmaxsb";
360 }
361 if (type.width == 16 && !type.sign) {
362 intrinsic = "llvm.x86.sse41.pmaxuw";
363 }
364 if (type.width == 32 && !type.sign) {
365 intrinsic = "llvm.x86.sse41.pmaxud";
366 }
367 if (type.width == 32 && type.sign) {
368 intrinsic = "llvm.x86.sse41.pmaxsd";
369 }
370 }
371 } else if (util_cpu_caps.has_altivec) {
372 intr_size = 128;
373 if (type.width == 8) {
374 if (!type.sign) {
375 intrinsic = "llvm.ppc.altivec.vmaxub";
376 } else {
377 intrinsic = "llvm.ppc.altivec.vmaxsb";
378 }
379 } else if (type.width == 16) {
380 if (!type.sign) {
381 intrinsic = "llvm.ppc.altivec.vmaxuh";
382 } else {
383 intrinsic = "llvm.ppc.altivec.vmaxsh";
384 }
385 } else if (type.width == 32) {
386 if (!type.sign) {
387 intrinsic = "llvm.ppc.altivec.vmaxuw";
388 } else {
389 intrinsic = "llvm.ppc.altivec.vmaxsw";
390 }
391 }
392 }
393
394 if (intrinsic) {
395 if (util_cpu_caps.has_sse && type.floating &&
396 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
397 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
398 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
399 LLVMValueRef isnan, max;
400 max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
401 type,
402 intr_size, a, b);
403 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
404 isnan = lp_build_isnan(bld, b);
405 return lp_build_select(bld, isnan, a, max);
406 } else {
407 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
408 isnan = lp_build_isnan(bld, a);
409 return lp_build_select(bld, isnan, a, max);
410 }
411 } else {
412 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
413 type,
414 intr_size, a, b);
415 }
416 }
417
418 if (type.floating) {
419 switch (nan_behavior) {
420 case GALLIVM_NAN_RETURN_NAN: {
421 LLVMValueRef isnan = lp_build_isnan(bld, b);
422 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
423 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
424 return lp_build_select(bld, cond, a, b);
425 }
426 break;
427 case GALLIVM_NAN_RETURN_OTHER: {
428 LLVMValueRef isnan = lp_build_isnan(bld, a);
429 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
430 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
431 return lp_build_select(bld, cond, a, b);
432 }
433 break;
434 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
435 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
436 return lp_build_select(bld, cond, a, b);
437 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
438 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
439 return lp_build_select(bld, cond, b, a);
440 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
441 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
442 return lp_build_select(bld, cond, a, b);
443 break;
444 default:
445 assert(0);
446 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
447 return lp_build_select(bld, cond, a, b);
448 }
449 } else {
450 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
451 return lp_build_select(bld, cond, a, b);
452 }
453 }
454
455
456 /**
457 * Generate 1 - a, or ~a depending on bld->type.
458 */
459 LLVMValueRef
460 lp_build_comp(struct lp_build_context *bld,
461 LLVMValueRef a)
462 {
463 LLVMBuilderRef builder = bld->gallivm->builder;
464 const struct lp_type type = bld->type;
465
466 assert(lp_check_value(type, a));
467
468 if(a == bld->one)
469 return bld->zero;
470 if(a == bld->zero)
471 return bld->one;
472
473 if(type.norm && !type.floating && !type.fixed && !type.sign) {
474 if(LLVMIsConstant(a))
475 return LLVMConstNot(a);
476 else
477 return LLVMBuildNot(builder, a, "");
478 }
479
480 if(LLVMIsConstant(a))
481 if (type.floating)
482 return LLVMConstFSub(bld->one, a);
483 else
484 return LLVMConstSub(bld->one, a);
485 else
486 if (type.floating)
487 return LLVMBuildFSub(builder, bld->one, a, "");
488 else
489 return LLVMBuildSub(builder, bld->one, a, "");
490 }
491
492
493 /**
494 * Generate a + b
495 */
496 LLVMValueRef
497 lp_build_add(struct lp_build_context *bld,
498 LLVMValueRef a,
499 LLVMValueRef b)
500 {
501 LLVMBuilderRef builder = bld->gallivm->builder;
502 const struct lp_type type = bld->type;
503 LLVMValueRef res;
504
505 assert(lp_check_value(type, a));
506 assert(lp_check_value(type, b));
507
508 if(a == bld->zero)
509 return b;
510 if(b == bld->zero)
511 return a;
512 if(a == bld->undef || b == bld->undef)
513 return bld->undef;
514
515 if(bld->type.norm) {
516 const char *intrinsic = NULL;
517
518 if(a == bld->one || b == bld->one)
519 return bld->one;
520
521 if (type.width * type.length == 128 &&
522 !type.floating && !type.fixed) {
523 if(util_cpu_caps.has_sse2) {
524 if(type.width == 8)
525 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
526 if(type.width == 16)
527 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
528 } else if (util_cpu_caps.has_altivec) {
529 if(type.width == 8)
530 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
531 if(type.width == 16)
532 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
533 }
534 }
535
536 if (intrinsic)
537 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
538 }
539
540 if(type.norm && !type.floating && !type.fixed) {
541 if (type.sign) {
542 uint64_t sign = (uint64_t)1 << (type.width - 1);
543 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
544 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
545 /* a_clamp_max is the maximum a for positive b,
546 a_clamp_min is the minimum a for negative b. */
547 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
548 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
549 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
550 } else {
551 a = lp_build_min_simple(bld, a, lp_build_comp(bld, b), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
552 }
553 }
554
555 if(LLVMIsConstant(a) && LLVMIsConstant(b))
556 if (type.floating)
557 res = LLVMConstFAdd(a, b);
558 else
559 res = LLVMConstAdd(a, b);
560 else
561 if (type.floating)
562 res = LLVMBuildFAdd(builder, a, b, "");
563 else
564 res = LLVMBuildAdd(builder, a, b, "");
565
566 /* clamp to ceiling of 1.0 */
567 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
568 res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
569
570 /* XXX clamp to floor of -1 or 0??? */
571
572 return res;
573 }
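
/*
 * Worked example for the signed saturation above (an illustrative note, not
 * part of the original file): for 8-bit signed normalized values,
 * max_val = 127 and min_val = -128.  With a = 100 and b = 60 (b > 0) the
 * clamp gives a_clamp_max = min(100, 127 - 60) = 67, so the subsequent add
 * produces 67 + 60 = 127, i.e. the result saturates instead of wrapping.
 */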
574
575
576 /** Return the scalar sum of the elements of a.
577 * Should avoid this operation whenever possible.
578 */
579 LLVMValueRef
580 lp_build_horizontal_add(struct lp_build_context *bld,
581 LLVMValueRef a)
582 {
583 LLVMBuilderRef builder = bld->gallivm->builder;
584 const struct lp_type type = bld->type;
585 LLVMValueRef index, res;
586 unsigned i, length;
587 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
588 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
589 LLVMValueRef vecres, elem2;
590
591 assert(lp_check_value(type, a));
592
593 if (type.length == 1) {
594 return a;
595 }
596
597 assert(!bld->type.norm);
598
599 /*
600 * For byte vectors we could do much better with psadbw.
601 * We use repeated shuffle/adds here. Note that with multiple vectors
602 * this can be done more efficiently as outlined in the Intel
603 * optimization manual.
604 * Note: could cause data rearrangement if used with smaller element
605 * sizes.
606 */
607
608 vecres = a;
609 length = type.length / 2;
610 while (length > 1) {
611 LLVMValueRef vec1, vec2;
612 for (i = 0; i < length; i++) {
613 shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
614 shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
615 }
616 vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
617 LLVMConstVector(shuffles1, length), "");
618 vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
619 LLVMConstVector(shuffles2, length), "");
620 if (type.floating) {
621 vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
622 }
623 else {
624 vecres = LLVMBuildAdd(builder, vec1, vec2, "");
625 }
626 length = length >> 1;
627 }
628
629 /* always have vector of size 2 here */
630 assert(length == 1);
631
632 index = lp_build_const_int32(bld->gallivm, 0);
633 res = LLVMBuildExtractElement(builder, vecres, index, "");
634 index = lp_build_const_int32(bld->gallivm, 1);
635 elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
636
637 if (type.floating)
638 res = LLVMBuildFAdd(builder, res, elem2, "");
639 else
640 res = LLVMBuildAdd(builder, res, elem2, "");
641
642 return res;
643 }
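
/*
 * Example (an illustrative note, not part of the original file): reducing a
 * 4 x float32 vector `v' (assumed to be provided by the caller and to match
 * bld->type) to a scalar sum:
 *
 *    LLVMValueRef sum = lp_build_horizontal_add(bld, v);
 *
 * which emits the log2(length) shuffle/add steps above and returns a scalar
 * of the element type.
 */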
644
645 /**
646 * Return the horizontal sums of 4 float vectors as a float4 vector.
647 * This uses the technique outlined in the Intel Optimization Manual.
648 */
649 static LLVMValueRef
650 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
651 LLVMValueRef src[4])
652 {
653 struct gallivm_state *gallivm = bld->gallivm;
654 LLVMBuilderRef builder = gallivm->builder;
655 LLVMValueRef shuffles[4];
656 LLVMValueRef tmp[4];
657 LLVMValueRef sumtmp[2], shuftmp[2];
658
659 /* lower half of regs */
660 shuffles[0] = lp_build_const_int32(gallivm, 0);
661 shuffles[1] = lp_build_const_int32(gallivm, 1);
662 shuffles[2] = lp_build_const_int32(gallivm, 4);
663 shuffles[3] = lp_build_const_int32(gallivm, 5);
664 tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
665 LLVMConstVector(shuffles, 4), "");
666 tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
667 LLVMConstVector(shuffles, 4), "");
668
669 /* upper half of regs */
670 shuffles[0] = lp_build_const_int32(gallivm, 2);
671 shuffles[1] = lp_build_const_int32(gallivm, 3);
672 shuffles[2] = lp_build_const_int32(gallivm, 6);
673 shuffles[3] = lp_build_const_int32(gallivm, 7);
674 tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
675 LLVMConstVector(shuffles, 4), "");
676 tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
677 LLVMConstVector(shuffles, 4), "");
678
679 sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
680 sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
681
682 shuffles[0] = lp_build_const_int32(gallivm, 0);
683 shuffles[1] = lp_build_const_int32(gallivm, 2);
684 shuffles[2] = lp_build_const_int32(gallivm, 4);
685 shuffles[3] = lp_build_const_int32(gallivm, 6);
686 shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
687 LLVMConstVector(shuffles, 4), "");
688
689 shuffles[0] = lp_build_const_int32(gallivm, 1);
690 shuffles[1] = lp_build_const_int32(gallivm, 3);
691 shuffles[2] = lp_build_const_int32(gallivm, 5);
692 shuffles[3] = lp_build_const_int32(gallivm, 7);
693 shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
694 LLVMConstVector(shuffles, 4), "");
695
696 return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
697 }
698
699
700 /*
701 * partially horizontally add 2-4 float vectors with length nx4,
702 * i.e. only four adjacent values in each vector will be added,
703 * assuming values are really grouped in 4 which also determines
704 * output order.
705 *
706 * Return a vector of the same length as the initial vectors,
707 * with the excess elements (if any) being undefined.
708 * The element order is independent of number of input vectors.
709 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
710 * the output order thus will be
711 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
712 */
713 LLVMValueRef
714 lp_build_hadd_partial4(struct lp_build_context *bld,
715 LLVMValueRef vectors[],
716 unsigned num_vecs)
717 {
718 struct gallivm_state *gallivm = bld->gallivm;
719 LLVMBuilderRef builder = gallivm->builder;
720 LLVMValueRef ret_vec;
721 LLVMValueRef tmp[4];
722 const char *intrinsic = NULL;
723
724 assert(num_vecs >= 2 && num_vecs <= 4);
725 assert(bld->type.floating);
726
727 /* only use this with at least 2 vectors, as it is sort of expensive
728 * (depending on cpu) and we always need two horizontal adds anyway,
729 * so a shuffle/add approach might be better.
730 */
731
732 tmp[0] = vectors[0];
733 tmp[1] = vectors[1];
734
735 tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
736 tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
737
738 if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
739 bld->type.length == 4) {
740 intrinsic = "llvm.x86.sse3.hadd.ps";
741 }
742 else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
743 bld->type.length == 8) {
744 intrinsic = "llvm.x86.avx.hadd.ps.256";
745 }
746 if (intrinsic) {
747 tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
748 lp_build_vec_type(gallivm, bld->type),
749 tmp[0], tmp[1]);
750 if (num_vecs > 2) {
751 tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
752 lp_build_vec_type(gallivm, bld->type),
753 tmp[2], tmp[3]);
754 }
755 else {
756 tmp[1] = tmp[0];
757 }
758 return lp_build_intrinsic_binary(builder, intrinsic,
759 lp_build_vec_type(gallivm, bld->type),
760 tmp[0], tmp[1]);
761 }
762
763 if (bld->type.length == 4) {
764 ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
765 }
766 else {
767 LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
768 unsigned j;
769 unsigned num_iter = bld->type.length / 4;
770 struct lp_type parttype = bld->type;
771 parttype.length = 4;
772 for (j = 0; j < num_iter; j++) {
773 LLVMValueRef partsrc[4];
774 unsigned i;
775 for (i = 0; i < 4; i++) {
776 partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
777 }
778 partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
779 }
780 ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
781 }
782 return ret_vec;
783 }
784
785 /**
786 * Generate a - b
787 */
788 LLVMValueRef
789 lp_build_sub(struct lp_build_context *bld,
790 LLVMValueRef a,
791 LLVMValueRef b)
792 {
793 LLVMBuilderRef builder = bld->gallivm->builder;
794 const struct lp_type type = bld->type;
795 LLVMValueRef res;
796
797 assert(lp_check_value(type, a));
798 assert(lp_check_value(type, b));
799
800 if(b == bld->zero)
801 return a;
802 if(a == bld->undef || b == bld->undef)
803 return bld->undef;
804 if(a == b)
805 return bld->zero;
806
807 if(bld->type.norm) {
808 const char *intrinsic = NULL;
809
810 if(b == bld->one)
811 return bld->zero;
812
813 if (type.width * type.length == 128 &&
814 !type.floating && !type.fixed) {
815 if (util_cpu_caps.has_sse2) {
816 if(type.width == 8)
817 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
818 if(type.width == 16)
819 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
820 } else if (util_cpu_caps.has_altivec) {
821 if(type.width == 8)
822 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
823 if(type.width == 16)
824 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
825 }
826 }
827
828 if (intrinsic)
829 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
830 }
831
832 if(type.norm && !type.floating && !type.fixed) {
833 if (type.sign) {
834 uint64_t sign = (uint64_t)1 << (type.width - 1);
835 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
836 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
837 /* a_clamp_max is the maximum a for negative b,
838 a_clamp_min is the minimum a for positive b. */
839 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
840 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
841 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
842 } else {
843 a = lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
844 }
845 }
846
847 if(LLVMIsConstant(a) && LLVMIsConstant(b))
848 if (type.floating)
849 res = LLVMConstFSub(a, b);
850 else
851 res = LLVMConstSub(a, b);
852 else
853 if (type.floating)
854 res = LLVMBuildFSub(builder, a, b, "");
855 else
856 res = LLVMBuildSub(builder, a, b, "");
857
858 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
859 res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
860
861 return res;
862 }
863
864
865
866 /**
867 * Normalized multiplication.
868 *
869 * There are several approaches (using 8-bit normalized multiplication as
870 * an example):
871 *
872 * - alpha plus one
873 *
874 * makes the following approximation to the division (Sree)
875 *
876 * a*b/255 ~= (a*(b + 1)) >> 8
877 *
878 * which is the fastest method that satisfies the following OpenGL criteria of
879 *
880 * 0*0 = 0 and 255*255 = 255
881 *
882 * - geometric series
883 *
884 * takes the geometric series approximation to the division
885 *
886 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
887 *
888 * in this case just the first two terms to fit in 16bit arithmetic
889 *
890 * t/255 ~= (t + (t >> 8)) >> 8
891 *
892 * note that just by itself it doesn't satisfy the OpenGL criteria, as
893 * 255*255 yields 254, so the special case b = 255 must be accounted for
894 * or roundoff must be used.
895 *
896 * - geometric series plus rounding
897 *
898 * when using a geometric series division, instead of truncating the result,
899 * use roundoff in the approximation (Jim Blinn)
900 *
901 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
902 *
903 * achieving exact results.
904 *
905 *
906 *
907 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
908 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
909 * @sa Michael Herf, The "double blend trick", May 2000,
910 * http://www.stereopsis.com/doubleblend.html
911 */
912 static LLVMValueRef
913 lp_build_mul_norm(struct gallivm_state *gallivm,
914 struct lp_type wide_type,
915 LLVMValueRef a, LLVMValueRef b)
916 {
917 LLVMBuilderRef builder = gallivm->builder;
918 struct lp_build_context bld;
919 unsigned n;
920 LLVMValueRef half;
921 LLVMValueRef ab;
922
923 assert(!wide_type.floating);
924 assert(lp_check_value(wide_type, a));
925 assert(lp_check_value(wide_type, b));
926
927 lp_build_context_init(&bld, gallivm, wide_type);
928
929 n = wide_type.width / 2;
930 if (wide_type.sign) {
931 --n;
932 }
933
934 /*
935 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
936 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
937 */
938
939 /*
940 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
941 */
942
943 ab = LLVMBuildMul(builder, a, b, "");
944 ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
945
946 /*
947 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
948 */
949
950 half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
951 if (wide_type.sign) {
952 LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
953 LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
954 half = lp_build_select(&bld, sign, minus_half, half);
955 }
956 ab = LLVMBuildAdd(builder, ab, half, "");
957
958 /* Final division */
959 ab = lp_build_shr_imm(&bld, ab, n);
960
961 return ab;
962 }
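
/*
 * Worked example for the formula above (an illustrative note, not part of
 * the original file): for 8-bit unsigned normalized values widened to
 * 16 bits, n = 8 and half = 0x80.  With a = b = 255:
 *
 *    ab             = 255 * 255     = 65025
 *    ab + (ab >> 8) = 65025 + 254   = 65279
 *    + half         = 65279 + 128   = 65407
 *    >> 8                           = 255
 *
 * so 255 * 255 maps back to 255 as required, and 0 * x trivially stays 0.
 */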
963
964 /**
965 * Generate a * b
966 */
967 LLVMValueRef
968 lp_build_mul(struct lp_build_context *bld,
969 LLVMValueRef a,
970 LLVMValueRef b)
971 {
972 LLVMBuilderRef builder = bld->gallivm->builder;
973 const struct lp_type type = bld->type;
974 LLVMValueRef shift;
975 LLVMValueRef res;
976
977 assert(lp_check_value(type, a));
978 assert(lp_check_value(type, b));
979
980 if(a == bld->zero)
981 return bld->zero;
982 if(a == bld->one)
983 return b;
984 if(b == bld->zero)
985 return bld->zero;
986 if(b == bld->one)
987 return a;
988 if(a == bld->undef || b == bld->undef)
989 return bld->undef;
990
991 if (!type.floating && !type.fixed && type.norm) {
992 struct lp_type wide_type = lp_wider_type(type);
993 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
994
995 lp_build_unpack2(bld->gallivm, type, wide_type, a, &al, &ah);
996 lp_build_unpack2(bld->gallivm, type, wide_type, b, &bl, &bh);
997
998 /* PMULLW, PSRLW, PADDW */
999 abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
1000 abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
1001
1002 ab = lp_build_pack2(bld->gallivm, wide_type, type, abl, abh);
1003
1004 return ab;
1005 }
1006
1007 if(type.fixed)
1008 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
1009 else
1010 shift = NULL;
1011
1012 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1013 if (type.floating)
1014 res = LLVMConstFMul(a, b);
1015 else
1016 res = LLVMConstMul(a, b);
1017 if(shift) {
1018 if(type.sign)
1019 res = LLVMConstAShr(res, shift);
1020 else
1021 res = LLVMConstLShr(res, shift);
1022 }
1023 }
1024 else {
1025 if (type.floating)
1026 res = LLVMBuildFMul(builder, a, b, "");
1027 else
1028 res = LLVMBuildMul(builder, a, b, "");
1029 if(shift) {
1030 if(type.sign)
1031 res = LLVMBuildAShr(builder, res, shift, "");
1032 else
1033 res = LLVMBuildLShr(builder, res, shift, "");
1034 }
1035 }
1036
1037 return res;
1038 }
1039
1040
1041 /* a * b + c */
1042 LLVMValueRef
1043 lp_build_mad(struct lp_build_context *bld,
1044 LLVMValueRef a,
1045 LLVMValueRef b,
1046 LLVMValueRef c)
1047 {
1048 const struct lp_type type = bld->type;
1049 if (type.floating) {
1050 return lp_build_fmuladd(bld->gallivm->builder, a, b, c);
1051 } else {
1052 return lp_build_add(bld, lp_build_mul(bld, a, b), c);
1053 }
1054 }
1055
1056
1057 /**
1058 * Small vector x scale multiplication optimization.
1059 */
1060 LLVMValueRef
1061 lp_build_mul_imm(struct lp_build_context *bld,
1062 LLVMValueRef a,
1063 int b)
1064 {
1065 LLVMBuilderRef builder = bld->gallivm->builder;
1066 LLVMValueRef factor;
1067
1068 assert(lp_check_value(bld->type, a));
1069
1070 if(b == 0)
1071 return bld->zero;
1072
1073 if(b == 1)
1074 return a;
1075
1076 if(b == -1)
1077 return lp_build_negate(bld, a);
1078
1079 if(b == 2 && bld->type.floating)
1080 return lp_build_add(bld, a, a);
1081
1082 if(util_is_power_of_two(b)) {
1083 unsigned shift = ffs(b) - 1;
1084
1085 if(bld->type.floating) {
1086 #if 0
1087 /*
1088 * Power of two multiplication by directly manipulating the exponent.
1089 *
1090 * XXX: This might not be always faster, it will introduce a small error
1091 * for multiplication by zero, and it will produce wrong results
1092 * for Inf and NaN.
1093 */
1094 unsigned mantissa = lp_mantissa(bld->type);
1095 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1096 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1097 a = LLVMBuildAdd(builder, a, factor, "");
1098 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1099 return a;
1100 #endif
1101 }
1102 else {
1103 factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1104 return LLVMBuildShl(builder, a, factor, "");
1105 }
1106 }
1107
1108 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1109 return lp_build_mul(bld, a, factor);
1110 }
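
/*
 * Example (an illustrative note, not part of the original file): for an
 * integer bld->type, lp_build_mul_imm(bld, a, 8) takes the power-of-two path
 * above and emits a shift left by 3, whereas lp_build_mul_imm(bld, a, 3)
 * falls through to a regular lp_build_mul() against the constant 3.
 */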
1111
1112
1113 /**
1114 * Generate a / b
1115 */
1116 LLVMValueRef
1117 lp_build_div(struct lp_build_context *bld,
1118 LLVMValueRef a,
1119 LLVMValueRef b)
1120 {
1121 LLVMBuilderRef builder = bld->gallivm->builder;
1122 const struct lp_type type = bld->type;
1123
1124 assert(lp_check_value(type, a));
1125 assert(lp_check_value(type, b));
1126
1127 if(a == bld->zero)
1128 return bld->zero;
1129 if(a == bld->one && type.floating)
1130 return lp_build_rcp(bld, b);
1131 if(b == bld->zero)
1132 return bld->undef;
1133 if(b == bld->one)
1134 return a;
1135 if(a == bld->undef || b == bld->undef)
1136 return bld->undef;
1137
1138 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1139 if (type.floating)
1140 return LLVMConstFDiv(a, b);
1141 else if (type.sign)
1142 return LLVMConstSDiv(a, b);
1143 else
1144 return LLVMConstUDiv(a, b);
1145 }
1146
1147 if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1148 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
1149 type.floating)
1150 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1151
1152 if (type.floating)
1153 return LLVMBuildFDiv(builder, a, b, "");
1154 else if (type.sign)
1155 return LLVMBuildSDiv(builder, a, b, "");
1156 else
1157 return LLVMBuildUDiv(builder, a, b, "");
1158 }
1159
1160
1161 /**
1162 * Linear interpolation helper.
1163 *
1164 * @param flags LP_BLD_LERP_x flags. LP_BLD_LERP_WIDE_NORMALIZED means we are
1165 * interpolating normalized values, encoded in integers twice as wide.
1166 *
1167 * @sa http://www.stereopsis.com/doubleblend.html
1168 */
1169 static inline LLVMValueRef
1170 lp_build_lerp_simple(struct lp_build_context *bld,
1171 LLVMValueRef x,
1172 LLVMValueRef v0,
1173 LLVMValueRef v1,
1174 unsigned flags)
1175 {
1176 unsigned half_width = bld->type.width/2;
1177 LLVMBuilderRef builder = bld->gallivm->builder;
1178 LLVMValueRef delta;
1179 LLVMValueRef res;
1180
1181 assert(lp_check_value(bld->type, x));
1182 assert(lp_check_value(bld->type, v0));
1183 assert(lp_check_value(bld->type, v1));
1184
1185 delta = lp_build_sub(bld, v1, v0);
1186
1187 if (bld->type.floating) {
1188 assert(flags == 0);
1189 return lp_build_mad(bld, x, delta, v0);
1190 }
1191
1192 if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1193 if (!bld->type.sign) {
1194 if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1195 /*
1196 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the most significant
1197 * bit to the least significant bit (e.g. for n = 8, x = 255 becomes 256),
1198 * so that later we can just divide by 2**n instead of 2**n - 1.
1199 */
1200
1201 x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1202 }
1203
1204 /* (x * delta) >> n */
1205 res = lp_build_mul(bld, x, delta);
1206 res = lp_build_shr_imm(bld, res, half_width);
1207 } else {
1208 /*
1209 * The rescaling trick above doesn't work for signed numbers, so
1210 * use the 2**n - 1 division approximation in lp_build_mul_norm
1211 * instead.
1212 */
1213 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1214 res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1215 }
1216 } else {
1217 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1218 res = lp_build_mul(bld, x, delta);
1219 }
1220
1221 if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
1222 /*
1223 * At this point both res and v0 only use the lower half of the bits,
1224 * the rest is zero. Instead of add / mask, do add with half wide type.
1225 */
1226 struct lp_type narrow_type;
1227 struct lp_build_context narrow_bld;
1228
1229 memset(&narrow_type, 0, sizeof narrow_type);
1230 narrow_type.sign = bld->type.sign;
1231 narrow_type.width = bld->type.width/2;
1232 narrow_type.length = bld->type.length*2;
1233
1234 lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
1235 res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
1236 v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
1237 res = lp_build_add(&narrow_bld, v0, res);
1238 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
1239 } else {
1240 res = lp_build_add(bld, v0, res);
1241
1242 if (bld->type.fixed) {
1243 /*
1244 * We need to mask out the high order bits when lerping 8-bit
1245 * normalized colors stored in 16 bits
1246 */
1247 /* XXX: This step is necessary for lerping 8-bit colors stored in
1248 * 16 bits, but it will be wrong for true fixed point use cases.
1249 * Basically we need a more powerful lp_type, capable of further
1250 * distinguishing the values interpretation from the value storage.
1251 */
1252 LLVMValueRef low_bits;
1253 low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
1254 res = LLVMBuildAnd(builder, res, low_bits, "");
1255 }
1256 }
1257
1258 return res;
1259 }
1260
1261
1262 /**
1263 * Linear interpolation.
1264 */
1265 LLVMValueRef
1266 lp_build_lerp(struct lp_build_context *bld,
1267 LLVMValueRef x,
1268 LLVMValueRef v0,
1269 LLVMValueRef v1,
1270 unsigned flags)
1271 {
1272 const struct lp_type type = bld->type;
1273 LLVMValueRef res;
1274
1275 assert(lp_check_value(type, x));
1276 assert(lp_check_value(type, v0));
1277 assert(lp_check_value(type, v1));
1278
1279 assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1280
1281 if (type.norm) {
1282 struct lp_type wide_type;
1283 struct lp_build_context wide_bld;
1284 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1285
1286 assert(type.length >= 2);
1287
1288 /*
1289 * Create a wider integer type, enough to hold the
1290 * intermediate result of the multiplication.
1291 */
1292 memset(&wide_type, 0, sizeof wide_type);
1293 wide_type.sign = type.sign;
1294 wide_type.width = type.width*2;
1295 wide_type.length = type.length/2;
1296
1297 lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1298
1299 lp_build_unpack2(bld->gallivm, type, wide_type, x, &xl, &xh);
1300 lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1301 lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1302
1303 /*
1304 * Lerp both halves.
1305 */
1306
1307 flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1308
1309 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1310 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1311
1312 res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
1313 } else {
1314 res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1315 }
1316
1317 return res;
1318 }
1319
1320
1321 /**
1322 * Bilinear interpolation.
1323 *
1324 * Value indices are in v_{yx}.
1325 */
1326 LLVMValueRef
1327 lp_build_lerp_2d(struct lp_build_context *bld,
1328 LLVMValueRef x,
1329 LLVMValueRef y,
1330 LLVMValueRef v00,
1331 LLVMValueRef v01,
1332 LLVMValueRef v10,
1333 LLVMValueRef v11,
1334 unsigned flags)
1335 {
1336 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1337 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1338 return lp_build_lerp(bld, y, v0, v1, flags);
1339 }
1340
1341
1342 LLVMValueRef
1343 lp_build_lerp_3d(struct lp_build_context *bld,
1344 LLVMValueRef x,
1345 LLVMValueRef y,
1346 LLVMValueRef z,
1347 LLVMValueRef v000,
1348 LLVMValueRef v001,
1349 LLVMValueRef v010,
1350 LLVMValueRef v011,
1351 LLVMValueRef v100,
1352 LLVMValueRef v101,
1353 LLVMValueRef v110,
1354 LLVMValueRef v111,
1355 unsigned flags)
1356 {
1357 LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1358 LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1359 return lp_build_lerp(bld, z, v0, v1, flags);
1360 }
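
/*
 * Example (an illustrative sketch, not part of the original file): bilinear
 * filtering of four texel value vectors t00..t11 with weights s and t, all
 * hypothetical values assumed to be provided by the caller in bld->type:
 *
 *    LLVMValueRef texel = lp_build_lerp_2d(bld, s, t,
 *                                          t00, t01, t10, t11, 0);
 *
 * For normalized integer types lp_build_lerp() widens the operands, lerps
 * both halves with LP_BLD_LERP_WIDE_NORMALIZED and packs the result back,
 * as implemented above.
 */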
1361
1362
1363 /**
1364 * Generate min(a, b)
1365 * Do checks for special cases but not for NaNs.
1366 */
1367 LLVMValueRef
1368 lp_build_min(struct lp_build_context *bld,
1369 LLVMValueRef a,
1370 LLVMValueRef b)
1371 {
1372 assert(lp_check_value(bld->type, a));
1373 assert(lp_check_value(bld->type, b));
1374
1375 if(a == bld->undef || b == bld->undef)
1376 return bld->undef;
1377
1378 if(a == b)
1379 return a;
1380
1381 if (bld->type.norm) {
1382 if (!bld->type.sign) {
1383 if (a == bld->zero || b == bld->zero) {
1384 return bld->zero;
1385 }
1386 }
1387 if(a == bld->one)
1388 return b;
1389 if(b == bld->one)
1390 return a;
1391 }
1392
1393 return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1394 }
1395
1396
1397 /**
1398 * Generate min(a, b)
1399 * NaNs are handled according to the behavior specified by the
1400 * nan_behavior argument.
1401 */
1402 LLVMValueRef
1403 lp_build_min_ext(struct lp_build_context *bld,
1404 LLVMValueRef a,
1405 LLVMValueRef b,
1406 enum gallivm_nan_behavior nan_behavior)
1407 {
1408 assert(lp_check_value(bld->type, a));
1409 assert(lp_check_value(bld->type, b));
1410
1411 if(a == bld->undef || b == bld->undef)
1412 return bld->undef;
1413
1414 if(a == b)
1415 return a;
1416
1417 if (bld->type.norm) {
1418 if (!bld->type.sign) {
1419 if (a == bld->zero || b == bld->zero) {
1420 return bld->zero;
1421 }
1422 }
1423 if(a == bld->one)
1424 return b;
1425 if(b == bld->one)
1426 return a;
1427 }
1428
1429 return lp_build_min_simple(bld, a, b, nan_behavior);
1430 }
1431
1432 /**
1433 * Generate max(a, b)
1434 * Do checks for special cases, but NaN behavior is undefined.
1435 */
1436 LLVMValueRef
1437 lp_build_max(struct lp_build_context *bld,
1438 LLVMValueRef a,
1439 LLVMValueRef b)
1440 {
1441 assert(lp_check_value(bld->type, a));
1442 assert(lp_check_value(bld->type, b));
1443
1444 if(a == bld->undef || b == bld->undef)
1445 return bld->undef;
1446
1447 if(a == b)
1448 return a;
1449
1450 if(bld->type.norm) {
1451 if(a == bld->one || b == bld->one)
1452 return bld->one;
1453 if (!bld->type.sign) {
1454 if (a == bld->zero) {
1455 return b;
1456 }
1457 if (b == bld->zero) {
1458 return a;
1459 }
1460 }
1461 }
1462
1463 return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1464 }
1465
1466
1467 /**
1468 * Generate max(a, b)
1469 * Checks for special cases.
1470 * NaNs are handled according to the behavior specified by the
1471 * nan_behavior argument.
1472 */
1473 LLVMValueRef
1474 lp_build_max_ext(struct lp_build_context *bld,
1475 LLVMValueRef a,
1476 LLVMValueRef b,
1477 enum gallivm_nan_behavior nan_behavior)
1478 {
1479 assert(lp_check_value(bld->type, a));
1480 assert(lp_check_value(bld->type, b));
1481
1482 if(a == bld->undef || b == bld->undef)
1483 return bld->undef;
1484
1485 if(a == b)
1486 return a;
1487
1488 if(bld->type.norm) {
1489 if(a == bld->one || b == bld->one)
1490 return bld->one;
1491 if (!bld->type.sign) {
1492 if (a == bld->zero) {
1493 return b;
1494 }
1495 if (b == bld->zero) {
1496 return a;
1497 }
1498 }
1499 }
1500
1501 return lp_build_max_simple(bld, a, b, nan_behavior);
1502 }
1503
1504 /**
1505 * Generate clamp(a, min, max)
1506 * NaN behavior (for any of a, min, max) is undefined.
1507 * Do checks for special cases.
1508 */
1509 LLVMValueRef
1510 lp_build_clamp(struct lp_build_context *bld,
1511 LLVMValueRef a,
1512 LLVMValueRef min,
1513 LLVMValueRef max)
1514 {
1515 assert(lp_check_value(bld->type, a));
1516 assert(lp_check_value(bld->type, min));
1517 assert(lp_check_value(bld->type, max));
1518
1519 a = lp_build_min(bld, a, max);
1520 a = lp_build_max(bld, a, min);
1521 return a;
1522 }
1523
1524
1525 /**
1526 * Generate clamp(a, 0, 1)
1527 * A NaN will get converted to zero.
1528 */
1529 LLVMValueRef
1530 lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
1531 LLVMValueRef a)
1532 {
1533 a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1534 a = lp_build_min(bld, a, bld->one);
1535 return a;
1536 }
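
/*
 * Example (an illustrative note, not part of the original file): a
 * D3D10-style saturate of a float value `a' (assumed to be provided by the
 * caller), which must map NaN to zero, can use
 *
 *    a = lp_build_clamp_zero_one_nanzero(bld, a);
 *
 * whereas lp_build_clamp(bld, a, bld->zero, bld->one) is cheaper but leaves
 * the NaN behavior undefined.
 */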
1537
1538
1539 /**
1540 * Generate abs(a)
1541 */
1542 LLVMValueRef
1543 lp_build_abs(struct lp_build_context *bld,
1544 LLVMValueRef a)
1545 {
1546 LLVMBuilderRef builder = bld->gallivm->builder;
1547 const struct lp_type type = bld->type;
1548 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1549
1550 assert(lp_check_value(type, a));
1551
1552 if(!type.sign)
1553 return a;
1554
1555 if(type.floating) {
1556 if (0x0306 <= HAVE_LLVM && HAVE_LLVM < 0x0309) {
1557 /* Workaround llvm.org/PR27332 */
1558 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1559 unsigned long long absMask = ~(1ULL << (type.width - 1));
1560 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
1561 a = LLVMBuildBitCast(builder, a, int_vec_type, "");
1562 a = LLVMBuildAnd(builder, a, mask, "");
1563 a = LLVMBuildBitCast(builder, a, vec_type, "");
1564 return a;
1565 } else {
1566 char intrinsic[32];
1567 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
1568 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
1569 }
1570 }
1571
1572 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
1573 switch(type.width) {
1574 case 8:
1575 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1576 case 16:
1577 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1578 case 32:
1579 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1580 }
1581 }
1582 else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
1583 (gallivm_debug & GALLIVM_DEBUG_PERF) &&
1584 (type.width == 8 || type.width == 16 || type.width == 32)) {
1585 debug_printf("%s: inefficient code, should split vectors manually\n",
1586 __FUNCTION__);
1587 }
1588
1589 return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
1590 }
1591
1592
1593 LLVMValueRef
1594 lp_build_negate(struct lp_build_context *bld,
1595 LLVMValueRef a)
1596 {
1597 LLVMBuilderRef builder = bld->gallivm->builder;
1598
1599 assert(lp_check_value(bld->type, a));
1600
1601 if (bld->type.floating)
1602 a = LLVMBuildFNeg(builder, a, "");
1603 else
1604 a = LLVMBuildNeg(builder, a, "");
1605
1606 return a;
1607 }
1608
1609
1610 /** Return -1, 0 or +1 depending on the sign of a */
1611 LLVMValueRef
1612 lp_build_sgn(struct lp_build_context *bld,
1613 LLVMValueRef a)
1614 {
1615 LLVMBuilderRef builder = bld->gallivm->builder;
1616 const struct lp_type type = bld->type;
1617 LLVMValueRef cond;
1618 LLVMValueRef res;
1619
1620 assert(lp_check_value(type, a));
1621
1622 /* Handle non-zero case */
1623 if(!type.sign) {
1624 /* if not zero then sign must be positive */
1625 res = bld->one;
1626 }
1627 else if(type.floating) {
1628 LLVMTypeRef vec_type;
1629 LLVMTypeRef int_type;
1630 LLVMValueRef mask;
1631 LLVMValueRef sign;
1632 LLVMValueRef one;
1633 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1634
1635 int_type = lp_build_int_vec_type(bld->gallivm, type);
1636 vec_type = lp_build_vec_type(bld->gallivm, type);
1637 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1638
1639 /* Take the sign bit and or it into the constant 1.0 */
1640 sign = LLVMBuildBitCast(builder, a, int_type, "");
1641 sign = LLVMBuildAnd(builder, sign, mask, "");
1642 one = LLVMConstBitCast(bld->one, int_type);
1643 res = LLVMBuildOr(builder, sign, one, "");
1644 res = LLVMBuildBitCast(builder, res, vec_type, "");
1645 }
1646 else
1647 {
1648 /* signed int/norm/fixed point */
1649 /* could use psign with sse3 and appropriate vectors here */
1650 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1651 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1652 res = lp_build_select(bld, cond, bld->one, minus_one);
1653 }
1654
1655 /* Handle zero */
1656 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1657 res = lp_build_select(bld, cond, bld->zero, res);
1658
1659 return res;
1660 }
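
/*
 * Worked example for the float path above (an illustrative note, not part of
 * the original file): for a = -3.5f the sign bit is 0x80000000 and bld->one
 * is 0x3f800000 (1.0f), so the OR yields 0xbf800000, i.e. -1.0f; the final
 * select then maps a == 0.0 to 0.0.
 */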
1661
1662
1663 /**
1664 * Set the sign of float vector 'a' according to 'sign'.
1665 * If sign==0, return abs(a).
1666 * If sign==1, return -abs(a);
1667 * Other values for sign produce undefined results.
1668 */
1669 LLVMValueRef
1670 lp_build_set_sign(struct lp_build_context *bld,
1671 LLVMValueRef a, LLVMValueRef sign)
1672 {
1673 LLVMBuilderRef builder = bld->gallivm->builder;
1674 const struct lp_type type = bld->type;
1675 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1676 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1677 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1678 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1679 ~((unsigned long long) 1 << (type.width - 1)));
1680 LLVMValueRef val, res;
1681
1682 assert(type.floating);
1683 assert(lp_check_value(type, a));
1684
1685 /* val = reinterpret_cast<int>(a) */
1686 val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1687 /* val = val & mask */
1688 val = LLVMBuildAnd(builder, val, mask, "");
1689 /* sign = sign << shift */
1690 sign = LLVMBuildShl(builder, sign, shift, "");
1691 /* res = val | sign */
1692 res = LLVMBuildOr(builder, val, sign, "");
1693 /* res = reinterpret_cast<float>(res) */
1694 res = LLVMBuildBitCast(builder, res, vec_type, "");
1695
1696 return res;
1697 }
1698
1699
1700 /**
1701 * Convert vector of (or scalar) int to vector of (or scalar) float.
1702 */
1703 LLVMValueRef
1704 lp_build_int_to_float(struct lp_build_context *bld,
1705 LLVMValueRef a)
1706 {
1707 LLVMBuilderRef builder = bld->gallivm->builder;
1708 const struct lp_type type = bld->type;
1709 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1710
1711 assert(type.floating);
1712
1713 return LLVMBuildSIToFP(builder, a, vec_type, "");
1714 }
1715
1716 static boolean
1717 arch_rounding_available(const struct lp_type type)
1718 {
1719 if ((util_cpu_caps.has_sse4_1 &&
1720 (type.length == 1 || type.width*type.length == 128)) ||
1721 (util_cpu_caps.has_avx && type.width*type.length == 256))
1722 return TRUE;
1723 else if ((util_cpu_caps.has_altivec &&
1724 (type.width == 32 && type.length == 4)))
1725 return TRUE;
1726
1727 return FALSE;
1728 }
1729
1730 enum lp_build_round_mode
1731 {
1732 LP_BUILD_ROUND_NEAREST = 0,
1733 LP_BUILD_ROUND_FLOOR = 1,
1734 LP_BUILD_ROUND_CEIL = 2,
1735 LP_BUILD_ROUND_TRUNCATE = 3
1736 };
1737
1738 static inline LLVMValueRef
1739 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1740 LLVMValueRef a)
1741 {
1742 LLVMBuilderRef builder = bld->gallivm->builder;
1743 const struct lp_type type = bld->type;
1744 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1745 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1746 const char *intrinsic;
1747 LLVMValueRef res;
1748
1749 assert(type.floating);
1750 /* using the double precision conversions is a bit more complicated */
1751 assert(type.width == 32);
1752
1753 assert(lp_check_value(type, a));
1754 assert(util_cpu_caps.has_sse2);
1755
1756 /* This is relying on MXCSR rounding mode, which should always be nearest. */
1757 if (type.length == 1) {
1758 LLVMTypeRef vec_type;
1759 LLVMValueRef undef;
1760 LLVMValueRef arg;
1761 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1762
1763 vec_type = LLVMVectorType(bld->elem_type, 4);
1764
1765 intrinsic = "llvm.x86.sse.cvtss2si";
1766
1767 undef = LLVMGetUndef(vec_type);
1768
1769 arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
1770
1771 res = lp_build_intrinsic_unary(builder, intrinsic,
1772 ret_type, arg);
1773 }
1774 else {
1775 if (type.width * type.length == 128) {
1776 intrinsic = "llvm.x86.sse2.cvtps2dq";
1777 }
1778 else {
1779 assert(type.width*type.length == 256);
1780 assert(util_cpu_caps.has_avx);
1781
1782 intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
1783 }
1784 res = lp_build_intrinsic_unary(builder, intrinsic,
1785 ret_type, a);
1786 }
1787
1788 return res;
1789 }
1790
1791
1792 /*
1793 */
1794 static inline LLVMValueRef
1795 lp_build_round_altivec(struct lp_build_context *bld,
1796 LLVMValueRef a,
1797 enum lp_build_round_mode mode)
1798 {
1799 LLVMBuilderRef builder = bld->gallivm->builder;
1800 const struct lp_type type = bld->type;
1801 const char *intrinsic = NULL;
1802
1803 assert(type.floating);
1804
1805 assert(lp_check_value(type, a));
1806 assert(util_cpu_caps.has_altivec);
1807
1808 (void)type;
1809
1810 switch (mode) {
1811 case LP_BUILD_ROUND_NEAREST:
1812 intrinsic = "llvm.ppc.altivec.vrfin";
1813 break;
1814 case LP_BUILD_ROUND_FLOOR:
1815 intrinsic = "llvm.ppc.altivec.vrfim";
1816 break;
1817 case LP_BUILD_ROUND_CEIL:
1818 intrinsic = "llvm.ppc.altivec.vrfip";
1819 break;
1820 case LP_BUILD_ROUND_TRUNCATE:
1821 intrinsic = "llvm.ppc.altivec.vrfiz";
1822 break;
1823 }
1824
1825 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1826 }
1827
1828 static inline LLVMValueRef
1829 lp_build_round_arch(struct lp_build_context *bld,
1830 LLVMValueRef a,
1831 enum lp_build_round_mode mode)
1832 {
1833 if (util_cpu_caps.has_sse4_1) {
1834 LLVMBuilderRef builder = bld->gallivm->builder;
1835 const struct lp_type type = bld->type;
1836 const char *intrinsic_root;
1837 char intrinsic[32];
1838
1839 assert(type.floating);
1840 assert(lp_check_value(type, a));
1841 (void)type;
1842
1843 switch (mode) {
1844 case LP_BUILD_ROUND_NEAREST:
1845 intrinsic_root = "llvm.nearbyint";
1846 break;
1847 case LP_BUILD_ROUND_FLOOR:
1848 intrinsic_root = "llvm.floor";
1849 break;
1850 case LP_BUILD_ROUND_CEIL:
1851 intrinsic_root = "llvm.ceil";
1852 break;
1853 case LP_BUILD_ROUND_TRUNCATE:
1854 intrinsic_root = "llvm.trunc";
1855 break;
1856 }
1857
1858 lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type);
1859 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1860 }
1861 else /* (util_cpu_caps.has_altivec) */
1862 return lp_build_round_altivec(bld, a, mode);
1863 }
1864
1865 /**
1866 * Return the integer part of a float (vector) value (== round toward zero).
1867 * The returned value is a float (vector).
1868 * Ex: trunc(-1.5) = -1.0
1869 */
1870 LLVMValueRef
1871 lp_build_trunc(struct lp_build_context *bld,
1872 LLVMValueRef a)
1873 {
1874 LLVMBuilderRef builder = bld->gallivm->builder;
1875 const struct lp_type type = bld->type;
1876
1877 assert(type.floating);
1878 assert(lp_check_value(type, a));
1879
1880 if (arch_rounding_available(type)) {
1881 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
1882 }
1883 else {
1884 const struct lp_type type = bld->type;
1885 struct lp_type inttype;
1886 struct lp_build_context intbld;
1887 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
1888 LLVMValueRef trunc, res, anosign, mask;
1889 LLVMTypeRef int_vec_type = bld->int_vec_type;
1890 LLVMTypeRef vec_type = bld->vec_type;
1891
1892 assert(type.width == 32); /* might want to handle doubles at some point */
1893
1894 inttype = type;
1895 inttype.floating = 0;
1896 lp_build_context_init(&intbld, bld->gallivm, inttype);
1897
1898 /* round by truncation */
1899 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1900 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
1901
1902 /* mask out sign bit */
1903 anosign = lp_build_abs(bld, a);
1904 /*
1905 * mask out all values if anosign > 2^24
1906     * This should work both for large ints (all rounding is a no-op for them
1907     * because such floats are always exact) as well as special cases like
1908     * NaNs, Infs (taking advantage of the fact they use the max exponent).
1909     * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
1910 */
1911 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1912 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1913 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1914 return lp_build_select(bld, mask, a, res);
1915 }
1916 }
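
/*
 * Illustrative scalar sketch of the fallback path above, assuming 32-bit
 * IEEE-754 floats (kept out of the build): any float with |a| > 2^24 has no
 * fractional bits left in its mantissa, so it is already integral and can be
 * passed through unchanged, which also conveniently covers NaNs and Infs
 * (their max exponent makes them compare greater in the integer compare above).
 */
#if 0
#include <math.h>

static float
trunc_scalar_sketch(float a)
{
   if (!(fabsf(a) <= 16777216.0f))   /* 2^24, also catches NaN/Inf */
      return a;
   return (float)(int)a;             /* round by truncation */
}
#endif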
1917
1918
1919 /**
1920 * Return float (vector) rounded to nearest integer (vector). The returned
1921 * value is a float (vector).
1922 * Ex: round(0.9) = 1.0
1923 * Ex: round(-1.5) = -2.0
1924 */
1925 LLVMValueRef
1926 lp_build_round(struct lp_build_context *bld,
1927 LLVMValueRef a)
1928 {
1929 LLVMBuilderRef builder = bld->gallivm->builder;
1930 const struct lp_type type = bld->type;
1931
1932 assert(type.floating);
1933 assert(lp_check_value(type, a));
1934
1935 if (arch_rounding_available(type)) {
1936 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
1937 }
1938 else {
1939 const struct lp_type type = bld->type;
1940 struct lp_type inttype;
1941 struct lp_build_context intbld;
1942 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
1943 LLVMValueRef res, anosign, mask;
1944 LLVMTypeRef int_vec_type = bld->int_vec_type;
1945 LLVMTypeRef vec_type = bld->vec_type;
1946
1947 assert(type.width == 32); /* might want to handle doubles at some point */
1948
1949 inttype = type;
1950 inttype.floating = 0;
1951 lp_build_context_init(&intbld, bld->gallivm, inttype);
1952
1953 res = lp_build_iround(bld, a);
1954 res = LLVMBuildSIToFP(builder, res, vec_type, "");
1955
1956 /* mask out sign bit */
1957 anosign = lp_build_abs(bld, a);
1958 /*
1959 * mask out all values if anosign > 2^24
1960     * This should work both for large ints (all rounding is a no-op for them
1961     * because such floats are always exact) as well as special cases like
1962     * NaNs, Infs (taking advantage of the fact they use the max exponent).
1963     * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
1964 */
1965 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1966 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1967 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1968 return lp_build_select(bld, mask, a, res);
1969 }
1970 }
1971
1972
1973 /**
1974 * Return floor of float (vector), result is a float (vector)
1975 * Ex: floor(1.1) = 1.0
1976 * Ex: floor(-1.1) = -2.0
1977 */
1978 LLVMValueRef
1979 lp_build_floor(struct lp_build_context *bld,
1980 LLVMValueRef a)
1981 {
1982 LLVMBuilderRef builder = bld->gallivm->builder;
1983 const struct lp_type type = bld->type;
1984
1985 assert(type.floating);
1986 assert(lp_check_value(type, a));
1987
1988 if (arch_rounding_available(type)) {
1989 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
1990 }
1991 else {
1992 const struct lp_type type = bld->type;
1993 struct lp_type inttype;
1994 struct lp_build_context intbld;
1995 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
1996 LLVMValueRef trunc, res, anosign, mask;
1997 LLVMTypeRef int_vec_type = bld->int_vec_type;
1998 LLVMTypeRef vec_type = bld->vec_type;
1999
2000 if (type.width != 32) {
2001 char intrinsic[32];
2002 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type);
2003 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2004 }
2005
2006 assert(type.width == 32); /* might want to handle doubles at some point */
2007
2008 inttype = type;
2009 inttype.floating = 0;
2010 lp_build_context_init(&intbld, bld->gallivm, inttype);
2011
2012 /* round by truncation */
2013 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2014 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2015
2016 if (type.sign) {
2017 LLVMValueRef tmp;
2018
2019 /*
2020 * fix values if rounding is wrong (for non-special cases)
2021 * - this is the case if trunc > a
2022 */
2023 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
2024 /* tmp = trunc > a ? 1.0 : 0.0 */
2025 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2026 tmp = lp_build_and(&intbld, mask, tmp);
2027 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2028 res = lp_build_sub(bld, res, tmp);
2029 }
2030
2031 /* mask out sign bit */
2032 anosign = lp_build_abs(bld, a);
2033 /*
2034 * mask out all values if anosign > 2^24
2035     * This should work both for large ints (all rounding is a no-op for them
2036     * because such floats are always exact) as well as special cases like
2037     * NaNs, Infs (taking advantage of the fact they use the max exponent).
2038     * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2039 */
2040 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2041 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2042 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2043 return lp_build_select(bld, mask, a, res);
2044 }
2045 }
2046
2047
2048 /**
2049 * Return ceiling of float (vector), returning float (vector).
2050 * Ex: ceil( 1.1) = 2.0
2051 * Ex: ceil(-1.1) = -1.0
2052 */
2053 LLVMValueRef
2054 lp_build_ceil(struct lp_build_context *bld,
2055 LLVMValueRef a)
2056 {
2057 LLVMBuilderRef builder = bld->gallivm->builder;
2058 const struct lp_type type = bld->type;
2059
2060 assert(type.floating);
2061 assert(lp_check_value(type, a));
2062
2063 if (arch_rounding_available(type)) {
2064 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2065 }
2066 else {
2067 const struct lp_type type = bld->type;
2068 struct lp_type inttype;
2069 struct lp_build_context intbld;
2070 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2071 LLVMValueRef trunc, res, anosign, mask, tmp;
2072 LLVMTypeRef int_vec_type = bld->int_vec_type;
2073 LLVMTypeRef vec_type = bld->vec_type;
2074
2075 if (type.width != 32) {
2076 char intrinsic[32];
2077 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type);
2078 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2079 }
2080
2081 assert(type.width == 32); /* might want to handle doubles at some point */
2082
2083 inttype = type;
2084 inttype.floating = 0;
2085 lp_build_context_init(&intbld, bld->gallivm, inttype);
2086
2087 /* round by truncation */
2088 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2089 trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2090
2091 /*
2092 * fix values if rounding is wrong (for non-special cases)
2093 * - this is the case if trunc < a
2094 */
2095 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2096 /* tmp = trunc < a ? 1.0 : 0.0 */
2097 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2098 tmp = lp_build_and(&intbld, mask, tmp);
2099 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2100 res = lp_build_add(bld, trunc, tmp);
2101
2102 /* mask out sign bit */
2103 anosign = lp_build_abs(bld, a);
2104 /*
2105 * mask out all values if anosign > 2^24
2106     * This should work both for large ints (all rounding is a no-op for them
2107     * because such floats are always exact) as well as special cases like
2108     * NaNs, Infs (taking advantage of the fact they use the max exponent).
2109     * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2110 */
2111 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2112 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2113 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2114 return lp_build_select(bld, mask, a, res);
2115 }
2116 }
2117
2118
2119 /**
2120 * Return fractional part of 'a' computed as a - floor(a)
2121 * Typically used in texture coord arithmetic.
2122 */
2123 LLVMValueRef
2124 lp_build_fract(struct lp_build_context *bld,
2125 LLVMValueRef a)
2126 {
2127 assert(bld->type.floating);
2128 return lp_build_sub(bld, a, lp_build_floor(bld, a));
2129 }
2130
2131
2132 /**
2133 * Prevent returning 1.0 for very small negative values of 'a' by clamping
2134 * against 0.99999(9). (Will also return that value for NaNs.)
2135 */
2136 static inline LLVMValueRef
2137 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2138 {
2139 LLVMValueRef max;
2140
2141 /* this is the largest number smaller than 1.0 representable as float */
2142 max = lp_build_const_vec(bld->gallivm, bld->type,
2143 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2144 return lp_build_min_ext(bld, fract, max,
2145 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
2146 }
2147
2148
2149 /**
2150 * Same as lp_build_fract, but guarantees that the result is always smaller
2151 * than one. Will also return the smaller-than-one value for infs, NaNs.
2152 */
2153 LLVMValueRef
2154 lp_build_fract_safe(struct lp_build_context *bld,
2155 LLVMValueRef a)
2156 {
2157 return clamp_fract(bld, lp_build_fract(bld, a));
2158 }
2159
2160
2161 /**
2162 * Return the integer part of a float (vector) value (== round toward zero).
2163 * The returned value is an integer (vector).
2164 * Ex: itrunc(-1.5) = -1
2165 */
2166 LLVMValueRef
2167 lp_build_itrunc(struct lp_build_context *bld,
2168 LLVMValueRef a)
2169 {
2170 LLVMBuilderRef builder = bld->gallivm->builder;
2171 const struct lp_type type = bld->type;
2172 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2173
2174 assert(type.floating);
2175 assert(lp_check_value(type, a));
2176
2177 return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2178 }
2179
2180
2181 /**
2182 * Return float (vector) rounded to nearest integer (vector). The returned
2183 * value is an integer (vector).
2184 * Ex: iround(0.9) = 1
2185 * Ex: iround(-1.5) = -2
2186 */
2187 LLVMValueRef
2188 lp_build_iround(struct lp_build_context *bld,
2189 LLVMValueRef a)
2190 {
2191 LLVMBuilderRef builder = bld->gallivm->builder;
2192 const struct lp_type type = bld->type;
2193 LLVMTypeRef int_vec_type = bld->int_vec_type;
2194 LLVMValueRef res;
2195
2196 assert(type.floating);
2197
2198 assert(lp_check_value(type, a));
2199
2200 if ((util_cpu_caps.has_sse2 &&
2201 ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2202 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2203 return lp_build_iround_nearest_sse2(bld, a);
2204 }
2205 if (arch_rounding_available(type)) {
2206 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2207 }
2208 else {
2209 LLVMValueRef half;
2210
2211 half = lp_build_const_vec(bld->gallivm, type, 0.5);
2212
2213 if (type.sign) {
2214 LLVMTypeRef vec_type = bld->vec_type;
2215 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2216 (unsigned long long)1 << (type.width - 1));
2217 LLVMValueRef sign;
2218
2219 /* get sign bit */
2220 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2221 sign = LLVMBuildAnd(builder, sign, mask, "");
2222
2223 /* sign * 0.5 */
2224 half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2225 half = LLVMBuildOr(builder, sign, half, "");
2226 half = LLVMBuildBitCast(builder, half, vec_type, "");
2227 }
2228
2229 res = LLVMBuildFAdd(builder, a, half, "");
2230 }
2231
2232 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2233
2234 return res;
2235 }
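
/*
 * Illustrative scalar equivalent of the generic path above (kept out of the
 * build): add 0.5 with the sign of 'a' and truncate.  Note the SSE2/arch
 * paths round halfway cases to nearest even instead, so e.g. 0.5 may convert
 * to 0 there but to 1 here.
 */
#if 0
#include <math.h>

static int
iround_scalar_sketch(float a)
{
   return (int)(a + copysignf(0.5f, a));
}
#endif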
2236
2237
2238 /**
2239 * Return floor of float (vector), result is an int (vector)
2240  * Ex: ifloor(1.1) = 1
2241  * Ex: ifloor(-1.1) = -2
2242 */
2243 LLVMValueRef
2244 lp_build_ifloor(struct lp_build_context *bld,
2245 LLVMValueRef a)
2246 {
2247 LLVMBuilderRef builder = bld->gallivm->builder;
2248 const struct lp_type type = bld->type;
2249 LLVMTypeRef int_vec_type = bld->int_vec_type;
2250 LLVMValueRef res;
2251
2252 assert(type.floating);
2253 assert(lp_check_value(type, a));
2254
2255 res = a;
2256 if (type.sign) {
2257 if (arch_rounding_available(type)) {
2258 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2259 }
2260 else {
2261 struct lp_type inttype;
2262 struct lp_build_context intbld;
2263 LLVMValueRef trunc, itrunc, mask;
2264
2265 assert(type.floating);
2266 assert(lp_check_value(type, a));
2267
2268 inttype = type;
2269 inttype.floating = 0;
2270 lp_build_context_init(&intbld, bld->gallivm, inttype);
2271
2272 /* round by truncation */
2273 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2274 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2275
2276 /*
2277 * fix values if rounding is wrong (for non-special cases)
2278 * - this is the case if trunc > a
2279 * The results of doing this with NaNs, very large values etc.
2280 * are undefined but this seems to be the case anyway.
2281 */
2282 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2283 /* cheapie minus one with mask since the mask is minus one / zero */
2284 return lp_build_add(&intbld, itrunc, mask);
2285 }
2286 }
2287
2288    /* round toward zero (truncate) */
2289 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2290
2291 return res;
2292 }
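
/*
 * Worked example for the non-arch signed path above (illustrative): for
 * a = -1.3, itrunc = -1 and trunc = -1.0 > a, so the comparison mask is
 * all ones, i.e. the integer -1, and itrunc + mask = -1 + (-1) = -2,
 * which is the expected ifloor(-1.3).
 */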
2293
2294
2295 /**
2296 * Return ceiling of float (vector), returning int (vector).
2297 * Ex: iceil( 1.1) = 2
2298 * Ex: iceil(-1.1) = -1
2299 */
2300 LLVMValueRef
2301 lp_build_iceil(struct lp_build_context *bld,
2302 LLVMValueRef a)
2303 {
2304 LLVMBuilderRef builder = bld->gallivm->builder;
2305 const struct lp_type type = bld->type;
2306 LLVMTypeRef int_vec_type = bld->int_vec_type;
2307 LLVMValueRef res;
2308
2309 assert(type.floating);
2310 assert(lp_check_value(type, a));
2311
2312 if (arch_rounding_available(type)) {
2313 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2314 }
2315 else {
2316 struct lp_type inttype;
2317 struct lp_build_context intbld;
2318 LLVMValueRef trunc, itrunc, mask;
2319
2320 assert(type.floating);
2321 assert(lp_check_value(type, a));
2322
2323 inttype = type;
2324 inttype.floating = 0;
2325 lp_build_context_init(&intbld, bld->gallivm, inttype);
2326
2327 /* round by truncation */
2328 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2329 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2330
2331 /*
2332 * fix values if rounding is wrong (for non-special cases)
2333 * - this is the case if trunc < a
2334 * The results of doing this with NaNs, very large values etc.
2335 * are undefined but this seems to be the case anyway.
2336 */
2337 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2338 /* cheapie plus one with mask since the mask is minus one / zero */
2339 return lp_build_sub(&intbld, itrunc, mask);
2340 }
2341
2342    /* round toward zero (truncate) */
2343 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2344
2345 return res;
2346 }
2347
2348
2349 /**
2350 * Combined ifloor() & fract().
2351 *
2352 * Preferred to calling the functions separately, as it will ensure that the
2353 * strategy (floor() vs ifloor()) that results in less redundant work is used.
2354 */
2355 void
2356 lp_build_ifloor_fract(struct lp_build_context *bld,
2357 LLVMValueRef a,
2358 LLVMValueRef *out_ipart,
2359 LLVMValueRef *out_fpart)
2360 {
2361 LLVMBuilderRef builder = bld->gallivm->builder;
2362 const struct lp_type type = bld->type;
2363 LLVMValueRef ipart;
2364
2365 assert(type.floating);
2366 assert(lp_check_value(type, a));
2367
2368 if (arch_rounding_available(type)) {
2369 /*
2370 * floor() is easier.
2371 */
2372
2373 ipart = lp_build_floor(bld, a);
2374 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2375 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2376 }
2377 else {
2378 /*
2379 * ifloor() is easier.
2380 */
2381
2382 *out_ipart = lp_build_ifloor(bld, a);
2383 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2384 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2385 }
2386 }
2387
2388
2389 /**
2390 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2391 * always smaller than one.
2392 */
2393 void
2394 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2395 LLVMValueRef a,
2396 LLVMValueRef *out_ipart,
2397 LLVMValueRef *out_fpart)
2398 {
2399 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2400 *out_fpart = clamp_fract(bld, *out_fpart);
2401 }
2402
2403
2404 LLVMValueRef
2405 lp_build_sqrt(struct lp_build_context *bld,
2406 LLVMValueRef a)
2407 {
2408 LLVMBuilderRef builder = bld->gallivm->builder;
2409 const struct lp_type type = bld->type;
2410 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2411 char intrinsic[32];
2412
2413 assert(lp_check_value(type, a));
2414
2415 assert(type.floating);
2416 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);
2417
2418 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2419 }
2420
2421
2422 /**
2423  * Do one Newton-Raphson step to improve the precision of the reciprocal:
2424 *
2425 * x_{i+1} = x_i * (2 - a * x_i)
2426 *
2427  * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2428  * +/-Inf, giving NaN instead.  Certain applications rely on the conformant
2429  * behavior, such as Google Earth, which does RCP(RSQRT(0.0)) when drawing
2430  * the Earth's halo.  It would be necessary to clamp the argument to prevent this.
2431 *
2432 * See also:
2433 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2434 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2435 */
2436 static inline LLVMValueRef
2437 lp_build_rcp_refine(struct lp_build_context *bld,
2438 LLVMValueRef a,
2439 LLVMValueRef rcp_a)
2440 {
2441 LLVMBuilderRef builder = bld->gallivm->builder;
2442 LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
2443 LLVMValueRef res;
2444
2445 res = LLVMBuildFMul(builder, a, rcp_a, "");
2446 res = LLVMBuildFSub(builder, two, res, "");
2447 res = LLVMBuildFMul(builder, rcp_a, res, "");
2448
2449 return res;
2450 }
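
/*
 * Illustrative worked example for the step above (kept out of the build):
 * if x = (1 - e) / a, then x * (2 - a * x) = (1 - e^2) / a, i.e. the
 * relative error squares with every iteration.
 */
#if 0
static float
rcp_refine_scalar_sketch(float a, float x)
{
   return x * (2.0f - a * x);
}

/*
 * Example: a = 3.0, x0 = 0.3 (exact value is 0.33333...):
 *   x1 = 0.3  * (2 - 3 * 0.3)  = 0.3  * 1.1  = 0.33    (error ~3.3e-3)
 *   x2 = 0.33 * (2 - 3 * 0.33) = 0.33 * 1.01 = 0.3333  (error ~3.3e-5)
 */
#endif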
2451
2452
2453 LLVMValueRef
2454 lp_build_rcp(struct lp_build_context *bld,
2455 LLVMValueRef a)
2456 {
2457 LLVMBuilderRef builder = bld->gallivm->builder;
2458 const struct lp_type type = bld->type;
2459
2460 assert(lp_check_value(type, a));
2461
2462 if(a == bld->zero)
2463 return bld->undef;
2464 if(a == bld->one)
2465 return bld->one;
2466 if(a == bld->undef)
2467 return bld->undef;
2468
2469 assert(type.floating);
2470
2471 if(LLVMIsConstant(a))
2472 return LLVMConstFDiv(bld->one, a);
2473
2474 /*
2475 * We don't use RCPPS because:
2476    * - it only has 10 bits of precision
2477    * - it doesn't even get the reciprocal of 1.0 exactly
2478    * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2479    * - for recent processors the benefit over DIVPS is marginal and
2480    *   case dependent
2481    *
2482    * We could still use it on certain processors if benchmarks show that the
2483    * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
2484    * particular uses that require fewer workarounds.
2485 */
2486
2487 if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2488 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2489 const unsigned num_iterations = 0;
2490 LLVMValueRef res;
2491 unsigned i;
2492 const char *intrinsic = NULL;
2493
2494 if (type.length == 4) {
2495 intrinsic = "llvm.x86.sse.rcp.ps";
2496 }
2497 else {
2498 intrinsic = "llvm.x86.avx.rcp.ps.256";
2499 }
2500
2501 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2502
2503 for (i = 0; i < num_iterations; ++i) {
2504 res = lp_build_rcp_refine(bld, a, res);
2505 }
2506
2507 return res;
2508 }
2509
2510 return LLVMBuildFDiv(builder, bld->one, a, "");
2511 }
2512
2513
2514 /**
2515 * Do one Newton-Raphson step to improve rsqrt precision:
2516 *
2517 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2518 *
2519 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2520 */
2521 static inline LLVMValueRef
2522 lp_build_rsqrt_refine(struct lp_build_context *bld,
2523 LLVMValueRef a,
2524 LLVMValueRef rsqrt_a)
2525 {
2526 LLVMBuilderRef builder = bld->gallivm->builder;
2527 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2528 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2529 LLVMValueRef res;
2530
2531 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2532 res = LLVMBuildFMul(builder, a, res, "");
2533 res = LLVMBuildFSub(builder, three, res, "");
2534 res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2535 res = LLVMBuildFMul(builder, half, res, "");
2536
2537 return res;
2538 }
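
/*
 * Illustrative scalar form of the step above (kept out of the build):
 * starting from the roughly 12-bit estimate RSQRTPS provides, a single step
 * is typically enough to get close to full single precision, since the
 * relative error roughly squares with each iteration.
 */
#if 0
static float
rsqrt_refine_scalar_sketch(float a, float x)
{
   return 0.5f * x * (3.0f - a * x * x);
}
#endif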
2539
2540
2541 /**
2542 * Generate 1/sqrt(a).
2543 * Result is undefined for values < 0, infinity for +0.
2544 */
2545 LLVMValueRef
2546 lp_build_rsqrt(struct lp_build_context *bld,
2547 LLVMValueRef a)
2548 {
2549 const struct lp_type type = bld->type;
2550
2551 assert(lp_check_value(type, a));
2552
2553 assert(type.floating);
2554
2555 /*
2556 * This should be faster but all denormals will end up as infinity.
2557 */
2558 if (0 && lp_build_fast_rsqrt_available(type)) {
2559 const unsigned num_iterations = 1;
2560 LLVMValueRef res;
2561 unsigned i;
2562
2563 /* rsqrt(1.0) != 1.0 here */
2564 res = lp_build_fast_rsqrt(bld, a);
2565
2566 if (num_iterations) {
2567 /*
2568 * Newton-Raphson will result in NaN instead of infinity for zero,
2569 * and NaN instead of zero for infinity.
2570 * Also, need to ensure rsqrt(1.0) == 1.0.
2571 * All numbers smaller than FLT_MIN will result in +infinity
2572 * (rsqrtps treats all denormals as zero).
2573 */
2574 LLVMValueRef cmp;
2575 LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2576 LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);
2577
2578 for (i = 0; i < num_iterations; ++i) {
2579 res = lp_build_rsqrt_refine(bld, a, res);
2580 }
2581 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2582 res = lp_build_select(bld, cmp, inf, res);
2583 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2584 res = lp_build_select(bld, cmp, bld->zero, res);
2585 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2586 res = lp_build_select(bld, cmp, bld->one, res);
2587 }
2588
2589 return res;
2590 }
2591
2592 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2593 }
2594
2595 /**
2596  * Returns whether a fast (but inaccurate) rsqrt instruction is available.
2597  * (Callers may want to avoid rsqrt_fast when it is unavailable: e.g. x^0.5
2598  * can be computed as rsqrt_fast(x) * x, but without the instruction that
2599  * would expand to sqrt/div/mul, so it is clearly better to just call sqrt
2600  * and skip both the div and the mul.)
2601 */
2602 boolean
2603 lp_build_fast_rsqrt_available(struct lp_type type)
2604 {
2605 assert(type.floating);
2606
2607 if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2608 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2609 return true;
2610 }
2611 return false;
2612 }
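
/*
 * Hypothetical caller-side sketch of the strategy described above (kept out
 * of the build): compute x^0.5 as x * rsqrt_fast(x) only when the fast
 * instruction exists, otherwise go straight to sqrt.
 */
#if 0
static LLVMValueRef
sqrt_via_rsqrt_sketch(struct lp_build_context *bld, LLVMValueRef x)
{
   if (lp_build_fast_rsqrt_available(bld->type)) {
      /* sqrt(x) = x * rsqrt(x): one rsqrt plus one mul. */
      return lp_build_mul(bld, x, lp_build_fast_rsqrt(bld, x));
   }
   /* Without the instruction, rsqrt itself falls back to sqrt + div,
    * so calling sqrt directly skips both the div and the mul. */
   return lp_build_sqrt(bld, x);
}
#endif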
2613
2614
2615 /**
2616 * Generate 1/sqrt(a).
2617 * Result is undefined for values < 0, infinity for +0.
2618 * Precision is limited, only ~10 bits guaranteed
2619 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2620 */
2621 LLVMValueRef
2622 lp_build_fast_rsqrt(struct lp_build_context *bld,
2623 LLVMValueRef a)
2624 {
2625 LLVMBuilderRef builder = bld->gallivm->builder;
2626 const struct lp_type type = bld->type;
2627
2628 assert(lp_check_value(type, a));
2629
2630 if (lp_build_fast_rsqrt_available(type)) {
2631 const char *intrinsic = NULL;
2632
2633 if (type.length == 4) {
2634 intrinsic = "llvm.x86.sse.rsqrt.ps";
2635 }
2636 else {
2637 intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2638 }
2639 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2640 }
2641 else {
2642 debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2643 }
2644 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2645 }
2646
2647
2648 /**
2649 * Generate sin(a) or cos(a) using polynomial approximation.
2650  * TODO: it might be worth recognizing sin and cos of the same source
2651  * (i.e. the d3d10 sincos opcode).  Doing both at the same time would be
2652  * far cheaper than calculating (nearly) everything twice.
2653  * It is not clear that this is common enough to be worth bothering,
2654  * though the scs opcode could also benefit from calculating both.
2655 */
2656 static LLVMValueRef
2657 lp_build_sin_or_cos(struct lp_build_context *bld,
2658 LLVMValueRef a,
2659 boolean cos)
2660 {
2661 struct gallivm_state *gallivm = bld->gallivm;
2662 LLVMBuilderRef b = gallivm->builder;
2663 struct lp_type int_type = lp_int_type(bld->type);
2664
2665 /*
2666 * take the absolute value,
2667 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2668 */
2669
2670 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2671 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2672
2673 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2674 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2675
2676 /*
2677 * scale by 4/Pi
2678 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2679 */
2680
2681 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2682 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2683
2684 /*
2685 * store the integer part of y in mm0
2686 * emm2 = _mm_cvttps_epi32(y);
2687 */
2688
2689 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2690
2691 /*
2692 * j=(j+1) & (~1) (see the cephes sources)
2693 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2694 */
2695
2696 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2697 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2698 /*
2699 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2700 */
2701 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2702 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2703
2704 /*
2705 * y = _mm_cvtepi32_ps(emm2);
2706 */
2707 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2708
2709 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2710 LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2711 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2712 LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2713
2714 /*
2715 * Argument used for poly selection and sign bit determination
2716 * is different for sin vs. cos.
2717 */
2718 LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
2719 emm2_and;
2720
2721 LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
2722 LLVMBuildNot(b, emm2_2, ""), ""),
2723 const_29, "sign_bit") :
2724 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
2725 LLVMBuildShl(b, emm2_add,
2726 const_29, ""), ""),
2727 sign_mask, "sign_bit");
2728
2729 /*
2730    * get the polynomial selection mask:
2731    * there is one polynomial for 0 <= x <= Pi/4
2732    * and another one for Pi/4 < x <= Pi/2.
2733 * Both branches will be computed.
2734 *
2735 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2736 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2737 */
2738
2739 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
2740 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2741 int_type, PIPE_FUNC_EQUAL,
2742 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2743
2744 /*
2745 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2746 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2747 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2748 */
2749 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2750 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2751 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2752
2753 /*
2754 * The magic pass: "Extended precision modular arithmetic"
2755 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2756 */
2757 LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
2758 LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
2759 LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);
2760
2761 /*
2762    * Evaluate the first polynomial (0 <= x <= Pi/4)
2763 *
2764 * z = _mm_mul_ps(x,x);
2765 */
2766 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2767
2768 /*
2769 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2770 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2771 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2772 */
2773 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2774 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2775 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2776
2777 /*
2778 * y = *(v4sf*)_ps_coscof_p0;
2779 * y = _mm_mul_ps(y, z);
2780 */
2781 LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
2782 LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
2783 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2784 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2785
2786
2787 /*
2788 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2789 * y = _mm_sub_ps(y, tmp);
2790 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2791 */
2792 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2793 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2794 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2795 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2796 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2797
2798 /*
2799 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2800 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2801 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2802 */
2803 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2804 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2805 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2806
2807 /*
2808    * Evaluate the second polynomial (Pi/4 <= x <= Pi/2)
2809 *
2810 * y2 = *(v4sf*)_ps_sincof_p0;
2811 * y2 = _mm_mul_ps(y2, z);
2812 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2813 * y2 = _mm_mul_ps(y2, z);
2814 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2815 * y2 = _mm_mul_ps(y2, z);
2816 * y2 = _mm_mul_ps(y2, x);
2817 * y2 = _mm_add_ps(y2, x);
2818 */
2819
2820 LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
2821 LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
2822 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2823 LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);
2824
2825 /*
2826    * select the correct result from the two polynomials
2827 * xmm3 = poly_mask;
2828 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2829 * y = _mm_andnot_ps(xmm3, y);
2830 * y = _mm_or_ps(y,y2);
2831 */
2832 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2833 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2834 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2835 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
2836 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2837 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
2838
2839 /*
2840 * update the sign
2841 * y = _mm_xor_ps(y, sign_bit);
2842 */
2843 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
2844 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
2845
2846 LLVMValueRef isfinite = lp_build_isfinite(bld, a);
2847
2848 /* clamp output to be within [-1, 1] */
2849 y_result = lp_build_clamp(bld, y_result,
2850 lp_build_const_vec(bld->gallivm, bld->type, -1.f),
2851 lp_build_const_vec(bld->gallivm, bld->type, 1.f));
2852 /* If a is -inf, inf or NaN then return NaN */
2853 y_result = lp_build_select(bld, isfinite, y_result,
2854 lp_build_const_vec(bld->gallivm, bld->type, NAN));
2855 return y_result;
2856 }
2857
2858
2859 /**
2860 * Generate sin(a)
2861 */
2862 LLVMValueRef
2863 lp_build_sin(struct lp_build_context *bld,
2864 LLVMValueRef a)
2865 {
2866 return lp_build_sin_or_cos(bld, a, FALSE);
2867 }
2868
2869
2870 /**
2871 * Generate cos(a)
2872 */
2873 LLVMValueRef
2874 lp_build_cos(struct lp_build_context *bld,
2875 LLVMValueRef a)
2876 {
2877 return lp_build_sin_or_cos(bld, a, TRUE);
2878 }
2879
2880
2881 /**
2882 * Generate pow(x, y)
2883 */
2884 LLVMValueRef
2885 lp_build_pow(struct lp_build_context *bld,
2886 LLVMValueRef x,
2887 LLVMValueRef y)
2888 {
2889 /* TODO: optimize the constant case */
2890 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2891 LLVMIsConstant(x) && LLVMIsConstant(y)) {
2892 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2893 __FUNCTION__);
2894 }
2895
2896 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
2897 }
2898
2899
2900 /**
2901 * Generate exp(x)
2902 */
2903 LLVMValueRef
2904 lp_build_exp(struct lp_build_context *bld,
2905 LLVMValueRef x)
2906 {
2907 /* log2(e) = 1/log(2) */
2908 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
2909 1.4426950408889634);
2910
2911 assert(lp_check_value(bld->type, x));
2912
2913 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
2914 }
2915
2916
2917 /**
2918 * Generate log(x)
2919 * Behavior is undefined with infs, 0s and nans
2920 */
2921 LLVMValueRef
2922 lp_build_log(struct lp_build_context *bld,
2923 LLVMValueRef x)
2924 {
2925 /* log(2) */
2926 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2927 0.69314718055994529);
2928
2929 assert(lp_check_value(bld->type, x));
2930
2931 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
2932 }
2933
2934 /**
2935 * Generate log(x) that handles edge cases (infs, 0s and nans)
2936 */
2937 LLVMValueRef
2938 lp_build_log_safe(struct lp_build_context *bld,
2939 LLVMValueRef x)
2940 {
2941 /* log(2) */
2942 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2943 0.69314718055994529);
2944
2945 assert(lp_check_value(bld->type, x));
2946
2947 return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
2948 }
2949
2950
2951 /**
2952 * Generate polynomial.
2953 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
2954 */
2955 LLVMValueRef
2956 lp_build_polynomial(struct lp_build_context *bld,
2957 LLVMValueRef x,
2958 const double *coeffs,
2959 unsigned num_coeffs)
2960 {
2961 const struct lp_type type = bld->type;
2962 LLVMValueRef even = NULL, odd = NULL;
2963 LLVMValueRef x2;
2964 unsigned i;
2965
2966 assert(lp_check_value(bld->type, x));
2967
2968 /* TODO: optimize the constant case */
2969 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2970 LLVMIsConstant(x)) {
2971 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2972 __FUNCTION__);
2973 }
2974
2975 /*
2976    * Calculate odd and even terms separately to decrease data dependency
2977 * Ex:
2978 * c[0] + x^2 * c[2] + x^4 * c[4] ...
2979 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
2980 */
2981 x2 = lp_build_mul(bld, x, x);
2982
2983 for (i = num_coeffs; i--; ) {
2984 LLVMValueRef coeff;
2985
2986 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
2987
2988 if (i % 2 == 0) {
2989 if (even)
2990 even = lp_build_mad(bld, x2, even, coeff);
2991 else
2992 even = coeff;
2993 } else {
2994 if (odd)
2995 odd = lp_build_mad(bld, x2, odd, coeff);
2996 else
2997 odd = coeff;
2998 }
2999 }
3000
3001 if (odd)
3002 return lp_build_mad(bld, odd, x, even);
3003 else if (even)
3004 return even;
3005 else
3006 return bld->undef;
3007 }
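
/*
 * Scalar sketch of the odd/even split above (kept out of the build): both
 * halves are plain Horner evaluations in x^2 that can proceed in parallel,
 * and a single final mad merges them.
 */
#if 0
static double
polynomial_scalar_sketch(double x, const double *coeffs, unsigned num_coeffs)
{
   double x2 = x * x;
   double even = 0.0, odd = 0.0;
   int i;

   for (i = (int)num_coeffs - 1; i >= 0; i--) {
      if (i % 2 == 0)
         even = even * x2 + coeffs[i];   /* c[0] + c[2]*x^2 + c[4]*x^4 + ... */
      else
         odd = odd * x2 + coeffs[i];     /* c[1] + c[3]*x^2 + c[5]*x^4 + ... */
   }
   return odd * x + even;
}
#endif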
3008
3009
3010 /**
3011 * Minimax polynomial fit of 2**x, in range [0, 1[
3012 */
3013 const double lp_build_exp2_polynomial[] = {
3014 #if EXP_POLY_DEGREE == 5
3015 1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
3016 0.693153073200168932794,
3017 0.240153617044375388211,
3018 0.0558263180532956664775,
3019 0.00898934009049466391101,
3020 0.00187757667519147912699
3021 #elif EXP_POLY_DEGREE == 4
3022 1.00000259337069434683,
3023 0.693003834469974940458,
3024 0.24144275689150793076,
3025 0.0520114606103070150235,
3026 0.0135341679161270268764
3027 #elif EXP_POLY_DEGREE == 3
3028 0.999925218562710312959,
3029 0.695833540494823811697,
3030 0.226067155427249155588,
3031 0.0780245226406372992967
3032 #elif EXP_POLY_DEGREE == 2
3033 1.00172476321474503578,
3034 0.657636275736077639316,
3035 0.33718943461968720704
3036 #else
3037 #error
3038 #endif
3039 };
3040
3041
3042 LLVMValueRef
3043 lp_build_exp2(struct lp_build_context *bld,
3044 LLVMValueRef x)
3045 {
3046 LLVMBuilderRef builder = bld->gallivm->builder;
3047 const struct lp_type type = bld->type;
3048 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3049 LLVMValueRef ipart = NULL;
3050 LLVMValueRef fpart = NULL;
3051 LLVMValueRef expipart = NULL;
3052 LLVMValueRef expfpart = NULL;
3053 LLVMValueRef res = NULL;
3054
3055 assert(lp_check_value(bld->type, x));
3056
3057 /* TODO: optimize the constant case */
3058 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3059 LLVMIsConstant(x)) {
3060 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3061 __FUNCTION__);
3062 }
3063
3064 assert(type.floating && type.width == 32);
3065
3066    /* We want to preserve NaN and make sure that for exp2, if x > 128
3067     * the result is INF and if it's smaller than -126.9 the result is 0 */
3068 x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
3069 GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3070 x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
3071 x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3072
3073 /* ipart = floor(x) */
3074 /* fpart = x - ipart */
3075 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3076
3077 /* expipart = (float) (1 << ipart) */
3078 expipart = LLVMBuildAdd(builder, ipart,
3079 lp_build_const_int_vec(bld->gallivm, type, 127), "");
3080 expipart = LLVMBuildShl(builder, expipart,
3081 lp_build_const_int_vec(bld->gallivm, type, 23), "");
3082 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
3083
3084 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3085 ARRAY_SIZE(lp_build_exp2_polynomial));
3086
3087 res = LLVMBuildFMul(builder, expipart, expfpart, "");
3088
3089 return res;
3090 }
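
/*
 * Scalar sketch of the construction above, assuming 32-bit IEEE-754 floats
 * and an input already clamped to [-126.99999, 128] (kept out of the build):
 * 2^ipart is built directly in the exponent field and multiplied by the
 * polynomial approximation of 2^fpart.
 */
#if 0
#include <math.h>
#include <stdint.h>

static float
exp2_scalar_sketch(float x)
{
   int ipart = (int)floorf(x);
   float fpart = x - (float)ipart;              /* in [0, 1) */
   union { float f; int32_t i; } expipart;
   float expfpart = 0.0f;
   unsigned i;

   /* 2^ipart: place (ipart + 127) in the exponent field */
   expipart.i = (ipart + 127) << 23;

   /* 2^fpart: minimax polynomial, evaluated with Horner here */
   for (i = ARRAY_SIZE(lp_build_exp2_polynomial); i--; )
      expfpart = expfpart * fpart + (float)lp_build_exp2_polynomial[i];

   return expipart.f * expfpart;
}
#endif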
3091
3092
3093
3094 /**
3095  * Extract the exponent of an IEEE-754 floating point value.
3096 *
3097 * Optionally apply an integer bias.
3098 *
3099 * Result is an integer value with
3100 *
3101 * ifloor(log2(x)) + bias
3102 */
3103 LLVMValueRef
3104 lp_build_extract_exponent(struct lp_build_context *bld,
3105 LLVMValueRef x,
3106 int bias)
3107 {
3108 LLVMBuilderRef builder = bld->gallivm->builder;
3109 const struct lp_type type = bld->type;
3110 unsigned mantissa = lp_mantissa(type);
3111 LLVMValueRef res;
3112
3113 assert(type.floating);
3114
3115 assert(lp_check_value(bld->type, x));
3116
3117 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3118
3119 res = LLVMBuildLShr(builder, x,
3120 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3121 res = LLVMBuildAnd(builder, res,
3122 lp_build_const_int_vec(bld->gallivm, type, 255), "");
3123 res = LLVMBuildSub(builder, res,
3124 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3125
3126 return res;
3127 }
3128
3129
3130 /**
3131  * Extract the mantissa of an IEEE-754 floating point value.
3132  *
3133  * Result is a floating point value with
3134  *
3135  *   x / 2**floor(log2(x))
3136 */
3137 LLVMValueRef
3138 lp_build_extract_mantissa(struct lp_build_context *bld,
3139 LLVMValueRef x)
3140 {
3141 LLVMBuilderRef builder = bld->gallivm->builder;
3142 const struct lp_type type = bld->type;
3143 unsigned mantissa = lp_mantissa(type);
3144 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3145 (1ULL << mantissa) - 1);
3146 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3147 LLVMValueRef res;
3148
3149 assert(lp_check_value(bld->type, x));
3150
3151 assert(type.floating);
3152
3153 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3154
3155 /* res = x / 2**ipart */
3156 res = LLVMBuildAnd(builder, x, mantmask, "");
3157 res = LLVMBuildOr(builder, res, one, "");
3158 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3159
3160 return res;
3161 }
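
/*
 * Illustrative scalar versions of the two helpers above for 32-bit floats
 * (kept out of the build).  For example x = 12.0f (0x41400000) has biased
 * exponent 130, so the extracted exponent is 3 (= ifloor(log2(12))) and the
 * extracted mantissa is 1.5f (= 12 / 2^3).
 */
#if 0
#include <stdint.h>

static int
extract_exponent_sketch(float x, int bias)
{
   union { float f; uint32_t u; } v = { x };
   return (int)((v.u >> 23) & 0xff) - 127 + bias;
}

static float
extract_mantissa_sketch(float x)
{
   union { float f; uint32_t u; } v = { x };
   v.u = (v.u & 0x007fffff) | 0x3f800000;   /* force the exponent to 0 */
   return v.f;
}
#endif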
3162
3163
3164
3165 /**
3166  * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in the range [0, 1/9[.
3167  * These coefficients can be generated with
3168 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3169 */
3170 const double lp_build_log2_polynomial[] = {
3171 #if LOG_POLY_DEGREE == 5
3172 2.88539008148777786488L,
3173 0.961796878841293367824L,
3174 0.577058946784739859012L,
3175 0.412914355135828735411L,
3176 0.308591899232910175289L,
3177 0.352376952300281371868L,
3178 #elif LOG_POLY_DEGREE == 4
3179 2.88539009343309178325L,
3180 0.961791550404184197881L,
3181 0.577440339438736392009L,
3182 0.403343858251329912514L,
3183 0.406718052498846252698L,
3184 #elif LOG_POLY_DEGREE == 3
3185 2.88538959748872753838L,
3186 0.961932915889597772928L,
3187 0.571118517972136195241L,
3188 0.493997535084709500285L,
3189 #else
3190 #error
3191 #endif
3192 };
3193
3194 /**
3195 * See http://www.devmaster.net/forums/showthread.php?p=43580
3196 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3197 * http://www.nezumi.demon.co.uk/consult/logx.htm
3198 *
3199 * If handle_edge_cases is true the function will perform computations
3200 * to match the required D3D10+ behavior for each of the edge cases.
3201 * That means that if input is:
3202  * - less than zero (down to and including -inf), then NaN will be returned
3203 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3204 * - +infinity, then +infinity will be returned
3205 * - NaN, then NaN will be returned
3206 *
3207 * Those checks are fairly expensive so if you don't need them make sure
3208 * handle_edge_cases is false.
3209 */
3210 void
3211 lp_build_log2_approx(struct lp_build_context *bld,
3212 LLVMValueRef x,
3213 LLVMValueRef *p_exp,
3214 LLVMValueRef *p_floor_log2,
3215 LLVMValueRef *p_log2,
3216 boolean handle_edge_cases)
3217 {
3218 LLVMBuilderRef builder = bld->gallivm->builder;
3219 const struct lp_type type = bld->type;
3220 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3221 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3222
3223 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3224 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3225 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3226
3227 LLVMValueRef i = NULL;
3228 LLVMValueRef y = NULL;
3229 LLVMValueRef z = NULL;
3230 LLVMValueRef exp = NULL;
3231 LLVMValueRef mant = NULL;
3232 LLVMValueRef logexp = NULL;
3233 LLVMValueRef p_z = NULL;
3234 LLVMValueRef res = NULL;
3235
3236 assert(lp_check_value(bld->type, x));
3237
3238 if(p_exp || p_floor_log2 || p_log2) {
3239 /* TODO: optimize the constant case */
3240 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3241 LLVMIsConstant(x)) {
3242 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3243 __FUNCTION__);
3244 }
3245
3246 assert(type.floating && type.width == 32);
3247
3248 /*
3249 * We don't explicitly handle denormalized numbers. They will yield a
3250       * result in the neighbourhood of -127, which appears to be
3251       * adequate.
3252 */
3253
3254 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3255
3256 /* exp = (float) exponent(x) */
3257 exp = LLVMBuildAnd(builder, i, expmask, "");
3258 }
3259
3260 if(p_floor_log2 || p_log2) {
3261 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3262 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3263 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3264 }
3265
3266 if (p_log2) {
3267 /* mant = 1 + (float) mantissa(x) */
3268 mant = LLVMBuildAnd(builder, i, mantmask, "");
3269 mant = LLVMBuildOr(builder, mant, one, "");
3270 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3271
3272 /* y = (mant - 1) / (mant + 1) */
3273 y = lp_build_div(bld,
3274 lp_build_sub(bld, mant, bld->one),
3275 lp_build_add(bld, mant, bld->one)
3276 );
3277
3278 /* z = y^2 */
3279 z = lp_build_mul(bld, y, y);
3280
3281 /* compute P(z) */
3282 p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3283 ARRAY_SIZE(lp_build_log2_polynomial));
3284
3285 /* y * P(z) + logexp */
3286 res = lp_build_mad(bld, y, p_z, logexp);
3287
3288 if (type.floating && handle_edge_cases) {
3289 LLVMValueRef negmask, infmask, zmask;
3290 negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3291 lp_build_const_vec(bld->gallivm, type, 0.0f));
3292 zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3293 lp_build_const_vec(bld->gallivm, type, 0.0f));
3294 infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3295 lp_build_const_vec(bld->gallivm, type, INFINITY));
3296
3297          /* If x is equal to inf make sure we return inf */
3298 res = lp_build_select(bld, infmask,
3299 lp_build_const_vec(bld->gallivm, type, INFINITY),
3300 res);
3301          /* If x is equal to 0, return -inf */
3302 res = lp_build_select(bld, zmask,
3303 lp_build_const_vec(bld->gallivm, type, -INFINITY),
3304 res);
3305 /* If x is nan or less than 0, return nan */
3306 res = lp_build_select(bld, negmask,
3307 lp_build_const_vec(bld->gallivm, type, NAN),
3308 res);
3309 }
3310 }
3311
3312 if (p_exp) {
3313 exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3314 *p_exp = exp;
3315 }
3316
3317 if (p_floor_log2)
3318 *p_floor_log2 = logexp;
3319
3320 if (p_log2)
3321 *p_log2 = res;
3322 }
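
/*
 * Scalar sketch of the polynomial path above, assuming a normal, positive
 * 32-bit float and ignoring the edge-case handling (kept out of the build):
 * split off the exponent, map the mantissa m in [1, 2) to
 * y = (m - 1) / (m + 1), and approximate log2(m) as y * P(y^2).
 */
#if 0
#include <stdint.h>

static float
log2_scalar_sketch(float x)
{
   union { float f; uint32_t u; } v = { x };
   float logexp = (float)((int)((v.u >> 23) & 0xff) - 127);
   float mant, y, z, p = 0.0f;
   unsigned i;

   v.u = (v.u & 0x007fffff) | 0x3f800000;   /* mant in [1, 2) */
   mant = v.f;

   y = (mant - 1.0f) / (mant + 1.0f);
   z = y * y;
   for (i = ARRAY_SIZE(lp_build_log2_polynomial); i--; )
      p = p * z + (float)lp_build_log2_polynomial[i];

   return y * p + logexp;                   /* y * P(z) + logexp */
}
#endif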
3323
3324
3325 /*
3326 * log2 implementation which doesn't have special code to
3327 * handle edge cases (-inf, 0, inf, NaN). It's faster but
3328 * the results for those cases are undefined.
3329 */
3330 LLVMValueRef
3331 lp_build_log2(struct lp_build_context *bld,
3332 LLVMValueRef x)
3333 {
3334 LLVMValueRef res;
3335 lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
3336 return res;
3337 }
3338
3339 /*
3340 * Version of log2 which handles all edge cases.
3341 * Look at documentation of lp_build_log2_approx for
3342 * description of the behavior for each of the edge cases.
3343 */
3344 LLVMValueRef
3345 lp_build_log2_safe(struct lp_build_context *bld,
3346 LLVMValueRef x)
3347 {
3348 LLVMValueRef res;
3349 lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
3350 return res;
3351 }
3352
3353
3354 /**
3355 * Faster (and less accurate) log2.
3356 *
3357 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3358 *
3359 * Piece-wise linear approximation, with exact results when x is a
3360 * power of two.
3361 *
3362 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3363 */
3364 LLVMValueRef
3365 lp_build_fast_log2(struct lp_build_context *bld,
3366 LLVMValueRef x)
3367 {
3368 LLVMBuilderRef builder = bld->gallivm->builder;
3369 LLVMValueRef ipart;
3370 LLVMValueRef fpart;
3371
3372 assert(lp_check_value(bld->type, x));
3373
3374 assert(bld->type.floating);
3375
3376 /* ipart = floor(log2(x)) - 1 */
3377 ipart = lp_build_extract_exponent(bld, x, -1);
3378 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3379
3380 /* fpart = x / 2**ipart */
3381 fpart = lp_build_extract_mantissa(bld, x);
3382
3383 /* ipart + fpart */
3384 return LLVMBuildFAdd(builder, ipart, fpart, "");
3385 }
3386
3387
3388 /**
3389 * Fast implementation of iround(log2(x)).
3390 *
3391 * Not an approximation -- it should give accurate results all the time.
3392 */
3393 LLVMValueRef
3394 lp_build_ilog2(struct lp_build_context *bld,
3395 LLVMValueRef x)
3396 {
3397 LLVMBuilderRef builder = bld->gallivm->builder;
3398 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3399 LLVMValueRef ipart;
3400
3401 assert(bld->type.floating);
3402
3403 assert(lp_check_value(bld->type, x));
3404
3405    /* x * 2^(0.5), i.e. add 0.5 to log2(x) */
3406 x = LLVMBuildFMul(builder, x, sqrt2, "");
3407
3408 /* ipart = floor(log2(x) + 0.5) */
3409 ipart = lp_build_extract_exponent(bld, x, 0);
3410
3411 return ipart;
3412 }
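
/*
 * Worked example (illustrative): for x = 5.0, log2(5) ~= 2.32, so
 * iround(log2(x)) should be 2.  Scaling by sqrt(2) gives 7.07, and
 * extracting its exponent yields floor(log2(7.07)) = 2, i.e. the multiply
 * turns the exponent's floor into a round-to-nearest of log2(x).
 */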
3413
3414 LLVMValueRef
3415 lp_build_mod(struct lp_build_context *bld,
3416 LLVMValueRef x,
3417 LLVMValueRef y)
3418 {
3419 LLVMBuilderRef builder = bld->gallivm->builder;
3420 LLVMValueRef res;
3421 const struct lp_type type = bld->type;
3422
3423 assert(lp_check_value(type, x));
3424 assert(lp_check_value(type, y));
3425
3426 if (type.floating)
3427 res = LLVMBuildFRem(builder, x, y, "");
3428 else if (type.sign)
3429 res = LLVMBuildSRem(builder, x, y, "");
3430 else
3431 res = LLVMBuildURem(builder, x, y, "");
3432 return res;
3433 }
3434
3435
3436 /*
3437 * For floating inputs it creates and returns a mask
3438 * which is all 1's for channels which are NaN.
3439 * Channels inside x which are not NaN will be 0.
3440 */
3441 LLVMValueRef
3442 lp_build_isnan(struct lp_build_context *bld,
3443 LLVMValueRef x)
3444 {
3445 LLVMValueRef mask;
3446 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3447
3448 assert(bld->type.floating);
3449 assert(lp_check_value(bld->type, x));
3450
3451 mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3452 "isnotnan");
3453 mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3454 mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3455 return mask;
3456 }
3457
3458 /* Returns all 1's for floating point numbers that are finite,
3459  * and returns all zeros for -inf,
3460  * inf and NaNs. */
3461 LLVMValueRef
3462 lp_build_isfinite(struct lp_build_context *bld,
3463 LLVMValueRef x)
3464 {
3465 LLVMBuilderRef builder = bld->gallivm->builder;
3466 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3467 struct lp_type int_type = lp_int_type(bld->type);
3468 LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3469 LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3470 0x7f800000);
3471
3472 if (!bld->type.floating) {
3473 return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3474 }
3475 assert(bld->type.floating);
3476 assert(lp_check_value(bld->type, x));
3477 assert(bld->type.width == 32);
3478
3479 intx = LLVMBuildAnd(builder, intx, infornan32, "");
3480 return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3481 intx, infornan32);
3482 }
3483
3484 /*
3485 * Returns true if the number is nan or inf and false otherwise.
3486 * The input has to be a floating point vector.
3487 */
3488 LLVMValueRef
3489 lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
3490 const struct lp_type type,
3491 LLVMValueRef x)
3492 {
3493 LLVMBuilderRef builder = gallivm->builder;
3494 struct lp_type int_type = lp_int_type(type);
3495 LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
3496 0x7f800000);
3497 LLVMValueRef ret;
3498
3499 assert(type.floating);
3500
3501 ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
3502 ret = LLVMBuildAnd(builder, ret, const0, "");
3503 ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
3504 ret, const0);
3505
3506 return ret;
3507 }
3508
3509
3510 LLVMValueRef
3511 lp_build_fpstate_get(struct gallivm_state *gallivm)
3512 {
3513 if (util_cpu_caps.has_sse) {
3514 LLVMBuilderRef builder = gallivm->builder;
3515 LLVMValueRef mxcsr_ptr = lp_build_alloca(
3516 gallivm,
3517 LLVMInt32TypeInContext(gallivm->context),
3518 "mxcsr_ptr");
3519 LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
3520 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3521 lp_build_intrinsic(builder,
3522 "llvm.x86.sse.stmxcsr",
3523 LLVMVoidTypeInContext(gallivm->context),
3524 &mxcsr_ptr8, 1, 0);
3525 return mxcsr_ptr;
3526 }
3527 return 0;
3528 }
3529
3530 void
3531 lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
3532 boolean zero)
3533 {
3534 if (util_cpu_caps.has_sse) {
3535 /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
3536 int daz_ftz = _MM_FLUSH_ZERO_MASK;
3537
3538 LLVMBuilderRef builder = gallivm->builder;
3539 LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
3540 LLVMValueRef mxcsr =
3541 LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");
3542
3543 if (util_cpu_caps.has_daz) {
3544          /* Enable denormals-are-zero mode */
3545 daz_ftz |= _MM_DENORMALS_ZERO_MASK;
3546 }
3547 if (zero) {
3548 mxcsr = LLVMBuildOr(builder, mxcsr,
3549 LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
3550 } else {
3551 mxcsr = LLVMBuildAnd(builder, mxcsr,
3552 LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
3553 }
3554
3555 LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
3556 lp_build_fpstate_set(gallivm, mxcsr_ptr);
3557 }
3558 }
3559
3560 void
3561 lp_build_fpstate_set(struct gallivm_state *gallivm,
3562 LLVMValueRef mxcsr_ptr)
3563 {
3564 if (util_cpu_caps.has_sse) {
3565 LLVMBuilderRef builder = gallivm->builder;
3566 mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
3567 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3568 lp_build_intrinsic(builder,
3569 "llvm.x86.sse.ldmxcsr",
3570 LLVMVoidTypeInContext(gallivm->context),
3571 &mxcsr_ptr, 1, 0);
3572 }
3573 }
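
/*
 * Hypothetical usage sketch of the three helpers above (kept out of the
 * build): save the caller's FP state, flush denormals while the generated
 * arithmetic runs, then restore it.
 */
#if 0
static void
fpstate_usage_sketch(struct gallivm_state *gallivm)
{
   LLVMValueRef saved = lp_build_fpstate_get(gallivm);
   lp_build_fpstate_set_denorms_zero(gallivm, TRUE);
   /* ... emit the code that benefits from FTZ/DAZ here ... */
   lp_build_fpstate_set(gallivm, saved);
}
#endif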