src/gallium/auxiliary/gallivm/lp_bld_arit.c

   1 /**************************************************************************
   2  *
   3  * Copyright 2009-2010 VMware, Inc.
   4  * All Rights Reserved.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the
   8  * "Software"), to deal in the Software without restriction, including
   9  * without limitation the rights to use, copy, modify, merge, publish,
  10  * distribute, sub license, and/or sell copies of the Software, and to
  11  * permit persons to whom the Software is furnished to do so, subject to
  12  * the following conditions:
  13  *
  14  * The above copyright notice and this permission notice (including the
  15  * next paragraph) shall be included in all copies or substantial portions
  16  * of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  21  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
  22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25  *
  26  **************************************************************************/
  27
  28
  29 /**
  30  * @file
  31  * Helper
  32  *
  33  * LLVM IR doesn't support all basic arithmetic operations we care about (most
  34  * notably min/max and saturated operations), and it is often necessary to
  35  * resort machine-specific intrinsics directly. The functions here hide all
  36  * these implementation details from the other modules.
  37  *
  38  * We also do simple expressions simplification here. Reasons are:
  39  * - it is very easy given we have all necessary information readily available
  40  * - LLVM optimization passes fail to simplify several vector expressions
  41  * - We often know value constraints which the optimization passes have no way
  42  *   of knowing, such as when source arguments are known to be in [0, 1] range.
  43  *
  44  * @author Jose Fonseca <jfonseca@vmware.com>
  45  */
  46
  47
  48 #include <float.h>
  49
  50 #include "util/u_memory.h"
  51 #include "util/u_debug.h"
  52 #include "util/u_math.h"
  53 #include "util/u_cpu_detect.h"
  54
  55 #include "lp_bld_type.h"
  56 #include "lp_bld_const.h"
  57 #include "lp_bld_init.h"
  58 #include "lp_bld_intr.h"
  59 #include "lp_bld_logic.h"
  60 #include "lp_bld_pack.h"
  61 #include "lp_bld_debug.h"
  62 #include "lp_bld_bitarit.h"
  63 #include "lp_bld_arit.h"
  64 #include "lp_bld_flow.h"
  65
  66 #if defined(PIPE_ARCH_SSE)
  67 #include <xmmintrin.h>
  68 #endif
  69
  70 #ifndef _MM_DENORMALS_ZERO_MASK
  71 #define _MM_DENORMALS_ZERO_MASK 0x0040
  72 #endif
  73
  74 #ifndef _MM_FLUSH_ZERO_MASK
  75 #define _MM_FLUSH_ZERO_MASK 0x8000
  76 #endif
  77
  78 #define EXP_POLY_DEGREE 5
  79
  80 #define LOG_POLY_DEGREE 4
  81
  82
  83 /**
  84  * Generate min(a, b)
  85  * No checks for special case values of a or b = 1 or 0 are done.
  86  * NaN's are handled according to the behavior specified by the
  87  * nan_behavior argument.
  88  */
  89 static LLVMValueRef
  90 lp_build_min_simple(struct lp_build_context *bld,
  91                     LLVMValueRef a,
  92                     LLVMValueRef b,
  93                     enum gallivm_nan_behavior nan_behavior)
  94 {
  95    const struct lp_type type = bld->type;
  96    const char *intrinsic = NULL;
  97    unsigned intr_size = 0;
  98    LLVMValueRef cond;
  99
 100    assert(lp_check_value(type, a));
 101    assert(lp_check_value(type, b));
 102
 103    /* TODO: optimize the constant case */
 104
 105    if (type.floating && util_cpu_caps.has_sse) {
 106       if (type.width == 32) {
 107          if (type.length == 1) {
 108             intrinsic = "llvm.x86.sse.min.ss";
 109             intr_size = 128;
 110          }
 111          else if (type.length <= 4 || !util_cpu_caps.has_avx) {
 112             intrinsic = "llvm.x86.sse.min.ps";
 113             intr_size = 128;
 114          }
 115          else {
 116             intrinsic = "llvm.x86.avx.min.ps.256";
 117             intr_size = 256;
 118          }
 119       }
 120       if (type.width == 64 && util_cpu_caps.has_sse2) {
 121          if (type.length == 1) {
 122             intrinsic = "llvm.x86.sse2.min.sd";
 123             intr_size = 128;
 124          }
 125          else if (type.length == 2 || !util_cpu_caps.has_avx) {
 126             intrinsic = "llvm.x86.sse2.min.pd";
 127             intr_size = 128;
 128          }
 129          else {
 130             intrinsic = "llvm.x86.avx.min.pd.256";
 131             intr_size = 256;
 132          }
 133       }
 134    }
 135    else if (type.floating && util_cpu_caps.has_altivec) {
 136       if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
 137           nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
 138          debug_printf("%s: altivec doesn't support nan return nan behavior\n",
 139                       __FUNCTION__);
 140       }
 141       if (type.width == 32 && type.length == 4) {
 142          intrinsic = "llvm.ppc.altivec.vminfp";
 143          intr_size = 128;
 144       }
 145    } else if (HAVE_LLVM < 0x0309 &&
 146               util_cpu_caps.has_sse2 && type.length >= 2) {
 147       intr_size = 128;
 148       if ((type.width == 8 || type.width == 16) &&
 149           (type.width * type.length <= 64) &&
 150           (gallivm_debug & GALLIVM_DEBUG_PERF)) {
 151          debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
 152                       __FUNCTION__);
 153       }
 154       if (type.width == 8 && !type.sign) {
 155          intrinsic = "llvm.x86.sse2.pminu.b";
 156       }
 157       else if (type.width == 16 && type.sign) {
 158          intrinsic = "llvm.x86.sse2.pmins.w";
 159       }
 160       if (util_cpu_caps.has_sse4_1) {
 161          if (type.width == 8 && type.sign) {
 162             intrinsic = "llvm.x86.sse41.pminsb";
 163          }
 164          if (type.width == 16 && !type.sign) {
 165             intrinsic = "llvm.x86.sse41.pminuw";
 166          }
 167          if (type.width == 32 && !type.sign) {
 168             intrinsic = "llvm.x86.sse41.pminud";
 169          }
 170          if (type.width == 32 && type.sign) {
 171             intrinsic = "llvm.x86.sse41.pminsd";
 172          }
 173       }
 174    } else if (util_cpu_caps.has_altivec) {
 175       intr_size = 128;
 176       if (type.width == 8) {
 177          if (!type.sign) {
 178             intrinsic = "llvm.ppc.altivec.vminub";
 179          } else {
 180             intrinsic = "llvm.ppc.altivec.vminsb";
 181          }
 182       } else if (type.width == 16) {
 183          if (!type.sign) {
 184             intrinsic = "llvm.ppc.altivec.vminuh";
 185          } else {
 186             intrinsic = "llvm.ppc.altivec.vminsh";
 187          }
 188       } else if (type.width == 32) {
 189          if (!type.sign) {
 190             intrinsic = "llvm.ppc.altivec.vminuw";
 191          } else {
 192             intrinsic = "llvm.ppc.altivec.vminsw";
 193          }
 194       }
 195    }
 196
 197    if (intrinsic) {
 198       /* We need to handle nan's for floating point numbers. If one of the
 199        * inputs is nan the other should be returned (required by both D3D10+
 200        * and OpenCL).
 201        * The sse intrinsics return the second operator in case of nan by
 202        * default so we need to special code to handle those.
 203        */
 204       if (util_cpu_caps.has_sse && type.floating &&
 205           nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
 206           nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
 207           nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
 208          LLVMValueRef isnan, min;
 209          min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
 210                                                    type,
 211                                                    intr_size, a, b);
 212          if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
 213             isnan = lp_build_isnan(bld, b);
 214             return lp_build_select(bld, isnan, a, min);
 215          } else {
 216             assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
 217             isnan = lp_build_isnan(bld, a);
 218             return lp_build_select(bld, isnan, a, min);
 219          }
 220       } else {
 221          return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
 222                                                     type,
 223                                                     intr_size, a, b);
 224       }
 225    }
 226
 227    if (type.floating) {
 228       switch (nan_behavior) {
 229       case GALLIVM_NAN_RETURN_NAN: {
 230          LLVMValueRef isnan = lp_build_isnan(bld, b);
 231          cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
 232          cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
 233          return lp_build_select(bld, cond, a, b);
 234       }
 235          break;
 236       case GALLIVM_NAN_RETURN_OTHER: {
 237          LLVMValueRef isnan = lp_build_isnan(bld, a);
 238          cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
 239          cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
 240          return lp_build_select(bld, cond, a, b);
 241       }
 242          break;
 243       case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
 244          cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
 245          return lp_build_select(bld, cond, a, b);
 246       case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
 247          cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
 248          return lp_build_select(bld, cond, b, a);
 249       case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
 250          cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
 251          return lp_build_select(bld, cond, a, b);
 252          break;
 253       default:
 254          assert(0);
 255          cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
 256          return lp_build_select(bld, cond, a, b);
 257       }
 258    } else {
 259       cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
 260       return lp_build_select(bld, cond, a, b);
 261    }
 262 }
 263
 264
 265 LLVMValueRef
 266 lp_build_fmuladd(LLVMBuilderRef builder,
 267                  LLVMValueRef a,
 268                  LLVMValueRef b,
 269                  LLVMValueRef c)
 270 {
 271    LLVMTypeRef type = LLVMTypeOf(a);
 272    assert(type == LLVMTypeOf(b));
 273    assert(type == LLVMTypeOf(c));
 274    if (HAVE_LLVM < 0x0304) {
 275       /* XXX: LLVM 3.3 does not breakdown llvm.fmuladd into mul+add when FMA is
 276        * not supported, and instead it falls-back to a C function.
 277        */
 278       return LLVMBuildFAdd(builder, LLVMBuildFMul(builder, a, b, ""), c, "");
 279    }
 280    char intrinsic[32];
 281    lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
 282    LLVMValueRef args[] = { a, b, c };
 283    return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0);
 284 }
 285
 286
 287 /**
 288  * Generate max(a, b)
 289  * No checks for special case values of a or b = 1 or 0 are done.
 290  * NaN's are handled according to the behavior specified by the
 291  * nan_behavior argument.
 292  */
 293 static LLVMValueRef
 294 lp_build_max_simple(struct lp_build_context *bld,
 295                     LLVMValueRef a,
 296                     LLVMValueRef b,
 297                     enum gallivm_nan_behavior nan_behavior)
 298 {
 299    const struct lp_type type = bld->type;
 300    const char *intrinsic = NULL;
 301    unsigned intr_size = 0;
 302    LLVMValueRef cond;
 303
 304    assert(lp_check_value(type, a));
 305    assert(lp_check_value(type, b));
 306
 307    /* TODO: optimize the constant case */
 308
 309    if (type.floating && util_cpu_caps.has_sse) {
 310       if (type.width == 32) {
 311          if (type.length == 1) {
 312             intrinsic = "llvm.x86.sse.max.ss";
 313             intr_size = 128;
 314          }
 315          else if (type.length <= 4 || !util_cpu_caps.has_avx) {
 316             intrinsic = "llvm.x86.sse.max.ps";
 317             intr_size = 128;
 318          }
 319          else {
 320             intrinsic = "llvm.x86.avx.max.ps.256";
 321             intr_size = 256;
 322          }
 323       }
 324       if (type.width == 64 && util_cpu_caps.has_sse2) {
 325          if (type.length == 1) {
 326             intrinsic = "llvm.x86.sse2.max.sd";
 327             intr_size = 128;
 328          }
 329          else if (type.length == 2 || !util_cpu_caps.has_avx) {
 330             intrinsic = "llvm.x86.sse2.max.pd";
 331             intr_size = 128;
 332          }
 333          else {
 334             intrinsic = "llvm.x86.avx.max.pd.256";
 335             intr_size = 256;
 336          }
 337       }
 338    }
 339    else if (type.floating && util_cpu_caps.has_altivec) {
 340       if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
 341           nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
 342          debug_printf("%s: altivec doesn't support nan return nan behavior\n",
 343                       __FUNCTION__);
 344       }
 345       if (type.width == 32 || type.length == 4) {
 346          intrinsic = "llvm.ppc.altivec.vmaxfp";
 347          intr_size = 128;
 348       }
 349    } else if (HAVE_LLVM < 0x0309 &&
 350               util_cpu_caps.has_sse2 && type.length >= 2) {
 351       intr_size = 128;
 352       if ((type.width == 8 || type.width == 16) &&
 353           (type.width * type.length <= 64) &&
 354           (gallivm_debug & GALLIVM_DEBUG_PERF)) {
 355          debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
 356                       __FUNCTION__);
 357          }
 358       if (type.width == 8 && !type.sign) {
 359          intrinsic = "llvm.x86.sse2.pmaxu.b";
 360          intr_size = 128;
 361       }
 362       else if (type.width == 16 && type.sign) {
 363          intrinsic = "llvm.x86.sse2.pmaxs.w";
 364       }
 365       if (util_cpu_caps.has_sse4_1) {
 366          if (type.width == 8 && type.sign) {
 367             intrinsic = "llvm.x86.sse41.pmaxsb";
 368          }
 369          if (type.width == 16 && !type.sign) {
 370             intrinsic = "llvm.x86.sse41.pmaxuw";
 371          }
 372          if (type.width == 32 && !type.sign) {
 373             intrinsic = "llvm.x86.sse41.pmaxud";
 374         }
 375          if (type.width == 32 && type.sign) {
 376             intrinsic = "llvm.x86.sse41.pmaxsd";
 377          }
 378       }
 379    } else if (util_cpu_caps.has_altivec) {
 380      intr_size = 128;
 381      if (type.width == 8) {
 382        if (!type.sign) {
 383          intrinsic = "llvm.ppc.altivec.vmaxub";
 384        } else {
 385          intrinsic = "llvm.ppc.altivec.vmaxsb";
 386        }
 387      } else if (type.width == 16) {
 388        if (!type.sign) {
 389          intrinsic = "llvm.ppc.altivec.vmaxuh";
 390        } else {
 391          intrinsic = "llvm.ppc.altivec.vmaxsh";
 392        }
 393      } else if (type.width == 32) {
 394        if (!type.sign) {
 395          intrinsic = "llvm.ppc.altivec.vmaxuw";
 396        } else {
 397          intrinsic = "llvm.ppc.altivec.vmaxsw";
 398        }
 399      }
 400    }
 401
 402    if (intrinsic) {
 403       if (util_cpu_caps.has_sse && type.floating &&
 404           nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
 405           nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
 406           nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
 407          LLVMValueRef isnan, max;
 408          max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
 409                                                    type,
 410                                                    intr_size, a, b);
 411          if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
 412             isnan = lp_build_isnan(bld, b);
 413             return lp_build_select(bld, isnan, a, max);
 414          } else {
 415             assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
 416             isnan = lp_build_isnan(bld, a);
 417             return lp_build_select(bld, isnan, a, max);
 418          }
 419       } else {
 420          return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
 421                                                     type,
 422                                                     intr_size, a, b);
 423       }
 424    }
 425
 426    if (type.floating) {
 427       switch (nan_behavior) {
 428       case GALLIVM_NAN_RETURN_NAN: {
 429          LLVMValueRef isnan = lp_build_isnan(bld, b);
 430          cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
 431          cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
 432          return lp_build_select(bld, cond, a, b);
 433       }
 434          break;
 435       case GALLIVM_NAN_RETURN_OTHER: {
 436          LLVMValueRef isnan = lp_build_isnan(bld, a);
 437          cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
 438          cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
 439          return lp_build_select(bld, cond, a, b);
 440       }
 441          break;
 442       case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
 443          cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
 444          return lp_build_select(bld, cond, a, b);
 445       case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
 446          cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
 447          return lp_build_select(bld, cond, b, a);
 448       case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
 449          cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
 450          return lp_build_select(bld, cond, a, b);
 451          break;
 452       default:
 453          assert(0);
 454          cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
 455          return lp_build_select(bld, cond, a, b);
 456       }
 457    } else {
 458       cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
 459       return lp_build_select(bld, cond, a, b);
 460    }
 461 }
 462
 463
 464 /**
 465  * Generate 1 - a, or ~a depending on bld->type.
 466  */
 467 LLVMValueRef
 468 lp_build_comp(struct lp_build_context *bld,
 469               LLVMValueRef a)
 470 {
 471    LLVMBuilderRef builder = bld->gallivm->builder;
 472    const struct lp_type type = bld->type;
 473
 474    assert(lp_check_value(type, a));
 475
 476    if(a == bld->one)
 477       return bld->zero;
 478    if(a == bld->zero)
 479       return bld->one;
 480
 481    if(type.norm && !type.floating && !type.fixed && !type.sign) {
 482       if(LLVMIsConstant(a))
 483          return LLVMConstNot(a);
 484       else
 485          return LLVMBuildNot(builder, a, "");
 486    }
 487
 488    if(LLVMIsConstant(a))
 489       if (type.floating)
 490           return LLVMConstFSub(bld->one, a);
 491       else
 492           return LLVMConstSub(bld->one, a);
 493    else
 494       if (type.floating)
 495          return LLVMBuildFSub(builder, bld->one, a, "");
 496       else
 497          return LLVMBuildSub(builder, bld->one, a, "");
 498 }
 499
 500
 501 /**
 502  * Generate a + b
 503  */
 504 LLVMValueRef
 505 lp_build_add(struct lp_build_context *bld,
 506              LLVMValueRef a,
 507              LLVMValueRef b)
 508 {
 509    LLVMBuilderRef builder = bld->gallivm->builder;
 510    const struct lp_type type = bld->type;
 511    LLVMValueRef res;
 512
 513    assert(lp_check_value(type, a));
 514    assert(lp_check_value(type, b));
 515
 516    if(a == bld->zero)
 517       return b;
 518    if(b == bld->zero)
 519       return a;
 520    if(a == bld->undef || b == bld->undef)
 521       return bld->undef;
 522
 523    if(bld->type.norm) {
 524       const char *intrinsic = NULL;
 525
 526       if(a == bld->one || b == bld->one)
 527         return bld->one;
 528
 529       if (type.width * type.length == 128 &&
 530           !type.floating && !type.fixed) {
 531          if(util_cpu_caps.has_sse2) {
 532            if(type.width == 8)
 533              intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
 534            if(type.width == 16)
 535              intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
 536          } else if (util_cpu_caps.has_altivec) {
 537            if(type.width == 8)
 538               intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
 539            if(type.width == 16)
 540               intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
 541          }
 542       }
 543
 544       if (intrinsic)
 545          return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
 546    }
 547
 548    if(type.norm && !type.floating && !type.fixed) {
 549       if (type.sign) {
 550          uint64_t sign = (uint64_t)1 << (type.width - 1);
 551          LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
 552          LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
 553          /* a_clamp_max is the maximum a for positive b,
 554             a_clamp_min is the minimum a for negative b. */
 555          LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
 556          LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
 557          a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
 558       } else {
 559          a = lp_build_min_simple(bld, a, lp_build_comp(bld, b), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
 560       }
 561    }
 562
 563    if(LLVMIsConstant(a) && LLVMIsConstant(b))
 564       if (type.floating)
 565          res = LLVMConstFAdd(a, b);
 566       else
 567          res = LLVMConstAdd(a, b);
 568    else
 569       if (type.floating)
 570          res = LLVMBuildFAdd(builder, a, b, "");
 571       else
 572          res = LLVMBuildAdd(builder, a, b, "");
 573
 574    /* clamp to ceiling of 1.0 */
 575    if(bld->type.norm && (bld->type.floating || bld->type.fixed))
 576       res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
 577
 578    /* XXX clamp to floor of -1 or 0??? */
 579
 580    return res;
 581 }
 582
 583
 584 /** Return the scalar sum of the elements of a.
 585  * Should avoid this operation whenever possible.
 586  */
 587 LLVMValueRef
 588 lp_build_horizontal_add(struct lp_build_context *bld,
 589                         LLVMValueRef a)
 590 {
 591    LLVMBuilderRef builder = bld->gallivm->builder;
 592    const struct lp_type type = bld->type;
 593    LLVMValueRef index, res;
 594    unsigned i, length;
 595    LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
 596    LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
 597    LLVMValueRef vecres, elem2;
 598
 599    assert(lp_check_value(type, a));
 600
 601    if (type.length == 1) {
 602       return a;
 603    }
 604
 605    assert(!bld->type.norm);
 606
 607    /*
 608     * for byte vectors can do much better with psadbw.
 609     * Using repeated shuffle/adds here. Note with multiple vectors
 610     * this can be done more efficiently as outlined in the intel
 611     * optimization manual.
 612     * Note: could cause data rearrangement if used with smaller element
 613     * sizes.
 614     */
 615
 616    vecres = a;
 617    length = type.length / 2;
 618    while (length > 1) {
 619       LLVMValueRef vec1, vec2;
 620       for (i = 0; i < length; i++) {
 621          shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
 622          shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
 623       }
 624       vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
 625                                     LLVMConstVector(shuffles1, length), "");
 626       vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
 627                                     LLVMConstVector(shuffles2, length), "");
 628       if (type.floating) {
 629          vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
 630       }
 631       else {
 632          vecres = LLVMBuildAdd(builder, vec1, vec2, "");
 633       }
 634       length = length >> 1;
 635    }
 636
 637    /* always have vector of size 2 here */
 638    assert(length == 1);
 639
 640    index = lp_build_const_int32(bld->gallivm, 0);
 641    res = LLVMBuildExtractElement(builder, vecres, index, "");
 642    index = lp_build_const_int32(bld->gallivm, 1);
 643    elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
 644
 645    if (type.floating)
 646       res = LLVMBuildFAdd(builder, res, elem2, "");
 647     else
 648       res = LLVMBuildAdd(builder, res, elem2, "");
 649
 650    return res;
 651 }
 652
 653 /**
 654  * Return the horizontal sums of 4 float vectors as a float4 vector.
 655  * This uses the technique as outlined in Intel Optimization Manual.
 656  */
 657 static LLVMValueRef
 658 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
 659                             LLVMValueRef src[4])
 660 {
 661    struct gallivm_state *gallivm = bld->gallivm;
 662    LLVMBuilderRef builder = gallivm->builder;
 663    LLVMValueRef shuffles[4];
 664    LLVMValueRef tmp[4];
 665    LLVMValueRef sumtmp[2], shuftmp[2];
 666
 667    /* lower half of regs */
 668    shuffles[0] = lp_build_const_int32(gallivm, 0);
 669    shuffles[1] = lp_build_const_int32(gallivm, 1);
 670    shuffles[2] = lp_build_const_int32(gallivm, 4);
 671    shuffles[3] = lp_build_const_int32(gallivm, 5);
 672    tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
 673                                    LLVMConstVector(shuffles, 4), "");
 674    tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
 675                                    LLVMConstVector(shuffles, 4), "");
 676
 677    /* upper half of regs */
 678    shuffles[0] = lp_build_const_int32(gallivm, 2);
 679    shuffles[1] = lp_build_const_int32(gallivm, 3);
 680    shuffles[2] = lp_build_const_int32(gallivm, 6);
 681    shuffles[3] = lp_build_const_int32(gallivm, 7);
 682    tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
 683                                    LLVMConstVector(shuffles, 4), "");
 684    tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
 685                                    LLVMConstVector(shuffles, 4), "");
 686
 687    sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
 688    sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
 689
 690    shuffles[0] = lp_build_const_int32(gallivm, 0);
 691    shuffles[1] = lp_build_const_int32(gallivm, 2);
 692    shuffles[2] = lp_build_const_int32(gallivm, 4);
 693    shuffles[3] = lp_build_const_int32(gallivm, 6);
 694    shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
 695                                        LLVMConstVector(shuffles, 4), "");
 696
 697    shuffles[0] = lp_build_const_int32(gallivm, 1);
 698    shuffles[1] = lp_build_const_int32(gallivm, 3);
 699    shuffles[2] = lp_build_const_int32(gallivm, 5);
 700    shuffles[3] = lp_build_const_int32(gallivm, 7);
 701    shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
 702                                        LLVMConstVector(shuffles, 4), "");
 703
 704    return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
 705 }
 706
 707
 708 /*
 709  * partially horizontally add 2-4 float vectors with length nx4,
 710  * i.e. only four adjacent values in each vector will be added,
 711  * assuming values are really grouped in 4 which also determines
 712  * output order.
 713  *
 714  * Return a vector of the same length as the initial vectors,
 715  * with the excess elements (if any) being undefined.
 716  * The element order is independent of number of input vectors.
 717  * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
 718  * the output order thus will be
 719  * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4z7,undef
 720  */
 721 LLVMValueRef
 722 lp_build_hadd_partial4(struct lp_build_context *bld,
 723                        LLVMValueRef vectors[],
 724                        unsigned num_vecs)
 725 {
 726    struct gallivm_state *gallivm = bld->gallivm;
 727    LLVMBuilderRef builder = gallivm->builder;
 728    LLVMValueRef ret_vec;
 729    LLVMValueRef tmp[4];
 730    const char *intrinsic = NULL;
 731
 732    assert(num_vecs >= 2 && num_vecs <= 4);
 733    assert(bld->type.floating);
 734
 735    /* only use this with at least 2 vectors, as it is sort of expensive
 736     * (depending on cpu) and we always need two horizontal adds anyway,
 737     * so a shuffle/add approach might be better.
 738     */
 739
 740    tmp[0] = vectors[0];
 741    tmp[1] = vectors[1];
 742
 743    tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
 744    tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
 745
 746    if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
 747        bld->type.length == 4) {
 748       intrinsic = "llvm.x86.sse3.hadd.ps";
 749    }
 750    else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
 751             bld->type.length == 8) {
 752       intrinsic = "llvm.x86.avx.hadd.ps.256";
 753    }
 754    if (intrinsic) {
 755       tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
 756                                        lp_build_vec_type(gallivm, bld->type),
 757                                        tmp[0], tmp[1]);
 758       if (num_vecs > 2) {
 759          tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
 760                                           lp_build_vec_type(gallivm, bld->type),
 761                                           tmp[2], tmp[3]);
 762       }
 763       else {
 764          tmp[1] = tmp[0];
 765       }
 766       return lp_build_intrinsic_binary(builder, intrinsic,
 767                                        lp_build_vec_type(gallivm, bld->type),
 768                                        tmp[0], tmp[1]);
 769    }
 770
 771    if (bld->type.length == 4) {
 772       ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
 773    }
 774    else {
 775       LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
 776       unsigned j;
 777       unsigned num_iter = bld->type.length / 4;
 778       struct lp_type parttype = bld->type;
 779       parttype.length = 4;
 780       for (j = 0; j < num_iter; j++) {
 781          LLVMValueRef partsrc[4];
 782          unsigned i;
 783          for (i = 0; i < 4; i++) {
 784             partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
 785          }
 786          partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
 787       }
 788       ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
 789    }
 790    return ret_vec;
 791 }
 792
 793 /**
 794  * Generate a - b
 795  */
 796 LLVMValueRef
 797 lp_build_sub(struct lp_build_context *bld,
 798              LLVMValueRef a,
 799              LLVMValueRef b)
 800 {
 801    LLVMBuilderRef builder = bld->gallivm->builder;
 802    const struct lp_type type = bld->type;
 803    LLVMValueRef res;
 804
 805    assert(lp_check_value(type, a));
 806    assert(lp_check_value(type, b));
 807
 808    if(b == bld->zero)
 809       return a;
 810    if(a == bld->undef || b == bld->undef)
 811       return bld->undef;
 812    if(a == b)
 813       return bld->zero;
 814
 815    if(bld->type.norm) {
 816       const char *intrinsic = NULL;
 817
 818       if(b == bld->one)
 819         return bld->zero;
 820
 821       if (type.width * type.length == 128 &&
 822           !type.floating && !type.fixed) {
 823          if (util_cpu_caps.has_sse2) {
 824            if(type.width == 8)
 825               intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
 826            if(type.width == 16)
 827               intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
 828          } else if (util_cpu_caps.has_altivec) {
 829            if(type.width == 8)
 830               intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
 831            if(type.width == 16)
 832               intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
 833          }
 834       }
 835
 836       if (intrinsic)
 837          return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
 838    }
 839
 840    if(type.norm && !type.floating && !type.fixed) {
 841       if (type.sign) {
 842          uint64_t sign = (uint64_t)1 << (type.width - 1);
 843          LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
 844          LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
 845          /* a_clamp_max is the maximum a for negative b,
 846             a_clamp_min is the minimum a for positive b. */
 847          LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
 848          LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
 849          a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
 850       } else {
 851          a = lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
 852       }
 853    }
 854
 855    if(LLVMIsConstant(a) && LLVMIsConstant(b))
 856       if (type.floating)
 857          res = LLVMConstFSub(a, b);
 858       else
 859          res = LLVMConstSub(a, b);
 860    else
 861       if (type.floating)
 862          res = LLVMBuildFSub(builder, a, b, "");
 863       else
 864          res = LLVMBuildSub(builder, a, b, "");
 865
 866    if(bld->type.norm && (bld->type.floating || bld->type.fixed))
 867       res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
 868
 869    return res;
 870 }
 871
 872
 873
 874 /**
 875  * Normalized multiplication.
 876  *
 877  * There are several approaches for (using 8-bit normalized multiplication as
 878  * an example):
 879  *
 880  * - alpha plus one
 881  *
 882  *     makes the following approximation to the division (Sree)
 883  *
 884  *       a*b/255 ~= (a*(b + 1)) >> 256
 885  *
 886  *     which is the fastest method that satisfies the following OpenGL criteria of
 887  *
 888  *       0*0 = 0 and 255*255 = 255
 889  *
 890  * - geometric series
 891  *
 892  *     takes the geometric series approximation to the division
 893  *
 894  *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
 895  *
 896  *     in this case just the first two terms to fit in 16bit arithmetic
 897  *
 898  *       t/255 ~= (t + (t >> 8)) >> 8
 899  *
 900  *     note that just by itself it doesn't satisfies the OpenGL criteria, as
 901  *     255*255 = 254, so the special case b = 255 must be accounted or roundoff
 902  *     must be used.
 903  *
 904  * - geometric series plus rounding
 905  *
 906  *     when using a geometric series division instead of truncating the result
 907  *     use roundoff in the approximation (Jim Blinn)
 908  *
 909  *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
 910  *
 911  *     achieving the exact results.
 912  *
 913  *
 914  *
 915  * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
 916  *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
 917  * @sa Michael Herf, The "double blend trick", May 2000,
 918  *     http://www.stereopsis.com/doubleblend.html
 919  */
 920 static LLVMValueRef
 921 lp_build_mul_norm(struct gallivm_state *gallivm,
 922                   struct lp_type wide_type,
 923                   LLVMValueRef a, LLVMValueRef b)
 924 {
 925    LLVMBuilderRef builder = gallivm->builder;
 926    struct lp_build_context bld;
 927    unsigned n;
 928    LLVMValueRef half;
 929    LLVMValueRef ab;
 930
 931    assert(!wide_type.floating);
 932    assert(lp_check_value(wide_type, a));
 933    assert(lp_check_value(wide_type, b));
 934
 935    lp_build_context_init(&bld, gallivm, wide_type);
 936
 937    n = wide_type.width / 2;
 938    if (wide_type.sign) {
 939       --n;
 940    }
 941
 942    /*
 943     * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
 944     * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
 945     */
 946
 947    /*
 948     * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
 949     */
 950
 951    ab = LLVMBuildMul(builder, a, b, "");
 952    ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
 953
 954    /*
 955     * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
 956     */
 957
 958    half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
 959    if (wide_type.sign) {
 960       LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
 961       LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
 962       half = lp_build_select(&bld, sign, minus_half, half);
 963    }
 964    ab = LLVMBuildAdd(builder, ab, half, "");
 965
 966    /* Final division */
 967    ab = lp_build_shr_imm(&bld, ab, n);
 968
 969    return ab;
 970 }
 971
 972 /**
 973  * Generate a * b
 974  */
 975 LLVMValueRef
 976 lp_build_mul(struct lp_build_context *bld,
 977              LLVMValueRef a,
 978              LLVMValueRef b)
 979 {
 980    LLVMBuilderRef builder = bld->gallivm->builder;
 981    const struct lp_type type = bld->type;
 982    LLVMValueRef shift;
 983    LLVMValueRef res;
 984
 985    assert(lp_check_value(type, a));
 986    assert(lp_check_value(type, b));
 987
 988    if(a == bld->zero)
 989       return bld->zero;
 990    if(a == bld->one)
 991       return b;
 992    if(b == bld->zero)
 993       return bld->zero;
 994    if(b == bld->one)
 995       return a;
 996    if(a == bld->undef || b == bld->undef)
 997       return bld->undef;
 998
 999    if (!type.floating && !type.fixed && type.norm) {
1000       struct lp_type wide_type = lp_wider_type(type);
1001       LLVMValueRef al, ah, bl, bh, abl, abh, ab;
1002
1003       lp_build_unpack2(bld->gallivm, type, wide_type, a, &al, &ah);
1004       lp_build_unpack2(bld->gallivm, type, wide_type, b, &bl, &bh);
1005
1006       /* PMULLW, PSRLW, PADDW */
1007       abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
1008       abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
1009
1010       ab = lp_build_pack2(bld->gallivm, wide_type, type, abl, abh);
1011
1012       return ab;
1013    }
1014
1015    if(type.fixed)
1016       shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
1017    else
1018       shift = NULL;
1019
1020    if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1021       if (type.floating)
1022          res = LLVMConstFMul(a, b);
1023       else
1024          res = LLVMConstMul(a, b);
1025       if(shift) {
1026          if(type.sign)
1027             res = LLVMConstAShr(res, shift);
1028          else
1029             res = LLVMConstLShr(res, shift);
1030       }
1031    }
1032    else {
1033       if (type.floating)
1034          res = LLVMBuildFMul(builder, a, b, "");
1035       else
1036          res = LLVMBuildMul(builder, a, b, "");
1037       if(shift) {
1038          if(type.sign)
1039             res = LLVMBuildAShr(builder, res, shift, "");
1040          else
1041             res = LLVMBuildLShr(builder, res, shift, "");
1042       }
1043    }
1044
1045    return res;
1046 }
1047
1048
1049 /* a * b + c */
1050 LLVMValueRef
1051 lp_build_mad(struct lp_build_context *bld,
1052              LLVMValueRef a,
1053              LLVMValueRef b,
1054              LLVMValueRef c)
1055 {
1056    const struct lp_type type = bld->type;
1057    if (type.floating) {
1058       return lp_build_fmuladd(bld->gallivm->builder, a, b, c);
1059    } else {
1060       return lp_build_add(bld, lp_build_mul(bld, a, b), c);
1061    }
1062 }
1063
1064
1065 /**
1066  * Small vector x scale multiplication optimization.
1067  */
1068 LLVMValueRef
1069 lp_build_mul_imm(struct lp_build_context *bld,
1070                  LLVMValueRef a,
1071                  int b)
1072 {
1073    LLVMBuilderRef builder = bld->gallivm->builder;
1074    LLVMValueRef factor;
1075
1076    assert(lp_check_value(bld->type, a));
1077
1078    if(b == 0)
1079       return bld->zero;
1080
1081    if(b == 1)
1082       return a;
1083
1084    if(b == -1)
1085       return lp_build_negate(bld, a);
1086
1087    if(b == 2 && bld->type.floating)
1088       return lp_build_add(bld, a, a);
1089
1090    if(util_is_power_of_two(b)) {
1091       unsigned shift = ffs(b) - 1;
1092
1093       if(bld->type.floating) {
1094 #if 0
1095          /*
1096           * Power of two multiplication by directly manipulating the exponent.
1097           *
1098           * XXX: This might not be always faster, it will introduce a small error
1099           * for multiplication by zero, and it will produce wrong results
1100           * for Inf and NaN.
1101           */
1102          unsigned mantissa = lp_mantissa(bld->type);
1103          factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1104          a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1105          a = LLVMBuildAdd(builder, a, factor, "");
1106          a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1107          return a;
1108 #endif
1109       }
1110       else {
1111          factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1112          return LLVMBuildShl(builder, a, factor, "");
1113       }
1114    }
1115
1116    factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1117    return lp_build_mul(bld, a, factor);
1118 }
1119
1120
1121 /**
1122  * Generate a / b
1123  */
1124 LLVMValueRef
1125 lp_build_div(struct lp_build_context *bld,
1126              LLVMValueRef a,
1127              LLVMValueRef b)
1128 {
1129    LLVMBuilderRef builder = bld->gallivm->builder;
1130    const struct lp_type type = bld->type;
1131
1132    assert(lp_check_value(type, a));
1133    assert(lp_check_value(type, b));
1134
1135    if(a == bld->zero)
1136       return bld->zero;
1137    if(a == bld->one && type.floating)
1138       return lp_build_rcp(bld, b);
1139    if(b == bld->zero)
1140       return bld->undef;
1141    if(b == bld->one)
1142       return a;
1143    if(a == bld->undef || b == bld->undef)
1144       return bld->undef;
1145
1146    if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1147       if (type.floating)
1148          return LLVMConstFDiv(a, b);
1149       else if (type.sign)
1150          return LLVMConstSDiv(a, b);
1151       else
1152          return LLVMConstUDiv(a, b);
1153    }
1154
1155    if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1156        (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
1157       type.floating)
1158       return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1159
1160    if (type.floating)
1161       return LLVMBuildFDiv(builder, a, b, "");
1162    else if (type.sign)
1163       return LLVMBuildSDiv(builder, a, b, "");
1164    else
1165       return LLVMBuildUDiv(builder, a, b, "");
1166 }
1167
1168
1169 /**
1170  * Linear interpolation helper.
1171  *
1172  * @param normalized whether we are interpolating normalized values,
1173  *        encoded in normalized integers, twice as wide.
1174  *
1175  * @sa http://www.stereopsis.com/doubleblend.html
1176  */
1177 static inline LLVMValueRef
1178 lp_build_lerp_simple(struct lp_build_context *bld,
1179                      LLVMValueRef x,
1180                      LLVMValueRef v0,
1181                      LLVMValueRef v1,
1182                      unsigned flags)
1183 {
1184    unsigned half_width = bld->type.width/2;
1185    LLVMBuilderRef builder = bld->gallivm->builder;
1186    LLVMValueRef delta;
1187    LLVMValueRef res;
1188
1189    assert(lp_check_value(bld->type, x));
1190    assert(lp_check_value(bld->type, v0));
1191    assert(lp_check_value(bld->type, v1));
1192
1193    delta = lp_build_sub(bld, v1, v0);
1194
1195    if (bld->type.floating) {
1196       assert(flags == 0);
1197       return lp_build_mad(bld, x, delta, v0);
1198    }
1199
1200    if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1201       if (!bld->type.sign) {
1202          if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1203             /*
1204              * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1205              * most-significant-bit to the lowest-significant-bit, so that
1206              * later we can just divide by 2**n instead of 2**n - 1.
1207              */
1208
1209             x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1210          }
1211
1212          /* (x * delta) >> n */
1213          res = lp_build_mul(bld, x, delta);
1214          res = lp_build_shr_imm(bld, res, half_width);
1215       } else {
1216          /*
1217           * The rescaling trick above doesn't work for signed numbers, so
1218           * use the 2**n - 1 divison approximation in lp_build_mul_norm
1219           * instead.
1220           */
1221          assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1222          res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1223       }
1224    } else {
1225       assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1226       res = lp_build_mul(bld, x, delta);
1227    }
1228
1229    if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
1230       /*
1231        * At this point both res and v0 only use the lower half of the bits,
1232        * the rest is zero. Instead of add / mask, do add with half wide type.
1233        */
1234       struct lp_type narrow_type;
1235       struct lp_build_context narrow_bld;
1236
1237       memset(&narrow_type, 0, sizeof narrow_type);
1238       narrow_type.sign   = bld->type.sign;
1239       narrow_type.width  = bld->type.width/2;
1240       narrow_type.length = bld->type.length*2;
1241
1242       lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
1243       res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
1244       v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
1245       res = lp_build_add(&narrow_bld, v0, res);
1246       res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
1247    } else {
1248       res = lp_build_add(bld, v0, res);
1249
1250       if (bld->type.fixed) {
1251          /*
1252           * We need to mask out the high order bits when lerping 8bit
1253           * normalized colors stored on 16bits
1254           */
1255          /* XXX: This step is necessary for lerping 8bit colors stored on
1256           * 16bits, but it will be wrong for true fixed point use cases.
1257           * Basically we need a more powerful lp_type, capable of further
1258           * distinguishing the values interpretation from the value storage.
1259           */
1260          LLVMValueRef low_bits;
1261          low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
1262          res = LLVMBuildAnd(builder, res, low_bits, "");
1263       }
1264    }
1265
1266    return res;
1267 }
1268
1269
1270 /**
1271  * Linear interpolation.
1272  */
1273 LLVMValueRef
1274 lp_build_lerp(struct lp_build_context *bld,
1275               LLVMValueRef x,
1276               LLVMValueRef v0,
1277               LLVMValueRef v1,
1278               unsigned flags)
1279 {
1280    const struct lp_type type = bld->type;
1281    LLVMValueRef res;
1282
1283    assert(lp_check_value(type, x));
1284    assert(lp_check_value(type, v0));
1285    assert(lp_check_value(type, v1));
1286
1287    assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1288
1289    if (type.norm) {
1290       struct lp_type wide_type;
1291       struct lp_build_context wide_bld;
1292       LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1293
1294       assert(type.length >= 2);
1295
1296       /*
1297        * Create a wider integer type, enough to hold the
1298        * intermediate result of the multiplication.
1299        */
1300       memset(&wide_type, 0, sizeof wide_type);
1301       wide_type.sign   = type.sign;
1302       wide_type.width  = type.width*2;
1303       wide_type.length = type.length/2;
1304
1305       lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1306
1307       lp_build_unpack2(bld->gallivm, type, wide_type, x,  &xl,  &xh);
1308       lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1309       lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1310
1311       /*
1312        * Lerp both halves.
1313        */
1314
1315       flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1316
1317       resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1318       resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1319
1320       res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
1321    } else {
1322       res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1323    }
1324
1325    return res;
1326 }
1327
1328
1329 /**
1330  * Bilinear interpolation.
1331  *
1332  * Values indices are in v_{yx}.
1333  */
1334 LLVMValueRef
1335 lp_build_lerp_2d(struct lp_build_context *bld,
1336                  LLVMValueRef x,
1337                  LLVMValueRef y,
1338                  LLVMValueRef v00,
1339                  LLVMValueRef v01,
1340                  LLVMValueRef v10,
1341                  LLVMValueRef v11,
1342                  unsigned flags)
1343 {
1344    LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1345    LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1346    return lp_build_lerp(bld, y, v0, v1, flags);
1347 }
1348
1349
1350 LLVMValueRef
1351 lp_build_lerp_3d(struct lp_build_context *bld,
1352                  LLVMValueRef x,
1353                  LLVMValueRef y,
1354                  LLVMValueRef z,
1355                  LLVMValueRef v000,
1356                  LLVMValueRef v001,
1357                  LLVMValueRef v010,
1358                  LLVMValueRef v011,
1359                  LLVMValueRef v100,
1360                  LLVMValueRef v101,
1361                  LLVMValueRef v110,
1362                  LLVMValueRef v111,
1363                  unsigned flags)
1364 {
1365    LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1366    LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1367    return lp_build_lerp(bld, z, v0, v1, flags);
1368 }
1369
1370
1371 /**
1372  * Generate min(a, b)
1373  * Do checks for special cases but not for nans.
1374  */
1375 LLVMValueRef
1376 lp_build_min(struct lp_build_context *bld,
1377              LLVMValueRef a,
1378              LLVMValueRef b)
1379 {
1380    assert(lp_check_value(bld->type, a));
1381    assert(lp_check_value(bld->type, b));
1382
1383    if(a == bld->undef || b == bld->undef)
1384       return bld->undef;
1385
1386    if(a == b)
1387       return a;
1388
1389    if (bld->type.norm) {
1390       if (!bld->type.sign) {
1391          if (a == bld->zero || b == bld->zero) {
1392             return bld->zero;
1393          }
1394       }
1395       if(a == bld->one)
1396          return b;
1397       if(b == bld->one)
1398          return a;
1399    }
1400
1401    return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1402 }
1403
1404
1405 /**
1406  * Generate min(a, b)
1407  * NaN's are handled according to the behavior specified by the
1408  * nan_behavior argument.
1409  */
1410 LLVMValueRef
1411 lp_build_min_ext(struct lp_build_context *bld,
1412                  LLVMValueRef a,
1413                  LLVMValueRef b,
1414                  enum gallivm_nan_behavior nan_behavior)
1415 {
1416    assert(lp_check_value(bld->type, a));
1417    assert(lp_check_value(bld->type, b));
1418
1419    if(a == bld->undef || b == bld->undef)
1420       return bld->undef;
1421
1422    if(a == b)
1423       return a;
1424
1425    if (bld->type.norm) {
1426       if (!bld->type.sign) {
1427          if (a == bld->zero || b == bld->zero) {
1428             return bld->zero;
1429          }
1430       }
1431       if(a == bld->one)
1432          return b;
1433       if(b == bld->one)
1434          return a;
1435    }
1436
1437    return lp_build_min_simple(bld, a, b, nan_behavior);
1438 }
1439
1440 /**
1441  * Generate max(a, b)
1442  * Do checks for special cases, but NaN behavior is undefined.
1443  */
1444 LLVMValueRef
1445 lp_build_max(struct lp_build_context *bld,
1446              LLVMValueRef a,
1447              LLVMValueRef b)
1448 {
1449    assert(lp_check_value(bld->type, a));
1450    assert(lp_check_value(bld->type, b));
1451
1452    if(a == bld->undef || b == bld->undef)
1453       return bld->undef;
1454
1455    if(a == b)
1456       return a;
1457
1458    if(bld->type.norm) {
1459       if(a == bld->one || b == bld->one)
1460          return bld->one;
1461       if (!bld->type.sign) {
1462          if (a == bld->zero) {
1463             return b;
1464          }
1465          if (b == bld->zero) {
1466             return a;
1467          }
1468       }
1469    }
1470
1471    return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1472 }
1473
1474
1475 /**
1476  * Generate max(a, b)
1477  * Checks for special cases.
1478  * NaN's are handled according to the behavior specified by the
1479  * nan_behavior argument.
1480  */
1481 LLVMValueRef
1482 lp_build_max_ext(struct lp_build_context *bld,
1483                   LLVMValueRef a,
1484                   LLVMValueRef b,
1485                   enum gallivm_nan_behavior nan_behavior)
1486 {
1487    assert(lp_check_value(bld->type, a));
1488    assert(lp_check_value(bld->type, b));
1489
1490    if(a == bld->undef || b == bld->undef)
1491       return bld->undef;
1492
1493    if(a == b)
1494       return a;
1495
1496    if(bld->type.norm) {
1497       if(a == bld->one || b == bld->one)
1498          return bld->one;
1499       if (!bld->type.sign) {
1500          if (a == bld->zero) {
1501             return b;
1502          }
1503          if (b == bld->zero) {
1504             return a;
1505          }
1506       }
1507    }
1508
1509    return lp_build_max_simple(bld, a, b, nan_behavior);
1510 }
1511
1512 /**
1513  * Generate clamp(a, min, max)
1514  * NaN behavior (for any of a, min, max) is undefined.
1515  * Do checks for special cases.
1516  */
1517 LLVMValueRef
1518 lp_build_clamp(struct lp_build_context *bld,
1519                LLVMValueRef a,
1520                LLVMValueRef min,
1521                LLVMValueRef max)
1522 {
1523    assert(lp_check_value(bld->type, a));
1524    assert(lp_check_value(bld->type, min));
1525    assert(lp_check_value(bld->type, max));
1526
1527    a = lp_build_min(bld, a, max);
1528    a = lp_build_max(bld, a, min);
1529    return a;
1530 }
1531
1532
1533 /**
1534  * Generate clamp(a, 0, 1)
1535  * A NaN will get converted to zero.
1536  */
1537 LLVMValueRef
1538 lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
1539                                 LLVMValueRef a)
1540 {
1541    a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1542    a = lp_build_min(bld, a, bld->one);
1543    return a;
1544 }
1545
1546
1547 /**
1548  * Generate abs(a)
1549  */
1550 LLVMValueRef
1551 lp_build_abs(struct lp_build_context *bld,
1552              LLVMValueRef a)
1553 {
1554    LLVMBuilderRef builder = bld->gallivm->builder;
1555    const struct lp_type type = bld->type;
1556    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1557
1558    assert(lp_check_value(type, a));
1559
1560    if(!type.sign)
1561       return a;
1562
1563    if(type.floating) {
1564       if (0x0306 <= HAVE_LLVM && HAVE_LLVM < 0x0309) {
1565          /* Workaround llvm.org/PR27332 */
1566          LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1567          unsigned long long absMask = ~(1ULL << (type.width - 1));
1568          LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
1569          a = LLVMBuildBitCast(builder, a, int_vec_type, "");
1570          a = LLVMBuildAnd(builder, a, mask, "");
1571          a = LLVMBuildBitCast(builder, a, vec_type, "");
1572          return a;
1573       } else {
1574          char intrinsic[32];
1575          lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
1576          return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
1577       }
1578    }
1579
1580    if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
1581       switch(type.width) {
1582       case 8:
1583          return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1584       case 16:
1585          return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1586       case 32:
1587          return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1588       }
1589    }
1590    else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
1591             (gallivm_debug & GALLIVM_DEBUG_PERF) &&
1592             (type.width == 8 || type.width == 16 || type.width == 32)) {
1593       debug_printf("%s: inefficient code, should split vectors manually\n",
1594                    __FUNCTION__);
1595    }
1596
1597    return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
1598 }
1599
1600
1601 LLVMValueRef
1602 lp_build_negate(struct lp_build_context *bld,
1603                 LLVMValueRef a)
1604 {
1605    LLVMBuilderRef builder = bld->gallivm->builder;
1606
1607    assert(lp_check_value(bld->type, a));
1608
1609    if (bld->type.floating)
1610       a = LLVMBuildFNeg(builder, a, "");
1611    else
1612       a = LLVMBuildNeg(builder, a, "");
1613
1614    return a;
1615 }
1616
1617
1618 /** Return -1, 0 or +1 depending on the sign of a */
1619 LLVMValueRef
1620 lp_build_sgn(struct lp_build_context *bld,
1621              LLVMValueRef a)
1622 {
1623    LLVMBuilderRef builder = bld->gallivm->builder;
1624    const struct lp_type type = bld->type;
1625    LLVMValueRef cond;
1626    LLVMValueRef res;
1627
1628    assert(lp_check_value(type, a));
1629
1630    /* Handle non-zero case */
1631    if(!type.sign) {
1632       /* if not zero then sign must be positive */
1633       res = bld->one;
1634    }
1635    else if(type.floating) {
1636       LLVMTypeRef vec_type;
1637       LLVMTypeRef int_type;
1638       LLVMValueRef mask;
1639       LLVMValueRef sign;
1640       LLVMValueRef one;
1641       unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1642
1643       int_type = lp_build_int_vec_type(bld->gallivm, type);
1644       vec_type = lp_build_vec_type(bld->gallivm, type);
1645       mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1646
1647       /* Take the sign bit and add it to 1 constant */
1648       sign = LLVMBuildBitCast(builder, a, int_type, "");
1649       sign = LLVMBuildAnd(builder, sign, mask, "");
1650       one = LLVMConstBitCast(bld->one, int_type);
1651       res = LLVMBuildOr(builder, sign, one, "");
1652       res = LLVMBuildBitCast(builder, res, vec_type, "");
1653    }
1654    else
1655    {
1656       /* signed int/norm/fixed point */
1657       /* could use psign with sse3 and appropriate vectors here */
1658       LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1659       cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1660       res = lp_build_select(bld, cond, bld->one, minus_one);
1661    }
1662
1663    /* Handle zero */
1664    cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1665    res = lp_build_select(bld, cond, bld->zero, res);
1666
1667    return res;
1668 }
1669
1670
1671 /**
1672  * Set the sign of float vector 'a' according to 'sign'.
1673  * If sign==0, return abs(a).
1674  * If sign==1, return -abs(a);
1675  * Other values for sign produce undefined results.
1676  */
1677 LLVMValueRef
1678 lp_build_set_sign(struct lp_build_context *bld,
1679                   LLVMValueRef a, LLVMValueRef sign)
1680 {
1681    LLVMBuilderRef builder = bld->gallivm->builder;
1682    const struct lp_type type = bld->type;
1683    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1684    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1685    LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1686    LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1687                              ~((unsigned long long) 1 << (type.width - 1)));
1688    LLVMValueRef val, res;
1689
1690    assert(type.floating);
1691    assert(lp_check_value(type, a));
1692
1693    /* val = reinterpret_cast<int>(a) */
1694    val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1695    /* val = val & mask */
1696    val = LLVMBuildAnd(builder, val, mask, "");
1697    /* sign = sign << shift */
1698    sign = LLVMBuildShl(builder, sign, shift, "");
1699    /* res = val | sign */
1700    res = LLVMBuildOr(builder, val, sign, "");
1701    /* res = reinterpret_cast<float>(res) */
1702    res = LLVMBuildBitCast(builder, res, vec_type, "");
1703
1704    return res;
1705 }
1706
1707
1708 /**
1709  * Convert vector of (or scalar) int to vector of (or scalar) float.
1710  */
1711 LLVMValueRef
1712 lp_build_int_to_float(struct lp_build_context *bld,
1713                       LLVMValueRef a)
1714 {
1715    LLVMBuilderRef builder = bld->gallivm->builder;
1716    const struct lp_type type = bld->type;
1717    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1718
1719    assert(type.floating);
1720
1721    return LLVMBuildSIToFP(builder, a, vec_type, "");
1722 }
1723
1724 static boolean
1725 arch_rounding_available(const struct lp_type type)
1726 {
1727    if ((util_cpu_caps.has_sse4_1 &&
1728        (type.length == 1 || type.width*type.length == 128)) ||
1729        (util_cpu_caps.has_avx && type.width*type.length == 256))
1730       return TRUE;
1731    else if ((util_cpu_caps.has_altivec &&
1732             (type.width == 32 && type.length == 4)))
1733       return TRUE;
1734
1735    return FALSE;
1736 }
1737
1738 enum lp_build_round_mode
1739 {
1740    LP_BUILD_ROUND_NEAREST = 0,
1741    LP_BUILD_ROUND_FLOOR = 1,
1742    LP_BUILD_ROUND_CEIL = 2,
1743    LP_BUILD_ROUND_TRUNCATE = 3
1744 };
1745
1746 static inline LLVMValueRef
1747 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1748                              LLVMValueRef a)
1749 {
1750    LLVMBuilderRef builder = bld->gallivm->builder;
1751    const struct lp_type type = bld->type;
1752    LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1753    LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1754    const char *intrinsic;
1755    LLVMValueRef res;
1756
1757    assert(type.floating);
1758    /* using the double precision conversions is a bit more complicated */
1759    assert(type.width == 32);
1760
1761    assert(lp_check_value(type, a));
1762    assert(util_cpu_caps.has_sse2);
1763
1764    /* This is relying on MXCSR rounding mode, which should always be nearest. */
1765    if (type.length == 1) {
1766       LLVMTypeRef vec_type;
1767       LLVMValueRef undef;
1768       LLVMValueRef arg;
1769       LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1770
1771       vec_type = LLVMVectorType(bld->elem_type, 4);
1772
1773       intrinsic = "llvm.x86.sse.cvtss2si";
1774
1775       undef = LLVMGetUndef(vec_type);
1776
1777       arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
1778
1779       res = lp_build_intrinsic_unary(builder, intrinsic,
1780                                      ret_type, arg);
1781    }
1782    else {
1783       if (type.width* type.length == 128) {
1784          intrinsic = "llvm.x86.sse2.cvtps2dq";
1785       }
1786       else {
1787          assert(type.width*type.length == 256);
1788          assert(util_cpu_caps.has_avx);
1789
1790          intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
1791       }
1792       res = lp_build_intrinsic_unary(builder, intrinsic,
1793                                      ret_type, a);
1794    }
1795
1796    return res;
1797 }
1798
1799
1800 /*
1801  */
1802 static inline LLVMValueRef
1803 lp_build_round_altivec(struct lp_build_context *bld,
1804                        LLVMValueRef a,
1805                        enum lp_build_round_mode mode)
1806 {
1807    LLVMBuilderRef builder = bld->gallivm->builder;
1808    const struct lp_type type = bld->type;
1809    const char *intrinsic = NULL;
1810
1811    assert(type.floating);
1812
1813    assert(lp_check_value(type, a));
1814    assert(util_cpu_caps.has_altivec);
1815
1816    (void)type;
1817
1818    switch (mode) {
1819    case LP_BUILD_ROUND_NEAREST:
1820       intrinsic = "llvm.ppc.altivec.vrfin";
1821       break;
1822    case LP_BUILD_ROUND_FLOOR:
1823       intrinsic = "llvm.ppc.altivec.vrfim";
1824       break;
1825    case LP_BUILD_ROUND_CEIL:
1826       intrinsic = "llvm.ppc.altivec.vrfip";
1827       break;
1828    case LP_BUILD_ROUND_TRUNCATE:
1829       intrinsic = "llvm.ppc.altivec.vrfiz";
1830       break;
1831    }
1832
1833    return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1834 }
1835
1836 static inline LLVMValueRef
1837 lp_build_round_arch(struct lp_build_context *bld,
1838                     LLVMValueRef a,
1839                     enum lp_build_round_mode mode)
1840 {
1841    if (util_cpu_caps.has_sse4_1) {
1842       LLVMBuilderRef builder = bld->gallivm->builder;
1843       const struct lp_type type = bld->type;
1844       const char *intrinsic_root;
1845       char intrinsic[32];
1846
1847       assert(type.floating);
1848       assert(lp_check_value(type, a));
1849       (void)type;
1850
1851       switch (mode) {
1852       case LP_BUILD_ROUND_NEAREST:
1853          intrinsic_root = "llvm.nearbyint";
1854          break;
1855       case LP_BUILD_ROUND_FLOOR:
1856          intrinsic_root = "llvm.floor";
1857          break;
1858       case LP_BUILD_ROUND_CEIL:
1859          intrinsic_root = "llvm.ceil";
1860          break;
1861       case LP_BUILD_ROUND_TRUNCATE:
1862          intrinsic_root = "llvm.trunc";
1863          break;
1864       }
1865
1866       lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type);
1867       return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1868    }
1869    else /* (util_cpu_caps.has_altivec) */
1870      return lp_build_round_altivec(bld, a, mode);
1871 }
1872
1873 /**
1874  * Return the integer part of a float (vector) value (== round toward zero).
1875  * The returned value is a float (vector).
1876  * Ex: trunc(-1.5) = -1.0
1877  */
1878 LLVMValueRef
1879 lp_build_trunc(struct lp_build_context *bld,
1880                LLVMValueRef a)
1881 {
1882    LLVMBuilderRef builder = bld->gallivm->builder;
1883    const struct lp_type type = bld->type;
1884
1885    assert(type.floating);
1886    assert(lp_check_value(type, a));
1887
1888    if (arch_rounding_available(type)) {
1889       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
1890    }
1891    else {
1892       const struct lp_type type = bld->type;
1893       struct lp_type inttype;
1894       struct lp_build_context intbld;
1895       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
1896       LLVMValueRef trunc, res, anosign, mask;
1897       LLVMTypeRef int_vec_type = bld->int_vec_type;
1898       LLVMTypeRef vec_type = bld->vec_type;
1899
1900       assert(type.width == 32); /* might want to handle doubles at some point */
1901
1902       inttype = type;
1903       inttype.floating = 0;
1904       lp_build_context_init(&intbld, bld->gallivm, inttype);
1905
1906       /* round by truncation */
1907       trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1908       res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
1909
1910       /* mask out sign bit */
1911       anosign = lp_build_abs(bld, a);
1912       /*
1913        * mask out all values if anosign > 2^24
1914        * This should work both for large ints (all rounding is no-op for them
1915        * because such floats are always exact) as well as special cases like
1916        * NaNs, Infs (taking advantage of the fact they use max exponent).
1917        * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
1918        */
1919       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1920       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1921       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1922       return lp_build_select(bld, mask, a, res);
1923    }
1924 }
1925
1926
1927 /**
1928  * Return float (vector) rounded to nearest integer (vector).  The returned
1929  * value is a float (vector).
1930  * Ex: round(0.9) = 1.0
1931  * Ex: round(-1.5) = -2.0
1932  */
1933 LLVMValueRef
1934 lp_build_round(struct lp_build_context *bld,
1935                LLVMValueRef a)
1936 {
1937    LLVMBuilderRef builder = bld->gallivm->builder;
1938    const struct lp_type type = bld->type;
1939
1940    assert(type.floating);
1941    assert(lp_check_value(type, a));
1942
1943    if (arch_rounding_available(type)) {
1944       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
1945    }
1946    else {
1947       const struct lp_type type = bld->type;
1948       struct lp_type inttype;
1949       struct lp_build_context intbld;
1950       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
1951       LLVMValueRef res, anosign, mask;
1952       LLVMTypeRef int_vec_type = bld->int_vec_type;
1953       LLVMTypeRef vec_type = bld->vec_type;
1954
1955       assert(type.width == 32); /* might want to handle doubles at some point */
1956
1957       inttype = type;
1958       inttype.floating = 0;
1959       lp_build_context_init(&intbld, bld->gallivm, inttype);
1960
1961       res = lp_build_iround(bld, a);
1962       res = LLVMBuildSIToFP(builder, res, vec_type, "");
1963
1964       /* mask out sign bit */
1965       anosign = lp_build_abs(bld, a);
1966       /*
1967        * mask out all values if anosign > 2^24
1968        * This should work both for large ints (all rounding is no-op for them
1969        * because such floats are always exact) as well as special cases like
1970        * NaNs, Infs (taking advantage of the fact they use max exponent).
1971        * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
1972        */
1973       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1974       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1975       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1976       return lp_build_select(bld, mask, a, res);
1977    }
1978 }
1979
1980
1981 /**
1982  * Return floor of float (vector), result is a float (vector)
1983  * Ex: floor(1.1) = 1.0
1984  * Ex: floor(-1.1) = -2.0
1985  */
1986 LLVMValueRef
1987 lp_build_floor(struct lp_build_context *bld,
1988                LLVMValueRef a)
1989 {
1990    LLVMBuilderRef builder = bld->gallivm->builder;
1991    const struct lp_type type = bld->type;
1992
1993    assert(type.floating);
1994    assert(lp_check_value(type, a));
1995
1996    if (arch_rounding_available(type)) {
1997       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
1998    }
1999    else {
2000       const struct lp_type type = bld->type;
2001       struct lp_type inttype;
2002       struct lp_build_context intbld;
2003       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2004       LLVMValueRef trunc, res, anosign, mask;
2005       LLVMTypeRef int_vec_type = bld->int_vec_type;
2006       LLVMTypeRef vec_type = bld->vec_type;
2007
2008       if (type.width != 32) {
2009          char intrinsic[32];
2010          lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type);
2011          return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2012       }
2013
2014       assert(type.width == 32); /* might want to handle doubles at some point */
2015
2016       inttype = type;
2017       inttype.floating = 0;
2018       lp_build_context_init(&intbld, bld->gallivm, inttype);
2019
2020       /* round by truncation */
2021       trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2022       res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2023
2024       if (type.sign) {
2025          LLVMValueRef tmp;
2026
2027          /*
2028           * fix values if rounding is wrong (for non-special cases)
2029           * - this is the case if trunc > a
2030           */
2031          mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
2032          /* tmp = trunc > a ? 1.0 : 0.0 */
2033          tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2034          tmp = lp_build_and(&intbld, mask, tmp);
2035          tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2036          res = lp_build_sub(bld, res, tmp);
2037       }
2038
2039       /* mask out sign bit */
2040       anosign = lp_build_abs(bld, a);
2041       /*
2042        * mask out all values if anosign > 2^24
2043        * This should work both for large ints (all rounding is no-op for them
2044        * because such floats are always exact) as well as special cases like
2045        * NaNs, Infs (taking advantage of the fact they use max exponent).
2046        * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
2047        */
2048       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2049       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2050       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2051       return lp_build_select(bld, mask, a, res);
2052    }
2053 }
2054
2055
2056 /**
2057  * Return ceiling of float (vector), returning float (vector).
2058  * Ex: ceil( 1.1) = 2.0
2059  * Ex: ceil(-1.1) = -1.0
2060  */
2061 LLVMValueRef
2062 lp_build_ceil(struct lp_build_context *bld,
2063               LLVMValueRef a)
2064 {
2065    LLVMBuilderRef builder = bld->gallivm->builder;
2066    const struct lp_type type = bld->type;
2067
2068    assert(type.floating);
2069    assert(lp_check_value(type, a));
2070
2071    if (arch_rounding_available(type)) {
2072       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2073    }
2074    else {
2075       const struct lp_type type = bld->type;
2076       struct lp_type inttype;
2077       struct lp_build_context intbld;
2078       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2079       LLVMValueRef trunc, res, anosign, mask, tmp;
2080       LLVMTypeRef int_vec_type = bld->int_vec_type;
2081       LLVMTypeRef vec_type = bld->vec_type;
2082
2083       if (type.width != 32) {
2084          char intrinsic[32];
2085          lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type);
2086          return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2087       }
2088
2089       assert(type.width == 32); /* might want to handle doubles at some point */
2090
2091       inttype = type;
2092       inttype.floating = 0;
2093       lp_build_context_init(&intbld, bld->gallivm, inttype);
2094
2095       /* round by truncation */
2096       trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2097       trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2098
2099       /*
2100        * fix values if rounding is wrong (for non-special cases)
2101        * - this is the case if trunc < a
2102        */
2103       mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2104       /* tmp = trunc < a ? 1.0 : 0.0 */
2105       tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2106       tmp = lp_build_and(&intbld, mask, tmp);
2107       tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2108       res = lp_build_add(bld, trunc, tmp);
2109
2110       /* mask out sign bit */
2111       anosign = lp_build_abs(bld, a);
2112       /*
2113        * mask out all values if anosign > 2^24
2114        * This should work both for large ints (all rounding is no-op for them
2115        * because such floats are always exact) as well as special cases like
2116        * NaNs, Infs (taking advantage of the fact they use max exponent).
2117        * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
2118        */
2119       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2120       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2121       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2122       return lp_build_select(bld, mask, a, res);
2123    }
2124 }
2125
2126
2127 /**
2128  * Return fractional part of 'a' computed as a - floor(a)
2129  * Typically used in texture coord arithmetic.
2130  */
2131 LLVMValueRef
2132 lp_build_fract(struct lp_build_context *bld,
2133                LLVMValueRef a)
2134 {
2135    assert(bld->type.floating);
2136    return lp_build_sub(bld, a, lp_build_floor(bld, a));
2137 }
2138
2139
2140 /**
2141  * Prevent returning 1.0 for very small negative values of 'a' by clamping
2142  * against 0.99999(9). (Will also return that value for NaNs.)
2143  */
2144 static inline LLVMValueRef
2145 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2146 {
2147    LLVMValueRef max;
2148
2149    /* this is the largest number smaller than 1.0 representable as float */
2150    max = lp_build_const_vec(bld->gallivm, bld->type,
2151                             1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2152    return lp_build_min_ext(bld, fract, max,
2153                            GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
2154 }
2155
2156
2157 /**
2158  * Same as lp_build_fract, but guarantees that the result is always smaller
2159  * than one. Will also return the smaller-than-one value for infs, NaNs.
2160  */
2161 LLVMValueRef
2162 lp_build_fract_safe(struct lp_build_context *bld,
2163                     LLVMValueRef a)
2164 {
2165    return clamp_fract(bld, lp_build_fract(bld, a));
2166 }
2167
2168
2169 /**
2170  * Return the integer part of a float (vector) value (== round toward zero).
2171  * The returned value is an integer (vector).
2172  * Ex: itrunc(-1.5) = -1
2173  */
2174 LLVMValueRef
2175 lp_build_itrunc(struct lp_build_context *bld,
2176                 LLVMValueRef a)
2177 {
2178    LLVMBuilderRef builder = bld->gallivm->builder;
2179    const struct lp_type type = bld->type;
2180    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2181
2182    assert(type.floating);
2183    assert(lp_check_value(type, a));
2184
2185    return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2186 }
2187
2188
2189 /**
2190  * Return float (vector) rounded to nearest integer (vector).  The returned
2191  * value is an integer (vector).
2192  * Ex: iround(0.9) = 1
2193  * Ex: iround(-1.5) = -2
2194  */
2195 LLVMValueRef
2196 lp_build_iround(struct lp_build_context *bld,
2197                 LLVMValueRef a)
2198 {
2199    LLVMBuilderRef builder = bld->gallivm->builder;
2200    const struct lp_type type = bld->type;
2201    LLVMTypeRef int_vec_type = bld->int_vec_type;
2202    LLVMValueRef res;
2203
2204    assert(type.floating);
2205
2206    assert(lp_check_value(type, a));
2207
2208    if ((util_cpu_caps.has_sse2 &&
2209        ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2210        (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2211       return lp_build_iround_nearest_sse2(bld, a);
2212    }
2213    if (arch_rounding_available(type)) {
2214       res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2215    }
2216    else {
2217       LLVMValueRef half;
2218
2219       half = lp_build_const_vec(bld->gallivm, type, 0.5);
2220
2221       if (type.sign) {
2222          LLVMTypeRef vec_type = bld->vec_type;
2223          LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2224                                     (unsigned long long)1 << (type.width - 1));
2225          LLVMValueRef sign;
2226
2227          /* get sign bit */
2228          sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2229          sign = LLVMBuildAnd(builder, sign, mask, "");
2230
2231          /* sign * 0.5 */
2232          half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2233          half = LLVMBuildOr(builder, sign, half, "");
2234          half = LLVMBuildBitCast(builder, half, vec_type, "");
2235       }
2236
2237       res = LLVMBuildFAdd(builder, a, half, "");
2238    }
2239
2240    res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2241
2242    return res;
2243 }
2244
2245
2246 /**
2247  * Return floor of float (vector), result is an int (vector)
2248  * Ex: ifloor(1.1) = 1.0
2249  * Ex: ifloor(-1.1) = -2.0
2250  */
2251 LLVMValueRef
2252 lp_build_ifloor(struct lp_build_context *bld,
2253                 LLVMValueRef a)
2254 {
2255    LLVMBuilderRef builder = bld->gallivm->builder;
2256    const struct lp_type type = bld->type;
2257    LLVMTypeRef int_vec_type = bld->int_vec_type;
2258    LLVMValueRef res;
2259
2260    assert(type.floating);
2261    assert(lp_check_value(type, a));
2262
2263    res = a;
2264    if (type.sign) {
2265       if (arch_rounding_available(type)) {
2266          res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2267       }
2268       else {
2269          struct lp_type inttype;
2270          struct lp_build_context intbld;
2271          LLVMValueRef trunc, itrunc, mask;
2272
2273          assert(type.floating);
2274          assert(lp_check_value(type, a));
2275
2276          inttype = type;
2277          inttype.floating = 0;
2278          lp_build_context_init(&intbld, bld->gallivm, inttype);
2279
2280          /* round by truncation */
2281          itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2282          trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2283
2284          /*
2285           * fix values if rounding is wrong (for non-special cases)
2286           * - this is the case if trunc > a
2287           * The results of doing this with NaNs, very large values etc.
2288           * are undefined but this seems to be the case anyway.
2289           */
2290          mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2291          /* cheapie minus one with mask since the mask is minus one / zero */
2292          return lp_build_add(&intbld, itrunc, mask);
2293       }
2294    }
2295
2296    /* round to nearest (toward zero) */
2297    res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2298
2299    return res;
2300 }
2301
2302
2303 /**
2304  * Return ceiling of float (vector), returning int (vector).
2305  * Ex: iceil( 1.1) = 2
2306  * Ex: iceil(-1.1) = -1
2307  */
2308 LLVMValueRef
2309 lp_build_iceil(struct lp_build_context *bld,
2310                LLVMValueRef a)
2311 {
2312    LLVMBuilderRef builder = bld->gallivm->builder;
2313    const struct lp_type type = bld->type;
2314    LLVMTypeRef int_vec_type = bld->int_vec_type;
2315    LLVMValueRef res;
2316
2317    assert(type.floating);
2318    assert(lp_check_value(type, a));
2319
2320    if (arch_rounding_available(type)) {
2321       res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2322    }
2323    else {
2324       struct lp_type inttype;
2325       struct lp_build_context intbld;
2326       LLVMValueRef trunc, itrunc, mask;
2327
2328       assert(type.floating);
2329       assert(lp_check_value(type, a));
2330
2331       inttype = type;
2332       inttype.floating = 0;
2333       lp_build_context_init(&intbld, bld->gallivm, inttype);
2334
2335       /* round by truncation */
2336       itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2337       trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2338
2339       /*
2340        * fix values if rounding is wrong (for non-special cases)
2341        * - this is the case if trunc < a
2342        * The results of doing this with NaNs, very large values etc.
2343        * are undefined but this seems to be the case anyway.
2344        */
2345       mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2346       /* cheapie plus one with mask since the mask is minus one / zero */
2347       return lp_build_sub(&intbld, itrunc, mask);
2348    }
2349
2350    /* round to nearest (toward zero) */
2351    res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2352
2353    return res;
2354 }
2355
2356
2357 /**
2358  * Combined ifloor() & fract().
2359  *
2360  * Preferred to calling the functions separately, as it will ensure that the
2361  * strategy (floor() vs ifloor()) that results in less redundant work is used.
2362  */
2363 void
2364 lp_build_ifloor_fract(struct lp_build_context *bld,
2365                       LLVMValueRef a,
2366                       LLVMValueRef *out_ipart,
2367                       LLVMValueRef *out_fpart)
2368 {
2369    LLVMBuilderRef builder = bld->gallivm->builder;
2370    const struct lp_type type = bld->type;
2371    LLVMValueRef ipart;
2372
2373    assert(type.floating);
2374    assert(lp_check_value(type, a));
2375
2376    if (arch_rounding_available(type)) {
2377       /*
2378        * floor() is easier.
2379        */
2380
2381       ipart = lp_build_floor(bld, a);
2382       *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2383       *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2384    }
2385    else {
2386       /*
2387        * ifloor() is easier.
2388        */
2389
2390       *out_ipart = lp_build_ifloor(bld, a);
2391       ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2392       *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2393    }
2394 }
2395
2396
2397 /**
2398  * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2399  * always smaller than one.
2400  */
2401 void
2402 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2403                            LLVMValueRef a,
2404                            LLVMValueRef *out_ipart,
2405                            LLVMValueRef *out_fpart)
2406 {
2407    lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2408    *out_fpart = clamp_fract(bld, *out_fpart);
2409 }
2410
2411
2412 LLVMValueRef
2413 lp_build_sqrt(struct lp_build_context *bld,
2414               LLVMValueRef a)
2415 {
2416    LLVMBuilderRef builder = bld->gallivm->builder;
2417    const struct lp_type type = bld->type;
2418    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2419    char intrinsic[32];
2420
2421    assert(lp_check_value(type, a));
2422
2423    assert(type.floating);
2424    lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);
2425
2426    return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2427 }
2428
2429
2430 /**
2431  * Do one Newton-Raphson step to improve reciprocate precision:
2432  *
2433  *   x_{i+1} = x_i * (2 - a * x_i)
2434  *
2435  * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2436  * +/-Inf, giving NaN instead.  Certain applications rely on this behavior,
2437  * such as Google Earth, which does RCP(RSQRT(0.0) when drawing the Earth's
2438  * halo. It would be necessary to clamp the argument to prevent this.
2439  *
2440  * See also:
2441  * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2442  * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2443  */
2444 static inline LLVMValueRef
2445 lp_build_rcp_refine(struct lp_build_context *bld,
2446                     LLVMValueRef a,
2447                     LLVMValueRef rcp_a)
2448 {
2449    LLVMBuilderRef builder = bld->gallivm->builder;
2450    LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
2451    LLVMValueRef res;
2452
2453    res = LLVMBuildFMul(builder, a, rcp_a, "");
2454    res = LLVMBuildFSub(builder, two, res, "");
2455    res = LLVMBuildFMul(builder, rcp_a, res, "");
2456
2457    return res;
2458 }
2459
2460
2461 LLVMValueRef
2462 lp_build_rcp(struct lp_build_context *bld,
2463              LLVMValueRef a)
2464 {
2465    LLVMBuilderRef builder = bld->gallivm->builder;
2466    const struct lp_type type = bld->type;
2467
2468    assert(lp_check_value(type, a));
2469
2470    if(a == bld->zero)
2471       return bld->undef;
2472    if(a == bld->one)
2473       return bld->one;
2474    if(a == bld->undef)
2475       return bld->undef;
2476
2477    assert(type.floating);
2478
2479    if(LLVMIsConstant(a))
2480       return LLVMConstFDiv(bld->one, a);
2481
2482    /*
2483     * We don't use RCPPS because:
2484     * - it only has 10bits of precision
2485     * - it doesn't even get the reciprocate of 1.0 exactly
2486     * - doing Newton-Rapshon steps yields wrong (NaN) values for 0.0 or Inf
2487     * - for recent processors the benefit over DIVPS is marginal, a case
2488     *   dependent
2489     *
2490     * We could still use it on certain processors if benchmarks show that the
2491     * RCPPS plus necessary workarounds are still preferrable to DIVPS; or for
2492     * particular uses that require less workarounds.
2493     */
2494
2495    if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2496          (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2497       const unsigned num_iterations = 0;
2498       LLVMValueRef res;
2499       unsigned i;
2500       const char *intrinsic = NULL;
2501
2502       if (type.length == 4) {
2503          intrinsic = "llvm.x86.sse.rcp.ps";
2504       }
2505       else {
2506          intrinsic = "llvm.x86.avx.rcp.ps.256";
2507       }
2508
2509       res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2510
2511       for (i = 0; i < num_iterations; ++i) {
2512          res = lp_build_rcp_refine(bld, a, res);
2513       }
2514
2515       return res;
2516    }
2517
2518    return LLVMBuildFDiv(builder, bld->one, a, "");
2519 }
2520
2521
2522 /**
2523  * Do one Newton-Raphson step to improve rsqrt precision:
2524  *
2525  *   x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2526  *
2527  * See also Intel 64 and IA-32 Architectures Optimization Manual.
2528  */
2529 static inline LLVMValueRef
2530 lp_build_rsqrt_refine(struct lp_build_context *bld,
2531                       LLVMValueRef a,
2532                       LLVMValueRef rsqrt_a)
2533 {
2534    LLVMBuilderRef builder = bld->gallivm->builder;
2535    LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2536    LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2537    LLVMValueRef res;
2538
2539    res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2540    res = LLVMBuildFMul(builder, a, res, "");
2541    res = LLVMBuildFSub(builder, three, res, "");
2542    res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2543    res = LLVMBuildFMul(builder, half, res, "");
2544
2545    return res;
2546 }
2547
2548
2549 /**
2550  * Generate 1/sqrt(a).
2551  * Result is undefined for values < 0, infinity for +0.
2552  */
2553 LLVMValueRef
2554 lp_build_rsqrt(struct lp_build_context *bld,
2555                LLVMValueRef a)
2556 {
2557    const struct lp_type type = bld->type;
2558
2559    assert(lp_check_value(type, a));
2560
2561    assert(type.floating);
2562
2563    /*
2564     * This should be faster but all denormals will end up as infinity.
2565     */
2566    if (0 && lp_build_fast_rsqrt_available(type)) {
2567       const unsigned num_iterations = 1;
2568       LLVMValueRef res;
2569       unsigned i;
2570
2571       /* rsqrt(1.0) != 1.0 here */
2572       res = lp_build_fast_rsqrt(bld, a);
2573
2574       if (num_iterations) {
2575          /*
2576           * Newton-Raphson will result in NaN instead of infinity for zero,
2577           * and NaN instead of zero for infinity.
2578           * Also, need to ensure rsqrt(1.0) == 1.0.
2579           * All numbers smaller than FLT_MIN will result in +infinity
2580           * (rsqrtps treats all denormals as zero).
2581           */
2582          LLVMValueRef cmp;
2583          LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2584          LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);
2585
2586          for (i = 0; i < num_iterations; ++i) {
2587             res = lp_build_rsqrt_refine(bld, a, res);
2588          }
2589          cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2590          res = lp_build_select(bld, cmp, inf, res);
2591          cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2592          res = lp_build_select(bld, cmp, bld->zero, res);
2593          cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2594          res = lp_build_select(bld, cmp, bld->one, res);
2595       }
2596
2597       return res;
2598    }
2599
2600    return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2601 }
2602
2603 /**
2604  * If there's a fast (inaccurate) rsqrt instruction available
2605  * (caller may want to avoid to call rsqrt_fast if it's not available,
2606  * i.e. for calculating x^0.5 it may do rsqrt_fast(x) * x but if
2607  * unavailable it would result in sqrt/div/mul so obviously
2608  * much better to just call sqrt, skipping both div and mul).
2609  */
2610 boolean
2611 lp_build_fast_rsqrt_available(struct lp_type type)
2612 {
2613    assert(type.floating);
2614
2615    if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2616        (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2617       return true;
2618    }
2619    return false;
2620 }
2621
2622
2623 /**
2624  * Generate 1/sqrt(a).
2625  * Result is undefined for values < 0, infinity for +0.
2626  * Precision is limited, only ~10 bits guaranteed
2627  * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2628  */
2629 LLVMValueRef
2630 lp_build_fast_rsqrt(struct lp_build_context *bld,
2631                     LLVMValueRef a)
2632 {
2633    LLVMBuilderRef builder = bld->gallivm->builder;
2634    const struct lp_type type = bld->type;
2635
2636    assert(lp_check_value(type, a));
2637
2638    if (lp_build_fast_rsqrt_available(type)) {
2639       const char *intrinsic = NULL;
2640
2641       if (type.length == 4) {
2642          intrinsic = "llvm.x86.sse.rsqrt.ps";
2643       }
2644       else {
2645          intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2646       }
2647       return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2648    }
2649    else {
2650       debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2651    }
2652    return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2653 }
2654
2655
2656 /**
2657  * Generate sin(a) or cos(a) using polynomial approximation.
2658  * TODO: it might be worth recognizing sin and cos using same source
2659  * (i.e. d3d10 sincos opcode). Obviously doing both at the same time
2660  * would be way cheaper than calculating (nearly) everything twice...
2661  * Not sure it's common enough to be worth bothering however, scs
2662  * opcode could also benefit from calculating both though.
2663  */
2664 static LLVMValueRef
2665 lp_build_sin_or_cos(struct lp_build_context *bld,
2666                     LLVMValueRef a,
2667                     boolean cos)
2668 {
2669    struct gallivm_state *gallivm = bld->gallivm;
2670    LLVMBuilderRef b = gallivm->builder;
2671    struct lp_type int_type = lp_int_type(bld->type);
2672
2673    /*
2674     *  take the absolute value,
2675     *  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2676     */
2677
2678    LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2679    LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2680
2681    LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2682    LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2683
2684    /*
2685     * scale by 4/Pi
2686     * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2687     */
2688
2689    LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2690    LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2691
2692    /*
2693     * store the integer part of y in mm0
2694     * emm2 = _mm_cvttps_epi32(y);
2695     */
2696
2697    LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2698
2699    /*
2700     * j=(j+1) & (~1) (see the cephes sources)
2701     * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2702     */
2703
2704    LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2705    LLVMValueRef emm2_add =  LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2706    /*
2707     * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2708     */
2709    LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2710    LLVMValueRef emm2_and =  LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2711
2712    /*
2713     * y = _mm_cvtepi32_ps(emm2);
2714     */
2715    LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2716
2717    LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2718    LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2719    LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2720    LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2721
2722    /*
2723     * Argument used for poly selection and sign bit determination
2724     * is different for sin vs. cos.
2725     */
2726    LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
2727                                emm2_and;
2728
2729    LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
2730                                                               LLVMBuildNot(b, emm2_2, ""), ""),
2731                                               const_29, "sign_bit") :
2732                                  LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
2733                                                               LLVMBuildShl(b, emm2_add,
2734                                                                            const_29, ""), ""),
2735                                               sign_mask, "sign_bit");
2736
2737    /*
2738     * get the polynom selection mask
2739     * there is one polynom for 0 <= x <= Pi/4
2740     * and another one for Pi/4<x<=Pi/2
2741     * Both branches will be computed.
2742     *
2743     * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2744     * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2745     */
2746
2747    LLVMValueRef emm2_3 =  LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
2748    LLVMValueRef poly_mask = lp_build_compare(gallivm,
2749                                              int_type, PIPE_FUNC_EQUAL,
2750                                              emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2751
2752    /*
2753     * _PS_CONST(minus_cephes_DP1, -0.78515625);
2754     * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2755     * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2756     */
2757    LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2758    LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2759    LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2760
2761    /*
2762     * The magic pass: "Extended precision modular arithmetic"
2763     * x = ((x - y * DP1) - y * DP2) - y * DP3;
2764     */
2765    LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
2766    LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
2767    LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);
2768
2769    /*
2770     * Evaluate the first polynom  (0 <= x <= Pi/4)
2771     *
2772     * z = _mm_mul_ps(x,x);
2773     */
2774    LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2775
2776    /*
2777     * _PS_CONST(coscof_p0,  2.443315711809948E-005);
2778     * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2779     * _PS_CONST(coscof_p2,  4.166664568298827E-002);
2780     */
2781    LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2782    LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2783    LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2784
2785    /*
2786     * y = *(v4sf*)_ps_coscof_p0;
2787     * y = _mm_mul_ps(y, z);
2788     */
2789    LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
2790    LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
2791    LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2792    LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2793
2794
2795    /*
2796     * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2797     * y = _mm_sub_ps(y, tmp);
2798     * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2799     */
2800    LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2801    LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2802    LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2803    LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2804    LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2805
2806    /*
2807     * _PS_CONST(sincof_p0, -1.9515295891E-4);
2808     * _PS_CONST(sincof_p1,  8.3321608736E-3);
2809     * _PS_CONST(sincof_p2, -1.6666654611E-1);
2810     */
2811    LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2812    LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2813    LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2814
2815    /*
2816     * Evaluate the second polynom  (Pi/4 <= x <= 0)
2817     *
2818     * y2 = *(v4sf*)_ps_sincof_p0;
2819     * y2 = _mm_mul_ps(y2, z);
2820     * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2821     * y2 = _mm_mul_ps(y2, z);
2822     * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2823     * y2 = _mm_mul_ps(y2, z);
2824     * y2 = _mm_mul_ps(y2, x);
2825     * y2 = _mm_add_ps(y2, x);
2826     */
2827
2828    LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
2829    LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
2830    LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2831    LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);
2832
2833    /*
2834     * select the correct result from the two polynoms
2835     * xmm3 = poly_mask;
2836     * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2837     * y = _mm_andnot_ps(xmm3, y);
2838     * y = _mm_or_ps(y,y2);
2839     */
2840    LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2841    LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2842    LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2843    LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
2844    LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2845    LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
2846
2847    /*
2848     * update the sign
2849     * y = _mm_xor_ps(y, sign_bit);
2850     */
2851    LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
2852    LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
2853
2854    LLVMValueRef isfinite = lp_build_isfinite(bld, a);
2855
2856    /* clamp output to be within [-1, 1] */
2857    y_result = lp_build_clamp(bld, y_result,
2858                              lp_build_const_vec(bld->gallivm, bld->type,  -1.f),
2859                              lp_build_const_vec(bld->gallivm, bld->type,  1.f));
2860    /* If a is -inf, inf or NaN then return NaN */
2861    y_result = lp_build_select(bld, isfinite, y_result,
2862                               lp_build_const_vec(bld->gallivm, bld->type,  NAN));
2863    return y_result;
2864 }
2865
2866
2867 /**
2868  * Generate sin(a)
2869  */
2870 LLVMValueRef
2871 lp_build_sin(struct lp_build_context *bld,
2872              LLVMValueRef a)
2873 {
2874    return lp_build_sin_or_cos(bld, a, FALSE);
2875 }
2876
2877
2878 /**
2879  * Generate cos(a)
2880  */
2881 LLVMValueRef
2882 lp_build_cos(struct lp_build_context *bld,
2883              LLVMValueRef a)
2884 {
2885    return lp_build_sin_or_cos(bld, a, TRUE);
2886 }
2887
2888
2889 /**
2890  * Generate pow(x, y)
2891  */
2892 LLVMValueRef
2893 lp_build_pow(struct lp_build_context *bld,
2894              LLVMValueRef x,
2895              LLVMValueRef y)
2896 {
2897    /* TODO: optimize the constant case */
2898    if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2899        LLVMIsConstant(x) && LLVMIsConstant(y)) {
2900       debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2901                    __FUNCTION__);
2902    }
2903
2904    return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
2905 }
2906
2907
2908 /**
2909  * Generate exp(x)
2910  */
2911 LLVMValueRef
2912 lp_build_exp(struct lp_build_context *bld,
2913              LLVMValueRef x)
2914 {
2915    /* log2(e) = 1/log(2) */
2916    LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
2917                                            1.4426950408889634);
2918
2919    assert(lp_check_value(bld->type, x));
2920
2921    return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
2922 }
2923
2924
2925 /**
2926  * Generate log(x)
2927  * Behavior is undefined with infs, 0s and nans
2928  */
2929 LLVMValueRef
2930 lp_build_log(struct lp_build_context *bld,
2931              LLVMValueRef x)
2932 {
2933    /* log(2) */
2934    LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2935                                           0.69314718055994529);
2936
2937    assert(lp_check_value(bld->type, x));
2938
2939    return lp_build_mul(bld, log2, lp_build_log2(bld, x));
2940 }
2941
2942 /**
2943  * Generate log(x) that handles edge cases (infs, 0s and nans)
2944  */
2945 LLVMValueRef
2946 lp_build_log_safe(struct lp_build_context *bld,
2947                   LLVMValueRef x)
2948 {
2949    /* log(2) */
2950    LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2951                                           0.69314718055994529);
2952
2953    assert(lp_check_value(bld->type, x));
2954
2955    return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
2956 }
2957
2958
2959 /**
2960  * Generate polynomial.
2961  * Ex:  coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
2962  */
2963 LLVMValueRef
2964 lp_build_polynomial(struct lp_build_context *bld,
2965                     LLVMValueRef x,
2966                     const double *coeffs,
2967                     unsigned num_coeffs)
2968 {
2969    const struct lp_type type = bld->type;
2970    LLVMValueRef even = NULL, odd = NULL;
2971    LLVMValueRef x2;
2972    unsigned i;
2973
2974    assert(lp_check_value(bld->type, x));
2975
2976    /* TODO: optimize the constant case */
2977    if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2978        LLVMIsConstant(x)) {
2979       debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2980                    __FUNCTION__);
2981    }
2982
2983    /*
2984     * Calculate odd and even terms seperately to decrease data dependency
2985     * Ex:
2986     *     c[0] + x^2 * c[2] + x^4 * c[4] ...
2987     *     + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
2988     */
2989    x2 = lp_build_mul(bld, x, x);
2990
2991    for (i = num_coeffs; i--; ) {
2992       LLVMValueRef coeff;
2993
2994       coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
2995
2996       if (i % 2 == 0) {
2997          if (even)
2998             even = lp_build_mad(bld, x2, even, coeff);
2999          else
3000             even = coeff;
3001       } else {
3002          if (odd)
3003             odd = lp_build_mad(bld, x2, odd, coeff);
3004          else
3005             odd = coeff;
3006       }
3007    }
3008
3009    if (odd)
3010       return lp_build_mad(bld, odd, x, even);
3011    else if (even)
3012       return even;
3013    else
3014       return bld->undef;
3015 }
3016
3017
3018 /**
3019  * Minimax polynomial fit of 2**x, in range [0, 1[
3020  */
3021 const double lp_build_exp2_polynomial[] = {
3022 #if EXP_POLY_DEGREE == 5
3023    1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
3024    0.693153073200168932794,
3025    0.240153617044375388211,
3026    0.0558263180532956664775,
3027    0.00898934009049466391101,
3028    0.00187757667519147912699
3029 #elif EXP_POLY_DEGREE == 4
3030    1.00000259337069434683,
3031    0.693003834469974940458,
3032    0.24144275689150793076,
3033    0.0520114606103070150235,
3034    0.0135341679161270268764
3035 #elif EXP_POLY_DEGREE == 3
3036    0.999925218562710312959,
3037    0.695833540494823811697,
3038    0.226067155427249155588,
3039    0.0780245226406372992967
3040 #elif EXP_POLY_DEGREE == 2
3041    1.00172476321474503578,
3042    0.657636275736077639316,
3043    0.33718943461968720704
3044 #else
3045 #error
3046 #endif
3047 };
3048
3049
3050 LLVMValueRef
3051 lp_build_exp2(struct lp_build_context *bld,
3052               LLVMValueRef x)
3053 {
3054    LLVMBuilderRef builder = bld->gallivm->builder;
3055    const struct lp_type type = bld->type;
3056    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3057    LLVMValueRef ipart = NULL;
3058    LLVMValueRef fpart = NULL;
3059    LLVMValueRef expipart = NULL;
3060    LLVMValueRef expfpart = NULL;
3061    LLVMValueRef res = NULL;
3062
3063    assert(lp_check_value(bld->type, x));
3064
3065    /* TODO: optimize the constant case */
3066    if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3067        LLVMIsConstant(x)) {
3068       debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3069                    __FUNCTION__);
3070    }
3071
3072    assert(type.floating && type.width == 32);
3073
3074    /* We want to preserve NaN and make sure than for exp2 if x > 128,
3075     * the result is INF  and if it's smaller than -126.9 the result is 0 */
3076    x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type,  128.0), x,
3077                         GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3078    x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
3079                         x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3080
3081    /* ipart = floor(x) */
3082    /* fpart = x - ipart */
3083    lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3084
3085    /* expipart = (float) (1 << ipart) */
3086    expipart = LLVMBuildAdd(builder, ipart,
3087                            lp_build_const_int_vec(bld->gallivm, type, 127), "");
3088    expipart = LLVMBuildShl(builder, expipart,
3089                            lp_build_const_int_vec(bld->gallivm, type, 23), "");
3090    expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
3091
3092    expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3093                                   ARRAY_SIZE(lp_build_exp2_polynomial));
3094
3095    res = LLVMBuildFMul(builder, expipart, expfpart, "");
3096
3097    return res;
3098 }
3099
3100
3101
3102 /**
3103  * Extract the exponent of a IEEE-754 floating point value.
3104  *
3105  * Optionally apply an integer bias.
3106  *
3107  * Result is an integer value with
3108  *
3109  *   ifloor(log2(x)) + bias
3110  */
3111 LLVMValueRef
3112 lp_build_extract_exponent(struct lp_build_context *bld,
3113                           LLVMValueRef x,
3114                           int bias)
3115 {
3116    LLVMBuilderRef builder = bld->gallivm->builder;
3117    const struct lp_type type = bld->type;
3118    unsigned mantissa = lp_mantissa(type);
3119    LLVMValueRef res;
3120
3121    assert(type.floating);
3122
3123    assert(lp_check_value(bld->type, x));
3124
3125    x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3126
3127    res = LLVMBuildLShr(builder, x,
3128                        lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3129    res = LLVMBuildAnd(builder, res,
3130                       lp_build_const_int_vec(bld->gallivm, type, 255), "");
3131    res = LLVMBuildSub(builder, res,
3132                       lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3133
3134    return res;
3135 }
3136
3137
3138 /**
3139  * Extract the mantissa of the a floating.
3140  *
3141  * Result is a floating point value with
3142  *
3143  *   x / floor(log2(x))
3144  */
3145 LLVMValueRef
3146 lp_build_extract_mantissa(struct lp_build_context *bld,
3147                           LLVMValueRef x)
3148 {
3149    LLVMBuilderRef builder = bld->gallivm->builder;
3150    const struct lp_type type = bld->type;
3151    unsigned mantissa = lp_mantissa(type);
3152    LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3153                                                   (1ULL << mantissa) - 1);
3154    LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3155    LLVMValueRef res;
3156
3157    assert(lp_check_value(bld->type, x));
3158
3159    assert(type.floating);
3160
3161    x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3162
3163    /* res = x / 2**ipart */
3164    res = LLVMBuildAnd(builder, x, mantmask, "");
3165    res = LLVMBuildOr(builder, res, one, "");
3166    res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3167
3168    return res;
3169 }
3170
3171
3172
3173 /**
3174  * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x) ,for x in range of [0, 1/9[
3175  * These coefficients can be generate with
3176  * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3177  */
3178 const double lp_build_log2_polynomial[] = {
3179 #if LOG_POLY_DEGREE == 5
3180    2.88539008148777786488L,
3181    0.961796878841293367824L,
3182    0.577058946784739859012L,
3183    0.412914355135828735411L,
3184    0.308591899232910175289L,
3185    0.352376952300281371868L,
3186 #elif LOG_POLY_DEGREE == 4
3187    2.88539009343309178325L,
3188    0.961791550404184197881L,
3189    0.577440339438736392009L,
3190    0.403343858251329912514L,
3191    0.406718052498846252698L,
3192 #elif LOG_POLY_DEGREE == 3
3193    2.88538959748872753838L,
3194    0.961932915889597772928L,
3195    0.571118517972136195241L,
3196    0.493997535084709500285L,
3197 #else
3198 #error
3199 #endif
3200 };
3201
3202 /**
3203  * See http://www.devmaster.net/forums/showthread.php?p=43580
3204  * http://en.wikipedia.org/wiki/Logarithm#Calculation
3205  * http://www.nezumi.demon.co.uk/consult/logx.htm
3206  *
3207  * If handle_edge_cases is true the function will perform computations
3208  * to match the required D3D10+ behavior for each of the edge cases.
3209  * That means that if input is:
3210  * - less than zero (to and including -inf) then NaN will be returned
3211  * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3212  * - +infinity, then +infinity will be returned
3213  * - NaN, then NaN will be returned
3214  *
3215  * Those checks are fairly expensive so if you don't need them make sure
3216  * handle_edge_cases is false.
3217  */
3218 void
3219 lp_build_log2_approx(struct lp_build_context *bld,
3220                      LLVMValueRef x,
3221                      LLVMValueRef *p_exp,
3222                      LLVMValueRef *p_floor_log2,
3223                      LLVMValueRef *p_log2,
3224                      boolean handle_edge_cases)
3225 {
3226    LLVMBuilderRef builder = bld->gallivm->builder;
3227    const struct lp_type type = bld->type;
3228    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3229    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3230
3231    LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3232    LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3233    LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3234
3235    LLVMValueRef i = NULL;
3236    LLVMValueRef y = NULL;
3237    LLVMValueRef z = NULL;
3238    LLVMValueRef exp = NULL;
3239    LLVMValueRef mant = NULL;
3240    LLVMValueRef logexp = NULL;
3241    LLVMValueRef p_z = NULL;
3242    LLVMValueRef res = NULL;
3243
3244    assert(lp_check_value(bld->type, x));
3245
3246    if(p_exp || p_floor_log2 || p_log2) {
3247       /* TODO: optimize the constant case */
3248       if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3249           LLVMIsConstant(x)) {
3250          debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3251                       __FUNCTION__);
3252       }
3253
3254       assert(type.floating && type.width == 32);
3255
3256       /*
3257        * We don't explicitly handle denormalized numbers. They will yield a
3258        * result in the neighbourhood of -127, which appears to be adequate
3259        * enough.
3260        */
3261
3262       i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3263
3264       /* exp = (float) exponent(x) */
3265       exp = LLVMBuildAnd(builder, i, expmask, "");
3266    }
3267
3268    if(p_floor_log2 || p_log2) {
3269       logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3270       logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3271       logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3272    }
3273
3274    if (p_log2) {
3275       /* mant = 1 + (float) mantissa(x) */
3276       mant = LLVMBuildAnd(builder, i, mantmask, "");
3277       mant = LLVMBuildOr(builder, mant, one, "");
3278       mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3279
3280       /* y = (mant - 1) / (mant + 1) */
3281       y = lp_build_div(bld,
3282          lp_build_sub(bld, mant, bld->one),
3283          lp_build_add(bld, mant, bld->one)
3284       );
3285
3286       /* z = y^2 */
3287       z = lp_build_mul(bld, y, y);
3288
3289       /* compute P(z) */
3290       p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3291                                 ARRAY_SIZE(lp_build_log2_polynomial));
3292
3293       /* y * P(z) + logexp */
3294       res = lp_build_mad(bld, y, p_z, logexp);
3295
3296       if (type.floating && handle_edge_cases) {
3297          LLVMValueRef negmask, infmask,  zmask;
3298          negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3299                                 lp_build_const_vec(bld->gallivm, type,  0.0f));
3300          zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3301                               lp_build_const_vec(bld->gallivm, type,  0.0f));
3302          infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3303                                 lp_build_const_vec(bld->gallivm, type,  INFINITY));
3304
3305          /* If x is qual to inf make sure we return inf */
3306          res = lp_build_select(bld, infmask,
3307                                lp_build_const_vec(bld->gallivm, type,  INFINITY),
3308                                res);
3309          /* If x is qual to 0, return -inf */
3310          res = lp_build_select(bld, zmask,
3311                                lp_build_const_vec(bld->gallivm, type,  -INFINITY),
3312                                res);
3313          /* If x is nan or less than 0, return nan */
3314          res = lp_build_select(bld, negmask,
3315                                lp_build_const_vec(bld->gallivm, type,  NAN),
3316                                res);
3317       }
3318    }
3319
3320    if (p_exp) {
3321       exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3322       *p_exp = exp;
3323    }
3324
3325    if (p_floor_log2)
3326       *p_floor_log2 = logexp;
3327
3328    if (p_log2)
3329       *p_log2 = res;
3330 }
3331
3332
3333 /*
3334  * log2 implementation which doesn't have special code to
3335  * handle edge cases (-inf, 0, inf, NaN). It's faster but
3336  * the results for those cases are undefined.
3337  */
3338 LLVMValueRef
3339 lp_build_log2(struct lp_build_context *bld,
3340               LLVMValueRef x)
3341 {
3342    LLVMValueRef res;
3343    lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
3344    return res;
3345 }
3346
3347 /*
3348  * Version of log2 which handles all edge cases.
3349  * Look at documentation of lp_build_log2_approx for
3350  * description of the behavior for each of the edge cases.
3351  */
3352 LLVMValueRef
3353 lp_build_log2_safe(struct lp_build_context *bld,
3354                    LLVMValueRef x)
3355 {
3356    LLVMValueRef res;
3357    lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
3358    return res;
3359 }
3360
3361
3362 /**
3363  * Faster (and less accurate) log2.
3364  *
3365  *    log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3366  *
3367  * Piece-wise linear approximation, with exact results when x is a
3368  * power of two.
3369  *
3370  * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3371  */
3372 LLVMValueRef
3373 lp_build_fast_log2(struct lp_build_context *bld,
3374                    LLVMValueRef x)
3375 {
3376    LLVMBuilderRef builder = bld->gallivm->builder;
3377    LLVMValueRef ipart;
3378    LLVMValueRef fpart;
3379
3380    assert(lp_check_value(bld->type, x));
3381
3382    assert(bld->type.floating);
3383
3384    /* ipart = floor(log2(x)) - 1 */
3385    ipart = lp_build_extract_exponent(bld, x, -1);
3386    ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3387
3388    /* fpart = x / 2**ipart */
3389    fpart = lp_build_extract_mantissa(bld, x);
3390
3391    /* ipart + fpart */
3392    return LLVMBuildFAdd(builder, ipart, fpart, "");
3393 }
3394
3395
3396 /**
3397  * Fast implementation of iround(log2(x)).
3398  *
3399  * Not an approximation -- it should give accurate results all the time.
3400  */
3401 LLVMValueRef
3402 lp_build_ilog2(struct lp_build_context *bld,
3403                LLVMValueRef x)
3404 {
3405    LLVMBuilderRef builder = bld->gallivm->builder;
3406    LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3407    LLVMValueRef ipart;
3408
3409    assert(bld->type.floating);
3410
3411    assert(lp_check_value(bld->type, x));
3412
3413    /* x * 2^(0.5)   i.e., add 0.5 to the log2(x) */
3414    x = LLVMBuildFMul(builder, x, sqrt2, "");
3415
3416    /* ipart = floor(log2(x) + 0.5)  */
3417    ipart = lp_build_extract_exponent(bld, x, 0);
3418
3419    return ipart;
3420 }
3421
3422 LLVMValueRef
3423 lp_build_mod(struct lp_build_context *bld,
3424              LLVMValueRef x,
3425              LLVMValueRef y)
3426 {
3427    LLVMBuilderRef builder = bld->gallivm->builder;
3428    LLVMValueRef res;
3429    const struct lp_type type = bld->type;
3430
3431    assert(lp_check_value(type, x));
3432    assert(lp_check_value(type, y));
3433
3434    if (type.floating)
3435       res = LLVMBuildFRem(builder, x, y, "");
3436    else if (type.sign)
3437       res = LLVMBuildSRem(builder, x, y, "");
3438    else
3439       res = LLVMBuildURem(builder, x, y, "");
3440    return res;
3441 }
3442
3443
3444 /*
3445  * For floating inputs it creates and returns a mask
3446  * which is all 1's for channels which are NaN.
3447  * Channels inside x which are not NaN will be 0.
3448  */
3449 LLVMValueRef
3450 lp_build_isnan(struct lp_build_context *bld,
3451                LLVMValueRef x)
3452 {
3453    LLVMValueRef mask;
3454    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3455
3456    assert(bld->type.floating);
3457    assert(lp_check_value(bld->type, x));
3458
3459    mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3460                         "isnotnan");
3461    mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3462    mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3463    return mask;
3464 }
3465
3466 /* Returns all 1's for floating point numbers that are
3467  * finite numbers and returns all zeros for -inf,
3468  * inf and nan's */
3469 LLVMValueRef
3470 lp_build_isfinite(struct lp_build_context *bld,
3471                   LLVMValueRef x)
3472 {
3473    LLVMBuilderRef builder = bld->gallivm->builder;
3474    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3475    struct lp_type int_type = lp_int_type(bld->type);
3476    LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3477    LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3478                                                     0x7f800000);
3479
3480    if (!bld->type.floating) {
3481       return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3482    }
3483    assert(bld->type.floating);
3484    assert(lp_check_value(bld->type, x));
3485    assert(bld->type.width == 32);
3486
3487    intx = LLVMBuildAnd(builder, intx, infornan32, "");
3488    return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3489                            intx, infornan32);
3490 }
3491
3492 /*
3493  * Returns true if the number is nan or inf and false otherwise.
3494  * The input has to be a floating point vector.
3495  */
3496 LLVMValueRef
3497 lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
3498                        const struct lp_type type,
3499                        LLVMValueRef x)
3500 {
3501    LLVMBuilderRef builder = gallivm->builder;
3502    struct lp_type int_type = lp_int_type(type);
3503    LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
3504                                                 0x7f800000);
3505    LLVMValueRef ret;
3506
3507    assert(type.floating);
3508
3509    ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
3510    ret = LLVMBuildAnd(builder, ret, const0, "");
3511    ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
3512                           ret, const0);
3513
3514    return ret;
3515 }
3516
3517
3518 LLVMValueRef
3519 lp_build_fpstate_get(struct gallivm_state *gallivm)
3520 {
3521    if (util_cpu_caps.has_sse) {
3522       LLVMBuilderRef builder = gallivm->builder;
3523       LLVMValueRef mxcsr_ptr = lp_build_alloca(
3524          gallivm,
3525          LLVMInt32TypeInContext(gallivm->context),
3526          "mxcsr_ptr");
3527       LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
3528           LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3529       lp_build_intrinsic(builder,
3530                          "llvm.x86.sse.stmxcsr",
3531                          LLVMVoidTypeInContext(gallivm->context),
3532                          &mxcsr_ptr8, 1, 0);
3533       return mxcsr_ptr;
3534    }
3535    return 0;
3536 }
3537
3538 void
3539 lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
3540                                   boolean zero)
3541 {
3542    if (util_cpu_caps.has_sse) {
3543       /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
3544       int daz_ftz = _MM_FLUSH_ZERO_MASK;
3545
3546       LLVMBuilderRef builder = gallivm->builder;
3547       LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
3548       LLVMValueRef mxcsr =
3549          LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");
3550
3551       if (util_cpu_caps.has_daz) {
3552          /* Enable denormals are zero mode */
3553          daz_ftz |= _MM_DENORMALS_ZERO_MASK;
3554       }
3555       if (zero) {
3556          mxcsr = LLVMBuildOr(builder, mxcsr,
3557                              LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
3558       } else {
3559          mxcsr = LLVMBuildAnd(builder, mxcsr,
3560                               LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
3561       }
3562
3563       LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
3564       lp_build_fpstate_set(gallivm, mxcsr_ptr);
3565    }
3566 }
3567
3568 void
3569 lp_build_fpstate_set(struct gallivm_state *gallivm,
3570                      LLVMValueRef mxcsr_ptr)
3571 {
3572    if (util_cpu_caps.has_sse) {
3573       LLVMBuilderRef builder = gallivm->builder;
3574       mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
3575                      LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3576       lp_build_intrinsic(builder,
3577                          "llvm.x86.sse.ldmxcsr",
3578                          LLVMVoidTypeInContext(gallivm->context),
3579                          &mxcsr_ptr, 1, 0);
3580    }
3581 }