gallivm: fix [IU]MUL_HI regression
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in the [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include <float.h>
49
50 #include "util/u_memory.h"
51 #include "util/u_debug.h"
52 #include "util/u_math.h"
53 #include "util/u_cpu_detect.h"
54
55 #include "lp_bld_type.h"
56 #include "lp_bld_const.h"
57 #include "lp_bld_init.h"
58 #include "lp_bld_intr.h"
59 #include "lp_bld_logic.h"
60 #include "lp_bld_pack.h"
61 #include "lp_bld_debug.h"
62 #include "lp_bld_bitarit.h"
63 #include "lp_bld_arit.h"
64 #include "lp_bld_flow.h"
65
66 #if defined(PIPE_ARCH_SSE)
67 #include <xmmintrin.h>
68 #endif
69
70 #ifndef _MM_DENORMALS_ZERO_MASK
71 #define _MM_DENORMALS_ZERO_MASK 0x0040
72 #endif
73
74 #ifndef _MM_FLUSH_ZERO_MASK
75 #define _MM_FLUSH_ZERO_MASK 0x8000
76 #endif
77
78 #define EXP_POLY_DEGREE 5
79
80 #define LOG_POLY_DEGREE 4
81
82
83 /**
84 * Generate min(a, b)
85 * No checks for special-case values of a or b (0 or 1) are done.
86 * NaNs are handled according to the behavior specified by the
87 * nan_behavior argument.
88 */
89 static LLVMValueRef
90 lp_build_min_simple(struct lp_build_context *bld,
91 LLVMValueRef a,
92 LLVMValueRef b,
93 enum gallivm_nan_behavior nan_behavior)
94 {
95 const struct lp_type type = bld->type;
96 const char *intrinsic = NULL;
97 unsigned intr_size = 0;
98 LLVMValueRef cond;
99
100 assert(lp_check_value(type, a));
101 assert(lp_check_value(type, b));
102
103 /* TODO: optimize the constant case */
104
105 if (type.floating && util_cpu_caps.has_sse) {
106 if (type.width == 32) {
107 if (type.length == 1) {
108 intrinsic = "llvm.x86.sse.min.ss";
109 intr_size = 128;
110 }
111 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
112 intrinsic = "llvm.x86.sse.min.ps";
113 intr_size = 128;
114 }
115 else {
116 intrinsic = "llvm.x86.avx.min.ps.256";
117 intr_size = 256;
118 }
119 }
120 if (type.width == 64 && util_cpu_caps.has_sse2) {
121 if (type.length == 1) {
122 intrinsic = "llvm.x86.sse2.min.sd";
123 intr_size = 128;
124 }
125 else if (type.length == 2 || !util_cpu_caps.has_avx) {
126 intrinsic = "llvm.x86.sse2.min.pd";
127 intr_size = 128;
128 }
129 else {
130 intrinsic = "llvm.x86.avx.min.pd.256";
131 intr_size = 256;
132 }
133 }
134 }
135 else if (type.floating && util_cpu_caps.has_altivec) {
136 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
137 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
138 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
139 __FUNCTION__);
140 }
141 if (type.width == 32 && type.length == 4) {
142 intrinsic = "llvm.ppc.altivec.vminfp";
143 intr_size = 128;
144 }
145 } else if (HAVE_LLVM < 0x0309 &&
146 util_cpu_caps.has_avx2 && type.length > 4) {
147 intr_size = 256;
148 switch (type.width) {
149 case 8:
150 intrinsic = type.sign ? "llvm.x86.avx2.pmins.b" : "llvm.x86.avx2.pminu.b";
151 break;
152 case 16:
153 intrinsic = type.sign ? "llvm.x86.avx2.pmins.w" : "llvm.x86.avx2.pminu.w";
154 break;
155 case 32:
156 intrinsic = type.sign ? "llvm.x86.avx2.pmins.d" : "llvm.x86.avx2.pminu.d";
157 break;
158 }
159 } else if (HAVE_LLVM < 0x0309 &&
160 util_cpu_caps.has_sse2 && type.length >= 2) {
161 intr_size = 128;
162 if ((type.width == 8 || type.width == 16) &&
163 (type.width * type.length <= 64) &&
164 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
165 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
166 __FUNCTION__);
167 }
168 if (type.width == 8 && !type.sign) {
169 intrinsic = "llvm.x86.sse2.pminu.b";
170 }
171 else if (type.width == 16 && type.sign) {
172 intrinsic = "llvm.x86.sse2.pmins.w";
173 }
174 if (util_cpu_caps.has_sse4_1) {
175 if (type.width == 8 && type.sign) {
176 intrinsic = "llvm.x86.sse41.pminsb";
177 }
178 if (type.width == 16 && !type.sign) {
179 intrinsic = "llvm.x86.sse41.pminuw";
180 }
181 if (type.width == 32 && !type.sign) {
182 intrinsic = "llvm.x86.sse41.pminud";
183 }
184 if (type.width == 32 && type.sign) {
185 intrinsic = "llvm.x86.sse41.pminsd";
186 }
187 }
188 } else if (util_cpu_caps.has_altivec) {
189 intr_size = 128;
190 if (type.width == 8) {
191 if (!type.sign) {
192 intrinsic = "llvm.ppc.altivec.vminub";
193 } else {
194 intrinsic = "llvm.ppc.altivec.vminsb";
195 }
196 } else if (type.width == 16) {
197 if (!type.sign) {
198 intrinsic = "llvm.ppc.altivec.vminuh";
199 } else {
200 intrinsic = "llvm.ppc.altivec.vminsh";
201 }
202 } else if (type.width == 32) {
203 if (!type.sign) {
204 intrinsic = "llvm.ppc.altivec.vminuw";
205 } else {
206 intrinsic = "llvm.ppc.altivec.vminsw";
207 }
208 }
209 }
210
211 if (intrinsic) {
212 /* We need to handle NaNs for floating point numbers. If one of the
213 * inputs is a NaN the other should be returned (required by both D3D10+
214 * and OpenCL).
215 * The SSE intrinsics return the second operand in case of a NaN by
216 * default, so we need special code to handle those cases.
217 */
218 if (util_cpu_caps.has_sse && type.floating &&
219 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
220 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
221 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
222 LLVMValueRef isnan, min;
223 min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
224 type,
225 intr_size, a, b);
226 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
227 isnan = lp_build_isnan(bld, b);
228 return lp_build_select(bld, isnan, a, min);
229 } else {
230 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
231 isnan = lp_build_isnan(bld, a);
232 return lp_build_select(bld, isnan, a, min);
233 }
234 } else {
235 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
236 type,
237 intr_size, a, b);
238 }
239 }
240
241 if (type.floating) {
242 switch (nan_behavior) {
243 case GALLIVM_NAN_RETURN_NAN: {
244 LLVMValueRef isnan = lp_build_isnan(bld, b);
245 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
246 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
247 return lp_build_select(bld, cond, a, b);
248 }
249 break;
250 case GALLIVM_NAN_RETURN_OTHER: {
251 LLVMValueRef isnan = lp_build_isnan(bld, a);
252 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
253 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
254 return lp_build_select(bld, cond, a, b);
255 }
256 break;
257 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
258 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
259 return lp_build_select(bld, cond, a, b);
260 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
261 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
262 return lp_build_select(bld, cond, b, a);
263 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
264 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
265 return lp_build_select(bld, cond, a, b);
266 break;
267 default:
268 assert(0);
269 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
270 return lp_build_select(bld, cond, a, b);
271 }
272 } else {
273 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
274 return lp_build_select(bld, cond, a, b);
275 }
276 }
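
/*
 * A minimal scalar sketch (illustration only, not referenced by the code
 * here) of the NaN rule described above for GALLIVM_NAN_RETURN_OTHER:
 * when exactly one of the two inputs is a NaN, the other input is returned.
 */
static inline float
lp_ref_min_nan_return_other(float a, float b)
{
   if (a != a)                /* a is NaN -> return the other input */
      return b;
   if (b != b)                /* b is NaN -> return the other input */
      return a;
   return a < b ? a : b;
}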
277
278
279 LLVMValueRef
280 lp_build_fmuladd(LLVMBuilderRef builder,
281 LLVMValueRef a,
282 LLVMValueRef b,
283 LLVMValueRef c)
284 {
285 LLVMTypeRef type = LLVMTypeOf(a);
286 assert(type == LLVMTypeOf(b));
287 assert(type == LLVMTypeOf(c));
288 if (HAVE_LLVM < 0x0304) {
289 /* XXX: LLVM 3.3 does not break down llvm.fmuladd into mul+add when FMA is
290 * not supported, and instead it falls back to a C function.
291 */
292 return LLVMBuildFAdd(builder, LLVMBuildFMul(builder, a, b, ""), c, "");
293 }
294 char intrinsic[32];
295 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
296 LLVMValueRef args[] = { a, b, c };
297 return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0);
298 }
299
300
301 /**
302 * Generate max(a, b)
303 * No checks for special-case values of a or b (0 or 1) are done.
304 * NaNs are handled according to the behavior specified by the
305 * nan_behavior argument.
306 */
307 static LLVMValueRef
308 lp_build_max_simple(struct lp_build_context *bld,
309 LLVMValueRef a,
310 LLVMValueRef b,
311 enum gallivm_nan_behavior nan_behavior)
312 {
313 const struct lp_type type = bld->type;
314 const char *intrinsic = NULL;
315 unsigned intr_size = 0;
316 LLVMValueRef cond;
317
318 assert(lp_check_value(type, a));
319 assert(lp_check_value(type, b));
320
321 /* TODO: optimize the constant case */
322
323 if (type.floating && util_cpu_caps.has_sse) {
324 if (type.width == 32) {
325 if (type.length == 1) {
326 intrinsic = "llvm.x86.sse.max.ss";
327 intr_size = 128;
328 }
329 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
330 intrinsic = "llvm.x86.sse.max.ps";
331 intr_size = 128;
332 }
333 else {
334 intrinsic = "llvm.x86.avx.max.ps.256";
335 intr_size = 256;
336 }
337 }
338 if (type.width == 64 && util_cpu_caps.has_sse2) {
339 if (type.length == 1) {
340 intrinsic = "llvm.x86.sse2.max.sd";
341 intr_size = 128;
342 }
343 else if (type.length == 2 || !util_cpu_caps.has_avx) {
344 intrinsic = "llvm.x86.sse2.max.pd";
345 intr_size = 128;
346 }
347 else {
348 intrinsic = "llvm.x86.avx.max.pd.256";
349 intr_size = 256;
350 }
351 }
352 }
353 else if (type.floating && util_cpu_caps.has_altivec) {
354 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
355 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
356 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
357 __FUNCTION__);
358 }
359 if (type.width == 32 && type.length == 4) {
360 intrinsic = "llvm.ppc.altivec.vmaxfp";
361 intr_size = 128;
362 }
363 } else if (HAVE_LLVM < 0x0309 &&
364 util_cpu_caps.has_avx2 && type.length > 4) {
365 intr_size = 256;
366 switch (type.width) {
367 case 8:
368 intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.b" : "llvm.x86.avx2.pmaxu.b";
369 break;
370 case 16:
371 intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.w" : "llvm.x86.avx2.pmaxu.w";
372 break;
373 case 32:
374 intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.d" : "llvm.x86.avx2.pmaxu.d";
375 break;
376 }
377 } else if (HAVE_LLVM < 0x0309 &&
378 util_cpu_caps.has_sse2 && type.length >= 2) {
379 intr_size = 128;
380 if ((type.width == 8 || type.width == 16) &&
381 (type.width * type.length <= 64) &&
382 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
383 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
384 __FUNCTION__);
385 }
386 if (type.width == 8 && !type.sign) {
387 intrinsic = "llvm.x86.sse2.pmaxu.b";
388 intr_size = 128;
389 }
390 else if (type.width == 16 && type.sign) {
391 intrinsic = "llvm.x86.sse2.pmaxs.w";
392 }
393 if (util_cpu_caps.has_sse4_1) {
394 if (type.width == 8 && type.sign) {
395 intrinsic = "llvm.x86.sse41.pmaxsb";
396 }
397 if (type.width == 16 && !type.sign) {
398 intrinsic = "llvm.x86.sse41.pmaxuw";
399 }
400 if (type.width == 32 && !type.sign) {
401 intrinsic = "llvm.x86.sse41.pmaxud";
402 }
403 if (type.width == 32 && type.sign) {
404 intrinsic = "llvm.x86.sse41.pmaxsd";
405 }
406 }
407 } else if (util_cpu_caps.has_altivec) {
408 intr_size = 128;
409 if (type.width == 8) {
410 if (!type.sign) {
411 intrinsic = "llvm.ppc.altivec.vmaxub";
412 } else {
413 intrinsic = "llvm.ppc.altivec.vmaxsb";
414 }
415 } else if (type.width == 16) {
416 if (!type.sign) {
417 intrinsic = "llvm.ppc.altivec.vmaxuh";
418 } else {
419 intrinsic = "llvm.ppc.altivec.vmaxsh";
420 }
421 } else if (type.width == 32) {
422 if (!type.sign) {
423 intrinsic = "llvm.ppc.altivec.vmaxuw";
424 } else {
425 intrinsic = "llvm.ppc.altivec.vmaxsw";
426 }
427 }
428 }
429
430 if (intrinsic) {
431 if (util_cpu_caps.has_sse && type.floating &&
432 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
433 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
434 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
435 LLVMValueRef isnan, max;
436 max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
437 type,
438 intr_size, a, b);
439 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
440 isnan = lp_build_isnan(bld, b);
441 return lp_build_select(bld, isnan, a, max);
442 } else {
443 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
444 isnan = lp_build_isnan(bld, a);
445 return lp_build_select(bld, isnan, a, max);
446 }
447 } else {
448 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
449 type,
450 intr_size, a, b);
451 }
452 }
453
454 if (type.floating) {
455 switch (nan_behavior) {
456 case GALLIVM_NAN_RETURN_NAN: {
457 LLVMValueRef isnan = lp_build_isnan(bld, b);
458 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
459 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
460 return lp_build_select(bld, cond, a, b);
461 }
462 break;
463 case GALLIVM_NAN_RETURN_OTHER: {
464 LLVMValueRef isnan = lp_build_isnan(bld, a);
465 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
466 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
467 return lp_build_select(bld, cond, a, b);
468 }
469 break;
470 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
471 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
472 return lp_build_select(bld, cond, a, b);
473 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
474 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
475 return lp_build_select(bld, cond, b, a);
476 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
477 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
478 return lp_build_select(bld, cond, a, b);
479 break;
480 default:
481 assert(0);
482 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
483 return lp_build_select(bld, cond, a, b);
484 }
485 } else {
486 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
487 return lp_build_select(bld, cond, a, b);
488 }
489 }
490
491
492 /**
493 * Generate 1 - a, or ~a depending on bld->type.
494 */
495 LLVMValueRef
496 lp_build_comp(struct lp_build_context *bld,
497 LLVMValueRef a)
498 {
499 LLVMBuilderRef builder = bld->gallivm->builder;
500 const struct lp_type type = bld->type;
501
502 assert(lp_check_value(type, a));
503
504 if(a == bld->one)
505 return bld->zero;
506 if(a == bld->zero)
507 return bld->one;
508
509 if(type.norm && !type.floating && !type.fixed && !type.sign) {
510 if(LLVMIsConstant(a))
511 return LLVMConstNot(a);
512 else
513 return LLVMBuildNot(builder, a, "");
514 }
515
516 if(LLVMIsConstant(a))
517 if (type.floating)
518 return LLVMConstFSub(bld->one, a);
519 else
520 return LLVMConstSub(bld->one, a);
521 else
522 if (type.floating)
523 return LLVMBuildFSub(builder, bld->one, a, "");
524 else
525 return LLVMBuildSub(builder, bld->one, a, "");
526 }
527
528
529 /**
530 * Generate a + b
531 */
532 LLVMValueRef
533 lp_build_add(struct lp_build_context *bld,
534 LLVMValueRef a,
535 LLVMValueRef b)
536 {
537 LLVMBuilderRef builder = bld->gallivm->builder;
538 const struct lp_type type = bld->type;
539 LLVMValueRef res;
540
541 assert(lp_check_value(type, a));
542 assert(lp_check_value(type, b));
543
544 if(a == bld->zero)
545 return b;
546 if(b == bld->zero)
547 return a;
548 if(a == bld->undef || b == bld->undef)
549 return bld->undef;
550
551 if(bld->type.norm) {
552 const char *intrinsic = NULL;
553
554 if(a == bld->one || b == bld->one)
555 return bld->one;
556
557 if (!type.floating && !type.fixed) {
558 if (type.width * type.length == 128) {
559 if(util_cpu_caps.has_sse2) {
560 if(type.width == 8)
561 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
562 if(type.width == 16)
563 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
564 } else if (util_cpu_caps.has_altivec) {
565 if(type.width == 8)
566 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
567 if(type.width == 16)
568 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
569 }
570 }
571 if (type.width * type.length == 256) {
572 if(util_cpu_caps.has_avx2) {
573 if(type.width == 8)
574 intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b";
575 if(type.width == 16)
576 intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : "llvm.x86.avx2.paddus.w";
577 }
578 }
579 }
580
581 if (intrinsic)
582 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
583 }
584
585 if(type.norm && !type.floating && !type.fixed) {
586 if (type.sign) {
587 uint64_t sign = (uint64_t)1 << (type.width - 1);
588 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
589 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
590 /* a_clamp_max is the maximum a for positive b,
591 a_clamp_min is the minimum a for negative b. */
592 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
593 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
594 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
595 } else {
596 a = lp_build_min_simple(bld, a, lp_build_comp(bld, b), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
597 }
598 }
599
600 if(LLVMIsConstant(a) && LLVMIsConstant(b))
601 if (type.floating)
602 res = LLVMConstFAdd(a, b);
603 else
604 res = LLVMConstAdd(a, b);
605 else
606 if (type.floating)
607 res = LLVMBuildFAdd(builder, a, b, "");
608 else
609 res = LLVMBuildAdd(builder, a, b, "");
610
611 /* clamp to ceiling of 1.0 */
612 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
613 res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
614
615 /* XXX clamp to floor of -1 or 0??? */
616
617 return res;
618 }
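
/*
 * Scalar sketch (illustration only, not used by this file) of the clamping
 * idea used above for signed saturated add: clamp a so that a + b can never
 * leave the representable range, then add. Shown here for 16 bit values.
 */
static inline int
lp_ref_padds_i16(int a, int b)
{
   if (b > 0 && a > 32767 - b)      /* maximum a for positive b */
      a = 32767 - b;
   if (b < 0 && a < -32768 - b)     /* minimum a for negative b */
      a = -32768 - b;
   return a + b;
}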
619
620
621 /** Return the scalar sum of the elements of a.
622 * Should avoid this operation whenever possible.
623 */
624 LLVMValueRef
625 lp_build_horizontal_add(struct lp_build_context *bld,
626 LLVMValueRef a)
627 {
628 LLVMBuilderRef builder = bld->gallivm->builder;
629 const struct lp_type type = bld->type;
630 LLVMValueRef index, res;
631 unsigned i, length;
632 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
633 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
634 LLVMValueRef vecres, elem2;
635
636 assert(lp_check_value(type, a));
637
638 if (type.length == 1) {
639 return a;
640 }
641
642 assert(!bld->type.norm);
643
644 /*
645 * for byte vectors we could do much better with psadbw.
646 * Using repeated shuffle/adds here. Note with multiple vectors
647 * this can be done more efficiently as outlined in the intel
648 * optimization manual.
649 * Note: could cause data rearrangement if used with smaller element
650 * sizes.
651 */
652
653 vecres = a;
654 length = type.length / 2;
655 while (length > 1) {
656 LLVMValueRef vec1, vec2;
657 for (i = 0; i < length; i++) {
658 shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
659 shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
660 }
661 vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
662 LLVMConstVector(shuffles1, length), "");
663 vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
664 LLVMConstVector(shuffles2, length), "");
665 if (type.floating) {
666 vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
667 }
668 else {
669 vecres = LLVMBuildAdd(builder, vec1, vec2, "");
670 }
671 length = length >> 1;
672 }
673
674 /* always have vector of size 2 here */
675 assert(length == 1);
676
677 index = lp_build_const_int32(bld->gallivm, 0);
678 res = LLVMBuildExtractElement(builder, vecres, index, "");
679 index = lp_build_const_int32(bld->gallivm, 1);
680 elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
681
682 if (type.floating)
683 res = LLVMBuildFAdd(builder, res, elem2, "");
684 else
685 res = LLVMBuildAdd(builder, res, elem2, "");
686
687 return res;
688 }
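
/*
 * Scalar sketch (illustration only, not used by this file) of the reduction
 * pattern above: the vector is repeatedly folded in half and the two halves
 * added, so a length-n sum needs log2(n) adds per lane. Assumes a power of
 * two length, as the builder above does.
 */
static inline float
lp_ref_horizontal_add(const float *a, unsigned length)
{
   float tmp[LP_MAX_VECTOR_LENGTH];
   unsigned i;

   for (i = 0; i < length; i++)
      tmp[i] = a[i];

   while (length > 1) {
      length /= 2;
      for (i = 0; i < length; i++)
         tmp[i] += tmp[i + length];    /* add upper half onto lower half */
   }
   return tmp[0];
}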
689
690 /**
691 * Return the horizontal sums of 4 float vectors as a float4 vector.
692 * This uses the technique outlined in the Intel Optimization Manual.
693 */
694 static LLVMValueRef
695 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
696 LLVMValueRef src[4])
697 {
698 struct gallivm_state *gallivm = bld->gallivm;
699 LLVMBuilderRef builder = gallivm->builder;
700 LLVMValueRef shuffles[4];
701 LLVMValueRef tmp[4];
702 LLVMValueRef sumtmp[2], shuftmp[2];
703
704 /* lower half of regs */
705 shuffles[0] = lp_build_const_int32(gallivm, 0);
706 shuffles[1] = lp_build_const_int32(gallivm, 1);
707 shuffles[2] = lp_build_const_int32(gallivm, 4);
708 shuffles[3] = lp_build_const_int32(gallivm, 5);
709 tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
710 LLVMConstVector(shuffles, 4), "");
711 tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
712 LLVMConstVector(shuffles, 4), "");
713
714 /* upper half of regs */
715 shuffles[0] = lp_build_const_int32(gallivm, 2);
716 shuffles[1] = lp_build_const_int32(gallivm, 3);
717 shuffles[2] = lp_build_const_int32(gallivm, 6);
718 shuffles[3] = lp_build_const_int32(gallivm, 7);
719 tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
720 LLVMConstVector(shuffles, 4), "");
721 tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
722 LLVMConstVector(shuffles, 4), "");
723
724 sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
725 sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
726
727 shuffles[0] = lp_build_const_int32(gallivm, 0);
728 shuffles[1] = lp_build_const_int32(gallivm, 2);
729 shuffles[2] = lp_build_const_int32(gallivm, 4);
730 shuffles[3] = lp_build_const_int32(gallivm, 6);
731 shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
732 LLVMConstVector(shuffles, 4), "");
733
734 shuffles[0] = lp_build_const_int32(gallivm, 1);
735 shuffles[1] = lp_build_const_int32(gallivm, 3);
736 shuffles[2] = lp_build_const_int32(gallivm, 5);
737 shuffles[3] = lp_build_const_int32(gallivm, 7);
738 shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
739 LLVMConstVector(shuffles, 4), "");
740
741 return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
742 }
743
744
745 /*
746 * partially horizontally add 2-4 float vectors with length nx4,
747 * i.e. only four adjacent values in each vector will be added,
748 * assuming values are really grouped in 4 which also determines
749 * output order.
750 *
751 * Return a vector of the same length as the initial vectors,
752 * with the excess elements (if any) being undefined.
753 * The element order is independent of number of input vectors.
754 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
755 * the output order thus will be
756 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
757 */
758 LLVMValueRef
759 lp_build_hadd_partial4(struct lp_build_context *bld,
760 LLVMValueRef vectors[],
761 unsigned num_vecs)
762 {
763 struct gallivm_state *gallivm = bld->gallivm;
764 LLVMBuilderRef builder = gallivm->builder;
765 LLVMValueRef ret_vec;
766 LLVMValueRef tmp[4];
767 const char *intrinsic = NULL;
768
769 assert(num_vecs >= 2 && num_vecs <= 4);
770 assert(bld->type.floating);
771
772 /* only use this with at least 2 vectors, as it is sort of expensive
773 * (depending on cpu) and we always need two horizontal adds anyway,
774 * so a shuffle/add approach might be better.
775 */
776
777 tmp[0] = vectors[0];
778 tmp[1] = vectors[1];
779
780 tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
781 tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
782
783 if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
784 bld->type.length == 4) {
785 intrinsic = "llvm.x86.sse3.hadd.ps";
786 }
787 else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
788 bld->type.length == 8) {
789 intrinsic = "llvm.x86.avx.hadd.ps.256";
790 }
791 if (intrinsic) {
792 tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
793 lp_build_vec_type(gallivm, bld->type),
794 tmp[0], tmp[1]);
795 if (num_vecs > 2) {
796 tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
797 lp_build_vec_type(gallivm, bld->type),
798 tmp[2], tmp[3]);
799 }
800 else {
801 tmp[1] = tmp[0];
802 }
803 return lp_build_intrinsic_binary(builder, intrinsic,
804 lp_build_vec_type(gallivm, bld->type),
805 tmp[0], tmp[1]);
806 }
807
808 if (bld->type.length == 4) {
809 ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
810 }
811 else {
812 LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
813 unsigned j;
814 unsigned num_iter = bld->type.length / 4;
815 struct lp_type parttype = bld->type;
816 parttype.length = 4;
817 for (j = 0; j < num_iter; j++) {
818 LLVMValueRef partsrc[4];
819 unsigned i;
820 for (i = 0; i < 4; i++) {
821 partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
822 }
823 partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
824 }
825 ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
826 }
827 return ret_vec;
828 }
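
/*
 * Scalar sketch (illustration only, not used by this file) of the output
 * ordering produced above: result element 4*j + i holds the sum of the j-th
 * group of four adjacent values of input vector i, and lanes with no
 * corresponding input are undefined (zeroed here for simplicity).
 */
static inline void
lp_ref_hadd_partial4(const float *vecs[4], unsigned num_vecs,
                     unsigned length, float *out)
{
   unsigned i, j;

   for (j = 0; j < length / 4; j++) {
      for (i = 0; i < 4; i++) {
         const float *v = i < num_vecs ? vecs[i] : NULL;
         out[4 * j + i] = v ? v[4*j] + v[4*j+1] + v[4*j+2] + v[4*j+3] : 0.0f;
      }
   }
}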
829
830 /**
831 * Generate a - b
832 */
833 LLVMValueRef
834 lp_build_sub(struct lp_build_context *bld,
835 LLVMValueRef a,
836 LLVMValueRef b)
837 {
838 LLVMBuilderRef builder = bld->gallivm->builder;
839 const struct lp_type type = bld->type;
840 LLVMValueRef res;
841
842 assert(lp_check_value(type, a));
843 assert(lp_check_value(type, b));
844
845 if(b == bld->zero)
846 return a;
847 if(a == bld->undef || b == bld->undef)
848 return bld->undef;
849 if(a == b)
850 return bld->zero;
851
852 if(bld->type.norm) {
853 const char *intrinsic = NULL;
854
855 if(b == bld->one)
856 return bld->zero;
857
858 if (!type.floating && !type.fixed) {
859 if (type.width * type.length == 128) {
860 if (util_cpu_caps.has_sse2) {
861 if(type.width == 8)
862 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
863 if(type.width == 16)
864 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
865 } else if (util_cpu_caps.has_altivec) {
866 if(type.width == 8)
867 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
868 if(type.width == 16)
869 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
870 }
871 }
872 if (type.width * type.length == 256) {
873 if (util_cpu_caps.has_avx2) {
874 if(type.width == 8)
875 intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b";
876 if(type.width == 16)
877 intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" : "llvm.x86.avx2.psubus.w";
878 }
879 }
880 }
881
882 if (intrinsic)
883 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
884 }
885
886 if(type.norm && !type.floating && !type.fixed) {
887 if (type.sign) {
888 uint64_t sign = (uint64_t)1 << (type.width - 1);
889 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
890 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
891 /* a_clamp_max is the maximum a for negative b,
892 a_clamp_min is the minimum a for positive b. */
893 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
894 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
895 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
896 } else {
897 a = lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
898 }
899 }
900
901 if(LLVMIsConstant(a) && LLVMIsConstant(b))
902 if (type.floating)
903 res = LLVMConstFSub(a, b);
904 else
905 res = LLVMConstSub(a, b);
906 else
907 if (type.floating)
908 res = LLVMBuildFSub(builder, a, b, "");
909 else
910 res = LLVMBuildSub(builder, a, b, "");
911
912 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
913 res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
914
915 return res;
916 }
917
918
919
920 /**
921 * Normalized multiplication.
922 *
923 * There are several approaches for (using 8-bit normalized multiplication as
924 * an example):
925 *
926 * - alpha plus one
927 *
928 * makes the following approximation to the division (Sree)
929 *
930 * a*b/255 ~= (a*(b + 1)) >> 8
931 *
932 * which is the fastest method that satisfies the following OpenGL criteria of
933 *
934 * 0*0 = 0 and 255*255 = 255
935 *
936 * - geometric series
937 *
938 * takes the geometric series approximation to the division
939 *
940 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
941 *
942 * in this case just the first two terms to fit in 16bit arithmetic
943 *
944 * t/255 ~= (t + (t >> 8)) >> 8
945 *
946 * note that just by itself it doesn't satisfy the OpenGL criteria, as
947 * 255*255 = 254, so the special case b = 255 must be accounted for, or
948 * rounding must be used.
949 *
950 * - geometric series plus rounding
951 *
952 * when using the geometric series division, instead of truncating the
953 * result use rounding in the approximation (Jim Blinn)
954 *
955 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
956 *
957 * achieving exact results.
958 *
959 *
960 *
961 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
962 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
963 * @sa Michael Herf, The "double blend trick", May 2000,
964 * http://www.stereopsis.com/doubleblend.html
965 */
966 static LLVMValueRef
967 lp_build_mul_norm(struct gallivm_state *gallivm,
968 struct lp_type wide_type,
969 LLVMValueRef a, LLVMValueRef b)
970 {
971 LLVMBuilderRef builder = gallivm->builder;
972 struct lp_build_context bld;
973 unsigned n;
974 LLVMValueRef half;
975 LLVMValueRef ab;
976
977 assert(!wide_type.floating);
978 assert(lp_check_value(wide_type, a));
979 assert(lp_check_value(wide_type, b));
980
981 lp_build_context_init(&bld, gallivm, wide_type);
982
983 n = wide_type.width / 2;
984 if (wide_type.sign) {
985 --n;
986 }
987
988 /*
989 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
990 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
991 */
992
993 /*
994 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
995 */
996
997 ab = LLVMBuildMul(builder, a, b, "");
998 ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
999
1000 /*
1001 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
1002 */
1003
1004 half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
1005 if (wide_type.sign) {
1006 LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
1007 LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
1008 half = lp_build_select(&bld, sign, minus_half, half);
1009 }
1010 ab = LLVMBuildAdd(builder, ab, half, "");
1011
1012 /* Final division */
1013 ab = lp_build_shr_imm(&bld, ab, n);
1014
1015 return ab;
1016 }
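
/*
 * A minimal scalar sketch (illustration only, not used by this file) of the
 * unsigned 8 bit case of the approximation above, i.e. n = 8:
 *
 *   a*b / 255 ~= (a*b + (a*b >> 8) + 0x80) >> 8
 *
 * which is exact at the end points, e.g. 255*255 -> 255 and 0*b -> 0.
 */
static inline unsigned
lp_ref_mul_norm_u8(unsigned a, unsigned b)
{
   unsigned ab = a * b;              /* 16 bit intermediate product */
   ab = ab + (ab >> 8) + 0x80;       /* geometric series term plus rounding */
   return ab >> 8;                   /* divide by 256 instead of 255 */
}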
1017
1018 /**
1019 * Generate a * b
1020 */
1021 LLVMValueRef
1022 lp_build_mul(struct lp_build_context *bld,
1023 LLVMValueRef a,
1024 LLVMValueRef b)
1025 {
1026 LLVMBuilderRef builder = bld->gallivm->builder;
1027 const struct lp_type type = bld->type;
1028 LLVMValueRef shift;
1029 LLVMValueRef res;
1030
1031 assert(lp_check_value(type, a));
1032 assert(lp_check_value(type, b));
1033
1034 if(a == bld->zero)
1035 return bld->zero;
1036 if(a == bld->one)
1037 return b;
1038 if(b == bld->zero)
1039 return bld->zero;
1040 if(b == bld->one)
1041 return a;
1042 if(a == bld->undef || b == bld->undef)
1043 return bld->undef;
1044
1045 if (!type.floating && !type.fixed && type.norm) {
1046 struct lp_type wide_type = lp_wider_type(type);
1047 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
1048
1049 lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah);
1050 lp_build_unpack2_native(bld->gallivm, type, wide_type, b, &bl, &bh);
1051
1052 /* PMULLW, PSRLW, PADDW */
1053 abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
1054 abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
1055
1056 ab = lp_build_pack2_native(bld->gallivm, wide_type, type, abl, abh);
1057
1058 return ab;
1059 }
1060
1061 if(type.fixed)
1062 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
1063 else
1064 shift = NULL;
1065
1066 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1067 if (type.floating)
1068 res = LLVMConstFMul(a, b);
1069 else
1070 res = LLVMConstMul(a, b);
1071 if(shift) {
1072 if(type.sign)
1073 res = LLVMConstAShr(res, shift);
1074 else
1075 res = LLVMConstLShr(res, shift);
1076 }
1077 }
1078 else {
1079 if (type.floating)
1080 res = LLVMBuildFMul(builder, a, b, "");
1081 else
1082 res = LLVMBuildMul(builder, a, b, "");
1083 if(shift) {
1084 if(type.sign)
1085 res = LLVMBuildAShr(builder, res, shift, "");
1086 else
1087 res = LLVMBuildLShr(builder, res, shift, "");
1088 }
1089 }
1090
1091 return res;
1092 }
1093
1094 /*
1095 * Widening mul, valid for 32x32 bit -> 64bit only.
1096 * Result is low 32bits, high bits returned in res_hi.
1097 *
1098 * Emits code that is meant to be compiled for the host CPU.
1099 */
1100 LLVMValueRef
1101 lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
1102 LLVMValueRef a,
1103 LLVMValueRef b,
1104 LLVMValueRef *res_hi)
1105 {
1106 struct gallivm_state *gallivm = bld->gallivm;
1107 LLVMBuilderRef builder = gallivm->builder;
1108
1109 assert(bld->type.width == 32);
1110 assert(bld->type.floating == 0);
1111 assert(bld->type.fixed == 0);
1112 assert(bld->type.norm == 0);
1113
1114 /*
1115 * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces
1116 * for x86 simd is atrocious (even if the high bits weren't required),
1117 * trying to handle real 64bit inputs (which of course can't happen due
1118 * to using 64bit umul with 32bit numbers zero-extended to 64bit, but
1119 * apparently llvm does not recognize this widening mul). This includes 6
1120 * (instead of 2) pmuludq plus extra adds and shifts
1121 * The same story applies to signed mul, albeit fixing this requires sse41.
1122 * https://llvm.org/bugs/show_bug.cgi?id=30845
1123 * So, whip up our own code, albeit only for length 4 and 8 (which
1124 * should be good enough)...
1125 */
1126 if ((bld->type.length == 4 || bld->type.length == 8) &&
1127 ((util_cpu_caps.has_sse2 && (bld->type.sign == 0)) ||
1128 util_cpu_caps.has_sse4_1)) {
1129 const char *intrinsic = NULL;
1130 LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd;
1131 LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec;
1132 struct lp_type type_wide = lp_wider_type(bld->type);
1133 LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide);
1134 unsigned i;
1135 for (i = 0; i < bld->type.length; i += 2) {
1136 shuf[i] = lp_build_const_int32(gallivm, i+1);
1137 shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
1138 }
1139 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1140 aeven = a;
1141 beven = b;
1142 aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, "");
1143 bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, "");
1144
1145 if (util_cpu_caps.has_avx2 && bld->type.length == 8) {
1146 if (bld->type.sign) {
1147 intrinsic = "llvm.x86.avx2.pmul.dq";
1148 } else {
1149 intrinsic = "llvm.x86.avx2.pmulu.dq";
1150 }
1151 muleven = lp_build_intrinsic_binary(builder, intrinsic,
1152 wider_type, aeven, beven);
1153 mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1154 wider_type, aodd, bodd);
1155 }
1156 else {
1157 /* for consistent naming look elsewhere... */
1158 if (bld->type.sign) {
1159 intrinsic = "llvm.x86.sse41.pmuldq";
1160 } else {
1161 intrinsic = "llvm.x86.sse2.pmulu.dq";
1162 }
1163 /*
1164 * XXX If we only have AVX but not AVX2 this is a pain.
1165 * lp_build_intrinsic_binary_anylength() can't handle it
1166 * (due to src and dst type not being identical).
1167 */
1168 if (bld->type.length == 8) {
1169 LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi;
1170 LLVMValueRef aoddlo, aoddhi, boddlo, boddhi;
1171 LLVMValueRef muleven2[2], mulodd2[2];
1172 struct lp_type type_wide_half = type_wide;
1173 LLVMTypeRef wtype_half;
1174 type_wide_half.length = 2;
1175 wtype_half = lp_build_vec_type(gallivm, type_wide_half);
1176 aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4);
1177 aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4);
1178 bevenlo = lp_build_extract_range(gallivm, beven, 0, 4);
1179 bevenhi = lp_build_extract_range(gallivm, beven, 4, 4);
1180 aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4);
1181 aoddhi = lp_build_extract_range(gallivm, aodd, 4, 4);
1182 boddlo = lp_build_extract_range(gallivm, bodd, 0, 4);
1183 boddhi = lp_build_extract_range(gallivm, bodd, 4, 4);
1184 muleven2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1185 wtype_half, aevenlo, bevenlo);
1186 mulodd2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1187 wtype_half, aoddlo, boddlo);
1188 muleven2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1189 wtype_half, aevenhi, bevenhi);
1190 mulodd2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1191 wtype_half, aoddhi, boddhi);
1192 muleven = lp_build_concat(gallivm, muleven2, type_wide_half, 2);
1193 mulodd = lp_build_concat(gallivm, mulodd2, type_wide_half, 2);
1194
1195 }
1196 else {
1197 muleven = lp_build_intrinsic_binary(builder, intrinsic,
1198 wider_type, aeven, beven);
1199 mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1200 wider_type, aodd, bodd);
1201 }
1202 }
1203 muleven = LLVMBuildBitCast(builder, muleven, bld->vec_type, "");
1204 mulodd = LLVMBuildBitCast(builder, mulodd, bld->vec_type, "");
1205
1206 for (i = 0; i < bld->type.length; i += 2) {
1207 shuf[i] = lp_build_const_int32(gallivm, i + 1);
1208 shuf[i+1] = lp_build_const_int32(gallivm, i + 1 + bld->type.length);
1209 }
1210 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1211 *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1212
1213 for (i = 0; i < bld->type.length; i += 2) {
1214 shuf[i] = lp_build_const_int32(gallivm, i);
1215 shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length);
1216 }
1217 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1218 return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1219 }
1220 else {
1221 return lp_build_mul_32_lohi(bld, a, b, res_hi);
1222 }
1223 }
1224
1225
1226 /*
1227 * Widening mul, valid for 32x32 bit -> 64bit only.
1228 * Result is low 32bits, high bits returned in res_hi.
1229 *
1230 * Emits generic code.
1231 */
1232 LLVMValueRef
1233 lp_build_mul_32_lohi(struct lp_build_context *bld,
1234 LLVMValueRef a,
1235 LLVMValueRef b,
1236 LLVMValueRef *res_hi)
1237 {
1238 struct gallivm_state *gallivm = bld->gallivm;
1239 LLVMBuilderRef builder = gallivm->builder;
1240 LLVMValueRef tmp;
1241 struct lp_type type_tmp;
1242 LLVMTypeRef wide_type, cast_type;
1243
1244 type_tmp = bld->type;
1245 type_tmp.width *= 2;
1246 wide_type = lp_build_vec_type(gallivm, type_tmp);
1247 type_tmp = bld->type;
1248 type_tmp.length *= 2;
1249 cast_type = lp_build_vec_type(gallivm, type_tmp);
1250
1251 if (bld->type.sign) {
1252 a = LLVMBuildSExt(builder, a, wide_type, "");
1253 b = LLVMBuildSExt(builder, b, wide_type, "");
1254 } else {
1255 a = LLVMBuildZExt(builder, a, wide_type, "");
1256 b = LLVMBuildZExt(builder, b, wide_type, "");
1257 }
1258 tmp = LLVMBuildMul(builder, a, b, "");
1259 tmp = LLVMBuildBitCast(builder, tmp, cast_type, "");
1260 *res_hi = lp_build_uninterleave1(gallivm, bld->type.length * 2, tmp, 1);
1261 return lp_build_uninterleave1(gallivm, bld->type.length * 2, tmp, 0);
1262 }
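
/*
 * Scalar sketch (illustration only, not used by this file) of what both
 * lohi variants above compute per element, for the unsigned case and
 * assuming a 32 bit unsigned type: the full 64 bit product split in halves.
 */
static inline unsigned
lp_ref_mul_32_lohi_u32(unsigned a, unsigned b, unsigned *res_hi)
{
   unsigned long long ab = (unsigned long long)a * b;   /* widening multiply */
   *res_hi = (unsigned)(ab >> 32);                      /* high 32 bits */
   return (unsigned)ab;                                 /* low 32 bits */
}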
1263
1264
1265 /* a * b + c */
1266 LLVMValueRef
1267 lp_build_mad(struct lp_build_context *bld,
1268 LLVMValueRef a,
1269 LLVMValueRef b,
1270 LLVMValueRef c)
1271 {
1272 const struct lp_type type = bld->type;
1273 if (type.floating) {
1274 return lp_build_fmuladd(bld->gallivm->builder, a, b, c);
1275 } else {
1276 return lp_build_add(bld, lp_build_mul(bld, a, b), c);
1277 }
1278 }
1279
1280
1281 /**
1282 * Small vector x scale multiplication optimization.
1283 */
1284 LLVMValueRef
1285 lp_build_mul_imm(struct lp_build_context *bld,
1286 LLVMValueRef a,
1287 int b)
1288 {
1289 LLVMBuilderRef builder = bld->gallivm->builder;
1290 LLVMValueRef factor;
1291
1292 assert(lp_check_value(bld->type, a));
1293
1294 if(b == 0)
1295 return bld->zero;
1296
1297 if(b == 1)
1298 return a;
1299
1300 if(b == -1)
1301 return lp_build_negate(bld, a);
1302
1303 if(b == 2 && bld->type.floating)
1304 return lp_build_add(bld, a, a);
1305
1306 if(util_is_power_of_two(b)) {
1307 unsigned shift = ffs(b) - 1;
1308
1309 if(bld->type.floating) {
1310 #if 0
1311 /*
1312 * Power of two multiplication by directly manipulating the exponent.
1313 *
1314 * XXX: This might not be always faster, it will introduce a small error
1315 * for multiplication by zero, and it will produce wrong results
1316 * for Inf and NaN.
1317 */
1318 unsigned mantissa = lp_mantissa(bld->type);
1319 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1320 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1321 a = LLVMBuildAdd(builder, a, factor, "");
1322 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1323 return a;
1324 #endif
1325 }
1326 else {
1327 factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1328 return LLVMBuildShl(builder, a, factor, "");
1329 }
1330 }
1331
1332 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1333 return lp_build_mul(bld, a, factor);
1334 }
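
/*
 * Scalar sketch (illustration only, not used by this file) of the disabled
 * exponent trick above: for a finite, non-zero, non-denormal float,
 * multiplying by 2**shift is the same as adding shift to the biased
 * exponent field of the IEEE bit pattern (assumes 32 bit unsigned).
 */
static inline float
lp_ref_mul_pot_float(float a, unsigned shift)
{
   union { float f; unsigned u; } v;

   v.f = a;
   v.u += shift << 23;     /* 23 mantissa bits in a single precision float */
   return v.f;
}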
1335
1336
1337 /**
1338 * Generate a / b
1339 */
1340 LLVMValueRef
1341 lp_build_div(struct lp_build_context *bld,
1342 LLVMValueRef a,
1343 LLVMValueRef b)
1344 {
1345 LLVMBuilderRef builder = bld->gallivm->builder;
1346 const struct lp_type type = bld->type;
1347
1348 assert(lp_check_value(type, a));
1349 assert(lp_check_value(type, b));
1350
1351 if(a == bld->zero)
1352 return bld->zero;
1353 if(a == bld->one && type.floating)
1354 return lp_build_rcp(bld, b);
1355 if(b == bld->zero)
1356 return bld->undef;
1357 if(b == bld->one)
1358 return a;
1359 if(a == bld->undef || b == bld->undef)
1360 return bld->undef;
1361
1362 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1363 if (type.floating)
1364 return LLVMConstFDiv(a, b);
1365 else if (type.sign)
1366 return LLVMConstSDiv(a, b);
1367 else
1368 return LLVMConstUDiv(a, b);
1369 }
1370
1371 if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1372 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
1373 type.floating)
1374 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1375
1376 if (type.floating)
1377 return LLVMBuildFDiv(builder, a, b, "");
1378 else if (type.sign)
1379 return LLVMBuildSDiv(builder, a, b, "");
1380 else
1381 return LLVMBuildUDiv(builder, a, b, "");
1382 }
1383
1384
1385 /**
1386 * Linear interpolation helper.
1387 *
1388 * @param flags LP_BLD_LERP_* flags; LP_BLD_LERP_WIDE_NORMALIZED means we are
1389 * interpolating normalized values encoded in integers twice as wide.
1390 *
1391 * @sa http://www.stereopsis.com/doubleblend.html
1392 */
1393 static inline LLVMValueRef
1394 lp_build_lerp_simple(struct lp_build_context *bld,
1395 LLVMValueRef x,
1396 LLVMValueRef v0,
1397 LLVMValueRef v1,
1398 unsigned flags)
1399 {
1400 unsigned half_width = bld->type.width/2;
1401 LLVMBuilderRef builder = bld->gallivm->builder;
1402 LLVMValueRef delta;
1403 LLVMValueRef res;
1404
1405 assert(lp_check_value(bld->type, x));
1406 assert(lp_check_value(bld->type, v0));
1407 assert(lp_check_value(bld->type, v1));
1408
1409 delta = lp_build_sub(bld, v1, v0);
1410
1411 if (bld->type.floating) {
1412 assert(flags == 0);
1413 return lp_build_mad(bld, x, delta, v0);
1414 }
1415
1416 if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1417 if (!bld->type.sign) {
1418 if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1419 /*
1420 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1421 * most-significant-bit to the lowest-significant-bit, so that
1422 * later we can just divide by 2**n instead of 2**n - 1.
1423 */
1424
1425 x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1426 }
1427
1428 /* (x * delta) >> n */
1429 res = lp_build_mul(bld, x, delta);
1430 res = lp_build_shr_imm(bld, res, half_width);
1431 } else {
1432 /*
1433 * The rescaling trick above doesn't work for signed numbers, so
1434 * use the 2**n - 1 division approximation in lp_build_mul_norm
1435 * instead.
1436 */
1437 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1438 res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1439 }
1440 } else {
1441 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1442 res = lp_build_mul(bld, x, delta);
1443 }
1444
1445 if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
1446 /*
1447 * At this point both res and v0 only use the lower half of the bits,
1448 * the rest is zero. Instead of add / mask, do add with half wide type.
1449 */
1450 struct lp_type narrow_type;
1451 struct lp_build_context narrow_bld;
1452
1453 memset(&narrow_type, 0, sizeof narrow_type);
1454 narrow_type.sign = bld->type.sign;
1455 narrow_type.width = bld->type.width/2;
1456 narrow_type.length = bld->type.length*2;
1457
1458 lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
1459 res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
1460 v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
1461 res = lp_build_add(&narrow_bld, v0, res);
1462 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
1463 } else {
1464 res = lp_build_add(bld, v0, res);
1465
1466 if (bld->type.fixed) {
1467 /*
1468 * We need to mask out the high order bits when lerping 8bit
1469 * normalized colors stored in 16 bits
1470 */
1471 /* XXX: This step is necessary for lerping 8bit colors stored in
1472 * 16 bits, but it will be wrong for true fixed point use cases.
1473 * Basically we need a more powerful lp_type, capable of further
1474 * distinguishing the values interpretation from the value storage.
1475 */
1476 LLVMValueRef low_bits;
1477 low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
1478 res = LLVMBuildAnd(builder, res, low_bits, "");
1479 }
1480 }
1481
1482 return res;
1483 }
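
/*
 * A minimal scalar sketch (illustration only, not used by this file) of the
 * unsigned LP_BLD_LERP_WIDE_NORMALIZED path above, for 8 bit values lerped
 * in 16 bit arithmetic: the weight is rescaled from [0, 255] to [0, 256] so
 * that the division by 255 becomes a shift by 8.
 */
static inline unsigned
lp_ref_lerp_norm_u8(unsigned x, unsigned v0, unsigned v1)
{
   unsigned delta = (v1 - v0) & 0xffff;     /* two's complement delta in the wide type */
   x = x + (x >> 7);                        /* [0, 255] -> [0, 256] */
   return (v0 + ((x * delta) >> 8)) & 0xff; /* v0 + x * (v1 - v0), 8 bit result */
}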
1484
1485
1486 /**
1487 * Linear interpolation.
1488 */
1489 LLVMValueRef
1490 lp_build_lerp(struct lp_build_context *bld,
1491 LLVMValueRef x,
1492 LLVMValueRef v0,
1493 LLVMValueRef v1,
1494 unsigned flags)
1495 {
1496 const struct lp_type type = bld->type;
1497 LLVMValueRef res;
1498
1499 assert(lp_check_value(type, x));
1500 assert(lp_check_value(type, v0));
1501 assert(lp_check_value(type, v1));
1502
1503 assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1504
1505 if (type.norm) {
1506 struct lp_type wide_type;
1507 struct lp_build_context wide_bld;
1508 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1509
1510 assert(type.length >= 2);
1511
1512 /*
1513 * Create a wider integer type, enough to hold the
1514 * intermediate result of the multiplication.
1515 */
1516 memset(&wide_type, 0, sizeof wide_type);
1517 wide_type.sign = type.sign;
1518 wide_type.width = type.width*2;
1519 wide_type.length = type.length/2;
1520
1521 lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1522
1523 lp_build_unpack2_native(bld->gallivm, type, wide_type, x, &xl, &xh);
1524 lp_build_unpack2_native(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1525 lp_build_unpack2_native(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1526
1527 /*
1528 * Lerp both halves.
1529 */
1530
1531 flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1532
1533 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1534 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1535
1536 res = lp_build_pack2_native(bld->gallivm, wide_type, type, resl, resh);
1537 } else {
1538 res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1539 }
1540
1541 return res;
1542 }
1543
1544
1545 /**
1546 * Bilinear interpolation.
1547 *
1548 * Value indices are in v_{yx}.
1549 */
1550 LLVMValueRef
1551 lp_build_lerp_2d(struct lp_build_context *bld,
1552 LLVMValueRef x,
1553 LLVMValueRef y,
1554 LLVMValueRef v00,
1555 LLVMValueRef v01,
1556 LLVMValueRef v10,
1557 LLVMValueRef v11,
1558 unsigned flags)
1559 {
1560 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1561 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1562 return lp_build_lerp(bld, y, v0, v1, flags);
1563 }
1564
1565
1566 LLVMValueRef
1567 lp_build_lerp_3d(struct lp_build_context *bld,
1568 LLVMValueRef x,
1569 LLVMValueRef y,
1570 LLVMValueRef z,
1571 LLVMValueRef v000,
1572 LLVMValueRef v001,
1573 LLVMValueRef v010,
1574 LLVMValueRef v011,
1575 LLVMValueRef v100,
1576 LLVMValueRef v101,
1577 LLVMValueRef v110,
1578 LLVMValueRef v111,
1579 unsigned flags)
1580 {
1581 LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1582 LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1583 return lp_build_lerp(bld, z, v0, v1, flags);
1584 }
1585
1586
1587 /**
1588 * Generate min(a, b)
1589 * Do checks for special cases but not for nans.
1590 */
1591 LLVMValueRef
1592 lp_build_min(struct lp_build_context *bld,
1593 LLVMValueRef a,
1594 LLVMValueRef b)
1595 {
1596 assert(lp_check_value(bld->type, a));
1597 assert(lp_check_value(bld->type, b));
1598
1599 if(a == bld->undef || b == bld->undef)
1600 return bld->undef;
1601
1602 if(a == b)
1603 return a;
1604
1605 if (bld->type.norm) {
1606 if (!bld->type.sign) {
1607 if (a == bld->zero || b == bld->zero) {
1608 return bld->zero;
1609 }
1610 }
1611 if(a == bld->one)
1612 return b;
1613 if(b == bld->one)
1614 return a;
1615 }
1616
1617 return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1618 }
1619
1620
1621 /**
1622 * Generate min(a, b)
1623 * NaN's are handled according to the behavior specified by the
1624 * nan_behavior argument.
1625 */
1626 LLVMValueRef
1627 lp_build_min_ext(struct lp_build_context *bld,
1628 LLVMValueRef a,
1629 LLVMValueRef b,
1630 enum gallivm_nan_behavior nan_behavior)
1631 {
1632 assert(lp_check_value(bld->type, a));
1633 assert(lp_check_value(bld->type, b));
1634
1635 if(a == bld->undef || b == bld->undef)
1636 return bld->undef;
1637
1638 if(a == b)
1639 return a;
1640
1641 if (bld->type.norm) {
1642 if (!bld->type.sign) {
1643 if (a == bld->zero || b == bld->zero) {
1644 return bld->zero;
1645 }
1646 }
1647 if(a == bld->one)
1648 return b;
1649 if(b == bld->one)
1650 return a;
1651 }
1652
1653 return lp_build_min_simple(bld, a, b, nan_behavior);
1654 }
1655
1656 /**
1657 * Generate max(a, b)
1658 * Do checks for special cases, but NaN behavior is undefined.
1659 */
1660 LLVMValueRef
1661 lp_build_max(struct lp_build_context *bld,
1662 LLVMValueRef a,
1663 LLVMValueRef b)
1664 {
1665 assert(lp_check_value(bld->type, a));
1666 assert(lp_check_value(bld->type, b));
1667
1668 if(a == bld->undef || b == bld->undef)
1669 return bld->undef;
1670
1671 if(a == b)
1672 return a;
1673
1674 if(bld->type.norm) {
1675 if(a == bld->one || b == bld->one)
1676 return bld->one;
1677 if (!bld->type.sign) {
1678 if (a == bld->zero) {
1679 return b;
1680 }
1681 if (b == bld->zero) {
1682 return a;
1683 }
1684 }
1685 }
1686
1687 return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1688 }
1689
1690
1691 /**
1692 * Generate max(a, b)
1693 * Checks for special cases.
1694 * NaN's are handled according to the behavior specified by the
1695 * nan_behavior argument.
1696 */
1697 LLVMValueRef
1698 lp_build_max_ext(struct lp_build_context *bld,
1699 LLVMValueRef a,
1700 LLVMValueRef b,
1701 enum gallivm_nan_behavior nan_behavior)
1702 {
1703 assert(lp_check_value(bld->type, a));
1704 assert(lp_check_value(bld->type, b));
1705
1706 if(a == bld->undef || b == bld->undef)
1707 return bld->undef;
1708
1709 if(a == b)
1710 return a;
1711
1712 if(bld->type.norm) {
1713 if(a == bld->one || b == bld->one)
1714 return bld->one;
1715 if (!bld->type.sign) {
1716 if (a == bld->zero) {
1717 return b;
1718 }
1719 if (b == bld->zero) {
1720 return a;
1721 }
1722 }
1723 }
1724
1725 return lp_build_max_simple(bld, a, b, nan_behavior);
1726 }
1727
1728 /**
1729 * Generate clamp(a, min, max)
1730 * NaN behavior (for any of a, min, max) is undefined.
1731 * Do checks for special cases.
1732 */
1733 LLVMValueRef
1734 lp_build_clamp(struct lp_build_context *bld,
1735 LLVMValueRef a,
1736 LLVMValueRef min,
1737 LLVMValueRef max)
1738 {
1739 assert(lp_check_value(bld->type, a));
1740 assert(lp_check_value(bld->type, min));
1741 assert(lp_check_value(bld->type, max));
1742
1743 a = lp_build_min(bld, a, max);
1744 a = lp_build_max(bld, a, min);
1745 return a;
1746 }
1747
1748
1749 /**
1750 * Generate clamp(a, 0, 1)
1751 * A NaN will get converted to zero.
1752 */
1753 LLVMValueRef
1754 lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
1755 LLVMValueRef a)
1756 {
1757 a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1758 a = lp_build_min(bld, a, bld->one);
1759 return a;
1760 }
1761
1762
1763 /**
1764 * Generate abs(a)
1765 */
1766 LLVMValueRef
1767 lp_build_abs(struct lp_build_context *bld,
1768 LLVMValueRef a)
1769 {
1770 LLVMBuilderRef builder = bld->gallivm->builder;
1771 const struct lp_type type = bld->type;
1772 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1773
1774 assert(lp_check_value(type, a));
1775
1776 if(!type.sign)
1777 return a;
1778
1779 if(type.floating) {
1780 if (0x0306 <= HAVE_LLVM && HAVE_LLVM < 0x0309) {
1781 /* Workaround llvm.org/PR27332 */
1782 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1783 unsigned long long absMask = ~(1ULL << (type.width - 1));
1784 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
1785 a = LLVMBuildBitCast(builder, a, int_vec_type, "");
1786 a = LLVMBuildAnd(builder, a, mask, "");
1787 a = LLVMBuildBitCast(builder, a, vec_type, "");
1788 return a;
1789 } else {
1790 char intrinsic[32];
1791 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
1792 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
1793 }
1794 }
1795
1796 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
1797 switch(type.width) {
1798 case 8:
1799 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1800 case 16:
1801 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1802 case 32:
1803 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1804 }
1805 }
1806 else if (type.width*type.length == 256 && util_cpu_caps.has_avx2) {
1807 switch(type.width) {
1808 case 8:
1809 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a);
1810 case 16:
1811 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.w", vec_type, a);
1812 case 32:
1813 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.d", vec_type, a);
1814 }
1815 }
1816 else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
1817 (gallivm_debug & GALLIVM_DEBUG_PERF) &&
1818 (type.width == 8 || type.width == 16 || type.width == 32)) {
1819 debug_printf("%s: inefficient code, should split vectors manually\n",
1820 __FUNCTION__);
1821 }
1822
1823 return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
1824 }
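
/*
 * Scalar sketch (illustration only, not used by this file) of the sign mask
 * workaround above: clearing the top bit of the IEEE bit pattern yields
 * fabs() for every input, including Inf and NaN (assumes 32 bit unsigned).
 */
static inline float
lp_ref_abs_float(float a)
{
   union { float f; unsigned u; } v;

   v.f = a;
   v.u &= ~(1u << 31);     /* clear the sign bit */
   return v.f;
}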
1825
1826
1827 LLVMValueRef
1828 lp_build_negate(struct lp_build_context *bld,
1829 LLVMValueRef a)
1830 {
1831 LLVMBuilderRef builder = bld->gallivm->builder;
1832
1833 assert(lp_check_value(bld->type, a));
1834
1835 if (bld->type.floating)
1836 a = LLVMBuildFNeg(builder, a, "");
1837 else
1838 a = LLVMBuildNeg(builder, a, "");
1839
1840 return a;
1841 }
1842
1843
1844 /** Return -1, 0 or +1 depending on the sign of a */
1845 LLVMValueRef
1846 lp_build_sgn(struct lp_build_context *bld,
1847 LLVMValueRef a)
1848 {
1849 LLVMBuilderRef builder = bld->gallivm->builder;
1850 const struct lp_type type = bld->type;
1851 LLVMValueRef cond;
1852 LLVMValueRef res;
1853
1854 assert(lp_check_value(type, a));
1855
1856 /* Handle non-zero case */
1857 if(!type.sign) {
1858 /* if not zero then sign must be positive */
1859 res = bld->one;
1860 }
1861 else if(type.floating) {
1862 LLVMTypeRef vec_type;
1863 LLVMTypeRef int_type;
1864 LLVMValueRef mask;
1865 LLVMValueRef sign;
1866 LLVMValueRef one;
1867 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1868
1869 int_type = lp_build_int_vec_type(bld->gallivm, type);
1870 vec_type = lp_build_vec_type(bld->gallivm, type);
1871 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1872
1873 /* Take the sign bit and or it into the 1.0 constant */
1874 sign = LLVMBuildBitCast(builder, a, int_type, "");
1875 sign = LLVMBuildAnd(builder, sign, mask, "");
1876 one = LLVMConstBitCast(bld->one, int_type);
1877 res = LLVMBuildOr(builder, sign, one, "");
1878 res = LLVMBuildBitCast(builder, res, vec_type, "");
1879 }
1880 else
1881 {
1882 /* signed int/norm/fixed point */
1883 /* could use psign with sse3 and appropriate vectors here */
1884 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1885 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1886 res = lp_build_select(bld, cond, bld->one, minus_one);
1887 }
1888
1889 /* Handle zero */
1890 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1891 res = lp_build_select(bld, cond, bld->zero, res);
1892
1893 return res;
1894 }
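
/*
 * Illustrative scalar sketch (not part of gallivm): the same sign-bit trick
 * the floating-point branch above emits, written for a single float. The
 * helper name is hypothetical; assumes <stdint.h> and <string.h>.
 */
static inline float
sgn_float_example(float a)
{
   uint32_t bits, res_bits;
   float one = 1.0f, res;

   memcpy(&bits, &a, sizeof bits);          /* reinterpret float as int */
   memcpy(&res_bits, &one, sizeof res_bits);
   res_bits |= bits & 0x80000000u;          /* copy a's sign bit onto 1.0 */
   memcpy(&res, &res_bits, sizeof res);
   return a == 0.0f ? 0.0f : res;           /* zero handled by the select */
}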
1895
1896
1897 /**
1898 * Set the sign of float vector 'a' according to 'sign'.
1899 * If sign==0, return abs(a).
1900 * If sign==1, return -abs(a);
1901 * Other values for sign produce undefined results.
1902 */
1903 LLVMValueRef
1904 lp_build_set_sign(struct lp_build_context *bld,
1905 LLVMValueRef a, LLVMValueRef sign)
1906 {
1907 LLVMBuilderRef builder = bld->gallivm->builder;
1908 const struct lp_type type = bld->type;
1909 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1910 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1911 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1912 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1913 ~((unsigned long long) 1 << (type.width - 1)));
1914 LLVMValueRef val, res;
1915
1916 assert(type.floating);
1917 assert(lp_check_value(type, a));
1918
1919 /* val = reinterpret_cast<int>(a) */
1920 val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1921 /* val = val & mask */
1922 val = LLVMBuildAnd(builder, val, mask, "");
1923 /* sign = sign << shift */
1924 sign = LLVMBuildShl(builder, sign, shift, "");
1925 /* res = val | sign */
1926 res = LLVMBuildOr(builder, val, sign, "");
1927 /* res = reinterpret_cast<float>(res) */
1928 res = LLVMBuildBitCast(builder, res, vec_type, "");
1929
1930 return res;
1931 }
1932
1933
1934 /**
1935 * Convert vector of (or scalar) int to vector of (or scalar) float.
1936 */
1937 LLVMValueRef
1938 lp_build_int_to_float(struct lp_build_context *bld,
1939 LLVMValueRef a)
1940 {
1941 LLVMBuilderRef builder = bld->gallivm->builder;
1942 const struct lp_type type = bld->type;
1943 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1944
1945 assert(type.floating);
1946
1947 return LLVMBuildSIToFP(builder, a, vec_type, "");
1948 }
1949
1950 static boolean
1951 arch_rounding_available(const struct lp_type type)
1952 {
1953 if ((util_cpu_caps.has_sse4_1 &&
1954 (type.length == 1 || type.width*type.length == 128)) ||
1955 (util_cpu_caps.has_avx && type.width*type.length == 256))
1956 return TRUE;
1957 else if ((util_cpu_caps.has_altivec &&
1958 (type.width == 32 && type.length == 4)))
1959 return TRUE;
1960
1961 return FALSE;
1962 }
1963
1964 enum lp_build_round_mode
1965 {
1966 LP_BUILD_ROUND_NEAREST = 0,
1967 LP_BUILD_ROUND_FLOOR = 1,
1968 LP_BUILD_ROUND_CEIL = 2,
1969 LP_BUILD_ROUND_TRUNCATE = 3
1970 };
1971
1972 static inline LLVMValueRef
1973 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1974 LLVMValueRef a)
1975 {
1976 LLVMBuilderRef builder = bld->gallivm->builder;
1977 const struct lp_type type = bld->type;
1978 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1979 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1980 const char *intrinsic;
1981 LLVMValueRef res;
1982
1983 assert(type.floating);
1984 /* using the double precision conversions is a bit more complicated */
1985 assert(type.width == 32);
1986
1987 assert(lp_check_value(type, a));
1988 assert(util_cpu_caps.has_sse2);
1989
1990 /* This is relying on MXCSR rounding mode, which should always be nearest. */
1991 if (type.length == 1) {
1992 LLVMTypeRef vec_type;
1993 LLVMValueRef undef;
1994 LLVMValueRef arg;
1995 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1996
1997 vec_type = LLVMVectorType(bld->elem_type, 4);
1998
1999 intrinsic = "llvm.x86.sse.cvtss2si";
2000
2001 undef = LLVMGetUndef(vec_type);
2002
2003 arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
2004
2005 res = lp_build_intrinsic_unary(builder, intrinsic,
2006 ret_type, arg);
2007 }
2008 else {
2009 if (type.width* type.length == 128) {
2010 intrinsic = "llvm.x86.sse2.cvtps2dq";
2011 }
2012 else {
2013 assert(type.width*type.length == 256);
2014 assert(util_cpu_caps.has_avx);
2015
2016 intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
2017 }
2018 res = lp_build_intrinsic_unary(builder, intrinsic,
2019 ret_type, a);
2020 }
2021
2022 return res;
2023 }
2024
2025
2026 /* Round a (vector) float with the AltiVec vrfi* instruction
2027  * matching the requested rounding mode. */
2028 static inline LLVMValueRef
2029 lp_build_round_altivec(struct lp_build_context *bld,
2030 LLVMValueRef a,
2031 enum lp_build_round_mode mode)
2032 {
2033 LLVMBuilderRef builder = bld->gallivm->builder;
2034 const struct lp_type type = bld->type;
2035 const char *intrinsic = NULL;
2036
2037 assert(type.floating);
2038
2039 assert(lp_check_value(type, a));
2040 assert(util_cpu_caps.has_altivec);
2041
2042 (void)type;
2043
2044 switch (mode) {
2045 case LP_BUILD_ROUND_NEAREST:
2046 intrinsic = "llvm.ppc.altivec.vrfin";
2047 break;
2048 case LP_BUILD_ROUND_FLOOR:
2049 intrinsic = "llvm.ppc.altivec.vrfim";
2050 break;
2051 case LP_BUILD_ROUND_CEIL:
2052 intrinsic = "llvm.ppc.altivec.vrfip";
2053 break;
2054 case LP_BUILD_ROUND_TRUNCATE:
2055 intrinsic = "llvm.ppc.altivec.vrfiz";
2056 break;
2057 }
2058
2059 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2060 }
2061
2062 static inline LLVMValueRef
2063 lp_build_round_arch(struct lp_build_context *bld,
2064 LLVMValueRef a,
2065 enum lp_build_round_mode mode)
2066 {
2067 if (util_cpu_caps.has_sse4_1) {
2068 LLVMBuilderRef builder = bld->gallivm->builder;
2069 const struct lp_type type = bld->type;
2070 const char *intrinsic_root;
2071 char intrinsic[32];
2072
2073 assert(type.floating);
2074 assert(lp_check_value(type, a));
2075 (void)type;
2076
2077 switch (mode) {
2078 case LP_BUILD_ROUND_NEAREST:
2079 intrinsic_root = "llvm.nearbyint";
2080 break;
2081 case LP_BUILD_ROUND_FLOOR:
2082 intrinsic_root = "llvm.floor";
2083 break;
2084 case LP_BUILD_ROUND_CEIL:
2085 intrinsic_root = "llvm.ceil";
2086 break;
2087 case LP_BUILD_ROUND_TRUNCATE:
2088 intrinsic_root = "llvm.trunc";
2089 break;
2090 }
2091
2092 lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type);
2093 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2094 }
2095 else /* (util_cpu_caps.has_altivec) */
2096 return lp_build_round_altivec(bld, a, mode);
2097 }
2098
2099 /**
2100 * Return the integer part of a float (vector) value (== round toward zero).
2101 * The returned value is a float (vector).
2102 * Ex: trunc(-1.5) = -1.0
2103 */
2104 LLVMValueRef
2105 lp_build_trunc(struct lp_build_context *bld,
2106 LLVMValueRef a)
2107 {
2108 LLVMBuilderRef builder = bld->gallivm->builder;
2109 const struct lp_type type = bld->type;
2110
2111 assert(type.floating);
2112 assert(lp_check_value(type, a));
2113
2114 if (arch_rounding_available(type)) {
2115 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
2116 }
2117 else {
2118 const struct lp_type type = bld->type;
2119 struct lp_type inttype;
2120 struct lp_build_context intbld;
2121 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2122 LLVMValueRef trunc, res, anosign, mask;
2123 LLVMTypeRef int_vec_type = bld->int_vec_type;
2124 LLVMTypeRef vec_type = bld->vec_type;
2125
2126 assert(type.width == 32); /* might want to handle doubles at some point */
2127
2128 inttype = type;
2129 inttype.floating = 0;
2130 lp_build_context_init(&intbld, bld->gallivm, inttype);
2131
2132 /* round by truncation */
2133 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2134 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2135
2136 /* mask out sign bit */
2137 anosign = lp_build_abs(bld, a);
2138 /*
2139 * mask out all values if anosign > 2^24
2140 * This should work both for large ints (rounding is a no-op for them
2141 * because such floats are always exact) as well as special cases like
2142 * NaNs, Infs (taking advantage of the fact they use max exponent).
2143 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2144 */
2145 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2146 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2147 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2148 return lp_build_select(bld, mask, a, res);
2149 }
2150 }
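
/*
 * Illustrative scalar equivalent (not part of gallivm) of the fallback path
 * above: truncate via an int round-trip, but keep the original value when
 * |a| is not below 2^24, i.e. for already-exact large values, NaNs and Infs.
 * The helper name is hypothetical; assumes <math.h>.
 */
static inline float
trunc_emulated_example(float a)
{
   if (!(fabsf(a) < 16777216.0f))
      return a;                  /* large, Inf or NaN: rounding is a no-op */
   return (float)(int)a;         /* FPToSI followed by SIToFP, as in the IR */
}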
2151
2152
2153 /**
2154 * Return float (vector) rounded to nearest integer (vector). The returned
2155 * value is a float (vector).
2156 * Ex: round(0.9) = 1.0
2157 * Ex: round(-1.5) = -2.0
2158 */
2159 LLVMValueRef
2160 lp_build_round(struct lp_build_context *bld,
2161 LLVMValueRef a)
2162 {
2163 LLVMBuilderRef builder = bld->gallivm->builder;
2164 const struct lp_type type = bld->type;
2165
2166 assert(type.floating);
2167 assert(lp_check_value(type, a));
2168
2169 if (arch_rounding_available(type)) {
2170 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2171 }
2172 else {
2173 const struct lp_type type = bld->type;
2174 struct lp_type inttype;
2175 struct lp_build_context intbld;
2176 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2177 LLVMValueRef res, anosign, mask;
2178 LLVMTypeRef int_vec_type = bld->int_vec_type;
2179 LLVMTypeRef vec_type = bld->vec_type;
2180
2181 assert(type.width == 32); /* might want to handle doubles at some point */
2182
2183 inttype = type;
2184 inttype.floating = 0;
2185 lp_build_context_init(&intbld, bld->gallivm, inttype);
2186
2187 res = lp_build_iround(bld, a);
2188 res = LLVMBuildSIToFP(builder, res, vec_type, "");
2189
2190 /* mask out sign bit */
2191 anosign = lp_build_abs(bld, a);
2192 /*
2193 * mask out all values if anosign > 2^24
2194 * This should work both for large ints (rounding is a no-op for them
2195 * because such floats are always exact) as well as special cases like
2196 * NaNs, Infs (taking advantage of the fact they use max exponent).
2197 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2198 */
2199 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2200 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2201 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2202 return lp_build_select(bld, mask, a, res);
2203 }
2204 }
2205
2206
2207 /**
2208 * Return floor of float (vector), result is a float (vector)
2209 * Ex: floor(1.1) = 1.0
2210 * Ex: floor(-1.1) = -2.0
2211 */
2212 LLVMValueRef
2213 lp_build_floor(struct lp_build_context *bld,
2214 LLVMValueRef a)
2215 {
2216 LLVMBuilderRef builder = bld->gallivm->builder;
2217 const struct lp_type type = bld->type;
2218
2219 assert(type.floating);
2220 assert(lp_check_value(type, a));
2221
2222 if (arch_rounding_available(type)) {
2223 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2224 }
2225 else {
2226 const struct lp_type type = bld->type;
2227 struct lp_type inttype;
2228 struct lp_build_context intbld;
2229 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2230 LLVMValueRef trunc, res, anosign, mask;
2231 LLVMTypeRef int_vec_type = bld->int_vec_type;
2232 LLVMTypeRef vec_type = bld->vec_type;
2233
2234 if (type.width != 32) {
2235 char intrinsic[32];
2236 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type);
2237 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2238 }
2239
2240 assert(type.width == 32); /* might want to handle doubles at some point */
2241
2242 inttype = type;
2243 inttype.floating = 0;
2244 lp_build_context_init(&intbld, bld->gallivm, inttype);
2245
2246 /* round by truncation */
2247 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2248 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2249
2250 if (type.sign) {
2251 LLVMValueRef tmp;
2252
2253 /*
2254 * fix values if rounding is wrong (for non-special cases)
2255 * - this is the case if trunc > a
2256 */
2257 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
2258 /* tmp = trunc > a ? 1.0 : 0.0 */
2259 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2260 tmp = lp_build_and(&intbld, mask, tmp);
2261 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2262 res = lp_build_sub(bld, res, tmp);
2263 }
2264
2265 /* mask out sign bit */
2266 anosign = lp_build_abs(bld, a);
2267 /*
2268 * mask out all values if anosign > 2^24
2269 * This should work both for large ints (rounding is a no-op for them
2270 * because such floats are always exact) as well as special cases like
2271 * NaNs, Infs (taking advantage of the fact they use max exponent).
2272 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2273 */
2274 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2275 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2276 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2277 return lp_build_select(bld, mask, a, res);
2278 }
2279 }
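
/*
 * Illustrative scalar sketch (not part of gallivm) of the signed fix-up
 * above: truncation rounds toward zero, so for negative non-integral inputs
 * the result is one too large and 1.0 has to be subtracted. The helper name
 * is hypothetical; assumes <math.h> and uses the same 2^24 guard as above.
 */
static inline float
floor_emulated_example(float a)
{
   float res;

   if (!(fabsf(a) < 16777216.0f))
      return a;                  /* large, Inf or NaN: keep as is */
   res = (float)(int)a;          /* round by truncation */
   if (res > a)
      res -= 1.0f;               /* e.g. floor(-1.1): trunc gives -1.0 -> -2.0 */
   return res;
}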
2280
2281
2282 /**
2283 * Return ceiling of float (vector), returning float (vector).
2284 * Ex: ceil( 1.1) = 2.0
2285 * Ex: ceil(-1.1) = -1.0
2286 */
2287 LLVMValueRef
2288 lp_build_ceil(struct lp_build_context *bld,
2289 LLVMValueRef a)
2290 {
2291 LLVMBuilderRef builder = bld->gallivm->builder;
2292 const struct lp_type type = bld->type;
2293
2294 assert(type.floating);
2295 assert(lp_check_value(type, a));
2296
2297 if (arch_rounding_available(type)) {
2298 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2299 }
2300 else {
2301 const struct lp_type type = bld->type;
2302 struct lp_type inttype;
2303 struct lp_build_context intbld;
2304 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2305 LLVMValueRef trunc, res, anosign, mask, tmp;
2306 LLVMTypeRef int_vec_type = bld->int_vec_type;
2307 LLVMTypeRef vec_type = bld->vec_type;
2308
2309 if (type.width != 32) {
2310 char intrinsic[32];
2311 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type);
2312 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2313 }
2314
2315 assert(type.width == 32); /* might want to handle doubles at some point */
2316
2317 inttype = type;
2318 inttype.floating = 0;
2319 lp_build_context_init(&intbld, bld->gallivm, inttype);
2320
2321 /* round by truncation */
2322 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2323 trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2324
2325 /*
2326 * fix values if rounding is wrong (for non-special cases)
2327 * - this is the case if trunc < a
2328 */
2329 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2330 /* tmp = trunc < a ? 1.0 : 0.0 */
2331 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2332 tmp = lp_build_and(&intbld, mask, tmp);
2333 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2334 res = lp_build_add(bld, trunc, tmp);
2335
2336 /* mask out sign bit */
2337 anosign = lp_build_abs(bld, a);
2338 /*
2339 * mask out all values if anosign > 2^24
2340 * This should work both for large ints (rounding is a no-op for them
2341 * because such floats are always exact) as well as special cases like
2342 * NaNs, Infs (taking advantage of the fact they use max exponent).
2343 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2344 */
2345 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2346 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2347 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2348 return lp_build_select(bld, mask, a, res);
2349 }
2350 }
2351
2352
2353 /**
2354 * Return fractional part of 'a' computed as a - floor(a)
2355 * Typically used in texture coord arithmetic.
2356 */
2357 LLVMValueRef
2358 lp_build_fract(struct lp_build_context *bld,
2359 LLVMValueRef a)
2360 {
2361 assert(bld->type.floating);
2362 return lp_build_sub(bld, a, lp_build_floor(bld, a));
2363 }
2364
2365
2366 /**
2367 * Prevent returning 1.0 for very small negative values of 'a' by clamping
2368 * against 0.99999(9). (Will also return that value for NaNs.)
2369 */
2370 static inline LLVMValueRef
2371 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2372 {
2373 LLVMValueRef max;
2374
2375 /* this is the largest number smaller than 1.0 representable as float */
2376 max = lp_build_const_vec(bld->gallivm, bld->type,
2377 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2378 return lp_build_min_ext(bld, fract, max,
2379 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
2380 }
2381
2382
2383 /**
2384 * Same as lp_build_fract, but guarantees that the result is always smaller
2385 * than one. Will also return the smaller-than-one value for infs, NaNs.
2386 */
2387 LLVMValueRef
2388 lp_build_fract_safe(struct lp_build_context *bld,
2389 LLVMValueRef a)
2390 {
2391 return clamp_fract(bld, lp_build_fract(bld, a));
2392 }
2393
2394
2395 /**
2396 * Return the integer part of a float (vector) value (== round toward zero).
2397 * The returned value is an integer (vector).
2398 * Ex: itrunc(-1.5) = -1
2399 */
2400 LLVMValueRef
2401 lp_build_itrunc(struct lp_build_context *bld,
2402 LLVMValueRef a)
2403 {
2404 LLVMBuilderRef builder = bld->gallivm->builder;
2405 const struct lp_type type = bld->type;
2406 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2407
2408 assert(type.floating);
2409 assert(lp_check_value(type, a));
2410
2411 return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2412 }
2413
2414
2415 /**
2416 * Return float (vector) rounded to nearest integer (vector). The returned
2417 * value is an integer (vector).
2418 * Ex: iround(0.9) = 1
2419 * Ex: iround(-1.5) = -2
2420 */
2421 LLVMValueRef
2422 lp_build_iround(struct lp_build_context *bld,
2423 LLVMValueRef a)
2424 {
2425 LLVMBuilderRef builder = bld->gallivm->builder;
2426 const struct lp_type type = bld->type;
2427 LLVMTypeRef int_vec_type = bld->int_vec_type;
2428 LLVMValueRef res;
2429
2430 assert(type.floating);
2431
2432 assert(lp_check_value(type, a));
2433
2434 if ((util_cpu_caps.has_sse2 &&
2435 ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2436 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2437 return lp_build_iround_nearest_sse2(bld, a);
2438 }
2439 if (arch_rounding_available(type)) {
2440 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2441 }
2442 else {
2443 LLVMValueRef half;
2444
2445 half = lp_build_const_vec(bld->gallivm, type, 0.5);
2446
2447 if (type.sign) {
2448 LLVMTypeRef vec_type = bld->vec_type;
2449 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2450 (unsigned long long)1 << (type.width - 1));
2451 LLVMValueRef sign;
2452
2453 /* get sign bit */
2454 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2455 sign = LLVMBuildAnd(builder, sign, mask, "");
2456
2457 /* sign * 0.5 */
2458 half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2459 half = LLVMBuildOr(builder, sign, half, "");
2460 half = LLVMBuildBitCast(builder, half, vec_type, "");
2461 }
2462
2463 res = LLVMBuildFAdd(builder, a, half, "");
2464 }
2465
2466 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2467
2468 return res;
2469 }
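
/*
 * Illustrative scalar sketch (not part of gallivm) of the non-SSE fallback
 * above: add 0.5 carrying the sign of 'a', then truncate. The helper name is
 * hypothetical; assumes <math.h> for copysignf.
 */
static inline int
iround_emulated_example(float a)
{
   /* iround(0.9) -> (int)1.4 == 1, iround(-1.5) -> (int)-2.0 == -2 */
   return (int)(a + copysignf(0.5f, a));
}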
2470
2471
2472 /**
2473 * Return floor of float (vector), result is an int (vector)
2474 * Ex: ifloor(1.1) = 1
2475 * Ex: ifloor(-1.1) = -2
2476 */
2477 LLVMValueRef
2478 lp_build_ifloor(struct lp_build_context *bld,
2479 LLVMValueRef a)
2480 {
2481 LLVMBuilderRef builder = bld->gallivm->builder;
2482 const struct lp_type type = bld->type;
2483 LLVMTypeRef int_vec_type = bld->int_vec_type;
2484 LLVMValueRef res;
2485
2486 assert(type.floating);
2487 assert(lp_check_value(type, a));
2488
2489 res = a;
2490 if (type.sign) {
2491 if (arch_rounding_available(type)) {
2492 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2493 }
2494 else {
2495 struct lp_type inttype;
2496 struct lp_build_context intbld;
2497 LLVMValueRef trunc, itrunc, mask;
2498
2499 assert(type.floating);
2500 assert(lp_check_value(type, a));
2501
2502 inttype = type;
2503 inttype.floating = 0;
2504 lp_build_context_init(&intbld, bld->gallivm, inttype);
2505
2506 /* round by truncation */
2507 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2508 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2509
2510 /*
2511 * fix values if rounding is wrong (for non-special cases)
2512 * - this is the case if trunc > a
2513 * The results of doing this with NaNs, very large values etc.
2514 * are undefined but this seems to be the case anyway.
2515 */
2516 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2517 /* cheapie minus one with mask since the mask is minus one / zero */
2518 return lp_build_add(&intbld, itrunc, mask);
2519 }
2520 }
2521
2522 /* convert to int, rounding toward zero */
2523 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2524
2525 return res;
2526 }
2527
2528
2529 /**
2530 * Return ceiling of float (vector), returning int (vector).
2531 * Ex: iceil( 1.1) = 2
2532 * Ex: iceil(-1.1) = -1
2533 */
2534 LLVMValueRef
2535 lp_build_iceil(struct lp_build_context *bld,
2536 LLVMValueRef a)
2537 {
2538 LLVMBuilderRef builder = bld->gallivm->builder;
2539 const struct lp_type type = bld->type;
2540 LLVMTypeRef int_vec_type = bld->int_vec_type;
2541 LLVMValueRef res;
2542
2543 assert(type.floating);
2544 assert(lp_check_value(type, a));
2545
2546 if (arch_rounding_available(type)) {
2547 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2548 }
2549 else {
2550 struct lp_type inttype;
2551 struct lp_build_context intbld;
2552 LLVMValueRef trunc, itrunc, mask;
2553
2554 assert(type.floating);
2555 assert(lp_check_value(type, a));
2556
2557 inttype = type;
2558 inttype.floating = 0;
2559 lp_build_context_init(&intbld, bld->gallivm, inttype);
2560
2561 /* round by truncation */
2562 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2563 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2564
2565 /*
2566 * fix values if rounding is wrong (for non-special cases)
2567 * - this is the case if trunc < a
2568 * The results of doing this with NaNs, very large values etc.
2569 * are undefined but this seems to be the case anyway.
2570 */
2571 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2572 /* cheapie plus one with mask since the mask is minus one / zero */
2573 return lp_build_sub(&intbld, itrunc, mask);
2574 }
2575
2576 /* convert to int, rounding toward zero */
2577 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2578
2579 return res;
2580 }
2581
2582
2583 /**
2584 * Combined ifloor() & fract().
2585 *
2586 * Preferred to calling the functions separately, as it will ensure that the
2587 * strategy (floor() vs ifloor()) that results in less redundant work is used.
2588 */
2589 void
2590 lp_build_ifloor_fract(struct lp_build_context *bld,
2591 LLVMValueRef a,
2592 LLVMValueRef *out_ipart,
2593 LLVMValueRef *out_fpart)
2594 {
2595 LLVMBuilderRef builder = bld->gallivm->builder;
2596 const struct lp_type type = bld->type;
2597 LLVMValueRef ipart;
2598
2599 assert(type.floating);
2600 assert(lp_check_value(type, a));
2601
2602 if (arch_rounding_available(type)) {
2603 /*
2604 * floor() is easier.
2605 */
2606
2607 ipart = lp_build_floor(bld, a);
2608 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2609 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2610 }
2611 else {
2612 /*
2613 * ifloor() is easier.
2614 */
2615
2616 *out_ipart = lp_build_ifloor(bld, a);
2617 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2618 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2619 }
2620 }
2621
2622
2623 /**
2624 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2625 * always smaller than one.
2626 */
2627 void
2628 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2629 LLVMValueRef a,
2630 LLVMValueRef *out_ipart,
2631 LLVMValueRef *out_fpart)
2632 {
2633 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2634 *out_fpart = clamp_fract(bld, *out_fpart);
2635 }
2636
2637
2638 LLVMValueRef
2639 lp_build_sqrt(struct lp_build_context *bld,
2640 LLVMValueRef a)
2641 {
2642 LLVMBuilderRef builder = bld->gallivm->builder;
2643 const struct lp_type type = bld->type;
2644 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2645 char intrinsic[32];
2646
2647 assert(lp_check_value(type, a));
2648
2649 assert(type.floating);
2650 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);
2651
2652 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2653 }
2654
2655
2656 /**
2657 * Do one Newton-Raphson step to improve reciprocal precision:
2658 *
2659 * x_{i+1} = x_i * (2 - a * x_i)
2660 *
2661 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2662 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
2663 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
2664 * halo. It would be necessary to clamp the argument to prevent this.
2665 *
2666 * See also:
2667 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2668 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2669 */
2670 static inline LLVMValueRef
2671 lp_build_rcp_refine(struct lp_build_context *bld,
2672 LLVMValueRef a,
2673 LLVMValueRef rcp_a)
2674 {
2675 LLVMBuilderRef builder = bld->gallivm->builder;
2676 LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
2677 LLVMValueRef res;
2678
2679 res = LLVMBuildFMul(builder, a, rcp_a, "");
2680 res = LLVMBuildFSub(builder, two, res, "");
2681 res = LLVMBuildFMul(builder, rcp_a, res, "");
2682
2683 return res;
2684 }
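
/*
 * Illustrative scalar form (not part of gallivm) of the refinement step
 * built above; the helper name is hypothetical. Starting from a rough
 * estimate x0 ~ 1/a, each step roughly doubles the number of correct bits,
 * e.g. a = 3, x0 = 0.33 -> x1 = 0.33 * (2 - 3 * 0.33) = 0.3333.
 */
static inline float
rcp_refine_example(float a, float rcp_a)
{
   return rcp_a * (2.0f - a * rcp_a);   /* x_{i+1} = x_i * (2 - a * x_i) */
}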
2685
2686
2687 LLVMValueRef
2688 lp_build_rcp(struct lp_build_context *bld,
2689 LLVMValueRef a)
2690 {
2691 LLVMBuilderRef builder = bld->gallivm->builder;
2692 const struct lp_type type = bld->type;
2693
2694 assert(lp_check_value(type, a));
2695
2696 if(a == bld->zero)
2697 return bld->undef;
2698 if(a == bld->one)
2699 return bld->one;
2700 if(a == bld->undef)
2701 return bld->undef;
2702
2703 assert(type.floating);
2704
2705 if(LLVMIsConstant(a))
2706 return LLVMConstFDiv(bld->one, a);
2707
2708 /*
2709 * We don't use RCPPS because:
2710 * - it only has 10 bits of precision
2711 * - it doesn't even get the reciprocal of 1.0 exactly
2712 * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2713 * - for recent processors the benefit over DIVPS is marginal and case
2714 * dependent
2715 *
2716 * We could still use it on certain processors if benchmarks show that the
2717 * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
2718 * particular uses that require fewer workarounds.
2719 */
2720
2721 if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2722 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2723 const unsigned num_iterations = 0;
2724 LLVMValueRef res;
2725 unsigned i;
2726 const char *intrinsic = NULL;
2727
2728 if (type.length == 4) {
2729 intrinsic = "llvm.x86.sse.rcp.ps";
2730 }
2731 else {
2732 intrinsic = "llvm.x86.avx.rcp.ps.256";
2733 }
2734
2735 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2736
2737 for (i = 0; i < num_iterations; ++i) {
2738 res = lp_build_rcp_refine(bld, a, res);
2739 }
2740
2741 return res;
2742 }
2743
2744 return LLVMBuildFDiv(builder, bld->one, a, "");
2745 }
2746
2747
2748 /**
2749 * Do one Newton-Raphson step to improve rsqrt precision:
2750 *
2751 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2752 *
2753 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2754 */
2755 static inline LLVMValueRef
2756 lp_build_rsqrt_refine(struct lp_build_context *bld,
2757 LLVMValueRef a,
2758 LLVMValueRef rsqrt_a)
2759 {
2760 LLVMBuilderRef builder = bld->gallivm->builder;
2761 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2762 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2763 LLVMValueRef res;
2764
2765 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2766 res = LLVMBuildFMul(builder, a, res, "");
2767 res = LLVMBuildFSub(builder, three, res, "");
2768 res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2769 res = LLVMBuildFMul(builder, half, res, "");
2770
2771 return res;
2772 }
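
/*
 * Illustrative scalar form (not part of gallivm) of the refinement step
 * built above; the helper name is hypothetical.
 */
static inline float
rsqrt_refine_example(float a, float rsqrt_a)
{
   /* x_{i+1} = 0.5 * x_i * (3 - a * x_i * x_i) */
   return 0.5f * rsqrt_a * (3.0f - a * rsqrt_a * rsqrt_a);
}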
2773
2774
2775 /**
2776 * Generate 1/sqrt(a).
2777 * Result is undefined for values < 0, infinity for +0.
2778 */
2779 LLVMValueRef
2780 lp_build_rsqrt(struct lp_build_context *bld,
2781 LLVMValueRef a)
2782 {
2783 const struct lp_type type = bld->type;
2784
2785 assert(lp_check_value(type, a));
2786
2787 assert(type.floating);
2788
2789 /*
2790 * This should be faster but all denormals will end up as infinity.
2791 */
2792 if (0 && lp_build_fast_rsqrt_available(type)) {
2793 const unsigned num_iterations = 1;
2794 LLVMValueRef res;
2795 unsigned i;
2796
2797 /* rsqrt(1.0) != 1.0 here */
2798 res = lp_build_fast_rsqrt(bld, a);
2799
2800 if (num_iterations) {
2801 /*
2802 * Newton-Raphson will result in NaN instead of infinity for zero,
2803 * and NaN instead of zero for infinity.
2804 * Also, need to ensure rsqrt(1.0) == 1.0.
2805 * All numbers smaller than FLT_MIN will result in +infinity
2806 * (rsqrtps treats all denormals as zero).
2807 */
2808 LLVMValueRef cmp;
2809 LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2810 LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);
2811
2812 for (i = 0; i < num_iterations; ++i) {
2813 res = lp_build_rsqrt_refine(bld, a, res);
2814 }
2815 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2816 res = lp_build_select(bld, cmp, inf, res);
2817 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2818 res = lp_build_select(bld, cmp, bld->zero, res);
2819 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2820 res = lp_build_select(bld, cmp, bld->one, res);
2821 }
2822
2823 return res;
2824 }
2825
2826 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2827 }
2828
2829 /**
2830 * Return whether a fast (but inaccurate) rsqrt instruction is available.
2831 * (The caller may want to avoid calling rsqrt_fast if it's not available:
2832 * e.g. for calculating x^0.5 it may do rsqrt_fast(x) * x, but if that's
2833 * unavailable it would result in sqrt/div/mul, so it's obviously
2834 * much better to just call sqrt, skipping both div and mul.)
2835 */
2836 boolean
2837 lp_build_fast_rsqrt_available(struct lp_type type)
2838 {
2839 assert(type.floating);
2840
2841 if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2842 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2843 return true;
2844 }
2845 return false;
2846 }
2847
2848
2849 /**
2850 * Generate 1/sqrt(a).
2851 * Result is undefined for values < 0, infinity for +0.
2852 * Precision is limited, only ~10 bits guaranteed
2853 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2854 */
2855 LLVMValueRef
2856 lp_build_fast_rsqrt(struct lp_build_context *bld,
2857 LLVMValueRef a)
2858 {
2859 LLVMBuilderRef builder = bld->gallivm->builder;
2860 const struct lp_type type = bld->type;
2861
2862 assert(lp_check_value(type, a));
2863
2864 if (lp_build_fast_rsqrt_available(type)) {
2865 const char *intrinsic = NULL;
2866
2867 if (type.length == 4) {
2868 intrinsic = "llvm.x86.sse.rsqrt.ps";
2869 }
2870 else {
2871 intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2872 }
2873 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2874 }
2875 else {
2876 debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2877 }
2878 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2879 }
2880
2881
2882 /**
2883 * Generate sin(a) or cos(a) using polynomial approximation.
2884 * TODO: it might be worth recognizing sin and cos with the same source
2885 * (i.e. the d3d10 sincos opcode). Obviously doing both at the same time
2886 * would be way cheaper than calculating (nearly) everything twice...
2887 * Not sure it's common enough to be worth bothering, however; the scs
2888 * opcode could also benefit from calculating both.
2889 */
2890 static LLVMValueRef
2891 lp_build_sin_or_cos(struct lp_build_context *bld,
2892 LLVMValueRef a,
2893 boolean cos)
2894 {
2895 struct gallivm_state *gallivm = bld->gallivm;
2896 LLVMBuilderRef b = gallivm->builder;
2897 struct lp_type int_type = lp_int_type(bld->type);
2898
2899 /*
2900 * take the absolute value,
2901 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2902 */
2903
2904 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2905 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2906
2907 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2908 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2909
2910 /*
2911 * scale by 4/Pi
2912 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2913 */
2914
2915 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2916 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2917
2918 /*
2919 * store the integer part of y in mm0
2920 * emm2 = _mm_cvttps_epi32(y);
2921 */
2922
2923 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2924
2925 /*
2926 * j=(j+1) & (~1) (see the cephes sources)
2927 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2928 */
2929
2930 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2931 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2932 /*
2933 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2934 */
2935 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2936 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2937
2938 /*
2939 * y = _mm_cvtepi32_ps(emm2);
2940 */
2941 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2942
2943 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2944 LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2945 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2946 LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2947
2948 /*
2949 * Argument used for poly selection and sign bit determination
2950 * is different for sin vs. cos.
2951 */
2952 LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
2953 emm2_and;
2954
2955 LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
2956 LLVMBuildNot(b, emm2_2, ""), ""),
2957 const_29, "sign_bit") :
2958 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
2959 LLVMBuildShl(b, emm2_add,
2960 const_29, ""), ""),
2961 sign_mask, "sign_bit");
2962
2963 /*
2964 * get the polynom selection mask
2965 * there is one polynom for 0 <= x <= Pi/4
2966 * and another one for Pi/4<x<=Pi/2
2967 * Both branches will be computed.
2968 *
2969 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2970 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2971 */
2972
2973 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
2974 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2975 int_type, PIPE_FUNC_EQUAL,
2976 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2977
2978 /*
2979 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2980 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2981 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2982 */
2983 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2984 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2985 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2986
2987 /*
2988 * The magic pass: "Extended precision modular arithmetic"
2989 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2990 */
2991 LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
2992 LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
2993 LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);
2994
2995 /*
2996 * Evaluate the first polynom (0 <= x <= Pi/4)
2997 *
2998 * z = _mm_mul_ps(x,x);
2999 */
3000 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
3001
3002 /*
3003 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
3004 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
3005 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
3006 */
3007 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
3008 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
3009 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
3010
3011 /*
3012 * y = *(v4sf*)_ps_coscof_p0;
3013 * y = _mm_mul_ps(y, z);
3014 */
3015 LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
3016 LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
3017 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
3018 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
3019
3020
3021 /*
3022 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
3023 * y = _mm_sub_ps(y, tmp);
3024 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
3025 */
3026 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
3027 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
3028 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
3029 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
3030 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
3031
3032 /*
3033 * _PS_CONST(sincof_p0, -1.9515295891E-4);
3034 * _PS_CONST(sincof_p1, 8.3321608736E-3);
3035 * _PS_CONST(sincof_p2, -1.6666654611E-1);
3036 */
3037 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
3038 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
3039 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
3040
3041 /*
3042 * Evaluate the second polynom (Pi/4 <= x <= Pi/2)
3043 *
3044 * y2 = *(v4sf*)_ps_sincof_p0;
3045 * y2 = _mm_mul_ps(y2, z);
3046 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
3047 * y2 = _mm_mul_ps(y2, z);
3048 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
3049 * y2 = _mm_mul_ps(y2, z);
3050 * y2 = _mm_mul_ps(y2, x);
3051 * y2 = _mm_add_ps(y2, x);
3052 */
3053
3054 LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
3055 LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
3056 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
3057 LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);
3058
3059 /*
3060 * select the correct result from the two polynoms
3061 * xmm3 = poly_mask;
3062 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
3063 * y = _mm_andnot_ps(xmm3, y);
3064 * y = _mm_or_ps(y,y2);
3065 */
3066 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
3067 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
3068 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
3069 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
3070 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
3071 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
3072
3073 /*
3074 * update the sign
3075 * y = _mm_xor_ps(y, sign_bit);
3076 */
3077 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
3078 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
3079
3080 LLVMValueRef isfinite = lp_build_isfinite(bld, a);
3081
3082 /* clamp output to be within [-1, 1] */
3083 y_result = lp_build_clamp(bld, y_result,
3084 lp_build_const_vec(bld->gallivm, bld->type, -1.f),
3085 lp_build_const_vec(bld->gallivm, bld->type, 1.f));
3086 /* If a is -inf, inf or NaN then return NaN */
3087 y_result = lp_build_select(bld, isfinite, y_result,
3088 lp_build_const_vec(bld->gallivm, bld->type, NAN));
3089 return y_result;
3090 }
3091
3092
3093 /**
3094 * Generate sin(a)
3095 */
3096 LLVMValueRef
3097 lp_build_sin(struct lp_build_context *bld,
3098 LLVMValueRef a)
3099 {
3100 return lp_build_sin_or_cos(bld, a, FALSE);
3101 }
3102
3103
3104 /**
3105 * Generate cos(a)
3106 */
3107 LLVMValueRef
3108 lp_build_cos(struct lp_build_context *bld,
3109 LLVMValueRef a)
3110 {
3111 return lp_build_sin_or_cos(bld, a, TRUE);
3112 }
3113
3114
3115 /**
3116 * Generate pow(x, y)
3117 */
3118 LLVMValueRef
3119 lp_build_pow(struct lp_build_context *bld,
3120 LLVMValueRef x,
3121 LLVMValueRef y)
3122 {
3123 /* TODO: optimize the constant case */
3124 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3125 LLVMIsConstant(x) && LLVMIsConstant(y)) {
3126 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3127 __FUNCTION__);
3128 }
3129
3130 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
3131 }
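
/*
 * Scalar sketch (not part of gallivm) of the identity used above,
 * pow(x, y) == exp2(log2(x) * y), which like the IR version is only
 * meaningful for x > 0. The helper name is hypothetical; assumes <math.h>.
 */
static inline float
pow_via_exp2_example(float x, float y)
{
   return exp2f(log2f(x) * y);
}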
3132
3133
3134 /**
3135 * Generate exp(x)
3136 */
3137 LLVMValueRef
3138 lp_build_exp(struct lp_build_context *bld,
3139 LLVMValueRef x)
3140 {
3141 /* log2(e) = 1/log(2) */
3142 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
3143 1.4426950408889634);
3144
3145 assert(lp_check_value(bld->type, x));
3146
3147 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
3148 }
3149
3150
3151 /**
3152 * Generate log(x)
3153 * Behavior is undefined with infs, 0s and nans
3154 */
3155 LLVMValueRef
3156 lp_build_log(struct lp_build_context *bld,
3157 LLVMValueRef x)
3158 {
3159 /* log(2) */
3160 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3161 0.69314718055994529);
3162
3163 assert(lp_check_value(bld->type, x));
3164
3165 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
3166 }
3167
3168 /**
3169 * Generate log(x) that handles edge cases (infs, 0s and nans)
3170 */
3171 LLVMValueRef
3172 lp_build_log_safe(struct lp_build_context *bld,
3173 LLVMValueRef x)
3174 {
3175 /* log(2) */
3176 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3177 0.69314718055994529);
3178
3179 assert(lp_check_value(bld->type, x));
3180
3181 return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
3182 }
3183
3184
3185 /**
3186 * Generate polynomial.
3187 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
3188 */
3189 LLVMValueRef
3190 lp_build_polynomial(struct lp_build_context *bld,
3191 LLVMValueRef x,
3192 const double *coeffs,
3193 unsigned num_coeffs)
3194 {
3195 const struct lp_type type = bld->type;
3196 LLVMValueRef even = NULL, odd = NULL;
3197 LLVMValueRef x2;
3198 unsigned i;
3199
3200 assert(lp_check_value(bld->type, x));
3201
3202 /* TODO: optimize the constant case */
3203 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3204 LLVMIsConstant(x)) {
3205 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3206 __FUNCTION__);
3207 }
3208
3209 /*
3210 * Calculate odd and even terms separately to decrease data dependency
3211 * Ex:
3212 * c[0] + x^2 * c[2] + x^4 * c[4] ...
3213 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
3214 */
3215 x2 = lp_build_mul(bld, x, x);
3216
3217 for (i = num_coeffs; i--; ) {
3218 LLVMValueRef coeff;
3219
3220 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
3221
3222 if (i % 2 == 0) {
3223 if (even)
3224 even = lp_build_mad(bld, x2, even, coeff);
3225 else
3226 even = coeff;
3227 } else {
3228 if (odd)
3229 odd = lp_build_mad(bld, x2, odd, coeff);
3230 else
3231 odd = coeff;
3232 }
3233 }
3234
3235 if (odd)
3236 return lp_build_mad(bld, odd, x, even);
3237 else if (even)
3238 return even;
3239 else
3240 return bld->undef;
3241 }
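
/*
 * Illustrative scalar version (not part of gallivm) of the even/odd split
 * above: both halves are Horner evaluations in x^2 that can proceed in
 * parallel and are only combined at the end, shortening the dependency
 * chain compared to a single Horner run. The helper name is hypothetical.
 */
static inline float
polynomial_even_odd_example(const double *coeffs, unsigned num_coeffs, float x)
{
   float x2 = x * x;
   float even = 0.0f, odd = 0.0f;
   int i;

   for (i = (int)num_coeffs - 1; i >= 0; i--) {
      if (i % 2 == 0)
         even = even * x2 + (float)coeffs[i];
      else
         odd = odd * x2 + (float)coeffs[i];
   }
   return odd * x + even;
}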
3242
3243
3244 /**
3245 * Minimax polynomial fit of 2**x, in range [0, 1[
3246 */
3247 const double lp_build_exp2_polynomial[] = {
3248 #if EXP_POLY_DEGREE == 5
3249 1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
3250 0.693153073200168932794,
3251 0.240153617044375388211,
3252 0.0558263180532956664775,
3253 0.00898934009049466391101,
3254 0.00187757667519147912699
3255 #elif EXP_POLY_DEGREE == 4
3256 1.00000259337069434683,
3257 0.693003834469974940458,
3258 0.24144275689150793076,
3259 0.0520114606103070150235,
3260 0.0135341679161270268764
3261 #elif EXP_POLY_DEGREE == 3
3262 0.999925218562710312959,
3263 0.695833540494823811697,
3264 0.226067155427249155588,
3265 0.0780245226406372992967
3266 #elif EXP_POLY_DEGREE == 2
3267 1.00172476321474503578,
3268 0.657636275736077639316,
3269 0.33718943461968720704
3270 #else
3271 #error
3272 #endif
3273 };
3274
3275
3276 LLVMValueRef
3277 lp_build_exp2(struct lp_build_context *bld,
3278 LLVMValueRef x)
3279 {
3280 LLVMBuilderRef builder = bld->gallivm->builder;
3281 const struct lp_type type = bld->type;
3282 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3283 LLVMValueRef ipart = NULL;
3284 LLVMValueRef fpart = NULL;
3285 LLVMValueRef expipart = NULL;
3286 LLVMValueRef expfpart = NULL;
3287 LLVMValueRef res = NULL;
3288
3289 assert(lp_check_value(bld->type, x));
3290
3291 /* TODO: optimize the constant case */
3292 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3293 LLVMIsConstant(x)) {
3294 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3295 __FUNCTION__);
3296 }
3297
3298 assert(type.floating && type.width == 32);
3299
3300 /* We want to preserve NaN and make sure that for exp2 if x > 128,
3301 * the result is INF and if it's smaller than -126.9 the result is 0 */
3302 x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
3303 GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3304 x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
3305 x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3306
3307 /* ipart = floor(x) */
3308 /* fpart = x - ipart */
3309 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3310
3311 /* expipart = (float) (1 << ipart) */
3312 expipart = LLVMBuildAdd(builder, ipart,
3313 lp_build_const_int_vec(bld->gallivm, type, 127), "");
3314 expipart = LLVMBuildShl(builder, expipart,
3315 lp_build_const_int_vec(bld->gallivm, type, 23), "");
3316 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
3317
3318 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3319 ARRAY_SIZE(lp_build_exp2_polynomial));
3320
3321 res = LLVMBuildFMul(builder, expipart, expfpart, "");
3322
3323 return res;
3324 }
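
/*
 * Illustrative scalar sketch (not part of gallivm) of the decomposition
 * above: 2^x = 2^ipart * 2^fpart, where 2^ipart is built directly by
 * placing (ipart + 127) into the exponent field of an IEEE-754 float and
 * 2^fpart comes from the minimax polynomial. The helper name is
 * hypothetical; assumes <math.h>, <stdint.h>, <string.h>, ignores the
 * clamping done above and so is only valid for x in roughly [-126, 128].
 */
static inline float
exp2_split_example(float x)
{
   float ipart = floorf(x);
   float fpart = x - ipart;                         /* fpart in [0, 1[ */
   uint32_t bits = (uint32_t)((int)ipart + 127) << 23;
   float expipart, expfpart = 0.0f;
   int i;

   memcpy(&expipart, &bits, sizeof expipart);       /* expipart = 2^ipart */

   /* Horner evaluation of the same minimax polynomial used above */
   for (i = ARRAY_SIZE(lp_build_exp2_polynomial); i--; )
      expfpart = expfpart * fpart + (float)lp_build_exp2_polynomial[i];

   return expipart * expfpart;
}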
3325
3326
3327
3328 /**
3329 * Extract the exponent of a IEEE-754 floating point value.
3330 *
3331 * Optionally apply an integer bias.
3332 *
3333 * Result is an integer value with
3334 *
3335 * ifloor(log2(x)) + bias
3336 */
3337 LLVMValueRef
3338 lp_build_extract_exponent(struct lp_build_context *bld,
3339 LLVMValueRef x,
3340 int bias)
3341 {
3342 LLVMBuilderRef builder = bld->gallivm->builder;
3343 const struct lp_type type = bld->type;
3344 unsigned mantissa = lp_mantissa(type);
3345 LLVMValueRef res;
3346
3347 assert(type.floating);
3348
3349 assert(lp_check_value(bld->type, x));
3350
3351 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3352
3353 res = LLVMBuildLShr(builder, x,
3354 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3355 res = LLVMBuildAnd(builder, res,
3356 lp_build_const_int_vec(bld->gallivm, type, 255), "");
3357 res = LLVMBuildSub(builder, res,
3358 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3359
3360 return res;
3361 }
3362
3363
3364 /**
3365 * Extract the mantissa of a floating point value.
3366 *
3367 * Result is a floating point value with
3368 *
3369 * x / 2**floor(log2(x))
3370 */
3371 LLVMValueRef
3372 lp_build_extract_mantissa(struct lp_build_context *bld,
3373 LLVMValueRef x)
3374 {
3375 LLVMBuilderRef builder = bld->gallivm->builder;
3376 const struct lp_type type = bld->type;
3377 unsigned mantissa = lp_mantissa(type);
3378 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3379 (1ULL << mantissa) - 1);
3380 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3381 LLVMValueRef res;
3382
3383 assert(lp_check_value(bld->type, x));
3384
3385 assert(type.floating);
3386
3387 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3388
3389 /* res = x / 2**ipart */
3390 res = LLVMBuildAnd(builder, x, mantmask, "");
3391 res = LLVMBuildOr(builder, res, one, "");
3392 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3393
3394 return res;
3395 }
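
/*
 * Illustrative scalar sketch (not part of gallivm) of the two helpers
 * above: read the biased exponent straight out of the IEEE-754 bits, and
 * rebuild the mantissa as a float in [1, 2[ by forcing the exponent field
 * to that of 1.0. Helper names are hypothetical; assumes <stdint.h> and
 * <string.h>.
 */
static inline int
extract_exponent_example(float x, int bias)
{
   uint32_t bits;

   memcpy(&bits, &x, sizeof bits);
   return (int)((bits >> 23) & 0xff) - 127 + bias;
}

static inline float
extract_mantissa_example(float x)
{
   uint32_t bits;
   float res;

   memcpy(&bits, &x, sizeof bits);
   bits = (bits & 0x007fffff) | 0x3f800000;   /* mantissa | exponent of 1.0 */
   memcpy(&res, &bits, sizeof res);
   return res;                                /* == x / 2^floor(log2(x)) */
}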
3396
3397
3398
3399 /**
3400 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in the range [0, 1/9[
3401 * These coefficients can be generated with
3402 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3403 */
3404 const double lp_build_log2_polynomial[] = {
3405 #if LOG_POLY_DEGREE == 5
3406 2.88539008148777786488L,
3407 0.961796878841293367824L,
3408 0.577058946784739859012L,
3409 0.412914355135828735411L,
3410 0.308591899232910175289L,
3411 0.352376952300281371868L,
3412 #elif LOG_POLY_DEGREE == 4
3413 2.88539009343309178325L,
3414 0.961791550404184197881L,
3415 0.577440339438736392009L,
3416 0.403343858251329912514L,
3417 0.406718052498846252698L,
3418 #elif LOG_POLY_DEGREE == 3
3419 2.88538959748872753838L,
3420 0.961932915889597772928L,
3421 0.571118517972136195241L,
3422 0.493997535084709500285L,
3423 #else
3424 #error
3425 #endif
3426 };
3427
3428 /**
3429 * See http://www.devmaster.net/forums/showthread.php?p=43580
3430 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3431 * http://www.nezumi.demon.co.uk/consult/logx.htm
3432 *
3433 * If handle_edge_cases is true the function will perform computations
3434 * to match the required D3D10+ behavior for each of the edge cases.
3435 * That means that if input is:
3436 * - less than zero (to and including -inf) then NaN will be returned
3437 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3438 * - +infinity, then +infinity will be returned
3439 * - NaN, then NaN will be returned
3440 *
3441 * Those checks are fairly expensive so if you don't need them make sure
3442 * handle_edge_cases is false.
3443 */
3444 void
3445 lp_build_log2_approx(struct lp_build_context *bld,
3446 LLVMValueRef x,
3447 LLVMValueRef *p_exp,
3448 LLVMValueRef *p_floor_log2,
3449 LLVMValueRef *p_log2,
3450 boolean handle_edge_cases)
3451 {
3452 LLVMBuilderRef builder = bld->gallivm->builder;
3453 const struct lp_type type = bld->type;
3454 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3455 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3456
3457 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3458 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3459 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3460
3461 LLVMValueRef i = NULL;
3462 LLVMValueRef y = NULL;
3463 LLVMValueRef z = NULL;
3464 LLVMValueRef exp = NULL;
3465 LLVMValueRef mant = NULL;
3466 LLVMValueRef logexp = NULL;
3467 LLVMValueRef p_z = NULL;
3468 LLVMValueRef res = NULL;
3469
3470 assert(lp_check_value(bld->type, x));
3471
3472 if(p_exp || p_floor_log2 || p_log2) {
3473 /* TODO: optimize the constant case */
3474 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3475 LLVMIsConstant(x)) {
3476 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3477 __FUNCTION__);
3478 }
3479
3480 assert(type.floating && type.width == 32);
3481
3482 /*
3483 * We don't explicitly handle denormalized numbers. They will yield a
3484 * result in the neighbourhood of -127, which appears to be
3485 * adequate.
3486 */
3487
3488 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3489
3490 /* exp = (float) exponent(x) */
3491 exp = LLVMBuildAnd(builder, i, expmask, "");
3492 }
3493
3494 if(p_floor_log2 || p_log2) {
3495 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3496 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3497 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3498 }
3499
3500 if (p_log2) {
3501 /* mant = 1 + (float) mantissa(x) */
3502 mant = LLVMBuildAnd(builder, i, mantmask, "");
3503 mant = LLVMBuildOr(builder, mant, one, "");
3504 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3505
3506 /* y = (mant - 1) / (mant + 1) */
3507 y = lp_build_div(bld,
3508 lp_build_sub(bld, mant, bld->one),
3509 lp_build_add(bld, mant, bld->one)
3510 );
3511
3512 /* z = y^2 */
3513 z = lp_build_mul(bld, y, y);
3514
3515 /* compute P(z) */
3516 p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3517 ARRAY_SIZE(lp_build_log2_polynomial));
3518
3519 /* y * P(z) + logexp */
3520 res = lp_build_mad(bld, y, p_z, logexp);
3521
3522 if (type.floating && handle_edge_cases) {
3523 LLVMValueRef negmask, infmask, zmask;
3524 negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3525 lp_build_const_vec(bld->gallivm, type, 0.0f));
3526 zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3527 lp_build_const_vec(bld->gallivm, type, 0.0f));
3528 infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3529 lp_build_const_vec(bld->gallivm, type, INFINITY));
3530
3531 /* If x is equal to inf make sure we return inf */
3532 res = lp_build_select(bld, infmask,
3533 lp_build_const_vec(bld->gallivm, type, INFINITY),
3534 res);
3535 /* If x is equal to 0, return -inf */
3536 res = lp_build_select(bld, zmask,
3537 lp_build_const_vec(bld->gallivm, type, -INFINITY),
3538 res);
3539 /* If x is nan or less than 0, return nan */
3540 res = lp_build_select(bld, negmask,
3541 lp_build_const_vec(bld->gallivm, type, NAN),
3542 res);
3543 }
3544 }
3545
3546 if (p_exp) {
3547 exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3548 *p_exp = exp;
3549 }
3550
3551 if (p_floor_log2)
3552 *p_floor_log2 = logexp;
3553
3554 if (p_log2)
3555 *p_log2 = res;
3556 }
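
/*
 * Illustrative scalar sketch (not part of gallivm) of the core computation
 * above, without the edge-case handling: log2(x) = floor(log2(x)) + log2(m)
 * with m in [1, 2[, and log2(m) approximated as y * P(y^2) where
 * y = (m - 1) / (m + 1). The helper name is hypothetical and it reuses the
 * scalar helpers sketched earlier in this file.
 */
static inline float
log2_approx_example(float x)
{
   float logexp = (float)extract_exponent_example(x, 0);
   float mant = extract_mantissa_example(x);
   float y = (mant - 1.0f) / (mant + 1.0f);
   float z = y * y;
   float p = polynomial_even_odd_example(lp_build_log2_polynomial,
                                         ARRAY_SIZE(lp_build_log2_polynomial),
                                         z);

   return y * p + logexp;
}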
3557
3558
3559 /*
3560 * log2 implementation which doesn't have special code to
3561 * handle edge cases (-inf, 0, inf, NaN). It's faster but
3562 * the results for those cases are undefined.
3563 */
3564 LLVMValueRef
3565 lp_build_log2(struct lp_build_context *bld,
3566 LLVMValueRef x)
3567 {
3568 LLVMValueRef res;
3569 lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
3570 return res;
3571 }
3572
3573 /*
3574 * Version of log2 which handles all edge cases.
3575 * Look at documentation of lp_build_log2_approx for
3576 * description of the behavior for each of the edge cases.
3577 */
3578 LLVMValueRef
3579 lp_build_log2_safe(struct lp_build_context *bld,
3580 LLVMValueRef x)
3581 {
3582 LLVMValueRef res;
3583 lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
3584 return res;
3585 }
3586
3587
3588 /**
3589 * Faster (and less accurate) log2.
3590 *
3591 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3592 *
3593 * Piece-wise linear approximation, with exact results when x is a
3594 * power of two.
3595 *
3596 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3597 */
3598 LLVMValueRef
3599 lp_build_fast_log2(struct lp_build_context *bld,
3600 LLVMValueRef x)
3601 {
3602 LLVMBuilderRef builder = bld->gallivm->builder;
3603 LLVMValueRef ipart;
3604 LLVMValueRef fpart;
3605
3606 assert(lp_check_value(bld->type, x));
3607
3608 assert(bld->type.floating);
3609
3610 /* ipart = floor(log2(x)) - 1 */
3611 ipart = lp_build_extract_exponent(bld, x, -1);
3612 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3613
3614 /* fpart = x / 2**ipart */
3615 fpart = lp_build_extract_mantissa(bld, x);
3616
3617 /* ipart + fpart */
3618 return LLVMBuildFAdd(builder, ipart, fpart, "");
3619 }
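
/*
 * Illustrative scalar sketch (not part of gallivm) of the piece-wise linear
 * approximation above; the helper name is hypothetical and it reuses the
 * scalar field-extraction helpers sketched earlier.
 */
static inline float
fast_log2_example(float x)
{
   float ipart = (float)extract_exponent_example(x, -1); /* floor(log2(x)) - 1 */
   float fpart = extract_mantissa_example(x);            /* x / 2^floor(log2(x)) */

   /* exact when x is a power of two: (e - 1) + 1.0 == e */
   return ipart + fpart;
}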
3620
3621
3622 /**
3623 * Fast implementation of iround(log2(x)).
3624 *
3625 * Not an approximation -- it should give accurate results all the time.
3626 */
3627 LLVMValueRef
3628 lp_build_ilog2(struct lp_build_context *bld,
3629 LLVMValueRef x)
3630 {
3631 LLVMBuilderRef builder = bld->gallivm->builder;
3632 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3633 LLVMValueRef ipart;
3634
3635 assert(bld->type.floating);
3636
3637 assert(lp_check_value(bld->type, x));
3638
3639 /* x * 2^0.5, i.e. add 0.5 to log2(x) */
3640 x = LLVMBuildFMul(builder, x, sqrt2, "");
3641
3642 /* ipart = floor(log2(x) + 0.5) */
3643 ipart = lp_build_extract_exponent(bld, x, 0);
3644
3645 return ipart;
3646 }
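
/*
 * Numeric sketch of the rounding trick used above: multiplying by sqrt(2)
 * adds 0.5 to log2(x), so taking the floored exponent of the product is
 * round-to-nearest.  E.g. x = 3.0: log2(3) ~= 1.585 and 3 * sqrt(2) ~= 4.24,
 * whose exponent is 2 = iround(1.585); x = 1.3: log2(1.3) ~= 0.379 and
 * 1.3 * sqrt(2) ~= 1.84, whose exponent is 0 = iround(0.379).
 */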
3647
3648 LLVMValueRef
3649 lp_build_mod(struct lp_build_context *bld,
3650 LLVMValueRef x,
3651 LLVMValueRef y)
3652 {
3653 LLVMBuilderRef builder = bld->gallivm->builder;
3654 LLVMValueRef res;
3655 const struct lp_type type = bld->type;
3656
3657 assert(lp_check_value(type, x));
3658 assert(lp_check_value(type, y));
3659
3660 if (type.floating)
3661 res = LLVMBuildFRem(builder, x, y, "");
3662 else if (type.sign)
3663 res = LLVMBuildSRem(builder, x, y, "");
3664 else
3665 res = LLVMBuildURem(builder, x, y, "");
3666 return res;
3667 }
3668
3669
3670 /*
3671 * For floating point inputs it creates and returns a mask
3672 * which is all 1's for channels that are NaN.
3673 * Channels of x which are not NaN will be 0.
3674 */
3675 LLVMValueRef
3676 lp_build_isnan(struct lp_build_context *bld,
3677 LLVMValueRef x)
3678 {
3679 LLVMValueRef mask;
3680 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3681
3682 assert(bld->type.floating);
3683 assert(lp_check_value(bld->type, x));
3684
3685 mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3686 "isnotnan");
3687 mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3688 mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3689 return mask;
3690 }
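
/*
 * The mask relies on NaN being the only value that does not compare
 * ordered-equal to itself.  A typical (hypothetical) use is scrubbing NaN
 * channels before further processing, e.g.:
 *
 *    LLVMValueRef nanmask = lp_build_isnan(bld, x);
 *    x = lp_build_select(bld, nanmask, bld->zero, x);
 */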
3691
3692 /* Returns all 1's for channels holding finite floating
3693 * point numbers and all 0's for channels holding -inf,
3694 * inf or NaN. */
3695 LLVMValueRef
3696 lp_build_isfinite(struct lp_build_context *bld,
3697 LLVMValueRef x)
3698 {
3699 LLVMBuilderRef builder = bld->gallivm->builder;
3700 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3701 struct lp_type int_type = lp_int_type(bld->type);
3702 LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3703 LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3704 0x7f800000);
3705
3706 if (!bld->type.floating) {
3707 return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3708 }
3709 assert(bld->type.floating);
3710 assert(lp_check_value(bld->type, x));
3711 assert(bld->type.width == 32);
3712
3713 intx = LLVMBuildAnd(builder, intx, infornan32, "");
3714 return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3715 intx, infornan32);
3716 }
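
/*
 * Bit-level sketch of the test above (32-bit IEEE floats, as the assert
 * requires): a value is inf or NaN exactly when all eight exponent bits
 * are set, so masking with 0x7f800000 and comparing against the mask
 * classifies each channel.  E.g. 1.0f = 0x3f800000 masks to 0x3f800000,
 * which differs from 0x7f800000, so the channel is finite (all 1's);
 * +inf = 0x7f800000 and a NaN such as 0x7fc00000 both mask to 0x7f800000,
 * so those channels yield all 0's.
 */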
3717
3718 /*
3719 * Returns true if the number is nan or inf and false otherwise.
3720 * The input has to be a floating point vector.
3721 */
3722 LLVMValueRef
3723 lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
3724 const struct lp_type type,
3725 LLVMValueRef x)
3726 {
3727 LLVMBuilderRef builder = gallivm->builder;
3728 struct lp_type int_type = lp_int_type(type);
3729 LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
3730 0x7f800000);
3731 LLVMValueRef ret;
3732
3733 assert(type.floating);
3734
3735 ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
3736 ret = LLVMBuildAnd(builder, ret, const0, "");
3737 ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
3738 ret, const0);
3739
3740 return ret;
3741 }
3742
3743
3744 LLVMValueRef
3745 lp_build_fpstate_get(struct gallivm_state *gallivm)
3746 {
3747 if (util_cpu_caps.has_sse) {
3748 LLVMBuilderRef builder = gallivm->builder;
3749 LLVMValueRef mxcsr_ptr = lp_build_alloca(
3750 gallivm,
3751 LLVMInt32TypeInContext(gallivm->context),
3752 "mxcsr_ptr");
3753 LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
3754 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3755 lp_build_intrinsic(builder,
3756 "llvm.x86.sse.stmxcsr",
3757 LLVMVoidTypeInContext(gallivm->context),
3758 &mxcsr_ptr8, 1, 0);
3759 return mxcsr_ptr;
3760 }
3761 return 0;
3762 }
3763
3764 void
3765 lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
3766 boolean zero)
3767 {
3768 if (util_cpu_caps.has_sse) {
3769 /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
3770 int daz_ftz = _MM_FLUSH_ZERO_MASK;
3771
3772 LLVMBuilderRef builder = gallivm->builder;
3773 LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
3774 LLVMValueRef mxcsr =
3775 LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");
3776
3777 if (util_cpu_caps.has_daz) {
3778 /* Enable denormals-are-zero mode */
3779 daz_ftz |= _MM_DENORMALS_ZERO_MASK;
3780 }
3781 if (zero) {
3782 mxcsr = LLVMBuildOr(builder, mxcsr,
3783 LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
3784 } else {
3785 mxcsr = LLVMBuildAnd(builder, mxcsr,
3786 LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
3787 }
3788
3789 LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
3790 lp_build_fpstate_set(gallivm, mxcsr_ptr);
3791 }
3792 }
3793
3794 void
3795 lp_build_fpstate_set(struct gallivm_state *gallivm,
3796 LLVMValueRef mxcsr_ptr)
3797 {
3798 if (util_cpu_caps.has_sse) {
3799 LLVMBuilderRef builder = gallivm->builder;
3800 mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
3801 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3802 lp_build_intrinsic(builder,
3803 "llvm.x86.sse.ldmxcsr",
3804 LLVMVoidTypeInContext(gallivm->context),
3805 &mxcsr_ptr, 1, 0);
3806 }
3807 }
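

/*
 * Sketch of the intended save/modify/restore usage of the three functions
 * above (hypothetical caller code):
 *
 *    LLVMValueRef saved = lp_build_fpstate_get(gallivm);
 *    lp_build_fpstate_set_denorms_zero(gallivm, TRUE);
 *    ... emit arithmetic that benefits from DAZ/FTZ ...
 *    lp_build_fpstate_set(gallivm, saved);
 *
 * On targets without SSE these functions do nothing (lp_build_fpstate_get
 * returns NULL), so callers need no special casing.
 */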