mesa.git: src/gallium/auxiliary/gallivm/lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when the source arguments are known to be in the [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include <float.h>
49
50 #include "util/u_memory.h"
51 #include "util/u_debug.h"
52 #include "util/u_math.h"
53 #include "util/u_cpu_detect.h"
54
55 #include "lp_bld_type.h"
56 #include "lp_bld_const.h"
57 #include "lp_bld_init.h"
58 #include "lp_bld_intr.h"
59 #include "lp_bld_logic.h"
60 #include "lp_bld_pack.h"
61 #include "lp_bld_debug.h"
62 #include "lp_bld_bitarit.h"
63 #include "lp_bld_arit.h"
64 #include "lp_bld_flow.h"
65
66 #if defined(PIPE_ARCH_SSE)
67 #include <xmmintrin.h>
68 #endif
69
70 #ifndef _MM_DENORMALS_ZERO_MASK
71 #define _MM_DENORMALS_ZERO_MASK 0x0040
72 #endif
73
74 #ifndef _MM_FLUSH_ZERO_MASK
75 #define _MM_FLUSH_ZERO_MASK 0x8000
76 #endif
77
78 #define EXP_POLY_DEGREE 5
79
80 #define LOG_POLY_DEGREE 4
81
82
83 /**
84 * Generate min(a, b)
85 * No checks for the special case values a or b being 0 or 1 are done.
86 * NaNs are handled according to the behavior specified by the
87 * nan_behavior argument.
88 */
89 static LLVMValueRef
90 lp_build_min_simple(struct lp_build_context *bld,
91 LLVMValueRef a,
92 LLVMValueRef b,
93 enum gallivm_nan_behavior nan_behavior)
94 {
95 const struct lp_type type = bld->type;
96 const char *intrinsic = NULL;
97 unsigned intr_size = 0;
98 LLVMValueRef cond;
99
100 assert(lp_check_value(type, a));
101 assert(lp_check_value(type, b));
102
103 /* TODO: optimize the constant case */
104
105 if (type.floating && util_cpu_caps.has_sse) {
106 if (type.width == 32) {
107 if (type.length == 1) {
108 intrinsic = "llvm.x86.sse.min.ss";
109 intr_size = 128;
110 }
111 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
112 intrinsic = "llvm.x86.sse.min.ps";
113 intr_size = 128;
114 }
115 else {
116 intrinsic = "llvm.x86.avx.min.ps.256";
117 intr_size = 256;
118 }
119 }
120 if (type.width == 64 && util_cpu_caps.has_sse2) {
121 if (type.length == 1) {
122 intrinsic = "llvm.x86.sse2.min.sd";
123 intr_size = 128;
124 }
125 else if (type.length == 2 || !util_cpu_caps.has_avx) {
126 intrinsic = "llvm.x86.sse2.min.pd";
127 intr_size = 128;
128 }
129 else {
130 intrinsic = "llvm.x86.avx.min.pd.256";
131 intr_size = 256;
132 }
133 }
134 }
135 else if (type.floating && util_cpu_caps.has_altivec) {
136 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
137 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
138 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
139 __FUNCTION__);
140 }
141 if (type.width == 32 && type.length == 4) {
142 intrinsic = "llvm.ppc.altivec.vminfp";
143 intr_size = 128;
144 }
145 } else if (HAVE_LLVM < 0x0309 &&
146 util_cpu_caps.has_avx2 && type.length > 4) {
147 intr_size = 256;
148 switch (type.width) {
149 case 8:
150 intrinsic = type.sign ? "llvm.x86.avx2.pmins.b" : "llvm.x86.avx2.pminu.b";
151 break;
152 case 16:
153 intrinsic = type.sign ? "llvm.x86.avx2.pmins.w" : "llvm.x86.avx2.pminu.w";
154 break;
155 case 32:
156 intrinsic = type.sign ? "llvm.x86.avx2.pmins.d" : "llvm.x86.avx2.pminu.d";
157 break;
158 }
159 } else if (HAVE_LLVM < 0x0309 &&
160 util_cpu_caps.has_sse2 && type.length >= 2) {
161 intr_size = 128;
162 if ((type.width == 8 || type.width == 16) &&
163 (type.width * type.length <= 64) &&
164 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
165 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
166 __FUNCTION__);
167 }
168 if (type.width == 8 && !type.sign) {
169 intrinsic = "llvm.x86.sse2.pminu.b";
170 }
171 else if (type.width == 16 && type.sign) {
172 intrinsic = "llvm.x86.sse2.pmins.w";
173 }
174 if (util_cpu_caps.has_sse4_1) {
175 if (type.width == 8 && type.sign) {
176 intrinsic = "llvm.x86.sse41.pminsb";
177 }
178 if (type.width == 16 && !type.sign) {
179 intrinsic = "llvm.x86.sse41.pminuw";
180 }
181 if (type.width == 32 && !type.sign) {
182 intrinsic = "llvm.x86.sse41.pminud";
183 }
184 if (type.width == 32 && type.sign) {
185 intrinsic = "llvm.x86.sse41.pminsd";
186 }
187 }
188 } else if (util_cpu_caps.has_altivec) {
189 intr_size = 128;
190 if (type.width == 8) {
191 if (!type.sign) {
192 intrinsic = "llvm.ppc.altivec.vminub";
193 } else {
194 intrinsic = "llvm.ppc.altivec.vminsb";
195 }
196 } else if (type.width == 16) {
197 if (!type.sign) {
198 intrinsic = "llvm.ppc.altivec.vminuh";
199 } else {
200 intrinsic = "llvm.ppc.altivec.vminsh";
201 }
202 } else if (type.width == 32) {
203 if (!type.sign) {
204 intrinsic = "llvm.ppc.altivec.vminuw";
205 } else {
206 intrinsic = "llvm.ppc.altivec.vminsw";
207 }
208 }
209 }
210
211 if (intrinsic) {
212 /* We need to handle NaNs for floating point numbers. If one of the
213 * inputs is a NaN the other should be returned (required by both D3D10+
214 * and OpenCL).
215 * The sse intrinsics return the second operand in case of a NaN by
216 * default, so we need special code to handle those cases.
217 */
218 if (util_cpu_caps.has_sse && type.floating &&
219 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
220 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
221 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
222 LLVMValueRef isnan, min;
223 min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
224 type,
225 intr_size, a, b);
226 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
227 isnan = lp_build_isnan(bld, b);
228 return lp_build_select(bld, isnan, a, min);
229 } else {
230 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
231 isnan = lp_build_isnan(bld, a);
232 return lp_build_select(bld, isnan, a, min);
233 }
234 } else {
235 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
236 type,
237 intr_size, a, b);
238 }
239 }
240
241 if (type.floating) {
242 switch (nan_behavior) {
243 case GALLIVM_NAN_RETURN_NAN: {
244 LLVMValueRef isnan = lp_build_isnan(bld, b);
245 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
246 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
247 return lp_build_select(bld, cond, a, b);
248 }
249 break;
250 case GALLIVM_NAN_RETURN_OTHER: {
251 LLVMValueRef isnan = lp_build_isnan(bld, a);
252 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
253 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
254 return lp_build_select(bld, cond, a, b);
255 }
256 break;
257 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
258 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
259 return lp_build_select(bld, cond, a, b);
260 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
261 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
262 return lp_build_select(bld, cond, b, a);
263 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
264 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
265 return lp_build_select(bld, cond, a, b);
266 break;
267 default:
268 assert(0);
269 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
270 return lp_build_select(bld, cond, a, b);
271 }
272 } else {
273 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
274 return lp_build_select(bld, cond, a, b);
275 }
276 }
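/*
 * Illustrative only (hence #if 0): a minimal scalar sketch, assuming isnan()
 * from <math.h> and hypothetical helper names, of how the
 * GALLIVM_NAN_RETURN_OTHER fix-up above patches the SSE min semantics.
 */
#if 0
#include <math.h>

/* SSE MINPS-like semantics: the comparison is false when either input
 * is a NaN, so the second operand is returned in that case. */
static inline float
sse_like_min(float a, float b)
{
   return a < b ? a : b;
}

/* GALLIVM_NAN_RETURN_OTHER: if one input is a NaN, return the other.
 * If a is a NaN, sse_like_min() already yields b; if b is a NaN it would
 * yield the NaN, so that case is overridden with a, mirroring the
 * lp_build_select() above. */
static inline float
min_nan_return_other(float a, float b)
{
   float m = sse_like_min(a, b);
   return isnan(b) ? a : m;
}
#endif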
277
278
279 LLVMValueRef
280 lp_build_fmuladd(LLVMBuilderRef builder,
281 LLVMValueRef a,
282 LLVMValueRef b,
283 LLVMValueRef c)
284 {
285 LLVMTypeRef type = LLVMTypeOf(a);
286 assert(type == LLVMTypeOf(b));
287 assert(type == LLVMTypeOf(c));
288 if (HAVE_LLVM < 0x0304) {
289 /* XXX: LLVM 3.3 does not break down llvm.fmuladd into mul+add when FMA is
290 * not supported, and instead falls back to a C function.
291 */
292 return LLVMBuildFAdd(builder, LLVMBuildFMul(builder, a, b, ""), c, "");
293 }
294 char intrinsic[32];
295 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
296 LLVMValueRef args[] = { a, b, c };
297 return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0);
298 }
299
300
301 /**
302 * Generate max(a, b)
303 * No checks for the special case values a or b being 0 or 1 are done.
304 * NaNs are handled according to the behavior specified by the
305 * nan_behavior argument.
306 */
307 static LLVMValueRef
308 lp_build_max_simple(struct lp_build_context *bld,
309 LLVMValueRef a,
310 LLVMValueRef b,
311 enum gallivm_nan_behavior nan_behavior)
312 {
313 const struct lp_type type = bld->type;
314 const char *intrinsic = NULL;
315 unsigned intr_size = 0;
316 LLVMValueRef cond;
317
318 assert(lp_check_value(type, a));
319 assert(lp_check_value(type, b));
320
321 /* TODO: optimize the constant case */
322
323 if (type.floating && util_cpu_caps.has_sse) {
324 if (type.width == 32) {
325 if (type.length == 1) {
326 intrinsic = "llvm.x86.sse.max.ss";
327 intr_size = 128;
328 }
329 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
330 intrinsic = "llvm.x86.sse.max.ps";
331 intr_size = 128;
332 }
333 else {
334 intrinsic = "llvm.x86.avx.max.ps.256";
335 intr_size = 256;
336 }
337 }
338 if (type.width == 64 && util_cpu_caps.has_sse2) {
339 if (type.length == 1) {
340 intrinsic = "llvm.x86.sse2.max.sd";
341 intr_size = 128;
342 }
343 else if (type.length == 2 || !util_cpu_caps.has_avx) {
344 intrinsic = "llvm.x86.sse2.max.pd";
345 intr_size = 128;
346 }
347 else {
348 intrinsic = "llvm.x86.avx.max.pd.256";
349 intr_size = 256;
350 }
351 }
352 }
353 else if (type.floating && util_cpu_caps.has_altivec) {
354 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
355 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
356 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
357 __FUNCTION__);
358 }
359 if (type.width == 32 && type.length == 4) {
360 intrinsic = "llvm.ppc.altivec.vmaxfp";
361 intr_size = 128;
362 }
363 } else if (HAVE_LLVM < 0x0309 &&
364 util_cpu_caps.has_avx2 && type.length > 4) {
365 intr_size = 256;
366 switch (type.width) {
367 case 8:
368 intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.b" : "llvm.x86.avx2.pmaxu.b";
369 break;
370 case 16:
371 intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.w" : "llvm.x86.avx2.pmaxu.w";
372 break;
373 case 32:
374 intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.d" : "llvm.x86.avx2.pmaxu.d";
375 break;
376 }
377 } else if (HAVE_LLVM < 0x0309 &&
378 util_cpu_caps.has_sse2 && type.length >= 2) {
379 intr_size = 128;
380 if ((type.width == 8 || type.width == 16) &&
381 (type.width * type.length <= 64) &&
382 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
383 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
384 __FUNCTION__);
385 }
386 if (type.width == 8 && !type.sign) {
387 intrinsic = "llvm.x86.sse2.pmaxu.b";
388 intr_size = 128;
389 }
390 else if (type.width == 16 && type.sign) {
391 intrinsic = "llvm.x86.sse2.pmaxs.w";
392 }
393 if (util_cpu_caps.has_sse4_1) {
394 if (type.width == 8 && type.sign) {
395 intrinsic = "llvm.x86.sse41.pmaxsb";
396 }
397 if (type.width == 16 && !type.sign) {
398 intrinsic = "llvm.x86.sse41.pmaxuw";
399 }
400 if (type.width == 32 && !type.sign) {
401 intrinsic = "llvm.x86.sse41.pmaxud";
402 }
403 if (type.width == 32 && type.sign) {
404 intrinsic = "llvm.x86.sse41.pmaxsd";
405 }
406 }
407 } else if (util_cpu_caps.has_altivec) {
408 intr_size = 128;
409 if (type.width == 8) {
410 if (!type.sign) {
411 intrinsic = "llvm.ppc.altivec.vmaxub";
412 } else {
413 intrinsic = "llvm.ppc.altivec.vmaxsb";
414 }
415 } else if (type.width == 16) {
416 if (!type.sign) {
417 intrinsic = "llvm.ppc.altivec.vmaxuh";
418 } else {
419 intrinsic = "llvm.ppc.altivec.vmaxsh";
420 }
421 } else if (type.width == 32) {
422 if (!type.sign) {
423 intrinsic = "llvm.ppc.altivec.vmaxuw";
424 } else {
425 intrinsic = "llvm.ppc.altivec.vmaxsw";
426 }
427 }
428 }
429
430 if (intrinsic) {
431 if (util_cpu_caps.has_sse && type.floating &&
432 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
433 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
434 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
435 LLVMValueRef isnan, max;
436 max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
437 type,
438 intr_size, a, b);
439 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
440 isnan = lp_build_isnan(bld, b);
441 return lp_build_select(bld, isnan, a, max);
442 } else {
443 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
444 isnan = lp_build_isnan(bld, a);
445 return lp_build_select(bld, isnan, a, max);
446 }
447 } else {
448 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
449 type,
450 intr_size, a, b);
451 }
452 }
453
454 if (type.floating) {
455 switch (nan_behavior) {
456 case GALLIVM_NAN_RETURN_NAN: {
457 LLVMValueRef isnan = lp_build_isnan(bld, b);
458 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
459 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
460 return lp_build_select(bld, cond, a, b);
461 }
462 break;
463 case GALLIVM_NAN_RETURN_OTHER: {
464 LLVMValueRef isnan = lp_build_isnan(bld, a);
465 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
466 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
467 return lp_build_select(bld, cond, a, b);
468 }
469 break;
470 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
471 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
472 return lp_build_select(bld, cond, a, b);
473 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
474 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
475 return lp_build_select(bld, cond, b, a);
476 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
477 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
478 return lp_build_select(bld, cond, a, b);
479 break;
480 default:
481 assert(0);
482 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
483 return lp_build_select(bld, cond, a, b);
484 }
485 } else {
486 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
487 return lp_build_select(bld, cond, a, b);
488 }
489 }
490
491
492 /**
493 * Generate 1 - a, or ~a depending on bld->type.
494 */
495 LLVMValueRef
496 lp_build_comp(struct lp_build_context *bld,
497 LLVMValueRef a)
498 {
499 LLVMBuilderRef builder = bld->gallivm->builder;
500 const struct lp_type type = bld->type;
501
502 assert(lp_check_value(type, a));
503
504 if(a == bld->one)
505 return bld->zero;
506 if(a == bld->zero)
507 return bld->one;
508
509 if(type.norm && !type.floating && !type.fixed && !type.sign) {
510 if(LLVMIsConstant(a))
511 return LLVMConstNot(a);
512 else
513 return LLVMBuildNot(builder, a, "");
514 }
515
516 if(LLVMIsConstant(a))
517 if (type.floating)
518 return LLVMConstFSub(bld->one, a);
519 else
520 return LLVMConstSub(bld->one, a);
521 else
522 if (type.floating)
523 return LLVMBuildFSub(builder, bld->one, a, "");
524 else
525 return LLVMBuildSub(builder, bld->one, a, "");
526 }
527
528
529 /**
530 * Generate a + b
531 */
532 LLVMValueRef
533 lp_build_add(struct lp_build_context *bld,
534 LLVMValueRef a,
535 LLVMValueRef b)
536 {
537 LLVMBuilderRef builder = bld->gallivm->builder;
538 const struct lp_type type = bld->type;
539 LLVMValueRef res;
540
541 assert(lp_check_value(type, a));
542 assert(lp_check_value(type, b));
543
544 if (a == bld->zero)
545 return b;
546 if (b == bld->zero)
547 return a;
548 if (a == bld->undef || b == bld->undef)
549 return bld->undef;
550
551 if (type.norm) {
552 const char *intrinsic = NULL;
553
554 if (!type.sign && (a == bld->one || b == bld->one))
555 return bld->one;
556
557 if (!type.floating && !type.fixed) {
558 if (type.width * type.length == 128) {
559 if (util_cpu_caps.has_sse2) {
560 if (type.width == 8)
561 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" :
562 HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.paddus.b" : NULL;
563 if (type.width == 16)
564 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" :
565 HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.paddus.w" : NULL;
566 } else if (util_cpu_caps.has_altivec) {
567 if (type.width == 8)
568 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
569 if (type.width == 16)
570 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
571 }
572 }
573 if (type.width * type.length == 256) {
574 if (util_cpu_caps.has_avx2) {
575 if (type.width == 8)
576 intrinsic = type.sign ? "llvm.x86.avx2.padds.b" :
577 HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.paddus.b" : NULL;
578 if (type.width == 16)
579 intrinsic = type.sign ? "llvm.x86.avx2.padds.w" :
580 HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.paddus.w" : NULL;
581 }
582 }
583 }
584
585 if (intrinsic)
586 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
587 }
588
589 if(type.norm && !type.floating && !type.fixed) {
590 if (type.sign) {
591 uint64_t sign = (uint64_t)1 << (type.width - 1);
592 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
593 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
594 /* a_clamp_max is the maximum a for positive b,
595 a_clamp_min is the minimum a for negative b. */
596 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
597 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
598 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
599 }
600 }
601
602 if(LLVMIsConstant(a) && LLVMIsConstant(b))
603 if (type.floating)
604 res = LLVMConstFAdd(a, b);
605 else
606 res = LLVMConstAdd(a, b);
607 else
608 if (type.floating)
609 res = LLVMBuildFAdd(builder, a, b, "");
610 else
611 res = LLVMBuildAdd(builder, a, b, "");
612
613 /* clamp to ceiling of 1.0 */
614 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
615 res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
616
617 if (type.norm && !type.floating && !type.fixed) {
618 if (!type.sign) {
619 /*
620 * newer llvm versions no longer support the intrinsics, but recognize
621 * the pattern. Since auto-upgrade of intrinsics doesn't work for jit
622 * code, it is important we match the pattern llvm uses (and pray llvm
623 * doesn't change it - and hope they decide on the same pattern for
624 * all backends supporting it...).
625 * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
626 * interfere with llvm's ability to recognize the pattern but seems
627 * a bit brittle.
628 */
629 LLVMValueRef overflowed = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, res);
630 res = lp_build_select(bld, overflowed,
631 LLVMConstAllOnes(bld->int_vec_type), res);
632 }
633 }
634
635 /* XXX clamp to floor of -1 or 0??? */
636
637 return res;
638 }
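/*
 * Illustrative only (hence #if 0): a scalar sketch, with a hypothetical helper
 * name, of the unsigned saturated-add pattern emitted above, which newer llvm
 * versions recognize and turn into a native saturating add.
 */
#if 0
#include <stdint.h>

static inline uint8_t
u8_add_sat(uint8_t a, uint8_t b)
{
   uint8_t res = (uint8_t)(a + b);
   /* If the wrapped sum is smaller than an operand the add overflowed,
    * so clamp to all-ones (cf. the cmp/select above). */
   return (a > res) ? 0xff : res;
}
#endif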
639
640
641 /** Return the scalar sum of the elements of a.
642 * This operation should be avoided whenever possible.
643 */
644 LLVMValueRef
645 lp_build_horizontal_add(struct lp_build_context *bld,
646 LLVMValueRef a)
647 {
648 LLVMBuilderRef builder = bld->gallivm->builder;
649 const struct lp_type type = bld->type;
650 LLVMValueRef index, res;
651 unsigned i, length;
652 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
653 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
654 LLVMValueRef vecres, elem2;
655
656 assert(lp_check_value(type, a));
657
658 if (type.length == 1) {
659 return a;
660 }
661
662 assert(!bld->type.norm);
663
664 /*
665 * For byte vectors we could do much better with psadbw.
666 * We use repeated shuffle/adds here. Note that with multiple vectors
667 * this can be done more efficiently, as outlined in the Intel
668 * optimization manual.
669 * Note: could cause data rearrangement if used with smaller element
670 * sizes.
671 */
672
673 vecres = a;
674 length = type.length / 2;
675 while (length > 1) {
676 LLVMValueRef vec1, vec2;
677 for (i = 0; i < length; i++) {
678 shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
679 shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
680 }
681 vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
682 LLVMConstVector(shuffles1, length), "");
683 vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
684 LLVMConstVector(shuffles2, length), "");
685 if (type.floating) {
686 vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
687 }
688 else {
689 vecres = LLVMBuildAdd(builder, vec1, vec2, "");
690 }
691 length = length >> 1;
692 }
693
694 /* always have vector of size 2 here */
695 assert(length == 1);
696
697 index = lp_build_const_int32(bld->gallivm, 0);
698 res = LLVMBuildExtractElement(builder, vecres, index, "");
699 index = lp_build_const_int32(bld->gallivm, 1);
700 elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
701
702 if (type.floating)
703 res = LLVMBuildFAdd(builder, res, elem2, "");
704 else
705 res = LLVMBuildAdd(builder, res, elem2, "");
706
707 return res;
708 }
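/*
 * Illustrative only (hence #if 0): a scalar sketch, with a hypothetical helper
 * name, of the log2 shuffle/add reduction performed above, for a power-of-two
 * length >= 2.
 */
#if 0
static float
horizontal_add_ref(const float *v, unsigned length)
{
   float tmp[LP_MAX_VECTOR_LENGTH];
   unsigned i;
   for (i = 0; i < length; i++)
      tmp[i] = v[i];
   /* Repeatedly fold the upper half onto the lower half. */
   while (length > 2) {
      length /= 2;
      for (i = 0; i < length; i++)
         tmp[i] += tmp[i + length];
   }
   return tmp[0] + tmp[1];
}
#endif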
709
710 /**
711 * Return the horizontal sums of 4 float vectors as a float4 vector.
712 * This uses the technique outlined in the Intel Optimization Manual.
713 */
714 static LLVMValueRef
715 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
716 LLVMValueRef src[4])
717 {
718 struct gallivm_state *gallivm = bld->gallivm;
719 LLVMBuilderRef builder = gallivm->builder;
720 LLVMValueRef shuffles[4];
721 LLVMValueRef tmp[4];
722 LLVMValueRef sumtmp[2], shuftmp[2];
723
724 /* lower half of regs */
725 shuffles[0] = lp_build_const_int32(gallivm, 0);
726 shuffles[1] = lp_build_const_int32(gallivm, 1);
727 shuffles[2] = lp_build_const_int32(gallivm, 4);
728 shuffles[3] = lp_build_const_int32(gallivm, 5);
729 tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
730 LLVMConstVector(shuffles, 4), "");
731 tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
732 LLVMConstVector(shuffles, 4), "");
733
734 /* upper half of regs */
735 shuffles[0] = lp_build_const_int32(gallivm, 2);
736 shuffles[1] = lp_build_const_int32(gallivm, 3);
737 shuffles[2] = lp_build_const_int32(gallivm, 6);
738 shuffles[3] = lp_build_const_int32(gallivm, 7);
739 tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
740 LLVMConstVector(shuffles, 4), "");
741 tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
742 LLVMConstVector(shuffles, 4), "");
743
744 sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
745 sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
746
747 shuffles[0] = lp_build_const_int32(gallivm, 0);
748 shuffles[1] = lp_build_const_int32(gallivm, 2);
749 shuffles[2] = lp_build_const_int32(gallivm, 4);
750 shuffles[3] = lp_build_const_int32(gallivm, 6);
751 shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
752 LLVMConstVector(shuffles, 4), "");
753
754 shuffles[0] = lp_build_const_int32(gallivm, 1);
755 shuffles[1] = lp_build_const_int32(gallivm, 3);
756 shuffles[2] = lp_build_const_int32(gallivm, 5);
757 shuffles[3] = lp_build_const_int32(gallivm, 7);
758 shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
759 LLVMConstVector(shuffles, 4), "");
760
761 return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
762 }
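/*
 * Illustrative only (hence #if 0): a scalar reference, with a hypothetical
 * helper name, for what the shuffle network above computes: lane i of the
 * result is the horizontal sum of the four elements of src[i].
 */
#if 0
static void
horizontal_add4x4f_ref(const float src[4][4], float dst[4])
{
   unsigned i;
   for (i = 0; i < 4; i++)
      dst[i] = src[i][0] + src[i][1] + src[i][2] + src[i][3];
}
#endif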
763
764
765 /*
766 * partially horizontally add 2-4 float vectors with length nx4,
767 * i.e. only four adjacent values in each vector will be added,
768 * assuming values are really grouped in 4 which also determines
769 * output order.
770 *
771 * Return a vector of the same length as the initial vectors,
772 * with the excess elements (if any) being undefined.
773 * The element order is independent of number of input vectors.
774 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
775 * the output order thus will be
776 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
777 */
778 LLVMValueRef
779 lp_build_hadd_partial4(struct lp_build_context *bld,
780 LLVMValueRef vectors[],
781 unsigned num_vecs)
782 {
783 struct gallivm_state *gallivm = bld->gallivm;
784 LLVMBuilderRef builder = gallivm->builder;
785 LLVMValueRef ret_vec;
786 LLVMValueRef tmp[4];
787 const char *intrinsic = NULL;
788
789 assert(num_vecs >= 2 && num_vecs <= 4);
790 assert(bld->type.floating);
791
792 /* only use this with at least 2 vectors, as it is sort of expensive
793 * (depending on cpu) and we always need two horizontal adds anyway,
794 * so a shuffle/add approach might be better.
795 */
796
797 tmp[0] = vectors[0];
798 tmp[1] = vectors[1];
799
800 tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
801 tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
802
803 if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
804 bld->type.length == 4) {
805 intrinsic = "llvm.x86.sse3.hadd.ps";
806 }
807 else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
808 bld->type.length == 8) {
809 intrinsic = "llvm.x86.avx.hadd.ps.256";
810 }
811 if (intrinsic) {
812 tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
813 lp_build_vec_type(gallivm, bld->type),
814 tmp[0], tmp[1]);
815 if (num_vecs > 2) {
816 tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
817 lp_build_vec_type(gallivm, bld->type),
818 tmp[2], tmp[3]);
819 }
820 else {
821 tmp[1] = tmp[0];
822 }
823 return lp_build_intrinsic_binary(builder, intrinsic,
824 lp_build_vec_type(gallivm, bld->type),
825 tmp[0], tmp[1]);
826 }
827
828 if (bld->type.length == 4) {
829 ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
830 }
831 else {
832 LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
833 unsigned j;
834 unsigned num_iter = bld->type.length / 4;
835 struct lp_type parttype = bld->type;
836 parttype.length = 4;
837 for (j = 0; j < num_iter; j++) {
838 LLVMValueRef partsrc[4];
839 unsigned i;
840 for (i = 0; i < 4; i++) {
841 partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
842 }
843 partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
844 }
845 ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
846 }
847 return ret_vec;
848 }
849
850 /**
851 * Generate a - b
852 */
853 LLVMValueRef
854 lp_build_sub(struct lp_build_context *bld,
855 LLVMValueRef a,
856 LLVMValueRef b)
857 {
858 LLVMBuilderRef builder = bld->gallivm->builder;
859 const struct lp_type type = bld->type;
860 LLVMValueRef res;
861
862 assert(lp_check_value(type, a));
863 assert(lp_check_value(type, b));
864
865 if (b == bld->zero)
866 return a;
867 if (a == bld->undef || b == bld->undef)
868 return bld->undef;
869 if (a == b)
870 return bld->zero;
871
872 if (type.norm) {
873 const char *intrinsic = NULL;
874
875 if (!type.sign && b == bld->one)
876 return bld->zero;
877
878 if (!type.floating && !type.fixed) {
879 if (type.width * type.length == 128) {
880 if (util_cpu_caps.has_sse2) {
881 if (type.width == 8)
882 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" :
883 HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.psubus.b" : NULL;
884 if (type.width == 16)
885 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" :
886 HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.psubus.w" : NULL;
887 } else if (util_cpu_caps.has_altivec) {
888 if (type.width == 8)
889 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
890 if (type.width == 16)
891 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
892 }
893 }
894 if (type.width * type.length == 256) {
895 if (util_cpu_caps.has_avx2) {
896 if (type.width == 8)
897 intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" :
898 HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.psubus.b" : NULL;
899 if (type.width == 16)
900 intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" :
901 HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.psubus.w" : NULL;
902 }
903 }
904 }
905
906 if (intrinsic)
907 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
908 }
909
910 if(type.norm && !type.floating && !type.fixed) {
911 if (type.sign) {
912 uint64_t sign = (uint64_t)1 << (type.width - 1);
913 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
914 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
915 /* a_clamp_max is the maximum a for negative b,
916 a_clamp_min is the minimum a for positive b. */
917 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
918 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
919 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
920 } else {
921 /*
922 * This must match the llvm pattern for saturated unsigned subtraction.
923 * (lp_build_max_simple actually does the job with its current
924 * definition but do it explicitly here.)
925 * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
926 * interfere with llvm's ability to recognize the pattern but seems
927 * a bit brittle.
928 */
929 LLVMValueRef no_ov = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
930 a = lp_build_select(bld, no_ov, a, b);
931 }
932 }
933
934 if(LLVMIsConstant(a) && LLVMIsConstant(b))
935 if (type.floating)
936 res = LLVMConstFSub(a, b);
937 else
938 res = LLVMConstSub(a, b);
939 else
940 if (type.floating)
941 res = LLVMBuildFSub(builder, a, b, "");
942 else
943 res = LLVMBuildSub(builder, a, b, "");
944
945 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
946 res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
947
948 return res;
949 }
950
951
952
953 /**
954 * Normalized multiplication.
955 *
956 * There are several approaches (using 8-bit normalized multiplication as
957 * an example):
958 *
959 * - alpha plus one
960 *
961 * makes the following approximation to the division (Sree)
962 *
963 * a*b/255 ~= (a*(b + 1)) >> 8
964 *
965 * which is the fastest method that satisfies the following OpenGL criteria:
966 *
967 * 0*0 = 0 and 255*255 = 255
968 *
969 * - geometric series
970 *
971 * takes the geometric series approximation to the division
972 *
973 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
974 *
975 * in this case just the first two terms to fit in 16bit arithmetic
976 *
977 * t/255 ~= (t + (t >> 8)) >> 8
978 *
979 * note that just by itself it doesn't satisfy the OpenGL criteria, as
980 * 255*255 = 254, so the special case b = 255 must be accounted for, or
981 * rounding must be used.
982 *
983 * - geometric series plus rounding
984 *
985 * when using a geometric series division instead of truncating the result,
986 * use rounding in the approximation (Jim Blinn)
987 *
988 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
989 *
990 * achieving exact results (a scalar sketch follows lp_build_mul_norm below).
991 *
992 *
993 *
994 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
995 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
996 * @sa Michael Herf, The "double blend trick", May 2000,
997 * http://www.stereopsis.com/doubleblend.html
998 */
999 LLVMValueRef
1000 lp_build_mul_norm(struct gallivm_state *gallivm,
1001 struct lp_type wide_type,
1002 LLVMValueRef a, LLVMValueRef b)
1003 {
1004 LLVMBuilderRef builder = gallivm->builder;
1005 struct lp_build_context bld;
1006 unsigned n;
1007 LLVMValueRef half;
1008 LLVMValueRef ab;
1009
1010 assert(!wide_type.floating);
1011 assert(lp_check_value(wide_type, a));
1012 assert(lp_check_value(wide_type, b));
1013
1014 lp_build_context_init(&bld, gallivm, wide_type);
1015
1016 n = wide_type.width / 2;
1017 if (wide_type.sign) {
1018 --n;
1019 }
1020
1021 /*
1022 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
1023 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
1024 */
1025
1026 /*
1027 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
1028 */
1029
1030 ab = LLVMBuildMul(builder, a, b, "");
1031 ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
1032
1033 /*
1034 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
1035 */
1036
1037 half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
1038 if (wide_type.sign) {
1039 LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
1040 LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
1041 half = lp_build_select(&bld, sign, minus_half, half);
1042 }
1043 ab = LLVMBuildAdd(builder, ab, half, "");
1044
1045 /* Final division */
1046 ab = lp_build_shr_imm(&bld, ab, n);
1047
1048 return ab;
1049 }
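/*
 * Illustrative only (hence #if 0): a scalar sketch, with a hypothetical helper
 * name, of the formula above for 8-bit unsigned normalized values (n = 8,
 * half = 0x80), i.e. Blinn's rounding trick; in particular 0*0 = 0 and
 * 255*255 = 255 hold as required.
 */
#if 0
#include <stdint.h>

static inline uint8_t
mul_norm_u8(uint8_t a, uint8_t b)
{
   uint32_t ab = (uint32_t)a * b;   /* widened product        */
   ab = ab + (ab >> 8);             /* geometric series terms */
   ab = ab + 0x80;                  /* rounding term          */
   return (uint8_t)(ab >> 8);       /* final division         */
}
#endif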
1050
1051 /**
1052 * Generate a * b
1053 */
1054 LLVMValueRef
1055 lp_build_mul(struct lp_build_context *bld,
1056 LLVMValueRef a,
1057 LLVMValueRef b)
1058 {
1059 LLVMBuilderRef builder = bld->gallivm->builder;
1060 const struct lp_type type = bld->type;
1061 LLVMValueRef shift;
1062 LLVMValueRef res;
1063
1064 assert(lp_check_value(type, a));
1065 assert(lp_check_value(type, b));
1066
1067 if(a == bld->zero)
1068 return bld->zero;
1069 if(a == bld->one)
1070 return b;
1071 if(b == bld->zero)
1072 return bld->zero;
1073 if(b == bld->one)
1074 return a;
1075 if(a == bld->undef || b == bld->undef)
1076 return bld->undef;
1077
1078 if (!type.floating && !type.fixed && type.norm) {
1079 struct lp_type wide_type = lp_wider_type(type);
1080 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
1081
1082 lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah);
1083 lp_build_unpack2_native(bld->gallivm, type, wide_type, b, &bl, &bh);
1084
1085 /* PMULLW, PSRLW, PADDW */
1086 abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
1087 abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
1088
1089 ab = lp_build_pack2_native(bld->gallivm, wide_type, type, abl, abh);
1090
1091 return ab;
1092 }
1093
1094 if(type.fixed)
1095 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
1096 else
1097 shift = NULL;
1098
1099 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1100 if (type.floating)
1101 res = LLVMConstFMul(a, b);
1102 else
1103 res = LLVMConstMul(a, b);
1104 if(shift) {
1105 if(type.sign)
1106 res = LLVMConstAShr(res, shift);
1107 else
1108 res = LLVMConstLShr(res, shift);
1109 }
1110 }
1111 else {
1112 if (type.floating)
1113 res = LLVMBuildFMul(builder, a, b, "");
1114 else
1115 res = LLVMBuildMul(builder, a, b, "");
1116 if(shift) {
1117 if(type.sign)
1118 res = LLVMBuildAShr(builder, res, shift, "");
1119 else
1120 res = LLVMBuildLShr(builder, res, shift, "");
1121 }
1122 }
1123
1124 return res;
1125 }
1126
1127 /*
1128 * Widening mul, valid for 32x32 bit -> 64bit only.
1129 * Result is low 32bits, high bits returned in res_hi.
1130 *
1131 * Emits code that is meant to be compiled for the host CPU.
1132 */
1133 LLVMValueRef
1134 lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
1135 LLVMValueRef a,
1136 LLVMValueRef b,
1137 LLVMValueRef *res_hi)
1138 {
1139 struct gallivm_state *gallivm = bld->gallivm;
1140 LLVMBuilderRef builder = gallivm->builder;
1141
1142 assert(bld->type.width == 32);
1143 assert(bld->type.floating == 0);
1144 assert(bld->type.fixed == 0);
1145 assert(bld->type.norm == 0);
1146
1147 /*
1148 * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces
1149 * for x86 simd is atrocious (even if the high bits weren't required),
1150 * trying to handle real 64bit inputs (which of course can't happen due
1151 * to using 64bit umul with 32bit numbers zero-extended to 64bit, but
1152 * apparently llvm does not recognize this widening mul). This includes 6
1153 * (instead of 2) pmuludq instructions plus extra adds and shifts.
1154 * The same story applies to signed mul, albeit fixing this requires sse41.
1155 * https://llvm.org/bugs/show_bug.cgi?id=30845
1156 * So, whip up our own code, albeit only for length 4 and 8 (which
1157 * should be good enough)...
1158 */
1159 if ((bld->type.length == 4 || bld->type.length == 8) &&
1160 ((util_cpu_caps.has_sse2 && (bld->type.sign == 0)) ||
1161 util_cpu_caps.has_sse4_1)) {
1162 const char *intrinsic = NULL;
1163 LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd;
1164 LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec;
1165 struct lp_type type_wide = lp_wider_type(bld->type);
1166 LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide);
1167 unsigned i;
1168 for (i = 0; i < bld->type.length; i += 2) {
1169 shuf[i] = lp_build_const_int32(gallivm, i+1);
1170 shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
1171 }
1172 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1173 aeven = a;
1174 beven = b;
1175 aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, "");
1176 bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, "");
1177
1178 if (util_cpu_caps.has_avx2 && bld->type.length == 8) {
1179 if (bld->type.sign) {
1180 intrinsic = "llvm.x86.avx2.pmul.dq";
1181 } else {
1182 intrinsic = "llvm.x86.avx2.pmulu.dq";
1183 }
1184 muleven = lp_build_intrinsic_binary(builder, intrinsic,
1185 wider_type, aeven, beven);
1186 mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1187 wider_type, aodd, bodd);
1188 }
1189 else {
1190 /* for consistent naming look elsewhere... */
1191 if (bld->type.sign) {
1192 intrinsic = "llvm.x86.sse41.pmuldq";
1193 } else {
1194 intrinsic = "llvm.x86.sse2.pmulu.dq";
1195 }
1196 /*
1197 * XXX If we only have AVX but not AVX2 this is a pain.
1198 * lp_build_intrinsic_binary_anylength() can't handle it
1199 * (due to src and dst type not being identical).
1200 */
1201 if (bld->type.length == 8) {
1202 LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi;
1203 LLVMValueRef aoddlo, aoddhi, boddlo, boddhi;
1204 LLVMValueRef muleven2[2], mulodd2[2];
1205 struct lp_type type_wide_half = type_wide;
1206 LLVMTypeRef wtype_half;
1207 type_wide_half.length = 2;
1208 wtype_half = lp_build_vec_type(gallivm, type_wide_half);
1209 aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4);
1210 aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4);
1211 bevenlo = lp_build_extract_range(gallivm, beven, 0, 4);
1212 bevenhi = lp_build_extract_range(gallivm, beven, 4, 4);
1213 aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4);
1214 aoddhi = lp_build_extract_range(gallivm, aodd, 4, 4);
1215 boddlo = lp_build_extract_range(gallivm, bodd, 0, 4);
1216 boddhi = lp_build_extract_range(gallivm, bodd, 4, 4);
1217 muleven2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1218 wtype_half, aevenlo, bevenlo);
1219 mulodd2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1220 wtype_half, aoddlo, boddlo);
1221 muleven2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1222 wtype_half, aevenhi, bevenhi);
1223 mulodd2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1224 wtype_half, aoddhi, boddhi);
1225 muleven = lp_build_concat(gallivm, muleven2, type_wide_half, 2);
1226 mulodd = lp_build_concat(gallivm, mulodd2, type_wide_half, 2);
1227
1228 }
1229 else {
1230 muleven = lp_build_intrinsic_binary(builder, intrinsic,
1231 wider_type, aeven, beven);
1232 mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1233 wider_type, aodd, bodd);
1234 }
1235 }
1236 muleven = LLVMBuildBitCast(builder, muleven, bld->vec_type, "");
1237 mulodd = LLVMBuildBitCast(builder, mulodd, bld->vec_type, "");
1238
1239 for (i = 0; i < bld->type.length; i += 2) {
1240 shuf[i] = lp_build_const_int32(gallivm, i + 1);
1241 shuf[i+1] = lp_build_const_int32(gallivm, i + 1 + bld->type.length);
1242 }
1243 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1244 *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1245
1246 for (i = 0; i < bld->type.length; i += 2) {
1247 shuf[i] = lp_build_const_int32(gallivm, i);
1248 shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length);
1249 }
1250 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1251 return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1252 }
1253 else {
1254 return lp_build_mul_32_lohi(bld, a, b, res_hi);
1255 }
1256 }
1257
1258
1259 /*
1260 * Widening mul, valid for 32x32 bit -> 64bit only.
1261 * Result is low 32bits, high bits returned in res_hi.
1262 *
1263 * Emits generic code.
1264 */
1265 LLVMValueRef
1266 lp_build_mul_32_lohi(struct lp_build_context *bld,
1267 LLVMValueRef a,
1268 LLVMValueRef b,
1269 LLVMValueRef *res_hi)
1270 {
1271 struct gallivm_state *gallivm = bld->gallivm;
1272 LLVMBuilderRef builder = gallivm->builder;
1273 LLVMValueRef tmp, shift, res_lo;
1274 struct lp_type type_tmp;
1275 LLVMTypeRef wide_type, narrow_type;
1276
1277 type_tmp = bld->type;
1278 narrow_type = lp_build_vec_type(gallivm, type_tmp);
1279 type_tmp.width *= 2;
1280 wide_type = lp_build_vec_type(gallivm, type_tmp);
1281 shift = lp_build_const_vec(gallivm, type_tmp, 32);
1282
1283 if (bld->type.sign) {
1284 a = LLVMBuildSExt(builder, a, wide_type, "");
1285 b = LLVMBuildSExt(builder, b, wide_type, "");
1286 } else {
1287 a = LLVMBuildZExt(builder, a, wide_type, "");
1288 b = LLVMBuildZExt(builder, b, wide_type, "");
1289 }
1290 tmp = LLVMBuildMul(builder, a, b, "");
1291
1292 res_lo = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1293
1294 /* Since we truncate anyway, LShr and AShr are equivalent. */
1295 tmp = LLVMBuildLShr(builder, tmp, shift, "");
1296 *res_hi = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1297
1298 return res_lo;
1299 }
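/*
 * Illustrative only (hence #if 0): a scalar sketch, with a hypothetical helper
 * name, of the generic zext/mul/shift/trunc sequence above, for the unsigned
 * case.
 */
#if 0
#include <stdint.h>

static inline uint32_t
mul_32_lohi_ref(uint32_t a, uint32_t b, uint32_t *res_hi)
{
   uint64_t wide = (uint64_t)a * b;   /* full 64-bit product */
   *res_hi = (uint32_t)(wide >> 32);  /* high 32 bits        */
   return (uint32_t)wide;             /* low 32 bits         */
}
#endif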
1300
1301
1302 /* a * b + c */
1303 LLVMValueRef
1304 lp_build_mad(struct lp_build_context *bld,
1305 LLVMValueRef a,
1306 LLVMValueRef b,
1307 LLVMValueRef c)
1308 {
1309 const struct lp_type type = bld->type;
1310 if (type.floating) {
1311 return lp_build_fmuladd(bld->gallivm->builder, a, b, c);
1312 } else {
1313 return lp_build_add(bld, lp_build_mul(bld, a, b), c);
1314 }
1315 }
1316
1317
1318 /**
1319 * Small vector x scale multiplication optimization.
1320 */
1321 LLVMValueRef
1322 lp_build_mul_imm(struct lp_build_context *bld,
1323 LLVMValueRef a,
1324 int b)
1325 {
1326 LLVMBuilderRef builder = bld->gallivm->builder;
1327 LLVMValueRef factor;
1328
1329 assert(lp_check_value(bld->type, a));
1330
1331 if(b == 0)
1332 return bld->zero;
1333
1334 if(b == 1)
1335 return a;
1336
1337 if(b == -1)
1338 return lp_build_negate(bld, a);
1339
1340 if(b == 2 && bld->type.floating)
1341 return lp_build_add(bld, a, a);
1342
1343 if(util_is_power_of_two_or_zero(b)) {
1344 unsigned shift = ffs(b) - 1;
1345
1346 if(bld->type.floating) {
1347 #if 0
1348 /*
1349 * Power of two multiplication by directly manipulating the exponent.
1350 *
1351 * XXX: This might not always be faster; it will introduce a small error
1352 * for multiplication by zero, and it will produce wrong results
1353 * for Inf and NaN.
1354 */
1355 unsigned mantissa = lp_mantissa(bld->type);
1356 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1357 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1358 a = LLVMBuildAdd(builder, a, factor, "");
1359 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1360 return a;
1361 #endif
1362 }
1363 else {
1364 factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1365 return LLVMBuildShl(builder, a, factor, "");
1366 }
1367 }
1368
1369 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1370 return lp_build_mul(bld, a, factor);
1371 }
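/*
 * Illustrative only (hence #if 0): a scalar sketch, with a hypothetical helper
 * name, of the disabled exponent trick above: for a finite, non-zero float
 * (and no exponent overflow/underflow), multiplying by 2**k amounts to adding
 * k to the biased exponent, i.e. adding k << 23 to the bit pattern. As noted
 * above, this is wrong for 0, Inf and NaN.
 */
#if 0
#include <stdint.h>
#include <string.h>

static inline float
mul_pow2_sketch(float x, int k)
{
   uint32_t bits;
   memcpy(&bits, &x, sizeof bits);
   bits += (uint32_t)k << 23;        /* bump the biased exponent by k */
   memcpy(&x, &bits, sizeof bits);
   return x;
}
#endif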
1372
1373
1374 /**
1375 * Generate a / b
1376 */
1377 LLVMValueRef
1378 lp_build_div(struct lp_build_context *bld,
1379 LLVMValueRef a,
1380 LLVMValueRef b)
1381 {
1382 LLVMBuilderRef builder = bld->gallivm->builder;
1383 const struct lp_type type = bld->type;
1384
1385 assert(lp_check_value(type, a));
1386 assert(lp_check_value(type, b));
1387
1388 if(a == bld->zero)
1389 return bld->zero;
1390 if(a == bld->one && type.floating)
1391 return lp_build_rcp(bld, b);
1392 if(b == bld->zero)
1393 return bld->undef;
1394 if(b == bld->one)
1395 return a;
1396 if(a == bld->undef || b == bld->undef)
1397 return bld->undef;
1398
1399 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1400 if (type.floating)
1401 return LLVMConstFDiv(a, b);
1402 else if (type.sign)
1403 return LLVMConstSDiv(a, b);
1404 else
1405 return LLVMConstUDiv(a, b);
1406 }
1407
1408 /* fast rcp is disabled (just uses div), so makes no sense to try that */
1409 if(FALSE &&
1410 ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1411 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
1412 type.floating)
1413 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1414
1415 if (type.floating)
1416 return LLVMBuildFDiv(builder, a, b, "");
1417 else if (type.sign)
1418 return LLVMBuildSDiv(builder, a, b, "");
1419 else
1420 return LLVMBuildUDiv(builder, a, b, "");
1421 }
1422
1423
1424 /**
1425 * Linear interpolation helper.
1426 *
1427 * @param flags bitmask of LP_BLD_LERP_WIDE_NORMALIZED (normalized values
1428 * encoded in integers twice as wide) and LP_BLD_LERP_PRESCALED_WEIGHTS.
1429 *
1430 * @sa http://www.stereopsis.com/doubleblend.html
1431 */
1432 static inline LLVMValueRef
1433 lp_build_lerp_simple(struct lp_build_context *bld,
1434 LLVMValueRef x,
1435 LLVMValueRef v0,
1436 LLVMValueRef v1,
1437 unsigned flags)
1438 {
1439 unsigned half_width = bld->type.width/2;
1440 LLVMBuilderRef builder = bld->gallivm->builder;
1441 LLVMValueRef delta;
1442 LLVMValueRef res;
1443
1444 assert(lp_check_value(bld->type, x));
1445 assert(lp_check_value(bld->type, v0));
1446 assert(lp_check_value(bld->type, v1));
1447
1448 delta = lp_build_sub(bld, v1, v0);
1449
1450 if (bld->type.floating) {
1451 assert(flags == 0);
1452 return lp_build_mad(bld, x, delta, v0);
1453 }
1454
1455 if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1456 if (!bld->type.sign) {
1457 if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1458 /*
1459 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1460 * most-significant-bit to the lowest-significant-bit, so that
1461 * later we can just divide by 2**n instead of 2**n - 1.
1462 */
1463
1464 x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1465 }
1466
1467 /* (x * delta) >> n */
1468 res = lp_build_mul(bld, x, delta);
1469 res = lp_build_shr_imm(bld, res, half_width);
1470 } else {
1471 /*
1472 * The rescaling trick above doesn't work for signed numbers, so
1473 * use the 2**n - 1 division approximation in lp_build_mul_norm
1474 * instead.
1475 */
1476 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1477 res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1478 }
1479 } else {
1480 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1481 res = lp_build_mul(bld, x, delta);
1482 }
1483
1484 if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
1485 /*
1486 * At this point both res and v0 only use the lower half of the bits,
1487 * the rest is zero. Instead of add / mask, do add with half wide type.
1488 */
1489 struct lp_type narrow_type;
1490 struct lp_build_context narrow_bld;
1491
1492 memset(&narrow_type, 0, sizeof narrow_type);
1493 narrow_type.sign = bld->type.sign;
1494 narrow_type.width = bld->type.width/2;
1495 narrow_type.length = bld->type.length*2;
1496
1497 lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
1498 res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
1499 v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
1500 res = lp_build_add(&narrow_bld, v0, res);
1501 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
1502 } else {
1503 res = lp_build_add(bld, v0, res);
1504
1505 if (bld->type.fixed) {
1506 /*
1507 * We need to mask out the high order bits when lerping 8-bit
1508 * normalized colors stored in 16 bits.
1509 */
1510 /* XXX: This step is necessary for lerping 8-bit colors stored in
1511 * 16 bits, but it will be wrong for true fixed point use cases.
1512 * Basically we need a more powerful lp_type, capable of further
1513 * distinguishing the values interpretation from the value storage.
1514 */
1515 LLVMValueRef low_bits;
1516 low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
1517 res = LLVMBuildAnd(builder, res, low_bits, "");
1518 }
1519 }
1520
1521 return res;
1522 }
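/*
 * Illustrative only (hence #if 0): a scalar sketch, with a hypothetical helper
 * name, of the unsigned LP_BLD_LERP_WIDE_NORMALIZED path above, with 8-bit
 * values and weights held in 16-bit lanes (half_width = 8). The real code
 * works on packed vectors with modular arithmetic; this just shows the math.
 * Assumes an arithmetic right shift for negative values.
 */
#if 0
#include <stdint.h>

static inline uint8_t
lerp_u8_sketch(uint8_t x8, uint8_t v0, uint8_t v1)
{
   int32_t x = x8;
   int32_t delta = (int32_t)v1 - (int32_t)v0;
   x = x + (x >> 7);                            /* [0, 255] -> [0, 256] */
   return (uint8_t)(v0 + ((x * delta) >> 8));   /* v0 + x * (v1 - v0)   */
}
#endif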
1523
1524
1525 /**
1526 * Linear interpolation.
1527 */
1528 LLVMValueRef
1529 lp_build_lerp(struct lp_build_context *bld,
1530 LLVMValueRef x,
1531 LLVMValueRef v0,
1532 LLVMValueRef v1,
1533 unsigned flags)
1534 {
1535 const struct lp_type type = bld->type;
1536 LLVMValueRef res;
1537
1538 assert(lp_check_value(type, x));
1539 assert(lp_check_value(type, v0));
1540 assert(lp_check_value(type, v1));
1541
1542 assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1543
1544 if (type.norm) {
1545 struct lp_type wide_type;
1546 struct lp_build_context wide_bld;
1547 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1548
1549 assert(type.length >= 2);
1550
1551 /*
1552 * Create a wider integer type, enough to hold the
1553 * intermediate result of the multiplication.
1554 */
1555 memset(&wide_type, 0, sizeof wide_type);
1556 wide_type.sign = type.sign;
1557 wide_type.width = type.width*2;
1558 wide_type.length = type.length/2;
1559
1560 lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1561
1562 lp_build_unpack2_native(bld->gallivm, type, wide_type, x, &xl, &xh);
1563 lp_build_unpack2_native(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1564 lp_build_unpack2_native(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1565
1566 /*
1567 * Lerp both halves.
1568 */
1569
1570 flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1571
1572 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1573 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1574
1575 res = lp_build_pack2_native(bld->gallivm, wide_type, type, resl, resh);
1576 } else {
1577 res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1578 }
1579
1580 return res;
1581 }
1582
1583
1584 /**
1585 * Bilinear interpolation.
1586 *
1587 * Value indices are in v_{yx} order.
1588 */
1589 LLVMValueRef
1590 lp_build_lerp_2d(struct lp_build_context *bld,
1591 LLVMValueRef x,
1592 LLVMValueRef y,
1593 LLVMValueRef v00,
1594 LLVMValueRef v01,
1595 LLVMValueRef v10,
1596 LLVMValueRef v11,
1597 unsigned flags)
1598 {
1599 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1600 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1601 return lp_build_lerp(bld, y, v0, v1, flags);
1602 }
1603
1604
1605 LLVMValueRef
1606 lp_build_lerp_3d(struct lp_build_context *bld,
1607 LLVMValueRef x,
1608 LLVMValueRef y,
1609 LLVMValueRef z,
1610 LLVMValueRef v000,
1611 LLVMValueRef v001,
1612 LLVMValueRef v010,
1613 LLVMValueRef v011,
1614 LLVMValueRef v100,
1615 LLVMValueRef v101,
1616 LLVMValueRef v110,
1617 LLVMValueRef v111,
1618 unsigned flags)
1619 {
1620 LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1621 LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1622 return lp_build_lerp(bld, z, v0, v1, flags);
1623 }
1624
1625
1626 /**
1627 * Generate min(a, b)
1628 * Do checks for special cases, but not for NaNs.
1629 */
1630 LLVMValueRef
1631 lp_build_min(struct lp_build_context *bld,
1632 LLVMValueRef a,
1633 LLVMValueRef b)
1634 {
1635 assert(lp_check_value(bld->type, a));
1636 assert(lp_check_value(bld->type, b));
1637
1638 if(a == bld->undef || b == bld->undef)
1639 return bld->undef;
1640
1641 if(a == b)
1642 return a;
1643
1644 if (bld->type.norm) {
1645 if (!bld->type.sign) {
1646 if (a == bld->zero || b == bld->zero) {
1647 return bld->zero;
1648 }
1649 }
1650 if(a == bld->one)
1651 return b;
1652 if(b == bld->one)
1653 return a;
1654 }
1655
1656 return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1657 }
1658
1659
1660 /**
1661 * Generate min(a, b)
1662 * NaNs are handled according to the behavior specified by the
1663 * nan_behavior argument.
1664 */
1665 LLVMValueRef
1666 lp_build_min_ext(struct lp_build_context *bld,
1667 LLVMValueRef a,
1668 LLVMValueRef b,
1669 enum gallivm_nan_behavior nan_behavior)
1670 {
1671 assert(lp_check_value(bld->type, a));
1672 assert(lp_check_value(bld->type, b));
1673
1674 if(a == bld->undef || b == bld->undef)
1675 return bld->undef;
1676
1677 if(a == b)
1678 return a;
1679
1680 if (bld->type.norm) {
1681 if (!bld->type.sign) {
1682 if (a == bld->zero || b == bld->zero) {
1683 return bld->zero;
1684 }
1685 }
1686 if(a == bld->one)
1687 return b;
1688 if(b == bld->one)
1689 return a;
1690 }
1691
1692 return lp_build_min_simple(bld, a, b, nan_behavior);
1693 }
1694
1695 /**
1696 * Generate max(a, b)
1697 * Do checks for special cases, but NaN behavior is undefined.
1698 */
1699 LLVMValueRef
1700 lp_build_max(struct lp_build_context *bld,
1701 LLVMValueRef a,
1702 LLVMValueRef b)
1703 {
1704 assert(lp_check_value(bld->type, a));
1705 assert(lp_check_value(bld->type, b));
1706
1707 if(a == bld->undef || b == bld->undef)
1708 return bld->undef;
1709
1710 if(a == b)
1711 return a;
1712
1713 if(bld->type.norm) {
1714 if(a == bld->one || b == bld->one)
1715 return bld->one;
1716 if (!bld->type.sign) {
1717 if (a == bld->zero) {
1718 return b;
1719 }
1720 if (b == bld->zero) {
1721 return a;
1722 }
1723 }
1724 }
1725
1726 return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1727 }
1728
1729
1730 /**
1731 * Generate max(a, b)
1732 * Checks for special cases.
1733 * NaNs are handled according to the behavior specified by the
1734 * nan_behavior argument.
1735 */
1736 LLVMValueRef
1737 lp_build_max_ext(struct lp_build_context *bld,
1738 LLVMValueRef a,
1739 LLVMValueRef b,
1740 enum gallivm_nan_behavior nan_behavior)
1741 {
1742 assert(lp_check_value(bld->type, a));
1743 assert(lp_check_value(bld->type, b));
1744
1745 if(a == bld->undef || b == bld->undef)
1746 return bld->undef;
1747
1748 if(a == b)
1749 return a;
1750
1751 if(bld->type.norm) {
1752 if(a == bld->one || b == bld->one)
1753 return bld->one;
1754 if (!bld->type.sign) {
1755 if (a == bld->zero) {
1756 return b;
1757 }
1758 if (b == bld->zero) {
1759 return a;
1760 }
1761 }
1762 }
1763
1764 return lp_build_max_simple(bld, a, b, nan_behavior);
1765 }
1766
1767 /**
1768 * Generate clamp(a, min, max)
1769 * NaN behavior (for any of a, min, max) is undefined.
1770 * Do checks for special cases.
1771 */
1772 LLVMValueRef
1773 lp_build_clamp(struct lp_build_context *bld,
1774 LLVMValueRef a,
1775 LLVMValueRef min,
1776 LLVMValueRef max)
1777 {
1778 assert(lp_check_value(bld->type, a));
1779 assert(lp_check_value(bld->type, min));
1780 assert(lp_check_value(bld->type, max));
1781
1782 a = lp_build_min(bld, a, max);
1783 a = lp_build_max(bld, a, min);
1784 return a;
1785 }
1786
1787
1788 /**
1789 * Generate clamp(a, 0, 1)
1790 * A NaN will get converted to zero.
1791 */
1792 LLVMValueRef
1793 lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
1794 LLVMValueRef a)
1795 {
1796 a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1797 a = lp_build_min(bld, a, bld->one);
1798 return a;
1799 }
1800
1801
1802 /**
1803 * Generate abs(a)
1804 */
1805 LLVMValueRef
1806 lp_build_abs(struct lp_build_context *bld,
1807 LLVMValueRef a)
1808 {
1809 LLVMBuilderRef builder = bld->gallivm->builder;
1810 const struct lp_type type = bld->type;
1811 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1812
1813 assert(lp_check_value(type, a));
1814
1815 if(!type.sign)
1816 return a;
1817
1818 if(type.floating) {
1819 if (0x0306 <= HAVE_LLVM && HAVE_LLVM < 0x0309) {
1820 /* Workaround llvm.org/PR27332 */
1821 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1822 unsigned long long absMask = ~(1ULL << (type.width - 1));
1823 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
1824 a = LLVMBuildBitCast(builder, a, int_vec_type, "");
1825 a = LLVMBuildAnd(builder, a, mask, "");
1826 a = LLVMBuildBitCast(builder, a, vec_type, "");
1827 return a;
1828 } else {
1829 char intrinsic[32];
1830 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
1831 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
1832 }
1833 }
1834
1835 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3 && HAVE_LLVM < 0x0600) {
1836 switch(type.width) {
1837 case 8:
1838 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1839 case 16:
1840 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1841 case 32:
1842 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1843 }
1844 }
1845 else if (type.width*type.length == 256 && util_cpu_caps.has_avx2 && HAVE_LLVM < 0x0600) {
1846 switch(type.width) {
1847 case 8:
1848 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a);
1849 case 16:
1850 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.w", vec_type, a);
1851 case 32:
1852 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.d", vec_type, a);
1853 }
1854 }
1855
1856 return lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero),
1857 a, LLVMBuildNeg(builder, a, ""));
1858 }
1859
1860
1861 LLVMValueRef
1862 lp_build_negate(struct lp_build_context *bld,
1863 LLVMValueRef a)
1864 {
1865 LLVMBuilderRef builder = bld->gallivm->builder;
1866
1867 assert(lp_check_value(bld->type, a));
1868
1869 if (bld->type.floating)
1870 a = LLVMBuildFNeg(builder, a, "");
1871 else
1872 a = LLVMBuildNeg(builder, a, "");
1873
1874 return a;
1875 }
1876
1877
1878 /** Return -1, 0 or +1 depending on the sign of a */
1879 LLVMValueRef
1880 lp_build_sgn(struct lp_build_context *bld,
1881 LLVMValueRef a)
1882 {
1883 LLVMBuilderRef builder = bld->gallivm->builder;
1884 const struct lp_type type = bld->type;
1885 LLVMValueRef cond;
1886 LLVMValueRef res;
1887
1888 assert(lp_check_value(type, a));
1889
1890 /* Handle non-zero case */
1891 if(!type.sign) {
1892 /* if not zero then sign must be positive */
1893 res = bld->one;
1894 }
1895 else if(type.floating) {
1896 LLVMTypeRef vec_type;
1897 LLVMTypeRef int_type;
1898 LLVMValueRef mask;
1899 LLVMValueRef sign;
1900 LLVMValueRef one;
1901 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1902
1903 int_type = lp_build_int_vec_type(bld->gallivm, type);
1904 vec_type = lp_build_vec_type(bld->gallivm, type);
1905 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1906
1907 /* OR the sign bit of 'a' into the bit pattern of the constant 1.0 to get +/-1.0 */
1908 sign = LLVMBuildBitCast(builder, a, int_type, "");
1909 sign = LLVMBuildAnd(builder, sign, mask, "");
1910 one = LLVMConstBitCast(bld->one, int_type);
1911 res = LLVMBuildOr(builder, sign, one, "");
1912 res = LLVMBuildBitCast(builder, res, vec_type, "");
1913 }
1914 else
1915 {
1916 /* signed int/norm/fixed point */
1917 /* could use psign with sse3 and appropriate vectors here */
1918 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1919 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1920 res = lp_build_select(bld, cond, bld->one, minus_one);
1921 }
1922
1923 /* Handle zero */
1924 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1925 res = lp_build_select(bld, cond, bld->zero, res);
1926
1927 return res;
1928 }
1929
1930
1931 /**
1932 * Set the sign of float vector 'a' according to 'sign'.
1933 * If sign==0, return abs(a).
1934 * If sign==1, return -abs(a);
1935 * Other values for sign produce undefined results.
1936 */
1937 LLVMValueRef
1938 lp_build_set_sign(struct lp_build_context *bld,
1939 LLVMValueRef a, LLVMValueRef sign)
1940 {
1941 LLVMBuilderRef builder = bld->gallivm->builder;
1942 const struct lp_type type = bld->type;
1943 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1944 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1945 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1946 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1947 ~((unsigned long long) 1 << (type.width - 1)));
1948 LLVMValueRef val, res;
1949
1950 assert(type.floating);
1951 assert(lp_check_value(type, a));
1952
1953 /* val = reinterpret_cast<int>(a) */
1954 val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1955 /* val = val & mask */
1956 val = LLVMBuildAnd(builder, val, mask, "");
1957 /* sign = sign << shift */
1958 sign = LLVMBuildShl(builder, sign, shift, "");
1959 /* res = val | sign */
1960 res = LLVMBuildOr(builder, val, sign, "");
1961 /* res = reinterpret_cast<float>(res) */
1962 res = LLVMBuildBitCast(builder, res, vec_type, "");
1963
1964 return res;
1965 }
1966
1967
1968 /**
1969 * Convert vector of (or scalar) int to vector of (or scalar) float.
1970 */
1971 LLVMValueRef
1972 lp_build_int_to_float(struct lp_build_context *bld,
1973 LLVMValueRef a)
1974 {
1975 LLVMBuilderRef builder = bld->gallivm->builder;
1976 const struct lp_type type = bld->type;
1977 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1978
1979 assert(type.floating);
1980
1981 return LLVMBuildSIToFP(builder, a, vec_type, "");
1982 }
1983
1984 static boolean
1985 arch_rounding_available(const struct lp_type type)
1986 {
1987 if ((util_cpu_caps.has_sse4_1 &&
1988 (type.length == 1 || type.width*type.length == 128)) ||
1989 (util_cpu_caps.has_avx && type.width*type.length == 256) ||
1990 (util_cpu_caps.has_avx512f && type.width*type.length == 512))
1991 return TRUE;
1992 else if ((util_cpu_caps.has_altivec &&
1993 (type.width == 32 && type.length == 4)))
1994 return TRUE;
1995
1996 return FALSE;
1997 }
1998
1999 enum lp_build_round_mode
2000 {
2001 LP_BUILD_ROUND_NEAREST = 0,
2002 LP_BUILD_ROUND_FLOOR = 1,
2003 LP_BUILD_ROUND_CEIL = 2,
2004 LP_BUILD_ROUND_TRUNCATE = 3
2005 };
2006
2007 static inline LLVMValueRef
2008 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
2009 LLVMValueRef a)
2010 {
2011 LLVMBuilderRef builder = bld->gallivm->builder;
2012 const struct lp_type type = bld->type;
2013 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
2014 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
2015 const char *intrinsic;
2016 LLVMValueRef res;
2017
2018 assert(type.floating);
2019 /* using the double precision conversions is a bit more complicated */
2020 assert(type.width == 32);
2021
2022 assert(lp_check_value(type, a));
2023 assert(util_cpu_caps.has_sse2);
2024
2025 /* This is relying on MXCSR rounding mode, which should always be nearest. */
2026 if (type.length == 1) {
2027 LLVMTypeRef vec_type;
2028 LLVMValueRef undef;
2029 LLVMValueRef arg;
2030 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
2031
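/*
* The llvm.x86.sse.cvtss2si intrinsic operates on a full <4 x float>
* operand, so insert the scalar into element 0 of an undef vector first.
*/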
2032 vec_type = LLVMVectorType(bld->elem_type, 4);
2033
2034 intrinsic = "llvm.x86.sse.cvtss2si";
2035
2036 undef = LLVMGetUndef(vec_type);
2037
2038 arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
2039
2040 res = lp_build_intrinsic_unary(builder, intrinsic,
2041 ret_type, arg);
2042 }
2043 else {
2044 if (type.width* type.length == 128) {
2045 intrinsic = "llvm.x86.sse2.cvtps2dq";
2046 }
2047 else {
2048 assert(type.width*type.length == 256);
2049 assert(util_cpu_caps.has_avx);
2050
2051 intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
2052 }
2053 res = lp_build_intrinsic_unary(builder, intrinsic,
2054 ret_type, a);
2055 }
2056
2057 return res;
2058 }
2059
2060
2061 /* Round a float vector using the AltiVec vrfin/vrfim/vrfip/vrfiz intrinsics. */
2063 static inline LLVMValueRef
2064 lp_build_round_altivec(struct lp_build_context *bld,
2065 LLVMValueRef a,
2066 enum lp_build_round_mode mode)
2067 {
2068 LLVMBuilderRef builder = bld->gallivm->builder;
2069 const struct lp_type type = bld->type;
2070 const char *intrinsic = NULL;
2071
2072 assert(type.floating);
2073
2074 assert(lp_check_value(type, a));
2075 assert(util_cpu_caps.has_altivec);
2076
2077 (void)type;
2078
2079 switch (mode) {
2080 case LP_BUILD_ROUND_NEAREST:
2081 intrinsic = "llvm.ppc.altivec.vrfin";
2082 break;
2083 case LP_BUILD_ROUND_FLOOR:
2084 intrinsic = "llvm.ppc.altivec.vrfim";
2085 break;
2086 case LP_BUILD_ROUND_CEIL:
2087 intrinsic = "llvm.ppc.altivec.vrfip";
2088 break;
2089 case LP_BUILD_ROUND_TRUNCATE:
2090 intrinsic = "llvm.ppc.altivec.vrfiz";
2091 break;
2092 }
2093
2094 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2095 }
2096
2097 static inline LLVMValueRef
2098 lp_build_round_arch(struct lp_build_context *bld,
2099 LLVMValueRef a,
2100 enum lp_build_round_mode mode)
2101 {
2102 if (util_cpu_caps.has_sse4_1) {
2103 LLVMBuilderRef builder = bld->gallivm->builder;
2104 const struct lp_type type = bld->type;
2105 const char *intrinsic_root;
2106 char intrinsic[32];
2107
2108 assert(type.floating);
2109 assert(lp_check_value(type, a));
2110 (void)type;
2111
2112 switch (mode) {
2113 case LP_BUILD_ROUND_NEAREST:
2114 intrinsic_root = "llvm.nearbyint";
2115 break;
2116 case LP_BUILD_ROUND_FLOOR:
2117 intrinsic_root = "llvm.floor";
2118 break;
2119 case LP_BUILD_ROUND_CEIL:
2120 intrinsic_root = "llvm.ceil";
2121 break;
2122 case LP_BUILD_ROUND_TRUNCATE:
2123 intrinsic_root = "llvm.trunc";
2124 break;
2125 }
2126
2127 lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type);
2128 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2129 }
2130 else /* (util_cpu_caps.has_altivec) */
2131 return lp_build_round_altivec(bld, a, mode);
2132 }
2133
2134 /**
2135 * Return the integer part of a float (vector) value (== round toward zero).
2136 * The returned value is a float (vector).
2137 * Ex: trunc(-1.5) = -1.0
2138 */
2139 LLVMValueRef
2140 lp_build_trunc(struct lp_build_context *bld,
2141 LLVMValueRef a)
2142 {
2143 LLVMBuilderRef builder = bld->gallivm->builder;
2144 const struct lp_type type = bld->type;
2145
2146 assert(type.floating);
2147 assert(lp_check_value(type, a));
2148
2149 if (arch_rounding_available(type)) {
2150 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
2151 }
2152 else {
2153 const struct lp_type type = bld->type;
2154 struct lp_type inttype;
2155 struct lp_build_context intbld;
2156 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2157 LLVMValueRef trunc, res, anosign, mask;
2158 LLVMTypeRef int_vec_type = bld->int_vec_type;
2159 LLVMTypeRef vec_type = bld->vec_type;
2160
2161 assert(type.width == 32); /* might want to handle doubles at some point */
2162
2163 inttype = type;
2164 inttype.floating = 0;
2165 lp_build_context_init(&intbld, bld->gallivm, inttype);
2166
2167 /* round by truncation */
2168 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2169 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2170
2171 /* mask out sign bit */
2172 anosign = lp_build_abs(bld, a);
2173 /*
2174 * mask out all values if anosign > 2^24
2175 * This should work both for large ints (all rounding is no-op for them
2176 * because such floats are always exact) as well as special cases like
2177 * NaNs, Infs (taking advantage of the fact they use max exponent).
2178 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2179 */
2180 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2181 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2182 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2183 return lp_build_select(bld, mask, a, res);
2184 }
2185 }
2186
2187
2188 /**
2189 * Return float (vector) rounded to nearest integer (vector). The returned
2190 * value is a float (vector).
2191 * Ex: round(0.9) = 1.0
2192 * Ex: round(-1.5) = -2.0
2193 */
2194 LLVMValueRef
2195 lp_build_round(struct lp_build_context *bld,
2196 LLVMValueRef a)
2197 {
2198 LLVMBuilderRef builder = bld->gallivm->builder;
2199 const struct lp_type type = bld->type;
2200
2201 assert(type.floating);
2202 assert(lp_check_value(type, a));
2203
2204 if (arch_rounding_available(type)) {
2205 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2206 }
2207 else {
2208 const struct lp_type type = bld->type;
2209 struct lp_type inttype;
2210 struct lp_build_context intbld;
2211 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2212 LLVMValueRef res, anosign, mask;
2213 LLVMTypeRef int_vec_type = bld->int_vec_type;
2214 LLVMTypeRef vec_type = bld->vec_type;
2215
2216 assert(type.width == 32); /* might want to handle doubles at some point */
2217
2218 inttype = type;
2219 inttype.floating = 0;
2220 lp_build_context_init(&intbld, bld->gallivm, inttype);
2221
2222 res = lp_build_iround(bld, a);
2223 res = LLVMBuildSIToFP(builder, res, vec_type, "");
2224
2225 /* mask out sign bit */
2226 anosign = lp_build_abs(bld, a);
2227 /*
2228 * mask out all values if anosign > 2^24
2229 * This should work both for large ints (all rounding is no-op for them
2230 * because such floats are always exact) as well as special cases like
2231 * NaNs, Infs (taking advantage of the fact they use max exponent).
2232 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2233 */
2234 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2235 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2236 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2237 return lp_build_select(bld, mask, a, res);
2238 }
2239 }
2240
2241
2242 /**
2243 * Return floor of float (vector), result is a float (vector)
2244 * Ex: floor(1.1) = 1.0
2245 * Ex: floor(-1.1) = -2.0
2246 */
2247 LLVMValueRef
2248 lp_build_floor(struct lp_build_context *bld,
2249 LLVMValueRef a)
2250 {
2251 LLVMBuilderRef builder = bld->gallivm->builder;
2252 const struct lp_type type = bld->type;
2253
2254 assert(type.floating);
2255 assert(lp_check_value(type, a));
2256
2257 if (arch_rounding_available(type)) {
2258 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2259 }
2260 else {
2261 const struct lp_type type = bld->type;
2262 struct lp_type inttype;
2263 struct lp_build_context intbld;
2264 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2265 LLVMValueRef trunc, res, anosign, mask;
2266 LLVMTypeRef int_vec_type = bld->int_vec_type;
2267 LLVMTypeRef vec_type = bld->vec_type;
2268
2269 if (type.width != 32) {
2270 char intrinsic[32];
2271 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type);
2272 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2273 }
2274
2275 assert(type.width == 32); /* might want to handle doubles at some point */
2276
2277 inttype = type;
2278 inttype.floating = 0;
2279 lp_build_context_init(&intbld, bld->gallivm, inttype);
2280
2281 /* round by truncation */
2282 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2283 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2284
2285 if (type.sign) {
2286 LLVMValueRef tmp;
2287
2288 /*
2289 * fix values if rounding is wrong (for non-special cases)
2290 * - this is the case if trunc > a
2291 */
2292 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
2293 /* tmp = trunc > a ? 1.0 : 0.0 */
2294 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2295 tmp = lp_build_and(&intbld, mask, tmp);
2296 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2297 res = lp_build_sub(bld, res, tmp);
2298 }
2299
2300 /* mask out sign bit */
2301 anosign = lp_build_abs(bld, a);
2302 /*
2303 * mask out all values if anosign > 2^24
2304 * This should work both for large ints (all rounding is no-op for them
2305 * because such floats are always exact) as well as special cases like
2306 * NaNs, Infs (taking advantage of the fact they use max exponent).
2307 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2308 */
2309 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2310 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2311 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2312 return lp_build_select(bld, mask, a, res);
2313 }
2314 }
2315
2316
2317 /**
2318 * Return ceiling of float (vector), returning float (vector).
2319 * Ex: ceil( 1.1) = 2.0
2320 * Ex: ceil(-1.1) = -1.0
2321 */
2322 LLVMValueRef
2323 lp_build_ceil(struct lp_build_context *bld,
2324 LLVMValueRef a)
2325 {
2326 LLVMBuilderRef builder = bld->gallivm->builder;
2327 const struct lp_type type = bld->type;
2328
2329 assert(type.floating);
2330 assert(lp_check_value(type, a));
2331
2332 if (arch_rounding_available(type)) {
2333 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2334 }
2335 else {
2336 const struct lp_type type = bld->type;
2337 struct lp_type inttype;
2338 struct lp_build_context intbld;
2339 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2340 LLVMValueRef trunc, res, anosign, mask, tmp;
2341 LLVMTypeRef int_vec_type = bld->int_vec_type;
2342 LLVMTypeRef vec_type = bld->vec_type;
2343
2344 if (type.width != 32) {
2345 char intrinsic[32];
2346 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type);
2347 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2348 }
2349
2350 assert(type.width == 32); /* might want to handle doubles at some point */
2351
2352 inttype = type;
2353 inttype.floating = 0;
2354 lp_build_context_init(&intbld, bld->gallivm, inttype);
2355
2356 /* round by truncation */
2357 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2358 trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2359
2360 /*
2361 * fix values if rounding is wrong (for non-special cases)
2362 * - this is the case if trunc < a
2363 */
2364 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2365 /* tmp = trunc < a ? 1.0 : 0.0 */
2366 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2367 tmp = lp_build_and(&intbld, mask, tmp);
2368 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2369 res = lp_build_add(bld, trunc, tmp);
2370
2371 /* mask out sign bit */
2372 anosign = lp_build_abs(bld, a);
2373 /*
2374 * mask out all values if anosign > 2^24
2375 * This should work both for large ints (all rounding is no-op for them
2376 * because such floats are always exact) as well as special cases like
2377 * NaNs, Infs (taking advantage of the fact they use max exponent).
2378 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2379 */
2380 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2381 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2382 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2383 return lp_build_select(bld, mask, a, res);
2384 }
2385 }
2386
2387
2388 /**
2389 * Return fractional part of 'a' computed as a - floor(a)
2390 * Typically used in texture coord arithmetic.
2391 */
2392 LLVMValueRef
2393 lp_build_fract(struct lp_build_context *bld,
2394 LLVMValueRef a)
2395 {
2396 assert(bld->type.floating);
2397 return lp_build_sub(bld, a, lp_build_floor(bld, a));
2398 }
2399
2400
2401 /**
2402 * Prevent returning 1.0 for very small negative values of 'a' by clamping
2403 * against 0.99999(9). (Will also return that value for NaNs.)
2404 */
2405 static inline LLVMValueRef
2406 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2407 {
2408 LLVMValueRef max;
2409
2410 /* this is the largest number smaller than 1.0 representable as float */
2411 max = lp_build_const_vec(bld->gallivm, bld->type,
2412 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2413 return lp_build_min_ext(bld, fract, max,
2414 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
2415 }
2416
2417
2418 /**
2419 * Same as lp_build_fract, but guarantees that the result is always smaller
2420 * than one. Will also return the smaller-than-one value for infs, NaNs.
2421 */
2422 LLVMValueRef
2423 lp_build_fract_safe(struct lp_build_context *bld,
2424 LLVMValueRef a)
2425 {
2426 return clamp_fract(bld, lp_build_fract(bld, a));
2427 }
2428
2429
2430 /**
2431 * Return the integer part of a float (vector) value (== round toward zero).
2432 * The returned value is an integer (vector).
2433 * Ex: itrunc(-1.5) = -1
2434 */
2435 LLVMValueRef
2436 lp_build_itrunc(struct lp_build_context *bld,
2437 LLVMValueRef a)
2438 {
2439 LLVMBuilderRef builder = bld->gallivm->builder;
2440 const struct lp_type type = bld->type;
2441 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2442
2443 assert(type.floating);
2444 assert(lp_check_value(type, a));
2445
2446 return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2447 }
2448
2449
2450 /**
2451 * Return float (vector) rounded to nearest integer (vector). The returned
2452 * value is an integer (vector).
2453 * Ex: iround(0.9) = 1
2454 * Ex: iround(-1.5) = -2
2455 */
2456 LLVMValueRef
2457 lp_build_iround(struct lp_build_context *bld,
2458 LLVMValueRef a)
2459 {
2460 LLVMBuilderRef builder = bld->gallivm->builder;
2461 const struct lp_type type = bld->type;
2462 LLVMTypeRef int_vec_type = bld->int_vec_type;
2463 LLVMValueRef res;
2464
2465 assert(type.floating);
2466
2467 assert(lp_check_value(type, a));
2468
2469 if ((util_cpu_caps.has_sse2 &&
2470 ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2471 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2472 return lp_build_iround_nearest_sse2(bld, a);
2473 }
2474 if (arch_rounding_available(type)) {
2475 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2476 }
2477 else {
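/*
* Fallback: add +/-0.5 (carrying the sign of 'a') and rely on the
* truncating fp-to-int conversion below to round to nearest.
*/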
2478 LLVMValueRef half;
2479
2480 half = lp_build_const_vec(bld->gallivm, type, 0.5);
2481
2482 if (type.sign) {
2483 LLVMTypeRef vec_type = bld->vec_type;
2484 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2485 (unsigned long long)1 << (type.width - 1));
2486 LLVMValueRef sign;
2487
2488 /* get sign bit */
2489 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2490 sign = LLVMBuildAnd(builder, sign, mask, "");
2491
2492 /* sign * 0.5 */
2493 half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2494 half = LLVMBuildOr(builder, sign, half, "");
2495 half = LLVMBuildBitCast(builder, half, vec_type, "");
2496 }
2497
2498 res = LLVMBuildFAdd(builder, a, half, "");
2499 }
2500
2501 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2502
2503 return res;
2504 }
2505
2506
2507 /**
2508 * Return floor of float (vector), result is an int (vector)
2509 * Ex: ifloor(1.1) = 1
2510 * Ex: ifloor(-1.1) = -2
2511 */
2512 LLVMValueRef
2513 lp_build_ifloor(struct lp_build_context *bld,
2514 LLVMValueRef a)
2515 {
2516 LLVMBuilderRef builder = bld->gallivm->builder;
2517 const struct lp_type type = bld->type;
2518 LLVMTypeRef int_vec_type = bld->int_vec_type;
2519 LLVMValueRef res;
2520
2521 assert(type.floating);
2522 assert(lp_check_value(type, a));
2523
2524 res = a;
2525 if (type.sign) {
2526 if (arch_rounding_available(type)) {
2527 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2528 }
2529 else {
2530 struct lp_type inttype;
2531 struct lp_build_context intbld;
2532 LLVMValueRef trunc, itrunc, mask;
2533
2534 assert(type.floating);
2535 assert(lp_check_value(type, a));
2536
2537 inttype = type;
2538 inttype.floating = 0;
2539 lp_build_context_init(&intbld, bld->gallivm, inttype);
2540
2541 /* round by truncation */
2542 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2543 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2544
2545 /*
2546 * fix values if rounding is wrong (for non-special cases)
2547 * - this is the case if trunc > a
2548 * The results of doing this with NaNs, very large values etc.
2549 * are undefined but this seems to be the case anyway.
2550 */
2551 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2552 /* the mask is ~0 (i.e. -1) where trunc > a, so adding it subtracts one */
2553 return lp_build_add(&intbld, itrunc, mask);
2554 }
2555 }
2556
2557 /* convert to int, truncating toward zero */
2558 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2559
2560 return res;
2561 }
2562
2563
2564 /**
2565 * Return ceiling of float (vector), returning int (vector).
2566 * Ex: iceil( 1.1) = 2
2567 * Ex: iceil(-1.1) = -1
2568 */
2569 LLVMValueRef
2570 lp_build_iceil(struct lp_build_context *bld,
2571 LLVMValueRef a)
2572 {
2573 LLVMBuilderRef builder = bld->gallivm->builder;
2574 const struct lp_type type = bld->type;
2575 LLVMTypeRef int_vec_type = bld->int_vec_type;
2576 LLVMValueRef res;
2577
2578 assert(type.floating);
2579 assert(lp_check_value(type, a));
2580
2581 if (arch_rounding_available(type)) {
2582 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2583 }
2584 else {
2585 struct lp_type inttype;
2586 struct lp_build_context intbld;
2587 LLVMValueRef trunc, itrunc, mask;
2588
2589 assert(type.floating);
2590 assert(lp_check_value(type, a));
2591
2592 inttype = type;
2593 inttype.floating = 0;
2594 lp_build_context_init(&intbld, bld->gallivm, inttype);
2595
2596 /* round by truncation */
2597 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2598 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2599
2600 /*
2601 * fix values if rounding is wrong (for non-special cases)
2602 * - this is the case if trunc < a
2603 * The results of doing this with NaNs, very large values etc.
2604 * are undefined but this seems to be the case anyway.
2605 */
2606 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2607 /* the mask is ~0 (i.e. -1) where trunc < a, so subtracting it adds one */
2608 return lp_build_sub(&intbld, itrunc, mask);
2609 }
2610
2611 /* convert to int, truncating toward zero */
2612 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2613
2614 return res;
2615 }
2616
2617
2618 /**
2619 * Combined ifloor() & fract().
2620 *
2621 * Preferable to calling the functions separately, as it ensures that the
2622 * strategy (floor() vs. ifloor()) with the least redundant work is used.
2623 */
2624 void
2625 lp_build_ifloor_fract(struct lp_build_context *bld,
2626 LLVMValueRef a,
2627 LLVMValueRef *out_ipart,
2628 LLVMValueRef *out_fpart)
2629 {
2630 LLVMBuilderRef builder = bld->gallivm->builder;
2631 const struct lp_type type = bld->type;
2632 LLVMValueRef ipart;
2633
2634 assert(type.floating);
2635 assert(lp_check_value(type, a));
2636
2637 if (arch_rounding_available(type)) {
2638 /*
2639 * floor() is easier.
2640 */
2641
2642 ipart = lp_build_floor(bld, a);
2643 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2644 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2645 }
2646 else {
2647 /*
2648 * ifloor() is easier.
2649 */
2650
2651 *out_ipart = lp_build_ifloor(bld, a);
2652 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2653 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2654 }
2655 }
2656
2657
2658 /**
2659 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2660 * always smaller than one.
2661 */
2662 void
2663 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2664 LLVMValueRef a,
2665 LLVMValueRef *out_ipart,
2666 LLVMValueRef *out_fpart)
2667 {
2668 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2669 *out_fpart = clamp_fract(bld, *out_fpart);
2670 }
2671
2672
2673 LLVMValueRef
2674 lp_build_sqrt(struct lp_build_context *bld,
2675 LLVMValueRef a)
2676 {
2677 LLVMBuilderRef builder = bld->gallivm->builder;
2678 const struct lp_type type = bld->type;
2679 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2680 char intrinsic[32];
2681
2682 assert(lp_check_value(type, a));
2683
2684 assert(type.floating);
2685 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);
2686
2687 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2688 }
2689
2690
2691 /**
2692 * Do one Newton-Raphson step to improve reciprocal precision:
2693 *
2694 * x_{i+1} = x_i * (2 - a * x_i)
2695 *
2696 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2697 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
2698 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
2699 * halo. It would be necessary to clamp the argument to prevent this.
2700 *
2701 * See also:
2702 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2703 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2704 */
2705 static inline LLVMValueRef
2706 lp_build_rcp_refine(struct lp_build_context *bld,
2707 LLVMValueRef a,
2708 LLVMValueRef rcp_a)
2709 {
2710 LLVMBuilderRef builder = bld->gallivm->builder;
2711 LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
2712 LLVMValueRef res;
2713
2714 res = LLVMBuildFMul(builder, a, rcp_a, "");
2715 res = LLVMBuildFSub(builder, two, res, "");
2716 res = LLVMBuildFMul(builder, rcp_a, res, "");
2717
2718 return res;
2719 }
2720
2721
2722 LLVMValueRef
2723 lp_build_rcp(struct lp_build_context *bld,
2724 LLVMValueRef a)
2725 {
2726 LLVMBuilderRef builder = bld->gallivm->builder;
2727 const struct lp_type type = bld->type;
2728
2729 assert(lp_check_value(type, a));
2730
2731 if(a == bld->zero)
2732 return bld->undef;
2733 if(a == bld->one)
2734 return bld->one;
2735 if(a == bld->undef)
2736 return bld->undef;
2737
2738 assert(type.floating);
2739
2740 if(LLVMIsConstant(a))
2741 return LLVMConstFDiv(bld->one, a);
2742
2743 /*
2744 * We don't use RCPPS because:
2745 * - it only has 10 bits of precision
2746 * - it doesn't even get the reciprocal of 1.0 exactly
2747 * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2748 * - for recent processors the benefit over DIVPS is marginal and case
2749 * dependent
2750 *
2751 * We could still use it on certain processors if benchmarks show that the
2752 * RCPPS plus the necessary workarounds are still preferable to DIVPS; or for
2753 * particular uses that require fewer workarounds.
2754 */
2755
2756 if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2757 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2758 const unsigned num_iterations = 0;
2759 LLVMValueRef res;
2760 unsigned i;
2761 const char *intrinsic = NULL;
2762
2763 if (type.length == 4) {
2764 intrinsic = "llvm.x86.sse.rcp.ps";
2765 }
2766 else {
2767 intrinsic = "llvm.x86.avx.rcp.ps.256";
2768 }
2769
2770 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2771
2772 for (i = 0; i < num_iterations; ++i) {
2773 res = lp_build_rcp_refine(bld, a, res);
2774 }
2775
2776 return res;
2777 }
2778
2779 return LLVMBuildFDiv(builder, bld->one, a, "");
2780 }
2781
2782
2783 /**
2784 * Do one Newton-Raphson step to improve rsqrt precision:
2785 *
2786 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2787 *
2788 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2789 */
2790 static inline LLVMValueRef
2791 lp_build_rsqrt_refine(struct lp_build_context *bld,
2792 LLVMValueRef a,
2793 LLVMValueRef rsqrt_a)
2794 {
2795 LLVMBuilderRef builder = bld->gallivm->builder;
2796 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2797 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2798 LLVMValueRef res;
2799
2800 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2801 res = LLVMBuildFMul(builder, a, res, "");
2802 res = LLVMBuildFSub(builder, three, res, "");
2803 res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2804 res = LLVMBuildFMul(builder, half, res, "");
2805
2806 return res;
2807 }
2808
2809
2810 /**
2811 * Generate 1/sqrt(a).
2812 * Result is undefined for values < 0, infinity for +0.
2813 */
2814 LLVMValueRef
2815 lp_build_rsqrt(struct lp_build_context *bld,
2816 LLVMValueRef a)
2817 {
2818 const struct lp_type type = bld->type;
2819
2820 assert(lp_check_value(type, a));
2821
2822 assert(type.floating);
2823
2824 /*
2825 * This should be faster but all denormals will end up as infinity.
2826 */
2827 if (0 && lp_build_fast_rsqrt_available(type)) {
2828 const unsigned num_iterations = 1;
2829 LLVMValueRef res;
2830 unsigned i;
2831
2832 /* rsqrt(1.0) != 1.0 here */
2833 res = lp_build_fast_rsqrt(bld, a);
2834
2835 if (num_iterations) {
2836 /*
2837 * Newton-Raphson will result in NaN instead of infinity for zero,
2838 * and NaN instead of zero for infinity.
2839 * Also, need to ensure rsqrt(1.0) == 1.0.
2840 * All numbers smaller than FLT_MIN will result in +infinity
2841 * (rsqrtps treats all denormals as zero).
2842 */
2843 LLVMValueRef cmp;
2844 LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2845 LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);
2846
2847 for (i = 0; i < num_iterations; ++i) {
2848 res = lp_build_rsqrt_refine(bld, a, res);
2849 }
2850 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2851 res = lp_build_select(bld, cmp, inf, res);
2852 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2853 res = lp_build_select(bld, cmp, bld->zero, res);
2854 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2855 res = lp_build_select(bld, cmp, bld->one, res);
2856 }
2857
2858 return res;
2859 }
2860
2861 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2862 }
2863
2864 /**
2865 * Report whether a fast (but inaccurate) rsqrt instruction is available.
2866 * Callers may want to avoid rsqrt_fast() when it is not: x^0.5 can be
2867 * computed as rsqrt_fast(x) * x, but without hardware support that
2868 * expands to sqrt/div/mul, in which case calling sqrt directly (skipping
2869 * both the div and the mul) is clearly better.
2870 */
2871 boolean
2872 lp_build_fast_rsqrt_available(struct lp_type type)
2873 {
2874 assert(type.floating);
2875
2876 if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2877 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2878 return true;
2879 }
2880 return false;
2881 }
2882
2883
2884 /**
2885 * Generate 1/sqrt(a).
2886 * Result is undefined for values < 0, infinity for +0.
2887 * Precision is limited, only ~10 bits guaranteed
2888 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2889 */
2890 LLVMValueRef
2891 lp_build_fast_rsqrt(struct lp_build_context *bld,
2892 LLVMValueRef a)
2893 {
2894 LLVMBuilderRef builder = bld->gallivm->builder;
2895 const struct lp_type type = bld->type;
2896
2897 assert(lp_check_value(type, a));
2898
2899 if (lp_build_fast_rsqrt_available(type)) {
2900 const char *intrinsic = NULL;
2901
2902 if (type.length == 4) {
2903 intrinsic = "llvm.x86.sse.rsqrt.ps";
2904 }
2905 else {
2906 intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2907 }
2908 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2909 }
2910 else {
2911 debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2912 }
2913 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2914 }
2915
2916
2917 /**
2918 * Generate sin(a) or cos(a) using polynomial approximation.
2919 * TODO: it might be worth recognizing when sin and cos share the same
2920 * source (i.e. the d3d10 sincos opcode); computing both at the same time
2921 * would be far cheaper than calculating (nearly) everything twice.
2922 * It is unclear whether that is common enough to be worth the trouble,
2923 * but the scs opcode could also benefit from computing both.
2924 */
2925 static LLVMValueRef
2926 lp_build_sin_or_cos(struct lp_build_context *bld,
2927 LLVMValueRef a,
2928 boolean cos)
2929 {
2930 struct gallivm_state *gallivm = bld->gallivm;
2931 LLVMBuilderRef b = gallivm->builder;
2932 struct lp_type int_type = lp_int_type(bld->type);
2933
2934 /*
2935 * take the absolute value,
2936 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2937 */
2938
2939 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2940 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2941
2942 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2943 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2944
2945 /*
2946 * scale by 4/Pi
2947 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2948 */
2949
2950 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2951 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2952
2953 /*
2954 * store the integer part of y in mm0
2955 * emm2 = _mm_cvttps_epi32(y);
2956 */
2957
2958 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2959
2960 /*
2961 * j=(j+1) & (~1) (see the cephes sources)
2962 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2963 */
2964
2965 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2966 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2967 /*
2968 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2969 */
2970 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2971 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2972
2973 /*
2974 * y = _mm_cvtepi32_ps(emm2);
2975 */
2976 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2977
2978 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2979 LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2980 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2981 LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2982
2983 /*
2984 * Argument used for poly selection and sign bit determination
2985 * is different for sin vs. cos.
2986 */
2987 LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
2988 emm2_and;
2989
2990 LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
2991 LLVMBuildNot(b, emm2_2, ""), ""),
2992 const_29, "sign_bit") :
2993 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
2994 LLVMBuildShl(b, emm2_add,
2995 const_29, ""), ""),
2996 sign_mask, "sign_bit");
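/*
* sign_bit now holds only the sign that will be XOR'ed into the result:
* for cos it comes from bit 2 of the inverted octant index, for sin it
* combines the sign of 'a' with bit 2 of the octant index (both shifted
* up into the float sign-bit position).
*/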
2997
2998 /*
2999 * get the polynomial selection mask
3000 * there is one polynomial for 0 <= x <= Pi/4
3001 * and another one for Pi/4 < x <= Pi/2
3002 * Both branches will be computed.
3003 *
3004 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
3005 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
3006 */
3007
3008 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
3009 LLVMValueRef poly_mask = lp_build_compare(gallivm,
3010 int_type, PIPE_FUNC_EQUAL,
3011 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
3012
3013 /*
3014 * _PS_CONST(minus_cephes_DP1, -0.78515625);
3015 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
3016 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
3017 */
3018 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
3019 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
3020 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
3021
3022 /*
3023 * The magic pass: "Extended precision modular arithmetic"
3024 * x = ((x - y * DP1) - y * DP2) - y * DP3;
3025 */
3026 LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
3027 LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
3028 LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);
3029
3030 /*
3031 * Evaluate the first polynomial (0 <= x <= Pi/4)
3032 *
3033 * z = _mm_mul_ps(x,x);
3034 */
3035 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
3036
3037 /*
3038 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
3039 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
3040 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
3041 */
3042 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
3043 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
3044 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
3045
3046 /*
3047 * y = *(v4sf*)_ps_coscof_p0;
3048 * y = _mm_mul_ps(y, z);
3049 */
3050 LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
3051 LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
3052 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
3053 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
3054
3055
3056 /*
3057 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
3058 * y = _mm_sub_ps(y, tmp);
3059 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
3060 */
3061 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
3062 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
3063 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
3064 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
3065 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
3066
3067 /*
3068 * _PS_CONST(sincof_p0, -1.9515295891E-4);
3069 * _PS_CONST(sincof_p1, 8.3321608736E-3);
3070 * _PS_CONST(sincof_p2, -1.6666654611E-1);
3071 */
3072 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
3073 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
3074 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
3075
3076 /*
3077 * Evaluate the second polynomial
3078 *
3079 * y2 = *(v4sf*)_ps_sincof_p0;
3080 * y2 = _mm_mul_ps(y2, z);
3081 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
3082 * y2 = _mm_mul_ps(y2, z);
3083 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
3084 * y2 = _mm_mul_ps(y2, z);
3085 * y2 = _mm_mul_ps(y2, x);
3086 * y2 = _mm_add_ps(y2, x);
3087 */
3088
3089 LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
3090 LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
3091 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
3092 LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);
3093
3094 /*
3095 * select the correct result from the two polynomials
3096 * xmm3 = poly_mask;
3097 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
3098 * y = _mm_andnot_ps(xmm3, y);
3099 * y = _mm_or_ps(y,y2);
3100 */
3101 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
3102 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
3103 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
3104 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
3105 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
3106 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
3107
3108 /*
3109 * update the sign
3110 * y = _mm_xor_ps(y, sign_bit);
3111 */
3112 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
3113 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
3114
3115 LLVMValueRef isfinite = lp_build_isfinite(bld, a);
3116
3117 /* clamp output to be within [-1, 1] */
3118 y_result = lp_build_clamp(bld, y_result,
3119 lp_build_const_vec(bld->gallivm, bld->type, -1.f),
3120 lp_build_const_vec(bld->gallivm, bld->type, 1.f));
3121 /* If a is -inf, inf or NaN then return NaN */
3122 y_result = lp_build_select(bld, isfinite, y_result,
3123 lp_build_const_vec(bld->gallivm, bld->type, NAN));
3124 return y_result;
3125 }
3126
3127
3128 /**
3129 * Generate sin(a)
3130 */
3131 LLVMValueRef
3132 lp_build_sin(struct lp_build_context *bld,
3133 LLVMValueRef a)
3134 {
3135 return lp_build_sin_or_cos(bld, a, FALSE);
3136 }
3137
3138
3139 /**
3140 * Generate cos(a)
3141 */
3142 LLVMValueRef
3143 lp_build_cos(struct lp_build_context *bld,
3144 LLVMValueRef a)
3145 {
3146 return lp_build_sin_or_cos(bld, a, TRUE);
3147 }
3148
3149
3150 /**
3151 * Generate pow(x, y)
3152 */
3153 LLVMValueRef
3154 lp_build_pow(struct lp_build_context *bld,
3155 LLVMValueRef x,
3156 LLVMValueRef y)
3157 {
3158 /* TODO: optimize the constant case */
3159 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3160 LLVMIsConstant(x) && LLVMIsConstant(y)) {
3161 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3162 __FUNCTION__);
3163 }
3164
3165 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
3166 }
3167
3168
3169 /**
3170 * Generate exp(x)
3171 */
3172 LLVMValueRef
3173 lp_build_exp(struct lp_build_context *bld,
3174 LLVMValueRef x)
3175 {
3176 /* log2(e) = 1/log(2) */
3177 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
3178 1.4426950408889634);
3179
3180 assert(lp_check_value(bld->type, x));
3181
3182 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
3183 }
3184
3185
3186 /**
3187 * Generate log(x)
3188 * Behavior is undefined with infs, 0s and nans
3189 */
3190 LLVMValueRef
3191 lp_build_log(struct lp_build_context *bld,
3192 LLVMValueRef x)
3193 {
3194 /* log(2) */
3195 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3196 0.69314718055994529);
3197
3198 assert(lp_check_value(bld->type, x));
3199
3200 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
3201 }
3202
3203 /**
3204 * Generate log(x) that handles edge cases (infs, 0s and nans)
3205 */
3206 LLVMValueRef
3207 lp_build_log_safe(struct lp_build_context *bld,
3208 LLVMValueRef x)
3209 {
3210 /* log(2) */
3211 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3212 0.69314718055994529);
3213
3214 assert(lp_check_value(bld->type, x));
3215
3216 return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
3217 }
3218
3219
3220 /**
3221 * Generate polynomial.
3222 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
3223 */
3224 LLVMValueRef
3225 lp_build_polynomial(struct lp_build_context *bld,
3226 LLVMValueRef x,
3227 const double *coeffs,
3228 unsigned num_coeffs)
3229 {
3230 const struct lp_type type = bld->type;
3231 LLVMValueRef even = NULL, odd = NULL;
3232 LLVMValueRef x2;
3233 unsigned i;
3234
3235 assert(lp_check_value(bld->type, x));
3236
3237 /* TODO: optimize the constant case */
3238 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3239 LLVMIsConstant(x)) {
3240 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3241 __FUNCTION__);
3242 }
3243
3244 /*
3245 * Calculate odd and even terms separately to decrease data dependency
3246 * Ex:
3247 * c[0] + x^2 * c[2] + x^4 * c[4] ...
3248 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
3249 */
3250 x2 = lp_build_mul(bld, x, x);
3251
3252 for (i = num_coeffs; i--; ) {
3253 LLVMValueRef coeff;
3254
3255 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
3256
3257 if (i % 2 == 0) {
3258 if (even)
3259 even = lp_build_mad(bld, x2, even, coeff);
3260 else
3261 even = coeff;
3262 } else {
3263 if (odd)
3264 odd = lp_build_mad(bld, x2, odd, coeff);
3265 else
3266 odd = coeff;
3267 }
3268 }
3269
3270 if (odd)
3271 return lp_build_mad(bld, odd, x, even);
3272 else if (even)
3273 return even;
3274 else
3275 return bld->undef;
3276 }
3277
3278
3279 /**
3280 * Minimax polynomial fit of 2**x, in range [0, 1[
3281 */
3282 const double lp_build_exp2_polynomial[] = {
3283 #if EXP_POLY_DEGREE == 5
3284 1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
3285 0.693153073200168932794,
3286 0.240153617044375388211,
3287 0.0558263180532956664775,
3288 0.00898934009049466391101,
3289 0.00187757667519147912699
3290 #elif EXP_POLY_DEGREE == 4
3291 1.00000259337069434683,
3292 0.693003834469974940458,
3293 0.24144275689150793076,
3294 0.0520114606103070150235,
3295 0.0135341679161270268764
3296 #elif EXP_POLY_DEGREE == 3
3297 0.999925218562710312959,
3298 0.695833540494823811697,
3299 0.226067155427249155588,
3300 0.0780245226406372992967
3301 #elif EXP_POLY_DEGREE == 2
3302 1.00172476321474503578,
3303 0.657636275736077639316,
3304 0.33718943461968720704
3305 #else
3306 #error
3307 #endif
3308 };
3309
3310
3311 LLVMValueRef
3312 lp_build_exp2(struct lp_build_context *bld,
3313 LLVMValueRef x)
3314 {
3315 LLVMBuilderRef builder = bld->gallivm->builder;
3316 const struct lp_type type = bld->type;
3317 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3318 LLVMValueRef ipart = NULL;
3319 LLVMValueRef fpart = NULL;
3320 LLVMValueRef expipart = NULL;
3321 LLVMValueRef expfpart = NULL;
3322 LLVMValueRef res = NULL;
3323
3324 assert(lp_check_value(bld->type, x));
3325
3326 /* TODO: optimize the constant case */
3327 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3328 LLVMIsConstant(x)) {
3329 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3330 __FUNCTION__);
3331 }
3332
3333 assert(type.floating && type.width == 32);
3334
3335 /* We want to preserve NaN and make sure that for exp2, if x > 128
3336 * the result is INF and if it is smaller than -126.9 the result is 0. */
3337 x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
3338 GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3339 x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
3340 x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3341
3342 /* ipart = floor(x) */
3343 /* fpart = x - ipart */
3344 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3345
3346 /* expipart = (float) (1 << ipart) */
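/*
* Build 2^ipart directly in IEEE-754 single precision by biasing the
* integer exponent with 127 and shifting it into the exponent field
* (bits 30:23).
*/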
3347 expipart = LLVMBuildAdd(builder, ipart,
3348 lp_build_const_int_vec(bld->gallivm, type, 127), "");
3349 expipart = LLVMBuildShl(builder, expipart,
3350 lp_build_const_int_vec(bld->gallivm, type, 23), "");
3351 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
3352
3353 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3354 ARRAY_SIZE(lp_build_exp2_polynomial));
3355
3356 res = LLVMBuildFMul(builder, expipart, expfpart, "");
3357
3358 return res;
3359 }
3360
3361
3362
3363 /**
3364 * Extract the exponent of an IEEE-754 floating point value.
3365 *
3366 * Optionally apply an integer bias.
3367 *
3368 * Result is an integer value with
3369 *
3370 * ifloor(log2(x)) + bias
3371 */
3372 LLVMValueRef
3373 lp_build_extract_exponent(struct lp_build_context *bld,
3374 LLVMValueRef x,
3375 int bias)
3376 {
3377 LLVMBuilderRef builder = bld->gallivm->builder;
3378 const struct lp_type type = bld->type;
3379 unsigned mantissa = lp_mantissa(type);
3380 LLVMValueRef res;
3381
3382 assert(type.floating);
3383
3384 assert(lp_check_value(bld->type, x));
3385
3386 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3387
3388 res = LLVMBuildLShr(builder, x,
3389 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3390 res = LLVMBuildAnd(builder, res,
3391 lp_build_const_int_vec(bld->gallivm, type, 255), "");
3392 res = LLVMBuildSub(builder, res,
3393 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3394
3395 return res;
3396 }
3397
3398
3399 /**
3400 * Extract the mantissa of a floating point value.
3401 *
3402 * Result is a floating point value with
3403 *
3404 * x / 2**floor(log2(x))
3405 */
3406 LLVMValueRef
3407 lp_build_extract_mantissa(struct lp_build_context *bld,
3408 LLVMValueRef x)
3409 {
3410 LLVMBuilderRef builder = bld->gallivm->builder;
3411 const struct lp_type type = bld->type;
3412 unsigned mantissa = lp_mantissa(type);
3413 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3414 (1ULL << mantissa) - 1);
3415 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3416 LLVMValueRef res;
3417
3418 assert(lp_check_value(bld->type, x));
3419
3420 assert(type.floating);
3421
3422 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3423
3424 /* res = x / 2**ipart */
3425 res = LLVMBuildAnd(builder, x, mantmask, "");
3426 res = LLVMBuildOr(builder, res, one, "");
3427 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3428
3429 return res;
3430 }
3431
3432
3433
3434 /**
3435 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in the range [0, 1/9[
3436 * These coefficients can be generated with
3437 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3438 */
3439 const double lp_build_log2_polynomial[] = {
3440 #if LOG_POLY_DEGREE == 5
3441 2.88539008148777786488L,
3442 0.961796878841293367824L,
3443 0.577058946784739859012L,
3444 0.412914355135828735411L,
3445 0.308591899232910175289L,
3446 0.352376952300281371868L,
3447 #elif LOG_POLY_DEGREE == 4
3448 2.88539009343309178325L,
3449 0.961791550404184197881L,
3450 0.577440339438736392009L,
3451 0.403343858251329912514L,
3452 0.406718052498846252698L,
3453 #elif LOG_POLY_DEGREE == 3
3454 2.88538959748872753838L,
3455 0.961932915889597772928L,
3456 0.571118517972136195241L,
3457 0.493997535084709500285L,
3458 #else
3459 #error
3460 #endif
3461 };
3462
3463 /**
3464 * See http://www.devmaster.net/forums/showthread.php?p=43580
3465 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3466 * http://www.nezumi.demon.co.uk/consult/logx.htm
3467 *
3468 * If handle_edge_cases is true the function will perform computations
3469 * to match the required D3D10+ behavior for each of the edge cases.
3470 * That means that if input is:
3471 * - less than zero (down to and including -inf), then NaN will be returned
3472 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3473 * - +infinity, then +infinity will be returned
3474 * - NaN, then NaN will be returned
3475 *
3476 * Those checks are fairly expensive so if you don't need them make sure
3477 * handle_edge_cases is false.
3478 */
3479 void
3480 lp_build_log2_approx(struct lp_build_context *bld,
3481 LLVMValueRef x,
3482 LLVMValueRef *p_exp,
3483 LLVMValueRef *p_floor_log2,
3484 LLVMValueRef *p_log2,
3485 boolean handle_edge_cases)
3486 {
3487 LLVMBuilderRef builder = bld->gallivm->builder;
3488 const struct lp_type type = bld->type;
3489 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3490 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3491
3492 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3493 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3494 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3495
3496 LLVMValueRef i = NULL;
3497 LLVMValueRef y = NULL;
3498 LLVMValueRef z = NULL;
3499 LLVMValueRef exp = NULL;
3500 LLVMValueRef mant = NULL;
3501 LLVMValueRef logexp = NULL;
3502 LLVMValueRef p_z = NULL;
3503 LLVMValueRef res = NULL;
3504
3505 assert(lp_check_value(bld->type, x));
3506
3507 if(p_exp || p_floor_log2 || p_log2) {
3508 /* TODO: optimize the constant case */
3509 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3510 LLVMIsConstant(x)) {
3511 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3512 __FUNCTION__);
3513 }
3514
3515 assert(type.floating && type.width == 32);
3516
3517 /*
3518 * We don't explicitly handle denormalized numbers. They will yield a
3519 * result in the neighbourhood of -127, which appears to be
3520 * adequate.
3521 */
3522
3523 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3524
3525 /* exp = (float) exponent(x) */
3526 exp = LLVMBuildAnd(builder, i, expmask, "");
3527 }
3528
3529 if(p_floor_log2 || p_log2) {
3530 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3531 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3532 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3533 }
3534
3535 if (p_log2) {
3536 /* mant = 1 + (float) mantissa(x) */
3537 mant = LLVMBuildAnd(builder, i, mantmask, "");
3538 mant = LLVMBuildOr(builder, mant, one, "");
3539 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3540
3541 /* y = (mant - 1) / (mant + 1) */
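/*
* log2 of the mantissa is evaluated via the identity
* log2(m) = 2/ln(2) * atanh((m - 1)/(m + 1)); the minimax polynomial in
* z = y^2 below approximates that series.
*/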
3542 y = lp_build_div(bld,
3543 lp_build_sub(bld, mant, bld->one),
3544 lp_build_add(bld, mant, bld->one)
3545 );
3546
3547 /* z = y^2 */
3548 z = lp_build_mul(bld, y, y);
3549
3550 /* compute P(z) */
3551 p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3552 ARRAY_SIZE(lp_build_log2_polynomial));
3553
3554 /* y * P(z) + logexp */
3555 res = lp_build_mad(bld, y, p_z, logexp);
3556
3557 if (type.floating && handle_edge_cases) {
3558 LLVMValueRef negmask, infmask, zmask;
3559 negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3560 lp_build_const_vec(bld->gallivm, type, 0.0f));
3561 zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3562 lp_build_const_vec(bld->gallivm, type, 0.0f));
3563 infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3564 lp_build_const_vec(bld->gallivm, type, INFINITY));
3565
3566 /* If x is equal to inf make sure we return inf */
3567 res = lp_build_select(bld, infmask,
3568 lp_build_const_vec(bld->gallivm, type, INFINITY),
3569 res);
3570 /* If x is equal to 0, return -inf */
3571 res = lp_build_select(bld, zmask,
3572 lp_build_const_vec(bld->gallivm, type, -INFINITY),
3573 res);
3574 /* If x is nan or less than 0, return nan */
3575 res = lp_build_select(bld, negmask,
3576 lp_build_const_vec(bld->gallivm, type, NAN),
3577 res);
3578 }
3579 }
3580
3581 if (p_exp) {
3582 exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3583 *p_exp = exp;
3584 }
3585
3586 if (p_floor_log2)
3587 *p_floor_log2 = logexp;
3588
3589 if (p_log2)
3590 *p_log2 = res;
3591 }
3592
3593
3594 /*
3595 * log2 implementation which doesn't have special code to
3596 * handle edge cases (-inf, 0, inf, NaN). It's faster but
3597 * the results for those cases are undefined.
3598 */
3599 LLVMValueRef
3600 lp_build_log2(struct lp_build_context *bld,
3601 LLVMValueRef x)
3602 {
3603 LLVMValueRef res;
3604 lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
3605 return res;
3606 }
3607
3608 /*
3609 * Version of log2 which handles all edge cases.
3610 * Look at documentation of lp_build_log2_approx for
3611 * description of the behavior for each of the edge cases.
3612 */
3613 LLVMValueRef
3614 lp_build_log2_safe(struct lp_build_context *bld,
3615 LLVMValueRef x)
3616 {
3617 LLVMValueRef res;
3618 lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
3619 return res;
3620 }
3621
3622
3623 /**
3624 * Faster (and less accurate) log2.
3625 *
3626 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3627 *
3628 * Piece-wise linear approximation, with exact results when x is a
3629 * power of two.
3630 *
3631 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3632 */
3633 LLVMValueRef
3634 lp_build_fast_log2(struct lp_build_context *bld,
3635 LLVMValueRef x)
3636 {
3637 LLVMBuilderRef builder = bld->gallivm->builder;
3638 LLVMValueRef ipart;
3639 LLVMValueRef fpart;
3640
3641 assert(lp_check_value(bld->type, x));
3642
3643 assert(bld->type.floating);
3644
3645 /* ipart = floor(log2(x)) - 1 */
3646 ipart = lp_build_extract_exponent(bld, x, -1);
3647 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3648
3649    /* fpart = x / 2**floor(log2(x)), i.e. the mantissa in [1, 2) */
3650 fpart = lp_build_extract_mantissa(bld, x);
3651
3652 /* ipart + fpart */
3653 return LLVMBuildFAdd(builder, ipart, fpart, "");
3654 }
3655
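/*
 * Illustration only: a scalar sketch of the piece-wise linear approximation
 * above (assumes IEEE-754 single precision; not part of the gallivm API).
 * E.g. x = 3.0 gives 0 + 1.5 = 1.5 (exact value is ~1.585), while x = 4.0
 * gives 1 + 1.0 = 2.0 exactly.
 *
 *    static float fast_log2_scalar(float x)
 *    {
 *       union { float f; unsigned i; } u;
 *       u.f = x;
 *       int ipart = (int)((u.i >> 23) & 0xff) - 127 - 1;   // floor(log2(x)) - 1
 *       u.i = (u.i & 0x007fffff) | 0x3f800000;             // x / 2**floor(log2(x)) in [1, 2)
 *       return (float)ipart + u.f;
 *    }
 */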
3656
3657 /**
3658 * Fast implementation of iround(log2(x)).
3659 *
3660 * Not an approximation -- it should give accurate results all the time.
3661 */
3662 LLVMValueRef
3663 lp_build_ilog2(struct lp_build_context *bld,
3664 LLVMValueRef x)
3665 {
3666 LLVMBuilderRef builder = bld->gallivm->builder;
3667 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3668 LLVMValueRef ipart;
3669
3670 assert(bld->type.floating);
3671
3672 assert(lp_check_value(bld->type, x));
3673
3674    /* x * 2^0.5, i.e. add 0.5 to log2(x) */
3675 x = LLVMBuildFMul(builder, x, sqrt2, "");
3676
3677 /* ipart = floor(log2(x) + 0.5) */
3678 ipart = lp_build_extract_exponent(bld, x, 0);
3679
3680 return ipart;
3681 }
3682
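/*
 * Illustration only: scaling by sqrt(2) shifts log2(x) by +0.5, so taking
 * floor(log2()) of the scaled value rounds log2(x) to the nearest integer.
 * Scalar sketch (assumes IEEE-754 single precision, <math.h> for M_SQRT2):
 *
 *    static int ilog2_scalar(float x)
 *    {
 *       union { float f; unsigned i; } u;
 *       u.f = x * (float)M_SQRT2;                  // log2 becomes log2(x) + 0.5
 *       return (int)((u.i >> 23) & 0xff) - 127;    // floor(log2(x) + 0.5)
 *    }
 *
 * E.g. x = 5.0: log2(5) ~= 2.32, 5*sqrt(2) ~= 7.07, exponent = 2;
 *      x = 6.0: log2(6) ~= 2.58, 6*sqrt(2) ~= 8.49, exponent = 3.
 */
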
3683 LLVMValueRef
3684 lp_build_mod(struct lp_build_context *bld,
3685 LLVMValueRef x,
3686 LLVMValueRef y)
3687 {
3688 LLVMBuilderRef builder = bld->gallivm->builder;
3689 LLVMValueRef res;
3690 const struct lp_type type = bld->type;
3691
3692 assert(lp_check_value(type, x));
3693 assert(lp_check_value(type, y));
3694
3695 if (type.floating)
3696 res = LLVMBuildFRem(builder, x, y, "");
3697 else if (type.sign)
3698 res = LLVMBuildSRem(builder, x, y, "");
3699 else
3700 res = LLVMBuildURem(builder, x, y, "");
3701 return res;
3702 }
3703
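/*
 * Note (illustration only): LLVM's FRem/SRem follow C's truncating
 * remainder, so the result takes the sign of the dividend, e.g. in scalar C:
 *
 *    -7 % 3            == -1
 *     7 % -3           ==  1
 *    fmodf(-5.5f, 2.f) == -1.5f
 */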
3704
3705 /*
3706  * For floating point inputs, creates and returns a mask
3707  * which is all 1's for channels of x which are NaN,
3708  * and all 0's for channels which are not NaN.
3709 */
3710 LLVMValueRef
3711 lp_build_isnan(struct lp_build_context *bld,
3712 LLVMValueRef x)
3713 {
3714 LLVMValueRef mask;
3715 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3716
3717 assert(bld->type.floating);
3718 assert(lp_check_value(bld->type, x));
3719
3720 mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3721 "isnotnan");
3722 mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3723 mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3724 return mask;
3725 }
3726
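/*
 * Illustration only: the trick above relies on NaN being the only value
 * that compares unequal to itself. Scalar sketch:
 *
 *    static unsigned isnan_mask_scalar(float x)
 *    {
 *       return (x == x) ? 0x00000000u : 0xffffffffu;
 *    }
 */
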
3727 /* Returns all 1's for floating point numbers that are
3728  * finite, and all 0's for -inf,
3729  * +inf and NaN. */
3730 LLVMValueRef
3731 lp_build_isfinite(struct lp_build_context *bld,
3732 LLVMValueRef x)
3733 {
3734 LLVMBuilderRef builder = bld->gallivm->builder;
3735 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3736 struct lp_type int_type = lp_int_type(bld->type);
3737 LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3738 LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3739 0x7f800000);
3740
3741 if (!bld->type.floating) {
3742 return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3743 }
3744 assert(bld->type.floating);
3745 assert(lp_check_value(bld->type, x));
3746 assert(bld->type.width == 32);
3747
3748 intx = LLVMBuildAnd(builder, intx, infornan32, "");
3749 return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3750 intx, infornan32);
3751 }
3752
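/*
 * Illustration only: a single precision float is non-finite (inf or NaN)
 * exactly when its exponent field is all ones, hence the 0x7f800000 mask
 * above. lp_build_is_inf_or_nan below is the complementary test
 * (EQUAL instead of NOTEQUAL). Scalar sketch:
 *
 *    static unsigned isfinite_mask_scalar(float x)
 *    {
 *       union { float f; unsigned i; } u;
 *       u.f = x;
 *       return ((u.i & 0x7f800000u) != 0x7f800000u) ? 0xffffffffu : 0u;
 *    }
 */
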
3753 /*
3754  * Returns all 1's for channels that are NaN or +/-inf, and all 0's otherwise.
3755  * The input has to be a floating point vector.
3756 */
3757 LLVMValueRef
3758 lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
3759 const struct lp_type type,
3760 LLVMValueRef x)
3761 {
3762 LLVMBuilderRef builder = gallivm->builder;
3763 struct lp_type int_type = lp_int_type(type);
3764 LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
3765 0x7f800000);
3766 LLVMValueRef ret;
3767
3768 assert(type.floating);
3769
3770 ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
3771 ret = LLVMBuildAnd(builder, ret, const0, "");
3772 ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
3773 ret, const0);
3774
3775 return ret;
3776 }
3777
3778
3779 LLVMValueRef
3780 lp_build_fpstate_get(struct gallivm_state *gallivm)
3781 {
3782 if (util_cpu_caps.has_sse) {
3783 LLVMBuilderRef builder = gallivm->builder;
3784 LLVMValueRef mxcsr_ptr = lp_build_alloca(
3785 gallivm,
3786 LLVMInt32TypeInContext(gallivm->context),
3787 "mxcsr_ptr");
3788 LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
3789 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3790 lp_build_intrinsic(builder,
3791 "llvm.x86.sse.stmxcsr",
3792 LLVMVoidTypeInContext(gallivm->context),
3793 &mxcsr_ptr8, 1, 0);
3794 return mxcsr_ptr;
3795 }
3796 return 0;
3797 }
3798
3799 void
3800 lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
3801 boolean zero)
3802 {
3803 if (util_cpu_caps.has_sse) {
3804 /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
3805 int daz_ftz = _MM_FLUSH_ZERO_MASK;
3806
3807 LLVMBuilderRef builder = gallivm->builder;
3808 LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
3809 LLVMValueRef mxcsr =
3810 LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");
3811
3812 if (util_cpu_caps.has_daz) {
3813          /* Enable denormals-are-zero (DAZ) mode */
3814 daz_ftz |= _MM_DENORMALS_ZERO_MASK;
3815 }
3816 if (zero) {
3817 mxcsr = LLVMBuildOr(builder, mxcsr,
3818 LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
3819 } else {
3820 mxcsr = LLVMBuildAnd(builder, mxcsr,
3821 LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
3822 }
3823
3824 LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
3825 lp_build_fpstate_set(gallivm, mxcsr_ptr);
3826 }
3827 }
3828
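/*
 * Illustration only: at runtime the IR built above behaves roughly like this
 * scalar code using the SSE intrinsics from <xmmintrin.h> (the has_daz check
 * is ignored here):
 *
 *    static void set_denorms_zero_scalar(int zero)
 *    {
 *       unsigned mxcsr = _mm_getcsr();
 *       unsigned daz_ftz = _MM_FLUSH_ZERO_MASK | _MM_DENORMALS_ZERO_MASK;
 *       if (zero)
 *          mxcsr |= daz_ftz;      // flush denormal inputs/results to zero
 *       else
 *          mxcsr &= ~daz_ftz;
 *       _mm_setcsr(mxcsr);
 *    }
 */
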
3829 void
3830 lp_build_fpstate_set(struct gallivm_state *gallivm,
3831 LLVMValueRef mxcsr_ptr)
3832 {
3833 if (util_cpu_caps.has_sse) {
3834 LLVMBuilderRef builder = gallivm->builder;
3835 mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
3836 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3837 lp_build_intrinsic(builder,
3838 "llvm.x86.sse.ldmxcsr",
3839 LLVMVoidTypeInContext(gallivm->context),
3840 &mxcsr_ptr, 1, 0);
3841 }
3842 }
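
/*
 * Usage sketch (hypothetical caller, illustration only): save the FP state,
 * force denormals to zero around a stretch of generated code, then restore:
 *
 *    LLVMValueRef saved_mxcsr = lp_build_fpstate_get(gallivm);
 *    lp_build_fpstate_set_denorms_zero(gallivm, TRUE);
 *    ... emit code that must not see denormals ...
 *    lp_build_fpstate_set(gallivm, saved_mxcsr);
 */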