/**************************************************************************
 *
 * Copyright 2009-2010 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/
/**
 * @file
 * Helper arithmetic functions.
 *
 * LLVM IR doesn't support all basic arithmetic operations we care about (most
 * notably min/max and saturated operations), and it is often necessary to
 * resort to machine-specific intrinsics directly. The functions here hide all
 * these implementation details from the other modules.
 *
 * We also do simple expression simplification here. The reasons are:
 * - it is very easy, given that we have all the necessary information readily
 *   available
 * - LLVM optimization passes fail to simplify several vector expressions
 * - we often know value constraints which the optimization passes have no way
 *   of knowing, such as when source arguments are known to be in [0, 1] range.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */
#include "util/u_memory.h"
#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/u_cpu_detect.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_init.h"
#include "lp_bld_intr.h"
#include "lp_bld_logic.h"
#include "lp_bld_pack.h"
#include "lp_bld_debug.h"
#include "lp_bld_bitarit.h"
#include "lp_bld_arit.h"
#include "lp_bld_flow.h"
#if defined(PIPE_ARCH_SSE)
#include <xmmintrin.h>

#ifndef _MM_DENORMALS_ZERO_MASK
#define _MM_DENORMALS_ZERO_MASK 0x0040
#endif

#ifndef _MM_FLUSH_ZERO_MASK
#define _MM_FLUSH_ZERO_MASK 0x8000
#endif

#endif /* PIPE_ARCH_SSE */


#define EXP_POLY_DEGREE 5

#define LOG_POLY_DEGREE 4
/**
 * Generate min(a, b)
 * No checks for special case values of a or b = 1 or 0 are done.
 * NaNs are handled according to the behavior specified by the
 * nan_behavior argument.
 */
static LLVMValueRef
lp_build_min_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b,
                    enum gallivm_nan_behavior nan_behavior)
{
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   unsigned intr_size = 0;
   LLVMValueRef cond;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));
   /* TODO: optimize the constant case */

   if (type.floating && util_cpu_caps.has_sse) {
      if (type.width == 32) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse.min.ss";
            intr_size = 128;
         }
         else if (type.length <= 4 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse.min.ps";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.min.ps.256";
            intr_size = 256;
         }
      }
      if (type.width == 64 && util_cpu_caps.has_sse2) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse2.min.sd";
            intr_size = 128;
         }
         else if (type.length == 2 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse2.min.pd";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.min.pd.256";
            intr_size = 256;
         }
      }
   }
   else if (type.floating && util_cpu_caps.has_altivec) {
      if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
          nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
         debug_printf("%s: altivec doesn't support nan return nan behavior\n",
                      __FUNCTION__);
      }
      if (type.width == 32 && type.length == 4) {
         intrinsic = "llvm.ppc.altivec.vminfp";
         intr_size = 128;
      }
   } else if (HAVE_LLVM < 0x0309 &&
              util_cpu_caps.has_avx2 && type.length > 4) {
      intr_size = 256;
      switch (type.width) {
      case 8:
         intrinsic = type.sign ? "llvm.x86.avx2.pmins.b" : "llvm.x86.avx2.pminu.b";
         break;
      case 16:
         intrinsic = type.sign ? "llvm.x86.avx2.pmins.w" : "llvm.x86.avx2.pminu.w";
         break;
      case 32:
         intrinsic = type.sign ? "llvm.x86.avx2.pmins.d" : "llvm.x86.avx2.pminu.d";
         break;
      }
   } else if (HAVE_LLVM < 0x0309 &&
              util_cpu_caps.has_sse2 && type.length >= 2) {
      intr_size = 128;
      if ((type.width == 8 || type.width == 16) &&
          (type.width * type.length <= 64) &&
          (gallivm_debug & GALLIVM_DEBUG_PERF)) {
         debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
                      __FUNCTION__);
      }
      if (type.width == 8 && !type.sign) {
         intrinsic = "llvm.x86.sse2.pminu.b";
      }
      else if (type.width == 16 && type.sign) {
         intrinsic = "llvm.x86.sse2.pmins.w";
      }
      if (util_cpu_caps.has_sse4_1) {
         if (type.width == 8 && type.sign) {
            intrinsic = "llvm.x86.sse41.pminsb";
         }
         if (type.width == 16 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pminuw";
         }
         if (type.width == 32 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pminud";
         }
         if (type.width == 32 && type.sign) {
            intrinsic = "llvm.x86.sse41.pminsd";
         }
      }
   } else if (util_cpu_caps.has_altivec) {
      intr_size = 128;
      if (type.width == 8) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vminub";
         } else {
            intrinsic = "llvm.ppc.altivec.vminsb";
         }
      } else if (type.width == 16) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vminuh";
         } else {
            intrinsic = "llvm.ppc.altivec.vminsh";
         }
      } else if (type.width == 32) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vminuw";
         } else {
            intrinsic = "llvm.ppc.altivec.vminsw";
         }
      }
   }
   if (intrinsic) {
      /* We need to handle NaNs for floating point numbers. If one of the
       * inputs is NaN the other should be returned (required by both D3D10+
       * and OpenCL).
       * The SSE intrinsics return the second operand in case of NaN by
       * default, so we need special code to handle those cases.
       */
      if (util_cpu_caps.has_sse && type.floating &&
          nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
          nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
          nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
         LLVMValueRef isnan, min;
         min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                   type, intr_size, a, b);
         if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
            isnan = lp_build_isnan(bld, b);
            return lp_build_select(bld, isnan, a, min);
         } else {
            assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
            isnan = lp_build_isnan(bld, a);
            return lp_build_select(bld, isnan, a, min);
         }
      } else {
         return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                    type, intr_size, a, b);
      }
   }
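   /*
    * Note on the fixup above: SSE min/max compute "a < b ? a : b", so any
    * comparison involving a NaN is false and the *second* operand is
    * returned, e.g. minps(NaN, x) yields x while minps(x, NaN) yields NaN.
    * Without an intrinsic we fall back to a generic compare + select below.
    */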
   if (type.floating) {
      switch (nan_behavior) {
      case GALLIVM_NAN_RETURN_NAN: {
         LLVMValueRef isnan = lp_build_isnan(bld, b);
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
         cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
         return lp_build_select(bld, cond, a, b);
      }
      case GALLIVM_NAN_RETURN_OTHER: {
         LLVMValueRef isnan = lp_build_isnan(bld, a);
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
         cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
         return lp_build_select(bld, cond, a, b);
      }
      case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
         cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
         return lp_build_select(bld, cond, a, b);
      case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
         return lp_build_select(bld, cond, b, a);
      case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
         return lp_build_select(bld, cond, a, b);
      default:
         assert(0);
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
         return lp_build_select(bld, cond, a, b);
      }
   } else {
      cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
      return lp_build_select(bld, cond, a, b);
   }
}
LLVMValueRef
lp_build_fmuladd(LLVMBuilderRef builder,
                 LLVMValueRef a,
                 LLVMValueRef b,
                 LLVMValueRef c)
{
   LLVMTypeRef type = LLVMTypeOf(a);
   assert(type == LLVMTypeOf(b));
   assert(type == LLVMTypeOf(c));
   if (HAVE_LLVM < 0x0304) {
      /* XXX: LLVM 3.3 does not break down llvm.fmuladd into mul+add when FMA
       * is not supported, and instead falls back to a C function.
       */
      return LLVMBuildFAdd(builder, LLVMBuildFMul(builder, a, b, ""), c, "");
   }
   char intrinsic[32];
   lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
   LLVMValueRef args[] = { a, b, c };
   return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0);
}
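/*
 * Note: lp_format_intrinsic() above produces the overloaded intrinsic name,
 * e.g. "llvm.fmuladd.v4f32" for a 4 x float32 vector. llvm.fmuladd gives
 * LLVM license to emit a fused multiply-add where the target supports it,
 * or a separate mul + add otherwise.
 */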
/**
 * Generate max(a, b)
 * No checks for special case values of a or b = 1 or 0 are done.
 * NaNs are handled according to the behavior specified by the
 * nan_behavior argument.
 */
static LLVMValueRef
lp_build_max_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b,
                    enum gallivm_nan_behavior nan_behavior)
{
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   unsigned intr_size = 0;
   LLVMValueRef cond;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* TODO: optimize the constant case */

   if (type.floating && util_cpu_caps.has_sse) {
      if (type.width == 32) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse.max.ss";
            intr_size = 128;
         }
         else if (type.length <= 4 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse.max.ps";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.max.ps.256";
            intr_size = 256;
         }
      }
      if (type.width == 64 && util_cpu_caps.has_sse2) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse2.max.sd";
            intr_size = 128;
         }
         else if (type.length == 2 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse2.max.pd";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.max.pd.256";
            intr_size = 256;
         }
      }
   }
   else if (type.floating && util_cpu_caps.has_altivec) {
      if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
          nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
         debug_printf("%s: altivec doesn't support nan return nan behavior\n",
                      __FUNCTION__);
      }
      /* Match the condition used in lp_build_min_simple() above. */
      if (type.width == 32 && type.length == 4) {
         intrinsic = "llvm.ppc.altivec.vmaxfp";
         intr_size = 128;
      }
   } else if (HAVE_LLVM < 0x0309 &&
              util_cpu_caps.has_avx2 && type.length > 4) {
      intr_size = 256;
      switch (type.width) {
      case 8:
         intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.b" : "llvm.x86.avx2.pmaxu.b";
         break;
      case 16:
         intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.w" : "llvm.x86.avx2.pmaxu.w";
         break;
      case 32:
         intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.d" : "llvm.x86.avx2.pmaxu.d";
         break;
      }
   } else if (HAVE_LLVM < 0x0309 &&
              util_cpu_caps.has_sse2 && type.length >= 2) {
      intr_size = 128;
      if ((type.width == 8 || type.width == 16) &&
          (type.width * type.length <= 64) &&
          (gallivm_debug & GALLIVM_DEBUG_PERF)) {
         debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
                      __FUNCTION__);
      }
      if (type.width == 8 && !type.sign) {
         intrinsic = "llvm.x86.sse2.pmaxu.b";
      }
      else if (type.width == 16 && type.sign) {
         intrinsic = "llvm.x86.sse2.pmaxs.w";
      }
      if (util_cpu_caps.has_sse4_1) {
         if (type.width == 8 && type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxsb";
         }
         if (type.width == 16 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxuw";
         }
         if (type.width == 32 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxud";
         }
         if (type.width == 32 && type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxsd";
         }
      }
   } else if (util_cpu_caps.has_altivec) {
      intr_size = 128;
      if (type.width == 8) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vmaxub";
         } else {
            intrinsic = "llvm.ppc.altivec.vmaxsb";
         }
      } else if (type.width == 16) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vmaxuh";
         } else {
            intrinsic = "llvm.ppc.altivec.vmaxsh";
         }
      } else if (type.width == 32) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vmaxuw";
         } else {
            intrinsic = "llvm.ppc.altivec.vmaxsw";
         }
      }
   }
   if (intrinsic) {
      if (util_cpu_caps.has_sse && type.floating &&
          nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
          nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
          nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
         LLVMValueRef isnan, max;
         max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                   type, intr_size, a, b);
         if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
            isnan = lp_build_isnan(bld, b);
            return lp_build_select(bld, isnan, a, max);
         } else {
            assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
            isnan = lp_build_isnan(bld, a);
            return lp_build_select(bld, isnan, a, max);
         }
      } else {
         return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                    type, intr_size, a, b);
      }
   }
   if (type.floating) {
      switch (nan_behavior) {
      case GALLIVM_NAN_RETURN_NAN: {
         LLVMValueRef isnan = lp_build_isnan(bld, b);
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
         cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
         return lp_build_select(bld, cond, a, b);
      }
      case GALLIVM_NAN_RETURN_OTHER: {
         LLVMValueRef isnan = lp_build_isnan(bld, a);
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
         cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
         return lp_build_select(bld, cond, a, b);
      }
      case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
         cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
         return lp_build_select(bld, cond, a, b);
      case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
         return lp_build_select(bld, cond, b, a);
      case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
         return lp_build_select(bld, cond, a, b);
      default:
         assert(0);
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
         return lp_build_select(bld, cond, a, b);
      }
   } else {
      cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
      return lp_build_select(bld, cond, a, b);
   }
}
/**
 * Generate 1 - a, or ~a depending on bld->type.
 */
LLVMValueRef
lp_build_comp(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   if (a == bld->one)
      return bld->zero;
   if (a == bld->zero)
      return bld->one;

   if (type.norm && !type.floating && !type.fixed && !type.sign) {
      if (LLVMIsConstant(a))
         return LLVMConstNot(a);
      else
         return LLVMBuildNot(builder, a, "");
   }

   if (LLVMIsConstant(a))
      if (type.floating)
         return LLVMConstFSub(bld->one, a);
      else
         return LLVMConstSub(bld->one, a);
   else
      if (type.floating)
         return LLVMBuildFSub(builder, bld->one, a, "");
      else
         return LLVMBuildSub(builder, bld->one, a, "");
}
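/*
 * Note: for 8-bit unsigned normalized values 1.0 is represented as 0xff,
 * so 1.0 - a equals ~a bitwise, e.g. a = 0x40 (~0.25) gives ~a = 0xbf
 * (~0.75); that is why the unorm case above can use a single NOT.
 */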
LLVMValueRef
lp_build_add(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if (a == bld->zero)
      return b;
   if (b == bld->zero)
      return a;
   if (a == bld->undef || b == bld->undef)
      return bld->undef;

   if (bld->type.norm) {
      const char *intrinsic = NULL;

      if (a == bld->one || b == bld->one)
         return bld->one;

      if (!type.floating && !type.fixed) {
         if (type.width * type.length == 128) {
            if (util_cpu_caps.has_sse2) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
            } else if (util_cpu_caps.has_altivec) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
            }
         }
         if (type.width * type.length == 256) {
            if (util_cpu_caps.has_avx2) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b";
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : "llvm.x86.avx2.paddus.w";
            }
         }
      }

      if (intrinsic)
         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
   }

   if (type.norm && !type.floating && !type.fixed) {
      if (type.sign) {
         uint64_t sign = (uint64_t)1 << (type.width - 1);
         LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
         LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
         /* a_clamp_max is the maximum a for positive b,
            a_clamp_min is the minimum a for negative b. */
         LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
         LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
         a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
      } else {
         a = lp_build_min_simple(bld, a, lp_build_comp(bld, b), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
      }
   }

   if (LLVMIsConstant(a) && LLVMIsConstant(b))
      if (type.floating)
         res = LLVMConstFAdd(a, b);
      else
         res = LLVMConstAdd(a, b);
   else
      if (type.floating)
         res = LLVMBuildFAdd(builder, a, b, "");
      else
         res = LLVMBuildAdd(builder, a, b, "");

   /* clamp to ceiling of 1.0 */
   if (bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);

   /* XXX clamp to floor of -1 or 0??? */

   return res;
}
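/*
 * Worked example for the signed saturation above, 8-bit normalized:
 * a = 100, b = 50 gives a_clamp_max = min(100, 127 - 50) = 77, so the
 * final sum is 77 + 50 = 127 (saturated) rather than the wrapped -106.
 */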
/** Return the scalar sum of the elements of a.
 * Should avoid this operation whenever possible.
 */
LLVMValueRef
lp_build_horizontal_add(struct lp_build_context *bld,
                        LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef index, res;
   unsigned i, length;
   LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
   LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
   LLVMValueRef vecres, elem2;

   assert(lp_check_value(type, a));

   if (type.length == 1) {
      return a;
   }

   assert(!bld->type.norm);

   /*
    * XXX: for byte vectors we could do much better with psadbw.
    * Using repeated shuffle/adds here. Note with multiple vectors
    * this can be done more efficiently as outlined in the intel
    * optimization manual.
    * Note: could cause data rearrangement if used with smaller element
    * sizes.
    */

   vecres = a;
   length = type.length / 2;

   /* Every iteration the vector is cut in half. */
   while (length > 1) {
      LLVMValueRef vec1, vec2;
      for (i = 0; i < length; i++) {
         shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
         shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
      }
      vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
                                    LLVMConstVector(shuffles1, length), "");
      vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
                                    LLVMConstVector(shuffles2, length), "");
      if (type.floating) {
         vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
      }
      else {
         vecres = LLVMBuildAdd(builder, vec1, vec2, "");
      }
      length = length >> 1;
   }

   /* always have vector of size 2 here */
   assert(length == 1);

   index = lp_build_const_int32(bld->gallivm, 0);
   res = LLVMBuildExtractElement(builder, vecres, index, "");
   index = lp_build_const_int32(bld->gallivm, 1);
   elem2 = LLVMBuildExtractElement(builder, vecres, index, "");

   if (type.floating)
      res = LLVMBuildFAdd(builder, res, elem2, "");
   else
      res = LLVMBuildAdd(builder, res, elem2, "");

   return res;
}
/**
 * Return the horizontal sums of 4 float vectors as a float4 vector.
 * This uses the technique as outlined in Intel Optimization Manual.
 */
static LLVMValueRef
lp_build_horizontal_add4x4f(struct lp_build_context *bld,
                            LLVMValueRef src[4])
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef shuffles[4];
   LLVMValueRef tmp[4];
   LLVMValueRef sumtmp[2], shuftmp[2];

   /* lower half of regs */
   shuffles[0] = lp_build_const_int32(gallivm, 0);
   shuffles[1] = lp_build_const_int32(gallivm, 1);
   shuffles[2] = lp_build_const_int32(gallivm, 4);
   shuffles[3] = lp_build_const_int32(gallivm, 5);
   tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
                                   LLVMConstVector(shuffles, 4), "");
   tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
                                   LLVMConstVector(shuffles, 4), "");

   /* upper half of regs */
   shuffles[0] = lp_build_const_int32(gallivm, 2);
   shuffles[1] = lp_build_const_int32(gallivm, 3);
   shuffles[2] = lp_build_const_int32(gallivm, 6);
   shuffles[3] = lp_build_const_int32(gallivm, 7);
   tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
                                   LLVMConstVector(shuffles, 4), "");
   tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
                                   LLVMConstVector(shuffles, 4), "");

   sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
   sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");

   shuffles[0] = lp_build_const_int32(gallivm, 0);
   shuffles[1] = lp_build_const_int32(gallivm, 2);
   shuffles[2] = lp_build_const_int32(gallivm, 4);
   shuffles[3] = lp_build_const_int32(gallivm, 6);
   shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
                                       LLVMConstVector(shuffles, 4), "");

   shuffles[0] = lp_build_const_int32(gallivm, 1);
   shuffles[1] = lp_build_const_int32(gallivm, 3);
   shuffles[2] = lp_build_const_int32(gallivm, 5);
   shuffles[3] = lp_build_const_int32(gallivm, 7);
   shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
                                       LLVMConstVector(shuffles, 4), "");

   return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
}
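/*
 * Walk-through of the shuffles above, for inputs x, y, z, w (4 floats each):
 *    tmp[0] = {x0,x1,y0,y1}   tmp[1] = {x2,x3,y2,y3}
 *    tmp[2] = {z0,z1,w0,w1}   tmp[3] = {z2,z3,w2,w3}
 *    sumtmp[0] = {x0+x2, x1+x3, y0+y2, y1+y3}
 *    sumtmp[1] = {z0+z2, z1+z3, w0+w2, w1+w3}
 *    shuftmp[0] + shuftmp[1] = {sum(x), sum(y), sum(z), sum(w)}
 */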
/*
 * Partially horizontally add 2-4 float vectors with length nx4,
 * i.e. only four adjacent values in each vector will be added,
 * assuming values are really grouped in 4 which also determines
 * how many results are returned.
 *
 * Return a vector of the same length as the initial vectors,
 * with the excess elements (if any) being undefined.
 * The element order is independent of number of input vectors.
 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
 * the output order thus will be
 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
 */
LLVMValueRef
lp_build_hadd_partial4(struct lp_build_context *bld,
                       LLVMValueRef vectors[],
                       unsigned num_vecs)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef ret_vec;
   LLVMValueRef tmp[4];
   const char *intrinsic = NULL;

   assert(num_vecs >= 2 && num_vecs <= 4);
   assert(bld->type.floating);

   /* only use this with at least 2 vectors, as it is sort of expensive
    * (depending on cpu) and we always need two horizontal adds anyway,
    * so a shuffle/add approach might be better.
    */

   tmp[0] = vectors[0];
   tmp[1] = vectors[1];
   tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
   tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];

   if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
       bld->type.length == 4) {
      intrinsic = "llvm.x86.sse3.hadd.ps";
   }
   else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
            bld->type.length == 8) {
      intrinsic = "llvm.x86.avx.hadd.ps.256";
   }
   if (intrinsic) {
      tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
                                         lp_build_vec_type(gallivm, bld->type),
                                         tmp[0], tmp[1]);
      if (num_vecs > 2) {
         tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
                                            lp_build_vec_type(gallivm, bld->type),
                                            tmp[2], tmp[3]);
      }
      else {
         tmp[1] = tmp[0];
      }
      return lp_build_intrinsic_binary(builder, intrinsic,
                                       lp_build_vec_type(gallivm, bld->type),
                                       tmp[0], tmp[1]);
   }

   if (bld->type.length == 4) {
      ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
   }
   else {
      LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
      unsigned j;
      unsigned num_iter = bld->type.length / 4;
      struct lp_type parttype = bld->type;
      parttype.length = 4;
      for (j = 0; j < num_iter; j++) {
         LLVMValueRef partsrc[4];
         unsigned i;
         for (i = 0; i < 4; i++) {
            partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
         }
         partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
      }
      ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
   }
   return ret_vec;
}
LLVMValueRef
lp_build_sub(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if (b == bld->zero)
      return a;
   if (a == bld->undef || b == bld->undef)
      return bld->undef;
   if (a == b)
      return bld->zero;

   if (bld->type.norm) {
      const char *intrinsic = NULL;

      if (b == bld->one)
         return bld->zero;

      if (!type.floating && !type.fixed) {
         if (type.width * type.length == 128) {
            if (util_cpu_caps.has_sse2) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
            } else if (util_cpu_caps.has_altivec) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
            }
         }
         if (type.width * type.length == 256) {
            if (util_cpu_caps.has_avx2) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b";
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" : "llvm.x86.avx2.psubus.w";
            }
         }
      }

      if (intrinsic)
         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
   }

   if (type.norm && !type.floating && !type.fixed) {
      if (type.sign) {
         uint64_t sign = (uint64_t)1 << (type.width - 1);
         LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
         LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
         /* a_clamp_max is the maximum a for negative b,
            a_clamp_min is the minimum a for positive b. */
         LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
         LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
         a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
      } else {
         a = lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
      }
   }

   if (LLVMIsConstant(a) && LLVMIsConstant(b))
      if (type.floating)
         res = LLVMConstFSub(a, b);
      else
         res = LLVMConstSub(a, b);
   else
      if (type.floating)
         res = LLVMBuildFSub(builder, a, b, "");
      else
         res = LLVMBuildSub(builder, a, b, "");

   if (bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);

   return res;
}
/**
 * Normalized multiplication.
 *
 * There are several approaches (using 8-bit normalized multiplication as
 * an example):
 *
 * - alpha plus one
 *
 *     makes the following approximation to the division (Sree)
 *
 *       a*b/255 ~= (a*(b + 1)) >> 8
 *
 *     which is the fastest method that satisfies the following OpenGL
 *     criteria of
 *
 *       0*0 = 0 and 255*255 = 255
 *
 * - geometric series
 *
 *     takes the geometric series approximation to the division
 *
 *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
 *
 *     in this case just the first two terms to fit in 16bit arithmetic
 *
 *       t/255 ~= (t + (t >> 8)) >> 8
 *
 *     note that just by itself it doesn't satisfy the OpenGL criteria, as
 *     255*255 = 254, so the special case b = 255 must be accounted for, or
 *     roundoff must be used
 *
 * - geometric series plus rounding
 *
 *     when using a geometric series division instead of truncating the result
 *     use roundoff in the approximation (Jim Blinn)
 *
 *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
 *
 *     achieving the exact results.
 *
 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
 *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
 * @sa Michael Herf, The "double blend trick", May 2000,
 *     http://www.stereopsis.com/doubleblend.html
 */
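/*
 * Worked example of the rounded geometric series, n = 8, a = b = 255:
 * a*b = 65025; 65025 + (65025 >> 8) = 65025 + 254 = 65279; adding the
 * rounding term 0x80 gives 65407; 65407 >> 8 = 255, i.e. 1.0 * 1.0 = 1.0
 * exactly, which is the scheme implemented below.
 */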
LLVMValueRef
lp_build_mul_norm(struct gallivm_state *gallivm,
                  struct lp_type wide_type,
                  LLVMValueRef a, LLVMValueRef b)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_build_context bld;
   unsigned n;
   LLVMValueRef half;
   LLVMValueRef ab;

   assert(!wide_type.floating);
   assert(lp_check_value(wide_type, a));
   assert(lp_check_value(wide_type, b));

   lp_build_context_init(&bld, gallivm, wide_type);

   n = wide_type.width / 2;
   if (wide_type.sign) {
      --n;
   }

   /*
    * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
    * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
    */

   /*
    * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
    */

   ab = LLVMBuildMul(builder, a, b, "");
   ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");

   /*
    * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
    */

   half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
   if (wide_type.sign) {
      LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
      LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
      half = lp_build_select(&bld, sign, minus_half, half);
   }
   ab = LLVMBuildAdd(builder, ab, half, "");

   /* Final division */
   ab = lp_build_shr_imm(&bld, ab, n);

   return ab;
}
LLVMValueRef
lp_build_mul(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef shift;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if (a == bld->zero)
      return bld->zero;
   if (a == bld->one)
      return b;
   if (b == bld->zero)
      return bld->zero;
   if (b == bld->one)
      return a;
   if (a == bld->undef || b == bld->undef)
      return bld->undef;

   if (!type.floating && !type.fixed && type.norm) {
      struct lp_type wide_type = lp_wider_type(type);
      LLVMValueRef al, ah, bl, bh, abl, abh, ab;

      lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah);
      lp_build_unpack2_native(bld->gallivm, type, wide_type, b, &bl, &bh);

      /* PMULLW, PSRLW, PADDW */
      abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
      abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);

      ab = lp_build_pack2_native(bld->gallivm, wide_type, type, abl, abh);

      return ab;
   }

   if (type.fixed)
      shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
   else
      shift = NULL;

   if (LLVMIsConstant(a) && LLVMIsConstant(b)) {
      if (type.floating)
         res = LLVMConstFMul(a, b);
      else
         res = LLVMConstMul(a, b);
      if (shift) {
         if (type.sign)
            res = LLVMConstAShr(res, shift);
         else
            res = LLVMConstLShr(res, shift);
      }
   }
   else {
      if (type.floating)
         res = LLVMBuildFMul(builder, a, b, "");
      else
         res = LLVMBuildMul(builder, a, b, "");
      if (shift) {
         if (type.sign)
            res = LLVMBuildAShr(builder, res, shift, "");
         else
            res = LLVMBuildLShr(builder, res, shift, "");
      }
   }

   return res;
}
/*
 * Widening mul, valid for 32x32 bit -> 64bit only.
 * Result is low 32bits, high bits returned in res_hi.
 *
 * Emits code that is meant to be compiled for the host CPU.
 */
LLVMValueRef
lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
                         LLVMValueRef a,
                         LLVMValueRef b,
                         LLVMValueRef *res_hi)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   unsigned i;

   assert(bld->type.width == 32);
   assert(bld->type.floating == 0);
   assert(bld->type.fixed == 0);
   assert(bld->type.norm == 0);

   /*
    * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces
    * for x86 simd is atrocious (even if the high bits weren't required),
    * trying to handle real 64bit inputs (which of course can't happen due
    * to using 64bit umul with 32bit numbers zero-extended to 64bit, but
    * apparently llvm does not recognize this widening mul). This includes 6
    * (instead of 2) pmuludq plus extra adds and shifts.
    * The same story applies to signed mul, albeit fixing this requires sse41.
    * https://llvm.org/bugs/show_bug.cgi?id=30845
    * So, whip up our own code, albeit only for length 4 and 8 (which
    * should be good enough)...
    */
   if ((bld->type.length == 4 || bld->type.length == 8) &&
       ((util_cpu_caps.has_sse2 && (bld->type.sign == 0)) ||
        util_cpu_caps.has_sse4_1)) {
      const char *intrinsic = NULL;
      LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd;
      LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec;
      struct lp_type type_wide = lp_wider_type(bld->type);
      LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide);

      for (i = 0; i < bld->type.length; i += 2) {
         shuf[i] = lp_build_const_int32(gallivm, i+1);
         shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
      }
      shuf_vec = LLVMConstVector(shuf, bld->type.length);
      aeven = a;
      beven = b;
      aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, "");
      bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, "");

      if (util_cpu_caps.has_avx2 && bld->type.length == 8) {
         if (bld->type.sign) {
            intrinsic = "llvm.x86.avx2.pmul.dq";
         } else {
            intrinsic = "llvm.x86.avx2.pmulu.dq";
         }
         muleven = lp_build_intrinsic_binary(builder, intrinsic,
                                             wider_type, aeven, beven);
         mulodd = lp_build_intrinsic_binary(builder, intrinsic,
                                            wider_type, aodd, bodd);
      }
      else {
         /* for consistent naming look elsewhere... */
         if (bld->type.sign) {
            intrinsic = "llvm.x86.sse41.pmuldq";
         } else {
            intrinsic = "llvm.x86.sse2.pmulu.dq";
         }
         /*
          * XXX If we only have AVX but not AVX2 this is a pain.
          * lp_build_intrinsic_binary_anylength() can't handle it
          * (due to src and dst type not being identical).
          */
         if (bld->type.length == 8) {
            LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi;
            LLVMValueRef aoddlo, aoddhi, boddlo, boddhi;
            LLVMValueRef muleven2[2], mulodd2[2];
            struct lp_type type_wide_half = type_wide;
            LLVMTypeRef wtype_half;
            type_wide_half.length = 2;
            wtype_half = lp_build_vec_type(gallivm, type_wide_half);
            aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4);
            aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4);
            bevenlo = lp_build_extract_range(gallivm, beven, 0, 4);
            bevenhi = lp_build_extract_range(gallivm, beven, 4, 4);
            aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4);
            aoddhi = lp_build_extract_range(gallivm, aodd, 4, 4);
            boddlo = lp_build_extract_range(gallivm, bodd, 0, 4);
            boddhi = lp_build_extract_range(gallivm, bodd, 4, 4);
            muleven2[0] = lp_build_intrinsic_binary(builder, intrinsic,
                                                    wtype_half, aevenlo, bevenlo);
            mulodd2[0] = lp_build_intrinsic_binary(builder, intrinsic,
                                                   wtype_half, aoddlo, boddlo);
            muleven2[1] = lp_build_intrinsic_binary(builder, intrinsic,
                                                    wtype_half, aevenhi, bevenhi);
            mulodd2[1] = lp_build_intrinsic_binary(builder, intrinsic,
                                                   wtype_half, aoddhi, boddhi);
            muleven = lp_build_concat(gallivm, muleven2, type_wide_half, 2);
            mulodd = lp_build_concat(gallivm, mulodd2, type_wide_half, 2);
         }
         else {
            muleven = lp_build_intrinsic_binary(builder, intrinsic,
                                                wider_type, aeven, beven);
            mulodd = lp_build_intrinsic_binary(builder, intrinsic,
                                               wider_type, aodd, bodd);
         }
      }
      muleven = LLVMBuildBitCast(builder, muleven, bld->vec_type, "");
      mulodd = LLVMBuildBitCast(builder, mulodd, bld->vec_type, "");

      for (i = 0; i < bld->type.length; i += 2) {
         shuf[i] = lp_build_const_int32(gallivm, i + 1);
         shuf[i+1] = lp_build_const_int32(gallivm, i + 1 + bld->type.length);
      }
      shuf_vec = LLVMConstVector(shuf, bld->type.length);
      *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");

      for (i = 0; i < bld->type.length; i += 2) {
         shuf[i] = lp_build_const_int32(gallivm, i);
         shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length);
      }
      shuf_vec = LLVMConstVector(shuf, bld->type.length);
      return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
   }
   else {
      return lp_build_mul_32_lohi(bld, a, b, res_hi);
   }
}
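/*
 * Note: pmuludq/pmuldq multiply only the even 32-bit lanes (0, 2, ...) of
 * each source, each pair producing a full 64-bit product. That is why the
 * code above shifts the odd lanes into even position first and then
 * re-interleaves the low and high halves of the products with two shuffles.
 */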
/*
 * Widening mul, valid for 32x32 bit -> 64bit only.
 * Result is low 32bits, high bits returned in res_hi.
 *
 * Emits generic code.
 */
LLVMValueRef
lp_build_mul_32_lohi(struct lp_build_context *bld,
                     LLVMValueRef a,
                     LLVMValueRef b,
                     LLVMValueRef *res_hi)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef tmp, shift, res_lo;
   struct lp_type type_tmp;
   LLVMTypeRef wide_type, narrow_type;

   type_tmp = bld->type;
   narrow_type = lp_build_vec_type(gallivm, type_tmp);
   type_tmp.width *= 2;
   wide_type = lp_build_vec_type(gallivm, type_tmp);
   shift = lp_build_const_vec(gallivm, type_tmp, 32);

   if (bld->type.sign) {
      a = LLVMBuildSExt(builder, a, wide_type, "");
      b = LLVMBuildSExt(builder, b, wide_type, "");
   } else {
      a = LLVMBuildZExt(builder, a, wide_type, "");
      b = LLVMBuildZExt(builder, b, wide_type, "");
   }
   tmp = LLVMBuildMul(builder, a, b, "");

   res_lo = LLVMBuildTrunc(builder, tmp, narrow_type, "");

   /* Since we truncate anyway, LShr and AShr are equivalent. */
   tmp = LLVMBuildLShr(builder, tmp, shift, "");
   *res_hi = LLVMBuildTrunc(builder, tmp, narrow_type, "");

   return res_lo;
}
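/*
 * Note: the generic path above is just the scalar identity lifted to
 * vectors; for unsigned lanes it computes res_lo = (uint32_t)(a * b) and
 * res_hi = (uint32_t)(((uint64_t)a * b) >> 32).
 */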
LLVMValueRef
lp_build_mad(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b,
             LLVMValueRef c)
{
   const struct lp_type type = bld->type;
   if (type.floating) {
      return lp_build_fmuladd(bld->gallivm->builder, a, b, c);
   } else {
      return lp_build_add(bld, lp_build_mul(bld, a, b), c);
   }
}
/**
 * Small vector x scale multiplication optimization.
 */
LLVMValueRef
lp_build_mul_imm(struct lp_build_context *bld,
                 LLVMValueRef a,
                 int b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef factor;

   assert(lp_check_value(bld->type, a));

   if (b == 0)
      return bld->zero;

   if (b == 1)
      return a;

   if (b == -1)
      return lp_build_negate(bld, a);

   if (b == 2 && bld->type.floating)
      return lp_build_add(bld, a, a);

   if (util_is_power_of_two(b)) {
      unsigned shift = ffs(b) - 1;

      if (bld->type.floating) {
#if 0
         /*
          * Power of two multiplication by directly manipulating the exponent.
          *
          * XXX: This might not be always faster, it will introduce a small
          * error for multiplication by zero, and it will produce wrong results
          * for Inf and NaN.
          */
         unsigned mantissa = lp_mantissa(bld->type);
         factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
         a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
         a = LLVMBuildAdd(builder, a, factor, "");
         a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
         return a;
#endif
      }
      else {
         factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
         return LLVMBuildShl(builder, a, factor, "");
      }
   }

   factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
   return lp_build_mul(bld, a, factor);
}
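/*
 * Note on the disabled exponent trick above: adding (shift << 23) to a
 * float's bit pattern bumps its exponent by "shift", e.g. 2.0f is
 * 0x40000000 and 0x40000000 + (3 << 23) = 0x41800000, which is 16.0f,
 * i.e. a multiplication by 8.
 */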
LLVMValueRef
lp_build_div(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if (a == bld->zero)
      return bld->zero;
   if (a == bld->one && type.floating)
      return lp_build_rcp(bld, b);
   if (b == bld->zero)
      return bld->undef;
   if (b == bld->one)
      return a;
   if (a == bld->undef || b == bld->undef)
      return bld->undef;

   if (LLVMIsConstant(a) && LLVMIsConstant(b)) {
      if (type.floating)
         return LLVMConstFDiv(a, b);
      else if (type.sign)
         return LLVMConstSDiv(a, b);
      else
         return LLVMConstUDiv(a, b);
   }

   /* fast rcp is disabled (just uses div), so makes no sense to try that */
   if (FALSE &&
       ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
        (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
       type.floating)
      return lp_build_mul(bld, a, lp_build_rcp(bld, b));

   if (type.floating)
      return LLVMBuildFDiv(builder, a, b, "");
   else if (type.sign)
      return LLVMBuildSDiv(builder, a, b, "");
   else
      return LLVMBuildUDiv(builder, a, b, "");
}
/**
 * Linear interpolation helper.
 *
 * @param normalized whether we are interpolating normalized values,
 *        encoded in normalized integers, twice as wide.
 *
 * @sa http://www.stereopsis.com/doubleblend.html
 */
static inline LLVMValueRef
lp_build_lerp_simple(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef v0,
                     LLVMValueRef v1,
                     unsigned flags)
{
   unsigned half_width = bld->type.width/2;
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef delta;
   LLVMValueRef res;

   assert(lp_check_value(bld->type, x));
   assert(lp_check_value(bld->type, v0));
   assert(lp_check_value(bld->type, v1));

   delta = lp_build_sub(bld, v1, v0);

   if (bld->type.floating) {
      assert(flags == 0);
      return lp_build_mad(bld, x, delta, v0);
   }

   if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
      if (!bld->type.sign) {
         if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
            /*
             * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
             * most-significant-bit to the lowest-significant-bit, so that
             * later we can just divide by 2**n instead of 2**n - 1.
             */
            x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
         }

         /* (x * delta) >> n */
         res = lp_build_mul(bld, x, delta);
         res = lp_build_shr_imm(bld, res, half_width);
      } else {
         /*
          * The rescaling trick above doesn't work for signed numbers, so
          * use the 2**n - 1 division approximation in lp_build_mul_norm
          * instead.
          */
         assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
         res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
      }
   } else {
      assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
      res = lp_build_mul(bld, x, delta);
   }

   if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
      /*
       * At this point both res and v0 only use the lower half of the bits,
       * the rest is zero. Instead of add / mask, do add with half wide type.
       */
      struct lp_type narrow_type;
      struct lp_build_context narrow_bld;

      memset(&narrow_type, 0, sizeof narrow_type);
      narrow_type.sign   = bld->type.sign;
      narrow_type.width  = bld->type.width/2;
      narrow_type.length = bld->type.length*2;

      lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
      res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
      v0  = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
      res = lp_build_add(&narrow_bld, v0, res);
      res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
   } else {
      res = lp_build_add(bld, v0, res);

      if (bld->type.fixed) {
         /*
          * We need to mask out the high order bits when lerping 8bit
          * normalized colors stored on 16bits.
          *
          * XXX: This step is necessary for lerping 8bit colors stored on
          * 16bits, but it will be wrong for true fixed point use cases.
          * Basically we need a more powerful lp_type, capable of further
          * distinguishing the values interpretation from the value storage.
          */
         LLVMValueRef low_bits;
         low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
         res = LLVMBuildAnd(builder, res, low_bits, "");
      }
   }

   return res;
}
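/*
 * Note: with half_width = 8 the unsigned weight rescale above maps
 * x = 0xff to 0xff + (0xff >> 7) = 0x100, so (x * delta) >> 8 equals delta
 * and the lerp returns exactly v1 at the top of the range.
 */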
/**
 * Linear interpolation.
 */
LLVMValueRef
lp_build_lerp(struct lp_build_context *bld,
              LLVMValueRef x,
              LLVMValueRef v0,
              LLVMValueRef v1,
              unsigned flags)
{
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, x));
   assert(lp_check_value(type, v0));
   assert(lp_check_value(type, v1));

   assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));

   if (type.norm) {
      struct lp_type wide_type;
      struct lp_build_context wide_bld;
      LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;

      assert(type.length >= 2);

      /*
       * Create a wider integer type, enough to hold the
       * intermediate result of the multiplication.
       */
      memset(&wide_type, 0, sizeof wide_type);
      wide_type.sign   = type.sign;
      wide_type.width  = type.width*2;
      wide_type.length = type.length/2;

      lp_build_context_init(&wide_bld, bld->gallivm, wide_type);

      lp_build_unpack2_native(bld->gallivm, type, wide_type, x,  &xl,  &xh);
      lp_build_unpack2_native(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
      lp_build_unpack2_native(bld->gallivm, type, wide_type, v1, &v1l, &v1h);

      /*
       * Lerp both halves.
       */

      flags |= LP_BLD_LERP_WIDE_NORMALIZED;

      resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
      resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);

      res = lp_build_pack2_native(bld->gallivm, wide_type, type, resl, resh);
   } else {
      res = lp_build_lerp_simple(bld, x, v0, v1, flags);
   }

   return res;
}
/**
 * Bilinear interpolation.
 *
 * Value indices are in v_{yx} order, i.e. v01 is the value at x = 1, y = 0.
 */
LLVMValueRef
lp_build_lerp_2d(struct lp_build_context *bld,
                 LLVMValueRef x,
                 LLVMValueRef y,
                 LLVMValueRef v00,
                 LLVMValueRef v01,
                 LLVMValueRef v10,
                 LLVMValueRef v11,
                 unsigned flags)
{
   LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
   LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
   return lp_build_lerp(bld, y, v0, v1, flags);
}


LLVMValueRef
lp_build_lerp_3d(struct lp_build_context *bld,
                 LLVMValueRef x,
                 LLVMValueRef y,
                 LLVMValueRef z,
                 LLVMValueRef v000,
                 LLVMValueRef v001,
                 LLVMValueRef v010,
                 LLVMValueRef v011,
                 LLVMValueRef v100,
                 LLVMValueRef v101,
                 LLVMValueRef v110,
                 LLVMValueRef v111,
                 unsigned flags)
{
   LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
   LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
   return lp_build_lerp(bld, z, v0, v1, flags);
}
/**
 * Generate min(a, b)
 * Do checks for special cases but not for NaNs.
 */
LLVMValueRef
lp_build_min(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if (a == bld->undef || b == bld->undef)
      return bld->undef;

   if (a == b)
      return a;

   if (bld->type.norm) {
      if (!bld->type.sign) {
         if (a == bld->zero || b == bld->zero) {
            return bld->zero;
         }
      }
      if (a == bld->one)
         return b;
      if (b == bld->one)
         return a;
   }

   return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
}


/**
 * Generate min(a, b)
 * NaNs are handled according to the behavior specified by the
 * nan_behavior argument.
 */
LLVMValueRef
lp_build_min_ext(struct lp_build_context *bld,
                 LLVMValueRef a,
                 LLVMValueRef b,
                 enum gallivm_nan_behavior nan_behavior)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if (a == bld->undef || b == bld->undef)
      return bld->undef;

   if (a == b)
      return a;

   if (bld->type.norm) {
      if (!bld->type.sign) {
         if (a == bld->zero || b == bld->zero) {
            return bld->zero;
         }
      }
      if (a == bld->one)
         return b;
      if (b == bld->one)
         return a;
   }

   return lp_build_min_simple(bld, a, b, nan_behavior);
}
/**
 * Generate max(a, b)
 * Do checks for special cases, but NaN behavior is undefined.
 */
LLVMValueRef
lp_build_max(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if (a == bld->undef || b == bld->undef)
      return bld->undef;

   if (a == b)
      return a;

   if (bld->type.norm) {
      if (a == bld->one || b == bld->one)
         return bld->one;
      if (!bld->type.sign) {
         if (a == bld->zero) {
            return b;
         }
         if (b == bld->zero) {
            return a;
         }
      }
   }

   return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
}


/**
 * Generate max(a, b)
 * Checks for special cases.
 * NaNs are handled according to the behavior specified by the
 * nan_behavior argument.
 */
LLVMValueRef
lp_build_max_ext(struct lp_build_context *bld,
                 LLVMValueRef a,
                 LLVMValueRef b,
                 enum gallivm_nan_behavior nan_behavior)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if (a == bld->undef || b == bld->undef)
      return bld->undef;

   if (a == b)
      return a;

   if (bld->type.norm) {
      if (a == bld->one || b == bld->one)
         return bld->one;
      if (!bld->type.sign) {
         if (a == bld->zero) {
            return b;
         }
         if (b == bld->zero) {
            return a;
         }
      }
   }

   return lp_build_max_simple(bld, a, b, nan_behavior);
}
/**
 * Generate clamp(a, min, max)
 * NaN behavior (for any of a, min, max) is undefined.
 * Do checks for special cases.
 */
LLVMValueRef
lp_build_clamp(struct lp_build_context *bld,
               LLVMValueRef a,
               LLVMValueRef min,
               LLVMValueRef max)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, min));
   assert(lp_check_value(bld->type, max));

   a = lp_build_min(bld, a, max);
   a = lp_build_max(bld, a, min);
   return a;
}


/**
 * Generate clamp(a, 0, 1)
 * A NaN will get converted to zero.
 */
LLVMValueRef
lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
                                LLVMValueRef a)
{
   a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
   a = lp_build_min(bld, a, bld->one);
   return a;
}
/**
 * Generate abs(a)
 */
LLVMValueRef
lp_build_abs(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);

   assert(lp_check_value(type, a));

   if (!type.sign)
      return a;

   if (type.floating) {
      if (0x0306 <= HAVE_LLVM && HAVE_LLVM < 0x0309) {
         /* Workaround llvm.org/PR27332 */
         LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
         unsigned long long absMask = ~(1ULL << (type.width - 1));
         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
         a = LLVMBuildBitCast(builder, a, int_vec_type, "");
         a = LLVMBuildAnd(builder, a, mask, "");
         a = LLVMBuildBitCast(builder, a, vec_type, "");
         return a;
      } else {
         char intrinsic[32];
         lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
         return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
      }
   }

   if (type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
      switch (type.width) {
      case 8:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
      case 16:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
      case 32:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
      }
   }
   else if (type.width*type.length == 256 && util_cpu_caps.has_avx2) {
      switch (type.width) {
      case 8:
         return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a);
      case 16:
         return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.w", vec_type, a);
      case 32:
         return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.d", vec_type, a);
      }
   }
   else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
            (gallivm_debug & GALLIVM_DEBUG_PERF) &&
            (type.width == 8 || type.width == 16 || type.width == 32)) {
      debug_printf("%s: inefficient code, should split vectors manually\n",
                   __FUNCTION__);
   }

   return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
}
LLVMValueRef
lp_build_negate(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;

   assert(lp_check_value(bld->type, a));

   if (bld->type.floating)
      a = LLVMBuildFNeg(builder, a, "");
   else
      a = LLVMBuildNeg(builder, a, "");

   return a;
}
/** Return -1, 0 or +1 depending on the sign of a */
LLVMValueRef
lp_build_sgn(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef cond;
   LLVMValueRef res;

   assert(lp_check_value(type, a));

   /* Handle non-zero case */
   if (!type.sign) {
      /* if not zero then sign must be positive */
      res = bld->one;
   }
   else if (type.floating) {
      LLVMTypeRef vec_type;
      LLVMTypeRef int_type;
      LLVMValueRef mask;
      LLVMValueRef sign;
      LLVMValueRef one;
      unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);

      int_type = lp_build_int_vec_type(bld->gallivm, type);
      vec_type = lp_build_vec_type(bld->gallivm, type);
      mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);

      /* Take the sign bit and add it to 1 constant */
      sign = LLVMBuildBitCast(builder, a, int_type, "");
      sign = LLVMBuildAnd(builder, sign, mask, "");
      one = LLVMConstBitCast(bld->one, int_type);
      res = LLVMBuildOr(builder, sign, one, "");
      res = LLVMBuildBitCast(builder, res, vec_type, "");
   }
   else
   {
      /* signed int/norm/fixed point */
      /* could use psign with sse3 and appropriate vectors here */
      LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
      cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
      res = lp_build_select(bld, cond, bld->one, minus_one);
   }

   /* Handle zero */
   cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
   res = lp_build_select(bld, cond, bld->zero, res);

   return res;
}
/**
 * Set the sign of float vector 'a' according to 'sign'.
 * If sign==0, return abs(a).
 * If sign==1, return -abs(a).
 * Other values for sign produce undefined results.
 */
LLVMValueRef
lp_build_set_sign(struct lp_build_context *bld,
                  LLVMValueRef a, LLVMValueRef sign)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
   LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
                             ~((unsigned long long) 1 << (type.width - 1)));
   LLVMValueRef val, res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   /* val = reinterpret_cast<int>(a) */
   val = LLVMBuildBitCast(builder, a, int_vec_type, "");
   /* val = val & mask */
   val = LLVMBuildAnd(builder, val, mask, "");
   /* sign = sign << shift */
   sign = LLVMBuildShl(builder, sign, shift, "");
   /* res = val | sign */
   res = LLVMBuildOr(builder, val, sign, "");
   /* res = reinterpret_cast<float>(res) */
   res = LLVMBuildBitCast(builder, res, vec_type, "");

   return res;
}
/**
 * Convert vector of (or scalar) int to vector of (or scalar) float.
 */
LLVMValueRef
lp_build_int_to_float(struct lp_build_context *bld,
                      LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);

   assert(type.floating);

   return LLVMBuildSIToFP(builder, a, vec_type, "");
}
static boolean
arch_rounding_available(const struct lp_type type)
{
   if ((util_cpu_caps.has_sse4_1 &&
       (type.length == 1 || type.width*type.length == 128)) ||
       (util_cpu_caps.has_avx && type.width*type.length == 256))
      return TRUE;
   else if ((util_cpu_caps.has_altivec &&
            (type.width == 32 && type.length == 4)))
      return TRUE;

   return FALSE;
}

enum lp_build_round_mode
{
   LP_BUILD_ROUND_NEAREST = 0,
   LP_BUILD_ROUND_FLOOR = 1,
   LP_BUILD_ROUND_CEIL = 2,
   LP_BUILD_ROUND_TRUNCATE = 3
};
static inline LLVMValueRef
lp_build_iround_nearest_sse2(struct lp_build_context *bld,
                             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
   LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
   const char *intrinsic;
   LLVMValueRef res;

   assert(type.floating);
   /* using the double precision conversions is a bit more complicated */
   assert(type.width == 32);

   assert(lp_check_value(type, a));
   assert(util_cpu_caps.has_sse2);

   /* This is relying on MXCSR rounding mode, which should always be nearest. */
   if (type.length == 1) {
      LLVMTypeRef vec_type;
      LLVMValueRef undef;
      LLVMValueRef arg;
      LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);

      vec_type = LLVMVectorType(bld->elem_type, 4);

      intrinsic = "llvm.x86.sse.cvtss2si";

      undef = LLVMGetUndef(vec_type);

      arg = LLVMBuildInsertElement(builder, undef, a, index0, "");

      res = lp_build_intrinsic_unary(builder, intrinsic,
                                     ret_type, arg);
   }
   else {
      if (type.width * type.length == 128) {
         intrinsic = "llvm.x86.sse2.cvtps2dq";
      }
      else {
         assert(type.width*type.length == 256);
         assert(util_cpu_caps.has_avx);

         intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
      }
      res = lp_build_intrinsic_unary(builder, intrinsic,
                                     ret_type, a);
   }

   return res;
}
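/*
 * Note: cvtps2dq honors the MXCSR rounding mode, which defaults to
 * round-to-nearest-even, so e.g. 0.5 converts to 0 and 1.5 converts to 2.
 */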
static inline LLVMValueRef
lp_build_round_altivec(struct lp_build_context *bld,
                       LLVMValueRef a,
                       enum lp_build_round_mode mode)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;

   assert(type.floating);

   assert(lp_check_value(type, a));
   assert(util_cpu_caps.has_altivec);

   (void)type;

   switch (mode) {
   case LP_BUILD_ROUND_NEAREST:
      intrinsic = "llvm.ppc.altivec.vrfin";
      break;
   case LP_BUILD_ROUND_FLOOR:
      intrinsic = "llvm.ppc.altivec.vrfim";
      break;
   case LP_BUILD_ROUND_CEIL:
      intrinsic = "llvm.ppc.altivec.vrfip";
      break;
   case LP_BUILD_ROUND_TRUNCATE:
      intrinsic = "llvm.ppc.altivec.vrfiz";
      break;
   }

   return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
}
static inline LLVMValueRef
lp_build_round_arch(struct lp_build_context *bld,
                    LLVMValueRef a,
                    enum lp_build_round_mode mode)
{
   if (util_cpu_caps.has_sse4_1) {
      LLVMBuilderRef builder = bld->gallivm->builder;
      const struct lp_type type = bld->type;
      const char *intrinsic_root;
      char intrinsic[32];

      assert(type.floating);
      assert(lp_check_value(type, a));
      (void)type;

      switch (mode) {
      case LP_BUILD_ROUND_NEAREST:
         intrinsic_root = "llvm.nearbyint";
         break;
      case LP_BUILD_ROUND_FLOOR:
         intrinsic_root = "llvm.floor";
         break;
      case LP_BUILD_ROUND_CEIL:
         intrinsic_root = "llvm.ceil";
         break;
      case LP_BUILD_ROUND_TRUNCATE:
         intrinsic_root = "llvm.trunc";
         break;
      }

      lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type);
      return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
   }
   else /* (util_cpu_caps.has_altivec) */
      return lp_build_round_altivec(bld, a, mode);
}
/**
 * Return the integer part of a float (vector) value (== round toward zero).
 * The returned value is a float (vector).
 * Ex: trunc(-1.5) = -1.0
 */
LLVMValueRef
lp_build_trunc(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
   }
   else {
      const struct lp_type type = bld->type;
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
      LLVMValueRef trunc, res, anosign, mask;
      LLVMTypeRef int_vec_type = bld->int_vec_type;
      LLVMTypeRef vec_type = bld->vec_type;

      assert(type.width == 32); /* might want to handle doubles at some point */

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      /* round by truncation */
      trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");

      /* mask out sign bit */
      anosign = lp_build_abs(bld, a);
      /*
       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is no-op for them
       * because such floats are always exact) as well as special cases like
       * NaNs, Infs (taking advantage of the fact they use max exponent).
       * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
       */
      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
      return lp_build_select(bld, mask, a, res);
   }
}
/**
 * Return float (vector) rounded to nearest integer (vector).  The returned
 * value is a float (vector).
 * Ex: round(0.9) = 1.0
 * Ex: round(-1.5) = -2.0
 */
LLVMValueRef
lp_build_round(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
   }
   else {
      const struct lp_type type = bld->type;
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
      LLVMValueRef res, anosign, mask;
      LLVMTypeRef int_vec_type = bld->int_vec_type;
      LLVMTypeRef vec_type = bld->vec_type;

      assert(type.width == 32); /* might want to handle doubles at some point */

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      res = lp_build_iround(bld, a);
      res = LLVMBuildSIToFP(builder, res, vec_type, "");

      /* mask out sign bit */
      anosign = lp_build_abs(bld, a);
      /*
       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is no-op for them
       * because such floats are always exact) as well as special cases like
       * NaNs, Infs (taking advantage of the fact they use max exponent).
       * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
       */
      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
      return lp_build_select(bld, mask, a, res);
   }
}

/**
 * Return floor of float (vector), result is a float (vector)
 * Ex: floor(1.1) = 1.0
 * Ex: floor(-1.1) = -2.0
 */
LLVMValueRef
lp_build_floor(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
   }
   else {
      const struct lp_type type = bld->type;
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
      LLVMValueRef trunc, res, anosign, mask;
      LLVMTypeRef int_vec_type = bld->int_vec_type;
      LLVMTypeRef vec_type = bld->vec_type;

      if (type.width != 32) {
         char intrinsic[32];
         lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type);
         return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
      }

      assert(type.width == 32); /* might want to handle doubles at some point */

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      /* round by truncation */
      trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");

      if (type.sign) {
         LLVMValueRef tmp;

         /*
          * fix values if rounding is wrong (for non-special cases)
          * - this is the case if trunc > a
          */
         mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
         /* tmp = trunc > a ? 1.0 : 0.0 */
         tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
         tmp = lp_build_and(&intbld, mask, tmp);
         tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
         res = lp_build_sub(bld, res, tmp);
      }

      /* mask out sign bit */
      anosign = lp_build_abs(bld, a);
      /*
       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is no-op for them
       * because such floats are always exact) as well as special cases like
       * NaNs, Infs (taking advantage of the fact they use max exponent).
       * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
       */
      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
      return lp_build_select(bld, mask, a, res);
   }
}

/**
 * Return ceiling of float (vector), returning float (vector).
 * Ex: ceil( 1.1) = 2.0
 * Ex: ceil(-1.1) = -1.0
 */
LLVMValueRef
lp_build_ceil(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
   }
   else {
      const struct lp_type type = bld->type;
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
      LLVMValueRef trunc, res, anosign, mask, tmp;
      LLVMTypeRef int_vec_type = bld->int_vec_type;
      LLVMTypeRef vec_type = bld->vec_type;

      if (type.width != 32) {
         char intrinsic[32];
         lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type);
         return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
      }

      assert(type.width == 32); /* might want to handle doubles at some point */

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      /* round by truncation */
      trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");

      /*
       * fix values if rounding is wrong (for non-special cases)
       * - this is the case if trunc < a
       */
      mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
      /* tmp = trunc < a ? 1.0 : 0.0 */
      tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
      tmp = lp_build_and(&intbld, mask, tmp);
      tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
      res = lp_build_add(bld, trunc, tmp);

      /* mask out sign bit */
      anosign = lp_build_abs(bld, a);
      /*
       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is no-op for them
       * because such floats are always exact) as well as special cases like
       * NaNs, Infs (taking advantage of the fact they use max exponent).
       * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
       */
      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
      return lp_build_select(bld, mask, a, res);
   }
}

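/*
 * Worked example of the truncate-and-fix ceil above:
 *
 *    a =  1.1:  trunc =  1.0,  trunc <  a  ->  res =  1.0 + 1.0 =  2.0
 *    a = -1.1:  trunc = -1.0,  trunc >= a  ->  res = -1.0
 *
 * i.e. the +1.0 correction only fires in lanes where truncation
 * rounded down.
 */
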
/**
 * Return fractional part of 'a' computed as a - floor(a).
 * Typically used in texture coord arithmetic.
 */
LLVMValueRef
lp_build_fract(struct lp_build_context *bld,
               LLVMValueRef a)
{
   assert(bld->type.floating);
   return lp_build_sub(bld, a, lp_build_floor(bld, a));
}

/**
 * Prevent returning 1.0 for very small negative values of 'a' by clamping
 * against 0.99999(9). (Will also return that value for NaNs.)
 */
static inline LLVMValueRef
clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
{
   LLVMValueRef max;

   /* this is the largest number smaller than 1.0 representable as float */
   max = lp_build_const_vec(bld->gallivm, bld->type,
                            1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
   return lp_build_min_ext(bld, fract, max,
                           GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
}

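/*
 * For 32-bit floats lp_mantissa() is 23, so the clamp constant works out
 * to 1.0 - 2^-24 = 0.99999994 (bits 0x3f7fffff), the largest float that
 * is strictly smaller than 1.0.
 */
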
/**
 * Same as lp_build_fract, but guarantees that the result is always smaller
 * than one. Will also return the smaller-than-one value for infs, NaNs.
 */
LLVMValueRef
lp_build_fract_safe(struct lp_build_context *bld,
                    LLVMValueRef a)
{
   return clamp_fract(bld, lp_build_fract(bld, a));
}

/**
 * Return the integer part of a float (vector) value (== round toward zero).
 * The returned value is an integer (vector).
 * Ex: itrunc(-1.5) = -1
 */
LLVMValueRef
lp_build_itrunc(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);

   assert(type.floating);
   assert(lp_check_value(type, a));

   return LLVMBuildFPToSI(builder, a, int_vec_type, "");
}

/**
 * Return float (vector) rounded to nearest integer (vector).  The returned
 * value is an integer (vector).
 * Ex: iround(0.9) = 1
 * Ex: iround(-1.5) = -2
 */
LLVMValueRef
lp_build_iround(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);

   assert(lp_check_value(type, a));

   if ((util_cpu_caps.has_sse2 &&
       ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
       (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
      return lp_build_iround_nearest_sse2(bld, a);
   }
   if (arch_rounding_available(type)) {
      res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
   }
   else {
      LLVMValueRef half;

      half = lp_build_const_vec(bld->gallivm, type, 0.5);

      if (type.sign) {
         LLVMTypeRef vec_type = bld->vec_type;
         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
                                    (unsigned long long)1 << (type.width - 1));
         LLVMValueRef sign;

         /* get sign bit */
         sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
         sign = LLVMBuildAnd(builder, sign, mask, "");

         /* sign * 0.5 */
         half = LLVMBuildBitCast(builder, half, int_vec_type, "");
         half = LLVMBuildOr(builder, sign, half, "");
         half = LLVMBuildBitCast(builder, half, vec_type, "");
      }

      res = LLVMBuildFAdd(builder, a, half, "");
   }

   res = LLVMBuildFPToSI(builder, res, int_vec_type, "");

   return res;
}

/**
 * Return floor of float (vector), result is an int (vector).
 * Ex: ifloor(1.1) = 1
 * Ex: ifloor(-1.1) = -2
 */
LLVMValueRef
lp_build_ifloor(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   res = a;
   if (type.sign) {
      if (arch_rounding_available(type)) {
         res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
      }
      else {
         struct lp_type inttype;
         struct lp_build_context intbld;
         LLVMValueRef trunc, itrunc, mask;

         assert(type.floating);
         assert(lp_check_value(type, a));

         inttype = type;
         inttype.floating = 0;
         lp_build_context_init(&intbld, bld->gallivm, inttype);

         /* round by truncation */
         itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
         trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");

         /*
          * fix values if rounding is wrong (for non-special cases)
          * - this is the case if trunc > a
          * The results of doing this with NaNs, very large values etc.
          * are undefined but this seems to be the case anyway.
          */
         mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
         /* cheapie minus one with mask since the mask is minus one / zero */
         return lp_build_add(&intbld, itrunc, mask);
      }
   }

   /* round to nearest (toward zero) */
   res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");

   return res;
}

/**
 * Return ceiling of float (vector), returning int (vector).
 * Ex: iceil( 1.1) = 2
 * Ex: iceil(-1.1) = -1
 */
LLVMValueRef
lp_build_iceil(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
   }
   else {
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef trunc, itrunc, mask;

      assert(type.floating);
      assert(lp_check_value(type, a));

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      /* round by truncation */
      itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");

      /*
       * fix values if rounding is wrong (for non-special cases)
       * - this is the case if trunc < a
       * The results of doing this with NaNs, very large values etc.
       * are undefined but this seems to be the case anyway.
       */
      mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
      /* cheapie plus one with mask since the mask is minus one / zero */
      return lp_build_sub(&intbld, itrunc, mask);
   }

   /* round to nearest (toward zero) */
   res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");

   return res;
}

/**
 * Combined ifloor() & fract().
 *
 * Preferred to calling the functions separately, as it will ensure that the
 * strategy (floor() vs ifloor()) that results in less redundant work is used.
 */
void
lp_build_ifloor_fract(struct lp_build_context *bld,
                      LLVMValueRef a,
                      LLVMValueRef *out_ipart,
                      LLVMValueRef *out_fpart)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef ipart;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      /*
       * floor() is easier.
       */

      ipart = lp_build_floor(bld, a);
      *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
      *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
   }
   else {
      /*
       * ifloor() is easier.
       */

      *out_ipart = lp_build_ifloor(bld, a);
      ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
      *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
   }
}

/**
 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
 * always smaller than one.
 */
void
lp_build_ifloor_fract_safe(struct lp_build_context *bld,
                           LLVMValueRef a,
                           LLVMValueRef *out_ipart,
                           LLVMValueRef *out_fpart)
{
   lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
   *out_fpart = clamp_fract(bld, *out_fpart);
}

LLVMValueRef
lp_build_sqrt(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   char intrinsic[32];

   assert(lp_check_value(type, a));

   assert(type.floating);
   lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);

   return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
}

/**
 * Do one Newton-Raphson step to improve reciprocal precision:
 *
 *   x_{i+1} = x_i * (2 - a * x_i)
 *
 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
 * +/-Inf, giving NaN instead.  Certain applications rely on this behavior,
 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
 * halo. It would be necessary to clamp the argument to prevent this.
 *
 * See also:
 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
 */
static inline LLVMValueRef
lp_build_rcp_refine(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef rcp_a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
   LLVMValueRef res;

   res = LLVMBuildFMul(builder, a, rcp_a, "");
   res = LLVMBuildFSub(builder, two, res, "");
   res = LLVMBuildFMul(builder, rcp_a, res, "");

   return res;
}

LLVMValueRef
lp_build_rcp(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   if (a == bld->zero)
      return bld->undef;
   if (a == bld->one)
      return bld->one;
   if (a == bld->undef)
      return bld->undef;

   assert(type.floating);

   if (LLVMIsConstant(a))
      return LLVMConstFDiv(bld->one, a);

   /*
    * We don't use RCPPS because:
    * - it only has 10 bits of precision
    * - it doesn't even get the reciprocal of 1.0 exactly
    * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
    * - for recent processors the benefit over DIVPS is marginal, and case
    *   dependent
    *
    * We could still use it on certain processors if benchmarks show that the
    * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
    * particular uses that require fewer workarounds.
    */

   if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
                 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))) {
      const unsigned num_iterations = 0;
      LLVMValueRef res;
      unsigned i;
      const char *intrinsic = NULL;

      if (type.length == 4) {
         intrinsic = "llvm.x86.sse.rcp.ps";
      }
      else {
         intrinsic = "llvm.x86.avx.rcp.ps.256";
      }

      res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);

      for (i = 0; i < num_iterations; ++i) {
         res = lp_build_rcp_refine(bld, a, res);
      }

      return res;
   }

   return LLVMBuildFDiv(builder, bld->one, a, "");
}

/**
 * Do one Newton-Raphson step to improve rsqrt precision:
 *
 *   x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
 *
 * See also Intel 64 and IA-32 Architectures Optimization Manual.
 */
static inline LLVMValueRef
lp_build_rsqrt_refine(struct lp_build_context *bld,
                      LLVMValueRef a,
                      LLVMValueRef rsqrt_a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
   LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
   LLVMValueRef res;

   res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
   res = LLVMBuildFMul(builder, a, res, "");
   res = LLVMBuildFSub(builder, three, res, "");
   res = LLVMBuildFMul(builder, rsqrt_a, res, "");
   res = LLVMBuildFMul(builder, half, res, "");

   return res;
}

/**
 * Generate 1/sqrt(a).
 * Result is undefined for values < 0, infinity for +0.
 */
LLVMValueRef
lp_build_rsqrt(struct lp_build_context *bld,
               LLVMValueRef a)
{
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   assert(type.floating);

   /*
    * This should be faster but all denormals will end up as infinity.
    */
   if (0 && lp_build_fast_rsqrt_available(type)) {
      const unsigned num_iterations = 1;
      LLVMValueRef res;
      unsigned i;

      /* rsqrt(1.0) != 1.0 here */
      res = lp_build_fast_rsqrt(bld, a);

      if (num_iterations) {
         /*
          * Newton-Raphson will result in NaN instead of infinity for zero,
          * and NaN instead of zero for infinity.
          * Also, need to ensure rsqrt(1.0) == 1.0.
          * All numbers smaller than FLT_MIN will result in +infinity
          * (rsqrtps treats all denormals as zero).
          */
         LLVMValueRef cmp;
         LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
         LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);

         for (i = 0; i < num_iterations; ++i) {
            res = lp_build_rsqrt_refine(bld, a, res);
         }
         cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
         res = lp_build_select(bld, cmp, inf, res);
         cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
         res = lp_build_select(bld, cmp, bld->zero, res);
         cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
         res = lp_build_select(bld, cmp, bld->one, res);
      }

      return res;
   }

   return lp_build_rcp(bld, lp_build_sqrt(bld, a));
}

/**
 * If there's a fast (inaccurate) rsqrt instruction available
 * (caller may want to avoid calling rsqrt_fast if it's not available,
 * i.e. for calculating x^0.5 it may do rsqrt_fast(x) * x, but if
 * unavailable that would result in sqrt/div/mul, so it is obviously
 * much better to just call sqrt, skipping both div and mul).
 */
boolean
lp_build_fast_rsqrt_available(struct lp_type type)
{
   assert(type.floating);

   if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
       (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
      return TRUE;
   }
   return FALSE;
}

/**
 * Generate 1/sqrt(a).
 * Result is undefined for values < 0, infinity for +0.
 * Precision is limited, only ~10 bits guaranteed
 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
 */
LLVMValueRef
lp_build_fast_rsqrt(struct lp_build_context *bld,
                    LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   if (lp_build_fast_rsqrt_available(type)) {
      const char *intrinsic = NULL;

      if (type.length == 4) {
         intrinsic = "llvm.x86.sse.rsqrt.ps";
      }
      else {
         intrinsic = "llvm.x86.avx.rsqrt.ps.256";
      }
      return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
   }
   else {
      debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
   }
   return lp_build_rcp(bld, lp_build_sqrt(bld, a));
}

/**
 * Generate sin(a) or cos(a) using polynomial approximation.
 * TODO: it might be worth recognizing sin and cos using the same source
 * (i.e. the d3d10 sincos opcode). Obviously doing both at the same time
 * would be way cheaper than calculating (nearly) everything twice...
 * Not sure it's common enough to be worth bothering however; the scs
 * opcode could also benefit from calculating both though.
 */
static LLVMValueRef
lp_build_sin_or_cos(struct lp_build_context *bld,
                    LLVMValueRef a,
                    boolean cos)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef b = gallivm->builder;
   struct lp_type int_type = lp_int_type(bld->type);

   /*
    *  take the absolute value,
    *  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
    */

   LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");

   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");

   /*
    * scale by 4/Pi
    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
    */

   LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");

   /*
    * store the integer part of y in mm0
    * emm2 = _mm_cvttps_epi32(y);
    */

   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");

   /*
    * j=(j+1) & (~1) (see the cephes sources)
    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
    */

   LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
   LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");

   /*
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
    */
   LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
   LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");

   /*
    * y = _mm_cvtepi32_ps(emm2);
    */
   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");

   LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
   LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
   LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
   LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);

   /*
    * Argument used for poly selection and sign bit determination
    * is different for sin vs. cos.
    */
   LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
                               emm2_and;

   LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
                                                              LLVMBuildNot(b, emm2_2, ""), ""),
                                              const_29, "sign_bit") :
                                 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
                                                              LLVMBuildShl(b, emm2_add,
                                                                           const_29, ""), ""),
                                              sign_mask, "sign_bit");

   /*
    * get the polynomial selection mask
    * there is one polynomial for 0 <= x <= Pi/4
    * and another one for Pi/4 < x <= Pi/2
    * Both branches will be computed.
    *
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
    */

   LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
   LLVMValueRef poly_mask = lp_build_compare(gallivm,
                                             int_type, PIPE_FUNC_EQUAL,
                                             emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));

   /*
    * _PS_CONST(minus_cephes_DP1, -0.78515625);
    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
    */
   LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
   LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
   LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);

   /*
    * The magic pass: "Extended precision modular arithmetic"
    * x = ((x - y * DP1) - y * DP2) - y * DP3;
    */
   LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
   LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
   LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);

   /*
    * Evaluate the first polynomial  (0 <= x <= Pi/4)
    *
    * z = _mm_mul_ps(x,x);
    */
   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");

   /*
    * _PS_CONST(coscof_p0,  2.443315711809948E-005);
    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
    * _PS_CONST(coscof_p2,  4.166664568298827E-002);
    */
   LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
   LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
   LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);

   /*
    * y = *(v4sf*)_ps_coscof_p0;
    * y = _mm_mul_ps(y, z);
    */
   LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
   LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");

   /*
    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
    * y = _mm_sub_ps(y, tmp);
    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
    */
   LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_9");
   LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_10");

   /*
    * _PS_CONST(sincof_p0, -1.9515295891E-4);
    * _PS_CONST(sincof_p1,  8.3321608736E-3);
    * _PS_CONST(sincof_p2, -1.6666654611E-1);
    */
   LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
   LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
   LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);

   /*
    * Evaluate the second polynomial  (Pi/4 <= x <= 0)
    *
    * y2 = *(v4sf*)_ps_sincof_p0;
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_mul_ps(y2, x);
    * y2 = _mm_add_ps(y2, x);
    */

   LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
   LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
   LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);

   /*
    * select the correct result from the two polynomials
    * xmm3 = poly_mask;
    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
    * y = _mm_andnot_ps(xmm3, y);
    * y = _mm_or_ps(y,y2);
    */
   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
   LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
   LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");

   /*
    * update the sign
    * y = _mm_xor_ps(y, sign_bit);
    */
   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");

   LLVMValueRef isfinite = lp_build_isfinite(bld, a);

   /* clamp output to be within [-1, 1] */
   y_result = lp_build_clamp(bld, y_result,
                             lp_build_const_vec(bld->gallivm, bld->type, -1.f),
                             lp_build_const_vec(bld->gallivm, bld->type, 1.f));
   /* If a is -inf, inf or NaN then return NaN */
   y_result = lp_build_select(bld, isfinite, y_result,
                              lp_build_const_vec(bld->gallivm, bld->type, NAN));

   return y_result;
}

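/*
 * Worked example of the range reduction (illustrative, sin path,
 * a = 2.0):
 *
 *    scale_y = 2.0 * 4/Pi ~= 2.546,  j = (2 + 1) & ~1 = 2
 *    x_3 ~= 2.0 - 2 * Pi/4 ~= 0.4292
 *    emm2_3 = 2 & 2 != 0  ->  poly_mask selects the cosine polynomial
 *    sign: a positive, (3 << 29) & 0x80000000 = 0  ->  no sign flip
 *
 * giving cos(0.4292) ~= 0.9093 == sin(2.0), as expected.
 */
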
/**
 * Generate sin(a)
 */
LLVMValueRef
lp_build_sin(struct lp_build_context *bld,
             LLVMValueRef a)
{
   return lp_build_sin_or_cos(bld, a, FALSE);
}


/**
 * Generate cos(a)
 */
LLVMValueRef
lp_build_cos(struct lp_build_context *bld,
             LLVMValueRef a)
{
   return lp_build_sin_or_cos(bld, a, TRUE);
}

/**
 * Generate pow(x, y)
 */
LLVMValueRef
lp_build_pow(struct lp_build_context *bld,
             LLVMValueRef x,
             LLVMValueRef y)
{
   /* TODO: optimize the constant case */
   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
       LLVMIsConstant(x) && LLVMIsConstant(y)) {
      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                   __FUNCTION__);
   }

   return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
}

/**
 * Generate exp(x)
 */
LLVMValueRef
lp_build_exp(struct lp_build_context *bld,
             LLVMValueRef x)
{
   /* log2(e) = 1/log(2) */
   LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
                                           1.4426950408889634);

   assert(lp_check_value(bld->type, x));

   return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
}

/**
 * Generate log(x)
 * Behavior is undefined with infs, 0s and nans
 */
LLVMValueRef
lp_build_log(struct lp_build_context *bld,
             LLVMValueRef x)
{
   /* log(2) */
   LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
                                          0.69314718055994529);

   assert(lp_check_value(bld->type, x));

   return lp_build_mul(bld, log2, lp_build_log2(bld, x));
}

/**
 * Generate log(x) that handles edge cases (infs, 0s and nans)
 */
LLVMValueRef
lp_build_log_safe(struct lp_build_context *bld,
                  LLVMValueRef x)
{
   /* log(2) */
   LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
                                          0.69314718055994529);

   assert(lp_check_value(bld->type, x));

   return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
}

/**
 * Generate polynomial.
 * Ex:  coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
 */
LLVMValueRef
lp_build_polynomial(struct lp_build_context *bld,
                    LLVMValueRef x,
                    const double *coeffs,
                    unsigned num_coeffs)
{
   const struct lp_type type = bld->type;
   LLVMValueRef even = NULL, odd = NULL;
   LLVMValueRef x2;
   unsigned i;

   assert(lp_check_value(bld->type, x));

   /* TODO: optimize the constant case */
   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
       LLVMIsConstant(x)) {
      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                   __FUNCTION__);
   }

   /*
    * Calculate odd and even terms separately to decrease data dependency
    * Ex:
    *     c[0] + x^2 * c[2] + x^4 * c[4] ...
    *     + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
    */
   x2 = lp_build_mul(bld, x, x);

   for (i = num_coeffs; i--; ) {
      LLVMValueRef coeff;

      coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);

      if (i % 2 == 0) {
         if (even)
            even = lp_build_mad(bld, x2, even, coeff);
         else
            even = coeff;
      }
      else {
         if (odd)
            odd = lp_build_mad(bld, x2, odd, coeff);
         else
            odd = coeff;
      }
   }

   if (odd)
      return lp_build_mad(bld, odd, x, even);
   else if (even)
      return even;
   else
      return bld->undef;
}

/**
 * Minimax polynomial fit of 2**x, in range [0, 1[
 */
const double lp_build_exp2_polynomial[] = {
#if EXP_POLY_DEGREE == 5
   1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
   0.693153073200168932794,
   0.240153617044375388211,
   0.0558263180532956664775,
   0.00898934009049466391101,
   0.00187757667519147912699
#elif EXP_POLY_DEGREE == 4
   1.00000259337069434683,
   0.693003834469974940458,
   0.24144275689150793076,
   0.0520114606103070150235,
   0.0135341679161270268764
#elif EXP_POLY_DEGREE == 3
   0.999925218562710312959,
   0.695833540494823811697,
   0.226067155427249155588,
   0.0780245226406372992967
#elif EXP_POLY_DEGREE == 2
   1.00172476321474503578,
   0.657636275736077639316,
   0.33718943461968720704
#else
#error
#endif
};

LLVMValueRef
lp_build_exp2(struct lp_build_context *bld,
              LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMValueRef ipart = NULL;
   LLVMValueRef fpart = NULL;
   LLVMValueRef expipart = NULL;
   LLVMValueRef expfpart = NULL;
   LLVMValueRef res = NULL;

   assert(lp_check_value(bld->type, x));

   /* TODO: optimize the constant case */
   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
       LLVMIsConstant(x)) {
      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                   __FUNCTION__);
   }

   assert(type.floating && type.width == 32);

   /* We want to preserve NaN and make sure that for exp2 if x > 128,
    * the result is INF and if it's smaller than -126.9 the result is 0 */
   x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
                        GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
   x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
                        x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);

   /* ipart = floor(x) */
   /* fpart = x - ipart */
   lp_build_ifloor_fract(bld, x, &ipart, &fpart);

   /* expipart = (float) (1 << ipart) */
   expipart = LLVMBuildAdd(builder, ipart,
                           lp_build_const_int_vec(bld->gallivm, type, 127), "");
   expipart = LLVMBuildShl(builder, expipart,
                           lp_build_const_int_vec(bld->gallivm, type, 23), "");
   expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");

   expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
                                  ARRAY_SIZE(lp_build_exp2_polynomial));

   res = LLVMBuildFMul(builder, expipart, expfpart, "");

   return res;
}

/**
 * Extract the exponent of an IEEE-754 floating point value.
 *
 * Optionally apply an integer bias.
 *
 * Result is an integer value with
 *
 *   ifloor(log2(x)) + bias
 */
LLVMValueRef
lp_build_extract_exponent(struct lp_build_context *bld,
                          LLVMValueRef x,
                          int bias)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   unsigned mantissa = lp_mantissa(type);
   LLVMValueRef res;

   assert(type.floating);

   assert(lp_check_value(bld->type, x));

   x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");

   res = LLVMBuildLShr(builder, x,
                       lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
   res = LLVMBuildAnd(builder, res,
                      lp_build_const_int_vec(bld->gallivm, type, 255), "");
   res = LLVMBuildSub(builder, res,
                      lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");

   return res;
}

/**
 * Extract the mantissa of a floating point value.
 *
 * Result is a floating point value with
 *
 *   x / 2**floor(log2(x))
 */
LLVMValueRef
lp_build_extract_mantissa(struct lp_build_context *bld,
                          LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   unsigned mantissa = lp_mantissa(type);
   LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
                                                  (1ULL << mantissa) - 1);
   LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
   LLVMValueRef res;

   assert(lp_check_value(bld->type, x));

   assert(type.floating);

   x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");

   /* res = x / 2**ipart */
   res = LLVMBuildAnd(builder, x, mantmask, "");
   res = LLVMBuildOr(builder, res, one, "");
   res = LLVMBuildBitCast(builder, res, bld->vec_type, "");

   return res;
}

/**
 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x),
 * for x in range [0, 1/9[
 * These coefficients can be generated with
 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
 */
const double lp_build_log2_polynomial[] = {
#if LOG_POLY_DEGREE == 5
   2.88539008148777786488L,
   0.961796878841293367824L,
   0.577058946784739859012L,
   0.412914355135828735411L,
   0.308591899232910175289L,
   0.352376952300281371868L,
#elif LOG_POLY_DEGREE == 4
   2.88539009343309178325L,
   0.961791550404184197881L,
   0.577440339438736392009L,
   0.403343858251329912514L,
   0.406718052498846252698L,
#elif LOG_POLY_DEGREE == 3
   2.88538959748872753838L,
   0.961932915889597772928L,
   0.571118517972136195241L,
   0.493997535084709500285L,
#else
#error
#endif
};

/**
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 * http://en.wikipedia.org/wiki/Logarithm#Calculation
 * http://www.nezumi.demon.co.uk/consult/logx.htm
 *
 * If handle_edge_cases is true the function will perform computations
 * to match the required D3D10+ behavior for each of the edge cases.
 * That means that if input is:
 * - less than zero (to and including -inf) then NaN will be returned
 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
 * - +infinity, then +infinity will be returned
 * - NaN, then NaN will be returned
 *
 * Those checks are fairly expensive so if you don't need them make sure
 * handle_edge_cases is false.
 */
void
lp_build_log2_approx(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef *p_exp,
                     LLVMValueRef *p_floor_log2,
                     LLVMValueRef *p_log2,
                     boolean handle_edge_cases)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);

   LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
   LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
   LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);

   LLVMValueRef i = NULL;
   LLVMValueRef y = NULL;
   LLVMValueRef z = NULL;
   LLVMValueRef exp = NULL;
   LLVMValueRef mant = NULL;
   LLVMValueRef logexp = NULL;
   LLVMValueRef p_z = NULL;
   LLVMValueRef res = NULL;

   assert(lp_check_value(bld->type, x));

   if (p_exp || p_floor_log2 || p_log2) {
      /* TODO: optimize the constant case */
      if (gallivm_debug & GALLIVM_DEBUG_PERF &&
          LLVMIsConstant(x)) {
         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                      __FUNCTION__);
      }

      assert(type.floating && type.width == 32);

      /*
       * We don't explicitly handle denormalized numbers. They will yield a
       * result in the neighbourhood of -127, which appears to be adequate
       * enough.
       */

      i = LLVMBuildBitCast(builder, x, int_vec_type, "");

      /* exp = (float) exponent(x) */
      exp = LLVMBuildAnd(builder, i, expmask, "");
   }

   if (p_floor_log2 || p_log2) {
      logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
      logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
      logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
   }

   if (p_log2) {
      /* mant = 1 + (float) mantissa(x) */
      mant = LLVMBuildAnd(builder, i, mantmask, "");
      mant = LLVMBuildOr(builder, mant, one, "");
      mant = LLVMBuildBitCast(builder, mant, vec_type, "");

      /* y = (mant - 1) / (mant + 1) */
      y = lp_build_div(bld,
                       lp_build_sub(bld, mant, bld->one),
                       lp_build_add(bld, mant, bld->one));

      /* z = y^2 */
      z = lp_build_mul(bld, y, y);

      /* compute P(z) */
      p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
                                ARRAY_SIZE(lp_build_log2_polynomial));

      /* y * P(z) + logexp */
      res = lp_build_mad(bld, y, p_z, logexp);

      if (type.floating && handle_edge_cases) {
         LLVMValueRef negmask, infmask, zmask;

         negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
                                lp_build_const_vec(bld->gallivm, type, 0.0f));
         zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
                              lp_build_const_vec(bld->gallivm, type, 0.0f));
         infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
                                lp_build_const_vec(bld->gallivm, type, INFINITY));

         /* If x is equal to inf make sure we return inf */
         res = lp_build_select(bld, infmask,
                               lp_build_const_vec(bld->gallivm, type, INFINITY),
                               res);
         /* If x is equal to 0, return -inf */
         res = lp_build_select(bld, zmask,
                               lp_build_const_vec(bld->gallivm, type, -INFINITY),
                               res);
         /* If x is nan or less than 0, return nan */
         res = lp_build_select(bld, negmask,
                               lp_build_const_vec(bld->gallivm, type, NAN),
                               res);
      }
   }

   if (p_exp) {
      exp = LLVMBuildBitCast(builder, exp, vec_type, "");
      *p_exp = exp;
   }

   if (p_floor_log2)
      *p_floor_log2 = logexp;

   if (p_log2)
      *p_log2 = res;
}

/*
 * log2 implementation which doesn't have special code to
 * handle edge cases (-inf, 0, inf, NaN). It's faster but
 * the results for those cases are undefined.
 */
LLVMValueRef
lp_build_log2(struct lp_build_context *bld,
              LLVMValueRef x)
{
   LLVMValueRef res;
   lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
   return res;
}


/*
 * Version of log2 which handles all edge cases.
 * Look at documentation of lp_build_log2_approx for
 * description of the behavior for each of the edge cases.
 */
LLVMValueRef
lp_build_log2_safe(struct lp_build_context *bld,
                   LLVMValueRef x)
{
   LLVMValueRef res;
   lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
   return res;
}

/**
 * Faster (and less accurate) log2.
 *
 *    log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
 *
 * Piece-wise linear approximation, with exact results when x is a power of
 * two.
 *
 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
 */
LLVMValueRef
lp_build_fast_log2(struct lp_build_context *bld,
                   LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef ipart;
   LLVMValueRef fpart;

   assert(lp_check_value(bld->type, x));

   assert(bld->type.floating);

   /* ipart = floor(log2(x)) - 1 */
   ipart = lp_build_extract_exponent(bld, x, -1);
   ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");

   /* fpart = x / 2**ipart */
   fpart = lp_build_extract_mantissa(bld, x);

   /* ipart + fpart */
   return LLVMBuildFAdd(builder, ipart, fpart, "");
}

/**
 * Fast implementation of iround(log2(x)).
 *
 * Not an approximation -- it should give accurate results all the time.
 */
LLVMValueRef
lp_build_ilog2(struct lp_build_context *bld,
               LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
   LLVMValueRef ipart;

   assert(bld->type.floating);

   assert(lp_check_value(bld->type, x));

   /* x * 2^(0.5)   i.e., add 0.5 to the log2(x) */
   x = LLVMBuildFMul(builder, x, sqrt2, "");

   /* ipart = floor(log2(x) + 0.5)  */
   ipart = lp_build_extract_exponent(bld, x, 0);

   return ipart;
}

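/*
 * The sqrt(2) pre-multiply turns the floor of the exponent extraction into
 * a round, since it moves the exponent boundaries from 2^k to 2^(k - 0.5).
 * E.g. for x = 5.0: 5.0 * 1.4142 ~= 7.07, whose exponent is
 * 2 == round(log2(5)) (log2(5) ~= 2.32).
 */
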
LLVMValueRef
lp_build_mod(struct lp_build_context *bld,
             LLVMValueRef x,
             LLVMValueRef y)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef res;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, x));
   assert(lp_check_value(type, y));

   if (type.floating)
      res = LLVMBuildFRem(builder, x, y, "");
   else if (type.sign)
      res = LLVMBuildSRem(builder, x, y, "");
   else
      res = LLVMBuildURem(builder, x, y, "");
   return res;
}

/**
 * For floating inputs it creates and returns a mask
 * which is all 1's for channels which are NaN.
 * Channels inside x which are not NaN will be 0.
 */
LLVMValueRef
lp_build_isnan(struct lp_build_context *bld,
               LLVMValueRef x)
{
   LLVMValueRef mask;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);

   assert(bld->type.floating);
   assert(lp_check_value(bld->type, x));

   mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
                        "isnotnan");
   mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
   mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
   return mask;
}

/* Returns all 1's for floating point numbers that are
 * finite numbers and returns all zeros for -inf,
 * inf and nan's */
LLVMValueRef
lp_build_isfinite(struct lp_build_context *bld,
                  LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
   struct lp_type int_type = lp_int_type(bld->type);
   LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
   LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
                                                    0x7f800000);

   if (!bld->type.floating) {
      return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
   }
   assert(bld->type.floating);
   assert(lp_check_value(bld->type, x));
   assert(bld->type.width == 32);

   intx = LLVMBuildAnd(builder, intx, infornan32, "");
   return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
                           intx, infornan32);
}

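/*
 * The test keys off the exponent field: a 32-bit float is non-finite
 * exactly when all exponent bits are set, e.g.:
 *
 *    bits(1.0) & 0x7f800000 = 0x3f800000 != 0x7f800000  -> finite
 *    bits(Inf) & 0x7f800000 = 0x7f800000                -> not finite
 *    bits(NaN) & 0x7f800000 = 0x7f800000                -> not finite
 */
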
/*
 * Returns true if the number is nan or inf and false otherwise.
 * The input has to be a floating point vector.
 */
LLVMValueRef
lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
                       const struct lp_type type,
                       LLVMValueRef x)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type int_type = lp_int_type(type);
   LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
                                                0x7f800000);
   LLVMValueRef ret;

   assert(type.floating);

   ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
   ret = LLVMBuildAnd(builder, ret, const0, "");
   ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
                          ret, const0);

   return ret;
}

LLVMValueRef
lp_build_fpstate_get(struct gallivm_state *gallivm)
{
   if (util_cpu_caps.has_sse) {
      LLVMBuilderRef builder = gallivm->builder;
      LLVMValueRef mxcsr_ptr = lp_build_alloca(
         gallivm,
         LLVMInt32TypeInContext(gallivm->context),
         "mxcsr_ptr");
      LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
          LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
      lp_build_intrinsic(builder,
                         "llvm.x86.sse.stmxcsr",
                         LLVMVoidTypeInContext(gallivm->context),
                         &mxcsr_ptr8, 1, 0);
      return mxcsr_ptr;
   }
   return 0;
}

void
lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
                                  boolean zero)
{
   if (util_cpu_caps.has_sse) {
      /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
      int daz_ftz = _MM_FLUSH_ZERO_MASK;

      LLVMBuilderRef builder = gallivm->builder;
      LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
      LLVMValueRef mxcsr =
         LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");

      if (util_cpu_caps.has_daz) {
         /* Enable denormals-are-zero mode */
         daz_ftz |= _MM_DENORMALS_ZERO_MASK;
      }
      if (zero) {
         mxcsr = LLVMBuildOr(builder, mxcsr,
                             LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
      }
      else {
         mxcsr = LLVMBuildAnd(builder, mxcsr,
                              LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
      }

      LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
      lp_build_fpstate_set(gallivm, mxcsr_ptr);
   }
}

void
lp_build_fpstate_set(struct gallivm_state *gallivm,
                     LLVMValueRef mxcsr_ptr)
{
   if (util_cpu_caps.has_sse) {
      LLVMBuilderRef builder = gallivm->builder;
      mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
                     LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
      lp_build_intrinsic(builder,
                         "llvm.x86.sse.ldmxcsr",
                         LLVMVoidTypeInContext(gallivm->context),
                         &mxcsr_ptr, 1, 0);
   }
}