src/gallium/auxiliary/gallivm/lp_bld_arit.c

   1 /**************************************************************************
   2  *
   3  * Copyright 2009-2010 VMware, Inc.
   4  * All Rights Reserved.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the
   8  * "Software"), to deal in the Software without restriction, including
   9  * without limitation the rights to use, copy, modify, merge, publish,
  10  * distribute, sub license, and/or sell copies of the Software, and to
  11  * permit persons to whom the Software is furnished to do so, subject to
  12  * the following conditions:
  13  *
  14  * The above copyright notice and this permission notice (including the
  15  * next paragraph) shall be included in all copies or substantial portions
  16  * of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  21  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
  22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25  *
  26  **************************************************************************/
  27
  28
  29 /**
  30  * @file
 * Helper arithmetic functions.
 *
 * LLVM IR doesn't support all basic arithmetic operations we care about (most
 * notably min/max and saturated operations), and it is often necessary to
 * resort to machine-specific intrinsics directly. The functions here hide all
 * these implementation details from the other modules.
 *
 * We also do simple expression simplification here. Reasons are:
  39  * - it is very easy given we have all necessary information readily available
  40  * - LLVM optimization passes fail to simplify several vector expressions
  41  * - We often know value constraints which the optimization passes have no way
  42  *   of knowing, such as when source arguments are known to be in [0, 1] range.
  43  *
  44  * @author Jose Fonseca <jfonseca@vmware.com>
  45  */
  46
  47
  48 #include <float.h>
  49
  50 #include "util/u_memory.h"
  51 #include "util/u_debug.h"
  52 #include "util/u_math.h"
  53 #include "util/u_cpu_detect.h"
  54
  55 #include "lp_bld_type.h"
  56 #include "lp_bld_const.h"
  57 #include "lp_bld_init.h"
  58 #include "lp_bld_intr.h"
  59 #include "lp_bld_logic.h"
  60 #include "lp_bld_pack.h"
  61 #include "lp_bld_debug.h"
  62 #include "lp_bld_bitarit.h"
  63 #include "lp_bld_arit.h"
  64 #include "lp_bld_flow.h"
  65
  66 #if defined(PIPE_ARCH_SSE)
  67 #include <xmmintrin.h>
  68 #endif
  69
/*
 * Fallback values, used only when no header included above already defined
 * these macros.
 * NOTE(review): presumably the MXCSR "denormals are zero" and "flush to zero"
 * control bits; their users are outside this chunk -- confirm there.
 */
#ifndef _MM_DENORMALS_ZERO_MASK
#define _MM_DENORMALS_ZERO_MASK 0x0040
#endif

#ifndef _MM_FLUSH_ZERO_MASK
#define _MM_FLUSH_ZERO_MASK 0x8000
#endif

/*
 * NOTE(review): presumably the degrees of the polynomial approximations used
 * by the exp/log helpers later in this file; the users are not visible in
 * this chunk -- confirm there.
 */
#define EXP_POLY_DEGREE 5

#define LOG_POLY_DEGREE 4
  81
  82
  83 /**
  84  * Generate min(a, b)
  85  * No checks for special case values of a or b = 1 or 0 are done.
  86  * NaN's are handled according to the behavior specified by the
  87  * nan_behavior argument.
  88  */
  89 static LLVMValueRef
  90 lp_build_min_simple(struct lp_build_context *bld,
  91                     LLVMValueRef a,
  92                     LLVMValueRef b,
  93                     enum gallivm_nan_behavior nan_behavior)
  94 {
  95    const struct lp_type type = bld->type;
  96    const char *intrinsic = NULL;
  97    unsigned intr_size = 0;
  98    LLVMValueRef cond;
  99
 100    assert(lp_check_value(type, a));
 101    assert(lp_check_value(type, b));
 102
 103    /* TODO: optimize the constant case */
 104
 105    if (type.floating && util_cpu_caps.has_sse) {
 106       if (type.width == 32) {
 107          if (type.length == 1) {
 108             intrinsic = "llvm.x86.sse.min.ss";
 109             intr_size = 128;
 110          }
 111          else if (type.length <= 4 || !util_cpu_caps.has_avx) {
 112             intrinsic = "llvm.x86.sse.min.ps";
 113             intr_size = 128;
 114          }
 115          else {
 116             intrinsic = "llvm.x86.avx.min.ps.256";
 117             intr_size = 256;
 118          }
 119       }
 120       if (type.width == 64 && util_cpu_caps.has_sse2) {
 121          if (type.length == 1) {
 122             intrinsic = "llvm.x86.sse2.min.sd";
 123             intr_size = 128;
 124          }
 125          else if (type.length == 2 || !util_cpu_caps.has_avx) {
 126             intrinsic = "llvm.x86.sse2.min.pd";
 127             intr_size = 128;
 128          }
 129          else {
 130             intrinsic = "llvm.x86.avx.min.pd.256";
 131             intr_size = 256;
 132          }
 133       }
 134    }
 135    else if (type.floating && util_cpu_caps.has_altivec) {
 136       if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
 137           nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
 138          debug_printf("%s: altivec doesn't support nan return nan behavior\n",
 139                       __FUNCTION__);
 140       }
 141       if (type.width == 32 && type.length == 4) {
 142          intrinsic = "llvm.ppc.altivec.vminfp";
 143          intr_size = 128;
 144       }
 145    } else if (HAVE_LLVM < 0x0309 &&
 146               util_cpu_caps.has_avx2 && type.length > 4) {
 147       intr_size = 256;
 148       switch (type.width) {
 149       case 8:
 150          intrinsic = type.sign ? "llvm.x86.avx2.pmins.b" : "llvm.x86.avx2.pminu.b";
 151          break;
 152       case 16:
 153          intrinsic = type.sign ? "llvm.x86.avx2.pmins.w" : "llvm.x86.avx2.pminu.w";
 154          break;
 155       case 32:
 156          intrinsic = type.sign ? "llvm.x86.avx2.pmins.d" : "llvm.x86.avx2.pminu.d";
 157          break;
 158       }
 159    } else if (HAVE_LLVM < 0x0309 &&
 160               util_cpu_caps.has_sse2 && type.length >= 2) {
 161       intr_size = 128;
 162       if ((type.width == 8 || type.width == 16) &&
 163           (type.width * type.length <= 64) &&
 164           (gallivm_debug & GALLIVM_DEBUG_PERF)) {
 165          debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
 166                       __FUNCTION__);
 167       }
 168       if (type.width == 8 && !type.sign) {
 169          intrinsic = "llvm.x86.sse2.pminu.b";
 170       }
 171       else if (type.width == 16 && type.sign) {
 172          intrinsic = "llvm.x86.sse2.pmins.w";
 173       }
 174       if (util_cpu_caps.has_sse4_1) {
 175          if (type.width == 8 && type.sign) {
 176             intrinsic = "llvm.x86.sse41.pminsb";
 177          }
 178          if (type.width == 16 && !type.sign) {
 179             intrinsic = "llvm.x86.sse41.pminuw";
 180          }
 181          if (type.width == 32 && !type.sign) {
 182             intrinsic = "llvm.x86.sse41.pminud";
 183          }
 184          if (type.width == 32 && type.sign) {
 185             intrinsic = "llvm.x86.sse41.pminsd";
 186          }
 187       }
 188    } else if (util_cpu_caps.has_altivec) {
 189       intr_size = 128;
 190       if (type.width == 8) {
 191          if (!type.sign) {
 192             intrinsic = "llvm.ppc.altivec.vminub";
 193          } else {
 194             intrinsic = "llvm.ppc.altivec.vminsb";
 195          }
 196       } else if (type.width == 16) {
 197          if (!type.sign) {
 198             intrinsic = "llvm.ppc.altivec.vminuh";
 199          } else {
 200             intrinsic = "llvm.ppc.altivec.vminsh";
 201          }
 202       } else if (type.width == 32) {
 203          if (!type.sign) {
 204             intrinsic = "llvm.ppc.altivec.vminuw";
 205          } else {
 206             intrinsic = "llvm.ppc.altivec.vminsw";
 207          }
 208       }
 209    }
 210
 211    if (intrinsic) {
 212       /* We need to handle nan's for floating point numbers. If one of the
 213        * inputs is nan the other should be returned (required by both D3D10+
 214        * and OpenCL).
 215        * The sse intrinsics return the second operator in case of nan by
 216        * default so we need to special code to handle those.
 217        */
 218       if (util_cpu_caps.has_sse && type.floating &&
 219           nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
 220           nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
 221           nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
 222          LLVMValueRef isnan, min;
 223          min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
 224                                                    type,
 225                                                    intr_size, a, b);
 226          if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
 227             isnan = lp_build_isnan(bld, b);
 228             return lp_build_select(bld, isnan, a, min);
 229          } else {
 230             assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
 231             isnan = lp_build_isnan(bld, a);
 232             return lp_build_select(bld, isnan, a, min);
 233          }
 234       } else {
 235          return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
 236                                                     type,
 237                                                     intr_size, a, b);
 238       }
 239    }
 240
 241    if (type.floating) {
 242       switch (nan_behavior) {
 243       case GALLIVM_NAN_RETURN_NAN: {
 244          LLVMValueRef isnan = lp_build_isnan(bld, b);
 245          cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
 246          cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
 247          return lp_build_select(bld, cond, a, b);
 248       }
 249          break;
 250       case GALLIVM_NAN_RETURN_OTHER: {
 251          LLVMValueRef isnan = lp_build_isnan(bld, a);
 252          cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
 253          cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
 254          return lp_build_select(bld, cond, a, b);
 255       }
 256          break;
 257       case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
 258          cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
 259          return lp_build_select(bld, cond, a, b);
 260       case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
 261          cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
 262          return lp_build_select(bld, cond, b, a);
 263       case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
 264          cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
 265          return lp_build_select(bld, cond, a, b);
 266          break;
 267       default:
 268          assert(0);
 269          cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
 270          return lp_build_select(bld, cond, a, b);
 271       }
 272    } else {
 273       cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
 274       return lp_build_select(bld, cond, a, b);
 275    }
 276 }
 277
 278
 279 LLVMValueRef
 280 lp_build_fmuladd(LLVMBuilderRef builder,
 281                  LLVMValueRef a,
 282                  LLVMValueRef b,
 283                  LLVMValueRef c)
 284 {
 285    LLVMTypeRef type = LLVMTypeOf(a);
 286    assert(type == LLVMTypeOf(b));
 287    assert(type == LLVMTypeOf(c));
 288    if (HAVE_LLVM < 0x0304) {
 289       /* XXX: LLVM 3.3 does not breakdown llvm.fmuladd into mul+add when FMA is
 290        * not supported, and instead it falls-back to a C function.
 291        */
 292       return LLVMBuildFAdd(builder, LLVMBuildFMul(builder, a, b, ""), c, "");
 293    }
 294    char intrinsic[32];
 295    lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
 296    LLVMValueRef args[] = { a, b, c };
 297    return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0);
 298 }
 299
 300
 301 /**
 302  * Generate max(a, b)
 303  * No checks for special case values of a or b = 1 or 0 are done.
 304  * NaN's are handled according to the behavior specified by the
 305  * nan_behavior argument.
 306  */
 307 static LLVMValueRef
 308 lp_build_max_simple(struct lp_build_context *bld,
 309                     LLVMValueRef a,
 310                     LLVMValueRef b,
 311                     enum gallivm_nan_behavior nan_behavior)
 312 {
 313    const struct lp_type type = bld->type;
 314    const char *intrinsic = NULL;
 315    unsigned intr_size = 0;
 316    LLVMValueRef cond;
 317
 318    assert(lp_check_value(type, a));
 319    assert(lp_check_value(type, b));
 320
 321    /* TODO: optimize the constant case */
 322
 323    if (type.floating && util_cpu_caps.has_sse) {
 324       if (type.width == 32) {
 325          if (type.length == 1) {
 326             intrinsic = "llvm.x86.sse.max.ss";
 327             intr_size = 128;
 328          }
 329          else if (type.length <= 4 || !util_cpu_caps.has_avx) {
 330             intrinsic = "llvm.x86.sse.max.ps";
 331             intr_size = 128;
 332          }
 333          else {
 334             intrinsic = "llvm.x86.avx.max.ps.256";
 335             intr_size = 256;
 336          }
 337       }
 338       if (type.width == 64 && util_cpu_caps.has_sse2) {
 339          if (type.length == 1) {
 340             intrinsic = "llvm.x86.sse2.max.sd";
 341             intr_size = 128;
 342          }
 343          else if (type.length == 2 || !util_cpu_caps.has_avx) {
 344             intrinsic = "llvm.x86.sse2.max.pd";
 345             intr_size = 128;
 346          }
 347          else {
 348             intrinsic = "llvm.x86.avx.max.pd.256";
 349             intr_size = 256;
 350          }
 351       }
 352    }
 353    else if (type.floating && util_cpu_caps.has_altivec) {
 354       if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
 355           nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
 356          debug_printf("%s: altivec doesn't support nan return nan behavior\n",
 357                       __FUNCTION__);
 358       }
 359       if (type.width == 32 || type.length == 4) {
 360          intrinsic = "llvm.ppc.altivec.vmaxfp";
 361          intr_size = 128;
 362       }
 363    } else if (HAVE_LLVM < 0x0309 &&
 364               util_cpu_caps.has_avx2 && type.length > 4) {
 365       intr_size = 256;
 366       switch (type.width) {
 367       case 8:
 368          intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.b" : "llvm.x86.avx2.pmaxu.b";
 369          break;
 370       case 16:
 371          intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.w" : "llvm.x86.avx2.pmaxu.w";
 372          break;
 373       case 32:
 374          intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.d" : "llvm.x86.avx2.pmaxu.d";
 375          break;
 376       }
 377    } else if (HAVE_LLVM < 0x0309 &&
 378               util_cpu_caps.has_sse2 && type.length >= 2) {
 379       intr_size = 128;
 380       if ((type.width == 8 || type.width == 16) &&
 381           (type.width * type.length <= 64) &&
 382           (gallivm_debug & GALLIVM_DEBUG_PERF)) {
 383          debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
 384                       __FUNCTION__);
 385          }
 386       if (type.width == 8 && !type.sign) {
 387          intrinsic = "llvm.x86.sse2.pmaxu.b";
 388          intr_size = 128;
 389       }
 390       else if (type.width == 16 && type.sign) {
 391          intrinsic = "llvm.x86.sse2.pmaxs.w";
 392       }
 393       if (util_cpu_caps.has_sse4_1) {
 394          if (type.width == 8 && type.sign) {
 395             intrinsic = "llvm.x86.sse41.pmaxsb";
 396          }
 397          if (type.width == 16 && !type.sign) {
 398             intrinsic = "llvm.x86.sse41.pmaxuw";
 399          }
 400          if (type.width == 32 && !type.sign) {
 401             intrinsic = "llvm.x86.sse41.pmaxud";
 402         }
 403          if (type.width == 32 && type.sign) {
 404             intrinsic = "llvm.x86.sse41.pmaxsd";
 405          }
 406       }
 407    } else if (util_cpu_caps.has_altivec) {
 408      intr_size = 128;
 409      if (type.width == 8) {
 410        if (!type.sign) {
 411          intrinsic = "llvm.ppc.altivec.vmaxub";
 412        } else {
 413          intrinsic = "llvm.ppc.altivec.vmaxsb";
 414        }
 415      } else if (type.width == 16) {
 416        if (!type.sign) {
 417          intrinsic = "llvm.ppc.altivec.vmaxuh";
 418        } else {
 419          intrinsic = "llvm.ppc.altivec.vmaxsh";
 420        }
 421      } else if (type.width == 32) {
 422        if (!type.sign) {
 423          intrinsic = "llvm.ppc.altivec.vmaxuw";
 424        } else {
 425          intrinsic = "llvm.ppc.altivec.vmaxsw";
 426        }
 427      }
 428    }
 429
 430    if (intrinsic) {
 431       if (util_cpu_caps.has_sse && type.floating &&
 432           nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
 433           nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
 434           nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
 435          LLVMValueRef isnan, max;
 436          max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
 437                                                    type,
 438                                                    intr_size, a, b);
 439          if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
 440             isnan = lp_build_isnan(bld, b);
 441             return lp_build_select(bld, isnan, a, max);
 442          } else {
 443             assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
 444             isnan = lp_build_isnan(bld, a);
 445             return lp_build_select(bld, isnan, a, max);
 446          }
 447       } else {
 448          return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
 449                                                     type,
 450                                                     intr_size, a, b);
 451       }
 452    }
 453
 454    if (type.floating) {
 455       switch (nan_behavior) {
 456       case GALLIVM_NAN_RETURN_NAN: {
 457          LLVMValueRef isnan = lp_build_isnan(bld, b);
 458          cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
 459          cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
 460          return lp_build_select(bld, cond, a, b);
 461       }
 462          break;
 463       case GALLIVM_NAN_RETURN_OTHER: {
 464          LLVMValueRef isnan = lp_build_isnan(bld, a);
 465          cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
 466          cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
 467          return lp_build_select(bld, cond, a, b);
 468       }
 469          break;
 470       case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
 471          cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
 472          return lp_build_select(bld, cond, a, b);
 473       case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
 474          cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
 475          return lp_build_select(bld, cond, b, a);
 476       case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
 477          cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
 478          return lp_build_select(bld, cond, a, b);
 479          break;
 480       default:
 481          assert(0);
 482          cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
 483          return lp_build_select(bld, cond, a, b);
 484       }
 485    } else {
 486       cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
 487       return lp_build_select(bld, cond, a, b);
 488    }
 489 }
 490
 491
 492 /**
 493  * Generate 1 - a, or ~a depending on bld->type.
 494  */
 495 LLVMValueRef
 496 lp_build_comp(struct lp_build_context *bld,
 497               LLVMValueRef a)
 498 {
 499    LLVMBuilderRef builder = bld->gallivm->builder;
 500    const struct lp_type type = bld->type;
 501
 502    assert(lp_check_value(type, a));
 503
 504    if(a == bld->one)
 505       return bld->zero;
 506    if(a == bld->zero)
 507       return bld->one;
 508
 509    if(type.norm && !type.floating && !type.fixed && !type.sign) {
 510       if(LLVMIsConstant(a))
 511          return LLVMConstNot(a);
 512       else
 513          return LLVMBuildNot(builder, a, "");
 514    }
 515
 516    if(LLVMIsConstant(a))
 517       if (type.floating)
 518           return LLVMConstFSub(bld->one, a);
 519       else
 520           return LLVMConstSub(bld->one, a);
 521    else
 522       if (type.floating)
 523          return LLVMBuildFSub(builder, bld->one, a, "");
 524       else
 525          return LLVMBuildSub(builder, bld->one, a, "");
 526 }
 527
 528
 529 /**
 530  * Generate a + b
 531  */
 532 LLVMValueRef
 533 lp_build_add(struct lp_build_context *bld,
 534              LLVMValueRef a,
 535              LLVMValueRef b)
 536 {
 537    LLVMBuilderRef builder = bld->gallivm->builder;
 538    const struct lp_type type = bld->type;
 539    LLVMValueRef res;
 540
 541    assert(lp_check_value(type, a));
 542    assert(lp_check_value(type, b));
 543
 544    if (a == bld->zero)
 545       return b;
 546    if (b == bld->zero)
 547       return a;
 548    if (a == bld->undef || b == bld->undef)
 549       return bld->undef;
 550
 551    if (type.norm) {
 552       const char *intrinsic = NULL;
 553
 554       if (!type.sign && (a == bld->one || b == bld->one))
 555         return bld->one;
 556
 557       if (!type.floating && !type.fixed) {
 558          if (HAVE_LLVM >= 0x0900) {
 559             char intrin[32];
 560             intrinsic = type.sign ? "llvm.sadd.sat" : "llvm.uadd.sat";
 561             lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
 562             return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
 563          }
 564          if (type.width * type.length == 128) {
 565             if (util_cpu_caps.has_sse2) {
 566                if (type.width == 8)
 567                  intrinsic = type.sign ? "llvm.x86.sse2.padds.b" :
 568                                          HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.paddus.b" : NULL;
 569                if (type.width == 16)
 570                  intrinsic = type.sign ? "llvm.x86.sse2.padds.w" :
 571                                          HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.paddus.w" : NULL;
 572             } else if (util_cpu_caps.has_altivec) {
 573                if (type.width == 8)
 574                   intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
 575                if (type.width == 16)
 576                   intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
 577             }
 578          }
 579          if (type.width * type.length == 256) {
 580             if (util_cpu_caps.has_avx2) {
 581                if (type.width == 8)
 582                   intrinsic = type.sign ? "llvm.x86.avx2.padds.b" :
 583                                           HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.paddus.b" : NULL;
 584                if (type.width == 16)
 585                   intrinsic = type.sign ? "llvm.x86.avx2.padds.w" :
 586                                           HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.paddus.w" : NULL;
 587             }
 588          }
 589       }
 590
 591       if (intrinsic)
 592          return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
 593    }
 594
 595    if(type.norm && !type.floating && !type.fixed) {
 596       if (type.sign) {
 597          uint64_t sign = (uint64_t)1 << (type.width - 1);
 598          LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
 599          LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
 600          /* a_clamp_max is the maximum a for positive b,
 601             a_clamp_min is the minimum a for negative b. */
 602          LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
 603          LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
 604          a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
 605       }
 606    }
 607
 608    if(LLVMIsConstant(a) && LLVMIsConstant(b))
 609       if (type.floating)
 610          res = LLVMConstFAdd(a, b);
 611       else
 612          res = LLVMConstAdd(a, b);
 613    else
 614       if (type.floating)
 615          res = LLVMBuildFAdd(builder, a, b, "");
 616       else
 617          res = LLVMBuildAdd(builder, a, b, "");
 618
 619    /* clamp to ceiling of 1.0 */
 620    if(bld->type.norm && (bld->type.floating || bld->type.fixed))
 621       res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
 622
 623    if (type.norm && !type.floating && !type.fixed) {
 624       if (!type.sign) {
 625          /*
 626           * newer llvm versions no longer support the intrinsics, but recognize
 627           * the pattern. Since auto-upgrade of intrinsics doesn't work for jit
 628           * code, it is important we match the pattern llvm uses (and pray llvm
 629           * doesn't change it - and hope they decide on the same pattern for
 630           * all backends supporting it...).
 631           * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
 632           * interfere with llvm's ability to recognize the pattern but seems
 633           * a bit brittle.
 634           * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
 635           */
 636          LLVMValueRef overflowed = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, res);
 637          res = lp_build_select(bld, overflowed,
 638                                LLVMConstAllOnes(bld->int_vec_type), res);
 639       }
 640    }
 641
 642    /* XXX clamp to floor of -1 or 0??? */
 643
 644    return res;
 645 }
 646
 647
 648 /** Return the scalar sum of the elements of a.
 649  * Should avoid this operation whenever possible.
 650  */
 651 LLVMValueRef
 652 lp_build_horizontal_add(struct lp_build_context *bld,
 653                         LLVMValueRef a)
 654 {
 655    LLVMBuilderRef builder = bld->gallivm->builder;
 656    const struct lp_type type = bld->type;
 657    LLVMValueRef index, res;
 658    unsigned i, length;
 659    LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
 660    LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
 661    LLVMValueRef vecres, elem2;
 662
 663    assert(lp_check_value(type, a));
 664
 665    if (type.length == 1) {
 666       return a;
 667    }
 668
 669    assert(!bld->type.norm);
 670
 671    /*
 672     * for byte vectors can do much better with psadbw.
 673     * Using repeated shuffle/adds here. Note with multiple vectors
 674     * this can be done more efficiently as outlined in the intel
 675     * optimization manual.
 676     * Note: could cause data rearrangement if used with smaller element
 677     * sizes.
 678     */
 679
 680    vecres = a;
 681    length = type.length / 2;
 682    while (length > 1) {
 683       LLVMValueRef vec1, vec2;
 684       for (i = 0; i < length; i++) {
 685          shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
 686          shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
 687       }
 688       vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
 689                                     LLVMConstVector(shuffles1, length), "");
 690       vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
 691                                     LLVMConstVector(shuffles2, length), "");
 692       if (type.floating) {
 693          vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
 694       }
 695       else {
 696          vecres = LLVMBuildAdd(builder, vec1, vec2, "");
 697       }
 698       length = length >> 1;
 699    }
 700
 701    /* always have vector of size 2 here */
 702    assert(length == 1);
 703
 704    index = lp_build_const_int32(bld->gallivm, 0);
 705    res = LLVMBuildExtractElement(builder, vecres, index, "");
 706    index = lp_build_const_int32(bld->gallivm, 1);
 707    elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
 708
 709    if (type.floating)
 710       res = LLVMBuildFAdd(builder, res, elem2, "");
 711     else
 712       res = LLVMBuildAdd(builder, res, elem2, "");
 713
 714    return res;
 715 }
 716
 717 /**
 718  * Return the horizontal sums of 4 float vectors as a float4 vector.
 719  * This uses the technique as outlined in Intel Optimization Manual.
 720  */
 721 static LLVMValueRef
 722 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
 723                             LLVMValueRef src[4])
 724 {
 725    struct gallivm_state *gallivm = bld->gallivm;
 726    LLVMBuilderRef builder = gallivm->builder;
 727    LLVMValueRef shuffles[4];
 728    LLVMValueRef tmp[4];
 729    LLVMValueRef sumtmp[2], shuftmp[2];
 730
 731    /* lower half of regs */
 732    shuffles[0] = lp_build_const_int32(gallivm, 0);
 733    shuffles[1] = lp_build_const_int32(gallivm, 1);
 734    shuffles[2] = lp_build_const_int32(gallivm, 4);
 735    shuffles[3] = lp_build_const_int32(gallivm, 5);
 736    tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
 737                                    LLVMConstVector(shuffles, 4), "");
 738    tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
 739                                    LLVMConstVector(shuffles, 4), "");
 740
 741    /* upper half of regs */
 742    shuffles[0] = lp_build_const_int32(gallivm, 2);
 743    shuffles[1] = lp_build_const_int32(gallivm, 3);
 744    shuffles[2] = lp_build_const_int32(gallivm, 6);
 745    shuffles[3] = lp_build_const_int32(gallivm, 7);
 746    tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
 747                                    LLVMConstVector(shuffles, 4), "");
 748    tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
 749                                    LLVMConstVector(shuffles, 4), "");
 750
 751    sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
 752    sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
 753
 754    shuffles[0] = lp_build_const_int32(gallivm, 0);
 755    shuffles[1] = lp_build_const_int32(gallivm, 2);
 756    shuffles[2] = lp_build_const_int32(gallivm, 4);
 757    shuffles[3] = lp_build_const_int32(gallivm, 6);
 758    shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
 759                                        LLVMConstVector(shuffles, 4), "");
 760
 761    shuffles[0] = lp_build_const_int32(gallivm, 1);
 762    shuffles[1] = lp_build_const_int32(gallivm, 3);
 763    shuffles[2] = lp_build_const_int32(gallivm, 5);
 764    shuffles[3] = lp_build_const_int32(gallivm, 7);
 765    shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
 766                                        LLVMConstVector(shuffles, 4), "");
 767
 768    return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
 769 }
 770
 771
 772 /*
 773  * partially horizontally add 2-4 float vectors with length nx4,
 774  * i.e. only four adjacent values in each vector will be added,
 775  * assuming values are really grouped in 4 which also determines
 776  * output order.
 777  *
 778  * Return a vector of the same length as the initial vectors,
 779  * with the excess elements (if any) being undefined.
 780  * The element order is independent of number of input vectors.
 781  * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
 782  * the output order thus will be
 783  * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4z7,undef
 784  */
 785 LLVMValueRef
 786 lp_build_hadd_partial4(struct lp_build_context *bld,
 787                        LLVMValueRef vectors[],
 788                        unsigned num_vecs)
 789 {
 790    struct gallivm_state *gallivm = bld->gallivm;
 791    LLVMBuilderRef builder = gallivm->builder;
 792    LLVMValueRef ret_vec;
 793    LLVMValueRef tmp[4];
 794    const char *intrinsic = NULL;
 795
 796    assert(num_vecs >= 2 && num_vecs <= 4);
 797    assert(bld->type.floating);
 798
 799    /* only use this with at least 2 vectors, as it is sort of expensive
 800     * (depending on cpu) and we always need two horizontal adds anyway,
 801     * so a shuffle/add approach might be better.
 802     */
 803
 804    tmp[0] = vectors[0];
 805    tmp[1] = vectors[1];
 806
 807    tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
 808    tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
 809
 810    if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
 811        bld->type.length == 4) {
 812       intrinsic = "llvm.x86.sse3.hadd.ps";
 813    }
 814    else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
 815             bld->type.length == 8) {
 816       intrinsic = "llvm.x86.avx.hadd.ps.256";
 817    }
 818    if (intrinsic) {
 819       tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
 820                                        lp_build_vec_type(gallivm, bld->type),
 821                                        tmp[0], tmp[1]);
 822       if (num_vecs > 2) {
 823          tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
 824                                           lp_build_vec_type(gallivm, bld->type),
 825                                           tmp[2], tmp[3]);
 826       }
 827       else {
 828          tmp[1] = tmp[0];
 829       }
 830       return lp_build_intrinsic_binary(builder, intrinsic,
 831                                        lp_build_vec_type(gallivm, bld->type),
 832                                        tmp[0], tmp[1]);
 833    }
 834
 835    if (bld->type.length == 4) {
 836       ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
 837    }
 838    else {
 839       LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
 840       unsigned j;
 841       unsigned num_iter = bld->type.length / 4;
 842       struct lp_type parttype = bld->type;
 843       parttype.length = 4;
 844       for (j = 0; j < num_iter; j++) {
 845          LLVMValueRef partsrc[4];
 846          unsigned i;
 847          for (i = 0; i < 4; i++) {
 848             partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
 849          }
 850          partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
 851       }
 852       ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
 853    }
 854    return ret_vec;
 855 }
 856
 857 /**
 858  * Generate a - b
 859  */
 860 LLVMValueRef
 861 lp_build_sub(struct lp_build_context *bld,
 862              LLVMValueRef a,
 863              LLVMValueRef b)
 864 {
 865    LLVMBuilderRef builder = bld->gallivm->builder;
 866    const struct lp_type type = bld->type;
 867    LLVMValueRef res;
 868
 869    assert(lp_check_value(type, a));
 870    assert(lp_check_value(type, b));
 871
 872    if (b == bld->zero)
 873       return a;
 874    if (a == bld->undef || b == bld->undef)
 875       return bld->undef;
 876    if (a == b)
 877       return bld->zero;
 878
 879    if (type.norm) {
 880       const char *intrinsic = NULL;
 881
 882       if (!type.sign && b == bld->one)
 883         return bld->zero;
 884
 885       if (!type.floating && !type.fixed) {
 886          if (HAVE_LLVM >= 0x0900) {
 887             char intrin[32];
 888             intrinsic = type.sign ? "llvm.ssub.sat" : "llvm.usub.sat";
 889             lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
 890             return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
 891          }
 892          if (type.width * type.length == 128) {
 893             if (util_cpu_caps.has_sse2) {
 894                if (type.width == 8)
 895                   intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" :
 896                                           HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.psubus.b" : NULL;
 897                if (type.width == 16)
 898                   intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" :
 899                                           HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.psubus.w" : NULL;
 900             } else if (util_cpu_caps.has_altivec) {
 901                if (type.width == 8)
 902                   intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
 903                if (type.width == 16)
 904                   intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
 905             }
 906          }
 907          if (type.width * type.length == 256) {
 908             if (util_cpu_caps.has_avx2) {
 909                if (type.width == 8)
 910                   intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" :
 911                                           HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.psubus.b" : NULL;
 912                if (type.width == 16)
 913                   intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" :
 914                                           HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.psubus.w" : NULL;
 915             }
 916          }
 917       }
 918
 919       if (intrinsic)
 920          return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
 921    }
 922
 923    if(type.norm && !type.floating && !type.fixed) {
 924       if (type.sign) {
 925          uint64_t sign = (uint64_t)1 << (type.width - 1);
 926          LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
 927          LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
 928          /* a_clamp_max is the maximum a for negative b,
 929             a_clamp_min is the minimum a for positive b. */
 930          LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
 931          LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
 932          a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
 933       } else {
 934          /*
 935           * This must match llvm pattern for saturated unsigned sub.
 936           * (lp_build_max_simple actually does the job with its current
 937           * definition but do it explicitly here.)
 938           * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
 939           * interfere with llvm's ability to recognize the pattern but seems
 940           * a bit brittle.
 941           * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
 942           */
 943          LLVMValueRef no_ov = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
 944          a = lp_build_select(bld, no_ov, a, b);
 945       }
 946    }
 947
 948    if(LLVMIsConstant(a) && LLVMIsConstant(b))
 949       if (type.floating)
 950          res = LLVMConstFSub(a, b);
 951       else
 952          res = LLVMConstSub(a, b);
 953    else
 954       if (type.floating)
 955          res = LLVMBuildFSub(builder, a, b, "");
 956       else
 957          res = LLVMBuildSub(builder, a, b, "");
 958
 959    if(bld->type.norm && (bld->type.floating || bld->type.fixed))
 960       res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
 961
 962    return res;
 963 }
 964
 965
 966
 967 /**
 968  * Normalized multiplication.
 969  *
 970  * There are several approaches for (using 8-bit normalized multiplication as
 971  * an example):
 972  *
 973  * - alpha plus one
 974  *
 975  *     makes the following approximation to the division (Sree)
 976  *
 977  *       a*b/255 ~= (a*(b + 1)) >> 256
 978  *
 979  *     which is the fastest method that satisfies the following OpenGL criteria of
 980  *
 981  *       0*0 = 0 and 255*255 = 255
 982  *
 983  * - geometric series
 984  *
 985  *     takes the geometric series approximation to the division
 986  *
 987  *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
 988  *
 989  *     in this case just the first two terms to fit in 16bit arithmetic
 990  *
 991  *       t/255 ~= (t + (t >> 8)) >> 8
 992  *
 993  *     note that just by itself it doesn't satisfies the OpenGL criteria, as
 994  *     255*255 = 254, so the special case b = 255 must be accounted or roundoff
 995  *     must be used.
 996  *
 997  * - geometric series plus rounding
 998  *
 999  *     when using a geometric series division instead of truncating the result
1000  *     use roundoff in the approximation (Jim Blinn)
1001  *
1002  *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
1003  *
1004  *     achieving the exact results.
1005  *
1006  *
1007  *
1008  * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
1009  *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
1010  * @sa Michael Herf, The "double blend trick", May 2000,
1011  *     http://www.stereopsis.com/doubleblend.html
1012  */
1013 LLVMValueRef
1014 lp_build_mul_norm(struct gallivm_state *gallivm,
1015                   struct lp_type wide_type,
1016                   LLVMValueRef a, LLVMValueRef b)
1017 {
1018    LLVMBuilderRef builder = gallivm->builder;
1019    struct lp_build_context bld;
1020    unsigned n;
1021    LLVMValueRef half;
1022    LLVMValueRef ab;
1023
1024    assert(!wide_type.floating);
1025    assert(lp_check_value(wide_type, a));
1026    assert(lp_check_value(wide_type, b));
1027
1028    lp_build_context_init(&bld, gallivm, wide_type);
1029
1030    n = wide_type.width / 2;
1031    if (wide_type.sign) {
1032       --n;
1033    }
1034
1035    /*
1036     * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
1037     * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
1038     */
1039
1040    /*
1041     * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
1042     */
1043
1044    ab = LLVMBuildMul(builder, a, b, "");
1045    ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
1046
1047    /*
1048     * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
1049     */
1050
1051    half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
1052    if (wide_type.sign) {
1053       LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
1054       LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
1055       half = lp_build_select(&bld, sign, minus_half, half);
1056    }
1057    ab = LLVMBuildAdd(builder, ab, half, "");
1058
1059    /* Final division */
1060    ab = lp_build_shr_imm(&bld, ab, n);
1061
1062    return ab;
1063 }
1064
1065 /**
1066  * Generate a * b
1067  */
1068 LLVMValueRef
1069 lp_build_mul(struct lp_build_context *bld,
1070              LLVMValueRef a,
1071              LLVMValueRef b)
1072 {
1073    LLVMBuilderRef builder = bld->gallivm->builder;
1074    const struct lp_type type = bld->type;
1075    LLVMValueRef shift;
1076    LLVMValueRef res;
1077
1078    assert(lp_check_value(type, a));
1079    assert(lp_check_value(type, b));
1080
1081    if(a == bld->zero)
1082       return bld->zero;
1083    if(a == bld->one)
1084       return b;
1085    if(b == bld->zero)
1086       return bld->zero;
1087    if(b == bld->one)
1088       return a;
1089    if(a == bld->undef || b == bld->undef)
1090       return bld->undef;
1091
1092    if (!type.floating && !type.fixed && type.norm) {
1093       struct lp_type wide_type = lp_wider_type(type);
1094       LLVMValueRef al, ah, bl, bh, abl, abh, ab;
1095
1096       lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah);
1097       lp_build_unpack2_native(bld->gallivm, type, wide_type, b, &bl, &bh);
1098
1099       /* PMULLW, PSRLW, PADDW */
1100       abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
1101       abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
1102
1103       ab = lp_build_pack2_native(bld->gallivm, wide_type, type, abl, abh);
1104
1105       return ab;
1106    }
1107
1108    if(type.fixed)
1109       shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
1110    else
1111       shift = NULL;
1112
1113    if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1114       if (type.floating)
1115          res = LLVMConstFMul(a, b);
1116       else
1117          res = LLVMConstMul(a, b);
1118       if(shift) {
1119          if(type.sign)
1120             res = LLVMConstAShr(res, shift);
1121          else
1122             res = LLVMConstLShr(res, shift);
1123       }
1124    }
1125    else {
1126       if (type.floating)
1127          res = LLVMBuildFMul(builder, a, b, "");
1128       else
1129          res = LLVMBuildMul(builder, a, b, "");
1130       if(shift) {
1131          if(type.sign)
1132             res = LLVMBuildAShr(builder, res, shift, "");
1133          else
1134             res = LLVMBuildLShr(builder, res, shift, "");
1135       }
1136    }
1137
1138    return res;
1139 }
1140
/*
 * Widening mul, valid for 32x32 bit -> 64bit only.
 * Result is low 32bits, high bits returned in res_hi.
 *
 * Emits code that is meant to be compiled for the host CPU.
 *
 * For vectors of length 4 or 8 on CPUs with SSE2 (unsigned only) or
 * SSE4.1 this multiplies the even-indexed and the odd-indexed elements
 * separately with x86 widening-multiply intrinsics and then shuffles
 * the low and high halves of the products back into element order.
 * Everything else is forwarded to lp_build_mul_32_lohi().
 *
 * bld:    build context; asserted to be a plain 32bit integer type
 * a, b:   operands of that type
 * res_hi: receives the high 32 bits of every product
 */
LLVMValueRef
lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
                         LLVMValueRef a,
                         LLVMValueRef b,
                         LLVMValueRef *res_hi)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;

   assert(bld->type.width == 32);
   assert(bld->type.floating == 0);
   assert(bld->type.fixed == 0);
   assert(bld->type.norm == 0);

   /*
    * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces
    * for x86 simd is atrocious (even if the high bits weren't required),
    * trying to handle real 64bit inputs (which of course can't happen due
    * to using 64bit umul with 32bit numbers zero-extended to 64bit, but
    * apparently llvm does not recognize this widening mul). This includes 6
    * (instead of 2) pmuludq plus extra adds and shifts
    * The same story applies to signed mul, albeit fixing this requires sse41.
    * https://llvm.org/bugs/show_bug.cgi?id=30845
    * So, whip up our own code, albeit only for length 4 and 8 (which
    * should be good enough)...
    */
   if ((bld->type.length == 4 || bld->type.length == 8) &&
       ((util_cpu_caps.has_sse2 && (bld->type.sign == 0)) ||
        util_cpu_caps.has_sse4_1)) {
      const char *intrinsic = NULL;
      LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd;
      LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec;
      struct lp_type type_wide = lp_wider_type(bld->type);
      LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide);
      unsigned i;
      /*
       * Shuffle mask that moves every odd element down into the even
       * slot before it; the odd slots themselves are left undefined.
       */
      for (i = 0; i < bld->type.length; i += 2) {
         shuf[i] = lp_build_const_int32(gallivm, i+1);
         shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
      }
      shuf_vec = LLVMConstVector(shuf, bld->type.length);
      /*
       * The unshuffled inputs serve as the "even" operands
       * (NOTE(review): this relies on the multiply intrinsics reading
       * only the even 32bit elements - confirm against the x86
       * PMULUDQ/PMULDQ definition).
       */
      aeven = a;
      beven = b;
      aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, "");
      bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, "");

      if (util_cpu_caps.has_avx2 && bld->type.length == 8) {
         /* AVX2: one 256bit multiply each for the even and odd elements. */
         if (bld->type.sign) {
            intrinsic = "llvm.x86.avx2.pmul.dq";
         } else {
            intrinsic = "llvm.x86.avx2.pmulu.dq";
         }
         muleven = lp_build_intrinsic_binary(builder, intrinsic,
                                             wider_type, aeven, beven);
         mulodd = lp_build_intrinsic_binary(builder, intrinsic,
                                            wider_type, aodd, bodd);
      }
      else {
         /* for consistent naming look elsewhere... */
         if (bld->type.sign) {
            intrinsic = "llvm.x86.sse41.pmuldq";
         } else {
            intrinsic = "llvm.x86.sse2.pmulu.dq";
         }
         /*
          * XXX If we only have AVX but not AVX2 this is a pain.
          * lp_build_intrinsic_binary_anylength() can't handle it
          * (due to src and dst type not being identical).
          */
         if (bld->type.length == 8) {
            /*
             * Split the 8-wide operands into two 4-wide halves by hand,
             * multiply each half with the 128bit intrinsic and
             * concatenate the 2x64bit results again.
             */
            LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi;
            LLVMValueRef aoddlo, aoddhi, boddlo, boddhi;
            LLVMValueRef muleven2[2], mulodd2[2];
            struct lp_type type_wide_half = type_wide;
            LLVMTypeRef wtype_half;
            type_wide_half.length = 2;
            wtype_half = lp_build_vec_type(gallivm, type_wide_half);
            aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4);
            aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4);
            bevenlo = lp_build_extract_range(gallivm, beven, 0, 4);
            bevenhi = lp_build_extract_range(gallivm, beven, 4, 4);
            aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4);
            aoddhi = lp_build_extract_range(gallivm, aodd, 4, 4);
            boddlo = lp_build_extract_range(gallivm, bodd, 0, 4);
            boddhi = lp_build_extract_range(gallivm, bodd, 4, 4);
            muleven2[0] = lp_build_intrinsic_binary(builder, intrinsic,
                                                    wtype_half, aevenlo, bevenlo);
            mulodd2[0] = lp_build_intrinsic_binary(builder, intrinsic,
                                                   wtype_half, aoddlo, boddlo);
            muleven2[1] = lp_build_intrinsic_binary(builder, intrinsic,
                                                    wtype_half, aevenhi, bevenhi);
            mulodd2[1] = lp_build_intrinsic_binary(builder, intrinsic,
                                                   wtype_half, aoddhi, boddhi);
            muleven = lp_build_concat(gallivm, muleven2, type_wide_half, 2);
            mulodd = lp_build_concat(gallivm, mulodd2, type_wide_half, 2);

         }
         else {
            muleven = lp_build_intrinsic_binary(builder, intrinsic,
                                                wider_type, aeven, beven);
            mulodd = lp_build_intrinsic_binary(builder, intrinsic,
                                               wider_type, aodd, bodd);
         }
      }
      /* Reinterpret the 64bit products as pairs of 32bit elements. */
      muleven = LLVMBuildBitCast(builder, muleven, bld->vec_type, "");
      mulodd = LLVMBuildBitCast(builder, mulodd, bld->vec_type, "");

      /*
       * High halves: take element i+1 of muleven and element i+1 of
       * mulodd (shuffle indices >= length address the second operand),
       * interleaved so the results are back in source element order.
       */
      for (i = 0; i < bld->type.length; i += 2) {
         shuf[i] = lp_build_const_int32(gallivm, i + 1);
         shuf[i+1] = lp_build_const_int32(gallivm, i + 1 + bld->type.length);
      }
      shuf_vec = LLVMConstVector(shuf, bld->type.length);
      *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");

      /* Low halves: same interleave, but using element i of each product. */
      for (i = 0; i < bld->type.length; i += 2) {
         shuf[i] = lp_build_const_int32(gallivm, i);
         shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length);
      }
      shuf_vec = LLVMConstVector(shuf, bld->type.length);
      return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
   }
   else {
      /* Other vector lengths or CPUs: fall back to the generic code. */
      return lp_build_mul_32_lohi(bld, a, b, res_hi);
   }
}
1271
1272
1273 /*
1274  * Widening mul, valid for 32x32 bit -> 64bit only.
1275  * Result is low 32bits, high bits returned in res_hi.
1276  *
1277  * Emits generic code.
1278  */
1279 LLVMValueRef
1280 lp_build_mul_32_lohi(struct lp_build_context *bld,
1281                      LLVMValueRef a,
1282                      LLVMValueRef b,
1283                      LLVMValueRef *res_hi)
1284 {
1285    struct gallivm_state *gallivm = bld->gallivm;
1286    LLVMBuilderRef builder = gallivm->builder;
1287    LLVMValueRef tmp, shift, res_lo;
1288    struct lp_type type_tmp;
1289    LLVMTypeRef wide_type, narrow_type;
1290
1291    type_tmp = bld->type;
1292    narrow_type = lp_build_vec_type(gallivm, type_tmp);
1293    type_tmp.width *= 2;
1294    wide_type = lp_build_vec_type(gallivm, type_tmp);
1295    shift = lp_build_const_vec(gallivm, type_tmp, 32);
1296
1297    if (bld->type.sign) {
1298       a = LLVMBuildSExt(builder, a, wide_type, "");
1299       b = LLVMBuildSExt(builder, b, wide_type, "");
1300    } else {
1301       a = LLVMBuildZExt(builder, a, wide_type, "");
1302       b = LLVMBuildZExt(builder, b, wide_type, "");
1303    }
1304    tmp = LLVMBuildMul(builder, a, b, "");
1305
1306    res_lo = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1307
1308    /* Since we truncate anyway, LShr and AShr are equivalent. */
1309    tmp = LLVMBuildLShr(builder, tmp, shift, "");
1310    *res_hi = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1311
1312    return res_lo;
1313 }
1314
1315
1316 /* a * b + c */
1317 LLVMValueRef
1318 lp_build_mad(struct lp_build_context *bld,
1319              LLVMValueRef a,
1320              LLVMValueRef b,
1321              LLVMValueRef c)
1322 {
1323    const struct lp_type type = bld->type;
1324    if (type.floating) {
1325       return lp_build_fmuladd(bld->gallivm->builder, a, b, c);
1326    } else {
1327       return lp_build_add(bld, lp_build_mul(bld, a, b), c);
1328    }
1329 }
1330
1331
1332 /**
1333  * Small vector x scale multiplication optimization.
1334  */
1335 LLVMValueRef
1336 lp_build_mul_imm(struct lp_build_context *bld,
1337                  LLVMValueRef a,
1338                  int b)
1339 {
1340    LLVMBuilderRef builder = bld->gallivm->builder;
1341    LLVMValueRef factor;
1342
1343    assert(lp_check_value(bld->type, a));
1344
1345    if(b == 0)
1346       return bld->zero;
1347
1348    if(b == 1)
1349       return a;
1350
1351    if(b == -1)
1352       return lp_build_negate(bld, a);
1353
1354    if(b == 2 && bld->type.floating)
1355       return lp_build_add(bld, a, a);
1356
1357    if(util_is_power_of_two_or_zero(b)) {
1358       unsigned shift = ffs(b) - 1;
1359
1360       if(bld->type.floating) {
1361 #if 0
1362          /*
1363           * Power of two multiplication by directly manipulating the exponent.
1364           *
1365           * XXX: This might not be always faster, it will introduce a small error
1366           * for multiplication by zero, and it will produce wrong results
1367           * for Inf and NaN.
1368           */
1369          unsigned mantissa = lp_mantissa(bld->type);
1370          factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1371          a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1372          a = LLVMBuildAdd(builder, a, factor, "");
1373          a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1374          return a;
1375 #endif
1376       }
1377       else {
1378          factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1379          return LLVMBuildShl(builder, a, factor, "");
1380       }
1381    }
1382
1383    factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1384    return lp_build_mul(bld, a, factor);
1385 }
1386
1387
1388 /**
1389  * Generate a / b
1390  */
1391 LLVMValueRef
1392 lp_build_div(struct lp_build_context *bld,
1393              LLVMValueRef a,
1394              LLVMValueRef b)
1395 {
1396    LLVMBuilderRef builder = bld->gallivm->builder;
1397    const struct lp_type type = bld->type;
1398
1399    assert(lp_check_value(type, a));
1400    assert(lp_check_value(type, b));
1401
1402    if(a == bld->zero)
1403       return bld->zero;
1404    if(a == bld->one && type.floating)
1405       return lp_build_rcp(bld, b);
1406    if(b == bld->zero)
1407       return bld->undef;
1408    if(b == bld->one)
1409       return a;
1410    if(a == bld->undef || b == bld->undef)
1411       return bld->undef;
1412
1413    if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1414       if (type.floating)
1415          return LLVMConstFDiv(a, b);
1416       else if (type.sign)
1417          return LLVMConstSDiv(a, b);
1418       else
1419          return LLVMConstUDiv(a, b);
1420    }
1421
1422    /* fast rcp is disabled (just uses div), so makes no sense to try that */
1423    if(FALSE &&
1424       ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1425        (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
1426       type.floating)
1427       return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1428
1429    if (type.floating)
1430       return LLVMBuildFDiv(builder, a, b, "");
1431    else if (type.sign)
1432       return LLVMBuildSDiv(builder, a, b, "");
1433    else
1434       return LLVMBuildUDiv(builder, a, b, "");
1435 }
1436
1437
1438 /**
1439  * Linear interpolation helper.
1440  *
1441  * @param normalized whether we are interpolating normalized values,
1442  *        encoded in normalized integers, twice as wide.
1443  *
1444  * @sa http://www.stereopsis.com/doubleblend.html
1445  */
1446 static inline LLVMValueRef
1447 lp_build_lerp_simple(struct lp_build_context *bld,
1448                      LLVMValueRef x,
1449                      LLVMValueRef v0,
1450                      LLVMValueRef v1,
1451                      unsigned flags)
1452 {
1453    unsigned half_width = bld->type.width/2;
1454    LLVMBuilderRef builder = bld->gallivm->builder;
1455    LLVMValueRef delta;
1456    LLVMValueRef res;
1457
1458    assert(lp_check_value(bld->type, x));
1459    assert(lp_check_value(bld->type, v0));
1460    assert(lp_check_value(bld->type, v1));
1461
1462    delta = lp_build_sub(bld, v1, v0);
1463
1464    if (bld->type.floating) {
1465       assert(flags == 0);
1466       return lp_build_mad(bld, x, delta, v0);
1467    }
1468
1469    if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1470       if (!bld->type.sign) {
1471          if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1472             /*
1473              * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1474              * most-significant-bit to the lowest-significant-bit, so that
1475              * later we can just divide by 2**n instead of 2**n - 1.
1476              */
1477
1478             x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1479          }
1480
1481          /* (x * delta) >> n */
1482          res = lp_build_mul(bld, x, delta);
1483          res = lp_build_shr_imm(bld, res, half_width);
1484       } else {
1485          /*
1486           * The rescaling trick above doesn't work for signed numbers, so
1487           * use the 2**n - 1 divison approximation in lp_build_mul_norm
1488           * instead.
1489           */
1490          assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1491          res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1492       }
1493    } else {
1494       assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1495       res = lp_build_mul(bld, x, delta);
1496    }
1497
1498    if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
1499       /*
1500        * At this point both res and v0 only use the lower half of the bits,
1501        * the rest is zero. Instead of add / mask, do add with half wide type.
1502        */
1503       struct lp_type narrow_type;
1504       struct lp_build_context narrow_bld;
1505
1506       memset(&narrow_type, 0, sizeof narrow_type);
1507       narrow_type.sign   = bld->type.sign;
1508       narrow_type.width  = bld->type.width/2;
1509       narrow_type.length = bld->type.length*2;
1510
1511       lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
1512       res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
1513       v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
1514       res = lp_build_add(&narrow_bld, v0, res);
1515       res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
1516    } else {
1517       res = lp_build_add(bld, v0, res);
1518
1519       if (bld->type.fixed) {
1520          /*
1521           * We need to mask out the high order bits when lerping 8bit
1522           * normalized colors stored on 16bits
1523           */
1524          /* XXX: This step is necessary for lerping 8bit colors stored on
1525           * 16bits, but it will be wrong for true fixed point use cases.
1526           * Basically we need a more powerful lp_type, capable of further
1527           * distinguishing the values interpretation from the value storage.
1528           */
1529          LLVMValueRef low_bits;
1530          low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
1531          res = LLVMBuildAnd(builder, res, low_bits, "");
1532       }
1533    }
1534
1535    return res;
1536 }
1537
1538
1539 /**
1540  * Linear interpolation.
1541  */
1542 LLVMValueRef
1543 lp_build_lerp(struct lp_build_context *bld,
1544               LLVMValueRef x,
1545               LLVMValueRef v0,
1546               LLVMValueRef v1,
1547               unsigned flags)
1548 {
1549    const struct lp_type type = bld->type;
1550    LLVMValueRef res;
1551
1552    assert(lp_check_value(type, x));
1553    assert(lp_check_value(type, v0));
1554    assert(lp_check_value(type, v1));
1555
1556    assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1557
1558    if (type.norm) {
1559       struct lp_type wide_type;
1560       struct lp_build_context wide_bld;
1561       LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1562
1563       assert(type.length >= 2);
1564
1565       /*
1566        * Create a wider integer type, enough to hold the
1567        * intermediate result of the multiplication.
1568        */
1569       memset(&wide_type, 0, sizeof wide_type);
1570       wide_type.sign   = type.sign;
1571       wide_type.width  = type.width*2;
1572       wide_type.length = type.length/2;
1573
1574       lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1575
1576       lp_build_unpack2_native(bld->gallivm, type, wide_type, x,  &xl,  &xh);
1577       lp_build_unpack2_native(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1578       lp_build_unpack2_native(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1579
1580       /*
1581        * Lerp both halves.
1582        */
1583
1584       flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1585
1586       resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1587       resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1588
1589       res = lp_build_pack2_native(bld->gallivm, wide_type, type, resl, resh);
1590    } else {
1591       res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1592    }
1593
1594    return res;
1595 }
1596
1597
1598 /**
1599  * Bilinear interpolation.
1600  *
1601  * Values indices are in v_{yx}.
1602  */
1603 LLVMValueRef
1604 lp_build_lerp_2d(struct lp_build_context *bld,
1605                  LLVMValueRef x,
1606                  LLVMValueRef y,
1607                  LLVMValueRef v00,
1608                  LLVMValueRef v01,
1609                  LLVMValueRef v10,
1610                  LLVMValueRef v11,
1611                  unsigned flags)
1612 {
1613    LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1614    LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1615    return lp_build_lerp(bld, y, v0, v1, flags);
1616 }
1617
1618
     /**
      * Trilinear interpolation.
      *
      * Value indices are in v_{zyx}.  The four v0yx values and the four v1yx
      * values are each reduced with lp_build_lerp_2d() using weights x and y;
      * the two results are then interpolated with weight z through
      * lp_build_lerp().  flags is passed unchanged to every call.
      */
1619 LLVMValueRef
1620 lp_build_lerp_3d(struct lp_build_context *bld,
1621                  LLVMValueRef x,
1622                  LLVMValueRef y,
1623                  LLVMValueRef z,
1624                  LLVMValueRef v000,
1625                  LLVMValueRef v001,
1626                  LLVMValueRef v010,
1627                  LLVMValueRef v011,
1628                  LLVMValueRef v100,
1629                  LLVMValueRef v101,
1630                  LLVMValueRef v110,
1631                  LLVMValueRef v111,
1632                  unsigned flags)
1633 {
1634    LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1635    LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1636    return lp_build_lerp(bld, z, v0, v1, flags);
1637 }
1638
1639
1640 /**
1641  * Generate min(a, b)
1642  * Do checks for special cases but not for nans.
      *
      * The following cases are resolved here without calling
      * lp_build_min_simple():
      * - either operand is bld->undef: result is bld->undef
      * - a and b are the same value handle: result is a
      * - normalized unsigned type and either operand is bld->zero: bld->zero
      * - normalized type and one operand is bld->one: the other operand
      * Everything else goes to lp_build_min_simple() with
      * GALLIVM_NAN_BEHAVIOR_UNDEFINED.
1643  */
1644 LLVMValueRef
1645 lp_build_min(struct lp_build_context *bld,
1646              LLVMValueRef a,
1647              LLVMValueRef b)
1648 {
1649    assert(lp_check_value(bld->type, a));
1650    assert(lp_check_value(bld->type, b));
1651
1652    if(a == bld->undef || b == bld->undef)
1653       return bld->undef;
1654
1655    if(a == b)
1656       return a;
1657
1658    if (bld->type.norm) {
1659       if (!bld->type.sign) {
                /* zero is treated as the lower bound of an unsigned norm type */
1660          if (a == bld->zero || b == bld->zero) {
1661             return bld->zero;
1662          }
1663       }
             /* one is treated as the upper bound, so the other operand wins */
1664       if(a == bld->one)
1665          return b;
1666       if(b == bld->one)
1667          return a;
1668    }
1669
1670    return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1671 }
1672
1673
1674 /**
1675  * Generate min(a, b)
1676  * NaN's are handled according to the behavior specified by the
1677  * nan_behavior argument.
      *
      * The same shortcuts as in lp_build_min() are taken first (undef operand,
      * identical operands, bld->zero / bld->one for normalized types); in
      * those cases nan_behavior is not consulted.  Otherwise the work is done
      * by lp_build_min_simple(bld, a, b, nan_behavior).
1678  */
1679 LLVMValueRef
1680 lp_build_min_ext(struct lp_build_context *bld,
1681                  LLVMValueRef a,
1682                  LLVMValueRef b,
1683                  enum gallivm_nan_behavior nan_behavior)
1684 {
1685    assert(lp_check_value(bld->type, a));
1686    assert(lp_check_value(bld->type, b));
1687
1688    if(a == bld->undef || b == bld->undef)
1689       return bld->undef;
1690
1691    if(a == b)
1692       return a;
1693
1694    if (bld->type.norm) {
1695       if (!bld->type.sign) {
                /* zero is treated as the lower bound of an unsigned norm type */
1696          if (a == bld->zero || b == bld->zero) {
1697             return bld->zero;
1698          }
1699       }
             /* one is treated as the upper bound, so the other operand wins */
1700       if(a == bld->one)
1701          return b;
1702       if(b == bld->one)
1703          return a;
1704    }
1705
1706    return lp_build_min_simple(bld, a, b, nan_behavior);
1707 }
1708
1709 /**
1710  * Generate max(a, b)
1711  * Do checks for special cases, but NaN behavior is undefined.
      *
      * The following cases are resolved here without calling
      * lp_build_max_simple():
      * - either operand is bld->undef: result is bld->undef
      * - a and b are the same value handle: result is a
      * - normalized type and either operand is bld->one: bld->one
      * - normalized unsigned type and one operand is bld->zero: the other
      *   operand
      * Everything else goes to lp_build_max_simple() with
      * GALLIVM_NAN_BEHAVIOR_UNDEFINED.
1712  */
1713 LLVMValueRef
1714 lp_build_max(struct lp_build_context *bld,
1715              LLVMValueRef a,
1716              LLVMValueRef b)
1717 {
1718    assert(lp_check_value(bld->type, a));
1719    assert(lp_check_value(bld->type, b));
1720
1721    if(a == bld->undef || b == bld->undef)
1722       return bld->undef;
1723
1724    if(a == b)
1725       return a;
1726
1727    if(bld->type.norm) {
             /* one is treated as the upper bound of a normalized type */
1728       if(a == bld->one || b == bld->one)
1729          return bld->one;
1730       if (!bld->type.sign) {
                /* zero is the lower bound when unsigned: the other operand wins */
1731          if (a == bld->zero) {
1732             return b;
1733          }
1734          if (b == bld->zero) {
1735             return a;
1736          }
1737       }
1738    }
1739
1740    return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1741 }
1742
1743
1744 /**
1745  * Generate max(a, b)
1746  * Checks for special cases.
1747  * NaN's are handled according to the behavior specified by the
1748  * nan_behavior argument.
      *
      * The same shortcuts as in lp_build_max() are taken first (undef operand,
      * identical operands, bld->one / bld->zero for normalized types); in
      * those cases nan_behavior is not consulted.  Otherwise the work is done
      * by lp_build_max_simple(bld, a, b, nan_behavior).
1749  */
1750 LLVMValueRef
1751 lp_build_max_ext(struct lp_build_context *bld,
1752                   LLVMValueRef a,
1753                   LLVMValueRef b,
1754                   enum gallivm_nan_behavior nan_behavior)
1755 {
1756    assert(lp_check_value(bld->type, a));
1757    assert(lp_check_value(bld->type, b));
1758
1759    if(a == bld->undef || b == bld->undef)
1760       return bld->undef;
1761
1762    if(a == b)
1763       return a;
1764
1765    if(bld->type.norm) {
             /* one is treated as the upper bound of a normalized type */
1766       if(a == bld->one || b == bld->one)
1767          return bld->one;
1768       if (!bld->type.sign) {
                /* zero is the lower bound when unsigned: the other operand wins */
1769          if (a == bld->zero) {
1770             return b;
1771          }
1772          if (b == bld->zero) {
1773             return a;
1774          }
1775       }
1776    }
1777
1778    return lp_build_max_simple(bld, a, b, nan_behavior);
1779 }
1780
1781 /**
1782  * Generate clamp(a, min, max)
1783  * NaN behavior (for any of a, min, max) is undefined.
1784  * Do checks for special cases.
      *
      * Computed as lp_build_max(lp_build_min(a, max), min); the special-case
      * checks are the ones performed inside lp_build_min() / lp_build_max().
      * Because the lower bound is applied last, it takes precedence if the
      * caller passes min > max.
1785  */
1786 LLVMValueRef
1787 lp_build_clamp(struct lp_build_context *bld,
1788                LLVMValueRef a,
1789                LLVMValueRef min,
1790                LLVMValueRef max)
1791 {
1792    assert(lp_check_value(bld->type, a));
1793    assert(lp_check_value(bld->type, min));
1794    assert(lp_check_value(bld->type, max));
1795
1796    a = lp_build_min(bld, a, max);
1797    a = lp_build_max(bld, a, min);
1798    return a;
1799 }
1800
1801
1802 /**
1803  * Generate clamp(a, 0, 1)
1804  * A NaN will get converted to zero.
      *
      * The lower bound is applied first through lp_build_max_ext() with
      * GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN and bld->zero as second operand;
      * that is the step expected to replace a NaN by zero.  The upper bound is
      * then applied with plain lp_build_min() against bld->one.
1805  */
1806 LLVMValueRef
1807 lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
1808                                 LLVMValueRef a)
1809 {
1810    a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1811    a = lp_build_min(bld, a, bld->one);
1812    return a;
1813 }
1814
1815
1816 /**
1817  * Generate abs(a)
      *
      * Strategy, in the order tested:
      * - unsigned type: a is returned unchanged
      * - floating type: if HAVE_LLVM is in [0x0306, 0x0309) the top bit of each
      *   element is cleared with an integer AND (workaround for
      *   llvm.org/PR27332); otherwise the llvm.fabs intrinsic is called
      * - signed integer, HAVE_LLVM < 0x0600: the SSSE3 pabs intrinsics for
      *   128-bit vectors or the AVX2 pabs intrinsics for 256-bit vectors, when
      *   the CPU has them and the element width is 8, 16 or 32
      * - anything else: select(a > 0, a, -a)
1818  */
1819 LLVMValueRef
1820 lp_build_abs(struct lp_build_context *bld,
1821              LLVMValueRef a)
1822 {
1823    LLVMBuilderRef builder = bld->gallivm->builder;
1824    const struct lp_type type = bld->type;
1825    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1826
1827    assert(lp_check_value(type, a));
1828
1829    if(!type.sign)
1830       return a;
1831
1832    if(type.floating) {
1833       if (0x0306 <= HAVE_LLVM && HAVE_LLVM < 0x0309) {
1834          /* Workaround llvm.org/PR27332 */
1835          LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
                /* every bit set except bit (width - 1), i.e. the top bit */
1836          unsigned long long absMask = ~(1ULL << (type.width - 1));
1837          LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
1838          a = LLVMBuildBitCast(builder, a, int_vec_type, "");
1839          a = LLVMBuildAnd(builder, a, mask, "");
1840          a = LLVMBuildBitCast(builder, a, vec_type, "");
1841          return a;
1842       } else {
1843          char intrinsic[32];
1844          lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
1845          return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
1846       }
1847    }
1848
          /*
           * Signed integer types from here on.  Widths without a case in the
           * switches below fall out of the switch and use the generic select
           * at the end.
           */
1849    if(type.width*type.length == 128 && util_cpu_caps.has_ssse3 && HAVE_LLVM < 0x0600) {
1850       switch(type.width) {
1851       case 8:
1852          return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1853       case 16:
1854          return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1855       case 32:
1856          return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1857       }
1858    }
1859    else if (type.width*type.length == 256 && util_cpu_caps.has_avx2 && HAVE_LLVM < 0x0600) {
1860       switch(type.width) {
1861       case 8:
1862          return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a);
1863       case 16:
1864          return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.w", vec_type, a);
1865       case 32:
1866          return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.d", vec_type, a);
1867       }
1868    }
1869
          /* generic fallback: a > 0 ? a : -a */
1870    return lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero),
1871                           a, LLVMBuildNeg(builder, a, ""));
1872 }
1873
1874
     /**
      * Generate -a.
      *
      * Emits LLVMBuildFNeg for floating types and LLVMBuildNeg for all other
      * types.
      */
1875 LLVMValueRef
1876 lp_build_negate(struct lp_build_context *bld,
1877                 LLVMValueRef a)
1878 {
1879    LLVMBuilderRef builder = bld->gallivm->builder;
1880
1881    assert(lp_check_value(bld->type, a));
1882
1883    if (bld->type.floating)
1884       a = LLVMBuildFNeg(builder, a, "");
1885    else
1886       a = LLVMBuildNeg(builder, a, "");
1887
1888    return a;
1889 }
1890
1891
1892 /** Return -1, 0 or +1 depending on the sign of a
      *
      * A candidate result for the non-zero case is built first:
      * - unsigned type: bld->one
      * - floating type: the sign bit of a ORed into the bit pattern of bld->one
      * - other signed types: select(a > 0, bld->one, -1)
      * The final select then substitutes bld->zero wherever a == 0.
      */
1893 LLVMValueRef
1894 lp_build_sgn(struct lp_build_context *bld,
1895              LLVMValueRef a)
1896 {
1897    LLVMBuilderRef builder = bld->gallivm->builder;
1898    const struct lp_type type = bld->type;
1899    LLVMValueRef cond;
1900    LLVMValueRef res;
1901
1902    assert(lp_check_value(type, a));
1903
1904    /* Handle non-zero case */
1905    if(!type.sign) {
1906       /* if not zero then sign must be positive */
1907       res = bld->one;
1908    }
1909    else if(type.floating) {
1910       LLVMTypeRef vec_type;
1911       LLVMTypeRef int_type;
1912       LLVMValueRef mask;
1913       LLVMValueRef sign;
1914       LLVMValueRef one;
             /* only bit (width - 1) set: selects the sign bit */
1915       unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1916
1917       int_type = lp_build_int_vec_type(bld->gallivm, type);
1918       vec_type = lp_build_vec_type(bld->gallivm, type);
1919       mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1920
1921       /* Take the sign bit and OR it into the bits of the constant one */
1922       sign = LLVMBuildBitCast(builder, a, int_type, "");
1923       sign = LLVMBuildAnd(builder, sign, mask, "");
1924       one = LLVMConstBitCast(bld->one, int_type);
1925       res = LLVMBuildOr(builder, sign, one, "");
1926       res = LLVMBuildBitCast(builder, res, vec_type, "");
1927    }
1928    else
1929    {
1930       /* signed int/norm/fixed point */
1931       /* could use psign with sse3 and appropriate vectors here */
1932       LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1933       cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1934       res = lp_build_select(bld, cond, bld->one, minus_one);
1935    }
1936
1937    /* Handle zero */
1938    cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1939    res = lp_build_select(bld, cond, bld->zero, res);
1940
1941    return res;
1942 }
1943
1944
1945 /**
1946  * Set the sign of float vector 'a' according to 'sign'.
1947  * If sign==0, return abs(a).
1948  * If sign==1, return -abs(a);
1949  * Other values for sign produce undefined results.
      *
      * Done on the integer bit pattern: the top bit of a is cleared, sign is
      * shifted left by (type.width - 1) and the two are ORed together.
      * NOTE(review): sign is presumably an integer vector laid out like
      * int_vec_type; it is not checked here - confirm against callers.
1950  */
1951 LLVMValueRef
1952 lp_build_set_sign(struct lp_build_context *bld,
1953                   LLVMValueRef a, LLVMValueRef sign)
1954 {
1955    LLVMBuilderRef builder = bld->gallivm->builder;
1956    const struct lp_type type = bld->type;
1957    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1958    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1959    LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
          /* every bit set except the top one */
1960    LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1961                              ~((unsigned long long) 1 << (type.width - 1)));
1962    LLVMValueRef val, res;
1963
1964    assert(type.floating);
1965    assert(lp_check_value(type, a));
1966
1967    /* val = reinterpret_cast<int>(a) */
1968    val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1969    /* val = val & mask */
1970    val = LLVMBuildAnd(builder, val, mask, "");
1971    /* sign = sign << shift */
1972    sign = LLVMBuildShl(builder, sign, shift, "");
1973    /* res = val | sign */
1974    res = LLVMBuildOr(builder, val, sign, "");
1975    /* res = reinterpret_cast<float>(res) */
1976    res = LLVMBuildBitCast(builder, res, vec_type, "");
1977
1978    return res;
1979 }
1980
1981
1982 /**
1983  * Convert vector of (or scalar) int to vector of (or scalar) float.
      *
      * bld describes the floating-point result type (asserted); a is converted
      * with LLVMBuildSIToFP, i.e. it is interpreted as signed.  a itself is
      * not checked against any type here.
1984  */
1985 LLVMValueRef
1986 lp_build_int_to_float(struct lp_build_context *bld,
1987                       LLVMValueRef a)
1988 {
1989    LLVMBuilderRef builder = bld->gallivm->builder;
1990    const struct lp_type type = bld->type;
1991    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1992
1993    assert(type.floating);
1994
1995    return LLVMBuildSIToFP(builder, a, vec_type, "");
1996 }
1997
     /**
      * Tell whether lp_build_round_arch() may be used for the given type.
      *
      * Returns TRUE when any of these holds:
      * - SSE4.1 and the type is scalar (length 1) or 128 bits wide in total
      * - AVX and the type is 256 bits wide in total
      * - AVX512F and the type is 512 bits wide in total
      * - Altivec and the type is 4 x 32 bits
      * - NEON (no condition on the type)
      * and FALSE otherwise.
      */
1998 static boolean
1999 arch_rounding_available(const struct lp_type type)
2000 {
2001    if ((util_cpu_caps.has_sse4_1 &&
2002        (type.length == 1 || type.width*type.length == 128)) ||
2003        (util_cpu_caps.has_avx && type.width*type.length == 256) ||
2004        (util_cpu_caps.has_avx512f && type.width*type.length == 512))
2005       return TRUE;
2006    else if ((util_cpu_caps.has_altivec &&
2007             (type.width == 32 && type.length == 4)))
2008       return TRUE;
2009    else if (util_cpu_caps.has_neon)
2010       return TRUE;
2011
2012    return FALSE;
2013 }
2014
     /**
      * Rounding mode selector taken by lp_build_round_arch() and
      * lp_build_round_altivec().
      */
2015 enum lp_build_round_mode
2016 {
2017    LP_BUILD_ROUND_NEAREST = 0,
2018    LP_BUILD_ROUND_FLOOR = 1,
2019    LP_BUILD_ROUND_CEIL = 2,
2020    LP_BUILD_ROUND_TRUNCATE = 3
2021 };
2022
     /**
      * Convert 32-bit float(s) to integer(s) with the x86 cvt* conversion
      * intrinsics, which round according to the current MXCSR rounding mode.
      *
      * - length 1: a is inserted into element 0 of an undef 4-element vector
      *   and llvm.x86.sse.cvtss2si is called on it
      * - 128-bit vector: llvm.x86.sse2.cvtps2dq
      * - otherwise: asserted to be a 256-bit vector on an AVX-capable CPU,
      *   llvm.x86.avx.cvt.ps2dq.256
      *
      * Asserts that the type is floating with width 32 and that SSE2 is
      * available.  The result has the integer vector type matching bld->type.
      */
2023 static inline LLVMValueRef
2024 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
2025                              LLVMValueRef a)
2026 {
2027    LLVMBuilderRef builder = bld->gallivm->builder;
2028    const struct lp_type type = bld->type;
2029    LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
2030    LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
2031    const char *intrinsic;
2032    LLVMValueRef res;
2033
2034    assert(type.floating);
2035    /* using the double precision conversions is a bit more complicated */
2036    assert(type.width == 32);
2037
2038    assert(lp_check_value(type, a));
2039    assert(util_cpu_caps.has_sse2);
2040
2041    /* This is relying on MXCSR rounding mode, which should always be nearest. */
2042    if (type.length == 1) {
2043       LLVMTypeRef vec_type;
2044       LLVMValueRef undef;
2045       LLVMValueRef arg;
2046       LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
2047
             /* the scalar intrinsic is fed a 4-element vector; only lane 0 is set */
2048       vec_type = LLVMVectorType(bld->elem_type, 4);
2049
2050       intrinsic = "llvm.x86.sse.cvtss2si";
2051
2052       undef = LLVMGetUndef(vec_type);
2053
2054       arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
2055
2056       res = lp_build_intrinsic_unary(builder, intrinsic,
2057                                      ret_type, arg);
2058    }
2059    else {
2060       if (type.width* type.length == 128) {
2061          intrinsic = "llvm.x86.sse2.cvtps2dq";
2062       }
2063       else {
2064          assert(type.width*type.length == 256);
2065          assert(util_cpu_caps.has_avx);
2066
2067          intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
2068       }
2069       res = lp_build_intrinsic_unary(builder, intrinsic,
2070                                      ret_type, a);
2071    }
2072
2073    return res;
2074 }
2075
2076
2077 /*
      * Round a float vector with the Altivec intrinsic selected by mode:
      * vrfin (nearest), vrfim (floor), vrfip (ceil) or vrfiz (truncate).
      * The call is built with bld->vec_type as result type.
      *
      * Asserts a floating type and Altivec support.  intrinsic stays NULL if
      * mode matches none of the enumerators.
2078  */
2079 static inline LLVMValueRef
2080 lp_build_round_altivec(struct lp_build_context *bld,
2081                        LLVMValueRef a,
2082                        enum lp_build_round_mode mode)
2083 {
2084    LLVMBuilderRef builder = bld->gallivm->builder;
2085    const struct lp_type type = bld->type;
2086    const char *intrinsic = NULL;
2087
2088    assert(type.floating);
2089
2090    assert(lp_check_value(type, a));
2091    assert(util_cpu_caps.has_altivec);
2092
          /* type is otherwise only referenced from the asserts above */
2093    (void)type;
2094
2095    switch (mode) {
2096    case LP_BUILD_ROUND_NEAREST:
2097       intrinsic = "llvm.ppc.altivec.vrfin";
2098       break;
2099    case LP_BUILD_ROUND_FLOOR:
2100       intrinsic = "llvm.ppc.altivec.vrfim";
2101       break;
2102    case LP_BUILD_ROUND_CEIL:
2103       intrinsic = "llvm.ppc.altivec.vrfip";
2104       break;
2105    case LP_BUILD_ROUND_TRUNCATE:
2106       intrinsic = "llvm.ppc.altivec.vrfiz";
2107       break;
2108    }
2109
2110    return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2111 }
2112
     /**
      * Round a float value/vector to an integral float using a
      * hardware-backed operation.
      *
      * With SSE4.1 or NEON the generic LLVM intrinsic matching mode is called
      * (llvm.nearbyint, llvm.floor, llvm.ceil or llvm.trunc, with the name
      * completed by lp_format_intrinsic() for bld->vec_type).  Otherwise the
      * work is handed to lp_build_round_altivec().
      *
      * Callers in this file check arch_rounding_available() before calling.
      *
      * NOTE(review): intrinsic_root has no initializer and the switch has no
      * default, so a mode outside the four enumerators would leave it unset.
      */
2113 static inline LLVMValueRef
2114 lp_build_round_arch(struct lp_build_context *bld,
2115                     LLVMValueRef a,
2116                     enum lp_build_round_mode mode)
2117 {
2118    if (util_cpu_caps.has_sse4_1 || util_cpu_caps.has_neon) {
2119       LLVMBuilderRef builder = bld->gallivm->builder;
2120       const struct lp_type type = bld->type;
2121       const char *intrinsic_root;
2122       char intrinsic[32];
2123
2124       assert(type.floating);
2125       assert(lp_check_value(type, a));
             /* type is otherwise only referenced from the asserts above */
2126       (void)type;
2127
2128       switch (mode) {
2129       case LP_BUILD_ROUND_NEAREST:
2130          intrinsic_root = "llvm.nearbyint";
2131          break;
2132       case LP_BUILD_ROUND_FLOOR:
2133          intrinsic_root = "llvm.floor";
2134          break;
2135       case LP_BUILD_ROUND_CEIL:
2136          intrinsic_root = "llvm.ceil";
2137          break;
2138       case LP_BUILD_ROUND_TRUNCATE:
2139          intrinsic_root = "llvm.trunc";
2140          break;
2141       }
2142
2143       lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type);
2144       return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2145    }
2146    else /* (util_cpu_caps.has_altivec) */
2147      return lp_build_round_altivec(bld, a, mode);
2148 }
2149
2150 /**
2151  * Return the integer part of a float (vector) value (== round toward zero).
2152  * The returned value is a float (vector).
2153  * Ex: trunc(-1.5) = -1.0
      *
      * Uses lp_build_round_arch() when arch_rounding_available() says so.
      * The fallback (32-bit floats only, asserted) converts to int and back,
      * and keeps the original input wherever the bit pattern of |a| compares
      * greater than that of 2^24.
2154  */
2155 LLVMValueRef
2156 lp_build_trunc(struct lp_build_context *bld,
2157                LLVMValueRef a)
2158 {
2159    LLVMBuilderRef builder = bld->gallivm->builder;
2160    const struct lp_type type = bld->type;
2161
2162    assert(type.floating);
2163    assert(lp_check_value(type, a));
2164
2165    if (arch_rounding_available(type)) {
2166       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
2167    }
2168    else {
             /* note: shadows the outer 'type' with the same value */
2169       const struct lp_type type = bld->type;
2170       struct lp_type inttype;
2171       struct lp_build_context intbld;
2172       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2173       LLVMValueRef trunc, res, anosign, mask;
2174       LLVMTypeRef int_vec_type = bld->int_vec_type;
2175       LLVMTypeRef vec_type = bld->vec_type;
2176
2177       assert(type.width == 32); /* might want to handle doubles at some point */
2178
             /* integer context with the same layout, used for the compare below */
2179       inttype = type;
2180       inttype.floating = 0;
2181       lp_build_context_init(&intbld, bld->gallivm, inttype);
2182
2183       /* round by truncation */
2184       trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2185       res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2186
2187       /* mask out sign bit */
2188       anosign = lp_build_abs(bld, a);
2189       /*
2190        * mask out all values if anosign > 2^24
2191        * This should work both for large ints (all rounding is no-op for them
2192        * because such floats are always exact) as well as special cases like
2193        * NaNs, Infs (taking advantage of the fact they use max exponent).
2194        * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
2195        */
2196       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2197       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2198       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
             /* where the mask is set keep the input, elsewhere the truncated value */
2199       return lp_build_select(bld, mask, a, res);
2200    }
2201 }
2202
2203
2204 /**
2205  * Return float (vector) rounded to nearest integer (vector).  The returned
2206  * value is a float (vector).
2207  * Ex: round(0.9) = 1.0
2208  * Ex: round(-1.5) = -2.0
      *
      * Uses lp_build_round_arch() when arch_rounding_available() says so.
      * The fallback (32-bit floats only, asserted) rounds with
      * lp_build_iround(), converts the integer back to float, and keeps the
      * original input wherever the bit pattern of |a| compares greater than
      * that of 2^24.
2209  */
2210 LLVMValueRef
2211 lp_build_round(struct lp_build_context *bld,
2212                LLVMValueRef a)
2213 {
2214    LLVMBuilderRef builder = bld->gallivm->builder;
2215    const struct lp_type type = bld->type;
2216
2217    assert(type.floating);
2218    assert(lp_check_value(type, a));
2219
2220    if (arch_rounding_available(type)) {
2221       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2222    }
2223    else {
             /* note: shadows the outer 'type' with the same value */
2224       const struct lp_type type = bld->type;
2225       struct lp_type inttype;
2226       struct lp_build_context intbld;
2227       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2228       LLVMValueRef res, anosign, mask;
2229       LLVMTypeRef int_vec_type = bld->int_vec_type;
2230       LLVMTypeRef vec_type = bld->vec_type;
2231
2232       assert(type.width == 32); /* might want to handle doubles at some point */
2233
             /* integer context with the same layout, used for the compare below */
2234       inttype = type;
2235       inttype.floating = 0;
2236       lp_build_context_init(&intbld, bld->gallivm, inttype);
2237
             /* round to an integer, then convert that integer back to float */
2238       res = lp_build_iround(bld, a);
2239       res = LLVMBuildSIToFP(builder, res, vec_type, "");
2240
2241       /* mask out sign bit */
2242       anosign = lp_build_abs(bld, a);
2243       /*
2244        * mask out all values if anosign > 2^24
2245        * This should work both for large ints (all rounding is no-op for them
2246        * because such floats are always exact) as well as special cases like
2247        * NaNs, Infs (taking advantage of the fact they use max exponent).
2248        * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
2249        */
2250       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2251       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2252       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
             /* where the mask is set keep the input, elsewhere the rounded value */
2253       return lp_build_select(bld, mask, a, res);
2254    }
2255 }
2256
2257
2258 /**
2259  * Return floor of float (vector), result is a float (vector)
2260  * Ex: floor(1.1) = 1.0
2261  * Ex: floor(-1.1) = -2.0
      *
      * Uses lp_build_round_arch() when arch_rounding_available() says so.
      * Without it, non-32-bit floats are handed to the llvm.floor intrinsic,
      * and 32-bit floats are truncated via int conversion, corrected by -1.0
      * where the truncated value ended up above the input (signed types
      * only), and finally replaced by the original input wherever the bit
      * pattern of |a| compares greater than that of 2^24.
2262  */
2263 LLVMValueRef
2264 lp_build_floor(struct lp_build_context *bld,
2265                LLVMValueRef a)
2266 {
2267    LLVMBuilderRef builder = bld->gallivm->builder;
2268    const struct lp_type type = bld->type;
2269
2270    assert(type.floating);
2271    assert(lp_check_value(type, a));
2272
2273    if (arch_rounding_available(type)) {
2274       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2275    }
2276    else {
             /* note: shadows the outer 'type' with the same value */
2277       const struct lp_type type = bld->type;
2278       struct lp_type inttype;
2279       struct lp_build_context intbld;
2280       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2281       LLVMValueRef trunc, res, anosign, mask;
2282       LLVMTypeRef int_vec_type = bld->int_vec_type;
2283       LLVMTypeRef vec_type = bld->vec_type;
2284
             /* the manual sequence below only handles 32-bit floats */
2285       if (type.width != 32) {
2286          char intrinsic[32];
2287          lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type);
2288          return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2289       }
2290
             /* always true at this point because of the early return above */
2291       assert(type.width == 32); /* might want to handle doubles at some point */
2292
             /* integer context with the same layout, used for the mask ops below */
2293       inttype = type;
2294       inttype.floating = 0;
2295       lp_build_context_init(&intbld, bld->gallivm, inttype);
2296
2297       /* round by truncation */
2298       trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2299       res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2300
2301       if (type.sign) {
2302          LLVMValueRef tmp;
2303
2304          /*
2305           * fix values if rounding is wrong (for non-special cases)
2306           * - this is the case if trunc > a
2307           */
2308          mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
2309          /* tmp = trunc > a ? 1.0 : 0.0 */
2310          tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2311          tmp = lp_build_and(&intbld, mask, tmp);
2312          tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2313          res = lp_build_sub(bld, res, tmp);
2314       }
2315
2316       /* mask out sign bit */
2317       anosign = lp_build_abs(bld, a);
2318       /*
2319        * mask out all values if anosign > 2^24
2320        * This should work both for large ints (all rounding is no-op for them
2321        * because such floats are always exact) as well as special cases like
2322        * NaNs, Infs (taking advantage of the fact they use max exponent).
2323        * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
2324        */
2325       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2326       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2327       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
             /* where the mask is set keep the input, elsewhere the floored value */
2328       return lp_build_select(bld, mask, a, res);
2329    }
2330 }
2331
2332
2333 /**
2334  * Return ceiling of float (vector), returning float (vector).
2335  * Ex: ceil( 1.1) = 2.0
2336  * Ex: ceil(-1.1) = -1.0
      *
      * Uses lp_build_round_arch() when arch_rounding_available() says so.
      * Without it, non-32-bit floats are handed to the llvm.ceil intrinsic,
      * and 32-bit floats are truncated via int conversion, corrected by +1.0
      * where the truncated value ended up below the input, and finally
      * replaced by the original input wherever the bit pattern of |a| compares
      * greater than that of 2^24.  Unlike lp_build_floor() the correction step
      * is not conditional on type.sign.
2337  */
2338 LLVMValueRef
2339 lp_build_ceil(struct lp_build_context *bld,
2340               LLVMValueRef a)
2341 {
2342    LLVMBuilderRef builder = bld->gallivm->builder;
2343    const struct lp_type type = bld->type;
2344
2345    assert(type.floating);
2346    assert(lp_check_value(type, a));
2347
2348    if (arch_rounding_available(type)) {
2349       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2350    }
2351    else {
             /* note: shadows the outer 'type' with the same value */
2352       const struct lp_type type = bld->type;
2353       struct lp_type inttype;
2354       struct lp_build_context intbld;
2355       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2356       LLVMValueRef trunc, res, anosign, mask, tmp;
2357       LLVMTypeRef int_vec_type = bld->int_vec_type;
2358       LLVMTypeRef vec_type = bld->vec_type;
2359
             /* the manual sequence below only handles 32-bit floats */
2360       if (type.width != 32) {
2361          char intrinsic[32];
2362          lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type);
2363          return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2364       }
2365
             /* always true at this point because of the early return above */
2366       assert(type.width == 32); /* might want to handle doubles at some point */
2367
             /* integer context with the same layout, used for the mask ops below */
2368       inttype = type;
2369       inttype.floating = 0;
2370       lp_build_context_init(&intbld, bld->gallivm, inttype);
2371
2372       /* round by truncation */
2373       trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2374       trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2375
2376       /*
2377        * fix values if rounding is wrong (for non-special cases)
2378        * - this is the case if trunc < a
2379        */
2380       mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2381       /* tmp = trunc < a ? 1.0 : 0.0 */
2382       tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2383       tmp = lp_build_and(&intbld, mask, tmp);
2384       tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2385       res = lp_build_add(bld, trunc, tmp);
2386
2387       /* mask out sign bit */
2388       anosign = lp_build_abs(bld, a);
2389       /*
2390        * mask out all values if anosign > 2^24
2391        * This should work both for large ints (all rounding is no-op for them
2392        * because such floats are always exact) as well as special cases like
2393        * NaNs, Infs (taking advantage of the fact they use max exponent).
2394        * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
2395        */
2396       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2397       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2398       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
             /* where the mask is set keep the input, elsewhere the ceiled value */
2399       return lp_build_select(bld, mask, a, res);
2400    }
2401 }
2402
2403
2404 /**
2405  * Return fractional part of 'a' computed as a - floor(a)
2406  * Typically used in texture coord arithmetic.
      *
      * Built from lp_build_floor() and lp_build_sub(); requires a floating
      * type (asserted).  No clamping is applied to the result here; see
      * lp_build_fract_safe() for a variant that clamps below one.
2407  */
2408 LLVMValueRef
2409 lp_build_fract(struct lp_build_context *bld,
2410                LLVMValueRef a)
2411 {
2412    assert(bld->type.floating);
2413    return lp_build_sub(bld, a, lp_build_floor(bld, a));
2414 }
2415
2416
2417 /**
2418  * Prevent returning 1.0 for very small negative values of 'a' by clamping
2419  * against 0.99999(9). (Will also return that value for NaNs.)
      *
      * The bound is 1.0 - 1.0 / 2^(lp_mantissa(bld->type) + 1), applied with
      * lp_build_min_ext() using GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN and the
      * bound as second operand.
2420  */
2421 static inline LLVMValueRef
2422 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2423 {
2424    LLVMValueRef max;
2425
2426    /* this is the largest number smaller than 1.0 representable as float */
2427    max = lp_build_const_vec(bld->gallivm, bld->type,
2428                             1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2429    return lp_build_min_ext(bld, fract, max,
2430                            GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
2431 }
2432
2433
2434 /**
2435  * Same as lp_build_fract, but guarantees that the result is always smaller
2436  * than one. Will also return the smaller-than-one value for infs, NaNs.
      *
      * Implemented as clamp_fract(lp_build_fract(a)).
2437  */
2438 LLVMValueRef
2439 lp_build_fract_safe(struct lp_build_context *bld,
2440                     LLVMValueRef a)
2441 {
2442    return clamp_fract(bld, lp_build_fract(bld, a));
2443 }
2444
2445
2446 /**
2447  * Return the integer part of a float (vector) value (== round toward zero).
2448  * The returned value is an integer (vector).
2449  * Ex: itrunc(-1.5) = -1
      *
      * A single LLVMBuildFPToSI to the integer vector type matching bld->type;
      * no range or NaN handling is added here.
2450  */
2451 LLVMValueRef
2452 lp_build_itrunc(struct lp_build_context *bld,
2453                 LLVMValueRef a)
2454 {
2455    LLVMBuilderRef builder = bld->gallivm->builder;
2456    const struct lp_type type = bld->type;
2457    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2458
2459    assert(type.floating);
2460    assert(lp_check_value(type, a));
2461
2462    return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2463 }
2464
2465
2466 /**
2467  * Return float (vector) rounded to nearest integer (vector).  The returned
2468  * value is an integer (vector).
2469  * Ex: iround(0.9) = 1
2470  * Ex: iround(-1.5) = -2
      *
      * Paths, in the order tested:
      * - 32-bit floats of length 1 or 4 with SSE2, or length 8 with AVX:
      *   lp_build_iround_nearest_sse2(), returned directly
      * - arch_rounding_available(): round with lp_build_round_arch() in
      *   NEAREST mode, then convert with FPToSI
      * - otherwise: add a constant just below 0.5 (carrying the sign of a for
      *   signed types), then convert with FPToSI
2471  */
2472 LLVMValueRef
2473 lp_build_iround(struct lp_build_context *bld,
2474                 LLVMValueRef a)
2475 {
2476    LLVMBuilderRef builder = bld->gallivm->builder;
2477    const struct lp_type type = bld->type;
2478    LLVMTypeRef int_vec_type = bld->int_vec_type;
2479    LLVMValueRef res;
2480
2481    assert(type.floating);
2482
2483    assert(lp_check_value(type, a));
2484
2485    if ((util_cpu_caps.has_sse2 &&
2486        ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2487        (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2488       return lp_build_iround_nearest_sse2(bld, a);
2489    }
2490    if (arch_rounding_available(type)) {
2491       res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2492    }
2493    else {
2494       LLVMValueRef half;
2495
             /*
              * nextafterf(0.5, 0.0) is the float adjacent to 0.5 in the
              * direction of zero.  NOTE(review): presumably chosen so that
              * inputs slightly below .5 are not pushed up to the next integer
              * by the addition - confirm.
              */
2496       half = lp_build_const_vec(bld->gallivm, type, nextafterf(0.5, 0.0));
2497
2498       if (type.sign) {
2499          LLVMTypeRef vec_type = bld->vec_type;
2500          LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2501                                     (unsigned long long)1 << (type.width - 1));
2502          LLVMValueRef sign;
2503
2504          /* get sign bit */
2505          sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2506          sign = LLVMBuildAnd(builder, sign, mask, "");
2507
2508          /* sign * 0.5 */
2509          half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2510          half = LLVMBuildOr(builder, sign, half, "");
2511          half = LLVMBuildBitCast(builder, half, vec_type, "");
2512       }
2513
2514       res = LLVMBuildFAdd(builder, a, half, "");
2515    }
2516
          /* res is still a float here; convert it to the integer result */
2517    res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2518
2519    return res;
2520 }
2521
2522
2523 /**
2524  * Return floor of float (vector), result is an int (vector)
2525  * Ex: ifloor(1.1) = 1
2526  * Ex: ifloor(-1.1) = -2
      *
      * Paths:
      * - type.sign not set: a is converted directly with FPToSI
      * - signed with arch_rounding_available(): floor with
      *   lp_build_round_arch(), then FPToSI
      * - signed otherwise: FPToSI, then the compare mask (minus one / zero) of
      *   "truncated value > a" is added to the integer result
2527  */
2528 LLVMValueRef
2529 lp_build_ifloor(struct lp_build_context *bld,
2530                 LLVMValueRef a)
2531 {
2532    LLVMBuilderRef builder = bld->gallivm->builder;
2533    const struct lp_type type = bld->type;
2534    LLVMTypeRef int_vec_type = bld->int_vec_type;
2535    LLVMValueRef res;
2536
2537    assert(type.floating);
2538    assert(lp_check_value(type, a));
2539
2540    res = a;
2541    if (type.sign) {
2542       if (arch_rounding_available(type)) {
2543          res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2544       }
2545       else {
2546          struct lp_type inttype;
2547          struct lp_build_context intbld;
2548          LLVMValueRef trunc, itrunc, mask;
2549
                /* same checks as at function entry */
2550          assert(type.floating);
2551          assert(lp_check_value(type, a));
2552
                /* integer context with the same layout, used for the add below */
2553          inttype = type;
2554          inttype.floating = 0;
2555          lp_build_context_init(&intbld, bld->gallivm, inttype);
2556
2557          /* round by truncation */
2558          itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2559          trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2560
2561          /*
2562           * fix values if rounding is wrong (for non-special cases)
2563           * - this is the case if trunc > a
2564           * The results of doing this with NaNs, very large values etc.
2565           * are undefined but this seems to be the case anyway.
2566           */
2567          mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2568          /* cheapie minus one with mask since the mask is minus one / zero */
2569          return lp_build_add(&intbld, itrunc, mask);
2570       }
2571    }
2572
2573    /* convert to int: res is either the floored value or the unsigned-typed input */
2574    res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2575
2576    return res;
2577 }
2578
2579
2580 /**
2581  * Return ceiling of float (vector), returning int (vector).
2582  * Ex: iceil( 1.1) = 2
2583  * Ex: iceil(-1.1) = -1
2584  */
2585 LLVMValueRef
2586 lp_build_iceil(struct lp_build_context *bld,
2587                LLVMValueRef a)
2588 {
2589    LLVMBuilderRef builder = bld->gallivm->builder;
2590    const struct lp_type type = bld->type;
2591    LLVMTypeRef int_vec_type = bld->int_vec_type;
2592    LLVMValueRef res;
2593
2594    assert(type.floating);
2595    assert(lp_check_value(type, a));
2596
2597    if (arch_rounding_available(type)) {
2598       res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2599    }
2600    else {
2601       struct lp_type inttype;
2602       struct lp_build_context intbld;
2603       LLVMValueRef trunc, itrunc, mask;
2604
2605       assert(type.floating);
2606       assert(lp_check_value(type, a));
2607
2608       inttype = type;
2609       inttype.floating = 0;
2610       lp_build_context_init(&intbld, bld->gallivm, inttype);
2611
2612       /* round by truncation */
2613       itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2614       trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2615
2616       /*
2617        * fix values if rounding is wrong (for non-special cases)
2618        * - this is the case if trunc < a
2619        * The results of doing this with NaNs, very large values etc.
2620        * are undefined but this seems to be the case anyway.
2621        */
2622       mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2623       /* cheapie plus one with mask since the mask is minus one / zero */
2624       return lp_build_sub(&intbld, itrunc, mask);
2625    }
2626
2627    /* round to nearest (toward zero) */
2628    res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2629
2630    return res;
2631 }
2632
2633
2634 /**
2635  * Combined ifloor() & fract().
2636  *
2637  * Preferred to calling the functions separately, as it will ensure that the
2638  * strategy (floor() vs ifloor()) that results in less redundant work is used.
2639  */
2640 void
2641 lp_build_ifloor_fract(struct lp_build_context *bld,
2642                       LLVMValueRef a,
2643                       LLVMValueRef *out_ipart,
2644                       LLVMValueRef *out_fpart)
2645 {
2646    LLVMBuilderRef builder = bld->gallivm->builder;
2647    const struct lp_type type = bld->type;
2648    LLVMValueRef ipart;
2649
2650    assert(type.floating);
2651    assert(lp_check_value(type, a));
2652
2653    if (arch_rounding_available(type)) {
2654       /*
2655        * floor() is easier.
2656        */
2657
2658       ipart = lp_build_floor(bld, a);
2659       *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2660       *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2661    }
2662    else {
2663       /*
2664        * ifloor() is easier.
2665        */
2666
2667       *out_ipart = lp_build_ifloor(bld, a);
2668       ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2669       *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2670    }
2671 }
2672
2673
2674 /**
2675  * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2676  * always smaller than one.
2677  */
2678 void
2679 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2680                            LLVMValueRef a,
2681                            LLVMValueRef *out_ipart,
2682                            LLVMValueRef *out_fpart)
2683 {
2684    lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2685    *out_fpart = clamp_fract(bld, *out_fpart);
2686 }
2687
2688
2689 LLVMValueRef
2690 lp_build_sqrt(struct lp_build_context *bld,
2691               LLVMValueRef a)
2692 {
2693    LLVMBuilderRef builder = bld->gallivm->builder;
2694    const struct lp_type type = bld->type;
2695    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2696    char intrinsic[32];
2697
2698    assert(lp_check_value(type, a));
2699
2700    assert(type.floating);
2701    lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);
2702
2703    return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2704 }
2705
2706
2707 /**
2708  * Do one Newton-Raphson step to improve reciprocate precision:
2709  *
2710  *   x_{i+1} = x_i * (2 - a * x_i)
2711  *
2712  * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2713  * +/-Inf, giving NaN instead.  Certain applications rely on this behavior,
2714  * such as Google Earth, which does RCP(RSQRT(0.0) when drawing the Earth's
2715  * halo. It would be necessary to clamp the argument to prevent this.
2716  *
2717  * See also:
2718  * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2719  * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2720  */
2721 static inline LLVMValueRef
2722 lp_build_rcp_refine(struct lp_build_context *bld,
2723                     LLVMValueRef a,
2724                     LLVMValueRef rcp_a)
2725 {
2726    LLVMBuilderRef builder = bld->gallivm->builder;
2727    LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
2728    LLVMValueRef res;
2729
2730    res = LLVMBuildFMul(builder, a, rcp_a, "");
2731    res = LLVMBuildFSub(builder, two, res, "");
2732    res = LLVMBuildFMul(builder, rcp_a, res, "");
2733
2734    return res;
2735 }
2736
2737
2738 LLVMValueRef
2739 lp_build_rcp(struct lp_build_context *bld,
2740              LLVMValueRef a)
2741 {
2742    LLVMBuilderRef builder = bld->gallivm->builder;
2743    const struct lp_type type = bld->type;
2744
2745    assert(lp_check_value(type, a));
2746
2747    if(a == bld->zero)
2748       return bld->undef;
2749    if(a == bld->one)
2750       return bld->one;
2751    if(a == bld->undef)
2752       return bld->undef;
2753
2754    assert(type.floating);
2755
2756    if(LLVMIsConstant(a))
2757       return LLVMConstFDiv(bld->one, a);
2758
2759    /*
2760     * We don't use RCPPS because:
2761     * - it only has 10bits of precision
2762     * - it doesn't even get the reciprocate of 1.0 exactly
2763     * - doing Newton-Rapshon steps yields wrong (NaN) values for 0.0 or Inf
2764     * - for recent processors the benefit over DIVPS is marginal, a case
2765     *   dependent
2766     *
2767     * We could still use it on certain processors if benchmarks show that the
2768     * RCPPS plus necessary workarounds are still preferrable to DIVPS; or for
2769     * particular uses that require less workarounds.
2770     */
2771
2772    if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2773          (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2774       const unsigned num_iterations = 0;
2775       LLVMValueRef res;
2776       unsigned i;
2777       const char *intrinsic = NULL;
2778
2779       if (type.length == 4) {
2780          intrinsic = "llvm.x86.sse.rcp.ps";
2781       }
2782       else {
2783          intrinsic = "llvm.x86.avx.rcp.ps.256";
2784       }
2785
2786       res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2787
2788       for (i = 0; i < num_iterations; ++i) {
2789          res = lp_build_rcp_refine(bld, a, res);
2790       }
2791
2792       return res;
2793    }
2794
2795    return LLVMBuildFDiv(builder, bld->one, a, "");
2796 }
2797
2798
2799 /**
2800  * Do one Newton-Raphson step to improve rsqrt precision:
2801  *
2802  *   x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2803  *
2804  * See also Intel 64 and IA-32 Architectures Optimization Manual.
2805  */
2806 static inline LLVMValueRef
2807 lp_build_rsqrt_refine(struct lp_build_context *bld,
2808                       LLVMValueRef a,
2809                       LLVMValueRef rsqrt_a)
2810 {
2811    LLVMBuilderRef builder = bld->gallivm->builder;
2812    LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2813    LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2814    LLVMValueRef res;
2815
2816    res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2817    res = LLVMBuildFMul(builder, a, res, "");
2818    res = LLVMBuildFSub(builder, three, res, "");
2819    res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2820    res = LLVMBuildFMul(builder, half, res, "");
2821
2822    return res;
2823 }
2824
2825
2826 /**
2827  * Generate 1/sqrt(a).
2828  * Result is undefined for values < 0, infinity for +0.
2829  */
2830 LLVMValueRef
2831 lp_build_rsqrt(struct lp_build_context *bld,
2832                LLVMValueRef a)
2833 {
2834    const struct lp_type type = bld->type;
2835
2836    assert(lp_check_value(type, a));
2837
2838    assert(type.floating);
2839
2840    /*
2841     * This should be faster but all denormals will end up as infinity.
2842     */
2843    if (0 && lp_build_fast_rsqrt_available(type)) {
2844       const unsigned num_iterations = 1;
2845       LLVMValueRef res;
2846       unsigned i;
2847
2848       /* rsqrt(1.0) != 1.0 here */
2849       res = lp_build_fast_rsqrt(bld, a);
2850
2851       if (num_iterations) {
2852          /*
2853           * Newton-Raphson will result in NaN instead of infinity for zero,
2854           * and NaN instead of zero for infinity.
2855           * Also, need to ensure rsqrt(1.0) == 1.0.
2856           * All numbers smaller than FLT_MIN will result in +infinity
2857           * (rsqrtps treats all denormals as zero).
2858           */
2859          LLVMValueRef cmp;
2860          LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2861          LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);
2862
2863          for (i = 0; i < num_iterations; ++i) {
2864             res = lp_build_rsqrt_refine(bld, a, res);
2865          }
2866          cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2867          res = lp_build_select(bld, cmp, inf, res);
2868          cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2869          res = lp_build_select(bld, cmp, bld->zero, res);
2870          cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2871          res = lp_build_select(bld, cmp, bld->one, res);
2872       }
2873
2874       return res;
2875    }
2876
2877    return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2878 }
2879
2880 /**
2881  * If there's a fast (inaccurate) rsqrt instruction available
2882  * (caller may want to avoid to call rsqrt_fast if it's not available,
2883  * i.e. for calculating x^0.5 it may do rsqrt_fast(x) * x but if
2884  * unavailable it would result in sqrt/div/mul so obviously
2885  * much better to just call sqrt, skipping both div and mul).
2886  */
2887 boolean
2888 lp_build_fast_rsqrt_available(struct lp_type type)
2889 {
2890    assert(type.floating);
2891
2892    if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2893        (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2894       return true;
2895    }
2896    return false;
2897 }
2898
2899
2900 /**
2901  * Generate 1/sqrt(a).
2902  * Result is undefined for values < 0, infinity for +0.
2903  * Precision is limited, only ~10 bits guaranteed
2904  * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2905  */
2906 LLVMValueRef
2907 lp_build_fast_rsqrt(struct lp_build_context *bld,
2908                     LLVMValueRef a)
2909 {
2910    LLVMBuilderRef builder = bld->gallivm->builder;
2911    const struct lp_type type = bld->type;
2912
2913    assert(lp_check_value(type, a));
2914
2915    if (lp_build_fast_rsqrt_available(type)) {
2916       const char *intrinsic = NULL;
2917
2918       if (type.length == 4) {
2919          intrinsic = "llvm.x86.sse.rsqrt.ps";
2920       }
2921       else {
2922          intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2923       }
2924       return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2925    }
2926    else {
2927       debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2928    }
2929    return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2930 }
2931
2932
2933 /**
2934  * Generate sin(a) or cos(a) using polynomial approximation.
2935  * TODO: it might be worth recognizing sin and cos using same source
2936  * (i.e. d3d10 sincos opcode). Obviously doing both at the same time
2937  * would be way cheaper than calculating (nearly) everything twice...
2938  * Not sure it's common enough to be worth bothering however, scs
2939  * opcode could also benefit from calculating both though.
2940  */
2941 static LLVMValueRef
2942 lp_build_sin_or_cos(struct lp_build_context *bld,
2943                     LLVMValueRef a,
2944                     boolean cos)
2945 {
2946    struct gallivm_state *gallivm = bld->gallivm;
2947    LLVMBuilderRef b = gallivm->builder;
2948    struct lp_type int_type = lp_int_type(bld->type);
2949
2950    /*
2951     *  take the absolute value,
2952     *  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2953     */
2954
2955    LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2956    LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2957
2958    LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2959    LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2960
2961    /*
2962     * scale by 4/Pi
2963     * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2964     */
2965
2966    LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2967    LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2968
2969    /*
2970     * store the integer part of y in mm0
2971     * emm2 = _mm_cvttps_epi32(y);
2972     */
2973
2974    LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2975
2976    /*
2977     * j=(j+1) & (~1) (see the cephes sources)
2978     * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2979     */
2980
2981    LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2982    LLVMValueRef emm2_add =  LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2983    /*
2984     * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2985     */
2986    LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2987    LLVMValueRef emm2_and =  LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2988
2989    /*
2990     * y = _mm_cvtepi32_ps(emm2);
2991     */
2992    LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2993
2994    LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2995    LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2996    LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2997    LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2998
2999    /*
3000     * Argument used for poly selection and sign bit determination
3001     * is different for sin vs. cos.
3002     */
3003    LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
3004                                emm2_and;
3005
3006    LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
3007                                                               LLVMBuildNot(b, emm2_2, ""), ""),
3008                                               const_29, "sign_bit") :
3009                                  LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
3010                                                               LLVMBuildShl(b, emm2_add,
3011                                                                            const_29, ""), ""),
3012                                               sign_mask, "sign_bit");
3013
3014    /*
3015     * get the polynom selection mask
3016     * there is one polynom for 0 <= x <= Pi/4
3017     * and another one for Pi/4<x<=Pi/2
3018     * Both branches will be computed.
3019     *
3020     * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
3021     * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
3022     */
3023
3024    LLVMValueRef emm2_3 =  LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
3025    LLVMValueRef poly_mask = lp_build_compare(gallivm,
3026                                              int_type, PIPE_FUNC_EQUAL,
3027                                              emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
3028
3029    /*
3030     * _PS_CONST(minus_cephes_DP1, -0.78515625);
3031     * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
3032     * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
3033     */
3034    LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
3035    LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
3036    LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
3037
3038    /*
3039     * The magic pass: "Extended precision modular arithmetic"
3040     * x = ((x - y * DP1) - y * DP2) - y * DP3;
3041     */
3042    LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
3043    LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
3044    LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);
3045
3046    /*
3047     * Evaluate the first polynom  (0 <= x <= Pi/4)
3048     *
3049     * z = _mm_mul_ps(x,x);
3050     */
3051    LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
3052
3053    /*
3054     * _PS_CONST(coscof_p0,  2.443315711809948E-005);
3055     * _PS_CONST(coscof_p1, -1.388731625493765E-003);
3056     * _PS_CONST(coscof_p2,  4.166664568298827E-002);
3057     */
3058    LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
3059    LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
3060    LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
3061
3062    /*
3063     * y = *(v4sf*)_ps_coscof_p0;
3064     * y = _mm_mul_ps(y, z);
3065     */
3066    LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
3067    LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
3068    LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
3069    LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
3070
3071
3072    /*
3073     * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
3074     * y = _mm_sub_ps(y, tmp);
3075     * y = _mm_add_ps(y, *(v4sf*)_ps_1);
3076     */
3077    LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
3078    LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
3079    LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
3080    LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
3081    LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
3082
3083    /*
3084     * _PS_CONST(sincof_p0, -1.9515295891E-4);
3085     * _PS_CONST(sincof_p1,  8.3321608736E-3);
3086     * _PS_CONST(sincof_p2, -1.6666654611E-1);
3087     */
3088    LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
3089    LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
3090    LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
3091
3092    /*
3093     * Evaluate the second polynom  (Pi/4 <= x <= 0)
3094     *
3095     * y2 = *(v4sf*)_ps_sincof_p0;
3096     * y2 = _mm_mul_ps(y2, z);
3097     * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
3098     * y2 = _mm_mul_ps(y2, z);
3099     * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
3100     * y2 = _mm_mul_ps(y2, z);
3101     * y2 = _mm_mul_ps(y2, x);
3102     * y2 = _mm_add_ps(y2, x);
3103     */
3104
3105    LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
3106    LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
3107    LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
3108    LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);
3109
3110    /*
3111     * select the correct result from the two polynoms
3112     * xmm3 = poly_mask;
3113     * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
3114     * y = _mm_andnot_ps(xmm3, y);
3115     * y = _mm_or_ps(y,y2);
3116     */
3117    LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
3118    LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
3119    LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
3120    LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
3121    LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
3122    LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
3123
3124    /*
3125     * update the sign
3126     * y = _mm_xor_ps(y, sign_bit);
3127     */
3128    LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
3129    LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
3130
3131    LLVMValueRef isfinite = lp_build_isfinite(bld, a);
3132
3133    /* clamp output to be within [-1, 1] */
3134    y_result = lp_build_clamp(bld, y_result,
3135                              lp_build_const_vec(bld->gallivm, bld->type,  -1.f),
3136                              lp_build_const_vec(bld->gallivm, bld->type,  1.f));
3137    /* If a is -inf, inf or NaN then return NaN */
3138    y_result = lp_build_select(bld, isfinite, y_result,
3139                               lp_build_const_vec(bld->gallivm, bld->type,  NAN));
3140    return y_result;
3141 }
3142
3143
3144 /**
3145  * Generate sin(a)
3146  */
3147 LLVMValueRef
3148 lp_build_sin(struct lp_build_context *bld,
3149              LLVMValueRef a)
3150 {
3151    return lp_build_sin_or_cos(bld, a, FALSE);
3152 }
3153
3154
3155 /**
3156  * Generate cos(a)
3157  */
3158 LLVMValueRef
3159 lp_build_cos(struct lp_build_context *bld,
3160              LLVMValueRef a)
3161 {
3162    return lp_build_sin_or_cos(bld, a, TRUE);
3163 }
3164
3165
3166 /**
3167  * Generate pow(x, y)
3168  */
3169 LLVMValueRef
3170 lp_build_pow(struct lp_build_context *bld,
3171              LLVMValueRef x,
3172              LLVMValueRef y)
3173 {
3174    /* TODO: optimize the constant case */
3175    if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3176        LLVMIsConstant(x) && LLVMIsConstant(y)) {
3177       debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3178                    __FUNCTION__);
3179    }
3180
3181    return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
3182 }
3183
3184
3185 /**
3186  * Generate exp(x)
3187  */
3188 LLVMValueRef
3189 lp_build_exp(struct lp_build_context *bld,
3190              LLVMValueRef x)
3191 {
3192    /* log2(e) = 1/log(2) */
3193    LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
3194                                            1.4426950408889634);
3195
3196    assert(lp_check_value(bld->type, x));
3197
3198    return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
3199 }
3200
3201
3202 /**
3203  * Generate log(x)
3204  * Behavior is undefined with infs, 0s and nans
3205  */
3206 LLVMValueRef
3207 lp_build_log(struct lp_build_context *bld,
3208              LLVMValueRef x)
3209 {
3210    /* log(2) */
3211    LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3212                                           0.69314718055994529);
3213
3214    assert(lp_check_value(bld->type, x));
3215
3216    return lp_build_mul(bld, log2, lp_build_log2(bld, x));
3217 }
3218
3219 /**
3220  * Generate log(x) that handles edge cases (infs, 0s and nans)
3221  */
3222 LLVMValueRef
3223 lp_build_log_safe(struct lp_build_context *bld,
3224                   LLVMValueRef x)
3225 {
3226    /* log(2) */
3227    LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3228                                           0.69314718055994529);
3229
3230    assert(lp_check_value(bld->type, x));
3231
3232    return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
3233 }
3234
3235
3236 /**
3237  * Generate polynomial.
3238  * Ex:  coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
3239  */
3240 LLVMValueRef
3241 lp_build_polynomial(struct lp_build_context *bld,
3242                     LLVMValueRef x,
3243                     const double *coeffs,
3244                     unsigned num_coeffs)
3245 {
3246    const struct lp_type type = bld->type;
3247    LLVMValueRef even = NULL, odd = NULL;
3248    LLVMValueRef x2;
3249    unsigned i;
3250
3251    assert(lp_check_value(bld->type, x));
3252
3253    /* TODO: optimize the constant case */
3254    if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3255        LLVMIsConstant(x)) {
3256       debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3257                    __FUNCTION__);
3258    }
3259
3260    /*
3261     * Calculate odd and even terms seperately to decrease data dependency
3262     * Ex:
3263     *     c[0] + x^2 * c[2] + x^4 * c[4] ...
3264     *     + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
3265     */
3266    x2 = lp_build_mul(bld, x, x);
3267
3268    for (i = num_coeffs; i--; ) {
3269       LLVMValueRef coeff;
3270
3271       coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
3272
3273       if (i % 2 == 0) {
3274          if (even)
3275             even = lp_build_mad(bld, x2, even, coeff);
3276          else
3277             even = coeff;
3278       } else {
3279          if (odd)
3280             odd = lp_build_mad(bld, x2, odd, coeff);
3281          else
3282             odd = coeff;
3283       }
3284    }
3285
3286    if (odd)
3287       return lp_build_mad(bld, odd, x, even);
3288    else if (even)
3289       return even;
3290    else
3291       return bld->undef;
3292 }
3293
3294
/**
 * Minimax polynomial fit of 2**x, in range [0, 1[
 *
 * Coefficients are listed lowest power first, which is the order
 * lp_build_polynomial() expects. The set is chosen at compile time by
 * EXP_POLY_DEGREE (degrees 2 to 5 are provided; any other value is a
 * compile error).
 */
const double lp_build_exp2_polynomial[] = {
#if EXP_POLY_DEGREE == 5
   1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
   0.693153073200168932794,
   0.240153617044375388211,
   0.0558263180532956664775,
   0.00898934009049466391101,
   0.00187757667519147912699
#elif EXP_POLY_DEGREE == 4
   1.00000259337069434683,
   0.693003834469974940458,
   0.24144275689150793076,
   0.0520114606103070150235,
   0.0135341679161270268764
#elif EXP_POLY_DEGREE == 3
   0.999925218562710312959,
   0.695833540494823811697,
   0.226067155427249155588,
   0.0780245226406372992967
#elif EXP_POLY_DEGREE == 2
   1.00172476321474503578,
   0.657636275736077639316,
   0.33718943461968720704
#else
#error
#endif
};
3325
3326
3327 LLVMValueRef
3328 lp_build_exp2(struct lp_build_context *bld,
3329               LLVMValueRef x)
3330 {
3331    LLVMBuilderRef builder = bld->gallivm->builder;
3332    const struct lp_type type = bld->type;
3333    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3334    LLVMValueRef ipart = NULL;
3335    LLVMValueRef fpart = NULL;
3336    LLVMValueRef expipart = NULL;
3337    LLVMValueRef expfpart = NULL;
3338    LLVMValueRef res = NULL;
3339
3340    assert(lp_check_value(bld->type, x));
3341
3342    /* TODO: optimize the constant case */
3343    if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3344        LLVMIsConstant(x)) {
3345       debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3346                    __FUNCTION__);
3347    }
3348
3349    assert(type.floating && type.width == 32);
3350
3351    /* We want to preserve NaN and make sure than for exp2 if x > 128,
3352     * the result is INF  and if it's smaller than -126.9 the result is 0 */
3353    x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type,  128.0), x,
3354                         GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3355    x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
3356                         x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3357
3358    /* ipart = floor(x) */
3359    /* fpart = x - ipart */
3360    lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3361
3362    /* expipart = (float) (1 << ipart) */
3363    expipart = LLVMBuildAdd(builder, ipart,
3364                            lp_build_const_int_vec(bld->gallivm, type, 127), "");
3365    expipart = LLVMBuildShl(builder, expipart,
3366                            lp_build_const_int_vec(bld->gallivm, type, 23), "");
3367    expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
3368
3369    expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3370                                   ARRAY_SIZE(lp_build_exp2_polynomial));
3371
3372    res = LLVMBuildFMul(builder, expipart, expfpart, "");
3373
3374    return res;
3375 }
3376
3377
3378
3379 /**
3380  * Extract the exponent of a IEEE-754 floating point value.
3381  *
3382  * Optionally apply an integer bias.
3383  *
3384  * Result is an integer value with
3385  *
3386  *   ifloor(log2(x)) + bias
3387  */
3388 LLVMValueRef
3389 lp_build_extract_exponent(struct lp_build_context *bld,
3390                           LLVMValueRef x,
3391                           int bias)
3392 {
3393    LLVMBuilderRef builder = bld->gallivm->builder;
3394    const struct lp_type type = bld->type;
3395    unsigned mantissa = lp_mantissa(type);
3396    LLVMValueRef res;
3397
3398    assert(type.floating);
3399
3400    assert(lp_check_value(bld->type, x));
3401
3402    x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3403
3404    res = LLVMBuildLShr(builder, x,
3405                        lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3406    res = LLVMBuildAnd(builder, res,
3407                       lp_build_const_int_vec(bld->gallivm, type, 255), "");
3408    res = LLVMBuildSub(builder, res,
3409                       lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3410
3411    return res;
3412 }
3413
3414
3415 /**
3416  * Extract the mantissa of the a floating.
3417  *
3418  * Result is a floating point value with
3419  *
3420  *   x / floor(log2(x))
3421  */
3422 LLVMValueRef
3423 lp_build_extract_mantissa(struct lp_build_context *bld,
3424                           LLVMValueRef x)
3425 {
3426    LLVMBuilderRef builder = bld->gallivm->builder;
3427    const struct lp_type type = bld->type;
3428    unsigned mantissa = lp_mantissa(type);
3429    LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3430                                                   (1ULL << mantissa) - 1);
3431    LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3432    LLVMValueRef res;
3433
3434    assert(lp_check_value(bld->type, x));
3435
3436    assert(type.floating);
3437
3438    x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3439
3440    /* res = x / 2**ipart */
3441    res = LLVMBuildAnd(builder, x, mantmask, "");
3442    res = LLVMBuildOr(builder, res, one, "");
3443    res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3444
3445    return res;
3446 }
3447
3448
3449
/**
 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x),
 * for x in range of [0, 1/9[
 * These coefficients can be generated with
 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
 *
 * The set is chosen at compile time by LOG_POLY_DEGREE (degrees 3 to 5
 * are provided; any other value is a compile error).
 */
const double lp_build_log2_polynomial[] = {
#if LOG_POLY_DEGREE == 5
   2.88539008148777786488L,
   0.961796878841293367824L,
   0.577058946784739859012L,
   0.412914355135828735411L,
   0.308591899232910175289L,
   0.352376952300281371868L,
#elif LOG_POLY_DEGREE == 4
   2.88539009343309178325L,
   0.961791550404184197881L,
   0.577440339438736392009L,
   0.403343858251329912514L,
   0.406718052498846252698L,
#elif LOG_POLY_DEGREE == 3
   2.88538959748872753838L,
   0.961932915889597772928L,
   0.571118517972136195241L,
   0.493997535084709500285L,
#else
#error
#endif
};
3478
3479 /**
3480  * See http://www.devmaster.net/forums/showthread.php?p=43580
3481  * http://en.wikipedia.org/wiki/Logarithm#Calculation
3482  * http://www.nezumi.demon.co.uk/consult/logx.htm
3483  *
3484  * If handle_edge_cases is true the function will perform computations
3485  * to match the required D3D10+ behavior for each of the edge cases.
3486  * That means that if input is:
3487  * - less than zero (to and including -inf) then NaN will be returned
3488  * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3489  * - +infinity, then +infinity will be returned
3490  * - NaN, then NaN will be returned
3491  *
3492  * Those checks are fairly expensive so if you don't need them make sure
3493  * handle_edge_cases is false.
3494  */
3495 void
3496 lp_build_log2_approx(struct lp_build_context *bld,
3497                      LLVMValueRef x,
3498                      LLVMValueRef *p_exp,
3499                      LLVMValueRef *p_floor_log2,
3500                      LLVMValueRef *p_log2,
3501                      boolean handle_edge_cases)
3502 {
3503    LLVMBuilderRef builder = bld->gallivm->builder;
3504    const struct lp_type type = bld->type;
3505    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3506    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3507
3508    LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3509    LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3510    LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3511
3512    LLVMValueRef i = NULL;
3513    LLVMValueRef y = NULL;
3514    LLVMValueRef z = NULL;
3515    LLVMValueRef exp = NULL;
3516    LLVMValueRef mant = NULL;
3517    LLVMValueRef logexp = NULL;
3518    LLVMValueRef p_z = NULL;
3519    LLVMValueRef res = NULL;
3520
3521    assert(lp_check_value(bld->type, x));
3522
3523    if(p_exp || p_floor_log2 || p_log2) {
3524       /* TODO: optimize the constant case */
3525       if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3526           LLVMIsConstant(x)) {
3527          debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3528                       __FUNCTION__);
3529       }
3530
3531       assert(type.floating && type.width == 32);
3532
3533       /*
3534        * We don't explicitly handle denormalized numbers. They will yield a
3535        * result in the neighbourhood of -127, which appears to be adequate
3536        * enough.
3537        */
3538
3539       i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3540
3541       /* exp = (float) exponent(x) */
3542       exp = LLVMBuildAnd(builder, i, expmask, "");
3543    }
3544
3545    if(p_floor_log2 || p_log2) {
3546       logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3547       logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3548       logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3549    }
3550
3551    if (p_log2) {
3552       /* mant = 1 + (float) mantissa(x) */
3553       mant = LLVMBuildAnd(builder, i, mantmask, "");
3554       mant = LLVMBuildOr(builder, mant, one, "");
3555       mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3556
3557       /* y = (mant - 1) / (mant + 1) */
3558       y = lp_build_div(bld,
3559          lp_build_sub(bld, mant, bld->one),
3560          lp_build_add(bld, mant, bld->one)
3561       );
3562
3563       /* z = y^2 */
3564       z = lp_build_mul(bld, y, y);
3565
3566       /* compute P(z) */
3567       p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3568                                 ARRAY_SIZE(lp_build_log2_polynomial));
3569
3570       /* y * P(z) + logexp */
3571       res = lp_build_mad(bld, y, p_z, logexp);
3572
3573       if (type.floating && handle_edge_cases) {
3574          LLVMValueRef negmask, infmask,  zmask;
3575          negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3576                                 lp_build_const_vec(bld->gallivm, type,  0.0f));
3577          zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3578                               lp_build_const_vec(bld->gallivm, type,  0.0f));
3579          infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3580                                 lp_build_const_vec(bld->gallivm, type,  INFINITY));
3581
3582          /* If x is qual to inf make sure we return inf */
3583          res = lp_build_select(bld, infmask,
3584                                lp_build_const_vec(bld->gallivm, type,  INFINITY),
3585                                res);
3586          /* If x is qual to 0, return -inf */
3587          res = lp_build_select(bld, zmask,
3588                                lp_build_const_vec(bld->gallivm, type,  -INFINITY),
3589                                res);
3590          /* If x is nan or less than 0, return nan */
3591          res = lp_build_select(bld, negmask,
3592                                lp_build_const_vec(bld->gallivm, type,  NAN),
3593                                res);
3594       }
3595    }
3596
3597    if (p_exp) {
3598       exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3599       *p_exp = exp;
3600    }
3601
3602    if (p_floor_log2)
3603       *p_floor_log2 = logexp;
3604
3605    if (p_log2)
3606       *p_log2 = res;
3607 }
3608
3609
3610 /*
3611  * log2 implementation which doesn't have special code to
3612  * handle edge cases (-inf, 0, inf, NaN). It's faster but
3613  * the results for those cases are undefined.
3614  */
3615 LLVMValueRef
3616 lp_build_log2(struct lp_build_context *bld,
3617               LLVMValueRef x)
3618 {
3619    LLVMValueRef res;
3620    lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
3621    return res;
3622 }
3623
3624 /*
3625  * Version of log2 which handles all edge cases.
3626  * Look at documentation of lp_build_log2_approx for
3627  * description of the behavior for each of the edge cases.
3628  */
3629 LLVMValueRef
3630 lp_build_log2_safe(struct lp_build_context *bld,
3631                    LLVMValueRef x)
3632 {
3633    LLVMValueRef res;
3634    lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
3635    return res;
3636 }
3637
3638
3639 /**
3640  * Faster (and less accurate) log2.
3641  *
3642  *    log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3643  *
3644  * Piece-wise linear approximation, with exact results when x is a
3645  * power of two.
3646  *
3647  * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3648  */
3649 LLVMValueRef
3650 lp_build_fast_log2(struct lp_build_context *bld,
3651                    LLVMValueRef x)
3652 {
3653    LLVMBuilderRef builder = bld->gallivm->builder;
3654    LLVMValueRef ipart;
3655    LLVMValueRef fpart;
3656
3657    assert(lp_check_value(bld->type, x));
3658
3659    assert(bld->type.floating);
3660
3661    /* ipart = floor(log2(x)) - 1 */
3662    ipart = lp_build_extract_exponent(bld, x, -1);
3663    ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3664
3665    /* fpart = x / 2**ipart */
3666    fpart = lp_build_extract_mantissa(bld, x);
3667
3668    /* ipart + fpart */
3669    return LLVMBuildFAdd(builder, ipart, fpart, "");
3670 }
3671
3672
3673 /**
3674  * Fast implementation of iround(log2(x)).
3675  *
3676  * Not an approximation -- it should give accurate results all the time.
3677  */
3678 LLVMValueRef
3679 lp_build_ilog2(struct lp_build_context *bld,
3680                LLVMValueRef x)
3681 {
3682    LLVMBuilderRef builder = bld->gallivm->builder;
3683    LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3684    LLVMValueRef ipart;
3685
3686    assert(bld->type.floating);
3687
3688    assert(lp_check_value(bld->type, x));
3689
3690    /* x * 2^(0.5)   i.e., add 0.5 to the log2(x) */
3691    x = LLVMBuildFMul(builder, x, sqrt2, "");
3692
3693    /* ipart = floor(log2(x) + 0.5)  */
3694    ipart = lp_build_extract_exponent(bld, x, 0);
3695
3696    return ipart;
3697 }
3698
3699 LLVMValueRef
3700 lp_build_mod(struct lp_build_context *bld,
3701              LLVMValueRef x,
3702              LLVMValueRef y)
3703 {
3704    LLVMBuilderRef builder = bld->gallivm->builder;
3705    LLVMValueRef res;
3706    const struct lp_type type = bld->type;
3707
3708    assert(lp_check_value(type, x));
3709    assert(lp_check_value(type, y));
3710
3711    if (type.floating)
3712       res = LLVMBuildFRem(builder, x, y, "");
3713    else if (type.sign)
3714       res = LLVMBuildSRem(builder, x, y, "");
3715    else
3716       res = LLVMBuildURem(builder, x, y, "");
3717    return res;
3718 }
3719
3720
3721 /*
3722  * For floating inputs it creates and returns a mask
3723  * which is all 1's for channels which are NaN.
3724  * Channels inside x which are not NaN will be 0.
3725  */
3726 LLVMValueRef
3727 lp_build_isnan(struct lp_build_context *bld,
3728                LLVMValueRef x)
3729 {
3730    LLVMValueRef mask;
3731    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3732
3733    assert(bld->type.floating);
3734    assert(lp_check_value(bld->type, x));
3735
3736    mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3737                         "isnotnan");
3738    mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3739    mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3740    return mask;
3741 }
3742
3743 /* Returns all 1's for floating point numbers that are
3744  * finite numbers and returns all zeros for -inf,
3745  * inf and nan's */
3746 LLVMValueRef
3747 lp_build_isfinite(struct lp_build_context *bld,
3748                   LLVMValueRef x)
3749 {
3750    LLVMBuilderRef builder = bld->gallivm->builder;
3751    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3752    struct lp_type int_type = lp_int_type(bld->type);
3753    LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3754    LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3755                                                     0x7f800000);
3756
3757    if (!bld->type.floating) {
3758       return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3759    }
3760    assert(bld->type.floating);
3761    assert(lp_check_value(bld->type, x));
3762    assert(bld->type.width == 32);
3763
3764    intx = LLVMBuildAnd(builder, intx, infornan32, "");
3765    return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3766                            intx, infornan32);
3767 }
3768
3769 /*
3770  * Returns true if the number is nan or inf and false otherwise.
3771  * The input has to be a floating point vector.
3772  */
3773 LLVMValueRef
3774 lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
3775                        const struct lp_type type,
3776                        LLVMValueRef x)
3777 {
3778    LLVMBuilderRef builder = gallivm->builder;
3779    struct lp_type int_type = lp_int_type(type);
3780    LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
3781                                                 0x7f800000);
3782    LLVMValueRef ret;
3783
3784    assert(type.floating);
3785
3786    ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
3787    ret = LLVMBuildAnd(builder, ret, const0, "");
3788    ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
3789                           ret, const0);
3790
3791    return ret;
3792 }
3793
3794
3795 LLVMValueRef
3796 lp_build_fpstate_get(struct gallivm_state *gallivm)
3797 {
3798    if (util_cpu_caps.has_sse) {
3799       LLVMBuilderRef builder = gallivm->builder;
3800       LLVMValueRef mxcsr_ptr = lp_build_alloca(
3801          gallivm,
3802          LLVMInt32TypeInContext(gallivm->context),
3803          "mxcsr_ptr");
3804       LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
3805           LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3806       lp_build_intrinsic(builder,
3807                          "llvm.x86.sse.stmxcsr",
3808                          LLVMVoidTypeInContext(gallivm->context),
3809                          &mxcsr_ptr8, 1, 0);
3810       return mxcsr_ptr;
3811    }
3812    return 0;
3813 }
3814
3815 void
3816 lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
3817                                   boolean zero)
3818 {
3819    if (util_cpu_caps.has_sse) {
3820       /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
3821       int daz_ftz = _MM_FLUSH_ZERO_MASK;
3822
3823       LLVMBuilderRef builder = gallivm->builder;
3824       LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
3825       LLVMValueRef mxcsr =
3826          LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");
3827
3828       if (util_cpu_caps.has_daz) {
3829          /* Enable denormals are zero mode */
3830          daz_ftz |= _MM_DENORMALS_ZERO_MASK;
3831       }
3832       if (zero) {
3833          mxcsr = LLVMBuildOr(builder, mxcsr,
3834                              LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
3835       } else {
3836          mxcsr = LLVMBuildAnd(builder, mxcsr,
3837                               LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
3838       }
3839
3840       LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
3841       lp_build_fpstate_set(gallivm, mxcsr_ptr);
3842    }
3843 }
3844
3845 void
3846 lp_build_fpstate_set(struct gallivm_state *gallivm,
3847                      LLVMValueRef mxcsr_ptr)
3848 {
3849    if (util_cpu_caps.has_sse) {
3850       LLVMBuilderRef builder = gallivm->builder;
3851       mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
3852                      LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3853       lp_build_intrinsic(builder,
3854                          "llvm.x86.sse.ldmxcsr",
3855                          LLVMVoidTypeInContext(gallivm->context),
3856                          &mxcsr_ptr, 1, 0);
3857    }
3858 }