/**************************************************************************
 *
 * Copyright 2009-2010 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

/**
 * @file
 * Helper
 *
 * LLVM IR doesn't support all basic arithmetic operations we care about (most
 * notably min/max and saturated operations), and it is often necessary to
 * resort to machine-specific intrinsics directly. The functions here hide all
 * these implementation details from the other modules.
 *
 * We also do simple expression simplification here. Reasons are:
 * - it is very easy given we have all necessary information readily available
 * - LLVM optimization passes fail to simplify several vector expressions
 * - we often know value constraints which the optimization passes have no way
 *   of knowing, such as when source arguments are known to be in [0, 1] range.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */

#include "util/u_memory.h"
#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/u_cpu_detect.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_init.h"
#include "lp_bld_intr.h"
#include "lp_bld_logic.h"
#include "lp_bld_pack.h"
#include "lp_bld_debug.h"
#include "lp_bld_bitarit.h"
#include "lp_bld_arit.h"
#include "lp_bld_flow.h"

#if defined(PIPE_ARCH_SSE)
#include <xmmintrin.h>
#endif

#ifndef _MM_DENORMALS_ZERO_MASK
#define _MM_DENORMALS_ZERO_MASK 0x0040
#endif

#ifndef _MM_FLUSH_ZERO_MASK
#define _MM_FLUSH_ZERO_MASK 0x8000
#endif

#define EXP_POLY_DEGREE 5

#define LOG_POLY_DEGREE 4

/**
 * Generate min(a, b)
 * No checks for special case values of a or b = 1 or 0 are done.
 * NaN's are handled according to the behavior specified by the
 * nan_behavior argument.
 */
static LLVMValueRef
lp_build_min_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b,
                    enum gallivm_nan_behavior nan_behavior)
{
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   unsigned intr_size = 0;
   LLVMValueRef cond;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* TODO: optimize the constant case */

   if (type.floating && util_cpu_caps.has_sse) {
      if (type.width == 32) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse.min.ss";
            intr_size = 128;
         }
         else if (type.length <= 4 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse.min.ps";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.min.ps.256";
            intr_size = 256;
         }
      }
      if (type.width == 64 && util_cpu_caps.has_sse2) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse2.min.sd";
            intr_size = 128;
         }
         else if (type.length == 2 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse2.min.pd";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.min.pd.256";
            intr_size = 256;
         }
      }
   }
   else if (type.floating && util_cpu_caps.has_altivec) {
      if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
          nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
         debug_printf("%s: altivec doesn't support nan return nan behavior\n",
                      __FUNCTION__);
      }
      if (type.width == 32 && type.length == 4) {
         intrinsic = "llvm.ppc.altivec.vminfp";
         intr_size = 128;
      }
   } else if (HAVE_LLVM < 0x0309 &&
              util_cpu_caps.has_avx2 && type.length > 4) {
      intr_size = 256;
      switch (type.width) {
      case 8:
         intrinsic = type.sign ? "llvm.x86.avx2.pmins.b" : "llvm.x86.avx2.pminu.b";
         break;
      case 16:
         intrinsic = type.sign ? "llvm.x86.avx2.pmins.w" : "llvm.x86.avx2.pminu.w";
         break;
      case 32:
         intrinsic = type.sign ? "llvm.x86.avx2.pmins.d" : "llvm.x86.avx2.pminu.d";
         break;
      }
   } else if (HAVE_LLVM < 0x0309 &&
              util_cpu_caps.has_sse2 && type.length >= 2) {
      intr_size = 128;
      if ((type.width == 8 || type.width == 16) &&
          (type.width * type.length <= 64) &&
          (gallivm_debug & GALLIVM_DEBUG_PERF)) {
         debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
                      __FUNCTION__);
      }
      if (type.width == 8 && !type.sign) {
         intrinsic = "llvm.x86.sse2.pminu.b";
      }
      else if (type.width == 16 && type.sign) {
         intrinsic = "llvm.x86.sse2.pmins.w";
      }
      if (util_cpu_caps.has_sse4_1) {
         if (type.width == 8 && type.sign) {
            intrinsic = "llvm.x86.sse41.pminsb";
         }
         if (type.width == 16 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pminuw";
         }
         if (type.width == 32 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pminud";
         }
         if (type.width == 32 && type.sign) {
            intrinsic = "llvm.x86.sse41.pminsd";
         }
      }
   } else if (util_cpu_caps.has_altivec) {
      intr_size = 128;
      if (type.width == 8) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vminub";
         } else {
            intrinsic = "llvm.ppc.altivec.vminsb";
         }
      } else if (type.width == 16) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vminuh";
         } else {
            intrinsic = "llvm.ppc.altivec.vminsh";
         }
      } else if (type.width == 32) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vminuw";
         } else {
            intrinsic = "llvm.ppc.altivec.vminsw";
         }
      }
   }

   if (intrinsic) {
      /* We need to handle nan's for floating point numbers. If one of the
       * inputs is nan the other should be returned (required by both D3D10+
       * and OpenCL).
       * The sse intrinsics return the second operand in case of nan by
       * default, so we need special code to handle those.
       */
      if (util_cpu_caps.has_sse && type.floating &&
          nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
          nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
          nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
         LLVMValueRef isnan, min;
         min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                   type, intr_size, a, b);
         if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
            isnan = lp_build_isnan(bld, b);
            return lp_build_select(bld, isnan, a, min);
         } else {
            assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
            isnan = lp_build_isnan(bld, a);
            return lp_build_select(bld, isnan, a, min);
         }
      } else {
         return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                    type, intr_size, a, b);
      }
   }

   if (type.floating) {
      switch (nan_behavior) {
      case GALLIVM_NAN_RETURN_NAN: {
         LLVMValueRef isnan = lp_build_isnan(bld, b);
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
         cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
         return lp_build_select(bld, cond, a, b);
      }
      case GALLIVM_NAN_RETURN_OTHER: {
         LLVMValueRef isnan = lp_build_isnan(bld, a);
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
         cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
         return lp_build_select(bld, cond, a, b);
      }
      case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
         cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
         return lp_build_select(bld, cond, a, b);
      case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
         return lp_build_select(bld, cond, b, a);
      case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
         return lp_build_select(bld, cond, a, b);
      default:
         assert(0);
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
         return lp_build_select(bld, cond, a, b);
      }
   } else {
      cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
      return lp_build_select(bld, cond, a, b);
   }
}

LLVMValueRef
lp_build_fmuladd(LLVMBuilderRef builder,
                 LLVMValueRef a,
                 LLVMValueRef b,
                 LLVMValueRef c)
{
   LLVMTypeRef type = LLVMTypeOf(a);
   assert(type == LLVMTypeOf(b));
   assert(type == LLVMTypeOf(c));
   if (HAVE_LLVM < 0x0304) {
      /* XXX: LLVM 3.3 does not break down llvm.fmuladd into mul+add when FMA
       * is not supported, and instead falls back to a C function.
       */
      return LLVMBuildFAdd(builder, LLVMBuildFMul(builder, a, b, ""), c, "");
   }
   char intrinsic[32];
   lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
   LLVMValueRef args[] = { a, b, c };
   return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0);
}
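
/*
 * Illustrative use of lp_build_fmuladd (hypothetical caller, not part of
 * this file): computing r = a * b + c for float values or vectors is just
 *
 *    LLVMValueRef r = lp_build_fmuladd(builder, a, b, c);
 *
 * llvm.fmuladd leaves it up to LLVM whether to fuse the operation into a
 * single FMA instruction or to emit separate fmul + fadd.
 */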

/**
 * Generate max(a, b)
 * No checks for special case values of a or b = 1 or 0 are done.
 * NaN's are handled according to the behavior specified by the
 * nan_behavior argument.
 */
static LLVMValueRef
lp_build_max_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b,
                    enum gallivm_nan_behavior nan_behavior)
{
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   unsigned intr_size = 0;
   LLVMValueRef cond;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* TODO: optimize the constant case */

   if (type.floating && util_cpu_caps.has_sse) {
      if (type.width == 32) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse.max.ss";
            intr_size = 128;
         }
         else if (type.length <= 4 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse.max.ps";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.max.ps.256";
            intr_size = 256;
         }
      }
      if (type.width == 64 && util_cpu_caps.has_sse2) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse2.max.sd";
            intr_size = 128;
         }
         else if (type.length == 2 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse2.max.pd";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.max.pd.256";
            intr_size = 256;
         }
      }
   }
   else if (type.floating && util_cpu_caps.has_altivec) {
      if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
          nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
         debug_printf("%s: altivec doesn't support nan return nan behavior\n",
                      __FUNCTION__);
      }
      if (type.width == 32 && type.length == 4) {
         intrinsic = "llvm.ppc.altivec.vmaxfp";
         intr_size = 128;
      }
   } else if (HAVE_LLVM < 0x0309 &&
              util_cpu_caps.has_avx2 && type.length > 4) {
      intr_size = 256;
      switch (type.width) {
      case 8:
         intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.b" : "llvm.x86.avx2.pmaxu.b";
         break;
      case 16:
         intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.w" : "llvm.x86.avx2.pmaxu.w";
         break;
      case 32:
         intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.d" : "llvm.x86.avx2.pmaxu.d";
         break;
      }
   } else if (HAVE_LLVM < 0x0309 &&
              util_cpu_caps.has_sse2 && type.length >= 2) {
      intr_size = 128;
      if ((type.width == 8 || type.width == 16) &&
          (type.width * type.length <= 64) &&
          (gallivm_debug & GALLIVM_DEBUG_PERF)) {
         debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
                      __FUNCTION__);
      }
      if (type.width == 8 && !type.sign) {
         intrinsic = "llvm.x86.sse2.pmaxu.b";
      }
      else if (type.width == 16 && type.sign) {
         intrinsic = "llvm.x86.sse2.pmaxs.w";
      }
      if (util_cpu_caps.has_sse4_1) {
         if (type.width == 8 && type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxsb";
         }
         if (type.width == 16 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxuw";
         }
         if (type.width == 32 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxud";
         }
         if (type.width == 32 && type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxsd";
         }
      }
   } else if (util_cpu_caps.has_altivec) {
      intr_size = 128;
      if (type.width == 8) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vmaxub";
         } else {
            intrinsic = "llvm.ppc.altivec.vmaxsb";
         }
      } else if (type.width == 16) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vmaxuh";
         } else {
            intrinsic = "llvm.ppc.altivec.vmaxsh";
         }
      } else if (type.width == 32) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vmaxuw";
         } else {
            intrinsic = "llvm.ppc.altivec.vmaxsw";
         }
      }
   }

   if (intrinsic) {
      if (util_cpu_caps.has_sse && type.floating &&
          nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
          nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
          nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
         LLVMValueRef isnan, max;
         max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                   type, intr_size, a, b);
         if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
            isnan = lp_build_isnan(bld, b);
            return lp_build_select(bld, isnan, a, max);
         } else {
            assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
            isnan = lp_build_isnan(bld, a);
            return lp_build_select(bld, isnan, a, max);
         }
      } else {
         return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                    type, intr_size, a, b);
      }
   }

   if (type.floating) {
      switch (nan_behavior) {
      case GALLIVM_NAN_RETURN_NAN: {
         LLVMValueRef isnan = lp_build_isnan(bld, b);
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
         cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
         return lp_build_select(bld, cond, a, b);
      }
      case GALLIVM_NAN_RETURN_OTHER: {
         LLVMValueRef isnan = lp_build_isnan(bld, a);
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
         cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
         return lp_build_select(bld, cond, a, b);
      }
      case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
         cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
         return lp_build_select(bld, cond, a, b);
      case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
         return lp_build_select(bld, cond, b, a);
      case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
         return lp_build_select(bld, cond, a, b);
      default:
         assert(0);
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
         return lp_build_select(bld, cond, a, b);
      }
   } else {
      cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
      return lp_build_select(bld, cond, a, b);
   }
}

/**
 * Generate 1 - a, or ~a depending on bld->type.
 */
LLVMValueRef
lp_build_comp(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   if(a == bld->one)
      return bld->zero;
   if(a == bld->zero)
      return bld->one;

   if(type.norm && !type.floating && !type.fixed && !type.sign) {
      if(LLVMIsConstant(a))
         return LLVMConstNot(a);
      else
         return LLVMBuildNot(builder, a, "");
   }

   if(LLVMIsConstant(a))
      if (type.floating)
         return LLVMConstFSub(bld->one, a);
      else
         return LLVMConstSub(bld->one, a);
   else
      if (type.floating)
         return LLVMBuildFSub(builder, bld->one, a, "");
      else
         return LLVMBuildSub(builder, bld->one, a, "");
}

/**
 * Generate a + b
 */
LLVMValueRef
lp_build_add(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if (a == bld->zero)
      return b;
   if (b == bld->zero)
      return a;
   if (a == bld->undef || b == bld->undef)
      return bld->undef;

   if (type.norm) {
      const char *intrinsic = NULL;

      if (!type.sign && (a == bld->one || b == bld->one))
         return bld->one;

      if (!type.floating && !type.fixed) {
         if (HAVE_LLVM >= 0x0900) {
            char intrin[32];
            intrinsic = type.sign ? "llvm.sadd.sat" : "llvm.uadd.sat";
            lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
            return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
         }
         if (type.width * type.length == 128) {
            if (util_cpu_caps.has_sse2) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.x86.sse2.padds.b" :
                              HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.paddus.b" : NULL;
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.x86.sse2.padds.w" :
                              HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.paddus.w" : NULL;
            } else if (util_cpu_caps.has_altivec) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
            }
         }
         if (type.width * type.length == 256) {
            if (util_cpu_caps.has_avx2) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.x86.avx2.padds.b" :
                              HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.paddus.b" : NULL;
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.x86.avx2.padds.w" :
                              HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.paddus.w" : NULL;
            }
         }
      }

      if (intrinsic)
         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
   }

   if(type.norm && !type.floating && !type.fixed) {
      if (type.sign) {
         uint64_t sign = (uint64_t)1 << (type.width - 1);
         LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
         LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
         /* a_clamp_max is the maximum a for positive b,
            a_clamp_min is the minimum a for negative b. */
         LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
         LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
         a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
      }
   }

   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      if (type.floating)
         res = LLVMConstFAdd(a, b);
      else
         res = LLVMConstAdd(a, b);
   else
      if (type.floating)
         res = LLVMBuildFAdd(builder, a, b, "");
      else
         res = LLVMBuildAdd(builder, a, b, "");

   /* clamp to ceiling of 1.0 */
   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);

   if (type.norm && !type.floating && !type.fixed) {
      if (!type.sign) {
         /*
          * newer llvm versions no longer support the intrinsics, but recognize
          * the pattern. Since auto-upgrade of intrinsics doesn't work for jit
          * code, it is important we match the pattern llvm uses (and pray llvm
          * doesn't change it - and hope they decide on the same pattern for
          * all backends supporting it...).
          * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
          * interfere with llvm's ability to recognize the pattern but seems
          * a bit brittle.
          * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
          */
         LLVMValueRef overflowed = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, res);
         res = lp_build_select(bld, overflowed,
                               LLVMConstAllOnes(bld->int_vec_type), res);
      }
   }

   /* XXX clamp to floor of -1 or 0??? */

   return res;
}
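
/*
 * Worked example of the signed-saturation clamp in lp_build_add above
 * (illustrative, scalar 8-bit): for a = 100, b = 50 we get
 * max_val - b = 127 - 50 = 77, so a is clamped to 77 and the plain add
 * yields 77 + 50 = 127 = max_val instead of wrapping around to -106.
 */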

/** Return the scalar sum of the elements of a.
 * Should avoid this operation whenever possible.
 */
LLVMValueRef
lp_build_horizontal_add(struct lp_build_context *bld,
                        LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef index, res;
   unsigned i, length;
   LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
   LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
   LLVMValueRef vecres, elem2;

   assert(lp_check_value(type, a));

   if (type.length == 1) {
      return a;
   }

   assert(!bld->type.norm);

   /*
    * For byte vectors we could do much better with psadbw.
    * Using repeated shuffle/adds here. Note with multiple vectors
    * this can be done more efficiently as outlined in the intel
    * optimization manual.
    * Note: could cause data rearrangement if used with smaller element
    * sizes.
    */

   vecres = a;
   length = type.length / 2;
   while (length > 1) {
      LLVMValueRef vec1, vec2;
      for (i = 0; i < length; i++) {
         shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
         shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
      }
      vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
                                    LLVMConstVector(shuffles1, length), "");
      vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
                                    LLVMConstVector(shuffles2, length), "");
      if (type.floating) {
         vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
      }
      else {
         vecres = LLVMBuildAdd(builder, vec1, vec2, "");
      }
      length = length >> 1;
   }

   /* always have vector of size 2 here */

   index = lp_build_const_int32(bld->gallivm, 0);
   res = LLVMBuildExtractElement(builder, vecres, index, "");
   index = lp_build_const_int32(bld->gallivm, 1);
   elem2 = LLVMBuildExtractElement(builder, vecres, index, "");

   if (type.floating)
      res = LLVMBuildFAdd(builder, res, elem2, "");
   else
      res = LLVMBuildAdd(builder, res, elem2, "");

   return res;
}

/**
 * Return the horizontal sums of 4 float vectors as a float4 vector.
 * This uses the technique as outlined in Intel Optimization Manual.
 */
static LLVMValueRef
lp_build_horizontal_add4x4f(struct lp_build_context *bld,
                            LLVMValueRef src[4])
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef shuffles[4];
   LLVMValueRef tmp[4];
   LLVMValueRef sumtmp[2], shuftmp[2];

   /* lower half of regs */
   shuffles[0] = lp_build_const_int32(gallivm, 0);
   shuffles[1] = lp_build_const_int32(gallivm, 1);
   shuffles[2] = lp_build_const_int32(gallivm, 4);
   shuffles[3] = lp_build_const_int32(gallivm, 5);
   tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
                                   LLVMConstVector(shuffles, 4), "");
   tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
                                   LLVMConstVector(shuffles, 4), "");

   /* upper half of regs */
   shuffles[0] = lp_build_const_int32(gallivm, 2);
   shuffles[1] = lp_build_const_int32(gallivm, 3);
   shuffles[2] = lp_build_const_int32(gallivm, 6);
   shuffles[3] = lp_build_const_int32(gallivm, 7);
   tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
                                   LLVMConstVector(shuffles, 4), "");
   tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
                                   LLVMConstVector(shuffles, 4), "");

   sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
   sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");

   shuffles[0] = lp_build_const_int32(gallivm, 0);
   shuffles[1] = lp_build_const_int32(gallivm, 2);
   shuffles[2] = lp_build_const_int32(gallivm, 4);
   shuffles[3] = lp_build_const_int32(gallivm, 6);
   shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
                                       LLVMConstVector(shuffles, 4), "");

   shuffles[0] = lp_build_const_int32(gallivm, 1);
   shuffles[1] = lp_build_const_int32(gallivm, 3);
   shuffles[2] = lp_build_const_int32(gallivm, 5);
   shuffles[3] = lp_build_const_int32(gallivm, 7);
   shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
                                       LLVMConstVector(shuffles, 4), "");

   return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
}

/*
 * partially horizontally add 2-4 float vectors with length nx4,
 * i.e. only four adjacent values in each vector will be added,
 * assuming values are really grouped in 4 which also determines
 * the group size we get the sums for.
 *
 * Return a vector of the same length as the initial vectors,
 * with the excess elements (if any) being undefined.
 * The element order is independent of number of input vectors.
 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
 * the output order thus will be
 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
 */
static LLVMValueRef
lp_build_hadd_partial4(struct lp_build_context *bld,
                       LLVMValueRef vectors[],
                       unsigned num_vecs)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef ret_vec;
   LLVMValueRef tmp[4];
   const char *intrinsic = NULL;

   assert(num_vecs >= 2 && num_vecs <= 4);
   assert(bld->type.floating);

   /* only use this with at least 2 vectors, as it is sort of expensive
    * (depending on cpu) and we always need two horizontal adds anyway,
    * so a shuffle/add approach might be better.
    */

   tmp[0] = vectors[0];
   tmp[1] = vectors[1];
   tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
   tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];

   if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
       bld->type.length == 4) {
      intrinsic = "llvm.x86.sse3.hadd.ps";
   }
   else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
            bld->type.length == 8) {
      intrinsic = "llvm.x86.avx.hadd.ps.256";
   }

   if (intrinsic) {
      tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
                                         lp_build_vec_type(gallivm, bld->type),
                                         tmp[0], tmp[1]);
      if (num_vecs > 2) {
         tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
                                            lp_build_vec_type(gallivm, bld->type),
                                            tmp[2], tmp[3]);
      }
      else {
         tmp[1] = tmp[0];
      }
      return lp_build_intrinsic_binary(builder, intrinsic,
                                       lp_build_vec_type(gallivm, bld->type),
                                       tmp[0], tmp[1]);
   }

   if (bld->type.length == 4) {
      ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
   }
   else {
      LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
      unsigned i, j;
      unsigned num_iter = bld->type.length / 4;
      struct lp_type parttype = bld->type;
      parttype.length = 4;
      for (j = 0; j < num_iter; j++) {
         LLVMValueRef partsrc[4];
         for (i = 0; i < 4; i++) {
            partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
         }
         partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
      }
      ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
   }

   return ret_vec;
}

/**
 * Generate a - b
 */
LLVMValueRef
lp_build_sub(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if (b == bld->zero)
      return a;
   if (a == bld->undef || b == bld->undef)
      return bld->undef;
   if (a == b)
      return bld->zero;

   if (type.norm) {
      const char *intrinsic = NULL;

      if (!type.sign && b == bld->one)
         return bld->zero;

      if (!type.floating && !type.fixed) {
         if (HAVE_LLVM >= 0x0900) {
            char intrin[32];
            intrinsic = type.sign ? "llvm.ssub.sat" : "llvm.usub.sat";
            lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
            return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
         }
         if (type.width * type.length == 128) {
            if (util_cpu_caps.has_sse2) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" :
                              HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.psubus.b" : NULL;
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" :
                              HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.psubus.w" : NULL;
            } else if (util_cpu_caps.has_altivec) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
            }
         }
         if (type.width * type.length == 256) {
            if (util_cpu_caps.has_avx2) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" :
                              HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.psubus.b" : NULL;
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" :
                              HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.psubus.w" : NULL;
            }
         }
      }

      if (intrinsic)
         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
   }

   if(type.norm && !type.floating && !type.fixed) {
      if (type.sign) {
         uint64_t sign = (uint64_t)1 << (type.width - 1);
         LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
         LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
         /* a_clamp_max is the maximum a for negative b,
            a_clamp_min is the minimum a for positive b. */
         LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
         LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
         a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
      } else {
         /*
          * This must match llvm pattern for saturated unsigned sub.
          * (lp_build_max_simple actually does the job with its current
          * definition but do it explicitly here.)
          * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
          * interfere with llvm's ability to recognize the pattern but seems
          * a bit brittle.
          * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
          */
         LLVMValueRef no_ov = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
         a = lp_build_select(bld, no_ov, a, b);
      }
   }

   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      if (type.floating)
         res = LLVMConstFSub(a, b);
      else
         res = LLVMConstSub(a, b);
   else
      if (type.floating)
         res = LLVMBuildFSub(builder, a, b, "");
      else
         res = LLVMBuildSub(builder, a, b, "");

   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);

   return res;
}
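
/*
 * Worked example of the unsigned saturation pattern above (illustrative,
 * scalar): for a = 10, b = 20, no_ov = (a > b) is false, so a is replaced
 * by b and the subtraction yields 20 - 20 = 0 rather than wrapping.
 */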

/**
 * Normalized multiplication.
 *
 * There are several approaches (using 8-bit normalized multiplication as
 * an example):
 *
 * - alpha plus one
 *
 *     makes the following approximation to the division (Sree)
 *
 *       a*b/255 ~= (a*(b + 1)) >> 8
 *
 *     which is the fastest method that satisfies the following OpenGL
 *     criteria of
 *
 *       0*0 = 0 and 255*255 = 255
 *
 * - geometric series
 *
 *     takes the geometric series approximation to the division
 *
 *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
 *
 *     in this case just the first two terms to fit in 16bit arithmetic
 *
 *       t/255 ~= (t + (t >> 8)) >> 8
 *
 *     note that just by itself it doesn't satisfy the OpenGL criteria, as
 *     255*255 = 254, so the special case b = 255 must be accounted for or
 *     roundoff must be used.
 *
 * - geometric series plus rounding
 *
 *     when using a geometric series division instead of truncating the result
 *     use roundoff in the approximation (Jim Blinn)
 *
 *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
 *
 *     achieving the exact results.
 *
 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
 *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
 * @sa Michael Herf, The "double blend trick", May 2000,
 *     http://www.stereopsis.com/doubleblend.html
 */
LLVMValueRef
lp_build_mul_norm(struct gallivm_state *gallivm,
                  struct lp_type wide_type,
                  LLVMValueRef a, LLVMValueRef b)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_build_context bld;
   unsigned n;
   LLVMValueRef half;
   LLVMValueRef ab;

   assert(!wide_type.floating);
   assert(lp_check_value(wide_type, a));
   assert(lp_check_value(wide_type, b));

   lp_build_context_init(&bld, gallivm, wide_type);

   n = wide_type.width / 2;
   if (wide_type.sign) {
      --n;
   }

   /*
    * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
    * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
    */

   /*
    * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
    */

   ab = LLVMBuildMul(builder, a, b, "");
   ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");

   /*
    * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
    */

   half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
   if (wide_type.sign) {
      LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
      LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
      half = lp_build_select(&bld, sign, minus_half, half);
   }
   ab = LLVMBuildAdd(builder, ab, half, "");

   /* Final division */
   ab = lp_build_shr_imm(&bld, ab, n);

   return ab;
}
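
/*
 * Worked example of the rounded division above (illustrative arithmetic,
 * not part of the generated code; unsigned, n = 8): for a = b = 255,
 * t = a*b = 65025 and t + (t >> 8) + 0x80 = 65025 + 254 + 128 = 65407,
 * so t >> 8 yields 65407 >> 8 = 255 and 255*255 = 255 holds exactly.
 * For a = b = 128, t = 16384 and (16384 + 64 + 128) >> 8 = 64, matching
 * round(16384/255).
 */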

/**
 * Generate a * b
 */
LLVMValueRef
lp_build_mul(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef shift;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(a == bld->zero)
      return bld->zero;
   if(a == bld->one)
      return b;
   if(b == bld->zero)
      return bld->zero;
   if(b == bld->one)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if (!type.floating && !type.fixed && type.norm) {
      struct lp_type wide_type = lp_wider_type(type);
      LLVMValueRef al, ah, bl, bh, abl, abh, ab;

      lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah);
      lp_build_unpack2_native(bld->gallivm, type, wide_type, b, &bl, &bh);

      /* PMULLW, PSRLW, PADDW */
      abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
      abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);

      ab = lp_build_pack2_native(bld->gallivm, wide_type, type, abl, abh);

      return ab;
   }

   if(type.fixed)
      shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
   else
      shift = NULL;

   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
      if (type.floating)
         res = LLVMConstFMul(a, b);
      else
         res = LLVMConstMul(a, b);
      if(shift) {
         if(type.sign)
            res = LLVMConstAShr(res, shift);
         else
            res = LLVMConstLShr(res, shift);
      }
   }
   else {
      if (type.floating)
         res = LLVMBuildFMul(builder, a, b, "");
      else
         res = LLVMBuildMul(builder, a, b, "");
      if(shift) {
         if(type.sign)
            res = LLVMBuildAShr(builder, res, shift, "");
         else
            res = LLVMBuildLShr(builder, res, shift, "");
      }
   }

   return res;
}

/*
 * Widening mul, valid for 32x32 bit -> 64bit only.
 * Result is low 32bits, high bits returned in res_hi.
 *
 * Emits code that is meant to be compiled for the host CPU.
 */
LLVMValueRef
lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
                         LLVMValueRef a,
                         LLVMValueRef b,
                         LLVMValueRef *res_hi)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;

   assert(bld->type.width == 32);
   assert(bld->type.floating == 0);
   assert(bld->type.fixed == 0);
   assert(bld->type.norm == 0);

   /*
    * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces
    * for x86 simd is atrocious (even if the high bits weren't required),
    * trying to handle real 64bit inputs (which of course can't happen due
    * to using 64bit umul with 32bit numbers zero-extended to 64bit, but
    * apparently llvm does not recognize this widening mul). This includes 6
    * (instead of 2) pmuludq plus extra adds and shifts.
    * The same story applies to signed mul, albeit fixing this requires sse41.
    * https://llvm.org/bugs/show_bug.cgi?id=30845
    * So, whip up our own code, albeit only for length 4 and 8 (which
    * should be good enough)...
    * FIXME: For llvm >= 7.0 we should match the autoupgrade pattern
    * (bitcast/and/mul/shuffle for unsigned, bitcast/shl/ashr/mul/shuffle
    * for signed), which the fallback code does not, without this llvm
    * will likely still produce atrocious code.
    */
   if (HAVE_LLVM < 0x0700 &&
       (bld->type.length == 4 || bld->type.length == 8) &&
       ((util_cpu_caps.has_sse2 && (bld->type.sign == 0)) ||
        util_cpu_caps.has_sse4_1)) {
      const char *intrinsic = NULL;
      LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd;
      LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec;
      struct lp_type type_wide = lp_wider_type(bld->type);
      LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide);
      unsigned i;

      for (i = 0; i < bld->type.length; i += 2) {
         shuf[i] = lp_build_const_int32(gallivm, i+1);
         shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
      }
      shuf_vec = LLVMConstVector(shuf, bld->type.length);
      aeven = a;
      beven = b;
      aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, "");
      bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, "");

      if (util_cpu_caps.has_avx2 && bld->type.length == 8) {
         if (bld->type.sign) {
            intrinsic = "llvm.x86.avx2.pmul.dq";
         } else {
            intrinsic = "llvm.x86.avx2.pmulu.dq";
         }
         muleven = lp_build_intrinsic_binary(builder, intrinsic,
                                             wider_type, aeven, beven);
         mulodd = lp_build_intrinsic_binary(builder, intrinsic,
                                            wider_type, aodd, bodd);
      }
      else {
         /* for consistent naming look elsewhere... */
         if (bld->type.sign) {
            intrinsic = "llvm.x86.sse41.pmuldq";
         } else {
            intrinsic = "llvm.x86.sse2.pmulu.dq";
         }
         /*
          * XXX If we only have AVX but not AVX2 this is a pain.
          * lp_build_intrinsic_binary_anylength() can't handle it
          * (due to src and dst type not being identical).
          */
         if (bld->type.length == 8) {
            LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi;
            LLVMValueRef aoddlo, aoddhi, boddlo, boddhi;
            LLVMValueRef muleven2[2], mulodd2[2];
            struct lp_type type_wide_half = type_wide;
            LLVMTypeRef wtype_half;
            type_wide_half.length = 2;
            wtype_half = lp_build_vec_type(gallivm, type_wide_half);
            aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4);
            aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4);
            bevenlo = lp_build_extract_range(gallivm, beven, 0, 4);
            bevenhi = lp_build_extract_range(gallivm, beven, 4, 4);
            aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4);
            aoddhi = lp_build_extract_range(gallivm, aodd, 4, 4);
            boddlo = lp_build_extract_range(gallivm, bodd, 0, 4);
            boddhi = lp_build_extract_range(gallivm, bodd, 4, 4);
            muleven2[0] = lp_build_intrinsic_binary(builder, intrinsic,
                                                    wtype_half, aevenlo, bevenlo);
            mulodd2[0] = lp_build_intrinsic_binary(builder, intrinsic,
                                                   wtype_half, aoddlo, boddlo);
            muleven2[1] = lp_build_intrinsic_binary(builder, intrinsic,
                                                    wtype_half, aevenhi, bevenhi);
            mulodd2[1] = lp_build_intrinsic_binary(builder, intrinsic,
                                                   wtype_half, aoddhi, boddhi);
            muleven = lp_build_concat(gallivm, muleven2, type_wide_half, 2);
            mulodd = lp_build_concat(gallivm, mulodd2, type_wide_half, 2);
         }
         else {
            muleven = lp_build_intrinsic_binary(builder, intrinsic,
                                                wider_type, aeven, beven);
            mulodd = lp_build_intrinsic_binary(builder, intrinsic,
                                               wider_type, aodd, bodd);
         }
      }
      muleven = LLVMBuildBitCast(builder, muleven, bld->vec_type, "");
      mulodd = LLVMBuildBitCast(builder, mulodd, bld->vec_type, "");

      for (i = 0; i < bld->type.length; i += 2) {
         shuf[i] = lp_build_const_int32(gallivm, i + 1);
         shuf[i+1] = lp_build_const_int32(gallivm, i + 1 + bld->type.length);
      }
      shuf_vec = LLVMConstVector(shuf, bld->type.length);
      *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");

      for (i = 0; i < bld->type.length; i += 2) {
         shuf[i] = lp_build_const_int32(gallivm, i);
         shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length);
      }
      shuf_vec = LLVMConstVector(shuf, bld->type.length);
      return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
   }
   else {
      return lp_build_mul_32_lohi(bld, a, b, res_hi);
   }
}
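
/*
 * Note on the even/odd split above: pmuludq/pmuldq multiply only the even
 * 32-bit lanes of their inputs, each producing a 64-bit result, which is
 * why the odd lanes are shuffled down into even positions and multiplied
 * separately before the two result vectors are interleaved back together.
 */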

/*
 * Widening mul, valid for 32x32 bit -> 64bit only.
 * Result is low 32bits, high bits returned in res_hi.
 *
 * Emits generic code.
 */
LLVMValueRef
lp_build_mul_32_lohi(struct lp_build_context *bld,
                     LLVMValueRef a,
                     LLVMValueRef b,
                     LLVMValueRef *res_hi)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef tmp, shift, res_lo;
   struct lp_type type_tmp;
   LLVMTypeRef wide_type, narrow_type;

   type_tmp = bld->type;
   narrow_type = lp_build_vec_type(gallivm, type_tmp);
   type_tmp.width *= 2;
   wide_type = lp_build_vec_type(gallivm, type_tmp);
   shift = lp_build_const_vec(gallivm, type_tmp, 32);

   if (bld->type.sign) {
      a = LLVMBuildSExt(builder, a, wide_type, "");
      b = LLVMBuildSExt(builder, b, wide_type, "");
   } else {
      a = LLVMBuildZExt(builder, a, wide_type, "");
      b = LLVMBuildZExt(builder, b, wide_type, "");
   }
   tmp = LLVMBuildMul(builder, a, b, "");

   res_lo = LLVMBuildTrunc(builder, tmp, narrow_type, "");

   /* Since we truncate anyway, LShr and AShr are equivalent. */
   tmp = LLVMBuildLShr(builder, tmp, shift, "");
   *res_hi = LLVMBuildTrunc(builder, tmp, narrow_type, "");

   return res_lo;
}

/*
 * Multiply-add: res = a * b + c.
 */
LLVMValueRef
lp_build_mad(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b,
             LLVMValueRef c)
{
   const struct lp_type type = bld->type;
   if (type.floating) {
      return lp_build_fmuladd(bld->gallivm->builder, a, b, c);
   } else {
      return lp_build_add(bld, lp_build_mul(bld, a, b), c);
   }
}

/**
 * Small vector x scale multiplication optimization.
 */
LLVMValueRef
lp_build_mul_imm(struct lp_build_context *bld,
                 LLVMValueRef a,
                 int b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef factor;

   assert(lp_check_value(bld->type, a));

   if(b == 0)
      return bld->zero;

   if(b == 1)
      return a;

   if(b == -1)
      return lp_build_negate(bld, a);

   if(b == 2 && bld->type.floating)
      return lp_build_add(bld, a, a);

   if(util_is_power_of_two_or_zero(b)) {
      unsigned shift = ffs(b) - 1;

      if(bld->type.floating) {
#if 0
         /*
          * Power of two multiplication by directly manipulating the exponent.
          *
          * XXX: This might not always be faster: it will introduce a small
          * error for multiplication by zero, and it will produce wrong results
          * for Inf and NaN.
          */
         unsigned mantissa = lp_mantissa(bld->type);
         factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
         a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
         a = LLVMBuildAdd(builder, a, factor, "");
         a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
         return a;
#endif
      }
      else {
         factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
         return LLVMBuildShl(builder, a, factor, "");
      }
   }

   factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
   return lp_build_mul(bld, a, factor);
}
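
/*
 * Worked example (illustrative): for an integer type and b = 8,
 * util_is_power_of_two_or_zero(8) holds and shift = ffs(8) - 1 = 3,
 * so the multiply is emitted as a left shift by 3.
 */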

/**
 * Generate a / b
 */
LLVMValueRef
lp_build_div(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(a == bld->zero)
      return bld->zero;
   if(a == bld->one && type.floating)
      return lp_build_rcp(bld, b);
   if(b == bld->zero)
      return bld->undef;
   if(b == bld->one)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
      if (type.floating)
         return LLVMConstFDiv(a, b);
      else if (type.sign)
         return LLVMConstSDiv(a, b);
      else
         return LLVMConstUDiv(a, b);
   }

   /* fast rcp is disabled (just uses div), so makes no sense to try that */
   if(FALSE &&
      ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
       (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
      type.floating)
      return lp_build_mul(bld, a, lp_build_rcp(bld, b));

   if (type.floating)
      return LLVMBuildFDiv(builder, a, b, "");
   else if (type.sign)
      return LLVMBuildSDiv(builder, a, b, "");
   else
      return LLVMBuildUDiv(builder, a, b, "");
}

/**
 * Linear interpolation helper.
 *
 * @param normalized whether we are interpolating normalized values,
 *        encoded in normalized integers, twice as wide.
 *
 * @sa http://www.stereopsis.com/doubleblend.html
 */
static inline LLVMValueRef
lp_build_lerp_simple(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef v0,
                     LLVMValueRef v1,
                     unsigned flags)
{
   unsigned half_width = bld->type.width/2;
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef delta;
   LLVMValueRef res;

   assert(lp_check_value(bld->type, x));
   assert(lp_check_value(bld->type, v0));
   assert(lp_check_value(bld->type, v1));

   delta = lp_build_sub(bld, v1, v0);

   if (bld->type.floating) {
      assert(flags == 0);
      return lp_build_mad(bld, x, delta, v0);
   }

   if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
      if (!bld->type.sign) {
         if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
            /*
             * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
             * most-significant-bit to the least-significant-bit, so that
             * later we can just divide by 2**n instead of 2**n - 1.
             */
            x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
         }

         /* (x * delta) >> n */
         res = lp_build_mul(bld, x, delta);
         res = lp_build_shr_imm(bld, res, half_width);
      } else {
         /*
          * The rescaling trick above doesn't work for signed numbers, so
          * use the 2**n - 1 division approximation in lp_build_mul_norm
          * instead.
          */
         assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
         res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
      }
   } else {
      assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
      res = lp_build_mul(bld, x, delta);
   }

   if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
      /*
       * At this point both res and v0 only use the lower half of the bits,
       * the rest is zero. Instead of add / mask, do add with half wide type.
       */
      struct lp_type narrow_type;
      struct lp_build_context narrow_bld;

      memset(&narrow_type, 0, sizeof narrow_type);
      narrow_type.sign   = bld->type.sign;
      narrow_type.width  = bld->type.width/2;
      narrow_type.length = bld->type.length*2;

      lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
      res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
      v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
      res = lp_build_add(&narrow_bld, v0, res);
      res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
   } else {
      res = lp_build_add(bld, v0, res);

      if (bld->type.fixed) {
         /*
          * We need to mask out the high order bits when lerping 8bit
          * normalized colors stored on 16bits.
          * XXX: This step is necessary for lerping 8bit colors stored on
          * 16bits, but it will be wrong for true fixed point use cases.
          * Basically we need a more powerful lp_type, capable of further
          * distinguishing the values interpretation from the value storage.
          */
         LLVMValueRef low_bits;
         low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
         res = LLVMBuildAnd(builder, res, low_bits, "");
      }
   }

   return res;
}

/**
 * Linear interpolation.
 */
LLVMValueRef
lp_build_lerp(struct lp_build_context *bld,
              LLVMValueRef x,
              LLVMValueRef v0,
              LLVMValueRef v1,
              unsigned flags)
{
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, x));
   assert(lp_check_value(type, v0));
   assert(lp_check_value(type, v1));

   assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));

   if (type.norm) {
      struct lp_type wide_type;
      struct lp_build_context wide_bld;
      LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;

      assert(type.length >= 2);

      /*
       * Create a wider integer type, enough to hold the
       * intermediate result of the multiplication.
       */
      memset(&wide_type, 0, sizeof wide_type);
      wide_type.sign   = type.sign;
      wide_type.width  = type.width*2;
      wide_type.length = type.length/2;

      lp_build_context_init(&wide_bld, bld->gallivm, wide_type);

      lp_build_unpack2_native(bld->gallivm, type, wide_type, x,  &xl,  &xh);
      lp_build_unpack2_native(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
      lp_build_unpack2_native(bld->gallivm, type, wide_type, v1, &v1l, &v1h);

      /*
       * Lerp both halves.
       */

      flags |= LP_BLD_LERP_WIDE_NORMALIZED;

      resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
      resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);

      res = lp_build_pack2_native(bld->gallivm, wide_type, type, resl, resh);
   } else {
      res = lp_build_lerp_simple(bld, x, v0, v1, flags);
   }

   return res;
}

/**
 * Bilinear interpolation.
 *
 * Values indices are in v_{yx}.
 */
LLVMValueRef
lp_build_lerp_2d(struct lp_build_context *bld,
                 LLVMValueRef x,
                 LLVMValueRef y,
                 LLVMValueRef v00,
                 LLVMValueRef v01,
                 LLVMValueRef v10,
                 LLVMValueRef v11,
                 unsigned flags)
{
   LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
   LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
   return lp_build_lerp(bld, y, v0, v1, flags);
}
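
/*
 * For example, in the v_{yx} convention above v01 is the sample at
 * (x = 1, y = 0): the two x-lerps blend along the y = 0 and y = 1 rows,
 * and the final lerp blends the rows along y.
 */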

LLVMValueRef
lp_build_lerp_3d(struct lp_build_context *bld,
                 LLVMValueRef x,
                 LLVMValueRef y,
                 LLVMValueRef z,
                 LLVMValueRef v000,
                 LLVMValueRef v001,
                 LLVMValueRef v010,
                 LLVMValueRef v011,
                 LLVMValueRef v100,
                 LLVMValueRef v101,
                 LLVMValueRef v110,
                 LLVMValueRef v111,
                 unsigned flags)
{
   LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
   LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
   return lp_build_lerp(bld, z, v0, v1, flags);
}

/**
 * Generate min(a, b)
 * Do checks for special cases but not for nans.
 */
LLVMValueRef
lp_build_min(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   if (bld->type.norm) {
      if (!bld->type.sign) {
         if (a == bld->zero || b == bld->zero) {
            return bld->zero;
         }
      }
      if(a == bld->one)
         return b;
      if(b == bld->one)
         return a;
   }

   return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
}

/**
 * Generate min(a, b)
 * NaN's are handled according to the behavior specified by the
 * nan_behavior argument.
 */
LLVMValueRef
lp_build_min_ext(struct lp_build_context *bld,
                 LLVMValueRef a,
                 LLVMValueRef b,
                 enum gallivm_nan_behavior nan_behavior)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   if (bld->type.norm) {
      if (!bld->type.sign) {
         if (a == bld->zero || b == bld->zero) {
            return bld->zero;
         }
      }
      if(a == bld->one)
         return b;
      if(b == bld->one)
         return a;
   }

   return lp_build_min_simple(bld, a, b, nan_behavior);
}

/**
 * Generate max(a, b)
 * Do checks for special cases, but NaN behavior is undefined.
 */
LLVMValueRef
lp_build_max(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   if(bld->type.norm) {
      if(a == bld->one || b == bld->one)
         return bld->one;
      if (!bld->type.sign) {
         if (a == bld->zero) {
            return b;
         }
         if (b == bld->zero) {
            return a;
         }
      }
   }

   return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
}

/**
 * Generate max(a, b)
 * Checks for special cases.
 * NaN's are handled according to the behavior specified by the
 * nan_behavior argument.
 */
LLVMValueRef
lp_build_max_ext(struct lp_build_context *bld,
                 LLVMValueRef a,
                 LLVMValueRef b,
                 enum gallivm_nan_behavior nan_behavior)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   if(bld->type.norm) {
      if(a == bld->one || b == bld->one)
         return bld->one;
      if (!bld->type.sign) {
         if (a == bld->zero) {
            return b;
         }
         if (b == bld->zero) {
            return a;
         }
      }
   }

   return lp_build_max_simple(bld, a, b, nan_behavior);
}

/**
 * Generate clamp(a, min, max)
 * NaN behavior (for any of a, min, max) is undefined.
 * Do checks for special cases.
 */
LLVMValueRef
lp_build_clamp(struct lp_build_context *bld,
               LLVMValueRef a,
               LLVMValueRef min,
               LLVMValueRef max)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, min));
   assert(lp_check_value(bld->type, max));

   a = lp_build_min(bld, a, max);
   a = lp_build_max(bld, a, min);
   return a;
}

/**
 * Generate clamp(a, 0, 1)
 * A NaN will get converted to zero.
 */
LLVMValueRef
lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
                                LLVMValueRef a)
{
   a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
   a = lp_build_min(bld, a, bld->one);
   return a;
}

/**
 * Generate abs(a)
 */
LLVMValueRef
lp_build_abs(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);

   assert(lp_check_value(type, a));

   if(!type.sign)
      return a;

   if(type.floating) {
      if (0x0306 <= HAVE_LLVM && HAVE_LLVM < 0x0309) {
         /* Workaround llvm.org/PR27332 */
         LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
         unsigned long long absMask = ~(1ULL << (type.width - 1));
         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
         a = LLVMBuildBitCast(builder, a, int_vec_type, "");
         a = LLVMBuildAnd(builder, a, mask, "");
         a = LLVMBuildBitCast(builder, a, vec_type, "");
         return a;
      } else {
         char intrinsic[32];
         lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
         return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
      }
   }

   if(type.width*type.length == 128 && util_cpu_caps.has_ssse3 && HAVE_LLVM < 0x0600) {
      switch(type.width) {
      case 8:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
      case 16:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
      case 32:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
      }
   }
   else if (type.width*type.length == 256 && util_cpu_caps.has_avx2 && HAVE_LLVM < 0x0600) {
      switch(type.width) {
      case 8:
         return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a);
      case 16:
         return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.w", vec_type, a);
      case 32:
         return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.d", vec_type, a);
      }
   }

   return lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero),
                          a, LLVMBuildNeg(builder, a, ""));
}

LLVMValueRef
lp_build_negate(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;

   assert(lp_check_value(bld->type, a));

   if (bld->type.floating)
      a = LLVMBuildFNeg(builder, a, "");
   else
      a = LLVMBuildNeg(builder, a, "");

   return a;
}

/** Return -1, 0 or +1 depending on the sign of a */
LLVMValueRef
lp_build_sgn(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef cond;
   LLVMValueRef res;

   assert(lp_check_value(type, a));

   /* Handle non-zero case */
   if(!type.sign) {
      /* if not zero then sign must be positive */
      res = bld->one;
   }
   else if(type.floating) {
      LLVMTypeRef vec_type;
      LLVMTypeRef int_type;
      LLVMValueRef mask;
      LLVMValueRef sign;
      LLVMValueRef one;
      unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);

      int_type = lp_build_int_vec_type(bld->gallivm, type);
      vec_type = lp_build_vec_type(bld->gallivm, type);
      mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);

      /* Take the sign bit and add it to 1 constant */
      sign = LLVMBuildBitCast(builder, a, int_type, "");
      sign = LLVMBuildAnd(builder, sign, mask, "");
      one = LLVMConstBitCast(bld->one, int_type);
      res = LLVMBuildOr(builder, sign, one, "");
      res = LLVMBuildBitCast(builder, res, vec_type, "");
   }
   else
   {
      /* signed int/norm/fixed point */
      /* could use psign with sse3 and appropriate vectors here */
      LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
      cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
      res = lp_build_select(bld, cond, bld->one, minus_one);
   }

   /* Handle zero */
   cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
   res = lp_build_select(bld, cond, bld->zero, res);

   return res;
}

/**
 * Set the sign of float vector 'a' according to 'sign'.
 * If sign==0, return abs(a).
 * If sign==1, return -abs(a).
 * Other values for sign produce undefined results.
 */
LLVMValueRef
lp_build_set_sign(struct lp_build_context *bld,
                  LLVMValueRef a, LLVMValueRef sign)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
   LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
                                              ~((unsigned long long) 1 << (type.width - 1)));
   LLVMValueRef val, res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   /* val = reinterpret_cast<int>(a) */
   val = LLVMBuildBitCast(builder, a, int_vec_type, "");
   /* val = val & mask */
   val = LLVMBuildAnd(builder, val, mask, "");
   /* sign = sign << shift */
   sign = LLVMBuildShl(builder, sign, shift, "");
   /* res = val | sign */
   res = LLVMBuildOr(builder, val, sign, "");
   /* res = reinterpret_cast<float>(res) */
   res = LLVMBuildBitCast(builder, res, vec_type, "");

   return res;
}

/**
 * Convert vector of (or scalar) int to vector of (or scalar) float.
 */
LLVMValueRef
lp_build_int_to_float(struct lp_build_context *bld,
                      LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);

   assert(type.floating);

   return LLVMBuildSIToFP(builder, a, vec_type, "");
}

static boolean
arch_rounding_available(const struct lp_type type)
{
   if ((util_cpu_caps.has_sse4_1 &&
        (type.length == 1 || type.width*type.length == 128)) ||
       (util_cpu_caps.has_avx && type.width*type.length == 256) ||
       (util_cpu_caps.has_avx512f && type.width*type.length == 512))
      return TRUE;
   else if ((util_cpu_caps.has_altivec &&
             (type.width == 32 && type.length == 4)))
      return TRUE;
   else if (util_cpu_caps.has_neon)
      return TRUE;

   return FALSE;
}

enum lp_build_round_mode
{
   LP_BUILD_ROUND_NEAREST = 0,
   LP_BUILD_ROUND_FLOOR = 1,
   LP_BUILD_ROUND_CEIL = 2,
   LP_BUILD_ROUND_TRUNCATE = 3
};

static inline LLVMValueRef
lp_build_iround_nearest_sse2(struct lp_build_context *bld,
                             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
   LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
   const char *intrinsic;
   LLVMValueRef res;

   assert(type.floating);
   /* using the double precision conversions is a bit more complicated */
   assert(type.width == 32);

   assert(lp_check_value(type, a));
   assert(util_cpu_caps.has_sse2);

   /* This is relying on MXCSR rounding mode, which should always be nearest. */
   if (type.length == 1) {
      LLVMTypeRef vec_type;
      LLVMValueRef undef;
      LLVMValueRef arg;
      LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);

      vec_type = LLVMVectorType(bld->elem_type, 4);

      intrinsic = "llvm.x86.sse.cvtss2si";

      undef = LLVMGetUndef(vec_type);

      arg = LLVMBuildInsertElement(builder, undef, a, index0, "");

      res = lp_build_intrinsic_unary(builder, intrinsic,
                                     ret_type, arg);
   }
   else {
      if (type.width * type.length == 128) {
         intrinsic = "llvm.x86.sse2.cvtps2dq";
      }
      else {
         assert(type.width*type.length == 256);
         assert(util_cpu_caps.has_avx);

         intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
      }
      res = lp_build_intrinsic_unary(builder, intrinsic,
                                     ret_type, a);
   }

   return res;
}

static inline LLVMValueRef
lp_build_round_altivec(struct lp_build_context *bld,
                       LLVMValueRef a,
                       enum lp_build_round_mode mode)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;

   assert(type.floating);

   assert(lp_check_value(type, a));
   assert(util_cpu_caps.has_altivec);

   (void)type;

   switch (mode) {
   case LP_BUILD_ROUND_NEAREST:
      intrinsic = "llvm.ppc.altivec.vrfin";
      break;
   case LP_BUILD_ROUND_FLOOR:
      intrinsic = "llvm.ppc.altivec.vrfim";
      break;
   case LP_BUILD_ROUND_CEIL:
      intrinsic = "llvm.ppc.altivec.vrfip";
      break;
   case LP_BUILD_ROUND_TRUNCATE:
      intrinsic = "llvm.ppc.altivec.vrfiz";
      break;
   }

   return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
}

static inline LLVMValueRef
lp_build_round_arch(struct lp_build_context *bld,
                    LLVMValueRef a,
                    enum lp_build_round_mode mode)
{
   if (util_cpu_caps.has_sse4_1 || util_cpu_caps.has_neon) {
      LLVMBuilderRef builder = bld->gallivm->builder;
      const struct lp_type type = bld->type;
      const char *intrinsic_root;
      char intrinsic[32];

      assert(type.floating);
      assert(lp_check_value(type, a));
      (void)type;

      switch (mode) {
      case LP_BUILD_ROUND_NEAREST:
         intrinsic_root = "llvm.nearbyint";
         break;
      case LP_BUILD_ROUND_FLOOR:
         intrinsic_root = "llvm.floor";
         break;
      case LP_BUILD_ROUND_CEIL:
         intrinsic_root = "llvm.ceil";
         break;
      case LP_BUILD_ROUND_TRUNCATE:
         intrinsic_root = "llvm.trunc";
         break;
      }

      lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type);
      return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
   }
   else /* (util_cpu_caps.has_altivec) */
      return lp_build_round_altivec(bld, a, mode);
}
/**
 * Return the integer part of a float (vector) value (== round toward zero).
 * The returned value is a float (vector).
 * Ex: trunc(-1.5) = -1.0
 */
LLVMValueRef
lp_build_trunc(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
   }
   else {
      const struct lp_type type = bld->type;
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
      LLVMValueRef trunc, res, anosign, mask;
      LLVMTypeRef int_vec_type = bld->int_vec_type;
      LLVMTypeRef vec_type = bld->vec_type;

      assert(type.width == 32); /* might want to handle doubles at some point */

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      /* round by truncation */
      trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");

      /* mask out sign bit */
      anosign = lp_build_abs(bld, a);
      /*
       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is no-op for them
       * because such floats are always exact) as well as special cases like
       * NaNs, Infs (taking advantage of the fact they use max exponent).
       * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
       */
      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
      return lp_build_select(bld, mask, a, res);
   }
}


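/*
 * Editor's illustrative sketch (hypothetical helper, not used anywhere in
 * this file): the scalar equivalent of the fallback path above, assuming
 * the usual fixed-width integer types are in scope. Any |a| above 2^24 is
 * already an exact integer in single precision, so the input is passed
 * through unchanged; comparing the magnitude bits as integers also routes
 * NaNs and Infs (max exponent) down the pass-through path, exactly like
 * the integer compare emitted above.
 */
static inline float
lp_trunc_scalar_sketch(float a)
{
   union { float f; int32_t i; } u = { a };
   union { float f; int32_t i; } c = { 16777216.0f };  /* 2^24 */
   int32_t anosign = u.i & 0x7fffffff;   /* |a| as a bit pattern */

   if (anosign > c.i)
      return a;                          /* big, Inf or NaN: already exact */
   return (float)(int32_t)a;             /* round toward zero */
}

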
/**
 * Return float (vector) rounded to nearest integer (vector). The returned
 * value is a float (vector).
 * Ex: round(0.9) = 1.0
 * Ex: round(-1.5) = -2.0
 */
LLVMValueRef
lp_build_round(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
   }
   else {
      const struct lp_type type = bld->type;
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
      LLVMValueRef res, anosign, mask;
      LLVMTypeRef int_vec_type = bld->int_vec_type;
      LLVMTypeRef vec_type = bld->vec_type;

      assert(type.width == 32); /* might want to handle doubles at some point */

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      res = lp_build_iround(bld, a);
      res = LLVMBuildSIToFP(builder, res, vec_type, "");

      /* mask out sign bit */
      anosign = lp_build_abs(bld, a);
      /*
       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is no-op for them
       * because such floats are always exact) as well as special cases like
       * NaNs, Infs (taking advantage of the fact they use max exponent).
       * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
       */
      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
      return lp_build_select(bld, mask, a, res);
   }
}


/**
 * Return floor of float (vector), result is a float (vector).
 * Ex: floor(1.1) = 1.0
 * Ex: floor(-1.1) = -2.0
 */
LLVMValueRef
lp_build_floor(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
   }
   else {
      const struct lp_type type = bld->type;
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
      LLVMValueRef trunc, res, anosign, mask;
      LLVMTypeRef int_vec_type = bld->int_vec_type;
      LLVMTypeRef vec_type = bld->vec_type;

      if (type.width != 32) {
         char intrinsic[32];
         lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type);
         return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
      }

      assert(type.width == 32); /* might want to handle doubles at some point */

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      /* round by truncation */
      trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");

      if (type.sign) {
         LLVMValueRef tmp;

         /*
          * fix values if rounding is wrong (for non-special cases)
          * - this is the case if trunc > a
          */
         mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
         /* tmp = trunc > a ? 1.0 : 0.0 */
         tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
         tmp = lp_build_and(&intbld, mask, tmp);
         tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
         res = lp_build_sub(bld, res, tmp);
      }

      /* mask out sign bit */
      anosign = lp_build_abs(bld, a);
      /*
       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is no-op for them
       * because such floats are always exact) as well as special cases like
       * NaNs, Infs (taking advantage of the fact they use max exponent).
       * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
       */
      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
      return lp_build_select(bld, mask, a, res);
   }
}


/**
 * Return ceiling of float (vector), returning float (vector).
 * Ex: ceil( 1.1) = 2.0
 * Ex: ceil(-1.1) = -1.0
 */
LLVMValueRef
lp_build_ceil(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
   }
   else {
      const struct lp_type type = bld->type;
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
      LLVMValueRef trunc, res, anosign, mask, tmp;
      LLVMTypeRef int_vec_type = bld->int_vec_type;
      LLVMTypeRef vec_type = bld->vec_type;

      if (type.width != 32) {
         char intrinsic[32];
         lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type);
         return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
      }

      assert(type.width == 32); /* might want to handle doubles at some point */

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      /* round by truncation */
      trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");

      /*
       * fix values if rounding is wrong (for non-special cases)
       * - this is the case if trunc < a
       */
      mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
      /* tmp = trunc < a ? 1.0 : 0.0 */
      tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
      tmp = lp_build_and(&intbld, mask, tmp);
      tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
      res = lp_build_add(bld, trunc, tmp);

      /* mask out sign bit */
      anosign = lp_build_abs(bld, a);
      /*
       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is no-op for them
       * because such floats are always exact) as well as special cases like
       * NaNs, Infs (taking advantage of the fact they use max exponent).
       * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
       */
      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
      return lp_build_select(bld, mask, a, res);
   }
}


/**
 * Return fractional part of 'a', computed as a - floor(a).
 * Typically used in texture coord arithmetic.
 */
LLVMValueRef
lp_build_fract(struct lp_build_context *bld,
               LLVMValueRef a)
{
   assert(bld->type.floating);
   return lp_build_sub(bld, a, lp_build_floor(bld, a));
}


/**
 * Prevent returning 1.0 for very small negative values of 'a' by clamping
 * against 0.99999(9). (Will also return that value for NaNs.)
 */
static inline LLVMValueRef
clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
{
   LLVMValueRef max;

   /* this is the largest number smaller than 1.0 representable as float */
   max = lp_build_const_vec(bld->gallivm, bld->type,
                            1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
   return lp_build_min_ext(bld, fract, max,
                           GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
}


/**
 * Same as lp_build_fract, but guarantees that the result is always smaller
 * than one. Will also return the smaller-than-one value for infs, NaNs.
 */
LLVMValueRef
lp_build_fract_safe(struct lp_build_context *bld,
                    LLVMValueRef a)
{
   return clamp_fract(bld, lp_build_fract(bld, a));
}


/**
 * Return the integer part of a float (vector) value (== round toward zero).
 * The returned value is an integer (vector).
 * Ex: itrunc(-1.5) = -1
 */
LLVMValueRef
lp_build_itrunc(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);

   assert(type.floating);
   assert(lp_check_value(type, a));

   return LLVMBuildFPToSI(builder, a, int_vec_type, "");
}


/**
 * Return float (vector) rounded to nearest integer (vector). The returned
 * value is an integer (vector).
 * Ex: iround(0.9) = 1
 * Ex: iround(-1.5) = -2
 */
LLVMValueRef
lp_build_iround(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);

   assert(lp_check_value(type, a));

   if ((util_cpu_caps.has_sse2 &&
       ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
       (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
      return lp_build_iround_nearest_sse2(bld, a);
   }
   if (arch_rounding_available(type)) {
      res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
   }
   else {
      LLVMValueRef half;

      half = lp_build_const_vec(bld->gallivm, type, nextafterf(0.5, 0.0));

      if (type.sign) {
         LLVMTypeRef vec_type = bld->vec_type;
         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
                                    (unsigned long long)1 << (type.width - 1));
         LLVMValueRef sign;

         /* get sign bit */
         sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
         sign = LLVMBuildAnd(builder, sign, mask, "");

         /* give 'half' the sign of 'a' */
         half = LLVMBuildBitCast(builder, half, int_vec_type, "");
         half = LLVMBuildOr(builder, sign, half, "");
         half = LLVMBuildBitCast(builder, half, vec_type, "");
      }

      res = LLVMBuildFAdd(builder, a, half, "");
   }

   res = LLVMBuildFPToSI(builder, res, int_vec_type, "");

   return res;
}


/**
 * Return floor of float (vector), result is an int (vector).
 * Ex: ifloor(1.1) = 1
 * Ex: ifloor(-1.1) = -2
 */
LLVMValueRef
lp_build_ifloor(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   res = a;
   if (type.sign) {
      if (arch_rounding_available(type)) {
         res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
      }
      else {
         struct lp_type inttype;
         struct lp_build_context intbld;
         LLVMValueRef trunc, itrunc, mask;

         assert(type.floating);
         assert(lp_check_value(type, a));

         inttype = type;
         inttype.floating = 0;
         lp_build_context_init(&intbld, bld->gallivm, inttype);

         /* round by truncation */
         itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
         trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");

         /*
          * fix values if rounding is wrong (for non-special cases)
          * - this is the case if trunc > a
          * The results of doing this with NaNs, very large values etc.
          * are undefined but this seems to be the case anyway.
          */
         mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
         /* cheapie minus one with mask since the mask is minus one / zero */
         return lp_build_add(&intbld, itrunc, mask);
      }
   }

   /* round to nearest (toward zero) */
   res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");

   return res;
}


/**
 * Return ceiling of float (vector), returning int (vector).
 * Ex: iceil( 1.1) = 2
 * Ex: iceil(-1.1) = -1
 */
LLVMValueRef
lp_build_iceil(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
   }
   else {
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef trunc, itrunc, mask;

      assert(type.floating);
      assert(lp_check_value(type, a));

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      /* round by truncation */
      itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");

      /*
       * fix values if rounding is wrong (for non-special cases)
       * - this is the case if trunc < a
       * The results of doing this with NaNs, very large values etc.
       * are undefined but this seems to be the case anyway.
       */
      mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
      /* cheapie plus one with mask since the mask is minus one / zero */
      return lp_build_sub(&intbld, itrunc, mask);
   }

   /* round to nearest (toward zero) */
   res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");

   return res;
}


/**
 * Combined ifloor() & fract().
 *
 * Preferred to calling the functions separately, as it will ensure that the
 * strategy (floor() vs ifloor()) that results in less redundant work is used.
 */
void
lp_build_ifloor_fract(struct lp_build_context *bld,
                      LLVMValueRef a,
                      LLVMValueRef *out_ipart,
                      LLVMValueRef *out_fpart)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef ipart;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      /*
       * floor() is easier.
       */

      ipart = lp_build_floor(bld, a);
      *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
      *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
   }
   else {
      /*
       * ifloor() is easier.
       */

      *out_ipart = lp_build_ifloor(bld, a);
      ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
      *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
   }
}


/**
 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
 * always smaller than one.
 */
void
lp_build_ifloor_fract_safe(struct lp_build_context *bld,
                           LLVMValueRef a,
                           LLVMValueRef *out_ipart,
                           LLVMValueRef *out_fpart)
{
   lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
   *out_fpart = clamp_fract(bld, *out_fpart);
}


/**
 * Generate sqrt(a).
 */
LLVMValueRef
lp_build_sqrt(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   char intrinsic[32];

   assert(lp_check_value(type, a));

   assert(type.floating);
   lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);

   return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
}


/**
 * Do one Newton-Raphson step to improve reciprocal precision:
 *
 *   x_{i+1} = x_i + x_i * (1 - a * x_i)
 *
 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
 * halo. It would be necessary to clamp the argument to prevent this.
 *
 * See also:
 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
 */
static inline LLVMValueRef
lp_build_rcp_refine(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef rcp_a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef neg_a;
   LLVMValueRef res;

   neg_a = LLVMBuildFNeg(builder, a, "");
   res = lp_build_fmuladd(builder, neg_a, rcp_a, bld->one);
   res = lp_build_fmuladd(builder, res, rcp_a, rcp_a);

   return res;
}


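/*
 * Editor's illustrative sketch (hypothetical helper, not used by the build):
 * the same refinement step in scalar C. Each iteration roughly doubles the
 * number of correct mantissa bits of the initial estimate x0.
 */
static inline float
lp_rcp_refine_scalar_sketch(float a, float x0)
{
   /* x1 = x0 + x0*(1 - a*x0), written as the two fused ops used above */
   float e = 1.0f - a * x0;
   return x0 + x0 * e;
}

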
/**
 * Generate 1/a.
 */
LLVMValueRef
lp_build_rcp(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   if(a == bld->zero)
      return bld->undef;
   if(a == bld->one)
      return bld->one;
   if(a == bld->undef)
      return bld->undef;

   assert(type.floating);

   if(LLVMIsConstant(a))
      return LLVMConstFDiv(bld->one, a);

   /*
    * We don't use RCPPS because:
    * - it only has 10bits of precision
    * - it doesn't even get the reciprocal of 1.0 exactly
    * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
    * - for recent processors the benefit over DIVPS is marginal and
    *   case-dependent
    *
    * We could still use it on certain processors if benchmarks show that the
    * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
    * particular uses that require fewer workarounds.
    */

   if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
         (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
      const unsigned num_iterations = 0;
      LLVMValueRef res;
      unsigned i;
      const char *intrinsic = NULL;

      if (type.length == 4) {
         intrinsic = "llvm.x86.sse.rcp.ps";
      }
      else {
         intrinsic = "llvm.x86.avx.rcp.ps.256";
      }

      res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);

      for (i = 0; i < num_iterations; ++i) {
         res = lp_build_rcp_refine(bld, a, res);
      }

      return res;
   }

   return LLVMBuildFDiv(builder, bld->one, a, "");
}


/**
 * Do one Newton-Raphson step to improve rsqrt precision:
 *
 *   x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
 *
 * See also Intel 64 and IA-32 Architectures Optimization Manual.
 */
static inline LLVMValueRef
lp_build_rsqrt_refine(struct lp_build_context *bld,
                      LLVMValueRef a,
                      LLVMValueRef rsqrt_a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
   LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
   LLVMValueRef res;

   res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
   res = LLVMBuildFMul(builder, a, res, "");
   res = LLVMBuildFSub(builder, three, res, "");
   res = LLVMBuildFMul(builder, rsqrt_a, res, "");
   res = LLVMBuildFMul(builder, half, res, "");

   return res;
}


/**
 * Generate 1/sqrt(a).
 * Result is undefined for values < 0, infinity for +0.
 */
LLVMValueRef
lp_build_rsqrt(struct lp_build_context *bld,
               LLVMValueRef a)
{
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   assert(type.floating);

   /*
    * This should be faster but all denormals will end up as infinity.
    */
   if (0 && lp_build_fast_rsqrt_available(type)) {
      const unsigned num_iterations = 1;
      LLVMValueRef res;
      unsigned i;

      /* rsqrt(1.0) != 1.0 here */
      res = lp_build_fast_rsqrt(bld, a);

      if (num_iterations) {
         /*
          * Newton-Raphson will result in NaN instead of infinity for zero,
          * and NaN instead of zero for infinity.
          * Also, need to ensure rsqrt(1.0) == 1.0.
          * All numbers smaller than FLT_MIN will result in +infinity
          * (rsqrtps treats all denormals as zero).
          */
         LLVMValueRef cmp;
         LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
         LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);

         for (i = 0; i < num_iterations; ++i) {
            res = lp_build_rsqrt_refine(bld, a, res);
         }
         cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
         res = lp_build_select(bld, cmp, inf, res);
         cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
         res = lp_build_select(bld, cmp, bld->zero, res);
         cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
         res = lp_build_select(bld, cmp, bld->one, res);
      }

      return res;
   }

   return lp_build_rcp(bld, lp_build_sqrt(bld, a));
}


/**
 * Check whether a fast (inaccurate) rsqrt instruction is available.
 * (The caller may want to avoid calling rsqrt_fast if it's not available,
 * i.e. for calculating x^0.5 it may do rsqrt_fast(x) * x, but if that's
 * unavailable it would result in sqrt/div/mul, so it is obviously
 * much better to just call sqrt, skipping both div and mul.)
 */
boolean
lp_build_fast_rsqrt_available(struct lp_type type)
{
   assert(type.floating);

   if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
       (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
      return TRUE;
   }
   return FALSE;
}


/**
 * Generate 1/sqrt(a).
 * Result is undefined for values < 0, infinity for +0.
 * Precision is limited, only ~10 bits guaranteed
 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
 */
LLVMValueRef
lp_build_fast_rsqrt(struct lp_build_context *bld,
                    LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   if (lp_build_fast_rsqrt_available(type)) {
      const char *intrinsic = NULL;

      if (type.length == 4) {
         intrinsic = "llvm.x86.sse.rsqrt.ps";
      }
      else {
         intrinsic = "llvm.x86.avx.rsqrt.ps.256";
      }
      return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
   }
   else {
      debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
   }
   return lp_build_rcp(bld, lp_build_sqrt(bld, a));
}


/**
 * Generate sin(a) or cos(a) using polynomial approximation.
 * TODO: it might be worth recognizing sin and cos using same source
 * (i.e. d3d10 sincos opcode). Obviously doing both at the same time
 * would be way cheaper than calculating (nearly) everything twice...
 * Not sure it's common enough to be worth bothering however, scs
 * opcode could also benefit from calculating both though.
 */
static LLVMValueRef
lp_build_sin_or_cos(struct lp_build_context *bld,
                    LLVMValueRef a,
                    boolean cos)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef b = gallivm->builder;
   struct lp_type int_type = lp_int_type(bld->type);

   /*
    * take the absolute value,
    * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
    */

   LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");

   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");

   /*
    * scale by 4/Pi
    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
    */

   LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");

   /*
    * store the integer part of y in mm0
    * emm2 = _mm_cvttps_epi32(y);
    */

   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");

   /*
    * j=(j+1) & (~1) (see the cephes sources)
    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
    */

   LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
   LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
   /*
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
    */
   LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
   LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");

   /*
    * y = _mm_cvtepi32_ps(emm2);
    */
   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");

   LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
   LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
   LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
   LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);

   /*
    * Argument used for poly selection and sign bit determination
    * is different for sin vs. cos.
    */
   LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
                               emm2_and;

   LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
                                              LLVMBuildNot(b, emm2_2, ""), ""),
                                              const_29, "sign_bit") :
                                 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
                                              LLVMBuildShl(b, emm2_add,
                                                           const_29, ""), ""),
                                              sign_mask, "sign_bit");

   /*
    * get the polynomial selection mask
    * there is one polynomial for 0 <= x <= Pi/4
    * and another one for Pi/4 < x <= Pi/2
    * Both branches will be computed.
    *
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
    */

   LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
   LLVMValueRef poly_mask = lp_build_compare(gallivm,
                                             int_type, PIPE_FUNC_EQUAL,
                                             emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));

   /*
    * _PS_CONST(minus_cephes_DP1, -0.78515625);
    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
    */
   LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
   LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
   LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);

   /*
    * The magic pass: "Extended precision modular arithmetic"
    * x = ((x - y * DP1) - y * DP2) - y * DP3;
    */
   LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
   LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
   LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);

   /*
    * Evaluate the first polynomial (0 <= x <= Pi/4)
    *
    * z = _mm_mul_ps(x,x);
    */
   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");

   /*
    * _PS_CONST(coscof_p0,  2.443315711809948E-005);
    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
    * _PS_CONST(coscof_p2,  4.166664568298827E-002);
    */
   LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
   LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
   LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);

   /*
    * y = *(v4sf*)_ps_coscof_p0;
    * y = _mm_mul_ps(y, z);
    */
   LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
   LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");

   /*
    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
    * y = _mm_sub_ps(y, tmp);
    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
    */
   LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_9");
   LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_10");

   /*
    * _PS_CONST(sincof_p0, -1.9515295891E-4);
    * _PS_CONST(sincof_p1,  8.3321608736E-3);
    * _PS_CONST(sincof_p2, -1.6666654611E-1);
    */
   LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
   LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
   LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);

   /*
    * Evaluate the second polynomial (Pi/4 < x <= Pi/2)
    *
    * y2 = *(v4sf*)_ps_sincof_p0;
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_mul_ps(y2, x);
    * y2 = _mm_add_ps(y2, x);
    */

   LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
   LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
   LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);

   /*
    * select the correct result from the two polynomials
    * xmm3 = poly_mask;
    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
    * y = _mm_andnot_ps(xmm3, y);
    * y = _mm_or_ps(y,y2);
    */
   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
   LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
   LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");

   /*
    * update the sign
    * y = _mm_xor_ps(y, sign_bit);
    */
   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");

   LLVMValueRef isfinite = lp_build_isfinite(bld, a);

   /* clamp output to be within [-1, 1] */
   y_result = lp_build_clamp(bld, y_result,
                             lp_build_const_vec(bld->gallivm, bld->type, -1.f),
                             lp_build_const_vec(bld->gallivm, bld->type, 1.f));
   /* If a is -inf, inf or NaN then return NaN */
   y_result = lp_build_select(bld, isfinite, y_result,
                              lp_build_const_vec(bld->gallivm, bld->type, NAN));
   return y_result;
}


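/*
 * Editor's illustrative sketch (hypothetical helper, not used by the build):
 * the cephes-style computation above, in scalar form for the sin case only
 * and without the finite/clamp fixups. Octant selection, the three-constant
 * argument reduction (DP1/DP2/DP3) and the two polynomials correspond to the
 * builder calls above; like the vector version it is only meaningful for
 * moderate |x|.
 */
static inline float
lp_sin_scalar_sketch(float x)
{
   float ax = fabsf(x);
   int j = (int)(ax * 1.27323954473516f);      /* scale by 4/Pi, truncate */
   float y, z, poly;
   int sign;

   j = (j + 1) & ~1;                           /* round octant up to even */
   y = (float)j;
   /* extended precision modular arithmetic: ax - j*Pi/4 in three steps */
   ax = ((ax - y * 0.78515625f)
            - y * 2.4187564849853515625e-4f)
            - y * 3.77489497744594108e-8f;
   z = ax * ax;
   poly = (j & 2)
      /* cosine polynomial for the other octant pair */
      ? 1.0f - 0.5f * z + z * z * (4.166664568298827e-2f
                                   + z * (-1.388731625493765e-3f
                                   + z * 2.443315711809948e-5f))
      /* sine polynomial for 0 <= reduced x <= Pi/4 */
      : ax + ax * z * (-1.6666654611e-1f
                       + z * (8.3321608736e-3f
                       + z * -1.9515295891e-4f));
   /* sign flips every half turn, combined with the sign of the input */
   sign = ((j & 4) != 0) ^ (x < 0.0f);
   return sign ? -poly : poly;
}

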
/**
 * Generate sin(a).
 */
LLVMValueRef
lp_build_sin(struct lp_build_context *bld,
             LLVMValueRef a)
{
   return lp_build_sin_or_cos(bld, a, FALSE);
}


/**
 * Generate cos(a).
 */
LLVMValueRef
lp_build_cos(struct lp_build_context *bld,
             LLVMValueRef a)
{
   return lp_build_sin_or_cos(bld, a, TRUE);
}


/**
 * Generate pow(x, y).
 */
LLVMValueRef
lp_build_pow(struct lp_build_context *bld,
             LLVMValueRef x,
             LLVMValueRef y)
{
   /* TODO: optimize the constant case */
   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
       LLVMIsConstant(x) && LLVMIsConstant(y)) {
      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                   __FUNCTION__);
   }

   return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
}


/**
 * Generate exp(x), computed as exp2(x * log2(e)).
 */
LLVMValueRef
lp_build_exp(struct lp_build_context *bld,
             LLVMValueRef x)
{
   /* log2(e) = 1/log(2) */
   LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
                                           1.4426950408889634);

   assert(lp_check_value(bld->type, x));

   return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
}


/**
 * Generate log(x).
 * Behavior is undefined with infs, 0s and nans.
 */
LLVMValueRef
lp_build_log(struct lp_build_context *bld,
             LLVMValueRef x)
{
   /* log(2) */
   LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
                                          0.69314718055994529);

   assert(lp_check_value(bld->type, x));

   return lp_build_mul(bld, log2, lp_build_log2(bld, x));
}


/**
 * Generate log(x) that handles edge cases (infs, 0s and nans).
 */
LLVMValueRef
lp_build_log_safe(struct lp_build_context *bld,
                  LLVMValueRef x)
{
   /* log(2) */
   LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
                                          0.69314718055994529);

   assert(lp_check_value(bld->type, x));

   return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
}


/**
 * Generate polynomial.
 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
 */
static LLVMValueRef
lp_build_polynomial(struct lp_build_context *bld,
                    LLVMValueRef x,
                    const double *coeffs,
                    unsigned num_coeffs)
{
   const struct lp_type type = bld->type;
   LLVMValueRef even = NULL, odd = NULL;
   LLVMValueRef x2;
   unsigned i;

   assert(lp_check_value(bld->type, x));

   /* TODO: optimize the constant case */
   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
       LLVMIsConstant(x)) {
      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                   __FUNCTION__);
   }

   /*
    * Calculate odd and even terms separately to decrease data dependency
    * Ex:
    *     c[0] + x^2 * c[2] + x^4 * c[4] ...
    *     + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
    */
   x2 = lp_build_mul(bld, x, x);

   for (i = num_coeffs; i--; ) {
      LLVMValueRef coeff;

      coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);

      if (i % 2 == 0) {
         if (even)
            even = lp_build_mad(bld, x2, even, coeff);
         else
            even = coeff;
      } else {
         if (odd)
            odd = lp_build_mad(bld, x2, odd, coeff);
         else
            odd = coeff;
      }
   }

   if (odd)
      return lp_build_mad(bld, odd, x, even);
   else if (even)
      return even;
   else
      return bld->undef;
}


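/*
 * Editor's illustrative sketch (hypothetical helper, not used by the build):
 * the even/odd split above in scalar form for a 4-coefficient polynomial.
 * Both halves are Horner schemes in x^2, so they can issue in parallel, and
 * one final multiply-add joins them.
 */
static inline float
lp_polynomial_scalar_sketch(float x, const double c[4])
{
   float x2 = x * x;
   float even = (float)c[2] * x2 + (float)c[0];  /* c0 + c2*x^2 */
   float odd  = (float)c[3] * x2 + (float)c[1];  /* c1 + c3*x^2 */
   return odd * x + even;                        /* even + x*odd */
}

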
/**
 * Minimax polynomial fit of 2**x, in range [0, 1[
 */
const double lp_build_exp2_polynomial[] = {
#if EXP_POLY_DEGREE == 5
   1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
   0.693153073200168932794,
   0.240153617044375388211,
   0.0558263180532956664775,
   0.00898934009049466391101,
   0.00187757667519147912699
#elif EXP_POLY_DEGREE == 4
   1.00000259337069434683,
   0.693003834469974940458,
   0.24144275689150793076,
   0.0520114606103070150235,
   0.0135341679161270268764
#elif EXP_POLY_DEGREE == 3
   0.999925218562710312959,
   0.695833540494823811697,
   0.226067155427249155588,
   0.0780245226406372992967
#elif EXP_POLY_DEGREE == 2
   1.00172476321474503578,
   0.657636275736077639316,
   0.33718943461968720704
#else
#error
#endif
};


LLVMValueRef
lp_build_exp2(struct lp_build_context *bld,
              LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMValueRef ipart = NULL;
   LLVMValueRef fpart = NULL;
   LLVMValueRef expipart = NULL;
   LLVMValueRef expfpart = NULL;
   LLVMValueRef res = NULL;

   assert(lp_check_value(bld->type, x));

   /* TODO: optimize the constant case */
   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
       LLVMIsConstant(x)) {
      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                   __FUNCTION__);
   }

   assert(type.floating && type.width == 32);

   /* We want to preserve NaN and make sure that for exp2 if x > 128,
    * the result is INF and if it's smaller than -126.9 the result is 0 */
   x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
                        GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
   x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
                        x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);

   /* ipart = floor(x) */
   /* fpart = x - ipart */
   lp_build_ifloor_fract(bld, x, &ipart, &fpart);

   /* expipart = (float) (1 << ipart) */
   expipart = LLVMBuildAdd(builder, ipart,
                           lp_build_const_int_vec(bld->gallivm, type, 127), "");
   expipart = LLVMBuildShl(builder, expipart,
                           lp_build_const_int_vec(bld->gallivm, type, 23), "");
   expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");

   expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
                                  ARRAY_SIZE(lp_build_exp2_polynomial));

   res = LLVMBuildFMul(builder, expipart, expfpart, "");

   return res;
}


/**
 * Extract the exponent of an IEEE-754 floating point value.
 *
 * Optionally apply an integer bias.
 *
 * Result is an integer value with
 *
 *   ifloor(log2(x)) + bias
 */
LLVMValueRef
lp_build_extract_exponent(struct lp_build_context *bld,
                          LLVMValueRef x,
                          int bias)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   unsigned mantissa = lp_mantissa(type);
   LLVMValueRef res;

   assert(type.floating);

   assert(lp_check_value(bld->type, x));

   x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");

   res = LLVMBuildLShr(builder, x,
                       lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
   res = LLVMBuildAnd(builder, res,
                      lp_build_const_int_vec(bld->gallivm, type, 255), "");
   res = LLVMBuildSub(builder, res,
                      lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");

   return res;
}


/**
 * Extract the mantissa of a floating point value.
 *
 * Result is a floating point value with
 *
 *   x / 2**floor(log2(x))
 */
LLVMValueRef
lp_build_extract_mantissa(struct lp_build_context *bld,
                          LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   unsigned mantissa = lp_mantissa(type);
   LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
                                                  (1ULL << mantissa) - 1);
   LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
   LLVMValueRef res;

   assert(lp_check_value(bld->type, x));

   assert(type.floating);

   x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");

   /* res = x / 2**ipart */
   res = LLVMBuildAnd(builder, x, mantmask, "");
   res = LLVMBuildOr(builder, res, one, "");
   res = LLVMBuildBitCast(builder, res, bld->vec_type, "");

   return res;
}


/**
 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x),
 * for x in range of [0, 1/9[
 * These coefficients can be generated with
 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
 */
const double lp_build_log2_polynomial[] = {
#if LOG_POLY_DEGREE == 5
   2.88539008148777786488L,
   0.961796878841293367824L,
   0.577058946784739859012L,
   0.412914355135828735411L,
   0.308591899232910175289L,
   0.352376952300281371868L,
#elif LOG_POLY_DEGREE == 4
   2.88539009343309178325L,
   0.961791550404184197881L,
   0.577440339438736392009L,
   0.403343858251329912514L,
   0.406718052498846252698L,
#elif LOG_POLY_DEGREE == 3
   2.88538959748872753838L,
   0.961932915889597772928L,
   0.571118517972136195241L,
   0.493997535084709500285L,
#else
#error
#endif
};


/**
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 * http://en.wikipedia.org/wiki/Logarithm#Calculation
 * http://www.nezumi.demon.co.uk/consult/logx.htm
 *
 * If handle_edge_cases is true the function will perform computations
 * to match the required D3D10+ behavior for each of the edge cases.
 * That means that if input is:
 * - less than zero (to and including -inf) then NaN will be returned
 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
 * - +infinity, then +infinity will be returned
 * - NaN, then NaN will be returned
 *
 * Those checks are fairly expensive so if you don't need them make sure
 * handle_edge_cases is false.
 */
void
lp_build_log2_approx(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef *p_exp,
                     LLVMValueRef *p_floor_log2,
                     LLVMValueRef *p_log2,
                     boolean handle_edge_cases)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);

   LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
   LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
   LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);

   LLVMValueRef i = NULL;
   LLVMValueRef y = NULL;
   LLVMValueRef z = NULL;
   LLVMValueRef exp = NULL;
   LLVMValueRef mant = NULL;
   LLVMValueRef logexp = NULL;
   LLVMValueRef p_z = NULL;
   LLVMValueRef res = NULL;

   assert(lp_check_value(bld->type, x));

   if(p_exp || p_floor_log2 || p_log2) {
      /* TODO: optimize the constant case */
      if (gallivm_debug & GALLIVM_DEBUG_PERF &&
          LLVMIsConstant(x)) {
         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                      __FUNCTION__);
      }

      assert(type.floating && type.width == 32);

      /*
       * We don't explicitly handle denormalized numbers. They will yield a
       * result in the neighbourhood of -127, which appears to be adequate.
       */

      i = LLVMBuildBitCast(builder, x, int_vec_type, "");

      /* exp = (float) exponent(x) */
      exp = LLVMBuildAnd(builder, i, expmask, "");
   }

   if(p_floor_log2 || p_log2) {
      logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
      logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
      logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
   }

   if (p_log2) {
      /* mant = 1 + (float) mantissa(x) */
      mant = LLVMBuildAnd(builder, i, mantmask, "");
      mant = LLVMBuildOr(builder, mant, one, "");
      mant = LLVMBuildBitCast(builder, mant, vec_type, "");

      /* y = (mant - 1) / (mant + 1) */
      y = lp_build_div(bld,
         lp_build_sub(bld, mant, bld->one),
         lp_build_add(bld, mant, bld->one)
      );

      /* z = y^2 */
      z = lp_build_mul(bld, y, y);

      /* compute P(z) */
      p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
                                ARRAY_SIZE(lp_build_log2_polynomial));

      /* y * P(z) + logexp */
      res = lp_build_mad(bld, y, p_z, logexp);

      if (type.floating && handle_edge_cases) {
         LLVMValueRef negmask, infmask, zmask;
         negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
                                lp_build_const_vec(bld->gallivm, type, 0.0f));
         zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
                              lp_build_const_vec(bld->gallivm, type, 0.0f));
         infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
                                lp_build_const_vec(bld->gallivm, type, INFINITY));

         /* If x is equal to inf make sure we return inf */
         res = lp_build_select(bld, infmask,
                               lp_build_const_vec(bld->gallivm, type, INFINITY),
                               res);
         /* If x is equal to 0, return -inf */
         res = lp_build_select(bld, zmask,
                               lp_build_const_vec(bld->gallivm, type, -INFINITY),
                               res);
         /* If x is nan or less than 0, return nan */
         res = lp_build_select(bld, negmask,
                               lp_build_const_vec(bld->gallivm, type, NAN),
                               res);
      }
   }

   if (p_exp) {
      exp = LLVMBuildBitCast(builder, exp, vec_type, "");
      *p_exp = exp;
   }

   if (p_floor_log2)
      *p_floor_log2 = logexp;

   if (p_log2)
      *p_log2 = res;
}


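/*
 * Editor's illustrative sketch (hypothetical helper, not used by the build):
 * the scalar shape of the approximation above, without the edge-case
 * selects. The mantissa m in [1, 2) is mapped to y = (m-1)/(m+1) in
 * [0, 1/3), and log2(m) = y * P(y^2) with the polynomial table defined
 * above (the 3rd-degree variant is inlined), so log2(x) = exponent +
 * y * P(y^2). Assumes the usual fixed-width types are in scope.
 */
static inline float
lp_log2_scalar_sketch(float x)
{
   union { float f; int32_t i; } u = { x };
   union { int32_t i; float f; } m;
   float exponent = (float)(((u.i & 0x7f800000) >> 23) - 127);
   float y, z, p;

   m.i = (u.i & 0x007fffff) | 0x3f800000;    /* mantissa in [1, 2) */
   y = (m.f - 1.0f) / (m.f + 1.0f);
   z = y * y;
   p = 2.88538959748872753838f
     + z * (0.961932915889597772928f
     + z * (0.571118517972136195241f
     + z * 0.493997535084709500285f));
   return exponent + y * p;
}

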
/*
 * log2 implementation which doesn't have special code to
 * handle edge cases (-inf, 0, inf, NaN). It's faster but
 * the results for those cases are undefined.
 */
LLVMValueRef
lp_build_log2(struct lp_build_context *bld,
              LLVMValueRef x)
{
   LLVMValueRef res;
   lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
   return res;
}


/*
 * Version of log2 which handles all edge cases.
 * Look at documentation of lp_build_log2_approx for
 * description of the behavior for each of the edge cases.
 */
LLVMValueRef
lp_build_log2_safe(struct lp_build_context *bld,
                   LLVMValueRef x)
{
   LLVMValueRef res;
   lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
   return res;
}


/**
 * Faster (and less accurate) log2.
 *
 *    log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
 *
 * Piece-wise linear approximation, with exact results when x is a power of
 * two.
 *
 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
 */
LLVMValueRef
lp_build_fast_log2(struct lp_build_context *bld,
                   LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef ipart;
   LLVMValueRef fpart;

   assert(lp_check_value(bld->type, x));

   assert(bld->type.floating);

   /* ipart = floor(log2(x)) - 1 */
   ipart = lp_build_extract_exponent(bld, x, -1);
   ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");

   /* fpart = x / 2**ipart */
   fpart = lp_build_extract_mantissa(bld, x);

   /* ipart + fpart */
   return LLVMBuildFAdd(builder, ipart, fpart, "");
}


/**
 * Fast implementation of iround(log2(x)).
 *
 * Not an approximation -- it should give accurate results all the time.
 */
LLVMValueRef
lp_build_ilog2(struct lp_build_context *bld,
               LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
   LLVMValueRef ipart;

   assert(bld->type.floating);

   assert(lp_check_value(bld->type, x));

   /* x * 2^(0.5)   i.e., add 0.5 to the log2(x) */
   x = LLVMBuildFMul(builder, x, sqrt2, "");

   /* ipart = floor(log2(x) + 0.5)  */
   ipart = lp_build_extract_exponent(bld, x, 0);

   return ipart;
}


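/*
 * Editor's illustrative sketch (hypothetical helper, not used by the build):
 * multiplying by sqrt(2) adds 0.5 to log2(x), so taking the unbiased
 * exponent of the product floors log2(x) + 0.5, i.e. rounds log2(x) to
 * the nearest integer.
 */
static inline int
lp_ilog2_scalar_sketch(float x)
{
   union { float f; int32_t i; } u;
   u.f = x * 1.41421356237309504880f;        /* M_SQRT2 */
   return ((u.i >> 23) & 0xff) - 127;        /* iround(log2(x)) */
}

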
LLVMValueRef
lp_build_mod(struct lp_build_context *bld,
             LLVMValueRef x,
             LLVMValueRef y)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef res;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, x));
   assert(lp_check_value(type, y));

   if (type.floating)
      res = LLVMBuildFRem(builder, x, y, "");
   else if (type.sign)
      res = LLVMBuildSRem(builder, x, y, "");
   else
      res = LLVMBuildURem(builder, x, y, "");
   return res;
}


/*
 * For floating inputs it creates and returns a mask
 * which is all 1's for channels which are NaN.
 * Channels inside x which are not NaN will be 0.
 */
LLVMValueRef
lp_build_isnan(struct lp_build_context *bld,
               LLVMValueRef x)
{
   LLVMValueRef mask;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);

   assert(bld->type.floating);
   assert(lp_check_value(bld->type, x));

   mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
                        "isnotnan");
   mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
   mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
   return mask;
}


/* Returns all 1's for floating point numbers that are
 * finite numbers and returns all zeros for -inf,
 * inf and nan's */
LLVMValueRef
lp_build_isfinite(struct lp_build_context *bld,
                  LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
   struct lp_type int_type = lp_int_type(bld->type);
   LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
   LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
                                                    0x7f800000);

   if (!bld->type.floating) {
      return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
   }
   assert(bld->type.floating);
   assert(lp_check_value(bld->type, x));
   assert(bld->type.width == 32);

   intx = LLVMBuildAnd(builder, intx, infornan32, "");
   return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
                           intx, infornan32);
}


/*
 * Returns true if the number is nan or inf and false otherwise.
 * The input has to be a floating point vector.
 */
LLVMValueRef
lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
                       const struct lp_type type,
                       LLVMValueRef x)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type int_type = lp_int_type(type);
   LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
                                                0x7f800000);
   LLVMValueRef ret;

   assert(type.floating);

   ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
   ret = LLVMBuildAnd(builder, ret, const0, "");
   ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
                          ret, const0);

   return ret;
}


LLVMValueRef
lp_build_fpstate_get(struct gallivm_state *gallivm)
{
   if (util_cpu_caps.has_sse) {
      LLVMBuilderRef builder = gallivm->builder;
      LLVMValueRef mxcsr_ptr = lp_build_alloca(
         gallivm,
         LLVMInt32TypeInContext(gallivm->context),
         "mxcsr_ptr");
      LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
          LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
      lp_build_intrinsic(builder,
                         "llvm.x86.sse.stmxcsr",
                         LLVMVoidTypeInContext(gallivm->context),
                         &mxcsr_ptr8, 1, 0);
      return mxcsr_ptr;
   }
   return 0;
}


void
lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
                                  boolean zero)
{
   if (util_cpu_caps.has_sse) {
      /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
      int daz_ftz = _MM_FLUSH_ZERO_MASK;

      LLVMBuilderRef builder = gallivm->builder;
      LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
      LLVMValueRef mxcsr =
         LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");

      if (util_cpu_caps.has_daz) {
         /* Enable denormals are zero mode */
         daz_ftz |= _MM_DENORMALS_ZERO_MASK;
      }
      if (zero) {
         mxcsr = LLVMBuildOr(builder, mxcsr,
                             LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
      } else {
         mxcsr = LLVMBuildAnd(builder, mxcsr,
                              LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
      }

      LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
      lp_build_fpstate_set(gallivm, mxcsr_ptr);
   }
}


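/*
 * Editor's illustrative sketch (hypothetical helper, not used by the build):
 * what the generated code does to MXCSR, expressed with the host-side SSE
 * intrinsics from <xmmintrin.h>. Assumes an SSE-capable host; like the
 * builder version, DAZ additionally requires hardware support (has_daz).
 */
#if defined(PIPE_ARCH_SSE)
static inline void
lp_set_denorms_zero_host_sketch(boolean zero)
{
   unsigned mask = _MM_FLUSH_ZERO_MASK | _MM_DENORMALS_ZERO_MASK;
   unsigned mxcsr = _mm_getcsr();
   mxcsr = zero ? (mxcsr | mask) : (mxcsr & ~mask);
   _mm_setcsr(mxcsr);
}
#endif

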
void
lp_build_fpstate_set(struct gallivm_state *gallivm,
                     LLVMValueRef mxcsr_ptr)
{
   if (util_cpu_caps.has_sse) {
      LLVMBuilderRef builder = gallivm->builder;
      mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
                     LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
      lp_build_intrinsic(builder,
                         "llvm.x86.sse.ldmxcsr",
                         LLVMVoidTypeInContext(gallivm->context),