gallivm: fix pointer type for stmxcsr/ldmxcsr
[mesa.git] src/gallium/auxiliary/gallivm/lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include <float.h>
49
50 #include "util/u_memory.h"
51 #include "util/u_debug.h"
52 #include "util/u_math.h"
53 #include "util/u_string.h"
54 #include "util/u_cpu_detect.h"
55
56 #include "lp_bld_type.h"
57 #include "lp_bld_const.h"
58 #include "lp_bld_init.h"
59 #include "lp_bld_intr.h"
60 #include "lp_bld_logic.h"
61 #include "lp_bld_pack.h"
62 #include "lp_bld_debug.h"
63 #include "lp_bld_bitarit.h"
64 #include "lp_bld_arit.h"
65 #include "lp_bld_flow.h"
66
67 #if defined(PIPE_ARCH_SSE)
68 #include <xmmintrin.h>
69 #endif
70
71 #ifndef _MM_DENORMALS_ZERO_MASK
72 #define _MM_DENORMALS_ZERO_MASK 0x0040
73 #endif
74
75 #ifndef _MM_FLUSH_ZERO_MASK
76 #define _MM_FLUSH_ZERO_MASK 0x8000
77 #endif
78
79 #define EXP_POLY_DEGREE 5
80
81 #define LOG_POLY_DEGREE 4
82
83
84 /**
85 * Generate min(a, b)
86 * No checks for special case values of a or b = 1 or 0 are done.
87 * NaN's are handled according to the behavior specified by the
88 * nan_behavior argument.
89 */
90 static LLVMValueRef
91 lp_build_min_simple(struct lp_build_context *bld,
92 LLVMValueRef a,
93 LLVMValueRef b,
94 enum gallivm_nan_behavior nan_behavior)
95 {
96 const struct lp_type type = bld->type;
97 const char *intrinsic = NULL;
98 unsigned intr_size = 0;
99 LLVMValueRef cond;
100
101 assert(lp_check_value(type, a));
102 assert(lp_check_value(type, b));
103
104 /* TODO: optimize the constant case */
105
106 if (type.floating && util_cpu_caps.has_sse) {
107 if (type.width == 32) {
108 if (type.length == 1) {
109 intrinsic = "llvm.x86.sse.min.ss";
110 intr_size = 128;
111 }
112 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
113 intrinsic = "llvm.x86.sse.min.ps";
114 intr_size = 128;
115 }
116 else {
117 intrinsic = "llvm.x86.avx.min.ps.256";
118 intr_size = 256;
119 }
120 }
121 if (type.width == 64 && util_cpu_caps.has_sse2) {
122 if (type.length == 1) {
123 intrinsic = "llvm.x86.sse2.min.sd";
124 intr_size = 128;
125 }
126 else if (type.length == 2 || !util_cpu_caps.has_avx) {
127 intrinsic = "llvm.x86.sse2.min.pd";
128 intr_size = 128;
129 }
130 else {
131 intrinsic = "llvm.x86.avx.min.pd.256";
132 intr_size = 256;
133 }
134 }
135 }
136 else if (type.floating && util_cpu_caps.has_altivec) {
137 if (nan_behavior == GALLIVM_NAN_RETURN_NAN) {
138 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
139 __FUNCTION__);
140 }
141 if (type.width == 32 && type.length == 4) {
142 intrinsic = "llvm.ppc.altivec.vminfp";
143 intr_size = 128;
144 }
145 } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
146 intr_size = 128;
147 if ((type.width == 8 || type.width == 16) &&
148 (type.width * type.length <= 64) &&
149 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
150 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
151 __FUNCTION__);
152 }
153 if (type.width == 8 && !type.sign) {
154 intrinsic = "llvm.x86.sse2.pminu.b";
155 }
156 else if (type.width == 16 && type.sign) {
157 intrinsic = "llvm.x86.sse2.pmins.w";
158 }
159 if (util_cpu_caps.has_sse4_1) {
160 if (type.width == 8 && type.sign) {
161 intrinsic = "llvm.x86.sse41.pminsb";
162 }
163 if (type.width == 16 && !type.sign) {
164 intrinsic = "llvm.x86.sse41.pminuw";
165 }
166 if (type.width == 32 && !type.sign) {
167 intrinsic = "llvm.x86.sse41.pminud";
168 }
169 if (type.width == 32 && type.sign) {
170 intrinsic = "llvm.x86.sse41.pminsd";
171 }
172 }
173 } else if (util_cpu_caps.has_altivec) {
174 intr_size = 128;
175 if (type.width == 8) {
176 if (!type.sign) {
177 intrinsic = "llvm.ppc.altivec.vminub";
178 } else {
179 intrinsic = "llvm.ppc.altivec.vminsb";
180 }
181 } else if (type.width == 16) {
182 if (!type.sign) {
183 intrinsic = "llvm.ppc.altivec.vminuh";
184 } else {
185 intrinsic = "llvm.ppc.altivec.vminsh";
186 }
187 } else if (type.width == 32) {
188 if (!type.sign) {
189 intrinsic = "llvm.ppc.altivec.vminuw";
190 } else {
191 intrinsic = "llvm.ppc.altivec.vminsw";
192 }
193 }
194 }
195
196 if(intrinsic) {
 197       /* We need to handle NaNs for floating point numbers. If one of the
 198        * inputs is NaN the other should be returned (required by both D3D10+
 199        * and OpenCL).
 200        * The SSE intrinsics return the second operand in case of NaN by
 201        * default, so we need special code to handle those cases.
 202        */
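      /* Concretely (assuming x is not NaN): minps(NaN, x) == x, but
       * minps(x, NaN) == NaN.  Hence for GALLIVM_NAN_RETURN_OTHER we must
       * select a whenever b is NaN, and for GALLIVM_NAN_RETURN_NAN we must
       * select a whenever a is NaN. */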
203 if (util_cpu_caps.has_sse && type.floating &&
204 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
205 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN) {
206 LLVMValueRef isnan, max;
207 max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
208 type,
209 intr_size, a, b);
210 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
211 isnan = lp_build_isnan(bld, b);
212 return lp_build_select(bld, isnan, a, max);
213 } else {
214 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
215 isnan = lp_build_isnan(bld, a);
216 return lp_build_select(bld, isnan, a, max);
217 }
218 } else {
219 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
220 type,
221 intr_size, a, b);
222 }
223 }
224
225 if (type.floating) {
226 switch (nan_behavior) {
227 case GALLIVM_NAN_RETURN_NAN: {
228 LLVMValueRef isnan = lp_build_isnan(bld, b);
229 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
230 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
231 return lp_build_select(bld, cond, a, b);
232 }
233 break;
234 case GALLIVM_NAN_RETURN_OTHER: {
235 LLVMValueRef isnan = lp_build_isnan(bld, a);
236 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
237 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
238 return lp_build_select(bld, cond, a, b);
239 }
240 break;
241 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
242 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
243 return lp_build_select(bld, cond, a, b);
244 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
245 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
246 return lp_build_select(bld, cond, a, b);
247 break;
248 default:
249 assert(0);
250 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
251 return lp_build_select(bld, cond, a, b);
252 }
253 } else {
254 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
255 return lp_build_select(bld, cond, a, b);
256 }
257 }
258
259
260 /**
261 * Generate max(a, b)
262 * No checks for special case values of a or b = 1 or 0 are done.
263 * NaN's are handled according to the behavior specified by the
264 * nan_behavior argument.
265 */
266 static LLVMValueRef
267 lp_build_max_simple(struct lp_build_context *bld,
268 LLVMValueRef a,
269 LLVMValueRef b,
270 enum gallivm_nan_behavior nan_behavior)
271 {
272 const struct lp_type type = bld->type;
273 const char *intrinsic = NULL;
274 unsigned intr_size = 0;
275 LLVMValueRef cond;
276
277 assert(lp_check_value(type, a));
278 assert(lp_check_value(type, b));
279
280 /* TODO: optimize the constant case */
281
282 if (type.floating && util_cpu_caps.has_sse) {
283 if (type.width == 32) {
284 if (type.length == 1) {
285 intrinsic = "llvm.x86.sse.max.ss";
286 intr_size = 128;
287 }
288 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
289 intrinsic = "llvm.x86.sse.max.ps";
290 intr_size = 128;
291 }
292 else {
293 intrinsic = "llvm.x86.avx.max.ps.256";
294 intr_size = 256;
295 }
296 }
297 if (type.width == 64 && util_cpu_caps.has_sse2) {
298 if (type.length == 1) {
299 intrinsic = "llvm.x86.sse2.max.sd";
300 intr_size = 128;
301 }
302 else if (type.length == 2 || !util_cpu_caps.has_avx) {
303 intrinsic = "llvm.x86.sse2.max.pd";
304 intr_size = 128;
305 }
306 else {
307 intrinsic = "llvm.x86.avx.max.pd.256";
308 intr_size = 256;
309 }
310 }
311 }
312 else if (type.floating && util_cpu_caps.has_altivec) {
313 if (nan_behavior == GALLIVM_NAN_RETURN_NAN) {
314 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
315 __FUNCTION__);
316 }
 317       if (type.width == 32 && type.length == 4) {
318 intrinsic = "llvm.ppc.altivec.vmaxfp";
319 intr_size = 128;
320 }
321 } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
322 intr_size = 128;
323 if ((type.width == 8 || type.width == 16) &&
324 (type.width * type.length <= 64) &&
325 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
326 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
327 __FUNCTION__);
328 }
329 if (type.width == 8 && !type.sign) {
330 intrinsic = "llvm.x86.sse2.pmaxu.b";
331 intr_size = 128;
332 }
333 else if (type.width == 16 && type.sign) {
334 intrinsic = "llvm.x86.sse2.pmaxs.w";
335 }
336 if (util_cpu_caps.has_sse4_1) {
337 if (type.width == 8 && type.sign) {
338 intrinsic = "llvm.x86.sse41.pmaxsb";
339 }
340 if (type.width == 16 && !type.sign) {
341 intrinsic = "llvm.x86.sse41.pmaxuw";
342 }
343 if (type.width == 32 && !type.sign) {
344 intrinsic = "llvm.x86.sse41.pmaxud";
345 }
346 if (type.width == 32 && type.sign) {
347 intrinsic = "llvm.x86.sse41.pmaxsd";
348 }
349 }
350 } else if (util_cpu_caps.has_altivec) {
351 intr_size = 128;
352 if (type.width == 8) {
353 if (!type.sign) {
354 intrinsic = "llvm.ppc.altivec.vmaxub";
355 } else {
356 intrinsic = "llvm.ppc.altivec.vmaxsb";
357 }
358 } else if (type.width == 16) {
359 if (!type.sign) {
360 intrinsic = "llvm.ppc.altivec.vmaxuh";
361 } else {
362 intrinsic = "llvm.ppc.altivec.vmaxsh";
363 }
364 } else if (type.width == 32) {
365 if (!type.sign) {
366 intrinsic = "llvm.ppc.altivec.vmaxuw";
367 } else {
368 intrinsic = "llvm.ppc.altivec.vmaxsw";
369 }
370 }
371 }
372
373 if(intrinsic) {
374 if (util_cpu_caps.has_sse && type.floating &&
375 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
376 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN) {
377 LLVMValueRef isnan, min;
378 min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
379 type,
380 intr_size, a, b);
381 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
382 isnan = lp_build_isnan(bld, b);
383 return lp_build_select(bld, isnan, a, min);
384 } else {
385 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
386 isnan = lp_build_isnan(bld, a);
387 return lp_build_select(bld, isnan, a, min);
388 }
389 } else {
390 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
391 type,
392 intr_size, a, b);
393 }
394 }
395
396 if (type.floating) {
397 switch (nan_behavior) {
398 case GALLIVM_NAN_RETURN_NAN: {
399 LLVMValueRef isnan = lp_build_isnan(bld, b);
400 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
401 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
402 return lp_build_select(bld, cond, a, b);
403 }
404 break;
405 case GALLIVM_NAN_RETURN_OTHER: {
406 LLVMValueRef isnan = lp_build_isnan(bld, a);
407 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
408 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
409 return lp_build_select(bld, cond, a, b);
410 }
411 break;
412 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
413 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
414 return lp_build_select(bld, cond, a, b);
415 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
416 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
417 return lp_build_select(bld, cond, a, b);
418 break;
419 default:
420 assert(0);
421 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
422 return lp_build_select(bld, cond, a, b);
423 }
424 } else {
425 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
426 return lp_build_select(bld, cond, a, b);
427 }
428 }
429
430
431 /**
432 * Generate 1 - a, or ~a depending on bld->type.
433 */
434 LLVMValueRef
435 lp_build_comp(struct lp_build_context *bld,
436 LLVMValueRef a)
437 {
438 LLVMBuilderRef builder = bld->gallivm->builder;
439 const struct lp_type type = bld->type;
440
441 assert(lp_check_value(type, a));
442
443 if(a == bld->one)
444 return bld->zero;
445 if(a == bld->zero)
446 return bld->one;
447
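   /* For unsigned normalized types 1.0 is represented as all bits set, so
    * 1 - a is simply the bitwise complement ~a. */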
448 if(type.norm && !type.floating && !type.fixed && !type.sign) {
449 if(LLVMIsConstant(a))
450 return LLVMConstNot(a);
451 else
452 return LLVMBuildNot(builder, a, "");
453 }
454
455 if(LLVMIsConstant(a))
456 if (type.floating)
457 return LLVMConstFSub(bld->one, a);
458 else
459 return LLVMConstSub(bld->one, a);
460 else
461 if (type.floating)
462 return LLVMBuildFSub(builder, bld->one, a, "");
463 else
464 return LLVMBuildSub(builder, bld->one, a, "");
465 }
466
467
468 /**
469 * Generate a + b
470 */
471 LLVMValueRef
472 lp_build_add(struct lp_build_context *bld,
473 LLVMValueRef a,
474 LLVMValueRef b)
475 {
476 LLVMBuilderRef builder = bld->gallivm->builder;
477 const struct lp_type type = bld->type;
478 LLVMValueRef res;
479
480 assert(lp_check_value(type, a));
481 assert(lp_check_value(type, b));
482
483 if(a == bld->zero)
484 return b;
485 if(b == bld->zero)
486 return a;
487 if(a == bld->undef || b == bld->undef)
488 return bld->undef;
489
490 if(bld->type.norm) {
491 const char *intrinsic = NULL;
492
493 if(a == bld->one || b == bld->one)
494 return bld->one;
495
496 if (type.width * type.length == 128 &&
497 !type.floating && !type.fixed) {
498 if(util_cpu_caps.has_sse2) {
499 if(type.width == 8)
500 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
501 if(type.width == 16)
502 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
503 } else if (util_cpu_caps.has_altivec) {
504 if(type.width == 8)
505 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
506 if(type.width == 16)
507 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
508 }
509 }
510
511 if(intrinsic)
512 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
513 }
514
515 /* TODO: handle signed case */
516 if(type.norm && !type.floating && !type.fixed && !type.sign)
517 a = lp_build_min_simple(bld, a, lp_build_comp(bld, b), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
518
519 if(LLVMIsConstant(a) && LLVMIsConstant(b))
520 if (type.floating)
521 res = LLVMConstFAdd(a, b);
522 else
523 res = LLVMConstAdd(a, b);
524 else
525 if (type.floating)
526 res = LLVMBuildFAdd(builder, a, b, "");
527 else
528 res = LLVMBuildAdd(builder, a, b, "");
529
530 /* clamp to ceiling of 1.0 */
531 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
532 res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
533
534 /* XXX clamp to floor of -1 or 0??? */
535
536 return res;
537 }
538
539
540 /** Return the scalar sum of the elements of a.
541 * Should avoid this operation whenever possible.
542 */
543 LLVMValueRef
544 lp_build_horizontal_add(struct lp_build_context *bld,
545 LLVMValueRef a)
546 {
547 LLVMBuilderRef builder = bld->gallivm->builder;
548 const struct lp_type type = bld->type;
549 LLVMValueRef index, res;
550 unsigned i, length;
551 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
552 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
553 LLVMValueRef vecres, elem2;
554
555 assert(lp_check_value(type, a));
556
557 if (type.length == 1) {
558 return a;
559 }
560
561 assert(!bld->type.norm);
562
563 /*
 564    * For byte vectors this could be done much better with psadbw.
565 * Using repeated shuffle/adds here. Note with multiple vectors
566 * this can be done more efficiently as outlined in the intel
567 * optimization manual.
568 * Note: could cause data rearrangement if used with smaller element
569 * sizes.
570 */
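   /*
    * For example, for a length-8 vector the reduction below proceeds as
    *   [a0..a7] -> [a0+a4, a1+a5, a2+a6, a3+a7] -> [a0+a2+a4+a6, a1+a3+a5+a7]
    * and the final two lanes are extracted and added as scalars.
    */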
571
572 vecres = a;
573 length = type.length / 2;
574 while (length > 1) {
575 LLVMValueRef vec1, vec2;
576 for (i = 0; i < length; i++) {
577 shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
578 shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
579 }
580 vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
581 LLVMConstVector(shuffles1, length), "");
582 vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
583 LLVMConstVector(shuffles2, length), "");
584 if (type.floating) {
585 vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
586 }
587 else {
588 vecres = LLVMBuildAdd(builder, vec1, vec2, "");
589 }
590 length = length >> 1;
591 }
592
593 /* always have vector of size 2 here */
594 assert(length == 1);
595
596 index = lp_build_const_int32(bld->gallivm, 0);
597 res = LLVMBuildExtractElement(builder, vecres, index, "");
598 index = lp_build_const_int32(bld->gallivm, 1);
599 elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
600
601 if (type.floating)
602 res = LLVMBuildFAdd(builder, res, elem2, "");
603 else
604 res = LLVMBuildAdd(builder, res, elem2, "");
605
606 return res;
607 }
608
609 /**
610 * Return the horizontal sums of 4 float vectors as a float4 vector.
 611  * This uses the technique outlined in the Intel Optimization Manual.
612 */
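/* Lane i of the result holds src[i][0] + src[i][1] + src[i][2] + src[i][3]. */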
613 static LLVMValueRef
614 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
615 LLVMValueRef src[4])
616 {
617 struct gallivm_state *gallivm = bld->gallivm;
618 LLVMBuilderRef builder = gallivm->builder;
619 LLVMValueRef shuffles[4];
620 LLVMValueRef tmp[4];
621 LLVMValueRef sumtmp[2], shuftmp[2];
622
623 /* lower half of regs */
624 shuffles[0] = lp_build_const_int32(gallivm, 0);
625 shuffles[1] = lp_build_const_int32(gallivm, 1);
626 shuffles[2] = lp_build_const_int32(gallivm, 4);
627 shuffles[3] = lp_build_const_int32(gallivm, 5);
628 tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
629 LLVMConstVector(shuffles, 4), "");
630 tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
631 LLVMConstVector(shuffles, 4), "");
632
633 /* upper half of regs */
634 shuffles[0] = lp_build_const_int32(gallivm, 2);
635 shuffles[1] = lp_build_const_int32(gallivm, 3);
636 shuffles[2] = lp_build_const_int32(gallivm, 6);
637 shuffles[3] = lp_build_const_int32(gallivm, 7);
638 tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
639 LLVMConstVector(shuffles, 4), "");
640 tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
641 LLVMConstVector(shuffles, 4), "");
642
643 sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
644 sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
645
646 shuffles[0] = lp_build_const_int32(gallivm, 0);
647 shuffles[1] = lp_build_const_int32(gallivm, 2);
648 shuffles[2] = lp_build_const_int32(gallivm, 4);
649 shuffles[3] = lp_build_const_int32(gallivm, 6);
650 shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
651 LLVMConstVector(shuffles, 4), "");
652
653 shuffles[0] = lp_build_const_int32(gallivm, 1);
654 shuffles[1] = lp_build_const_int32(gallivm, 3);
655 shuffles[2] = lp_build_const_int32(gallivm, 5);
656 shuffles[3] = lp_build_const_int32(gallivm, 7);
657 shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
658 LLVMConstVector(shuffles, 4), "");
659
660 return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
661 }
662
663
664 /*
665 * partially horizontally add 2-4 float vectors with length nx4,
666 * i.e. only four adjacent values in each vector will be added,
667 * assuming values are really grouped in 4 which also determines
668 * output order.
669 *
670 * Return a vector of the same length as the initial vectors,
671 * with the excess elements (if any) being undefined.
672 * The element order is independent of number of input vectors.
673 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
674 * the output order thus will be
 675  * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
676 */
677 LLVMValueRef
678 lp_build_hadd_partial4(struct lp_build_context *bld,
679 LLVMValueRef vectors[],
680 unsigned num_vecs)
681 {
682 struct gallivm_state *gallivm = bld->gallivm;
683 LLVMBuilderRef builder = gallivm->builder;
684 LLVMValueRef ret_vec;
685 LLVMValueRef tmp[4];
686 const char *intrinsic = NULL;
687
688 assert(num_vecs >= 2 && num_vecs <= 4);
689 assert(bld->type.floating);
690
691 /* only use this with at least 2 vectors, as it is sort of expensive
692 * (depending on cpu) and we always need two horizontal adds anyway,
693 * so a shuffle/add approach might be better.
694 */
695
696 tmp[0] = vectors[0];
697 tmp[1] = vectors[1];
698
699 tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
700 tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
701
702 if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
703 bld->type.length == 4) {
704 intrinsic = "llvm.x86.sse3.hadd.ps";
705 }
706 else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
707 bld->type.length == 8) {
708 intrinsic = "llvm.x86.avx.hadd.ps.256";
709 }
710 if (intrinsic) {
711 tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
712 lp_build_vec_type(gallivm, bld->type),
713 tmp[0], tmp[1]);
714 if (num_vecs > 2) {
715 tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
716 lp_build_vec_type(gallivm, bld->type),
717 tmp[2], tmp[3]);
718 }
719 else {
720 tmp[1] = tmp[0];
721 }
722 return lp_build_intrinsic_binary(builder, intrinsic,
723 lp_build_vec_type(gallivm, bld->type),
724 tmp[0], tmp[1]);
725 }
726
727 if (bld->type.length == 4) {
728 ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
729 }
730 else {
731 LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
732 unsigned j;
733 unsigned num_iter = bld->type.length / 4;
734 struct lp_type parttype = bld->type;
735 parttype.length = 4;
736 for (j = 0; j < num_iter; j++) {
737 LLVMValueRef partsrc[4];
738 unsigned i;
739 for (i = 0; i < 4; i++) {
740 partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
741 }
742 partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
743 }
744 ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
745 }
746 return ret_vec;
747 }
748
749 /**
750 * Generate a - b
751 */
752 LLVMValueRef
753 lp_build_sub(struct lp_build_context *bld,
754 LLVMValueRef a,
755 LLVMValueRef b)
756 {
757 LLVMBuilderRef builder = bld->gallivm->builder;
758 const struct lp_type type = bld->type;
759 LLVMValueRef res;
760
761 assert(lp_check_value(type, a));
762 assert(lp_check_value(type, b));
763
764 if(b == bld->zero)
765 return a;
766 if(a == bld->undef || b == bld->undef)
767 return bld->undef;
768 if(a == b)
769 return bld->zero;
770
771 if(bld->type.norm) {
772 const char *intrinsic = NULL;
773
774 if(b == bld->one)
775 return bld->zero;
776
777 if (type.width * type.length == 128 &&
778 !type.floating && !type.fixed) {
779 if (util_cpu_caps.has_sse2) {
780 if(type.width == 8)
781 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
782 if(type.width == 16)
783 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
784 } else if (util_cpu_caps.has_altivec) {
785 if(type.width == 8)
786 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
787 if(type.width == 16)
788 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
789 }
790 }
791
792 if(intrinsic)
793 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
794 }
795
796 /* TODO: handle signed case */
797 if(type.norm && !type.floating && !type.fixed && !type.sign)
798 a = lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
799
800 if(LLVMIsConstant(a) && LLVMIsConstant(b))
801 if (type.floating)
802 res = LLVMConstFSub(a, b);
803 else
804 res = LLVMConstSub(a, b);
805 else
806 if (type.floating)
807 res = LLVMBuildFSub(builder, a, b, "");
808 else
809 res = LLVMBuildSub(builder, a, b, "");
810
811 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
812 res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
813
814 return res;
815 }
816
817
818
819 /**
820 * Normalized multiplication.
821 *
822 * There are several approaches for (using 8-bit normalized multiplication as
823 * an example):
824 *
825 * - alpha plus one
826 *
827 * makes the following approximation to the division (Sree)
828 *
 829  *     a*b/255 ~= (a*(b + 1)) >> 8
830 *
831 * which is the fastest method that satisfies the following OpenGL criteria of
832 *
833 * 0*0 = 0 and 255*255 = 255
834 *
835 * - geometric series
836 *
837 * takes the geometric series approximation to the division
838 *
839 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
840 *
841 * in this case just the first two terms to fit in 16bit arithmetic
842 *
843 * t/255 ~= (t + (t >> 8)) >> 8
844 *
 845  * note that just by itself it doesn't satisfy the OpenGL criteria, as it
 846  * yields 255*255 = 254, so the special case b = 255 must be accounted for,
 847  * or roundoff must be used.
848 *
849 * - geometric series plus rounding
850 *
 851  * when using the geometric series division, instead of truncating the
 852  * result use roundoff in the approximation (Jim Blinn)
 853  *
 854  *     t/255 ~= (t + (t >> 8) + 0x80) >> 8
 855  *
 856  * which achieves exact results.
857 *
858 *
859 *
860 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
861 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
862 * @sa Michael Herf, The "double blend trick", May 2000,
863 * http://www.stereopsis.com/doubleblend.html
864 */
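/*
 * For illustration only: a minimal scalar sketch of the "geometric series plus
 * rounding" variant described above, assuming 8-bit unsigned normalized inputs.
 * The helper name is hypothetical and the block is compiled out; the actual
 * vector implementation is lp_build_mul_norm() below.
 */
#if 0
static inline unsigned
mul_unorm8_sketch(unsigned a, unsigned b)
{
   unsigned t = a * b;                /* 16-bit intermediate product */
   t = t + (t >> 8);                  /* geometric series: t/255 ~= (t + (t >> 8)) >> 8 */
   return (t + 0x80) >> 8;            /* Blinn's roundoff, per the formula above */
}
#endif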
865 static LLVMValueRef
866 lp_build_mul_norm(struct gallivm_state *gallivm,
867 struct lp_type wide_type,
868 LLVMValueRef a, LLVMValueRef b)
869 {
870 LLVMBuilderRef builder = gallivm->builder;
871 struct lp_build_context bld;
872 unsigned n;
873 LLVMValueRef half;
874 LLVMValueRef ab;
875
876 assert(!wide_type.floating);
877 assert(lp_check_value(wide_type, a));
878 assert(lp_check_value(wide_type, b));
879
880 lp_build_context_init(&bld, gallivm, wide_type);
881
882 n = wide_type.width / 2;
883 if (wide_type.sign) {
884 --n;
885 }
886
887 /*
888 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
889 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
890 */
891
892 /*
893 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
894 */
895
896 ab = LLVMBuildMul(builder, a, b, "");
897 ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
898
899 /*
900 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
901 */
902
903 half = lp_build_const_int_vec(gallivm, wide_type, 1 << (n - 1));
904 if (wide_type.sign) {
905 LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
906 LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
907 half = lp_build_select(&bld, sign, minus_half, half);
908 }
909 ab = LLVMBuildAdd(builder, ab, half, "");
910
911 /* Final division */
912 ab = lp_build_shr_imm(&bld, ab, n);
913
914 return ab;
915 }
916
917 /**
918 * Generate a * b
919 */
920 LLVMValueRef
921 lp_build_mul(struct lp_build_context *bld,
922 LLVMValueRef a,
923 LLVMValueRef b)
924 {
925 LLVMBuilderRef builder = bld->gallivm->builder;
926 const struct lp_type type = bld->type;
927 LLVMValueRef shift;
928 LLVMValueRef res;
929
930 assert(lp_check_value(type, a));
931 assert(lp_check_value(type, b));
932
933 if(a == bld->zero)
934 return bld->zero;
935 if(a == bld->one)
936 return b;
937 if(b == bld->zero)
938 return bld->zero;
939 if(b == bld->one)
940 return a;
941 if(a == bld->undef || b == bld->undef)
942 return bld->undef;
943
944 if (!type.floating && !type.fixed && type.norm) {
945 struct lp_type wide_type = lp_wider_type(type);
946 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
947
948 lp_build_unpack2(bld->gallivm, type, wide_type, a, &al, &ah);
949 lp_build_unpack2(bld->gallivm, type, wide_type, b, &bl, &bh);
950
951 /* PMULLW, PSRLW, PADDW */
952 abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
953 abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
954
955 ab = lp_build_pack2(bld->gallivm, wide_type, type, abl, abh);
956
957 return ab;
958 }
959
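   /* For fixed point the raw product carries twice the number of fractional
    * bits, so shift it back down by width/2 (the fractional bits used here). */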
960 if(type.fixed)
961 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
962 else
963 shift = NULL;
964
965 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
966 if (type.floating)
967 res = LLVMConstFMul(a, b);
968 else
969 res = LLVMConstMul(a, b);
970 if(shift) {
971 if(type.sign)
972 res = LLVMConstAShr(res, shift);
973 else
974 res = LLVMConstLShr(res, shift);
975 }
976 }
977 else {
978 if (type.floating)
979 res = LLVMBuildFMul(builder, a, b, "");
980 else
981 res = LLVMBuildMul(builder, a, b, "");
982 if(shift) {
983 if(type.sign)
984 res = LLVMBuildAShr(builder, res, shift, "");
985 else
986 res = LLVMBuildLShr(builder, res, shift, "");
987 }
988 }
989
990 return res;
991 }
992
993
994 /**
995 * Small vector x scale multiplication optimization.
996 */
997 LLVMValueRef
998 lp_build_mul_imm(struct lp_build_context *bld,
999 LLVMValueRef a,
1000 int b)
1001 {
1002 LLVMBuilderRef builder = bld->gallivm->builder;
1003 LLVMValueRef factor;
1004
1005 assert(lp_check_value(bld->type, a));
1006
1007 if(b == 0)
1008 return bld->zero;
1009
1010 if(b == 1)
1011 return a;
1012
1013 if(b == -1)
1014 return lp_build_negate(bld, a);
1015
1016 if(b == 2 && bld->type.floating)
1017 return lp_build_add(bld, a, a);
1018
1019 if(util_is_power_of_two(b)) {
1020 unsigned shift = ffs(b) - 1;
1021
1022 if(bld->type.floating) {
1023 #if 0
1024 /*
1025 * Power of two multiplication by directly manipulating the exponent.
1026 *
1027 * XXX: This might not be always faster, it will introduce a small error
1028 * for multiplication by zero, and it will produce wrong results
1029 * for Inf and NaN.
1030 */
1031 unsigned mantissa = lp_mantissa(bld->type);
1032 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1033 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1034 a = LLVMBuildAdd(builder, a, factor, "");
1035 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1036 return a;
1037 #endif
1038 }
1039 else {
1040 factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1041 return LLVMBuildShl(builder, a, factor, "");
1042 }
1043 }
1044
1045 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1046 return lp_build_mul(bld, a, factor);
1047 }
1048
1049
1050 /**
1051 * Generate a / b
1052 */
1053 LLVMValueRef
1054 lp_build_div(struct lp_build_context *bld,
1055 LLVMValueRef a,
1056 LLVMValueRef b)
1057 {
1058 LLVMBuilderRef builder = bld->gallivm->builder;
1059 const struct lp_type type = bld->type;
1060
1061 assert(lp_check_value(type, a));
1062 assert(lp_check_value(type, b));
1063
1064 if(a == bld->zero)
1065 return bld->zero;
1066 if(a == bld->one)
1067 return lp_build_rcp(bld, b);
1068 if(b == bld->zero)
1069 return bld->undef;
1070 if(b == bld->one)
1071 return a;
1072 if(a == bld->undef || b == bld->undef)
1073 return bld->undef;
1074
1075 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1076 if (type.floating)
1077 return LLVMConstFDiv(a, b);
1078 else if (type.sign)
1079 return LLVMConstSDiv(a, b);
1080 else
1081 return LLVMConstUDiv(a, b);
1082 }
1083
1084 if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1085 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
1086 type.floating)
1087 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1088
1089 if (type.floating)
1090 return LLVMBuildFDiv(builder, a, b, "");
1091 else if (type.sign)
1092 return LLVMBuildSDiv(builder, a, b, "");
1093 else
1094 return LLVMBuildUDiv(builder, a, b, "");
1095 }
1096
1097
1098 /**
1099 * Linear interpolation helper.
1100 *
 1101  * @param flags  LP_BLD_LERP_x flags; with LP_BLD_LERP_WIDE_NORMALIZED we are
 1102  *        interpolating normalized values, encoded in integers twice as wide.
1103 *
1104 * @sa http://www.stereopsis.com/doubleblend.html
1105 */
1106 static INLINE LLVMValueRef
1107 lp_build_lerp_simple(struct lp_build_context *bld,
1108 LLVMValueRef x,
1109 LLVMValueRef v0,
1110 LLVMValueRef v1,
1111 unsigned flags)
1112 {
1113 unsigned half_width = bld->type.width/2;
1114 LLVMBuilderRef builder = bld->gallivm->builder;
1115 LLVMValueRef delta;
1116 LLVMValueRef res;
1117
1118 assert(lp_check_value(bld->type, x));
1119 assert(lp_check_value(bld->type, v0));
1120 assert(lp_check_value(bld->type, v1));
1121
1122 delta = lp_build_sub(bld, v1, v0);
1123
1124 if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1125 if (!bld->type.sign) {
1126 if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1127 /*
1128 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1129 * most-significant-bit to the lowest-significant-bit, so that
1130 * later we can just divide by 2**n instead of 2**n - 1.
1131 */
1132
1133 x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1134 }
1135
1136 /* (x * delta) >> n */
1137 res = lp_build_mul(bld, x, delta);
1138 res = lp_build_shr_imm(bld, res, half_width);
1139 } else {
1140 /*
1141 * The rescaling trick above doesn't work for signed numbers, so
1142 * use the 2**n - 1 divison approximation in lp_build_mul_norm
1143 * instead.
1144 */
1145 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1146 res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1147 }
1148 } else {
1149 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1150 res = lp_build_mul(bld, x, delta);
1151 }
1152
1153 res = lp_build_add(bld, v0, res);
1154
1155 if (((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) ||
1156 bld->type.fixed) {
 1157       /* We need to mask out the high order bits when lerping 8-bit normalized colors stored in 16 bits */
 1158       /* XXX: This step is necessary for lerping 8-bit colors stored in 16 bits,
 1159        * but it will be wrong for true fixed point use cases. Basically we need
 1160        * a more powerful lp_type, capable of further distinguishing the value's
 1161        * interpretation from the value's storage. */
1162 res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1), "");
1163 }
1164
1165 return res;
1166 }
1167
1168
1169 /**
1170 * Linear interpolation.
1171 */
1172 LLVMValueRef
1173 lp_build_lerp(struct lp_build_context *bld,
1174 LLVMValueRef x,
1175 LLVMValueRef v0,
1176 LLVMValueRef v1,
1177 unsigned flags)
1178 {
1179 const struct lp_type type = bld->type;
1180 LLVMValueRef res;
1181
1182 assert(lp_check_value(type, x));
1183 assert(lp_check_value(type, v0));
1184 assert(lp_check_value(type, v1));
1185
1186 assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1187
1188 if (type.norm) {
1189 struct lp_type wide_type;
1190 struct lp_build_context wide_bld;
1191 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1192
1193 assert(type.length >= 2);
1194
1195 /*
1196 * Create a wider integer type, enough to hold the
1197 * intermediate result of the multiplication.
1198 */
1199 memset(&wide_type, 0, sizeof wide_type);
1200 wide_type.sign = type.sign;
1201 wide_type.width = type.width*2;
1202 wide_type.length = type.length/2;
1203
1204 lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1205
1206 lp_build_unpack2(bld->gallivm, type, wide_type, x, &xl, &xh);
1207 lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1208 lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1209
1210 /*
1211 * Lerp both halves.
1212 */
1213
1214 flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1215
1216 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1217 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1218
1219 res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
1220 } else {
1221 res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1222 }
1223
1224 return res;
1225 }
1226
1227
1228 /**
1229 * Bilinear interpolation.
1230 *
 1231  * Value indices are in v_{yx}.
1232 */
1233 LLVMValueRef
1234 lp_build_lerp_2d(struct lp_build_context *bld,
1235 LLVMValueRef x,
1236 LLVMValueRef y,
1237 LLVMValueRef v00,
1238 LLVMValueRef v01,
1239 LLVMValueRef v10,
1240 LLVMValueRef v11,
1241 unsigned flags)
1242 {
1243 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1244 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1245 return lp_build_lerp(bld, y, v0, v1, flags);
1246 }
1247
1248
1249 LLVMValueRef
1250 lp_build_lerp_3d(struct lp_build_context *bld,
1251 LLVMValueRef x,
1252 LLVMValueRef y,
1253 LLVMValueRef z,
1254 LLVMValueRef v000,
1255 LLVMValueRef v001,
1256 LLVMValueRef v010,
1257 LLVMValueRef v011,
1258 LLVMValueRef v100,
1259 LLVMValueRef v101,
1260 LLVMValueRef v110,
1261 LLVMValueRef v111,
1262 unsigned flags)
1263 {
1264 LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1265 LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1266 return lp_build_lerp(bld, z, v0, v1, flags);
1267 }
1268
1269
1270 /**
1271 * Generate min(a, b)
1272 * Do checks for special cases but not for nans.
1273 */
1274 LLVMValueRef
1275 lp_build_min(struct lp_build_context *bld,
1276 LLVMValueRef a,
1277 LLVMValueRef b)
1278 {
1279 assert(lp_check_value(bld->type, a));
1280 assert(lp_check_value(bld->type, b));
1281
1282 if(a == bld->undef || b == bld->undef)
1283 return bld->undef;
1284
1285 if(a == b)
1286 return a;
1287
1288 if (bld->type.norm) {
1289 if (!bld->type.sign) {
1290 if (a == bld->zero || b == bld->zero) {
1291 return bld->zero;
1292 }
1293 }
1294 if(a == bld->one)
1295 return b;
1296 if(b == bld->one)
1297 return a;
1298 }
1299
1300 return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1301 }
1302
1303
1304 /**
1305 * Generate min(a, b)
1306 * NaN's are handled according to the behavior specified by the
1307 * nan_behavior argument.
1308 */
1309 LLVMValueRef
1310 lp_build_min_ext(struct lp_build_context *bld,
1311 LLVMValueRef a,
1312 LLVMValueRef b,
1313 enum gallivm_nan_behavior nan_behavior)
1314 {
1315 assert(lp_check_value(bld->type, a));
1316 assert(lp_check_value(bld->type, b));
1317
1318 if(a == bld->undef || b == bld->undef)
1319 return bld->undef;
1320
1321 if(a == b)
1322 return a;
1323
1324 if (bld->type.norm) {
1325 if (!bld->type.sign) {
1326 if (a == bld->zero || b == bld->zero) {
1327 return bld->zero;
1328 }
1329 }
1330 if(a == bld->one)
1331 return b;
1332 if(b == bld->one)
1333 return a;
1334 }
1335
1336 return lp_build_min_simple(bld, a, b, nan_behavior);
1337 }
1338
1339 /**
1340 * Generate max(a, b)
1341 * Do checks for special cases, but NaN behavior is undefined.
1342 */
1343 LLVMValueRef
1344 lp_build_max(struct lp_build_context *bld,
1345 LLVMValueRef a,
1346 LLVMValueRef b)
1347 {
1348 assert(lp_check_value(bld->type, a));
1349 assert(lp_check_value(bld->type, b));
1350
1351 if(a == bld->undef || b == bld->undef)
1352 return bld->undef;
1353
1354 if(a == b)
1355 return a;
1356
1357 if(bld->type.norm) {
1358 if(a == bld->one || b == bld->one)
1359 return bld->one;
1360 if (!bld->type.sign) {
1361 if (a == bld->zero) {
1362 return b;
1363 }
1364 if (b == bld->zero) {
1365 return a;
1366 }
1367 }
1368 }
1369
1370 return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1371 }
1372
1373
1374 /**
1375 * Generate max(a, b)
1376 * Checks for special cases.
1377 * NaN's are handled according to the behavior specified by the
1378 * nan_behavior argument.
1379 */
1380 LLVMValueRef
1381 lp_build_max_ext(struct lp_build_context *bld,
1382 LLVMValueRef a,
1383 LLVMValueRef b,
1384 enum gallivm_nan_behavior nan_behavior)
1385 {
1386 assert(lp_check_value(bld->type, a));
1387 assert(lp_check_value(bld->type, b));
1388
1389 if(a == bld->undef || b == bld->undef)
1390 return bld->undef;
1391
1392 if(a == b)
1393 return a;
1394
1395 if(bld->type.norm) {
1396 if(a == bld->one || b == bld->one)
1397 return bld->one;
1398 if (!bld->type.sign) {
1399 if (a == bld->zero) {
1400 return b;
1401 }
1402 if (b == bld->zero) {
1403 return a;
1404 }
1405 }
1406 }
1407
1408 return lp_build_max_simple(bld, a, b, nan_behavior);
1409 }
1410
1411 /**
1412 * Generate clamp(a, min, max)
1413 * NaN behavior (for any of a, min, max) is undefined.
1414 * Do checks for special cases.
1415 */
1416 LLVMValueRef
1417 lp_build_clamp(struct lp_build_context *bld,
1418 LLVMValueRef a,
1419 LLVMValueRef min,
1420 LLVMValueRef max)
1421 {
1422 assert(lp_check_value(bld->type, a));
1423 assert(lp_check_value(bld->type, min));
1424 assert(lp_check_value(bld->type, max));
1425
1426 a = lp_build_min(bld, a, max);
1427 a = lp_build_max(bld, a, min);
1428 return a;
1429 }
1430
1431
1432 /**
1433 * Generate clamp(a, 0, 1)
1434 * A NaN will get converted to zero.
1435 */
1436 LLVMValueRef
1437 lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
1438 LLVMValueRef a)
1439 {
1440 a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1441 a = lp_build_min(bld, a, bld->one);
1442 return a;
1443 }
1444
1445
1446 /**
1447 * Generate abs(a)
1448 */
1449 LLVMValueRef
1450 lp_build_abs(struct lp_build_context *bld,
1451 LLVMValueRef a)
1452 {
1453 LLVMBuilderRef builder = bld->gallivm->builder;
1454 const struct lp_type type = bld->type;
1455 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1456
1457 assert(lp_check_value(type, a));
1458
1459 if(!type.sign)
1460 return a;
1461
1462 if(type.floating) {
1463 /* Mask out the sign bit */
1464 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1465 unsigned long long absMask = ~(1ULL << (type.width - 1));
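      /* e.g. absMask is 0x7fffffff for 32-bit floats and
       * 0x7fffffffffffffff for 64-bit floats. */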
1466 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
1467 a = LLVMBuildBitCast(builder, a, int_vec_type, "");
1468 a = LLVMBuildAnd(builder, a, mask, "");
1469 a = LLVMBuildBitCast(builder, a, vec_type, "");
1470 return a;
1471 }
1472
1473 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
1474 switch(type.width) {
1475 case 8:
1476 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1477 case 16:
1478 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1479 case 32:
1480 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1481 }
1482 }
1483 else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
1484 (gallivm_debug & GALLIVM_DEBUG_PERF) &&
1485 (type.width == 8 || type.width == 16 || type.width == 32)) {
1486 debug_printf("%s: inefficient code, should split vectors manually\n",
1487 __FUNCTION__);
1488 }
1489
1490 return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
1491 }
1492
1493
1494 LLVMValueRef
1495 lp_build_negate(struct lp_build_context *bld,
1496 LLVMValueRef a)
1497 {
1498 LLVMBuilderRef builder = bld->gallivm->builder;
1499
1500 assert(lp_check_value(bld->type, a));
1501
1502 #if HAVE_LLVM >= 0x0207
1503 if (bld->type.floating)
1504 a = LLVMBuildFNeg(builder, a, "");
1505 else
1506 #endif
1507 a = LLVMBuildNeg(builder, a, "");
1508
1509 return a;
1510 }
1511
1512
1513 /** Return -1, 0 or +1 depending on the sign of a */
1514 LLVMValueRef
1515 lp_build_sgn(struct lp_build_context *bld,
1516 LLVMValueRef a)
1517 {
1518 LLVMBuilderRef builder = bld->gallivm->builder;
1519 const struct lp_type type = bld->type;
1520 LLVMValueRef cond;
1521 LLVMValueRef res;
1522
1523 assert(lp_check_value(type, a));
1524
1525 /* Handle non-zero case */
1526 if(!type.sign) {
1527 /* if not zero then sign must be positive */
1528 res = bld->one;
1529 }
1530 else if(type.floating) {
1531 LLVMTypeRef vec_type;
1532 LLVMTypeRef int_type;
1533 LLVMValueRef mask;
1534 LLVMValueRef sign;
1535 LLVMValueRef one;
1536 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1537
1538 int_type = lp_build_int_vec_type(bld->gallivm, type);
1539 vec_type = lp_build_vec_type(bld->gallivm, type);
1540 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1541
 1542       /* Take the sign bit and OR it into the constant 1.0 */
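      /* e.g. for a = -3.5f this ORs the sign bit into 0x3f800000 (1.0f),
       * giving 0xbf800000, i.e. -1.0f. */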
1543 sign = LLVMBuildBitCast(builder, a, int_type, "");
1544 sign = LLVMBuildAnd(builder, sign, mask, "");
1545 one = LLVMConstBitCast(bld->one, int_type);
1546 res = LLVMBuildOr(builder, sign, one, "");
1547 res = LLVMBuildBitCast(builder, res, vec_type, "");
1548 }
1549 else
1550 {
1551 /* signed int/norm/fixed point */
1552 /* could use psign with sse3 and appropriate vectors here */
1553 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1554 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1555 res = lp_build_select(bld, cond, bld->one, minus_one);
1556 }
1557
1558 /* Handle zero */
1559 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1560 res = lp_build_select(bld, cond, bld->zero, res);
1561
1562 return res;
1563 }
1564
1565
1566 /**
1567 * Set the sign of float vector 'a' according to 'sign'.
1568 * If sign==0, return abs(a).
1569 * If sign==1, return -abs(a);
1570 * Other values for sign produce undefined results.
1571 */
1572 LLVMValueRef
1573 lp_build_set_sign(struct lp_build_context *bld,
1574 LLVMValueRef a, LLVMValueRef sign)
1575 {
1576 LLVMBuilderRef builder = bld->gallivm->builder;
1577 const struct lp_type type = bld->type;
1578 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1579 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1580 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1581 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1582 ~((unsigned long long) 1 << (type.width - 1)));
1583 LLVMValueRef val, res;
1584
1585 assert(type.floating);
1586 assert(lp_check_value(type, a));
1587
1588 /* val = reinterpret_cast<int>(a) */
1589 val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1590 /* val = val & mask */
1591 val = LLVMBuildAnd(builder, val, mask, "");
1592 /* sign = sign << shift */
1593 sign = LLVMBuildShl(builder, sign, shift, "");
1594 /* res = val | sign */
1595 res = LLVMBuildOr(builder, val, sign, "");
1596 /* res = reinterpret_cast<float>(res) */
1597 res = LLVMBuildBitCast(builder, res, vec_type, "");
1598
1599 return res;
1600 }
1601
1602
1603 /**
1604 * Convert vector of (or scalar) int to vector of (or scalar) float.
1605 */
1606 LLVMValueRef
1607 lp_build_int_to_float(struct lp_build_context *bld,
1608 LLVMValueRef a)
1609 {
1610 LLVMBuilderRef builder = bld->gallivm->builder;
1611 const struct lp_type type = bld->type;
1612 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1613
1614 assert(type.floating);
1615
1616 return LLVMBuildSIToFP(builder, a, vec_type, "");
1617 }
1618
1619 static boolean
1620 arch_rounding_available(const struct lp_type type)
1621 {
1622 if ((util_cpu_caps.has_sse4_1 &&
1623 (type.length == 1 || type.width*type.length == 128)) ||
1624 (util_cpu_caps.has_avx && type.width*type.length == 256))
1625 return TRUE;
1626 else if ((util_cpu_caps.has_altivec &&
1627 (type.width == 32 && type.length == 4)))
1628 return TRUE;
1629
1630 return FALSE;
1631 }
1632
1633 enum lp_build_round_mode
1634 {
1635 LP_BUILD_ROUND_NEAREST = 0,
1636 LP_BUILD_ROUND_FLOOR = 1,
1637 LP_BUILD_ROUND_CEIL = 2,
1638 LP_BUILD_ROUND_TRUNCATE = 3
1639 };
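/* These values match the SSE4.1 ROUNDxx (and AVX VROUNDxx) immediate rounding-mode
 * encoding, so 'mode' can be passed straight through as the immediate operand. */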
1640
1641 /**
1642 * Helper for SSE4.1's ROUNDxx instructions.
1643 *
 1644  * NOTE: In SSE4.1's nearest mode, if two values are equally close, the
 1645  * result is the even value (round-half-to-even); rounding 2.5 gives 2.0, not 3.0.
1646 */
1647 static INLINE LLVMValueRef
1648 lp_build_round_sse41(struct lp_build_context *bld,
1649 LLVMValueRef a,
1650 enum lp_build_round_mode mode)
1651 {
1652 LLVMBuilderRef builder = bld->gallivm->builder;
1653 const struct lp_type type = bld->type;
1654 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1655 const char *intrinsic;
1656 LLVMValueRef res;
1657
1658 assert(type.floating);
1659
1660 assert(lp_check_value(type, a));
1661 assert(util_cpu_caps.has_sse4_1);
1662
1663 if (type.length == 1) {
1664 LLVMTypeRef vec_type;
1665 LLVMValueRef undef;
1666 LLVMValueRef args[3];
1667 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1668
1669 switch(type.width) {
1670 case 32:
1671 intrinsic = "llvm.x86.sse41.round.ss";
1672 break;
1673 case 64:
1674 intrinsic = "llvm.x86.sse41.round.sd";
1675 break;
1676 default:
1677 assert(0);
1678 return bld->undef;
1679 }
1680
1681 vec_type = LLVMVectorType(bld->elem_type, 4);
1682
1683 undef = LLVMGetUndef(vec_type);
1684
1685 args[0] = undef;
1686 args[1] = LLVMBuildInsertElement(builder, undef, a, index0, "");
1687 args[2] = LLVMConstInt(i32t, mode, 0);
1688
1689 res = lp_build_intrinsic(builder, intrinsic,
1690 vec_type, args, Elements(args));
1691
1692 res = LLVMBuildExtractElement(builder, res, index0, "");
1693 }
1694 else {
1695 if (type.width * type.length == 128) {
1696 switch(type.width) {
1697 case 32:
1698 intrinsic = "llvm.x86.sse41.round.ps";
1699 break;
1700 case 64:
1701 intrinsic = "llvm.x86.sse41.round.pd";
1702 break;
1703 default:
1704 assert(0);
1705 return bld->undef;
1706 }
1707 }
1708 else {
1709 assert(type.width * type.length == 256);
1710 assert(util_cpu_caps.has_avx);
1711
1712 switch(type.width) {
1713 case 32:
1714 intrinsic = "llvm.x86.avx.round.ps.256";
1715 break;
1716 case 64:
1717 intrinsic = "llvm.x86.avx.round.pd.256";
1718 break;
1719 default:
1720 assert(0);
1721 return bld->undef;
1722 }
1723 }
1724
1725 res = lp_build_intrinsic_binary(builder, intrinsic,
1726 bld->vec_type, a,
1727 LLVMConstInt(i32t, mode, 0));
1728 }
1729
1730 return res;
1731 }
1732
1733
1734 static INLINE LLVMValueRef
1735 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1736 LLVMValueRef a)
1737 {
1738 LLVMBuilderRef builder = bld->gallivm->builder;
1739 const struct lp_type type = bld->type;
1740 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1741 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1742 const char *intrinsic;
1743 LLVMValueRef res;
1744
1745 assert(type.floating);
1746 /* using the double precision conversions is a bit more complicated */
1747 assert(type.width == 32);
1748
1749 assert(lp_check_value(type, a));
1750 assert(util_cpu_caps.has_sse2);
1751
1752 /* This is relying on MXCSR rounding mode, which should always be nearest. */
1753 if (type.length == 1) {
1754 LLVMTypeRef vec_type;
1755 LLVMValueRef undef;
1756 LLVMValueRef arg;
1757 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1758
1759 vec_type = LLVMVectorType(bld->elem_type, 4);
1760
1761 intrinsic = "llvm.x86.sse.cvtss2si";
1762
1763 undef = LLVMGetUndef(vec_type);
1764
1765 arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
1766
1767 res = lp_build_intrinsic_unary(builder, intrinsic,
1768 ret_type, arg);
1769 }
1770 else {
1771 if (type.width* type.length == 128) {
1772 intrinsic = "llvm.x86.sse2.cvtps2dq";
1773 }
1774 else {
1775 assert(type.width*type.length == 256);
1776 assert(util_cpu_caps.has_avx);
1777
1778 intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
1779 }
1780 res = lp_build_intrinsic_unary(builder, intrinsic,
1781 ret_type, a);
1782 }
1783
1784 return res;
1785 }
1786
1787
1788 /*
1789 */
1790 static INLINE LLVMValueRef
1791 lp_build_round_altivec(struct lp_build_context *bld,
1792 LLVMValueRef a,
1793 enum lp_build_round_mode mode)
1794 {
1795 LLVMBuilderRef builder = bld->gallivm->builder;
1796 const struct lp_type type = bld->type;
1797 const char *intrinsic = NULL;
1798
1799 assert(type.floating);
1800
1801 assert(lp_check_value(type, a));
1802 assert(util_cpu_caps.has_altivec);
1803
1804 switch (mode) {
1805 case LP_BUILD_ROUND_NEAREST:
1806 intrinsic = "llvm.ppc.altivec.vrfin";
1807 break;
1808 case LP_BUILD_ROUND_FLOOR:
1809 intrinsic = "llvm.ppc.altivec.vrfim";
1810 break;
1811 case LP_BUILD_ROUND_CEIL:
1812 intrinsic = "llvm.ppc.altivec.vrfip";
1813 break;
1814 case LP_BUILD_ROUND_TRUNCATE:
1815 intrinsic = "llvm.ppc.altivec.vrfiz";
1816 break;
1817 }
1818
1819 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1820 }
1821
1822 static INLINE LLVMValueRef
1823 lp_build_round_arch(struct lp_build_context *bld,
1824 LLVMValueRef a,
1825 enum lp_build_round_mode mode)
1826 {
1827 if (util_cpu_caps.has_sse4_1)
1828 return lp_build_round_sse41(bld, a, mode);
1829 else /* (util_cpu_caps.has_altivec) */
1830 return lp_build_round_altivec(bld, a, mode);
1831 }
1832
1833 /**
1834 * Return the integer part of a float (vector) value (== round toward zero).
1835 * The returned value is a float (vector).
1836 * Ex: trunc(-1.5) = -1.0
1837 */
1838 LLVMValueRef
1839 lp_build_trunc(struct lp_build_context *bld,
1840 LLVMValueRef a)
1841 {
1842 LLVMBuilderRef builder = bld->gallivm->builder;
1843 const struct lp_type type = bld->type;
1844
1845 assert(type.floating);
1846 assert(lp_check_value(type, a));
1847
1848 if (arch_rounding_available(type)) {
1849 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
1850 }
1851 else {
1852 const struct lp_type type = bld->type;
1853 struct lp_type inttype;
1854 struct lp_build_context intbld;
 1855       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
1856 LLVMValueRef trunc, res, anosign, mask;
1857 LLVMTypeRef int_vec_type = bld->int_vec_type;
1858 LLVMTypeRef vec_type = bld->vec_type;
1859
1860 assert(type.width == 32); /* might want to handle doubles at some point */
1861
1862 inttype = type;
1863 inttype.floating = 0;
1864 lp_build_context_init(&intbld, bld->gallivm, inttype);
1865
1866 /* round by truncation */
1867 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1868 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
1869
1870 /* mask out sign bit */
1871 anosign = lp_build_abs(bld, a);
1872 /*
1873 * mask out all values if anosign > 2^24
1874 * This should work both for large ints (all rounding is no-op for them
1875 * because such floats are always exact) as well as special cases like
1876 * NaNs, Infs (taking advantage of the fact they use max exponent).
1877 * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
1878 */
1879 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1880 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1881 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1882 return lp_build_select(bld, mask, a, res);
1883 }
1884 }
1885
1886
1887 /**
1888 * Return float (vector) rounded to nearest integer (vector). The returned
1889 * value is a float (vector).
1890 * Ex: round(0.9) = 1.0
1891 * Ex: round(-1.5) = -2.0
1892 */
1893 LLVMValueRef
1894 lp_build_round(struct lp_build_context *bld,
1895 LLVMValueRef a)
1896 {
1897 LLVMBuilderRef builder = bld->gallivm->builder;
1898 const struct lp_type type = bld->type;
1899
1900 assert(type.floating);
1901 assert(lp_check_value(type, a));
1902
1903 if (arch_rounding_available(type)) {
1904 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
1905 }
1906 else {
1907 const struct lp_type type = bld->type;
1908 struct lp_type inttype;
1909 struct lp_build_context intbld;
 1910       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
1911 LLVMValueRef res, anosign, mask;
1912 LLVMTypeRef int_vec_type = bld->int_vec_type;
1913 LLVMTypeRef vec_type = bld->vec_type;
1914
1915 assert(type.width == 32); /* might want to handle doubles at some point */
1916
1917 inttype = type;
1918 inttype.floating = 0;
1919 lp_build_context_init(&intbld, bld->gallivm, inttype);
1920
1921 res = lp_build_iround(bld, a);
1922 res = LLVMBuildSIToFP(builder, res, vec_type, "");
1923
1924 /* mask out sign bit */
1925 anosign = lp_build_abs(bld, a);
1926 /*
1927 * mask out all values if anosign > 2^24
1928 * This should work both for large ints (all rounding is no-op for them
1929 * because such floats are always exact) as well as special cases like
1930 * NaNs, Infs (taking advantage of the fact they use max exponent).
1931 * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
1932 */
1933 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1934 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1935 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1936 return lp_build_select(bld, mask, a, res);
1937 }
1938 }
1939
1940
1941 /**
1942 * Return floor of float (vector), result is a float (vector)
1943 * Ex: floor(1.1) = 1.0
1944 * Ex: floor(-1.1) = -2.0
1945 */
1946 LLVMValueRef
1947 lp_build_floor(struct lp_build_context *bld,
1948 LLVMValueRef a)
1949 {
1950 LLVMBuilderRef builder = bld->gallivm->builder;
1951 const struct lp_type type = bld->type;
1952
1953 assert(type.floating);
1954 assert(lp_check_value(type, a));
1955
1956 if (arch_rounding_available(type)) {
1957 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
1958 }
1959 else {
1960 const struct lp_type type = bld->type;
1961 struct lp_type inttype;
1962 struct lp_build_context intbld;
1963 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
1964 LLVMValueRef trunc, res, anosign, mask;
1965 LLVMTypeRef int_vec_type = bld->int_vec_type;
1966 LLVMTypeRef vec_type = bld->vec_type;
1967
1968 assert(type.width == 32); /* might want to handle doubles at some point */
1969
1970 inttype = type;
1971 inttype.floating = 0;
1972 lp_build_context_init(&intbld, bld->gallivm, inttype);
1973
1974 /* round by truncation */
1975 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1976 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
1977
1978 if (type.sign) {
1979 LLVMValueRef tmp;
1980
1981 /*
1982 * fix values if rounding is wrong (for non-special cases)
1983 * - this is the case if trunc > a
1984 */
1985 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
1986 /* tmp = trunc > a ? 1.0 : 0.0 */
1987 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
1988 tmp = lp_build_and(&intbld, mask, tmp);
1989 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
1990 res = lp_build_sub(bld, res, tmp);
1991 }
1992
1993 /* mask out sign bit */
1994 anosign = lp_build_abs(bld, a);
1995 /*
1996 * mask out all values if anosign > 2^24
1997 * This should work both for large ints (all rounding is no-op for them
1998 * because such floats are always exact) as well as special cases like
1999 * NaNs, Infs (taking advantage of the fact they use max exponent).
2000 * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
2001 */
2002 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2003 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2004 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2005 return lp_build_select(bld, mask, a, res);
2006 }
2007 }
2008
2009
2010 /**
2011 * Return ceiling of float (vector), returning float (vector).
2012 * Ex: ceil( 1.1) = 2.0
2013 * Ex: ceil(-1.1) = -1.0
2014 */
2015 LLVMValueRef
2016 lp_build_ceil(struct lp_build_context *bld,
2017 LLVMValueRef a)
2018 {
2019 LLVMBuilderRef builder = bld->gallivm->builder;
2020 const struct lp_type type = bld->type;
2021
2022 assert(type.floating);
2023 assert(lp_check_value(type, a));
2024
2025 if (arch_rounding_available(type)) {
2026 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2027 }
2028 else {
2029 const struct lp_type type = bld->type;
2030 struct lp_type inttype;
2031 struct lp_build_context intbld;
2032 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
2033 LLVMValueRef trunc, res, anosign, mask, tmp;
2034 LLVMTypeRef int_vec_type = bld->int_vec_type;
2035 LLVMTypeRef vec_type = bld->vec_type;
2036
2037 assert(type.width == 32); /* might want to handle doubles at some point */
2038
2039 inttype = type;
2040 inttype.floating = 0;
2041 lp_build_context_init(&intbld, bld->gallivm, inttype);
2042
2043 /* round by truncation */
2044 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2045 trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2046
2047 /*
2048 * fix values if rounding is wrong (for non-special cases)
2049 * - this is the case if trunc < a
2050 */
2051 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2052 /* tmp = trunc < a ? 1.0 : 0.0 */
2053 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2054 tmp = lp_build_and(&intbld, mask, tmp);
2055 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2056 res = lp_build_add(bld, trunc, tmp);
2057
2058 /* mask out sign bit */
2059 anosign = lp_build_abs(bld, a);
2060 /*
2061 * mask out all values if anosign > 2^24
2062 * This should work both for large ints (all rounding is no-op for them
2063 * because such floats are always exact) as well as special cases like
2064 * NaNs, Infs (taking advantage of the fact they use max exponent).
2065 * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
2066 */
2067 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2068 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2069 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2070 return lp_build_select(bld, mask, a, res);
2071 }
2072 }
2073
2074
2075 /**
2076 * Return fractional part of 'a' computed as a - floor(a)
2077 * Typically used in texture coord arithmetic.
2078 */
2079 LLVMValueRef
2080 lp_build_fract(struct lp_build_context *bld,
2081 LLVMValueRef a)
2082 {
2083 assert(bld->type.floating);
2084 return lp_build_sub(bld, a, lp_build_floor(bld, a));
2085 }
2086
2087
2088 /**
2089 * Prevent returning a fractional part of 1.0 for very small negative values of
2090 * 'a' by clamping against 0.99999(9).
2091 */
2092 static inline LLVMValueRef
2093 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2094 {
2095 LLVMValueRef max;
2096
2097 /* this is the largest number smaller than 1.0 representable as float */
2098 max = lp_build_const_vec(bld->gallivm, bld->type,
2099 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2100 return lp_build_min(bld, fract, max);
2101 }
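/*
 * Worked example for the constant above, assuming 32-bit floats
 * (lp_mantissa() == 23): the clamp value is 1.0 - 2^-24 = 0.99999994...,
 * i.e. the largest float strictly below 1.0, so the clamped fract can
 * never round up to exactly 1.0.
 */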
2102
2103
2104 /**
2105 * Same as lp_build_fract, but guarantees that the result is always smaller
2106 * than one.
2107 */
2108 LLVMValueRef
2109 lp_build_fract_safe(struct lp_build_context *bld,
2110 LLVMValueRef a)
2111 {
2112 return clamp_fract(bld, lp_build_fract(bld, a));
2113 }
2114
2115
2116 /**
2117 * Return the integer part of a float (vector) value (== round toward zero).
2118 * The returned value is an integer (vector).
2119 * Ex: itrunc(-1.5) = -1
2120 */
2121 LLVMValueRef
2122 lp_build_itrunc(struct lp_build_context *bld,
2123 LLVMValueRef a)
2124 {
2125 LLVMBuilderRef builder = bld->gallivm->builder;
2126 const struct lp_type type = bld->type;
2127 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2128
2129 assert(type.floating);
2130 assert(lp_check_value(type, a));
2131
2132 return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2133 }
2134
2135
2136 /**
2137 * Return float (vector) rounded to nearest integer (vector). The returned
2138 * value is an integer (vector).
2139 * Ex: iround(0.9) = 1
2140 * Ex: iround(-1.5) = -2
2141 */
2142 LLVMValueRef
2143 lp_build_iround(struct lp_build_context *bld,
2144 LLVMValueRef a)
2145 {
2146 LLVMBuilderRef builder = bld->gallivm->builder;
2147 const struct lp_type type = bld->type;
2148 LLVMTypeRef int_vec_type = bld->int_vec_type;
2149 LLVMValueRef res;
2150
2151 assert(type.floating);
2152
2153 assert(lp_check_value(type, a));
2154
2155 if ((util_cpu_caps.has_sse2 &&
2156 ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2157 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2158 return lp_build_iround_nearest_sse2(bld, a);
2159 }
2160 if (arch_rounding_available(type)) {
2161 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2162 }
2163 else {
2164 LLVMValueRef half;
2165
2166 half = lp_build_const_vec(bld->gallivm, type, 0.5);
2167
2168 if (type.sign) {
2169 LLVMTypeRef vec_type = bld->vec_type;
2170 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2171 (unsigned long long)1 << (type.width - 1));
2172 LLVMValueRef sign;
2173
2174 /* get sign bit */
2175 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2176 sign = LLVMBuildAnd(builder, sign, mask, "");
2177
2178 /* sign * 0.5 */
2179 half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2180 half = LLVMBuildOr(builder, sign, half, "");
2181 half = LLVMBuildBitCast(builder, half, vec_type, "");
2182 }
2183
2184 res = LLVMBuildFAdd(builder, a, half, "");
2185 }
2186
2187 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2188
2189 return res;
2190 }
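/*
 * Scalar sketch of the fallback path above (illustrative only, assuming
 * 32-bit floats small enough for exact int conversion): add 0.5 carrying
 * the sign of 'a', then truncate toward zero.
 */
static inline int
iround_scalar_sketch(float a)
{
   float half = (a < 0.0f) ? -0.5f : 0.5f;   /* 0.5 with the sign of a */
   return (int)(a + half);                   /* fptosi truncates toward zero */
}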
2191
2192
2193 /**
2194 * Return floor of float (vector), result is an int (vector)
2195 * Ex: ifloor(1.1) = 1
2196 * Ex: ifloor(-1.1) = -2
2197 */
2198 LLVMValueRef
2199 lp_build_ifloor(struct lp_build_context *bld,
2200 LLVMValueRef a)
2201 {
2202 LLVMBuilderRef builder = bld->gallivm->builder;
2203 const struct lp_type type = bld->type;
2204 LLVMTypeRef int_vec_type = bld->int_vec_type;
2205 LLVMValueRef res;
2206
2207 assert(type.floating);
2208 assert(lp_check_value(type, a));
2209
2210 res = a;
2211 if (type.sign) {
2212 if (arch_rounding_available(type)) {
2213 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2214 }
2215 else {
2216 struct lp_type inttype;
2217 struct lp_build_context intbld;
2218 LLVMValueRef trunc, itrunc, mask;
2219
2220 assert(type.floating);
2221 assert(lp_check_value(type, a));
2222
2223 inttype = type;
2224 inttype.floating = 0;
2225 lp_build_context_init(&intbld, bld->gallivm, inttype);
2226
2227 /* round by truncation */
2228 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2229 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2230
2231 /*
2232 * fix values if rounding is wrong (for non-special cases)
2233 * - this is the case if trunc > a
2234 * The results of doing this with NaNs, very large values etc.
2235 * are undefined but this seems to be the case anyway.
2236 */
2237 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2238 /* cheapie minus one with mask since the mask is minus one / zero */
2239 return lp_build_add(&intbld, itrunc, mask);
2240 }
2241 }
2242
2243 /* round to nearest (toward zero) */
2244 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2245
2246 return res;
2247 }
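/*
 * Scalar sketch of the non-arch path above (illustrative only): truncate,
 * then use the all-ones (-1) / all-zeros compare mask to subtract one
 * exactly where truncation rounded toward zero instead of down.
 */
static inline int
ifloor_scalar_sketch(float a)
{
   int itrunc = (int)a;                        /* round toward zero */
   int mask = ((float)itrunc > a) ? -1 : 0;    /* vector code gets this from lp_build_cmp */
   return itrunc + mask;                       /* adding -1 subtracts one */
}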
2248
2249
2250 /**
2251 * Return ceiling of float (vector), returning int (vector).
2252 * Ex: iceil( 1.1) = 2
2253 * Ex: iceil(-1.1) = -1
2254 */
2255 LLVMValueRef
2256 lp_build_iceil(struct lp_build_context *bld,
2257 LLVMValueRef a)
2258 {
2259 LLVMBuilderRef builder = bld->gallivm->builder;
2260 const struct lp_type type = bld->type;
2261 LLVMTypeRef int_vec_type = bld->int_vec_type;
2262 LLVMValueRef res;
2263
2264 assert(type.floating);
2265 assert(lp_check_value(type, a));
2266
2267 if (arch_rounding_available(type)) {
2268 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2269 }
2270 else {
2271 struct lp_type inttype;
2272 struct lp_build_context intbld;
2273 LLVMValueRef trunc, itrunc, mask;
2274
2275 assert(type.floating);
2276 assert(lp_check_value(type, a));
2277
2278 inttype = type;
2279 inttype.floating = 0;
2280 lp_build_context_init(&intbld, bld->gallivm, inttype);
2281
2282 /* round by truncation */
2283 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2284 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2285
2286 /*
2287 * fix values if rounding is wrong (for non-special cases)
2288 * - this is the case if trunc < a
2289 * The results of doing this with NaNs, very large values etc.
2290 * are undefined but this seems to be the case anyway.
2291 */
2292 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2293 /* cheapie plus one with mask since the mask is minus one / zero */
2294 return lp_build_sub(&intbld, itrunc, mask);
2295 }
2296
2297 /* round to nearest (toward zero) */
2298 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2299
2300 return res;
2301 }
2302
2303
2304 /**
2305 * Combined ifloor() & fract().
2306 *
2307 * Preferred to calling the functions separately, as it will ensure that the
2308 * strategy (floor() vs ifloor()) that results in less redundant work is used.
2309 */
2310 void
2311 lp_build_ifloor_fract(struct lp_build_context *bld,
2312 LLVMValueRef a,
2313 LLVMValueRef *out_ipart,
2314 LLVMValueRef *out_fpart)
2315 {
2316 LLVMBuilderRef builder = bld->gallivm->builder;
2317 const struct lp_type type = bld->type;
2318 LLVMValueRef ipart;
2319
2320 assert(type.floating);
2321 assert(lp_check_value(type, a));
2322
2323 if (arch_rounding_available(type)) {
2324 /*
2325 * floor() is easier.
2326 */
2327
2328 ipart = lp_build_floor(bld, a);
2329 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2330 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2331 }
2332 else {
2333 /*
2334 * ifloor() is easier.
2335 */
2336
2337 *out_ipart = lp_build_ifloor(bld, a);
2338 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2339 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2340 }
2341 }
2342
2343
2344 /**
2345 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2346 * always smaller than one.
2347 */
2348 void
2349 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2350 LLVMValueRef a,
2351 LLVMValueRef *out_ipart,
2352 LLVMValueRef *out_fpart)
2353 {
2354 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2355 *out_fpart = clamp_fract(bld, *out_fpart);
2356 }
2357
2358
2359 LLVMValueRef
2360 lp_build_sqrt(struct lp_build_context *bld,
2361 LLVMValueRef a)
2362 {
2363 LLVMBuilderRef builder = bld->gallivm->builder;
2364 const struct lp_type type = bld->type;
2365 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2366 char intrinsic[32];
2367
2368 assert(lp_check_value(type, a));
2369
2370 /* TODO: optimize the constant case */
2371
2372 assert(type.floating);
2373 if (type.length == 1) {
2374 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.f%u", type.width);
2375 }
2376 else {
2377 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
2378 }
2379
2380 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2381 }
2382
2383
2384 /**
2385 * Do one Newton-Raphson step to improve reciprocal precision:
2386 *
2387 * x_{i+1} = x_i * (2 - a * x_i)
2388 *
2389 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2390 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
2391 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
2392 * halo. It would be necessary to clamp the argument to prevent this.
2393 *
2394 * See also:
2395 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2396 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2397 */
2398 static INLINE LLVMValueRef
2399 lp_build_rcp_refine(struct lp_build_context *bld,
2400 LLVMValueRef a,
2401 LLVMValueRef rcp_a)
2402 {
2403 LLVMBuilderRef builder = bld->gallivm->builder;
2404 LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
2405 LLVMValueRef res;
2406
2407 res = LLVMBuildFMul(builder, a, rcp_a, "");
2408 res = LLVMBuildFSub(builder, two, res, "");
2409 res = LLVMBuildFMul(builder, rcp_a, res, "");
2410
2411 return res;
2412 }
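/*
 * Scalar equivalent of the refinement step above (illustrative only): each
 * Newton-Raphson iteration roughly doubles the number of correct bits of an
 * initial reciprocal estimate.
 */
static inline float
rcp_refine_scalar_sketch(float a, float rcp_a)
{
   return rcp_a * (2.0f - a * rcp_a);
}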
2413
2414
2415 LLVMValueRef
2416 lp_build_rcp(struct lp_build_context *bld,
2417 LLVMValueRef a)
2418 {
2419 LLVMBuilderRef builder = bld->gallivm->builder;
2420 const struct lp_type type = bld->type;
2421
2422 assert(lp_check_value(type, a));
2423
2424 if(a == bld->zero)
2425 return bld->undef;
2426 if(a == bld->one)
2427 return bld->one;
2428 if(a == bld->undef)
2429 return bld->undef;
2430
2431 assert(type.floating);
2432
2433 if(LLVMIsConstant(a))
2434 return LLVMConstFDiv(bld->one, a);
2435
2436 /*
2437 * We don't use RCPPS because:
2438 * - it only has 10 bits of precision
2439 * - it doesn't even get the reciprocal of 1.0 exactly
2440 * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2441 * - for recent processors the benefit over DIVPS is marginal and case
2442 * dependent
2443 *
2444 * We could still use it on certain processors if benchmarks show that the
2445 * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
2446 * particular uses that require fewer workarounds.
2447 */
2448
2449 if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2450 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2451 const unsigned num_iterations = 0;
2452 LLVMValueRef res;
2453 unsigned i;
2454 const char *intrinsic = NULL;
2455
2456 if (type.length == 4) {
2457 intrinsic = "llvm.x86.sse.rcp.ps";
2458 }
2459 else {
2460 intrinsic = "llvm.x86.avx.rcp.ps.256";
2461 }
2462
2463 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2464
2465 for (i = 0; i < num_iterations; ++i) {
2466 res = lp_build_rcp_refine(bld, a, res);
2467 }
2468
2469 return res;
2470 }
2471
2472 return LLVMBuildFDiv(builder, bld->one, a, "");
2473 }
2474
2475
2476 /**
2477 * Do one Newton-Raphson step to improve rsqrt precision:
2478 *
2479 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2480 *
2481 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2482 */
2483 static INLINE LLVMValueRef
2484 lp_build_rsqrt_refine(struct lp_build_context *bld,
2485 LLVMValueRef a,
2486 LLVMValueRef rsqrt_a)
2487 {
2488 LLVMBuilderRef builder = bld->gallivm->builder;
2489 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2490 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2491 LLVMValueRef res;
2492
2493 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2494 res = LLVMBuildFMul(builder, a, res, "");
2495 res = LLVMBuildFSub(builder, three, res, "");
2496 res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2497 res = LLVMBuildFMul(builder, half, res, "");
2498
2499 return res;
2500 }
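/*
 * Scalar equivalent of the rsqrt refinement step above (illustrative only).
 */
static inline float
rsqrt_refine_scalar_sketch(float a, float rsqrt_a)
{
   return 0.5f * rsqrt_a * (3.0f - a * rsqrt_a * rsqrt_a);
}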
2501
2502
2503 /**
2504 * Generate 1/sqrt(a).
2505 * Result is undefined for values < 0, infinity for +0.
2506 */
2507 LLVMValueRef
2508 lp_build_rsqrt(struct lp_build_context *bld,
2509 LLVMValueRef a)
2510 {
2511 LLVMBuilderRef builder = bld->gallivm->builder;
2512 const struct lp_type type = bld->type;
2513
2514 assert(lp_check_value(type, a));
2515
2516 assert(type.floating);
2517
2518 /*
2519 * This should be faster but all denormals will end up as infinity.
2520 */
2521 if (0 && lp_build_fast_rsqrt_available(type)) {
2522 const unsigned num_iterations = 1;
2523 LLVMValueRef res;
2524 unsigned i;
2525
2526 /* rsqrt(1.0) != 1.0 here */
2527 res = lp_build_fast_rsqrt(bld, a);
2528
2529 if (num_iterations) {
2530 /*
2531 * Newton-Raphson will result in NaN instead of infinity for zero,
2532 * and NaN instead of zero for infinity.
2533 * Also, need to ensure rsqrt(1.0) == 1.0.
2534 * All numbers smaller than FLT_MIN will result in +infinity
2535 * (rsqrtps treats all denormals as zero).
2536 */
2537 /*
2538 * Certain non-c99 compilers don't know INFINITY and might not support
2539 * hacks to evaluate it at compile time either.
2540 */
2541 const unsigned posinf_int = 0x7F800000;
2542 LLVMValueRef cmp;
2543 LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2544 LLVMValueRef inf = lp_build_const_int_vec(bld->gallivm, type, posinf_int);
2545
2546 inf = LLVMBuildBitCast(builder, inf, lp_build_vec_type(bld->gallivm, type), "");
2547
2548 for (i = 0; i < num_iterations; ++i) {
2549 res = lp_build_rsqrt_refine(bld, a, res);
2550 }
2551 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2552 res = lp_build_select(bld, cmp, inf, res);
2553 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2554 res = lp_build_select(bld, cmp, bld->zero, res);
2555 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2556 res = lp_build_select(bld, cmp, bld->one, res);
2557 }
2558
2559 return res;
2560 }
2561
2562 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2563 }
2564
2565 /**
2566 * Check whether a fast (but inaccurate) rsqrt instruction is available
2567 * (the caller may want to avoid calling rsqrt_fast if it's not available;
2568 * e.g. for calculating x^0.5 it may do rsqrt_fast(x) * x, but without the
2569 * instruction that would turn into sqrt/div/mul, so it is obviously
2570 * better to just call sqrt, skipping both the div and the mul).
2571 */
2572 boolean
2573 lp_build_fast_rsqrt_available(struct lp_type type)
2574 {
2575 assert(type.floating);
2576
2577 if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2578 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2579 return true;
2580 }
2581 return false;
2582 }
2583
2584
2585 /**
2586 * Generate 1/sqrt(a).
2587 * Result is undefined for values < 0, infinity for +0.
2588 * Precision is limited, only ~10 bits guaranteed
2589 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2590 */
2591 LLVMValueRef
2592 lp_build_fast_rsqrt(struct lp_build_context *bld,
2593 LLVMValueRef a)
2594 {
2595 LLVMBuilderRef builder = bld->gallivm->builder;
2596 const struct lp_type type = bld->type;
2597
2598 assert(lp_check_value(type, a));
2599
2600 if (lp_build_fast_rsqrt_available(type)) {
2601 const char *intrinsic = NULL;
2602
2603 if (type.length == 4) {
2604 intrinsic = "llvm.x86.sse.rsqrt.ps";
2605 }
2606 else {
2607 intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2608 }
2609 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2610 }
2611 else {
2612 debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2613 }
2614 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2615 }
2616
2617
2618 /**
2619 * Generate sin(a) or cos(a) using polynomial approximation.
2620 * TODO: it might be worth recognizing sin and cos using same source
2621 * (i.e. d3d10 sincos opcode). Obviously doing both at the same time
2622 * would be way cheaper than calculating (nearly) everything twice...
2623 * Not sure it's common enough to be worth bothering, however; the scs
2624 * opcode could also benefit from calculating both, though.
2625 */
2626 static LLVMValueRef
2627 lp_build_sin_or_cos(struct lp_build_context *bld,
2628 LLVMValueRef a,
2629 boolean cos)
2630 {
2631 struct gallivm_state *gallivm = bld->gallivm;
2632 LLVMBuilderRef b = gallivm->builder;
2633 struct lp_type int_type = lp_int_type(bld->type);
2634
2635 /*
2636 * take the absolute value,
2637 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2638 */
2639
2640 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2641 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2642
2643 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2644 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2645
2646 /*
2647 * scale by 4/Pi
2648 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2649 */
2650
2651 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2652 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2653
2654 /*
2655 * store the integer part of y in mm0
2656 * emm2 = _mm_cvttps_epi32(y);
2657 */
2658
2659 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2660
2661 /*
2662 * j=(j+1) & (~1) (see the cephes sources)
2663 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2664 */
2665
2666 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2667 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2668 /*
2669 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2670 */
2671 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2672 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2673
2674 /*
2675 * y = _mm_cvtepi32_ps(emm2);
2676 */
2677 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2678
2679 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2680 LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2681 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2682 LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2683
2684 /*
2685 * Argument used for poly selection and sign bit determination
2686 * is different for sin vs. cos.
2687 */
2688 LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
2689 emm2_and;
2690
2691 LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
2692 LLVMBuildNot(b, emm2_2, ""), ""),
2693 const_29, "sign_bit") :
2694 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
2695 LLVMBuildShl(b, emm2_add,
2696 const_29, ""), ""),
2697 sign_mask, "sign_bit");
2698
2699 /*
2700 * get the polynom selection mask
2701 * there is one polynom for 0 <= x <= Pi/4
2702 * and another one for Pi/4<x<=Pi/2
2703 * Both branches will be computed.
2704 *
2705 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2706 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2707 */
2708
2709 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
2710 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2711 int_type, PIPE_FUNC_EQUAL,
2712 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2713
2714 /*
2715 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2716 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2717 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2718 */
2719 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2720 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2721 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2722
2723 /*
2724 * The magic pass: "Extended precision modular arithmetic"
2725 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2726 * xmm1 = _mm_mul_ps(y, xmm1);
2727 * xmm2 = _mm_mul_ps(y, xmm2);
2728 * xmm3 = _mm_mul_ps(y, xmm3);
2729 */
2730 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
2731 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
2732 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
2733
2734 /*
2735 * x = _mm_add_ps(x, xmm1);
2736 * x = _mm_add_ps(x, xmm2);
2737 * x = _mm_add_ps(x, xmm3);
2738 */
2739
2740 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
2741 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
2742 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
2743
2744 /*
2745 * Evaluate the first polynom (0 <= x <= Pi/4)
2746 *
2747 * z = _mm_mul_ps(x,x);
2748 */
2749 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2750
2751 /*
2752 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2753 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2754 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2755 */
2756 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2757 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2758 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2759
2760 /*
2761 * y = *(v4sf*)_ps_coscof_p0;
2762 * y = _mm_mul_ps(y, z);
2763 */
2764 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
2765 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
2766 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
2767 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
2768 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2769 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2770
2771
2772 /*
2773 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2774 * y = _mm_sub_ps(y, tmp);
2775 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2776 */
2777 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2778 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2779 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2780 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2781 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2782
2783 /*
2784 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2785 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2786 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2787 */
2788 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2789 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2790 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2791
2792 /*
2793 * Evaluate the second polynom (Pi/4 <= x <= Pi/2)
2794 *
2795 * y2 = *(v4sf*)_ps_sincof_p0;
2796 * y2 = _mm_mul_ps(y2, z);
2797 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2798 * y2 = _mm_mul_ps(y2, z);
2799 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2800 * y2 = _mm_mul_ps(y2, z);
2801 * y2 = _mm_mul_ps(y2, x);
2802 * y2 = _mm_add_ps(y2, x);
2803 */
2804
2805 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
2806 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
2807 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
2808 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
2809 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2810 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
2811 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
2812
2813 /*
2814 * select the correct result from the two polynoms
2815 * xmm3 = poly_mask;
2816 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2817 * y = _mm_andnot_ps(xmm3, y);
2818 * y = _mm_or_ps(y,y2);
2819 */
2820 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2821 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2822 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2823 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
2824 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2825 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
2826
2827 /*
2828 * update the sign
2829 * y = _mm_xor_ps(y, sign_bit);
2830 */
2831 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
2832 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
2833
2834 LLVMValueRef isfinite = lp_build_isfinite(bld, a);
2835
2836 /* clamp output to be within [-1, 1] */
2837 y_result = lp_build_clamp(bld, y_result,
2838 lp_build_const_vec(bld->gallivm, bld->type, -1.f),
2839 lp_build_const_vec(bld->gallivm, bld->type, 1.f));
2840 /* If a is -inf, inf or NaN then return NaN */
2841 y_result = lp_build_select(bld, isfinite, y_result,
2842 lp_build_const_vec(bld->gallivm, bld->type, NAN));
2843 return y_result;
2844 }
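/*
 * Summary of the pipeline above: |a| is scaled by 4/Pi and converted to an
 * integer quadrant index j (with (j+1) & ~1 applied as in the cephes code);
 * the argument is then reduced by subtracting j times Pi/4, split across
 * DP1/DP2/DP3 for extra precision; one of the two polynomials (sin-like or
 * cos-like) is selected per lane from the quadrant bits, and the sign bit
 * derived from the quadrant (and, for sin, from the input sign) is xor'ed
 * back in.  The result is finally clamped to [-1, 1] and forced to NaN for
 * non-finite inputs.
 */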
2845
2846
2847 /**
2848 * Generate sin(a)
2849 */
2850 LLVMValueRef
2851 lp_build_sin(struct lp_build_context *bld,
2852 LLVMValueRef a)
2853 {
2854 return lp_build_sin_or_cos(bld, a, FALSE);
2855 }
2856
2857
2858 /**
2859 * Generate cos(a)
2860 */
2861 LLVMValueRef
2862 lp_build_cos(struct lp_build_context *bld,
2863 LLVMValueRef a)
2864 {
2865 return lp_build_sin_or_cos(bld, a, TRUE);
2866 }
2867
2868
2869 /**
2870 * Generate pow(x, y)
2871 */
2872 LLVMValueRef
2873 lp_build_pow(struct lp_build_context *bld,
2874 LLVMValueRef x,
2875 LLVMValueRef y)
2876 {
2877 /* TODO: optimize the constant case */
2878 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2879 LLVMIsConstant(x) && LLVMIsConstant(y)) {
2880 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2881 __FUNCTION__);
2882 }
2883
2884 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
2885 }
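/*
 * Worked example of the identity used above: pow(x, y) = exp2(y * log2(x)),
 * e.g. pow(2.0, 10.0) = exp2(10.0 * 1.0) = 1024.0 (up to the precision of
 * the exp2/log2 approximations).
 */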
2886
2887
2888 /**
2889 * Generate exp(x)
2890 */
2891 LLVMValueRef
2892 lp_build_exp(struct lp_build_context *bld,
2893 LLVMValueRef x)
2894 {
2895 /* log2(e) = 1/log(2) */
2896 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
2897 1.4426950408889634);
2898
2899 assert(lp_check_value(bld->type, x));
2900
2901 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
2902 }
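/*
 * Worked example: exp(x) = exp2(x * log2(e)), so exp(1.0) = exp2(1.4426950...)
 * ~= 2.7182817, matching e up to the accuracy of the exp2 polynomial.
 */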
2903
2904
2905 /**
2906 * Generate log(x)
2907 * Behavior is undefined with infs, 0s and nans
2908 */
2909 LLVMValueRef
2910 lp_build_log(struct lp_build_context *bld,
2911 LLVMValueRef x)
2912 {
2913 /* log(2) */
2914 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2915 0.69314718055994529);
2916
2917 assert(lp_check_value(bld->type, x));
2918
2919 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
2920 }
2921
2922 /**
2923 * Generate log(x) that handles edge cases (infs, 0s and nans)
2924 */
2925 LLVMValueRef
2926 lp_build_log_safe(struct lp_build_context *bld,
2927 LLVMValueRef x)
2928 {
2929 /* log(2) */
2930 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2931 0.69314718055994529);
2932
2933 assert(lp_check_value(bld->type, x));
2934
2935 return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
2936 }
2937
2938
2939 /**
2940 * Generate polynomial.
2941 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
2942 */
2943 LLVMValueRef
2944 lp_build_polynomial(struct lp_build_context *bld,
2945 LLVMValueRef x,
2946 const double *coeffs,
2947 unsigned num_coeffs)
2948 {
2949 const struct lp_type type = bld->type;
2950 LLVMValueRef even = NULL, odd = NULL;
2951 LLVMValueRef x2;
2952 unsigned i;
2953
2954 assert(lp_check_value(bld->type, x));
2955
2956 /* TODO: optimize the constant case */
2957 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2958 LLVMIsConstant(x)) {
2959 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2960 __FUNCTION__);
2961 }
2962
2963 /*
2964 * Calculate odd and even terms separately to decrease data dependency
2965 * Ex:
2966 * c[0] + x^2 * c[2] + x^4 * c[4] ...
2967 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
2968 */
2969 x2 = lp_build_mul(bld, x, x);
2970
2971 for (i = num_coeffs; i--; ) {
2972 LLVMValueRef coeff;
2973
2974 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
2975
2976 if (i % 2 == 0) {
2977 if (even)
2978 even = lp_build_add(bld, coeff, lp_build_mul(bld, x2, even));
2979 else
2980 even = coeff;
2981 } else {
2982 if (odd)
2983 odd = lp_build_add(bld, coeff, lp_build_mul(bld, x2, odd));
2984 else
2985 odd = coeff;
2986 }
2987 }
2988
2989 if (odd)
2990 return lp_build_add(bld, lp_build_mul(bld, odd, x), even);
2991 else if (even)
2992 return even;
2993 else
2994 return bld->undef;
2995 }
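/*
 * Scalar sketch of the even/odd split above (illustrative only): the two
 * shorter Horner chains in x^2 run independently, roughly halving the
 * dependency chain compared to a single Horner evaluation.
 */
static inline double
polynomial_scalar_sketch(double x, const double *coeffs, unsigned num_coeffs)
{
   double x2 = x * x;
   double even = 0.0, odd = 0.0;
   unsigned i;
   for (i = num_coeffs; i--; ) {
      if (i % 2 == 0)
         even = coeffs[i] + x2 * even;   /* c[0] + x^2*c[2] + x^4*c[4] + ... */
      else
         odd = coeffs[i] + x2 * odd;     /* c[1] + x^2*c[3] + x^4*c[5] + ... */
   }
   return x * odd + even;
}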
2996
2997
2998 /**
2999 * Minimax polynomial fit of 2**x, in range [0, 1[
3000 */
3001 const double lp_build_exp2_polynomial[] = {
3002 #if EXP_POLY_DEGREE == 5
3003 1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
3004 0.693153073200168932794,
3005 0.240153617044375388211,
3006 0.0558263180532956664775,
3007 0.00898934009049466391101,
3008 0.00187757667519147912699
3009 #elif EXP_POLY_DEGREE == 4
3010 1.00000259337069434683,
3011 0.693003834469974940458,
3012 0.24144275689150793076,
3013 0.0520114606103070150235,
3014 0.0135341679161270268764
3015 #elif EXP_POLY_DEGREE == 3
3016 0.999925218562710312959,
3017 0.695833540494823811697,
3018 0.226067155427249155588,
3019 0.0780245226406372992967
3020 #elif EXP_POLY_DEGREE == 2
3021 1.00172476321474503578,
3022 0.657636275736077639316,
3023 0.33718943461968720704
3024 #else
3025 #error
3026 #endif
3027 };
3028
3029
3030 LLVMValueRef
3031 lp_build_exp2(struct lp_build_context *bld,
3032 LLVMValueRef x)
3033 {
3034 LLVMBuilderRef builder = bld->gallivm->builder;
3035 const struct lp_type type = bld->type;
3036 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3037 LLVMValueRef ipart = NULL;
3038 LLVMValueRef fpart = NULL;
3039 LLVMValueRef expipart = NULL;
3040 LLVMValueRef expfpart = NULL;
3041 LLVMValueRef res = NULL;
3042
3043 assert(lp_check_value(bld->type, x));
3044
3045
3046 /* TODO: optimize the constant case */
3047 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3048 LLVMIsConstant(x)) {
3049 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3050 __FUNCTION__);
3051 }
3052
3053 assert(type.floating && type.width == 32);
3054
3055 /* We want to preserve NaN and make sure that for exp2 if x > 128,
3056 * the result is INF and if it's smaller than -126.9 the result is 0. */
3057 x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
3058 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
3059 x = lp_build_max(bld, lp_build_const_vec(bld->gallivm, type, -126.99999), x);
3060
3061 /* ipart = floor(x) */
3062 /* fpart = x - ipart */
3063 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3064
3065
3066
3067 /* expipart = (float) (1 << ipart) */
3068 expipart = LLVMBuildAdd(builder, ipart,
3069 lp_build_const_int_vec(bld->gallivm, type, 127), "");
3070 expipart = LLVMBuildShl(builder, expipart,
3071 lp_build_const_int_vec(bld->gallivm, type, 23), "");
3072 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
3073
3074
3075 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3076 Elements(lp_build_exp2_polynomial));
3077
3078 res = LLVMBuildFMul(builder, expipart, expfpart, "");
3079
3080
3081 return res;
3082 }
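/*
 * Scalar sketch of the decomposition above (illustrative only; assumes
 * 32-bit IEEE floats and x already clamped as above): 2^x = 2^ipart * 2^fpart,
 * where 2^ipart is built directly in the float exponent field and 2^fpart
 * comes from the polynomial of the fractional part (the degree-2
 * coefficients from the table above are used here just for brevity).
 */
static inline float
exp2_scalar_sketch(float x)
{
   union { int i; float f; } expipart;
   int ipart;
   float fpart, expfpart;

   ipart = (int)x;
   if ((float)ipart > x)
      ipart--;                                /* floor for negative x */
   fpart = x - (float)ipart;                  /* in [0, 1) */

   expipart.i = (ipart + 127) << 23;          /* exponent bits only -> 2^ipart */
   expfpart = 1.00172476f + fpart * (0.65763628f + fpart * 0.33718943f);
   return expipart.f * expfpart;
}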
3083
3084
3085
3086 /**
3087 * Extract the exponent of an IEEE-754 floating point value.
3088 *
3089 * Optionally apply an integer bias.
3090 *
3091 * Result is an integer value with
3092 *
3093 * ifloor(log2(x)) + bias
3094 */
3095 LLVMValueRef
3096 lp_build_extract_exponent(struct lp_build_context *bld,
3097 LLVMValueRef x,
3098 int bias)
3099 {
3100 LLVMBuilderRef builder = bld->gallivm->builder;
3101 const struct lp_type type = bld->type;
3102 unsigned mantissa = lp_mantissa(type);
3103 LLVMValueRef res;
3104
3105 assert(type.floating);
3106
3107 assert(lp_check_value(bld->type, x));
3108
3109 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3110
3111 res = LLVMBuildLShr(builder, x,
3112 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3113 res = LLVMBuildAnd(builder, res,
3114 lp_build_const_int_vec(bld->gallivm, type, 255), "");
3115 res = LLVMBuildSub(builder, res,
3116 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3117
3118 return res;
3119 }
3120
3121
3122 /**
3123 * Extract the mantissa of a floating point value.
3124 *
3125 * Result is a floating point value with
3126 *
3127 * x / 2**floor(log2(x))
3128 */
3129 LLVMValueRef
3130 lp_build_extract_mantissa(struct lp_build_context *bld,
3131 LLVMValueRef x)
3132 {
3133 LLVMBuilderRef builder = bld->gallivm->builder;
3134 const struct lp_type type = bld->type;
3135 unsigned mantissa = lp_mantissa(type);
3136 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3137 (1ULL << mantissa) - 1);
3138 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3139 LLVMValueRef res;
3140
3141 assert(lp_check_value(bld->type, x));
3142
3143 assert(type.floating);
3144
3145 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3146
3147 /* res = x / 2**ipart */
3148 res = LLVMBuildAnd(builder, x, mantmask, "");
3149 res = LLVMBuildOr(builder, res, one, "");
3150 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3151
3152 return res;
3153 }
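/*
 * Scalar sketch of the two helpers above (illustrative only, 32-bit IEEE
 * floats, assumes 'unsigned' is 32 bits): bits 23..30 hold the exponent
 * biased by 127; forcing that field to the bias while keeping the mantissa
 * bits yields x / 2**floor(log2(x)), a value in [1, 2).
 */
static inline float
extract_mantissa_scalar_sketch(float x)
{
   union { unsigned u; float f; } v;
   v.f = x;
   v.u = (v.u & 0x007fffffu) | 0x3f800000u;   /* keep mantissa, set exponent to 0 */
   return v.f;
}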
3154
3155
3156
3157 /**
3158 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in the range [0, 1/9[
3159 * These coefficients can be generated with
3160 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3161 */
3162 const double lp_build_log2_polynomial[] = {
3163 #if LOG_POLY_DEGREE == 5
3164 2.88539008148777786488L,
3165 0.961796878841293367824L,
3166 0.577058946784739859012L,
3167 0.412914355135828735411L,
3168 0.308591899232910175289L,
3169 0.352376952300281371868L,
3170 #elif LOG_POLY_DEGREE == 4
3171 2.88539009343309178325L,
3172 0.961791550404184197881L,
3173 0.577440339438736392009L,
3174 0.403343858251329912514L,
3175 0.406718052498846252698L,
3176 #elif LOG_POLY_DEGREE == 3
3177 2.88538959748872753838L,
3178 0.961932915889597772928L,
3179 0.571118517972136195241L,
3180 0.493997535084709500285L,
3181 #else
3182 #error
3183 #endif
3184 };
3185
3186 /**
3187 * See http://www.devmaster.net/forums/showthread.php?p=43580
3188 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3189 * http://www.nezumi.demon.co.uk/consult/logx.htm
3190 *
3191 * If handle_edge_cases is true the function will perform computations
3192 * to match the required D3D10+ behavior for each of the edge cases.
3193 * That means that if input is:
3194 * - less than zero (to and including -inf) then NaN will be returned
3195 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3196 * - +infinity, then +infinity will be returned
3197 * - NaN, then NaN will be returned
3198 *
3199 * Those checks are fairly expensive so if you don't need them make sure
3200 * handle_edge_cases is false.
3201 */
3202 void
3203 lp_build_log2_approx(struct lp_build_context *bld,
3204 LLVMValueRef x,
3205 LLVMValueRef *p_exp,
3206 LLVMValueRef *p_floor_log2,
3207 LLVMValueRef *p_log2,
3208 boolean handle_edge_cases)
3209 {
3210 LLVMBuilderRef builder = bld->gallivm->builder;
3211 const struct lp_type type = bld->type;
3212 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3213 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3214
3215 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3216 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3217 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3218
3219 LLVMValueRef i = NULL;
3220 LLVMValueRef y = NULL;
3221 LLVMValueRef z = NULL;
3222 LLVMValueRef exp = NULL;
3223 LLVMValueRef mant = NULL;
3224 LLVMValueRef logexp = NULL;
3225 LLVMValueRef logmant = NULL;
3226 LLVMValueRef res = NULL;
3227
3228 assert(lp_check_value(bld->type, x));
3229
3230 if(p_exp || p_floor_log2 || p_log2) {
3231 /* TODO: optimize the constant case */
3232 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3233 LLVMIsConstant(x)) {
3234 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3235 __FUNCTION__);
3236 }
3237
3238 assert(type.floating && type.width == 32);
3239
3240 /*
3241 * We don't explicitly handle denormalized numbers. They will yield a
3242 * result in the neighbourhood of -127, which appears to be adequate
3243 * enough.
3244 */
3245
3246 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3247
3248 /* exp = (float) exponent(x) */
3249 exp = LLVMBuildAnd(builder, i, expmask, "");
3250 }
3251
3252 if(p_floor_log2 || p_log2) {
3253 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3254 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3255 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3256 }
3257
3258 if(p_log2) {
3259 /* mant = 1 + (float) mantissa(x) */
3260 mant = LLVMBuildAnd(builder, i, mantmask, "");
3261 mant = LLVMBuildOr(builder, mant, one, "");
3262 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3263
3264 /* y = (mant - 1) / (mant + 1) */
3265 y = lp_build_div(bld,
3266 lp_build_sub(bld, mant, bld->one),
3267 lp_build_add(bld, mant, bld->one)
3268 );
3269
3270 /* z = y^2 */
3271 z = lp_build_mul(bld, y, y);
3272
3273 /* compute P(z) */
3274 logmant = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3275 Elements(lp_build_log2_polynomial));
3276
3277 /* logmant = y * P(z) */
3278 logmant = lp_build_mul(bld, y, logmant);
3279
3280 res = lp_build_add(bld, logmant, logexp);
3281
3282 if (type.floating && handle_edge_cases) {
3283 LLVMValueRef negmask, infmask, zmask;
3284 negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3285 lp_build_const_vec(bld->gallivm, type, 0.0f));
3286 zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3287 lp_build_const_vec(bld->gallivm, type, 0.0f));
3288 infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3289 lp_build_const_vec(bld->gallivm, type, INFINITY));
3290
3291 /* If x is equal to inf make sure we return inf */
3292 res = lp_build_select(bld, infmask,
3293 lp_build_const_vec(bld->gallivm, type, INFINITY),
3294 res);
3295 /* If x is equal to 0, return -inf */
3296 res = lp_build_select(bld, zmask,
3297 lp_build_const_vec(bld->gallivm, type, -INFINITY),
3298 res);
3299 /* If x is nan or less than 0, return nan */
3300 res = lp_build_select(bld, negmask,
3301 lp_build_const_vec(bld->gallivm, type, NAN),
3302 res);
3303 }
3304 }
3305
3306 if(p_exp) {
3307 exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3308 *p_exp = exp;
3309 }
3310
3311 if(p_floor_log2)
3312 *p_floor_log2 = logexp;
3313
3314 if(p_log2)
3315 *p_log2 = res;
3316 }
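/*
 * Scalar sketch of the approximation above (illustrative only, 32-bit IEEE
 * floats, no edge-case handling): log2(x) = exponent + y * P(y^2) with
 * y = (mant - 1) / (mant + 1), using the same coefficient table.
 */
static inline float
log2_scalar_sketch(float x)
{
   union { unsigned u; float f; } v;
   float exponent, mant, y, z, p;
   unsigned i;

   v.f = x;
   exponent = (float)((int)((v.u >> 23) & 0xff) - 127);
   v.u = (v.u & 0x007fffffu) | 0x3f800000u;    /* mant in [1, 2) */
   mant = v.f;

   y = (mant - 1.0f) / (mant + 1.0f);
   z = y * y;
   p = 0.0f;
   for (i = Elements(lp_build_log2_polynomial); i--; )
      p = (float)lp_build_log2_polynomial[i] + z * p;

   return exponent + y * p;
}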
3317
3318
3319 /*
3320 * log2 implementation which doesn't have special code to
3321 * handle edge cases (-inf, 0, inf, NaN). It's faster but
3322 * the results for those cases are undefined.
3323 */
3324 LLVMValueRef
3325 lp_build_log2(struct lp_build_context *bld,
3326 LLVMValueRef x)
3327 {
3328 LLVMValueRef res;
3329 lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
3330 return res;
3331 }
3332
3333 /*
3334 * Version of log2 which handles all edge cases.
3335 * Look at documentation of lp_build_log2_approx for
3336 * description of the behavior for each of the edge cases.
3337 */
3338 LLVMValueRef
3339 lp_build_log2_safe(struct lp_build_context *bld,
3340 LLVMValueRef x)
3341 {
3342 LLVMValueRef res;
3343 lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
3344 return res;
3345 }
3346
3347
3348 /**
3349 * Faster (and less accurate) log2.
3350 *
3351 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3352 *
3353 * Piece-wise linear approximation, with exact results when x is a
3354 * power of two.
3355 *
3356 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3357 */
3358 LLVMValueRef
3359 lp_build_fast_log2(struct lp_build_context *bld,
3360 LLVMValueRef x)
3361 {
3362 LLVMBuilderRef builder = bld->gallivm->builder;
3363 LLVMValueRef ipart;
3364 LLVMValueRef fpart;
3365
3366 assert(lp_check_value(bld->type, x));
3367
3368 assert(bld->type.floating);
3369
3370 /* ipart = floor(log2(x)) - 1 */
3371 ipart = lp_build_extract_exponent(bld, x, -1);
3372 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3373
3374 /* fpart = x / 2**ipart */
3375 fpart = lp_build_extract_mantissa(bld, x);
3376
3377 /* ipart + fpart */
3378 return LLVMBuildFAdd(builder, ipart, fpart, "");
3379 }
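/*
 * Worked example of the piece-wise linear approximation above:
 * fast_log2(4.0) = (2 - 1) + 4.0/4.0 = 2.0 (exact, power of two), while
 * fast_log2(6.0) = (2 - 1) + 6.0/4.0 = 2.5 versus the true log2(6) ~= 2.585.
 */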
3380
3381
3382 /**
3383 * Fast implementation of iround(log2(x)).
3384 *
3385 * Not an approximation -- it should give accurate results all the time.
3386 */
3387 LLVMValueRef
3388 lp_build_ilog2(struct lp_build_context *bld,
3389 LLVMValueRef x)
3390 {
3391 LLVMBuilderRef builder = bld->gallivm->builder;
3392 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3393 LLVMValueRef ipart;
3394
3395 assert(bld->type.floating);
3396
3397 assert(lp_check_value(bld->type, x));
3398
3399 /* x * 2^(0.5) i.e., add 0.5 to the log2(x) */
3400 x = LLVMBuildFMul(builder, x, sqrt2, "");
3401
3402 /* ipart = floor(log2(x) + 0.5) */
3403 ipart = lp_build_extract_exponent(bld, x, 0);
3404
3405 return ipart;
3406 }
3407
3408 LLVMValueRef
3409 lp_build_mod(struct lp_build_context *bld,
3410 LLVMValueRef x,
3411 LLVMValueRef y)
3412 {
3413 LLVMBuilderRef builder = bld->gallivm->builder;
3414 LLVMValueRef res;
3415 const struct lp_type type = bld->type;
3416
3417 assert(lp_check_value(type, x));
3418 assert(lp_check_value(type, y));
3419
3420 if (type.floating)
3421 res = LLVMBuildFRem(builder, x, y, "");
3422 else if (type.sign)
3423 res = LLVMBuildSRem(builder, x, y, "");
3424 else
3425 res = LLVMBuildURem(builder, x, y, "");
3426 return res;
3427 }
3428
3429
3430 /*
3431 * For floating inputs it creates and returns a mask
3432 * which is all 1's for channels which are NaN.
3433 * Channels inside x which are not NaN will be 0.
3434 */
3435 LLVMValueRef
3436 lp_build_isnan(struct lp_build_context *bld,
3437 LLVMValueRef x)
3438 {
3439 LLVMValueRef mask;
3440 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3441
3442 assert(bld->type.floating);
3443 assert(lp_check_value(bld->type, x));
3444
3445 mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3446 "isnotnan");
3447 mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3448 mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3449 return mask;
3450 }
3451
3452 /* Returns all 1's for floating point numbers that are
3453 * finite numbers and returns all zeros for -inf,
3454 * inf and nan's */
3455 LLVMValueRef
3456 lp_build_isfinite(struct lp_build_context *bld,
3457 LLVMValueRef x)
3458 {
3459 LLVMBuilderRef builder = bld->gallivm->builder;
3460 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3461 struct lp_type int_type = lp_int_type(bld->type);
3462 LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3463 LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3464 0x7f800000);
3465
3466 if (!bld->type.floating) {
3467 return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3468 }
3469 assert(bld->type.floating);
3470 assert(lp_check_value(bld->type, x));
3471 assert(bld->type.width == 32);
3472
3473 intx = LLVMBuildAnd(builder, intx, infornan32, "");
3474 return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3475 intx, infornan32);
3476 }
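/*
 * Scalar sketch of the test above (illustrative only, 32-bit IEEE floats,
 * assumes 'unsigned' is 32 bits): a float is finite iff its exponent field
 * is not all ones.
 */
static inline int
isfinite_scalar_sketch(float x)
{
   union { unsigned u; float f; } v;
   v.f = x;
   return (v.u & 0x7f800000u) != 0x7f800000u;
}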
3477
3478 /*
3479 * Returns true if the number is nan or inf and false otherwise.
3480 * The input has to be a floating point vector.
3481 */
3482 LLVMValueRef
3483 lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
3484 const struct lp_type type,
3485 LLVMValueRef x)
3486 {
3487 LLVMBuilderRef builder = gallivm->builder;
3488 struct lp_type int_type = lp_int_type(type);
3489 LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
3490 0x7f800000);
3491 LLVMValueRef ret;
3492
3493 assert(type.floating);
3494
3495 ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
3496 ret = LLVMBuildAnd(builder, ret, const0, "");
3497 ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
3498 ret, const0);
3499
3500 return ret;
3501 }
3502
3503
3504 LLVMValueRef
3505 lp_build_fpstate_get(struct gallivm_state *gallivm)
3506 {
3507 if (util_cpu_caps.has_sse) {
3508 LLVMBuilderRef builder = gallivm->builder;
3509 LLVMValueRef mxcsr_ptr = lp_build_alloca(
3510 gallivm,
3511 LLVMInt32TypeInContext(gallivm->context),
3512 "mxcsr_ptr");
3513 LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
3514 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3515 lp_build_intrinsic(builder,
3516 "llvm.x86.sse.stmxcsr",
3517 LLVMVoidTypeInContext(gallivm->context),
3518 &mxcsr_ptr8, 1);
3519 return mxcsr_ptr;
3520 }
3521 return 0;
3522 }
3523
3524 void
3525 lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
3526 boolean zero)
3527 {
3528 if (util_cpu_caps.has_sse) {
3529 /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
3530 int daz_ftz = _MM_FLUSH_ZERO_MASK;
3531
3532 LLVMBuilderRef builder = gallivm->builder;
3533 LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
3534 LLVMValueRef mxcsr =
3535 LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");
3536
3537 if (util_cpu_caps.has_daz) {
3538 /* Enable denormals-are-zero mode */
3539 daz_ftz |= _MM_DENORMALS_ZERO_MASK;
3540 }
3541 if (zero) {
3542 mxcsr = LLVMBuildOr(builder, mxcsr,
3543 LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
3544 } else {
3545 mxcsr = LLVMBuildAnd(builder, mxcsr,
3546 LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
3547 }
3548
3549 LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
3550 lp_build_fpstate_set(gallivm, mxcsr_ptr);
3551 }
3552 }
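/*
 * For reference, a host-side sketch of the same MXCSR manipulation the
 * generated code performs (illustrative only, requires SSE at compile time;
 * the hypothetical helper below is not used anywhere):
 */
#if defined(PIPE_ARCH_SSE)
static inline void
set_denorms_zero_host_sketch(boolean zero)
{
   unsigned mxcsr = _mm_getcsr();
   unsigned daz_ftz = _MM_FLUSH_ZERO_MASK;         /* FTZ bit */
   if (util_cpu_caps.has_daz)
      daz_ftz |= _MM_DENORMALS_ZERO_MASK;          /* DAZ bit, if supported */
   if (zero)
      mxcsr |= daz_ftz;
   else
      mxcsr &= ~daz_ftz;
   _mm_setcsr(mxcsr);
}
#endif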
3553
3554 void
3555 lp_build_fpstate_set(struct gallivm_state *gallivm,
3556 LLVMValueRef mxcsr_ptr)
3557 {
3558 if (util_cpu_caps.has_sse) {
3559 LLVMBuilderRef builder = gallivm->builder;
3560 mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
3561 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3562 lp_build_intrinsic(builder,
3563 "llvm.x86.sse.ldmxcsr",
3564 LLVMVoidTypeInContext(gallivm->context),
3565 &mxcsr_ptr, 1);
3566 }
3567 }