gallivm: do per-sample depth comparison instead of doing it post-filter
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort machine-specific intrinsics directly. The functions here hide all
36 * resort to machine-specific intrinsics directly. The functions here hide all
37 *
38 * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include <float.h>
49
50 #include "util/u_memory.h"
51 #include "util/u_debug.h"
52 #include "util/u_math.h"
53 #include "util/u_string.h"
54 #include "util/u_cpu_detect.h"
55
56 #include "lp_bld_type.h"
57 #include "lp_bld_const.h"
58 #include "lp_bld_init.h"
59 #include "lp_bld_intr.h"
60 #include "lp_bld_logic.h"
61 #include "lp_bld_pack.h"
62 #include "lp_bld_debug.h"
63 #include "lp_bld_bitarit.h"
64 #include "lp_bld_arit.h"
65 #include "lp_bld_flow.h"
66
67
68 #define EXP_POLY_DEGREE 5
69
70 #define LOG_POLY_DEGREE 4
71
72
73 /**
74 * Generate min(a, b)
75 * No checks for the special-case values a or b = 1 or 0 are done.
76 * NaNs are handled according to the behavior specified by the
77 * nan_behavior argument.
78 */
79 static LLVMValueRef
80 lp_build_min_simple(struct lp_build_context *bld,
81 LLVMValueRef a,
82 LLVMValueRef b,
83 enum gallivm_nan_behavior nan_behavior)
84 {
85 const struct lp_type type = bld->type;
86 const char *intrinsic = NULL;
87 unsigned intr_size = 0;
88 LLVMValueRef cond;
89
90 assert(lp_check_value(type, a));
91 assert(lp_check_value(type, b));
92
93 /* TODO: optimize the constant case */
94
95 if (type.floating && util_cpu_caps.has_sse) {
96 if (type.width == 32) {
97 if (type.length == 1) {
98 intrinsic = "llvm.x86.sse.min.ss";
99 intr_size = 128;
100 }
101 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
102 intrinsic = "llvm.x86.sse.min.ps";
103 intr_size = 128;
104 }
105 else {
106 intrinsic = "llvm.x86.avx.min.ps.256";
107 intr_size = 256;
108 }
109 }
110 if (type.width == 64 && util_cpu_caps.has_sse2) {
111 if (type.length == 1) {
112 intrinsic = "llvm.x86.sse2.min.sd";
113 intr_size = 128;
114 }
115 else if (type.length == 2 || !util_cpu_caps.has_avx) {
116 intrinsic = "llvm.x86.sse2.min.pd";
117 intr_size = 128;
118 }
119 else {
120 intrinsic = "llvm.x86.avx.min.pd.256";
121 intr_size = 256;
122 }
123 }
124 }
125 else if (type.floating && util_cpu_caps.has_altivec) {
126 debug_printf("%s: altivec doesn't support nan behavior modes\n",
127 __FUNCTION__);
128 if (type.width == 32 && type.length == 4) {
129 intrinsic = "llvm.ppc.altivec.vminfp";
130 intr_size = 128;
131 }
132 } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
133 intr_size = 128;
134 if ((type.width == 8 || type.width == 16) &&
135 (type.width * type.length <= 64) &&
136 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
137 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
138 __FUNCTION__);
139 }
140 if (type.width == 8 && !type.sign) {
141 intrinsic = "llvm.x86.sse2.pminu.b";
142 }
143 else if (type.width == 16 && type.sign) {
144 intrinsic = "llvm.x86.sse2.pmins.w";
145 }
146 if (util_cpu_caps.has_sse4_1) {
147 if (type.width == 8 && type.sign) {
148 intrinsic = "llvm.x86.sse41.pminsb";
149 }
150 if (type.width == 16 && !type.sign) {
151 intrinsic = "llvm.x86.sse41.pminuw";
152 }
153 if (type.width == 32 && !type.sign) {
154 intrinsic = "llvm.x86.sse41.pminud";
155 }
156 if (type.width == 32 && type.sign) {
157 intrinsic = "llvm.x86.sse41.pminsd";
158 }
159 }
160 } else if (util_cpu_caps.has_altivec) {
161 intr_size = 128;
162 debug_printf("%s: altivec doesn't support nan behavior modes\n",
163 __FUNCTION__);
164 if (type.width == 8) {
165 if (!type.sign) {
166 intrinsic = "llvm.ppc.altivec.vminub";
167 } else {
168 intrinsic = "llvm.ppc.altivec.vminsb";
169 }
170 } else if (type.width == 16) {
171 if (!type.sign) {
172 intrinsic = "llvm.ppc.altivec.vminuh";
173 } else {
174 intrinsic = "llvm.ppc.altivec.vminsh";
175 }
176 } else if (type.width == 32) {
177 if (!type.sign) {
178 intrinsic = "llvm.ppc.altivec.vminuw";
179 } else {
180 intrinsic = "llvm.ppc.altivec.vminsw";
181 }
182 }
183 }
184
185 if(intrinsic) {
186 /* We need to handle NaNs for floating point numbers. If one of the
187 * inputs is NaN the other should be returned (required by both D3D10+
188 * and OpenCL).
189 * The sse intrinsics return the second operand in case of NaN by
190 * default, so we need special code to handle those cases.
191 */
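      /*
       * Roughly: minps/maxps return their second operand whenever the first
       * one is a NaN, so for GALLIVM_NAN_RETURN_OTHER only the "b is NaN"
       * case needs the isnan/select fixup below, and for
       * GALLIVM_NAN_RETURN_NAN only the "a is NaN" case does.
       */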
192 if (util_cpu_caps.has_sse && type.floating &&
193 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
194 nan_behavior != GALLIVM_NAN_RETURN_SECOND) {
195 LLVMValueRef isnan, max;
196 max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
197 type,
198 intr_size, a, b);
199 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
200 isnan = lp_build_isnan(bld, b);
201 return lp_build_select(bld, isnan, a, max);
202 } else {
203 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
204 isnan = lp_build_isnan(bld, a);
205 return lp_build_select(bld, isnan, a, max);
206 }
207 } else {
208 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
209 type,
210 intr_size, a, b);
211 }
212 }
213
214 if (type.floating) {
215 switch (nan_behavior) {
216 case GALLIVM_NAN_RETURN_NAN: {
217 LLVMValueRef isnan = lp_build_isnan(bld, b);
218 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
219 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
220 return lp_build_select(bld, cond, a, b);
221 }
222 break;
223 case GALLIVM_NAN_RETURN_OTHER: {
224 LLVMValueRef isnan = lp_build_isnan(bld, a);
225 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
226 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
227 return lp_build_select(bld, cond, a, b);
228 }
229 break;
230 case GALLIVM_NAN_RETURN_SECOND:
231 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
232 return lp_build_select(bld, cond, a, b);
233 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
234 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
235 return lp_build_select(bld, cond, a, b);
236 break;
237 default:
238 assert(0);
239 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
240 return lp_build_select(bld, cond, a, b);
241 }
242 } else {
243 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
244 return lp_build_select(bld, cond, a, b);
245 }
246 }
247
248
249 /**
250 * Generate max(a, b)
251 * No checks for the special-case values a or b = 1 or 0 are done.
252 * NaNs are handled according to the behavior specified by the
253 * nan_behavior argument.
254 */
255 static LLVMValueRef
256 lp_build_max_simple(struct lp_build_context *bld,
257 LLVMValueRef a,
258 LLVMValueRef b,
259 enum gallivm_nan_behavior nan_behavior)
260 {
261 const struct lp_type type = bld->type;
262 const char *intrinsic = NULL;
263 unsigned intr_size = 0;
264 LLVMValueRef cond;
265
266 assert(lp_check_value(type, a));
267 assert(lp_check_value(type, b));
268
269 /* TODO: optimize the constant case */
270
271 if (type.floating && util_cpu_caps.has_sse) {
272 if (type.width == 32) {
273 if (type.length == 1) {
274 intrinsic = "llvm.x86.sse.max.ss";
275 intr_size = 128;
276 }
277 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
278 intrinsic = "llvm.x86.sse.max.ps";
279 intr_size = 128;
280 }
281 else {
282 intrinsic = "llvm.x86.avx.max.ps.256";
283 intr_size = 256;
284 }
285 }
286 if (type.width == 64 && util_cpu_caps.has_sse2) {
287 if (type.length == 1) {
288 intrinsic = "llvm.x86.sse2.max.sd";
289 intr_size = 128;
290 }
291 else if (type.length == 2 || !util_cpu_caps.has_avx) {
292 intrinsic = "llvm.x86.sse2.max.pd";
293 intr_size = 128;
294 }
295 else {
296 intrinsic = "llvm.x86.avx.max.pd.256";
297 intr_size = 256;
298 }
299 }
300 }
301 else if (type.floating && util_cpu_caps.has_altivec) {
302 debug_printf("%s: altivec doesn't support nan behavior modes\n",
303 __FUNCTION__);
304 if (type.width == 32 && type.length == 4) {
305 intrinsic = "llvm.ppc.altivec.vmaxfp";
306 intr_size = 128;
307 }
308 } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
309 intr_size = 128;
310 if ((type.width == 8 || type.width == 16) &&
311 (type.width * type.length <= 64) &&
312 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
313 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
314 __FUNCTION__);
315 }
316 if (type.width == 8 && !type.sign) {
317 intrinsic = "llvm.x86.sse2.pmaxu.b";
318 intr_size = 128;
319 }
320 else if (type.width == 16 && type.sign) {
321 intrinsic = "llvm.x86.sse2.pmaxs.w";
322 }
323 if (util_cpu_caps.has_sse4_1) {
324 if (type.width == 8 && type.sign) {
325 intrinsic = "llvm.x86.sse41.pmaxsb";
326 }
327 if (type.width == 16 && !type.sign) {
328 intrinsic = "llvm.x86.sse41.pmaxuw";
329 }
330 if (type.width == 32 && !type.sign) {
331 intrinsic = "llvm.x86.sse41.pmaxud";
332 }
333 if (type.width == 32 && type.sign) {
334 intrinsic = "llvm.x86.sse41.pmaxsd";
335 }
336 }
337 } else if (util_cpu_caps.has_altivec) {
338 intr_size = 128;
339 debug_printf("%s: altivec doesn't support nan behavior modes\n",
340 __FUNCTION__);
341 if (type.width == 8) {
342 if (!type.sign) {
343 intrinsic = "llvm.ppc.altivec.vmaxub";
344 } else {
345 intrinsic = "llvm.ppc.altivec.vmaxsb";
346 }
347 } else if (type.width == 16) {
348 if (!type.sign) {
349 intrinsic = "llvm.ppc.altivec.vmaxuh";
350 } else {
351 intrinsic = "llvm.ppc.altivec.vmaxsh";
352 }
353 } else if (type.width == 32) {
354 if (!type.sign) {
355 intrinsic = "llvm.ppc.altivec.vmaxuw";
356 } else {
357 intrinsic = "llvm.ppc.altivec.vmaxsw";
358 }
359 }
360 }
361
362 if(intrinsic) {
363 if (util_cpu_caps.has_sse && type.floating &&
364 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
365 nan_behavior != GALLIVM_NAN_RETURN_SECOND) {
366 LLVMValueRef isnan, min;
367 min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
368 type,
369 intr_size, a, b);
370 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
371 isnan = lp_build_isnan(bld, b);
372 return lp_build_select(bld, isnan, a, min);
373 } else {
374 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
375 isnan = lp_build_isnan(bld, a);
376 return lp_build_select(bld, isnan, a, min);
377 }
378 } else {
379 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
380 type,
381 intr_size, a, b);
382 }
383 }
384
385 if (type.floating) {
386 switch (nan_behavior) {
387 case GALLIVM_NAN_RETURN_NAN: {
388 LLVMValueRef isnan = lp_build_isnan(bld, b);
389 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
390 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
391 return lp_build_select(bld, cond, a, b);
392 }
393 break;
394 case GALLIVM_NAN_RETURN_OTHER: {
395 LLVMValueRef isnan = lp_build_isnan(bld, a);
396 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
397 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
398 return lp_build_select(bld, cond, a, b);
399 }
400 break;
401 case GALLIVM_NAN_RETURN_SECOND:
402 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
403 return lp_build_select(bld, cond, a, b);
404 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
405 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
406 return lp_build_select(bld, cond, a, b);
407 break;
408 default:
409 assert(0);
410 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
411 return lp_build_select(bld, cond, a, b);
412 }
413 } else {
414 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
415 return lp_build_select(bld, cond, a, b);
416 }
417 }
418
419
420 /**
421 * Generate 1 - a, or ~a depending on bld->type.
422 */
423 LLVMValueRef
424 lp_build_comp(struct lp_build_context *bld,
425 LLVMValueRef a)
426 {
427 LLVMBuilderRef builder = bld->gallivm->builder;
428 const struct lp_type type = bld->type;
429
430 assert(lp_check_value(type, a));
431
432 if(a == bld->one)
433 return bld->zero;
434 if(a == bld->zero)
435 return bld->one;
436
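   /*
    * For unsigned normalized types 1.0 is the all-ones bit pattern, so
    * 1 - a is exactly the bitwise complement ~a, which the branch below
    * relies on.
    */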
437 if(type.norm && !type.floating && !type.fixed && !type.sign) {
438 if(LLVMIsConstant(a))
439 return LLVMConstNot(a);
440 else
441 return LLVMBuildNot(builder, a, "");
442 }
443
444 if(LLVMIsConstant(a))
445 if (type.floating)
446 return LLVMConstFSub(bld->one, a);
447 else
448 return LLVMConstSub(bld->one, a);
449 else
450 if (type.floating)
451 return LLVMBuildFSub(builder, bld->one, a, "");
452 else
453 return LLVMBuildSub(builder, bld->one, a, "");
454 }
455
456
457 /**
458 * Generate a + b
459 */
460 LLVMValueRef
461 lp_build_add(struct lp_build_context *bld,
462 LLVMValueRef a,
463 LLVMValueRef b)
464 {
465 LLVMBuilderRef builder = bld->gallivm->builder;
466 const struct lp_type type = bld->type;
467 LLVMValueRef res;
468
469 assert(lp_check_value(type, a));
470 assert(lp_check_value(type, b));
471
472 if(a == bld->zero)
473 return b;
474 if(b == bld->zero)
475 return a;
476 if(a == bld->undef || b == bld->undef)
477 return bld->undef;
478
479 if(bld->type.norm) {
480 const char *intrinsic = NULL;
481
482 if(a == bld->one || b == bld->one)
483 return bld->one;
484
485 if (type.width * type.length == 128 &&
486 !type.floating && !type.fixed) {
487 if(util_cpu_caps.has_sse2) {
488 if(type.width == 8)
489 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
490 if(type.width == 16)
491 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
492 } else if (util_cpu_caps.has_altivec) {
493 if(type.width == 8)
494 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
495 if(type.width == 16)
496 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
497 }
498 }
499
500 if(intrinsic)
501 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
502 }
503
504 /* TODO: handle signed case */
505 if(type.norm && !type.floating && !type.fixed && !type.sign)
506 a = lp_build_min_simple(bld, a, lp_build_comp(bld, b), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
507
508 if(LLVMIsConstant(a) && LLVMIsConstant(b))
509 if (type.floating)
510 res = LLVMConstFAdd(a, b);
511 else
512 res = LLVMConstAdd(a, b);
513 else
514 if (type.floating)
515 res = LLVMBuildFAdd(builder, a, b, "");
516 else
517 res = LLVMBuildAdd(builder, a, b, "");
518
519 /* clamp to ceiling of 1.0 */
520 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
521 res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
522
523 /* XXX clamp to floor of -1 or 0??? */
524
525 return res;
526 }
527
528
529 /** Return the scalar sum of the elements of a.
530 * Should avoid this operation whenever possible.
531 */
532 LLVMValueRef
533 lp_build_horizontal_add(struct lp_build_context *bld,
534 LLVMValueRef a)
535 {
536 LLVMBuilderRef builder = bld->gallivm->builder;
537 const struct lp_type type = bld->type;
538 LLVMValueRef index, res;
539 unsigned i, length;
540 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
541 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
542 LLVMValueRef vecres, elem2;
543
544 assert(lp_check_value(type, a));
545
546 if (type.length == 1) {
547 return a;
548 }
549
550 assert(!bld->type.norm);
551
552 /*
553 * For byte vectors this could be done much better with psadbw;
554 * we use repeated shuffles/adds here. Note that with multiple vectors
555 * this can be done more efficiently, as outlined in the Intel
556 * optimization manual.
557 * Note: could cause data rearrangement if used with smaller element
558 * sizes.
559 */
560
561 vecres = a;
562 length = type.length / 2;
563 while (length > 1) {
564 LLVMValueRef vec1, vec2;
565 for (i = 0; i < length; i++) {
566 shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
567 shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
568 }
569 vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
570 LLVMConstVector(shuffles1, length), "");
571 vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
572 LLVMConstVector(shuffles2, length), "");
573 if (type.floating) {
574 vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
575 }
576 else {
577 vecres = LLVMBuildAdd(builder, vec1, vec2, "");
578 }
579 length = length >> 1;
580 }
581
582 /* always have vector of size 2 here */
583 assert(length == 1);
584
585 index = lp_build_const_int32(bld->gallivm, 0);
586 res = LLVMBuildExtractElement(builder, vecres, index, "");
587 index = lp_build_const_int32(bld->gallivm, 1);
588 elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
589
590 if (type.floating)
591 res = LLVMBuildFAdd(builder, res, elem2, "");
592 else
593 res = LLVMBuildAdd(builder, res, elem2, "");
594
595 return res;
596 }
597
598 /**
599 * Return the horizontal sums of 4 float vectors as a float4 vector.
600 * This uses the technique outlined in the Intel Optimization Manual.
601 */
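/*
 * Sketch of the data flow below, writing src[0..3] as {x0,x1,x2,x3},
 * {y0,y1,y2,y3}, {z0,z1,z2,z3}, {w0,w1,w2,w3}:
 *   tmp[0] = {x0,x1,y0,y1}   tmp[1] = {x2,x3,y2,y3}
 *   tmp[2] = {z0,z1,w0,w1}   tmp[3] = {z2,z3,w2,w3}
 *   sumtmp[0] = {x0+x2, x1+x3, y0+y2, y1+y3}
 *   sumtmp[1] = {z0+z2, z1+z3, w0+w2, w1+w3}
 * and the final shuffle/add pair yields {sum(x), sum(y), sum(z), sum(w)}.
 */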
602 static LLVMValueRef
603 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
604 LLVMValueRef src[4])
605 {
606 struct gallivm_state *gallivm = bld->gallivm;
607 LLVMBuilderRef builder = gallivm->builder;
608 LLVMValueRef shuffles[4];
609 LLVMValueRef tmp[4];
610 LLVMValueRef sumtmp[2], shuftmp[2];
611
612 /* lower half of regs */
613 shuffles[0] = lp_build_const_int32(gallivm, 0);
614 shuffles[1] = lp_build_const_int32(gallivm, 1);
615 shuffles[2] = lp_build_const_int32(gallivm, 4);
616 shuffles[3] = lp_build_const_int32(gallivm, 5);
617 tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
618 LLVMConstVector(shuffles, 4), "");
619 tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
620 LLVMConstVector(shuffles, 4), "");
621
622 /* upper half of regs */
623 shuffles[0] = lp_build_const_int32(gallivm, 2);
624 shuffles[1] = lp_build_const_int32(gallivm, 3);
625 shuffles[2] = lp_build_const_int32(gallivm, 6);
626 shuffles[3] = lp_build_const_int32(gallivm, 7);
627 tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
628 LLVMConstVector(shuffles, 4), "");
629 tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
630 LLVMConstVector(shuffles, 4), "");
631
632 sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
633 sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
634
635 shuffles[0] = lp_build_const_int32(gallivm, 0);
636 shuffles[1] = lp_build_const_int32(gallivm, 2);
637 shuffles[2] = lp_build_const_int32(gallivm, 4);
638 shuffles[3] = lp_build_const_int32(gallivm, 6);
639 shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
640 LLVMConstVector(shuffles, 4), "");
641
642 shuffles[0] = lp_build_const_int32(gallivm, 1);
643 shuffles[1] = lp_build_const_int32(gallivm, 3);
644 shuffles[2] = lp_build_const_int32(gallivm, 5);
645 shuffles[3] = lp_build_const_int32(gallivm, 7);
646 shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
647 LLVMConstVector(shuffles, 4), "");
648
649 return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
650 }
651
652
653 /*
654 * partially horizontally add 2-4 float vectors with length nx4,
655 * i.e. only four adjacent values in each vector will be added,
656 * assuming values are really grouped in 4 which also determines
657 * output order.
658 *
659 * Return a vector of the same length as the initial vectors,
660 * with the excess elements (if any) being undefined.
661 * The element order is independent of number of input vectors.
662 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
663 * the output order thus will be
664 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
665 */
666 LLVMValueRef
667 lp_build_hadd_partial4(struct lp_build_context *bld,
668 LLVMValueRef vectors[],
669 unsigned num_vecs)
670 {
671 struct gallivm_state *gallivm = bld->gallivm;
672 LLVMBuilderRef builder = gallivm->builder;
673 LLVMValueRef ret_vec;
674 LLVMValueRef tmp[4];
675 const char *intrinsic = NULL;
676
677 assert(num_vecs >= 2 && num_vecs <= 4);
678 assert(bld->type.floating);
679
680 /* only use this with at least 2 vectors, as it is sort of expensive
681 * (depending on cpu) and we always need two horizontal adds anyway,
682 * so a shuffle/add approach might be better.
683 */
684
685 tmp[0] = vectors[0];
686 tmp[1] = vectors[1];
687
688 tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
689 tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
690
691 if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
692 bld->type.length == 4) {
693 intrinsic = "llvm.x86.sse3.hadd.ps";
694 }
695 else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
696 bld->type.length == 8) {
697 intrinsic = "llvm.x86.avx.hadd.ps.256";
698 }
699 if (intrinsic) {
700 tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
701 lp_build_vec_type(gallivm, bld->type),
702 tmp[0], tmp[1]);
703 if (num_vecs > 2) {
704 tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
705 lp_build_vec_type(gallivm, bld->type),
706 tmp[2], tmp[3]);
707 }
708 else {
709 tmp[1] = tmp[0];
710 }
711 return lp_build_intrinsic_binary(builder, intrinsic,
712 lp_build_vec_type(gallivm, bld->type),
713 tmp[0], tmp[1]);
714 }
715
716 if (bld->type.length == 4) {
717 ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
718 }
719 else {
720 LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
721 unsigned j;
722 unsigned num_iter = bld->type.length / 4;
723 struct lp_type parttype = bld->type;
724 parttype.length = 4;
725 for (j = 0; j < num_iter; j++) {
726 LLVMValueRef partsrc[4];
727 unsigned i;
728 for (i = 0; i < 4; i++) {
729 partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
730 }
731 partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
732 }
733 ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
734 }
735 return ret_vec;
736 }
737
738 /**
739 * Generate a - b
740 */
741 LLVMValueRef
742 lp_build_sub(struct lp_build_context *bld,
743 LLVMValueRef a,
744 LLVMValueRef b)
745 {
746 LLVMBuilderRef builder = bld->gallivm->builder;
747 const struct lp_type type = bld->type;
748 LLVMValueRef res;
749
750 assert(lp_check_value(type, a));
751 assert(lp_check_value(type, b));
752
753 if(b == bld->zero)
754 return a;
755 if(a == bld->undef || b == bld->undef)
756 return bld->undef;
757 if(a == b)
758 return bld->zero;
759
760 if(bld->type.norm) {
761 const char *intrinsic = NULL;
762
763 if(b == bld->one)
764 return bld->zero;
765
766 if (type.width * type.length == 128 &&
767 !type.floating && !type.fixed) {
768 if (util_cpu_caps.has_sse2) {
769 if(type.width == 8)
770 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
771 if(type.width == 16)
772 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
773 } else if (util_cpu_caps.has_altivec) {
774 if(type.width == 8)
775 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
776 if(type.width == 16)
777 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
778 }
779 }
780
781 if(intrinsic)
782 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
783 }
784
785 /* TODO: handle signed case */
786 if(type.norm && !type.floating && !type.fixed && !type.sign)
787 a = lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
788
789 if(LLVMIsConstant(a) && LLVMIsConstant(b))
790 if (type.floating)
791 res = LLVMConstFSub(a, b);
792 else
793 res = LLVMConstSub(a, b);
794 else
795 if (type.floating)
796 res = LLVMBuildFSub(builder, a, b, "");
797 else
798 res = LLVMBuildSub(builder, a, b, "");
799
800 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
801 res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
802
803 return res;
804 }
805
806
807
808 /**
809 * Normalized multiplication.
810 *
811 * There are several approaches for (using 8-bit normalized multiplication as
812 * an example):
813 *
814 * - alpha plus one
815 *
816 * makes the following approximation to the division (Sree)
817 *
818 * a*b/255 ~= (a*(b + 1)) >> 8
819 *
820 * which is the fastest method that satisfies the following OpenGL criteria of
821 *
822 * 0*0 = 0 and 255*255 = 255
823 *
824 * - geometric series
825 *
826 * takes the geometric series approximation to the division
827 *
828 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
829 *
830 * in this case just the first two terms to fit in 16bit arithmetic
831 *
832 * t/255 ~= (t + (t >> 8)) >> 8
833 *
834 * note that just by itself it doesn't satisfy the OpenGL criteria, as
835 * 255*255 = 254, so the special case b = 255 must be accounted for, or
836 * rounding must be used.
837 *
838 * - geometric series plus rounding
839 *
840 * when using the geometric series division, instead of truncating the
841 * result, use rounding in the approximation (Jim Blinn)
842 *
843 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
844 *
845 * achieving exact results.
846 *
847 *
848 *
849 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
850 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
851 * @sa Michael Herf, The "double blend trick", May 2000,
852 * http://www.stereopsis.com/doubleblend.html
853 */
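/*
 * Quick sanity check of the "geometric series plus rounding" variant for
 * 8-bit unorm values (n = 8): a = b = 255 gives t = 65025, and
 * (65025 + (65025 >> 8) + 0x80) >> 8 = (65025 + 254 + 128) >> 8 = 255,
 * while a = b = 0 gives 0, so both OpenGL criteria are met exactly.
 */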
854 static LLVMValueRef
855 lp_build_mul_norm(struct gallivm_state *gallivm,
856 struct lp_type wide_type,
857 LLVMValueRef a, LLVMValueRef b)
858 {
859 LLVMBuilderRef builder = gallivm->builder;
860 struct lp_build_context bld;
861 unsigned n;
862 LLVMValueRef half;
863 LLVMValueRef ab;
864
865 assert(!wide_type.floating);
866 assert(lp_check_value(wide_type, a));
867 assert(lp_check_value(wide_type, b));
868
869 lp_build_context_init(&bld, gallivm, wide_type);
870
871 n = wide_type.width / 2;
872 if (wide_type.sign) {
873 --n;
874 }
875
876 /*
877 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
878 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
879 */
880
881 /*
882 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
883 */
884
885 ab = LLVMBuildMul(builder, a, b, "");
886 ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
887
888 /*
889 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
890 */
891
892 half = lp_build_const_int_vec(gallivm, wide_type, 1 << (n - 1));
893 if (wide_type.sign) {
894 LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
895 LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
896 half = lp_build_select(&bld, sign, minus_half, half);
897 }
898 ab = LLVMBuildAdd(builder, ab, half, "");
899
900 /* Final division */
901 ab = lp_build_shr_imm(&bld, ab, n);
902
903 return ab;
904 }
905
906 /**
907 * Generate a * b
908 */
909 LLVMValueRef
910 lp_build_mul(struct lp_build_context *bld,
911 LLVMValueRef a,
912 LLVMValueRef b)
913 {
914 LLVMBuilderRef builder = bld->gallivm->builder;
915 const struct lp_type type = bld->type;
916 LLVMValueRef shift;
917 LLVMValueRef res;
918
919 assert(lp_check_value(type, a));
920 assert(lp_check_value(type, b));
921
922 if(a == bld->zero)
923 return bld->zero;
924 if(a == bld->one)
925 return b;
926 if(b == bld->zero)
927 return bld->zero;
928 if(b == bld->one)
929 return a;
930 if(a == bld->undef || b == bld->undef)
931 return bld->undef;
932
933 if (!type.floating && !type.fixed && type.norm) {
934 struct lp_type wide_type = lp_wider_type(type);
935 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
936
937 lp_build_unpack2(bld->gallivm, type, wide_type, a, &al, &ah);
938 lp_build_unpack2(bld->gallivm, type, wide_type, b, &bl, &bh);
939
940 /* PMULLW, PSRLW, PADDW */
941 abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
942 abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
943
944 ab = lp_build_pack2(bld->gallivm, wide_type, type, abl, abh);
945
946 return ab;
947 }
948
949 if(type.fixed)
950 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
951 else
952 shift = NULL;
953
954 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
955 if (type.floating)
956 res = LLVMConstFMul(a, b);
957 else
958 res = LLVMConstMul(a, b);
959 if(shift) {
960 if(type.sign)
961 res = LLVMConstAShr(res, shift);
962 else
963 res = LLVMConstLShr(res, shift);
964 }
965 }
966 else {
967 if (type.floating)
968 res = LLVMBuildFMul(builder, a, b, "");
969 else
970 res = LLVMBuildMul(builder, a, b, "");
971 if(shift) {
972 if(type.sign)
973 res = LLVMBuildAShr(builder, res, shift, "");
974 else
975 res = LLVMBuildLShr(builder, res, shift, "");
976 }
977 }
978
979 return res;
980 }
981
982
983 /**
984 * Small vector x scale multiplication optimization.
985 */
986 LLVMValueRef
987 lp_build_mul_imm(struct lp_build_context *bld,
988 LLVMValueRef a,
989 int b)
990 {
991 LLVMBuilderRef builder = bld->gallivm->builder;
992 LLVMValueRef factor;
993
994 assert(lp_check_value(bld->type, a));
995
996 if(b == 0)
997 return bld->zero;
998
999 if(b == 1)
1000 return a;
1001
1002 if(b == -1)
1003 return lp_build_negate(bld, a);
1004
1005 if(b == 2 && bld->type.floating)
1006 return lp_build_add(bld, a, a);
1007
1008 if(util_is_power_of_two(b)) {
1009 unsigned shift = ffs(b) - 1;
1010
1011 if(bld->type.floating) {
1012 #if 0
1013 /*
1014 * Power of two multiplication by directly manipulating the exponent.
1015 *
1016 * XXX: This might not be always faster, it will introduce a small error
1017 * for multiplication by zero, and it will produce wrong results
1018 * for Inf and NaN.
1019 */
1020 unsigned mantissa = lp_mantissa(bld->type);
1021 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1022 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1023 a = LLVMBuildAdd(builder, a, factor, "");
1024 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1025 return a;
1026 #endif
1027 }
1028 else {
1029 factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1030 return LLVMBuildShl(builder, a, factor, "");
1031 }
1032 }
1033
1034 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1035 return lp_build_mul(bld, a, factor);
1036 }
1037
1038
1039 /**
1040 * Generate a / b
1041 */
1042 LLVMValueRef
1043 lp_build_div(struct lp_build_context *bld,
1044 LLVMValueRef a,
1045 LLVMValueRef b)
1046 {
1047 LLVMBuilderRef builder = bld->gallivm->builder;
1048 const struct lp_type type = bld->type;
1049
1050 assert(lp_check_value(type, a));
1051 assert(lp_check_value(type, b));
1052
1053 if(a == bld->zero)
1054 return bld->zero;
1055 if(a == bld->one)
1056 return lp_build_rcp(bld, b);
1057 if(b == bld->zero)
1058 return bld->undef;
1059 if(b == bld->one)
1060 return a;
1061 if(a == bld->undef || b == bld->undef)
1062 return bld->undef;
1063
1064 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1065 if (type.floating)
1066 return LLVMConstFDiv(a, b);
1067 else if (type.sign)
1068 return LLVMConstSDiv(a, b);
1069 else
1070 return LLVMConstUDiv(a, b);
1071 }
1072
1073 if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1074 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
1075 type.floating)
1076 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1077
1078 if (type.floating)
1079 return LLVMBuildFDiv(builder, a, b, "");
1080 else if (type.sign)
1081 return LLVMBuildSDiv(builder, a, b, "");
1082 else
1083 return LLVMBuildUDiv(builder, a, b, "");
1084 }
1085
1086
1087 /**
1088 * Linear interpolation helper.
1089 *
1090 * @param flags if LP_BLD_LERP_WIDE_NORMALIZED is set, we are interpolating
1091 * normalized values encoded in normalized integers twice as wide.
1092 *
1093 * @sa http://www.stereopsis.com/doubleblend.html
1094 */
1095 static INLINE LLVMValueRef
1096 lp_build_lerp_simple(struct lp_build_context *bld,
1097 LLVMValueRef x,
1098 LLVMValueRef v0,
1099 LLVMValueRef v1,
1100 unsigned flags)
1101 {
1102 unsigned half_width = bld->type.width/2;
1103 LLVMBuilderRef builder = bld->gallivm->builder;
1104 LLVMValueRef delta;
1105 LLVMValueRef res;
1106
1107 assert(lp_check_value(bld->type, x));
1108 assert(lp_check_value(bld->type, v0));
1109 assert(lp_check_value(bld->type, v1));
1110
1111 delta = lp_build_sub(bld, v1, v0);
1112
1113 if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1114 if (!bld->type.sign) {
1115 if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1116 /*
1117 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1118 * most significant bit to the least significant bit, so that
1119 * later we can just divide by 2**n instead of 2**n - 1.
1120 */
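            /*
             * E.g. with 8-bit weights widened to 16 bits (half_width == 8):
             * x = 255 becomes 255 + (255 >> 7) = 256, so (x * delta) >> 8
             * yields exactly delta, while x = 0 stays 0.
             */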
1121
1122 x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1123 }
1124
1125 /* (x * delta) >> n */
1126 res = lp_build_mul(bld, x, delta);
1127 res = lp_build_shr_imm(bld, res, half_width);
1128 } else {
1129 /*
1130 * The rescaling trick above doesn't work for signed numbers, so
1131 * use the 2**n - 1 division approximation in lp_build_mul_norm
1132 * instead.
1133 */
1134 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1135 res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1136 }
1137 } else {
1138 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1139 res = lp_build_mul(bld, x, delta);
1140 }
1141
1142 res = lp_build_add(bld, v0, res);
1143
1144 if (((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) ||
1145 bld->type.fixed) {
1146 /* We need to mask out the high order bits when lerping 8-bit normalized colors stored in 16 bits */
1147 /* XXX: This step is necessary for lerping 8-bit colors stored in 16 bits,
1148 * but it will be wrong for true fixed point use cases. Basically we need
1149 * a more powerful lp_type, capable of further distinguishing the values
1150 * interpretation from the value storage. */
1151 res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1), "");
1152 }
1153
1154 return res;
1155 }
1156
1157
1158 /**
1159 * Linear interpolation.
1160 */
1161 LLVMValueRef
1162 lp_build_lerp(struct lp_build_context *bld,
1163 LLVMValueRef x,
1164 LLVMValueRef v0,
1165 LLVMValueRef v1,
1166 unsigned flags)
1167 {
1168 const struct lp_type type = bld->type;
1169 LLVMValueRef res;
1170
1171 assert(lp_check_value(type, x));
1172 assert(lp_check_value(type, v0));
1173 assert(lp_check_value(type, v1));
1174
1175 assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1176
1177 if (type.norm) {
1178 struct lp_type wide_type;
1179 struct lp_build_context wide_bld;
1180 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1181
1182 assert(type.length >= 2);
1183
1184 /*
1185 * Create a wider integer type, enough to hold the
1186 * intermediate result of the multiplication.
1187 */
1188 memset(&wide_type, 0, sizeof wide_type);
1189 wide_type.sign = type.sign;
1190 wide_type.width = type.width*2;
1191 wide_type.length = type.length/2;
1192
1193 lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1194
1195 lp_build_unpack2(bld->gallivm, type, wide_type, x, &xl, &xh);
1196 lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1197 lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1198
1199 /*
1200 * Lerp both halves.
1201 */
1202
1203 flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1204
1205 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1206 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1207
1208 res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
1209 } else {
1210 res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1211 }
1212
1213 return res;
1214 }
1215
1216
1217 /**
1218 * Bilinear interpolation.
1219 *
1220 * Value indices are in v_{yx}.
1221 */
1222 LLVMValueRef
1223 lp_build_lerp_2d(struct lp_build_context *bld,
1224 LLVMValueRef x,
1225 LLVMValueRef y,
1226 LLVMValueRef v00,
1227 LLVMValueRef v01,
1228 LLVMValueRef v10,
1229 LLVMValueRef v11,
1230 unsigned flags)
1231 {
1232 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1233 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1234 return lp_build_lerp(bld, y, v0, v1, flags);
1235 }
1236
1237
1238 LLVMValueRef
1239 lp_build_lerp_3d(struct lp_build_context *bld,
1240 LLVMValueRef x,
1241 LLVMValueRef y,
1242 LLVMValueRef z,
1243 LLVMValueRef v000,
1244 LLVMValueRef v001,
1245 LLVMValueRef v010,
1246 LLVMValueRef v011,
1247 LLVMValueRef v100,
1248 LLVMValueRef v101,
1249 LLVMValueRef v110,
1250 LLVMValueRef v111,
1251 unsigned flags)
1252 {
1253 LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1254 LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1255 return lp_build_lerp(bld, z, v0, v1, flags);
1256 }
1257
1258
1259 /**
1260 * Generate min(a, b)
1261 * Do checks for special cases but not for NaNs.
1262 */
1263 LLVMValueRef
1264 lp_build_min(struct lp_build_context *bld,
1265 LLVMValueRef a,
1266 LLVMValueRef b)
1267 {
1268 assert(lp_check_value(bld->type, a));
1269 assert(lp_check_value(bld->type, b));
1270
1271 if(a == bld->undef || b == bld->undef)
1272 return bld->undef;
1273
1274 if(a == b)
1275 return a;
1276
1277 if (bld->type.norm) {
1278 if (!bld->type.sign) {
1279 if (a == bld->zero || b == bld->zero) {
1280 return bld->zero;
1281 }
1282 }
1283 if(a == bld->one)
1284 return b;
1285 if(b == bld->one)
1286 return a;
1287 }
1288
1289 return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1290 }
1291
1292
1293 /**
1294 * Generate min(a, b)
1295 * NaNs are handled according to the behavior specified by the
1296 * nan_behavior argument.
1297 */
1298 LLVMValueRef
1299 lp_build_min_ext(struct lp_build_context *bld,
1300 LLVMValueRef a,
1301 LLVMValueRef b,
1302 enum gallivm_nan_behavior nan_behavior)
1303 {
1304 assert(lp_check_value(bld->type, a));
1305 assert(lp_check_value(bld->type, b));
1306
1307 if(a == bld->undef || b == bld->undef)
1308 return bld->undef;
1309
1310 if(a == b)
1311 return a;
1312
1313 if (bld->type.norm) {
1314 if (!bld->type.sign) {
1315 if (a == bld->zero || b == bld->zero) {
1316 return bld->zero;
1317 }
1318 }
1319 if(a == bld->one)
1320 return b;
1321 if(b == bld->one)
1322 return a;
1323 }
1324
1325 return lp_build_min_simple(bld, a, b, nan_behavior);
1326 }
1327
1328 /**
1329 * Generate max(a, b)
1330 * Do checks for special cases, but NaN behavior is undefined.
1331 */
1332 LLVMValueRef
1333 lp_build_max(struct lp_build_context *bld,
1334 LLVMValueRef a,
1335 LLVMValueRef b)
1336 {
1337 assert(lp_check_value(bld->type, a));
1338 assert(lp_check_value(bld->type, b));
1339
1340 if(a == bld->undef || b == bld->undef)
1341 return bld->undef;
1342
1343 if(a == b)
1344 return a;
1345
1346 if(bld->type.norm) {
1347 if(a == bld->one || b == bld->one)
1348 return bld->one;
1349 if (!bld->type.sign) {
1350 if (a == bld->zero) {
1351 return b;
1352 }
1353 if (b == bld->zero) {
1354 return a;
1355 }
1356 }
1357 }
1358
1359 return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1360 }
1361
1362
1363 /**
1364 * Generate max(a, b)
1365 * Checks for special cases.
1366 * NaNs are handled according to the behavior specified by the
1367 * nan_behavior argument.
1368 */
1369 LLVMValueRef
1370 lp_build_max_ext(struct lp_build_context *bld,
1371 LLVMValueRef a,
1372 LLVMValueRef b,
1373 enum gallivm_nan_behavior nan_behavior)
1374 {
1375 assert(lp_check_value(bld->type, a));
1376 assert(lp_check_value(bld->type, b));
1377
1378 if(a == bld->undef || b == bld->undef)
1379 return bld->undef;
1380
1381 if(a == b)
1382 return a;
1383
1384 if(bld->type.norm) {
1385 if(a == bld->one || b == bld->one)
1386 return bld->one;
1387 if (!bld->type.sign) {
1388 if (a == bld->zero) {
1389 return b;
1390 }
1391 if (b == bld->zero) {
1392 return a;
1393 }
1394 }
1395 }
1396
1397 return lp_build_max_simple(bld, a, b, nan_behavior);
1398 }
1399
1400 /**
1401 * Generate clamp(a, min, max)
1402 * Do checks for special cases.
1403 */
1404 LLVMValueRef
1405 lp_build_clamp(struct lp_build_context *bld,
1406 LLVMValueRef a,
1407 LLVMValueRef min,
1408 LLVMValueRef max)
1409 {
1410 assert(lp_check_value(bld->type, a));
1411 assert(lp_check_value(bld->type, min));
1412 assert(lp_check_value(bld->type, max));
1413
1414 /*
1415 * XXX dark magic warning: The order of min/max here matters (!).
1416 * The reason is that a typical use case is clamp(a, 0.0, 1.0)
1417 * (for example for float->unorm conversion) and on x86 sse2
1418 * this will give 0.0 for NaNs, whereas doing min first would
1419 * give 1.0 for NaN, which makes d3d10 angry...
1420 * This is very much not guaranteed behavior though; it just
1421 * happens to work on x86 sse2 (and up), and it obviously won't help
1422 * for other non-zero clamps (say -1.0/1.0 in a SNORM conversion) either,
1423 * so this needs to be fixed for real...
1424 */
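   /*
    * Concretely, for clamp(NaN, 0.0, 1.0) on sse2: max(NaN, 0.0) yields 0.0
    * (maxps returns its second operand when the first one is a NaN), and
    * min(0.0, 1.0) then gives 0.0; with min applied first the result would
    * be 1.0 instead.
    */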
1425 a = lp_build_max(bld, a, min);
1426 a = lp_build_min(bld, a, max);
1427 return a;
1428 }
1429
1430
1431 /**
1432 * Generate abs(a)
1433 */
1434 LLVMValueRef
1435 lp_build_abs(struct lp_build_context *bld,
1436 LLVMValueRef a)
1437 {
1438 LLVMBuilderRef builder = bld->gallivm->builder;
1439 const struct lp_type type = bld->type;
1440 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1441
1442 assert(lp_check_value(type, a));
1443
1444 if(!type.sign)
1445 return a;
1446
1447 if(type.floating) {
1448 /* Mask out the sign bit */
1449 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1450 unsigned long long absMask = ~(1ULL << (type.width - 1));
1451 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
1452 a = LLVMBuildBitCast(builder, a, int_vec_type, "");
1453 a = LLVMBuildAnd(builder, a, mask, "");
1454 a = LLVMBuildBitCast(builder, a, vec_type, "");
1455 return a;
1456 }
1457
1458 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
1459 switch(type.width) {
1460 case 8:
1461 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1462 case 16:
1463 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1464 case 32:
1465 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1466 }
1467 }
1468 else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
1469 (gallivm_debug & GALLIVM_DEBUG_PERF) &&
1470 (type.width == 8 || type.width == 16 || type.width == 32)) {
1471 debug_printf("%s: inefficient code, should split vectors manually\n",
1472 __FUNCTION__);
1473 }
1474
1475 return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
1476 }
1477
1478
1479 LLVMValueRef
1480 lp_build_negate(struct lp_build_context *bld,
1481 LLVMValueRef a)
1482 {
1483 LLVMBuilderRef builder = bld->gallivm->builder;
1484
1485 assert(lp_check_value(bld->type, a));
1486
1487 #if HAVE_LLVM >= 0x0207
1488 if (bld->type.floating)
1489 a = LLVMBuildFNeg(builder, a, "");
1490 else
1491 #endif
1492 a = LLVMBuildNeg(builder, a, "");
1493
1494 return a;
1495 }
1496
1497
1498 /** Return -1, 0 or +1 depending on the sign of a */
1499 LLVMValueRef
1500 lp_build_sgn(struct lp_build_context *bld,
1501 LLVMValueRef a)
1502 {
1503 LLVMBuilderRef builder = bld->gallivm->builder;
1504 const struct lp_type type = bld->type;
1505 LLVMValueRef cond;
1506 LLVMValueRef res;
1507
1508 assert(lp_check_value(type, a));
1509
1510 /* Handle non-zero case */
1511 if(!type.sign) {
1512 /* if not zero then sign must be positive */
1513 res = bld->one;
1514 }
1515 else if(type.floating) {
1516 LLVMTypeRef vec_type;
1517 LLVMTypeRef int_type;
1518 LLVMValueRef mask;
1519 LLVMValueRef sign;
1520 LLVMValueRef one;
1521 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1522
1523 int_type = lp_build_int_vec_type(bld->gallivm, type);
1524 vec_type = lp_build_vec_type(bld->gallivm, type);
1525 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1526
1527 /* Take the sign bit and add it to 1 constant */
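      /*
       * E.g. for a = -3.5f the sign bit 0x80000000 or'ed with the bits of
       * 1.0f (0x3f800000) gives 0xbf800000, i.e. -1.0f.
       */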
1528 sign = LLVMBuildBitCast(builder, a, int_type, "");
1529 sign = LLVMBuildAnd(builder, sign, mask, "");
1530 one = LLVMConstBitCast(bld->one, int_type);
1531 res = LLVMBuildOr(builder, sign, one, "");
1532 res = LLVMBuildBitCast(builder, res, vec_type, "");
1533 }
1534 else
1535 {
1536 /* signed int/norm/fixed point */
1537 /* could use psign with sse3 and appropriate vectors here */
1538 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1539 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1540 res = lp_build_select(bld, cond, bld->one, minus_one);
1541 }
1542
1543 /* Handle zero */
1544 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1545 res = lp_build_select(bld, cond, bld->zero, res);
1546
1547 return res;
1548 }
1549
1550
1551 /**
1552 * Set the sign of float vector 'a' according to 'sign'.
1553 * If sign==0, return abs(a).
1554 * If sign==1, return -abs(a);
1555 * Other values for sign produce undefined results.
1556 */
1557 LLVMValueRef
1558 lp_build_set_sign(struct lp_build_context *bld,
1559 LLVMValueRef a, LLVMValueRef sign)
1560 {
1561 LLVMBuilderRef builder = bld->gallivm->builder;
1562 const struct lp_type type = bld->type;
1563 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1564 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1565 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1566 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1567 ~((unsigned long long) 1 << (type.width - 1)));
1568 LLVMValueRef val, res;
1569
1570 assert(type.floating);
1571 assert(lp_check_value(type, a));
1572
1573 /* val = reinterpret_cast<int>(a) */
1574 val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1575 /* val = val & mask */
1576 val = LLVMBuildAnd(builder, val, mask, "");
1577 /* sign = sign << shift */
1578 sign = LLVMBuildShl(builder, sign, shift, "");
1579 /* res = val | sign */
1580 res = LLVMBuildOr(builder, val, sign, "");
1581 /* res = reinterpret_cast<float>(res) */
1582 res = LLVMBuildBitCast(builder, res, vec_type, "");
1583
1584 return res;
1585 }
1586
1587
1588 /**
1589 * Convert vector of (or scalar) int to vector of (or scalar) float.
1590 */
1591 LLVMValueRef
1592 lp_build_int_to_float(struct lp_build_context *bld,
1593 LLVMValueRef a)
1594 {
1595 LLVMBuilderRef builder = bld->gallivm->builder;
1596 const struct lp_type type = bld->type;
1597 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1598
1599 assert(type.floating);
1600
1601 return LLVMBuildSIToFP(builder, a, vec_type, "");
1602 }
1603
1604 static boolean
1605 arch_rounding_available(const struct lp_type type)
1606 {
1607 if ((util_cpu_caps.has_sse4_1 &&
1608 (type.length == 1 || type.width*type.length == 128)) ||
1609 (util_cpu_caps.has_avx && type.width*type.length == 256))
1610 return TRUE;
1611 else if ((util_cpu_caps.has_altivec &&
1612 (type.width == 32 && type.length == 4)))
1613 return TRUE;
1614
1615 return FALSE;
1616 }
1617
1618 enum lp_build_round_mode
1619 {
1620 LP_BUILD_ROUND_NEAREST = 0,
1621 LP_BUILD_ROUND_FLOOR = 1,
1622 LP_BUILD_ROUND_CEIL = 2,
1623 LP_BUILD_ROUND_TRUNCATE = 3
1624 };
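/*
 * Note these values are passed straight through as the rounding-control
 * immediate of the SSE4.1 ROUNDxx instructions below, whose encoding is
 * 0 = nearest (even), 1 = floor, 2 = ceil, 3 = truncate.
 */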
1625
1626 /**
1627 * Helper for SSE4.1's ROUNDxx instructions.
1628 *
1629 * NOTE: In SSE4.1's nearest mode, if two values are equally close, the
1630 * result is the even value. That is, rounding 2.5 will be 2.0, and not 3.0.
1631 */
1632 static INLINE LLVMValueRef
1633 lp_build_round_sse41(struct lp_build_context *bld,
1634 LLVMValueRef a,
1635 enum lp_build_round_mode mode)
1636 {
1637 LLVMBuilderRef builder = bld->gallivm->builder;
1638 const struct lp_type type = bld->type;
1639 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1640 const char *intrinsic;
1641 LLVMValueRef res;
1642
1643 assert(type.floating);
1644
1645 assert(lp_check_value(type, a));
1646 assert(util_cpu_caps.has_sse4_1);
1647
1648 if (type.length == 1) {
1649 LLVMTypeRef vec_type;
1650 LLVMValueRef undef;
1651 LLVMValueRef args[3];
1652 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1653
1654 switch(type.width) {
1655 case 32:
1656 intrinsic = "llvm.x86.sse41.round.ss";
1657 break;
1658 case 64:
1659 intrinsic = "llvm.x86.sse41.round.sd";
1660 break;
1661 default:
1662 assert(0);
1663 return bld->undef;
1664 }
1665
1666 vec_type = LLVMVectorType(bld->elem_type, 4);
1667
1668 undef = LLVMGetUndef(vec_type);
1669
1670 args[0] = undef;
1671 args[1] = LLVMBuildInsertElement(builder, undef, a, index0, "");
1672 args[2] = LLVMConstInt(i32t, mode, 0);
1673
1674 res = lp_build_intrinsic(builder, intrinsic,
1675 vec_type, args, Elements(args));
1676
1677 res = LLVMBuildExtractElement(builder, res, index0, "");
1678 }
1679 else {
1680 if (type.width * type.length == 128) {
1681 switch(type.width) {
1682 case 32:
1683 intrinsic = "llvm.x86.sse41.round.ps";
1684 break;
1685 case 64:
1686 intrinsic = "llvm.x86.sse41.round.pd";
1687 break;
1688 default:
1689 assert(0);
1690 return bld->undef;
1691 }
1692 }
1693 else {
1694 assert(type.width * type.length == 256);
1695 assert(util_cpu_caps.has_avx);
1696
1697 switch(type.width) {
1698 case 32:
1699 intrinsic = "llvm.x86.avx.round.ps.256";
1700 break;
1701 case 64:
1702 intrinsic = "llvm.x86.avx.round.pd.256";
1703 break;
1704 default:
1705 assert(0);
1706 return bld->undef;
1707 }
1708 }
1709
1710 res = lp_build_intrinsic_binary(builder, intrinsic,
1711 bld->vec_type, a,
1712 LLVMConstInt(i32t, mode, 0));
1713 }
1714
1715 return res;
1716 }
1717
1718
1719 static INLINE LLVMValueRef
1720 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1721 LLVMValueRef a)
1722 {
1723 LLVMBuilderRef builder = bld->gallivm->builder;
1724 const struct lp_type type = bld->type;
1725 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1726 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1727 const char *intrinsic;
1728 LLVMValueRef res;
1729
1730 assert(type.floating);
1731 /* using the double precision conversions is a bit more complicated */
1732 assert(type.width == 32);
1733
1734 assert(lp_check_value(type, a));
1735 assert(util_cpu_caps.has_sse2);
1736
1737 /* This is relying on MXCSR rounding mode, which should always be nearest. */
1738 if (type.length == 1) {
1739 LLVMTypeRef vec_type;
1740 LLVMValueRef undef;
1741 LLVMValueRef arg;
1742 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1743
1744 vec_type = LLVMVectorType(bld->elem_type, 4);
1745
1746 intrinsic = "llvm.x86.sse.cvtss2si";
1747
1748 undef = LLVMGetUndef(vec_type);
1749
1750 arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
1751
1752 res = lp_build_intrinsic_unary(builder, intrinsic,
1753 ret_type, arg);
1754 }
1755 else {
1756 if (type.width* type.length == 128) {
1757 intrinsic = "llvm.x86.sse2.cvtps2dq";
1758 }
1759 else {
1760 assert(type.width*type.length == 256);
1761 assert(util_cpu_caps.has_avx);
1762
1763 intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
1764 }
1765 res = lp_build_intrinsic_unary(builder, intrinsic,
1766 ret_type, a);
1767 }
1768
1769 return res;
1770 }
1771
1772
1773 /* Helper for rounding a float vector using Altivec's vrfin/vrfim/vrfip/vrfiz
1774 * instructions; used by lp_build_round_arch(). */
1775 static INLINE LLVMValueRef
1776 lp_build_round_altivec(struct lp_build_context *bld,
1777 LLVMValueRef a,
1778 enum lp_build_round_mode mode)
1779 {
1780 LLVMBuilderRef builder = bld->gallivm->builder;
1781 const struct lp_type type = bld->type;
1782 const char *intrinsic = NULL;
1783
1784 assert(type.floating);
1785
1786 assert(lp_check_value(type, a));
1787 assert(util_cpu_caps.has_altivec);
1788
1789 switch (mode) {
1790 case LP_BUILD_ROUND_NEAREST:
1791 intrinsic = "llvm.ppc.altivec.vrfin";
1792 break;
1793 case LP_BUILD_ROUND_FLOOR:
1794 intrinsic = "llvm.ppc.altivec.vrfim";
1795 break;
1796 case LP_BUILD_ROUND_CEIL:
1797 intrinsic = "llvm.ppc.altivec.vrfip";
1798 break;
1799 case LP_BUILD_ROUND_TRUNCATE:
1800 intrinsic = "llvm.ppc.altivec.vrfiz";
1801 break;
1802 }
1803
1804 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1805 }
1806
1807 static INLINE LLVMValueRef
1808 lp_build_round_arch(struct lp_build_context *bld,
1809 LLVMValueRef a,
1810 enum lp_build_round_mode mode)
1811 {
1812 if (util_cpu_caps.has_sse4_1)
1813 return lp_build_round_sse41(bld, a, mode);
1814 else /* (util_cpu_caps.has_altivec) */
1815 return lp_build_round_altivec(bld, a, mode);
1816 }
1817
1818 /**
1819 * Return the integer part of a float (vector) value (== round toward zero).
1820 * The returned value is a float (vector).
1821 * Ex: trunc(-1.5) = -1.0
1822 */
1823 LLVMValueRef
1824 lp_build_trunc(struct lp_build_context *bld,
1825 LLVMValueRef a)
1826 {
1827 LLVMBuilderRef builder = bld->gallivm->builder;
1828 const struct lp_type type = bld->type;
1829
1830 assert(type.floating);
1831 assert(lp_check_value(type, a));
1832
1833 if (arch_rounding_available(type)) {
1834 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
1835 }
1836 else {
1837 const struct lp_type type = bld->type;
1838 struct lp_type inttype;
1839 struct lp_build_context intbld;
1840 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
1841 LLVMValueRef trunc, res, anosign, mask;
1842 LLVMTypeRef int_vec_type = bld->int_vec_type;
1843 LLVMTypeRef vec_type = bld->vec_type;
1844
1845 assert(type.width == 32); /* might want to handle doubles at some point */
1846
1847 inttype = type;
1848 inttype.floating = 0;
1849 lp_build_context_init(&intbld, bld->gallivm, inttype);
1850
1851 /* round by truncation */
1852 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1853 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
1854
1855 /* mask out sign bit */
1856 anosign = lp_build_abs(bld, a);
1857 /*
1858 * mask out all values if anosign > 2^24
1859 * This should work both for large ints (all rounding is no-op for them
1860 * because such floats are always exact) as well as special cases like
1861 * NaNs, Infs (taking advantage of the fact they use max exponent).
1862 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
1863 */
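      /*
       * (For example, 16777216.5 is not representable as a 32-bit float:
       * every float with magnitude >= 2^24 is already an integer, so
       * returning such values unrounded is exact.)
       */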
1864 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1865 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1866 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1867 return lp_build_select(bld, mask, a, res);
1868 }
1869 }
1870
1871
1872 /**
1873 * Return float (vector) rounded to nearest integer (vector). The returned
1874 * value is a float (vector).
1875 * Ex: round(0.9) = 1.0
1876 * Ex: round(-1.5) = -2.0
1877 */
1878 LLVMValueRef
1879 lp_build_round(struct lp_build_context *bld,
1880 LLVMValueRef a)
1881 {
1882 LLVMBuilderRef builder = bld->gallivm->builder;
1883 const struct lp_type type = bld->type;
1884
1885 assert(type.floating);
1886 assert(lp_check_value(type, a));
1887
1888 if (arch_rounding_available(type)) {
1889 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
1890 }
1891 else {
1892 const struct lp_type type = bld->type;
1893 struct lp_type inttype;
1894 struct lp_build_context intbld;
1895 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
1896 LLVMValueRef res, anosign, mask;
1897 LLVMTypeRef int_vec_type = bld->int_vec_type;
1898 LLVMTypeRef vec_type = bld->vec_type;
1899
1900 assert(type.width == 32); /* might want to handle doubles at some point */
1901
1902 inttype = type;
1903 inttype.floating = 0;
1904 lp_build_context_init(&intbld, bld->gallivm, inttype);
1905
1906 res = lp_build_iround(bld, a);
1907 res = LLVMBuildSIToFP(builder, res, vec_type, "");
1908
1909 /* mask out sign bit */
1910 anosign = lp_build_abs(bld, a);
1911 /*
1912 * mask out all values if anosign > 2^24
1913 * This should work both for large ints (all rounding is no-op for them
1914 * because such floats are always exact) as well as special cases like
1915 * NaNs, Infs (taking advantage of the fact they use max exponent).
1916 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
1917 */
1918 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1919 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1920 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1921 return lp_build_select(bld, mask, a, res);
1922 }
1923 }
1924
1925
1926 /**
1927 * Return floor of float (vector), result is a float (vector)
1928 * Ex: floor(1.1) = 1.0
1929 * Ex: floor(-1.1) = -2.0
1930 */
1931 LLVMValueRef
1932 lp_build_floor(struct lp_build_context *bld,
1933 LLVMValueRef a)
1934 {
1935 LLVMBuilderRef builder = bld->gallivm->builder;
1936 const struct lp_type type = bld->type;
1937
1938 assert(type.floating);
1939 assert(lp_check_value(type, a));
1940
1941 if (arch_rounding_available(type)) {
1942 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
1943 }
1944 else {
1945 const struct lp_type type = bld->type;
1946 struct lp_type inttype;
1947 struct lp_build_context intbld;
1948       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
1949 LLVMValueRef trunc, res, anosign, mask;
1950 LLVMTypeRef int_vec_type = bld->int_vec_type;
1951 LLVMTypeRef vec_type = bld->vec_type;
1952
1953 assert(type.width == 32); /* might want to handle doubles at some point */
1954
1955 inttype = type;
1956 inttype.floating = 0;
1957 lp_build_context_init(&intbld, bld->gallivm, inttype);
1958
1959 /* round by truncation */
1960 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1961 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
1962
1963 if (type.sign) {
1964 LLVMValueRef tmp;
1965
1966 /*
1967 * fix values if rounding is wrong (for non-special cases)
1968 * - this is the case if trunc > a
1969 */
1970 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
1971 /* tmp = trunc > a ? 1.0 : 0.0 */
1972 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
1973 tmp = lp_build_and(&intbld, mask, tmp);
1974 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
1975 res = lp_build_sub(bld, res, tmp);
1976 }
1977
1978 /* mask out sign bit */
1979 anosign = lp_build_abs(bld, a);
1980 /*
1981 * mask out all values if anosign > 2^24
1982 * This should work both for large ints (all rounding is no-op for them
1983 * because such floats are always exact) as well as special cases like
1984 * NaNs, Infs (taking advantage of the fact they use max exponent).
1985       * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
1986 */
1987 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1988 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1989 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1990 return lp_build_select(bld, mask, a, res);
1991 }
1992 }
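/*
 * Illustration of the sign fixup above: the comparison mask is all-ones or
 * all-zeros per channel, so ANDing it with the bit pattern of 1.0 yields
 * exactly 1.0 or 0.0 without a branch. E.g. floor(-1.3): truncation gives
 * -1.0, which is > -1.3, so tmp = 1.0 and res = -1.0 - 1.0 = -2.0;
 * floor(1.3): truncation gives 1.0, which is not > 1.3, so tmp = 0.0 and
 * res stays 1.0.
 */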
1993
1994
1995 /**
1996 * Return ceiling of float (vector), returning float (vector).
1997 * Ex: ceil( 1.1) = 2.0
1998 * Ex: ceil(-1.1) = -1.0
1999 */
2000 LLVMValueRef
2001 lp_build_ceil(struct lp_build_context *bld,
2002 LLVMValueRef a)
2003 {
2004 LLVMBuilderRef builder = bld->gallivm->builder;
2005 const struct lp_type type = bld->type;
2006
2007 assert(type.floating);
2008 assert(lp_check_value(type, a));
2009
2010 if (arch_rounding_available(type)) {
2011 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2012 }
2013 else {
2014 const struct lp_type type = bld->type;
2015 struct lp_type inttype;
2016 struct lp_build_context intbld;
2017       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
2018 LLVMValueRef trunc, res, anosign, mask, tmp;
2019 LLVMTypeRef int_vec_type = bld->int_vec_type;
2020 LLVMTypeRef vec_type = bld->vec_type;
2021
2022 assert(type.width == 32); /* might want to handle doubles at some point */
2023
2024 inttype = type;
2025 inttype.floating = 0;
2026 lp_build_context_init(&intbld, bld->gallivm, inttype);
2027
2028 /* round by truncation */
2029 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2030 trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2031
2032 /*
2033 * fix values if rounding is wrong (for non-special cases)
2034 * - this is the case if trunc < a
2035 */
2036 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2037 /* tmp = trunc < a ? 1.0 : 0.0 */
2038 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2039 tmp = lp_build_and(&intbld, mask, tmp);
2040 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2041 res = lp_build_add(bld, trunc, tmp);
2042
2043 /* mask out sign bit */
2044 anosign = lp_build_abs(bld, a);
2045 /*
2046 * mask out all values if anosign > 2^24
2047 * This should work both for large ints (all rounding is no-op for them
2048 * because such floats are always exact) as well as special cases like
2049 * NaNs, Infs (taking advantage of the fact they use max exponent).
2050       * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2051 */
2052 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2053 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2054 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2055 return lp_build_select(bld, mask, a, res);
2056 }
2057 }
2058
2059
2060 /**
2061 * Return fractional part of 'a' computed as a - floor(a)
2062 * Typically used in texture coord arithmetic.
2063 */
2064 LLVMValueRef
2065 lp_build_fract(struct lp_build_context *bld,
2066 LLVMValueRef a)
2067 {
2068 assert(bld->type.floating);
2069 return lp_build_sub(bld, a, lp_build_floor(bld, a));
2070 }
2071
2072
2073 /**
2074 * Prevent returning a fractional part of 1.0 for very small negative values of
2075 * 'a' by clamping against 0.99999(9).
2076 */
2077 static inline LLVMValueRef
2078 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2079 {
2080 LLVMValueRef max;
2081
2082 /* this is the largest number smaller than 1.0 representable as float */
2083 max = lp_build_const_vec(bld->gallivm, bld->type,
2084 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2085 return lp_build_min(bld, fract, max);
2086 }
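/*
 * For 32-bit floats (lp_mantissa() == 23) the clamp value above works out to
 * 1.0 - 1.0/2^24 = 0.99999994, the largest float strictly below 1.0, so the
 * clamped fract can never round up to exactly 1.0.
 */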
2087
2088
2089 /**
2090 * Same as lp_build_fract, but guarantees that the result is always smaller
2091 * than one.
2092 */
2093 LLVMValueRef
2094 lp_build_fract_safe(struct lp_build_context *bld,
2095 LLVMValueRef a)
2096 {
2097 return clamp_fract(bld, lp_build_fract(bld, a));
2098 }
2099
2100
2101 /**
2102 * Return the integer part of a float (vector) value (== round toward zero).
2103 * The returned value is an integer (vector).
2104 * Ex: itrunc(-1.5) = -1
2105 */
2106 LLVMValueRef
2107 lp_build_itrunc(struct lp_build_context *bld,
2108 LLVMValueRef a)
2109 {
2110 LLVMBuilderRef builder = bld->gallivm->builder;
2111 const struct lp_type type = bld->type;
2112 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2113
2114 assert(type.floating);
2115 assert(lp_check_value(type, a));
2116
2117 return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2118 }
2119
2120
2121 /**
2122 * Return float (vector) rounded to nearest integer (vector). The returned
2123 * value is an integer (vector).
2124 * Ex: iround(0.9) = 1
2125 * Ex: iround(-1.5) = -2
2126 */
2127 LLVMValueRef
2128 lp_build_iround(struct lp_build_context *bld,
2129 LLVMValueRef a)
2130 {
2131 LLVMBuilderRef builder = bld->gallivm->builder;
2132 const struct lp_type type = bld->type;
2133 LLVMTypeRef int_vec_type = bld->int_vec_type;
2134 LLVMValueRef res;
2135
2136 assert(type.floating);
2137
2138 assert(lp_check_value(type, a));
2139
2140 if ((util_cpu_caps.has_sse2 &&
2141 ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2142 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2143 return lp_build_iround_nearest_sse2(bld, a);
2144 }
2145 if (arch_rounding_available(type)) {
2146 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2147 }
2148 else {
2149 LLVMValueRef half;
2150
2151 half = lp_build_const_vec(bld->gallivm, type, 0.5);
2152
2153 if (type.sign) {
2154 LLVMTypeRef vec_type = bld->vec_type;
2155 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2156 (unsigned long long)1 << (type.width - 1));
2157 LLVMValueRef sign;
2158
2159 /* get sign bit */
2160 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2161 sign = LLVMBuildAnd(builder, sign, mask, "");
2162
2163 /* sign * 0.5 */
2164 half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2165 half = LLVMBuildOr(builder, sign, half, "");
2166 half = LLVMBuildBitCast(builder, half, vec_type, "");
2167 }
2168
2169 res = LLVMBuildFAdd(builder, a, half, "");
2170 }
2171
2172 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2173
2174 return res;
2175 }
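/*
 * Quick sanity check of the add-half fallback above: the sign bit of 'a' is
 * ORed into 0.5, so we effectively add copysign(0.5, a) before truncating.
 * E.g. a = 2.3 -> 2.8 -> trunc 2; a = -1.5 -> -2.0 -> trunc -2, matching the
 * iround(-1.5) = -2 example in the comment above.
 */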
2176
2177
2178 /**
2179 * Return floor of float (vector), result is an int (vector)
2180  * Ex: ifloor(1.1) = 1
2181  * Ex: ifloor(-1.1) = -2
2182 */
2183 LLVMValueRef
2184 lp_build_ifloor(struct lp_build_context *bld,
2185 LLVMValueRef a)
2186 {
2187 LLVMBuilderRef builder = bld->gallivm->builder;
2188 const struct lp_type type = bld->type;
2189 LLVMTypeRef int_vec_type = bld->int_vec_type;
2190 LLVMValueRef res;
2191
2192 assert(type.floating);
2193 assert(lp_check_value(type, a));
2194
2195 res = a;
2196 if (type.sign) {
2197 if (arch_rounding_available(type)) {
2198 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2199 }
2200 else {
2201 struct lp_type inttype;
2202 struct lp_build_context intbld;
2203 LLVMValueRef trunc, itrunc, mask;
2204
2205 assert(type.floating);
2206 assert(lp_check_value(type, a));
2207
2208 inttype = type;
2209 inttype.floating = 0;
2210 lp_build_context_init(&intbld, bld->gallivm, inttype);
2211
2212 /* round by truncation */
2213 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2214 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2215
2216 /*
2217 * fix values if rounding is wrong (for non-special cases)
2218 * - this is the case if trunc > a
2219 * The results of doing this with NaNs, very large values etc.
2220 * are undefined but this seems to be the case anyway.
2221 */
2222 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2223 /* cheapie minus one with mask since the mask is minus one / zero */
2224 return lp_build_add(&intbld, itrunc, mask);
2225 }
2226 }
2227
2228    /* truncate (round toward zero) */
2229 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2230
2231 return res;
2232 }
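/*
 * The "cheapie minus one" above relies on lp_build_cmp() producing integer
 * masks of -1 (true) or 0 (false), so an integer add of the mask subtracts
 * one exactly where needed. E.g. ifloor(-1.3): itrunc = -1, trunc = -1.0 is
 * > -1.3, mask = -1, result -1 + (-1) = -2; ifloor(1.3): mask = 0, result
 * stays 1.
 */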
2233
2234
2235 /**
2236 * Return ceiling of float (vector), returning int (vector).
2237 * Ex: iceil( 1.1) = 2
2238 * Ex: iceil(-1.1) = -1
2239 */
2240 LLVMValueRef
2241 lp_build_iceil(struct lp_build_context *bld,
2242 LLVMValueRef a)
2243 {
2244 LLVMBuilderRef builder = bld->gallivm->builder;
2245 const struct lp_type type = bld->type;
2246 LLVMTypeRef int_vec_type = bld->int_vec_type;
2247 LLVMValueRef res;
2248
2249 assert(type.floating);
2250 assert(lp_check_value(type, a));
2251
2252 if (arch_rounding_available(type)) {
2253 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2254 }
2255 else {
2256 struct lp_type inttype;
2257 struct lp_build_context intbld;
2258 LLVMValueRef trunc, itrunc, mask;
2259
2260 assert(type.floating);
2261 assert(lp_check_value(type, a));
2262
2263 inttype = type;
2264 inttype.floating = 0;
2265 lp_build_context_init(&intbld, bld->gallivm, inttype);
2266
2267 /* round by truncation */
2268 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2269 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2270
2271 /*
2272 * fix values if rounding is wrong (for non-special cases)
2273 * - this is the case if trunc < a
2274 * The results of doing this with NaNs, very large values etc.
2275 * are undefined but this seems to be the case anyway.
2276 */
2277 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2278 /* cheapie plus one with mask since the mask is minus one / zero */
2279 return lp_build_sub(&intbld, itrunc, mask);
2280 }
2281
2282    /* truncate (round toward zero) */
2283 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2284
2285 return res;
2286 }
2287
2288
2289 /**
2290 * Combined ifloor() & fract().
2291 *
2292 * Preferred to calling the functions separately, as it will ensure that the
2293 * strategy (floor() vs ifloor()) that results in less redundant work is used.
2294 */
2295 void
2296 lp_build_ifloor_fract(struct lp_build_context *bld,
2297 LLVMValueRef a,
2298 LLVMValueRef *out_ipart,
2299 LLVMValueRef *out_fpart)
2300 {
2301 LLVMBuilderRef builder = bld->gallivm->builder;
2302 const struct lp_type type = bld->type;
2303 LLVMValueRef ipart;
2304
2305 assert(type.floating);
2306 assert(lp_check_value(type, a));
2307
2308 if (arch_rounding_available(type)) {
2309 /*
2310 * floor() is easier.
2311 */
2312
2313 ipart = lp_build_floor(bld, a);
2314 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2315 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2316 }
2317 else {
2318 /*
2319 * ifloor() is easier.
2320 */
2321
2322 *out_ipart = lp_build_ifloor(bld, a);
2323 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2324 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2325 }
2326 }
2327
2328
2329 /**
2330 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2331 * always smaller than one.
2332 */
2333 void
2334 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2335 LLVMValueRef a,
2336 LLVMValueRef *out_ipart,
2337 LLVMValueRef *out_fpart)
2338 {
2339 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2340 *out_fpart = clamp_fract(bld, *out_fpart);
2341 }
2342
2343
2344 LLVMValueRef
2345 lp_build_sqrt(struct lp_build_context *bld,
2346 LLVMValueRef a)
2347 {
2348 LLVMBuilderRef builder = bld->gallivm->builder;
2349 const struct lp_type type = bld->type;
2350 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2351 char intrinsic[32];
2352
2353 assert(lp_check_value(type, a));
2354
2355 /* TODO: optimize the constant case */
2356
2357 assert(type.floating);
2358 if (type.length == 1) {
2359 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.f%u", type.width);
2360 }
2361 else {
2362 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
2363 }
2364
2365 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2366 }
2367
2368
2369 /**
2370  * Do one Newton-Raphson step to improve reciprocal precision:
2371 *
2372 * x_{i+1} = x_i * (2 - a * x_i)
2373 *
2374 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2375 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
2376  * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
2377 * halo. It would be necessary to clamp the argument to prevent this.
2378 *
2379 * See also:
2380 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2381 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2382 */
2383 static INLINE LLVMValueRef
2384 lp_build_rcp_refine(struct lp_build_context *bld,
2385 LLVMValueRef a,
2386 LLVMValueRef rcp_a)
2387 {
2388 LLVMBuilderRef builder = bld->gallivm->builder;
2389 LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
2390 LLVMValueRef res;
2391
2392 res = LLVMBuildFMul(builder, a, rcp_a, "");
2393 res = LLVMBuildFSub(builder, two, res, "");
2394 res = LLVMBuildFMul(builder, rcp_a, res, "");
2395
2396 return res;
2397 }
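/*
 * One refinement step roughly doubles the number of correct bits. For example,
 * with a = 3.0 and a ~12-bit estimate rcp_a = 0.333, the step above gives
 * 0.333 * (2 - 3.0 * 0.333) = 0.333 * 1.001 = 0.333333, i.e. the error drops
 * from about 3e-4 to about 3e-7 in a single iteration.
 */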
2398
2399
2400 LLVMValueRef
2401 lp_build_rcp(struct lp_build_context *bld,
2402 LLVMValueRef a)
2403 {
2404 LLVMBuilderRef builder = bld->gallivm->builder;
2405 const struct lp_type type = bld->type;
2406
2407 assert(lp_check_value(type, a));
2408
2409 if(a == bld->zero)
2410 return bld->undef;
2411 if(a == bld->one)
2412 return bld->one;
2413 if(a == bld->undef)
2414 return bld->undef;
2415
2416 assert(type.floating);
2417
2418 if(LLVMIsConstant(a))
2419 return LLVMConstFDiv(bld->one, a);
2420
2421 /*
2422 * We don't use RCPPS because:
2423     * - it only has 10 bits of precision
2424     * - it doesn't even get the reciprocal of 1.0 exactly
2425     * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2426     * - for recent processors the benefit over DIVPS is marginal, and case
2427     *   dependent
2428     *
2429     * We could still use it on certain processors if benchmarks show that the
2430     * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
2431     *   particular uses that require fewer workarounds.
2432 */
2433
2434 if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2435 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2436 const unsigned num_iterations = 0;
2437 LLVMValueRef res;
2438 unsigned i;
2439 const char *intrinsic = NULL;
2440
2441 if (type.length == 4) {
2442 intrinsic = "llvm.x86.sse.rcp.ps";
2443 }
2444 else {
2445 intrinsic = "llvm.x86.avx.rcp.ps.256";
2446 }
2447
2448 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2449
2450 for (i = 0; i < num_iterations; ++i) {
2451 res = lp_build_rcp_refine(bld, a, res);
2452 }
2453
2454 return res;
2455 }
2456
2457 return LLVMBuildFDiv(builder, bld->one, a, "");
2458 }
2459
2460
2461 /**
2462 * Do one Newton-Raphson step to improve rsqrt precision:
2463 *
2464 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2465 *
2466 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2467 */
2468 static INLINE LLVMValueRef
2469 lp_build_rsqrt_refine(struct lp_build_context *bld,
2470 LLVMValueRef a,
2471 LLVMValueRef rsqrt_a)
2472 {
2473 LLVMBuilderRef builder = bld->gallivm->builder;
2474 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2475 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2476 LLVMValueRef res;
2477
2478 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2479 res = LLVMBuildFMul(builder, a, res, "");
2480 res = LLVMBuildFSub(builder, three, res, "");
2481 res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2482 res = LLVMBuildFMul(builder, half, res, "");
2483
2484 return res;
2485 }
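/*
 * Same quadratic convergence as the rcp refinement: with a = 4.0 and a crude
 * estimate rsqrt_a = 0.49, the step above computes
 * 0.5 * 0.49 * (3 - 4.0 * 0.49 * 0.49) = 0.5 * 0.49 * 2.0396 = 0.4997,
 * closing most of the gap to the exact value 0.5 in a single iteration.
 */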
2486
2487
2488 /**
2489 * Generate 1/sqrt(a).
2490 * Result is undefined for values < 0, infinity for +0.
2491 */
2492 LLVMValueRef
2493 lp_build_rsqrt(struct lp_build_context *bld,
2494 LLVMValueRef a)
2495 {
2496 LLVMBuilderRef builder = bld->gallivm->builder;
2497 const struct lp_type type = bld->type;
2498
2499 assert(lp_check_value(type, a));
2500
2501 assert(type.floating);
2502
2503 /*
2504 * This should be faster but all denormals will end up as infinity.
2505 */
2506 if (0 && lp_build_fast_rsqrt_available(type)) {
2507 const unsigned num_iterations = 1;
2508 LLVMValueRef res;
2509 unsigned i;
2510
2511 /* rsqrt(1.0) != 1.0 here */
2512 res = lp_build_fast_rsqrt(bld, a);
2513
2514 if (num_iterations) {
2515 /*
2516 * Newton-Raphson will result in NaN instead of infinity for zero,
2517 * and NaN instead of zero for infinity.
2518 * Also, need to ensure rsqrt(1.0) == 1.0.
2519 * All numbers smaller than FLT_MIN will result in +infinity
2520 * (rsqrtps treats all denormals as zero).
2521 */
2522 /*
2523 * Certain non-c99 compilers don't know INFINITY and might not support
2524        * hacks to evaluate it at compile time either.
2525 */
2526 const unsigned posinf_int = 0x7F800000;
2527 LLVMValueRef cmp;
2528 LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2529 LLVMValueRef inf = lp_build_const_int_vec(bld->gallivm, type, posinf_int);
2530
2531 inf = LLVMBuildBitCast(builder, inf, lp_build_vec_type(bld->gallivm, type), "");
2532
2533 for (i = 0; i < num_iterations; ++i) {
2534 res = lp_build_rsqrt_refine(bld, a, res);
2535 }
2536 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2537 res = lp_build_select(bld, cmp, inf, res);
2538 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2539 res = lp_build_select(bld, cmp, bld->zero, res);
2540 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2541 res = lp_build_select(bld, cmp, bld->one, res);
2542 }
2543
2544 return res;
2545 }
2546
2547 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2548 }
2549
2550 /**
2551  * Report whether a fast (inaccurate) rsqrt instruction is available.
2552  * Callers may want to avoid calling rsqrt_fast if it's not available:
2553  * e.g. x^0.5 can be computed as rsqrt_fast(x) * x, but if the fast path is
2554  * unavailable that would expand to sqrt/div/mul, so it's obviously much
2555  * better to just call sqrt directly, skipping both the div and the mul.
2556 */
2557 boolean
2558 lp_build_fast_rsqrt_available(struct lp_type type)
2559 {
2560 assert(type.floating);
2561
2562 if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2563 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2564 return true;
2565 }
2566 return false;
2567 }
2568
2569
2570 /**
2571 * Generate 1/sqrt(a).
2572 * Result is undefined for values < 0, infinity for +0.
2573 * Precision is limited, only ~10 bits guaranteed
2574 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2575 */
2576 LLVMValueRef
2577 lp_build_fast_rsqrt(struct lp_build_context *bld,
2578 LLVMValueRef a)
2579 {
2580 LLVMBuilderRef builder = bld->gallivm->builder;
2581 const struct lp_type type = bld->type;
2582
2583 assert(lp_check_value(type, a));
2584
2585 if (lp_build_fast_rsqrt_available(type)) {
2586 const char *intrinsic = NULL;
2587
2588 if (type.length == 4) {
2589 intrinsic = "llvm.x86.sse.rsqrt.ps";
2590 }
2591 else {
2592 intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2593 }
2594 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2595 }
2596 else {
2597 debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2598 }
2599 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2600 }
2601
2602
2603 /**
2604 * Generate sin(a) using SSE2
2605 */
2606 LLVMValueRef
2607 lp_build_sin(struct lp_build_context *bld,
2608 LLVMValueRef a)
2609 {
2610 struct gallivm_state *gallivm = bld->gallivm;
2611 LLVMBuilderRef builder = gallivm->builder;
2612 struct lp_type int_type = lp_int_type(bld->type);
2613 LLVMBuilderRef b = builder;
2614
2615 /*
2616 * take the absolute value,
2617 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2618 */
2619
2620 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2621 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2622
2623 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2624 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2625
2626 /*
2627 * extract the sign bit (upper one)
2628 * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
2629 */
2630 LLVMValueRef sig_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2631 LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i");
2632
2633 /*
2634 * scale by 4/Pi
2635 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2636 */
2637
2638 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2639 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2640
2641 /*
2642 * store the integer part of y in mm0
2643 * emm2 = _mm_cvttps_epi32(y);
2644 */
2645
2646 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2647
2648 /*
2649 * j=(j+1) & (~1) (see the cephes sources)
2650 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2651 */
2652
2653 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2654 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2655 /*
2656 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2657 */
2658 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2659 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2660
2661 /*
2662 * y = _mm_cvtepi32_ps(emm2);
2663 */
2664 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2665
2666 /* get the swap sign flag
2667 * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
2668 */
2669 LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2670 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and");
2671
2672 /*
2673 * emm2 = _mm_slli_epi32(emm0, 29);
2674 */
2675 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2676 LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit");
2677
2678 /*
2679      * get the polynomial selection mask
2680      * there is one polynomial for 0 <= x <= Pi/4
2681      * and another one for Pi/4 < x <= Pi/2
2682 * Both branches will be computed.
2683 *
2684 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2685 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2686 */
2687
2688 LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2689 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3");
2690 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2691 int_type, PIPE_FUNC_EQUAL,
2692 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2693 /*
2694 * sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
2695 */
2696 LLVMValueRef sign_bit_1 = LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit");
2697
2698 /*
2699 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2700 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2701 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2702 */
2703 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2704 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2705 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2706
2707 /*
2708 * The magic pass: "Extended precision modular arithmetic"
2709 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2710 * xmm1 = _mm_mul_ps(y, xmm1);
2711 * xmm2 = _mm_mul_ps(y, xmm2);
2712 * xmm3 = _mm_mul_ps(y, xmm3);
2713 */
2714 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
2715 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
2716 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
2717
2718 /*
2719 * x = _mm_add_ps(x, xmm1);
2720 * x = _mm_add_ps(x, xmm2);
2721 * x = _mm_add_ps(x, xmm3);
2722 */
2723
2724 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
2725 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
2726 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
2727
2728 /*
2729      * Evaluate the first polynomial (0 <= x <= Pi/4)
2730 *
2731 * z = _mm_mul_ps(x,x);
2732 */
2733 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2734
2735 /*
2736 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2737 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2738 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2739 */
2740 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2741 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2742 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2743
2744 /*
2745 * y = *(v4sf*)_ps_coscof_p0;
2746 * y = _mm_mul_ps(y, z);
2747 */
2748 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
2749 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
2750 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
2751 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
2752 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2753 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2754
2755
2756 /*
2757 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2758 * y = _mm_sub_ps(y, tmp);
2759 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2760 */
2761 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2762 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2763 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2764 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2765 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2766
2767 /*
2768 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2769 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2770 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2771 */
2772 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2773 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2774 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2775
2776 /*
2777      * Evaluate the second polynomial (Pi/4 < x <= Pi/2)
2778 *
2779 * y2 = *(v4sf*)_ps_sincof_p0;
2780 * y2 = _mm_mul_ps(y2, z);
2781 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2782 * y2 = _mm_mul_ps(y2, z);
2783 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2784 * y2 = _mm_mul_ps(y2, z);
2785 * y2 = _mm_mul_ps(y2, x);
2786 * y2 = _mm_add_ps(y2, x);
2787 */
2788
2789 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
2790 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
2791 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
2792 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
2793 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2794 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
2795 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
2796
2797 /*
2798      * select the correct result from the two polynomials
2799 * xmm3 = poly_mask;
2800 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2801 * y = _mm_andnot_ps(xmm3, y);
2802 * y = _mm_or_ps(y,y2);
2803 */
2804 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2805 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2806 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2807 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
2808 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2809 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
2810
2811 /*
2812 * update the sign
2813 * y = _mm_xor_ps(y, sign_bit);
2814 */
2815 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin");
2816 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
2817 LLVMValueRef isfinite = lp_build_isfinite(bld, a);
2818
2819 /* clamp output to be within [-1, 1] */
2820 y_result = lp_build_clamp(bld, y_result,
2821 lp_build_const_vec(bld->gallivm, bld->type, -1.f),
2822 lp_build_const_vec(bld->gallivm, bld->type, 1.f));
2823 /* If a is -inf, inf or NaN then return NaN */
2824 y_result = lp_build_select(bld, isfinite, y_result,
2825 lp_build_const_vec(bld->gallivm, bld->type, NAN));
2826 return y_result;
2827 }
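/*
 * Both lp_build_sin() above and lp_build_cos() below follow the classic
 * cephes/sse_mathfun scheme: scale |a| by 4/Pi, round the quadrant count to an
 * even value via j = (j+1) & ~1, subtract j * Pi/4 in three pieces (the DP1/
 * DP2/DP3 constants sum to -Pi/4, split for extra precision), then evaluate
 * either the sine or the cosine minimax polynomial on the reduced argument and
 * patch the sign back in from the quadrant bits. E.g. a = 3.0: y = a * 4/Pi is
 * about 3.82, the rounding step yields quadrant 4, the reduced argument is
 * 3.0 - 4 * Pi/4 = -0.1416, and the polynomial plus sign fixup reproduce
 * sin(3.0) = 0.1411.
 */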
2828
2829
2830 /**
2831 * Generate cos(a) using SSE2
2832 */
2833 LLVMValueRef
2834 lp_build_cos(struct lp_build_context *bld,
2835 LLVMValueRef a)
2836 {
2837 struct gallivm_state *gallivm = bld->gallivm;
2838 LLVMBuilderRef builder = gallivm->builder;
2839 struct lp_type int_type = lp_int_type(bld->type);
2840 LLVMBuilderRef b = builder;
2841
2842 /*
2843 * take the absolute value,
2844 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2845 */
2846
2847 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2848 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2849
2850 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2851 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2852
2853 /*
2854 * scale by 4/Pi
2855 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2856 */
2857
2858 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2859 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2860
2861 /*
2862 * store the integer part of y in mm0
2863 * emm2 = _mm_cvttps_epi32(y);
2864 */
2865
2866 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2867
2868 /*
2869 * j=(j+1) & (~1) (see the cephes sources)
2870 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2871 */
2872
2873 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2874 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2875 /*
2876 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2877 */
2878 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2879 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2880
2881 /*
2882 * y = _mm_cvtepi32_ps(emm2);
2883 */
2884 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2885
2886
2887 /*
2888 * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
2889 */
2890 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2891 LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2");
2892
2893
2894 /* get the swap sign flag
2895 * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
2896 */
2897 LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0);
2898 LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not");
2899 LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2900 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and");
2901
2902 /*
2903 * emm2 = _mm_slli_epi32(emm0, 29);
2904 */
2905 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2906 LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit");
2907
2908 /*
2909      * get the polynomial selection mask
2910      * there is one polynomial for 0 <= x <= Pi/4
2911      * and another one for Pi/4 < x <= Pi/2
2912 * Both branches will be computed.
2913 *
2914 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2915 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2916 */
2917
2918 LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2919 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3");
2920 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2921 int_type, PIPE_FUNC_EQUAL,
2922 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2923
2924 /*
2925 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2926 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2927 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2928 */
2929 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2930 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2931 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2932
2933 /*
2934 * The magic pass: "Extended precision modular arithmetic"
2935 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2936 * xmm1 = _mm_mul_ps(y, xmm1);
2937 * xmm2 = _mm_mul_ps(y, xmm2);
2938 * xmm3 = _mm_mul_ps(y, xmm3);
2939 */
2940 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
2941 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
2942 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
2943
2944 /*
2945 * x = _mm_add_ps(x, xmm1);
2946 * x = _mm_add_ps(x, xmm2);
2947 * x = _mm_add_ps(x, xmm3);
2948 */
2949
2950 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
2951 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
2952 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
2953
2954 /*
2955      * Evaluate the first polynomial (0 <= x <= Pi/4)
2956 *
2957 * z = _mm_mul_ps(x,x);
2958 */
2959 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2960
2961 /*
2962 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2963 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2964 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2965 */
2966 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2967 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2968 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2969
2970 /*
2971 * y = *(v4sf*)_ps_coscof_p0;
2972 * y = _mm_mul_ps(y, z);
2973 */
2974 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
2975 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
2976 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
2977 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
2978 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2979 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2980
2981
2982 /*
2983 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2984 * y = _mm_sub_ps(y, tmp);
2985 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2986 */
2987 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2988 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2989 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2990 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2991 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2992
2993 /*
2994 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2995 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2996 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2997 */
2998 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2999 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
3000 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
3001
3002 /*
3003      * Evaluate the second polynomial (Pi/4 < x <= Pi/2)
3004 *
3005 * y2 = *(v4sf*)_ps_sincof_p0;
3006 * y2 = _mm_mul_ps(y2, z);
3007 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
3008 * y2 = _mm_mul_ps(y2, z);
3009 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
3010 * y2 = _mm_mul_ps(y2, z);
3011 * y2 = _mm_mul_ps(y2, x);
3012 * y2 = _mm_add_ps(y2, x);
3013 */
3014
3015 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
3016 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
3017 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
3018 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
3019 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
3020 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
3021 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
3022
3023 /*
3024      * select the correct result from the two polynomials
3025 * xmm3 = poly_mask;
3026 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
3027 * y = _mm_andnot_ps(xmm3, y);
3028 * y = _mm_or_ps(y,y2);
3029 */
3030 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
3031 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
3032 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
3033 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
3034 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
3035 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
3036
3037 /*
3038 * update the sign
3039 * y = _mm_xor_ps(y, sign_bit);
3040 */
3041 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sin");
3042 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
3043 LLVMValueRef isfinite = lp_build_isfinite(bld, a);
3044
3045 /* clamp output to be within [-1, 1] */
3046 y_result = lp_build_clamp(bld, y_result,
3047 lp_build_const_vec(bld->gallivm, bld->type, -1.f),
3048 lp_build_const_vec(bld->gallivm, bld->type, 1.f));
3049 /* If a is -inf, inf or NaN then return NaN */
3050 y_result = lp_build_select(bld, isfinite, y_result,
3051 lp_build_const_vec(bld->gallivm, bld->type, NAN));
3052 return y_result;
3053 }
3054
3055
3056 /**
3057 * Generate pow(x, y)
3058 */
3059 LLVMValueRef
3060 lp_build_pow(struct lp_build_context *bld,
3061 LLVMValueRef x,
3062 LLVMValueRef y)
3063 {
3064 /* TODO: optimize the constant case */
3065 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3066 LLVMIsConstant(x) && LLVMIsConstant(y)) {
3067 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3068 __FUNCTION__);
3069 }
3070
3071 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
3072 }
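/*
 * The identity used above is x^y = 2^(y * log2(x)); e.g. pow(2.0, 10.0) goes
 * through exp2(10.0 * 1.0) = 1024.0. Accuracy is therefore bounded by the
 * exp2/log2 polynomial approximations below rather than by a dedicated pow
 * routine.
 */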
3073
3074
3075 /**
3076 * Generate exp(x)
3077 */
3078 LLVMValueRef
3079 lp_build_exp(struct lp_build_context *bld,
3080 LLVMValueRef x)
3081 {
3082 /* log2(e) = 1/log(2) */
3083 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
3084 1.4426950408889634);
3085
3086 assert(lp_check_value(bld->type, x));
3087
3088 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
3089 }
3090
3091
3092 /**
3093 * Generate log(x)
3094 * Behavior is undefined with infs, 0s and nans
3095 */
3096 LLVMValueRef
3097 lp_build_log(struct lp_build_context *bld,
3098 LLVMValueRef x)
3099 {
3100 /* log(2) */
3101 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3102 0.69314718055994529);
3103
3104 assert(lp_check_value(bld->type, x));
3105
3106 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
3107 }
3108
3109 /**
3110 * Generate log(x) that handles edge cases (infs, 0s and nans)
3111 */
3112 LLVMValueRef
3113 lp_build_log_safe(struct lp_build_context *bld,
3114 LLVMValueRef x)
3115 {
3116 /* log(2) */
3117 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3118 0.69314718055994529);
3119
3120 assert(lp_check_value(bld->type, x));
3121
3122 return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
3123 }
3124
3125
3126 /**
3127 * Generate polynomial.
3128 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
3129 */
3130 LLVMValueRef
3131 lp_build_polynomial(struct lp_build_context *bld,
3132 LLVMValueRef x,
3133 const double *coeffs,
3134 unsigned num_coeffs)
3135 {
3136 const struct lp_type type = bld->type;
3137 LLVMValueRef even = NULL, odd = NULL;
3138 LLVMValueRef x2;
3139 unsigned i;
3140
3141 assert(lp_check_value(bld->type, x));
3142
3143 /* TODO: optimize the constant case */
3144 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3145 LLVMIsConstant(x)) {
3146 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3147 __FUNCTION__);
3148 }
3149
3150 /*
3151     * Calculate odd and even terms separately to decrease data dependency
3152 * Ex:
3153 * c[0] + x^2 * c[2] + x^4 * c[4] ...
3154 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
3155 */
3156 x2 = lp_build_mul(bld, x, x);
3157
3158 for (i = num_coeffs; i--; ) {
3159 LLVMValueRef coeff;
3160
3161 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
3162
3163 if (i % 2 == 0) {
3164 if (even)
3165 even = lp_build_add(bld, coeff, lp_build_mul(bld, x2, even));
3166 else
3167 even = coeff;
3168 } else {
3169 if (odd)
3170 odd = lp_build_add(bld, coeff, lp_build_mul(bld, x2, odd));
3171 else
3172 odd = coeff;
3173 }
3174 }
3175
3176 if (odd)
3177 return lp_build_add(bld, lp_build_mul(bld, odd, x), even);
3178 else if (even)
3179 return even;
3180 else
3181 return bld->undef;
3182 }
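/*
 * For example, with four coefficients the loop above effectively computes
 *
 *    even = c[0] + x2 * c[2]
 *    odd  = c[1] + x2 * c[3]
 *    res  = even + x * odd
 *
 * so the even and odd Horner chains can issue in parallel instead of forming
 * one long dependency chain c[0] + x*(c[1] + x*(c[2] + x*c[3])).
 */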
3183
3184
3185 /**
3186 * Minimax polynomial fit of 2**x, in range [0, 1[
3187 */
3188 const double lp_build_exp2_polynomial[] = {
3189 #if EXP_POLY_DEGREE == 5
3190 1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
3191 0.693153073200168932794,
3192 0.240153617044375388211,
3193 0.0558263180532956664775,
3194 0.00898934009049466391101,
3195 0.00187757667519147912699
3196 #elif EXP_POLY_DEGREE == 4
3197 1.00000259337069434683,
3198 0.693003834469974940458,
3199 0.24144275689150793076,
3200 0.0520114606103070150235,
3201 0.0135341679161270268764
3202 #elif EXP_POLY_DEGREE == 3
3203 0.999925218562710312959,
3204 0.695833540494823811697,
3205 0.226067155427249155588,
3206 0.0780245226406372992967
3207 #elif EXP_POLY_DEGREE == 2
3208 1.00172476321474503578,
3209 0.657636275736077639316,
3210 0.33718943461968720704
3211 #else
3212 #error
3213 #endif
3214 };
3215
3216
3217 void
3218 lp_build_exp2_approx(struct lp_build_context *bld,
3219 LLVMValueRef x,
3220 LLVMValueRef *p_exp2_int_part,
3221 LLVMValueRef *p_frac_part,
3222 LLVMValueRef *p_exp2)
3223 {
3224 LLVMBuilderRef builder = bld->gallivm->builder;
3225 const struct lp_type type = bld->type;
3226 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3227 LLVMValueRef ipart = NULL;
3228 LLVMValueRef fpart = NULL;
3229 LLVMValueRef expipart = NULL;
3230 LLVMValueRef expfpart = NULL;
3231 LLVMValueRef res = NULL;
3232
3233 assert(lp_check_value(bld->type, x));
3234
3235 if(p_exp2_int_part || p_frac_part || p_exp2) {
3236 /* TODO: optimize the constant case */
3237 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3238 LLVMIsConstant(x)) {
3239 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3240 __FUNCTION__);
3241 }
3242
3243 assert(type.floating && type.width == 32);
3244
3245       /* We want to preserve NaN and make sure that for exp2, if x > 128
3246        * the result is INF, and if it's smaller than -126.9 the result is 0 */
3247 x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
3248 GALLIVM_NAN_RETURN_SECOND);
3249 x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999), x,
3250 GALLIVM_NAN_RETURN_SECOND);
3251
3252 /* ipart = floor(x) */
3253 /* fpart = x - ipart */
3254 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3255 }
3256
3257 if(p_exp2_int_part || p_exp2) {
3258 /* expipart = (float) (1 << ipart) */
3259 expipart = LLVMBuildAdd(builder, ipart,
3260 lp_build_const_int_vec(bld->gallivm, type, 127), "");
3261 expipart = LLVMBuildShl(builder, expipart,
3262 lp_build_const_int_vec(bld->gallivm, type, 23), "");
3263 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
3264 }
3265
3266 if(p_exp2) {
3267 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3268 Elements(lp_build_exp2_polynomial));
3269
3270 res = LLVMBuildFMul(builder, expipart, expfpart, "");
3271 }
3272
3273 if(p_exp2_int_part)
3274 *p_exp2_int_part = expipart;
3275
3276 if(p_frac_part)
3277 *p_frac_part = fpart;
3278
3279 if(p_exp2)
3280 *p_exp2 = res;
3281 }
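/*
 * Rough trace of the decomposition above for x = 3.5: ipart = 3, fpart = 0.5;
 * expipart builds the float 2^3 = 8.0 directly in the exponent field as
 * (3 + 127) << 23; the minimax polynomial evaluates 2^0.5 to about 1.41421,
 * and the final product gives exp2(3.5) ~= 11.3137.
 */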
3282
3283
3284 LLVMValueRef
3285 lp_build_exp2(struct lp_build_context *bld,
3286 LLVMValueRef x)
3287 {
3288 LLVMValueRef res;
3289 lp_build_exp2_approx(bld, x, NULL, NULL, &res);
3290 return res;
3291 }
3292
3293
3294 /**
3295  * Extract the exponent of an IEEE-754 floating point value.
3296 *
3297 * Optionally apply an integer bias.
3298 *
3299 * Result is an integer value with
3300 *
3301 * ifloor(log2(x)) + bias
3302 */
3303 LLVMValueRef
3304 lp_build_extract_exponent(struct lp_build_context *bld,
3305 LLVMValueRef x,
3306 int bias)
3307 {
3308 LLVMBuilderRef builder = bld->gallivm->builder;
3309 const struct lp_type type = bld->type;
3310 unsigned mantissa = lp_mantissa(type);
3311 LLVMValueRef res;
3312
3313 assert(type.floating);
3314
3315 assert(lp_check_value(bld->type, x));
3316
3317 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3318
3319 res = LLVMBuildLShr(builder, x,
3320 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3321 res = LLVMBuildAnd(builder, res,
3322 lp_build_const_int_vec(bld->gallivm, type, 255), "");
3323 res = LLVMBuildSub(builder, res,
3324 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3325
3326 return res;
3327 }
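/*
 * Example: for x = 6.0 the stored exponent field is 129, so with bias = 0 the
 * result is 129 - 127 = 2 == ifloor(log2(6.0)); lp_build_fast_log2() below
 * passes bias = -1 to get floor(log2(x)) - 1 in one step.
 */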
3328
3329
3330 /**
3331  * Extract the mantissa of a floating point value.
3332  *
3333  * Result is a floating point value with
3334  *
3335  *    x / 2**floor(log2(x))
3336 */
3337 LLVMValueRef
3338 lp_build_extract_mantissa(struct lp_build_context *bld,
3339 LLVMValueRef x)
3340 {
3341 LLVMBuilderRef builder = bld->gallivm->builder;
3342 const struct lp_type type = bld->type;
3343 unsigned mantissa = lp_mantissa(type);
3344 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3345 (1ULL << mantissa) - 1);
3346 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3347 LLVMValueRef res;
3348
3349 assert(lp_check_value(bld->type, x));
3350
3351 assert(type.floating);
3352
3353 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3354
3355 /* res = x / 2**ipart */
3356 res = LLVMBuildAnd(builder, x, mantmask, "");
3357 res = LLVMBuildOr(builder, res, one, "");
3358 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3359
3360 return res;
3361 }
3362
3363
3364
3365 /**
3366  * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in range of [0, 1/9[
3367  * These coefficients can be generated with
3368 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3369 */
3370 const double lp_build_log2_polynomial[] = {
3371 #if LOG_POLY_DEGREE == 5
3372 2.88539008148777786488L,
3373 0.961796878841293367824L,
3374 0.577058946784739859012L,
3375 0.412914355135828735411L,
3376 0.308591899232910175289L,
3377 0.352376952300281371868L,
3378 #elif LOG_POLY_DEGREE == 4
3379 2.88539009343309178325L,
3380 0.961791550404184197881L,
3381 0.577440339438736392009L,
3382 0.403343858251329912514L,
3383 0.406718052498846252698L,
3384 #elif LOG_POLY_DEGREE == 3
3385 2.88538959748872753838L,
3386 0.961932915889597772928L,
3387 0.571118517972136195241L,
3388 0.493997535084709500285L,
3389 #else
3390 #error
3391 #endif
3392 };
3393
3394 /**
3395 * See http://www.devmaster.net/forums/showthread.php?p=43580
3396 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3397 * http://www.nezumi.demon.co.uk/consult/logx.htm
3398 *
3399 * If handle_edge_cases is true the function will perform computations
3400 * to match the required D3D10+ behavior for each of the edge cases.
3401 * That means that if input is:
3402 * - less than zero (to and including -inf) then NaN will be returned
3403 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3404 * - +infinity, then +infinity will be returned
3405 * - NaN, then NaN will be returned
3406 *
3407 * Those checks are fairly expensive so if you don't need them make sure
3408 * handle_edge_cases is false.
3409 */
3410 void
3411 lp_build_log2_approx(struct lp_build_context *bld,
3412 LLVMValueRef x,
3413 LLVMValueRef *p_exp,
3414 LLVMValueRef *p_floor_log2,
3415 LLVMValueRef *p_log2,
3416 boolean handle_edge_cases)
3417 {
3418 LLVMBuilderRef builder = bld->gallivm->builder;
3419 const struct lp_type type = bld->type;
3420 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3421 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3422
3423 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3424 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3425 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3426
3427 LLVMValueRef i = NULL;
3428 LLVMValueRef y = NULL;
3429 LLVMValueRef z = NULL;
3430 LLVMValueRef exp = NULL;
3431 LLVMValueRef mant = NULL;
3432 LLVMValueRef logexp = NULL;
3433 LLVMValueRef logmant = NULL;
3434 LLVMValueRef res = NULL;
3435
3436 assert(lp_check_value(bld->type, x));
3437
3438 if(p_exp || p_floor_log2 || p_log2) {
3439 /* TODO: optimize the constant case */
3440 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3441 LLVMIsConstant(x)) {
3442 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3443 __FUNCTION__);
3444 }
3445
3446 assert(type.floating && type.width == 32);
3447
3448 /*
3449 * We don't explicitly handle denormalized numbers. They will yield a
3450        * result in the neighbourhood of -127, which appears to be
3451        * adequate.
3452 */
3453
3454 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3455
3456 /* exp = (float) exponent(x) */
3457 exp = LLVMBuildAnd(builder, i, expmask, "");
3458 }
3459
3460 if(p_floor_log2 || p_log2) {
3461 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3462 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3463 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3464 }
3465
3466 if(p_log2) {
3467 /* mant = 1 + (float) mantissa(x) */
3468 mant = LLVMBuildAnd(builder, i, mantmask, "");
3469 mant = LLVMBuildOr(builder, mant, one, "");
3470 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3471
3472 /* y = (mant - 1) / (mant + 1) */
3473 y = lp_build_div(bld,
3474 lp_build_sub(bld, mant, bld->one),
3475 lp_build_add(bld, mant, bld->one)
3476 );
3477
3478 /* z = y^2 */
3479 z = lp_build_mul(bld, y, y);
3480
3481 /* compute P(z) */
3482 logmant = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3483 Elements(lp_build_log2_polynomial));
3484
3485 /* logmant = y * P(z) */
3486 logmant = lp_build_mul(bld, y, logmant);
3487
3488 res = lp_build_add(bld, logmant, logexp);
3489
3490 if (type.floating && handle_edge_cases) {
3491 LLVMValueRef negmask, infmask, zmask;
3492 negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3493 lp_build_const_vec(bld->gallivm, type, 0.0f));
3494 zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3495 lp_build_const_vec(bld->gallivm, type, 0.0f));
3496 infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3497 lp_build_const_vec(bld->gallivm, type, INFINITY));
3498
3499          /* If x is equal to inf make sure we return inf */
3500 res = lp_build_select(bld, infmask,
3501 lp_build_const_vec(bld->gallivm, type, INFINITY),
3502 res);
3503          /* If x is equal to 0, return -inf */
3504 res = lp_build_select(bld, zmask,
3505 lp_build_const_vec(bld->gallivm, type, -INFINITY),
3506 res);
3507 /* If x is nan or less than 0, return nan */
3508 res = lp_build_select(bld, negmask,
3509 lp_build_const_vec(bld->gallivm, type, NAN),
3510 res);
3511 }
3512 }
3513
3514 if(p_exp) {
3515 exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3516 *p_exp = exp;
3517 }
3518
3519 if(p_floor_log2)
3520 *p_floor_log2 = logexp;
3521
3522 if(p_log2)
3523 *p_log2 = res;
3524 }
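/*
 * The mantissa path above uses the identity log2(m) = 2/ln(2) * atanh(y) with
 * y = (m - 1) / (m + 1), which converges well because m is confined to [1, 2)
 * and therefore |y| < 1/3 (hence the [0, 1/9[ fit range for z = y^2 in
 * lp_build_log2_polynomial). The final result is then simply
 * log2(x) = exponent + y * P(y^2).
 */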
3525
3526
3527 /*
3528 * log2 implementation which doesn't have special code to
3529 * handle edge cases (-inf, 0, inf, NaN). It's faster but
3530 * the results for those cases are undefined.
3531 */
3532 LLVMValueRef
3533 lp_build_log2(struct lp_build_context *bld,
3534 LLVMValueRef x)
3535 {
3536 LLVMValueRef res;
3537 lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
3538 return res;
3539 }
3540
3541 /*
3542 * Version of log2 which handles all edge cases.
3543 * Look at documentation of lp_build_log2_approx for
3544 * description of the behavior for each of the edge cases.
3545 */
3546 LLVMValueRef
3547 lp_build_log2_safe(struct lp_build_context *bld,
3548 LLVMValueRef x)
3549 {
3550 LLVMValueRef res;
3551 lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
3552 return res;
3553 }
3554
3555
3556 /**
3557 * Faster (and less accurate) log2.
3558 *
3559 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3560 *
3561 * Piece-wise linear approximation, with exact results when x is a
3562 * power of two.
3563 *
3564 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3565 */
3566 LLVMValueRef
3567 lp_build_fast_log2(struct lp_build_context *bld,
3568 LLVMValueRef x)
3569 {
3570 LLVMBuilderRef builder = bld->gallivm->builder;
3571 LLVMValueRef ipart;
3572 LLVMValueRef fpart;
3573
3574 assert(lp_check_value(bld->type, x));
3575
3576 assert(bld->type.floating);
3577
3578 /* ipart = floor(log2(x)) - 1 */
3579 ipart = lp_build_extract_exponent(bld, x, -1);
3580 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3581
3582 /* fpart = x / 2**ipart */
3583 fpart = lp_build_extract_mantissa(bld, x);
3584
3585 /* ipart + fpart */
3586 return LLVMBuildFAdd(builder, ipart, fpart, "");
3587 }
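/*
 * Worked example for the piece-wise linear approximation above: x = 8.0 gives
 * ipart = 3 - 1 = 2 and fpart = 1.0, so the result is exactly 3.0; x = 6.0
 * gives ipart = 1 and fpart = 1.5, i.e. 2.5 versus the true log2(6.0) = 2.585.
 */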
3588
3589
3590 /**
3591 * Fast implementation of iround(log2(x)).
3592 *
3593 * Not an approximation -- it should give accurate results all the time.
3594 */
3595 LLVMValueRef
3596 lp_build_ilog2(struct lp_build_context *bld,
3597 LLVMValueRef x)
3598 {
3599 LLVMBuilderRef builder = bld->gallivm->builder;
3600 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3601 LLVMValueRef ipart;
3602
3603 assert(bld->type.floating);
3604
3605 assert(lp_check_value(bld->type, x));
3606
3607 /* x * 2^(0.5) i.e., add 0.5 to the log2(x) */
3608 x = LLVMBuildFMul(builder, x, sqrt2, "");
3609
3610 /* ipart = floor(log2(x) + 0.5) */
3611 ipart = lp_build_extract_exponent(bld, x, 0);
3612
3613 return ipart;
3614 }
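/*
 * Multiplying by sqrt(2) shifts the exponent boundaries by half a power of
 * two, which turns the floor in the exponent extraction into a round:
 * e.g. x = 5.0 -> 5.0 * 1.4142 = 7.07, whose exponent is 2, matching
 * iround(log2(5.0)) = iround(2.32) = 2, while x = 6.0 -> 8.49 yields 3.
 */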
3615
3616 LLVMValueRef
3617 lp_build_mod(struct lp_build_context *bld,
3618 LLVMValueRef x,
3619 LLVMValueRef y)
3620 {
3621 LLVMBuilderRef builder = bld->gallivm->builder;
3622 LLVMValueRef res;
3623 const struct lp_type type = bld->type;
3624
3625 assert(lp_check_value(type, x));
3626 assert(lp_check_value(type, y));
3627
3628 if (type.floating)
3629 res = LLVMBuildFRem(builder, x, y, "");
3630 else if (type.sign)
3631 res = LLVMBuildSRem(builder, x, y, "");
3632 else
3633 res = LLVMBuildURem(builder, x, y, "");
3634 return res;
3635 }
3636
3637
3638 /*
3639 * For floating inputs it creates and returns a mask
3640 * which is all 1's for channels which are NaN.
3641 * Channels inside x which are not NaN will be 0.
3642 */
3643 LLVMValueRef
3644 lp_build_isnan(struct lp_build_context *bld,
3645 LLVMValueRef x)
3646 {
3647 LLVMValueRef mask;
3648 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3649
3650 assert(bld->type.floating);
3651 assert(lp_check_value(bld->type, x));
3652
3653 mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3654 "isnotnan");
3655 mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3656 mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3657 return mask;
3658 }
3659
3660 /* Returns all 1's for floating point numbers that are
3661  * finite and returns all zeros for -inf,
3662  * inf and NaNs */
3663 LLVMValueRef
3664 lp_build_isfinite(struct lp_build_context *bld,
3665 LLVMValueRef x)
3666 {
3667 LLVMBuilderRef builder = bld->gallivm->builder;
3668 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3669 struct lp_type int_type = lp_int_type(bld->type);
3670 LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3671 LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3672 0x7f800000);
3673
3674 if (!bld->type.floating) {
3675 return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3676 }
3677 assert(bld->type.floating);
3678 assert(lp_check_value(bld->type, x));
3679 assert(bld->type.width == 32);
3680
3681 intx = LLVMBuildAnd(builder, intx, infornan32, "");
3682 return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3683 intx, infornan32);
3684 }