gallivm: handle -inf, inf and nan's in sin/cos instructions
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expression simplification here. The reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in the [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include <float.h>
49
50 #include "util/u_memory.h"
51 #include "util/u_debug.h"
52 #include "util/u_math.h"
53 #include "util/u_string.h"
54 #include "util/u_cpu_detect.h"
55
56 #include "lp_bld_type.h"
57 #include "lp_bld_const.h"
58 #include "lp_bld_init.h"
59 #include "lp_bld_intr.h"
60 #include "lp_bld_logic.h"
61 #include "lp_bld_pack.h"
62 #include "lp_bld_debug.h"
63 #include "lp_bld_bitarit.h"
64 #include "lp_bld_arit.h"
65 #include "lp_bld_flow.h"
66
67
68 #define EXP_POLY_DEGREE 5
69
70 #define LOG_POLY_DEGREE 4
71
72
73 /**
74 * Generate min(a, b)
75 * No checks for special-case values of a or b (such as 0 or 1) are done.
76 * NaN's are handled according to the behavior specified by the
77 * nan_behavior argument.
78 */
79 static LLVMValueRef
80 lp_build_min_simple(struct lp_build_context *bld,
81 LLVMValueRef a,
82 LLVMValueRef b,
83 enum gallivm_nan_behavior nan_behavior)
84 {
85 const struct lp_type type = bld->type;
86 const char *intrinsic = NULL;
87 unsigned intr_size = 0;
88 LLVMValueRef cond;
89
90 assert(lp_check_value(type, a));
91 assert(lp_check_value(type, b));
92
93 /* TODO: optimize the constant case */
94
95 if (type.floating && util_cpu_caps.has_sse) {
96 if (type.width == 32) {
97 if (type.length == 1) {
98 intrinsic = "llvm.x86.sse.min.ss";
99 intr_size = 128;
100 }
101 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
102 intrinsic = "llvm.x86.sse.min.ps";
103 intr_size = 128;
104 }
105 else {
106 intrinsic = "llvm.x86.avx.min.ps.256";
107 intr_size = 256;
108 }
109 }
110 if (type.width == 64 && util_cpu_caps.has_sse2) {
111 if (type.length == 1) {
112 intrinsic = "llvm.x86.sse2.min.sd";
113 intr_size = 128;
114 }
115 else if (type.length == 2 || !util_cpu_caps.has_avx) {
116 intrinsic = "llvm.x86.sse2.min.pd";
117 intr_size = 128;
118 }
119 else {
120 intrinsic = "llvm.x86.avx.min.pd.256";
121 intr_size = 256;
122 }
123 }
124 }
125 else if (type.floating && util_cpu_caps.has_altivec) {
126 debug_printf("%s: altivec doesn't support nan behavior modes\n",
127 __FUNCTION__);
128 if (type.width == 32 && type.length == 4) {
129 intrinsic = "llvm.ppc.altivec.vminfp";
130 intr_size = 128;
131 }
132 } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
133 intr_size = 128;
134 if ((type.width == 8 || type.width == 16) &&
135 (type.width * type.length <= 64) &&
136 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
137 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
138 __FUNCTION__);
139 }
140 if (type.width == 8 && !type.sign) {
141 intrinsic = "llvm.x86.sse2.pminu.b";
142 }
143 else if (type.width == 16 && type.sign) {
144 intrinsic = "llvm.x86.sse2.pmins.w";
145 }
146 if (util_cpu_caps.has_sse4_1) {
147 if (type.width == 8 && type.sign) {
148 intrinsic = "llvm.x86.sse41.pminsb";
149 }
150 if (type.width == 16 && !type.sign) {
151 intrinsic = "llvm.x86.sse41.pminuw";
152 }
153 if (type.width == 32 && !type.sign) {
154 intrinsic = "llvm.x86.sse41.pminud";
155 }
156 if (type.width == 32 && type.sign) {
157 intrinsic = "llvm.x86.sse41.pminsd";
158 }
159 }
160 } else if (util_cpu_caps.has_altivec) {
161 intr_size = 128;
162 debug_printf("%s: altivec doesn't support nan behavior modes\n",
163 __FUNCTION__);
164 if (type.width == 8) {
165 if (!type.sign) {
166 intrinsic = "llvm.ppc.altivec.vminub";
167 } else {
168 intrinsic = "llvm.ppc.altivec.vminsb";
169 }
170 } else if (type.width == 16) {
171 if (!type.sign) {
172 intrinsic = "llvm.ppc.altivec.vminuh";
173 } else {
174 intrinsic = "llvm.ppc.altivec.vminsh";
175 }
176 } else if (type.width == 32) {
177 if (!type.sign) {
178 intrinsic = "llvm.ppc.altivec.vminuw";
179 } else {
180 intrinsic = "llvm.ppc.altivec.vminsw";
181 }
182 }
183 }
184
185 if(intrinsic) {
186 /* We need to handle nan's for floating point numbers. If one of the
187 * inputs is nan the other should be returned (required by both D3D10+
188 * and OpenCL).
189 * The sse intrinsics return the second operand in case of nan by
190 * default so we need special code to handle those.
191 */
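/* Concretely, the sse min intrinsics return their second operand when
 * either input is NaN: a NaN in 'a' already yields 'b' (right for
 * RETURN_OTHER), and a NaN in 'b' already yields NaN (right for
 * RETURN_NAN); the isnan/select below fixes up the remaining case.
 */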
192 if (util_cpu_caps.has_sse && type.floating &&
193 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
194 nan_behavior != GALLIVM_NAN_RETURN_SECOND) {
195 LLVMValueRef isnan, max;
196 max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
197 type,
198 intr_size, a, b);
199 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
200 isnan = lp_build_isnan(bld, b);
201 return lp_build_select(bld, isnan, a, max);
202 } else {
203 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
204 isnan = lp_build_isnan(bld, a);
205 return lp_build_select(bld, isnan, a, max);
206 }
207 } else {
208 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
209 type,
210 intr_size, a, b);
211 }
212 }
213
214 if (type.floating) {
215 switch (nan_behavior) {
216 case GALLIVM_NAN_RETURN_NAN: {
217 LLVMValueRef isnan = lp_build_isnan(bld, b);
218 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
219 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
220 return lp_build_select(bld, cond, a, b);
221 }
222 break;
223 case GALLIVM_NAN_RETURN_OTHER: {
224 LLVMValueRef isnan = lp_build_isnan(bld, a);
225 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
226 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
227 return lp_build_select(bld, cond, a, b);
228 }
229 break;
230 case GALLIVM_NAN_RETURN_SECOND:
231 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
232 return lp_build_select(bld, cond, a, b);
233 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
234 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
235 return lp_build_select(bld, cond, a, b);
236 break;
237 default:
238 assert(0);
239 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
240 return lp_build_select(bld, cond, a, b);
241 }
242 } else {
243 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
244 return lp_build_select(bld, cond, a, b);
245 }
246 }
247
248
249 /**
250 * Generate max(a, b)
251 * No checks for special-case values of a or b (such as 0 or 1) are done.
252 * NaN's are handled according to the behavior specified by the
253 * nan_behavior argument.
254 */
255 static LLVMValueRef
256 lp_build_max_simple(struct lp_build_context *bld,
257 LLVMValueRef a,
258 LLVMValueRef b,
259 enum gallivm_nan_behavior nan_behavior)
260 {
261 const struct lp_type type = bld->type;
262 const char *intrinsic = NULL;
263 unsigned intr_size = 0;
264 LLVMValueRef cond;
265
266 assert(lp_check_value(type, a));
267 assert(lp_check_value(type, b));
268
269 /* TODO: optimize the constant case */
270
271 if (type.floating && util_cpu_caps.has_sse) {
272 if (type.width == 32) {
273 if (type.length == 1) {
274 intrinsic = "llvm.x86.sse.max.ss";
275 intr_size = 128;
276 }
277 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
278 intrinsic = "llvm.x86.sse.max.ps";
279 intr_size = 128;
280 }
281 else {
282 intrinsic = "llvm.x86.avx.max.ps.256";
283 intr_size = 256;
284 }
285 }
286 if (type.width == 64 && util_cpu_caps.has_sse2) {
287 if (type.length == 1) {
288 intrinsic = "llvm.x86.sse2.max.sd";
289 intr_size = 128;
290 }
291 else if (type.length == 2 || !util_cpu_caps.has_avx) {
292 intrinsic = "llvm.x86.sse2.max.pd";
293 intr_size = 128;
294 }
295 else {
296 intrinsic = "llvm.x86.avx.max.pd.256";
297 intr_size = 256;
298 }
299 }
300 }
301 else if (type.floating && util_cpu_caps.has_altivec) {
302 debug_printf("%s: altivec doesn't support nan behavior modes\n",
303 __FUNCTION__);
304 if (type.width == 32 && type.length == 4) {
305 intrinsic = "llvm.ppc.altivec.vmaxfp";
306 intr_size = 128;
307 }
308 } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
309 intr_size = 128;
310 if ((type.width == 8 || type.width == 16) &&
311 (type.width * type.length <= 64) &&
312 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
313 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
314 __FUNCTION__);
315 }
316 if (type.width == 8 && !type.sign) {
317 intrinsic = "llvm.x86.sse2.pmaxu.b";
318 intr_size = 128;
319 }
320 else if (type.width == 16 && type.sign) {
321 intrinsic = "llvm.x86.sse2.pmaxs.w";
322 }
323 if (util_cpu_caps.has_sse4_1) {
324 if (type.width == 8 && type.sign) {
325 intrinsic = "llvm.x86.sse41.pmaxsb";
326 }
327 if (type.width == 16 && !type.sign) {
328 intrinsic = "llvm.x86.sse41.pmaxuw";
329 }
330 if (type.width == 32 && !type.sign) {
331 intrinsic = "llvm.x86.sse41.pmaxud";
332 }
333 if (type.width == 32 && type.sign) {
334 intrinsic = "llvm.x86.sse41.pmaxsd";
335 }
336 }
337 } else if (util_cpu_caps.has_altivec) {
338 intr_size = 128;
339 debug_printf("%s: altivec doesn't support nan behavior modes\n",
340 __FUNCTION__);
341 if (type.width == 8) {
342 if (!type.sign) {
343 intrinsic = "llvm.ppc.altivec.vmaxub";
344 } else {
345 intrinsic = "llvm.ppc.altivec.vmaxsb";
346 }
347 } else if (type.width == 16) {
348 if (!type.sign) {
349 intrinsic = "llvm.ppc.altivec.vmaxuh";
350 } else {
351 intrinsic = "llvm.ppc.altivec.vmaxsh";
352 }
353 } else if (type.width == 32) {
354 if (!type.sign) {
355 intrinsic = "llvm.ppc.altivec.vmaxuw";
356 } else {
357 intrinsic = "llvm.ppc.altivec.vmaxsw";
358 }
359 }
360 }
361
362 if(intrinsic) {
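/* As in lp_build_min_simple: the sse max intrinsics return their second
 * operand whenever either input is NaN, so only one case per nan_behavior
 * mode needs the extra isnan/select fixup below.
 */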
363 if (util_cpu_caps.has_sse && type.floating &&
364 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
365 nan_behavior != GALLIVM_NAN_RETURN_SECOND) {
366 LLVMValueRef isnan, min;
367 min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
368 type,
369 intr_size, a, b);
370 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
371 isnan = lp_build_isnan(bld, b);
372 return lp_build_select(bld, isnan, a, min);
373 } else {
374 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
375 isnan = lp_build_isnan(bld, a);
376 return lp_build_select(bld, isnan, a, min);
377 }
378 } else {
379 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
380 type,
381 intr_size, a, b);
382 }
383 }
384
385 if (type.floating) {
386 switch (nan_behavior) {
387 case GALLIVM_NAN_RETURN_NAN: {
388 LLVMValueRef isnan = lp_build_isnan(bld, b);
389 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
390 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
391 return lp_build_select(bld, cond, a, b);
392 }
393 break;
394 case GALLIVM_NAN_RETURN_OTHER: {
395 LLVMValueRef isnan = lp_build_isnan(bld, a);
396 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
397 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
398 return lp_build_select(bld, cond, a, b);
399 }
400 break;
401 case GALLIVM_NAN_RETURN_SECOND:
402 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
403 return lp_build_select(bld, cond, a, b);
404 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
405 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
406 return lp_build_select(bld, cond, a, b);
407 break;
408 default:
409 assert(0);
410 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
411 return lp_build_select(bld, cond, a, b);
412 }
413 } else {
414 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
415 return lp_build_select(bld, cond, a, b);
416 }
417 }
418
419
420 /**
421 * Generate 1 - a, or ~a depending on bld->type.
422 */
423 LLVMValueRef
424 lp_build_comp(struct lp_build_context *bld,
425 LLVMValueRef a)
426 {
427 LLVMBuilderRef builder = bld->gallivm->builder;
428 const struct lp_type type = bld->type;
429
430 assert(lp_check_value(type, a));
431
432 if(a == bld->one)
433 return bld->zero;
434 if(a == bld->zero)
435 return bld->one;
436
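/* For unsigned normalized values the bitwise complement is exactly
 * max - a, i.e. the same as 1.0 - a, so a single NOT suffices.
 */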
437 if(type.norm && !type.floating && !type.fixed && !type.sign) {
438 if(LLVMIsConstant(a))
439 return LLVMConstNot(a);
440 else
441 return LLVMBuildNot(builder, a, "");
442 }
443
444 if(LLVMIsConstant(a))
445 if (type.floating)
446 return LLVMConstFSub(bld->one, a);
447 else
448 return LLVMConstSub(bld->one, a);
449 else
450 if (type.floating)
451 return LLVMBuildFSub(builder, bld->one, a, "");
452 else
453 return LLVMBuildSub(builder, bld->one, a, "");
454 }
455
456
457 /**
458 * Generate a + b
459 */
460 LLVMValueRef
461 lp_build_add(struct lp_build_context *bld,
462 LLVMValueRef a,
463 LLVMValueRef b)
464 {
465 LLVMBuilderRef builder = bld->gallivm->builder;
466 const struct lp_type type = bld->type;
467 LLVMValueRef res;
468
469 assert(lp_check_value(type, a));
470 assert(lp_check_value(type, b));
471
472 if(a == bld->zero)
473 return b;
474 if(b == bld->zero)
475 return a;
476 if(a == bld->undef || b == bld->undef)
477 return bld->undef;
478
479 if(bld->type.norm) {
480 const char *intrinsic = NULL;
481
482 if(a == bld->one || b == bld->one)
483 return bld->one;
484
485 if (type.width * type.length == 128 &&
486 !type.floating && !type.fixed) {
487 if(util_cpu_caps.has_sse2) {
488 if(type.width == 8)
489 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
490 if(type.width == 16)
491 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
492 } else if (util_cpu_caps.has_altivec) {
493 if(type.width == 8)
494 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
495 if(type.width == 16)
496 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
497 }
498 }
499
500 if(intrinsic)
501 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
502 }
503
504 /* TODO: handle signed case */
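/* For unsigned normalized values, clamp a to comp(b) = 1.0 - b first,
 * so the plain add below cannot wrap and thus saturates at 1.0.
 */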
505 if(type.norm && !type.floating && !type.fixed && !type.sign)
506 a = lp_build_min_simple(bld, a, lp_build_comp(bld, b), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
507
508 if(LLVMIsConstant(a) && LLVMIsConstant(b))
509 if (type.floating)
510 res = LLVMConstFAdd(a, b);
511 else
512 res = LLVMConstAdd(a, b);
513 else
514 if (type.floating)
515 res = LLVMBuildFAdd(builder, a, b, "");
516 else
517 res = LLVMBuildAdd(builder, a, b, "");
518
519 /* clamp to ceiling of 1.0 */
520 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
521 res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
522
523 /* XXX clamp to floor of -1 or 0??? */
524
525 return res;
526 }
527
528
529 /** Return the scalar sum of the elements of a.
530 * Callers should avoid this operation whenever possible.
531 */
532 LLVMValueRef
533 lp_build_horizontal_add(struct lp_build_context *bld,
534 LLVMValueRef a)
535 {
536 LLVMBuilderRef builder = bld->gallivm->builder;
537 const struct lp_type type = bld->type;
538 LLVMValueRef index, res;
539 unsigned i, length;
540 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
541 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
542 LLVMValueRef vecres, elem2;
543
544 assert(lp_check_value(type, a));
545
546 if (type.length == 1) {
547 return a;
548 }
549
550 assert(!bld->type.norm);
551
552 /*
553 * For byte vectors we could do much better with psadbw.
554 * Using repeated shuffle/adds here. Note that with multiple vectors
555 * this can be done more efficiently as outlined in the intel
556 * optimization manual.
557 * Note: could cause data rearrangement if used with smaller element
558 * sizes.
559 */
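/* E.g. for a 4-wide vector the loop below produces <a0+a2, a1+a3> and the
 * final extract/add yields a0+a1+a2+a3.
 */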
560
561 vecres = a;
562 length = type.length / 2;
563 while (length > 1) {
564 LLVMValueRef vec1, vec2;
565 for (i = 0; i < length; i++) {
566 shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
567 shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
568 }
569 vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
570 LLVMConstVector(shuffles1, length), "");
571 vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
572 LLVMConstVector(shuffles2, length), "");
573 if (type.floating) {
574 vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
575 }
576 else {
577 vecres = LLVMBuildAdd(builder, vec1, vec2, "");
578 }
579 length = length >> 1;
580 }
581
582 /* always have vector of size 2 here */
583 assert(length == 1);
584
585 index = lp_build_const_int32(bld->gallivm, 0);
586 res = LLVMBuildExtractElement(builder, vecres, index, "");
587 index = lp_build_const_int32(bld->gallivm, 1);
588 elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
589
590 if (type.floating)
591 res = LLVMBuildFAdd(builder, res, elem2, "");
592 else
593 res = LLVMBuildAdd(builder, res, elem2, "");
594
595 return res;
596 }
597
598 /**
599 * Return the horizontal sums of 4 float vectors as a float4 vector.
600 * This uses the technique as outlined in Intel Optimization Manual.
601 */
602 static LLVMValueRef
603 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
604 LLVMValueRef src[4])
605 {
606 struct gallivm_state *gallivm = bld->gallivm;
607 LLVMBuilderRef builder = gallivm->builder;
608 LLVMValueRef shuffles[4];
609 LLVMValueRef tmp[4];
610 LLVMValueRef sumtmp[2], shuftmp[2];
611
612 /* lower half of regs */
613 shuffles[0] = lp_build_const_int32(gallivm, 0);
614 shuffles[1] = lp_build_const_int32(gallivm, 1);
615 shuffles[2] = lp_build_const_int32(gallivm, 4);
616 shuffles[3] = lp_build_const_int32(gallivm, 5);
617 tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
618 LLVMConstVector(shuffles, 4), "");
619 tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
620 LLVMConstVector(shuffles, 4), "");
621
622 /* upper half of regs */
623 shuffles[0] = lp_build_const_int32(gallivm, 2);
624 shuffles[1] = lp_build_const_int32(gallivm, 3);
625 shuffles[2] = lp_build_const_int32(gallivm, 6);
626 shuffles[3] = lp_build_const_int32(gallivm, 7);
627 tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
628 LLVMConstVector(shuffles, 4), "");
629 tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
630 LLVMConstVector(shuffles, 4), "");
631
632 sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
633 sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
634
635 shuffles[0] = lp_build_const_int32(gallivm, 0);
636 shuffles[1] = lp_build_const_int32(gallivm, 2);
637 shuffles[2] = lp_build_const_int32(gallivm, 4);
638 shuffles[3] = lp_build_const_int32(gallivm, 6);
639 shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
640 LLVMConstVector(shuffles, 4), "");
641
642 shuffles[0] = lp_build_const_int32(gallivm, 1);
643 shuffles[1] = lp_build_const_int32(gallivm, 3);
644 shuffles[2] = lp_build_const_int32(gallivm, 5);
645 shuffles[3] = lp_build_const_int32(gallivm, 7);
646 shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
647 LLVMConstVector(shuffles, 4), "");
648
649 return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
650 }
651
652
653 /*
654 * Partially horizontally add 2-4 float vectors with length nx4,
655 * i.e. only four adjacent values in each vector will be added,
656 * assuming the values are really grouped in fours, which also determines
657 * the output order.
658 *
659 * Return a vector of the same length as the initial vectors,
660 * with the excess elements (if any) being undefined.
661 * The element order is independent of number of input vectors.
662 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
663 * the output order thus will be
664 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
665 */
666 LLVMValueRef
667 lp_build_hadd_partial4(struct lp_build_context *bld,
668 LLVMValueRef vectors[],
669 unsigned num_vecs)
670 {
671 struct gallivm_state *gallivm = bld->gallivm;
672 LLVMBuilderRef builder = gallivm->builder;
673 LLVMValueRef ret_vec;
674 LLVMValueRef tmp[4];
675 const char *intrinsic = NULL;
676
677 assert(num_vecs >= 2 && num_vecs <= 4);
678 assert(bld->type.floating);
679
680 /* only use this with at least 2 vectors, as it is sort of expensive
681 * (depending on cpu) and we always need two horizontal adds anyway,
682 * so a shuffle/add approach might be better.
683 */
684
685 tmp[0] = vectors[0];
686 tmp[1] = vectors[1];
687
688 tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
689 tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
690
691 if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
692 bld->type.length == 4) {
693 intrinsic = "llvm.x86.sse3.hadd.ps";
694 }
695 else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
696 bld->type.length == 8) {
697 intrinsic = "llvm.x86.avx.hadd.ps.256";
698 }
699 if (intrinsic) {
700 tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
701 lp_build_vec_type(gallivm, bld->type),
702 tmp[0], tmp[1]);
703 if (num_vecs > 2) {
704 tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
705 lp_build_vec_type(gallivm, bld->type),
706 tmp[2], tmp[3]);
707 }
708 else {
709 tmp[1] = tmp[0];
710 }
711 return lp_build_intrinsic_binary(builder, intrinsic,
712 lp_build_vec_type(gallivm, bld->type),
713 tmp[0], tmp[1]);
714 }
715
716 if (bld->type.length == 4) {
717 ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
718 }
719 else {
720 LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
721 unsigned j;
722 unsigned num_iter = bld->type.length / 4;
723 struct lp_type parttype = bld->type;
724 parttype.length = 4;
725 for (j = 0; j < num_iter; j++) {
726 LLVMValueRef partsrc[4];
727 unsigned i;
728 for (i = 0; i < 4; i++) {
729 partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
730 }
731 partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
732 }
733 ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
734 }
735 return ret_vec;
736 }
737
738 /**
739 * Generate a - b
740 */
741 LLVMValueRef
742 lp_build_sub(struct lp_build_context *bld,
743 LLVMValueRef a,
744 LLVMValueRef b)
745 {
746 LLVMBuilderRef builder = bld->gallivm->builder;
747 const struct lp_type type = bld->type;
748 LLVMValueRef res;
749
750 assert(lp_check_value(type, a));
751 assert(lp_check_value(type, b));
752
753 if(b == bld->zero)
754 return a;
755 if(a == bld->undef || b == bld->undef)
756 return bld->undef;
757 if(a == b)
758 return bld->zero;
759
760 if(bld->type.norm) {
761 const char *intrinsic = NULL;
762
763 if(b == bld->one)
764 return bld->zero;
765
766 if (type.width * type.length == 128 &&
767 !type.floating && !type.fixed) {
768 if (util_cpu_caps.has_sse2) {
769 if(type.width == 8)
770 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
771 if(type.width == 16)
772 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
773 } else if (util_cpu_caps.has_altivec) {
774 if(type.width == 8)
775 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
776 if(type.width == 16)
777 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
778 }
779 }
780
781 if(intrinsic)
782 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
783 }
784
785 /* TODO: handle signed case */
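/* For unsigned normalized values, clamp a to at least b first, so the
 * plain subtract below cannot wrap and thus saturates at 0.
 */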
786 if(type.norm && !type.floating && !type.fixed && !type.sign)
787 a = lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
788
789 if(LLVMIsConstant(a) && LLVMIsConstant(b))
790 if (type.floating)
791 res = LLVMConstFSub(a, b);
792 else
793 res = LLVMConstSub(a, b);
794 else
795 if (type.floating)
796 res = LLVMBuildFSub(builder, a, b, "");
797 else
798 res = LLVMBuildSub(builder, a, b, "");
799
800 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
801 res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
802
803 return res;
804 }
805
806
807
808 /**
809 * Normalized multiplication.
810 *
811 * There are several approaches for (using 8-bit normalized multiplication as
812 * an example):
813 *
814 * - alpha plus one
815 *
816 * makes the following approximation to the division (Sree)
817 *
818 * a*b/255 ~= (a*(b + 1)) >> 8
819 *
820 * which is the fastest method that satisfies the following OpenGL criteria of
821 *
822 * 0*0 = 0 and 255*255 = 255
823 *
824 * - geometric series
825 *
826 * takes the geometric series approximation to the division
827 *
828 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
829 *
830 * in this case just the first two terms are used, to fit in 16-bit arithmetic
831 *
832 * t/255 ~= (t + (t >> 8)) >> 8
833 *
834 * note that just by itself it doesn't satisfy the OpenGL criteria, as
835 * 255*255 = 254, so the special case b = 255 must be accounted for, or
836 * roundoff must be used.
837 *
838 * - geometric series plus rounding
839 *
840 * when using the geometric series division, instead of truncating the result,
841 * use roundoff in the approximation (Jim Blinn)
842 *
843 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
844 *
845 * achieving the exact results.
846 *
847 *
848 *
849 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
850 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
851 * @sa Michael Herf, The "double blend trick", May 2000,
852 * http://www.stereopsis.com/doubleblend.html
853 */
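/*
 * Worked example of the rounded formula with n = 8: a = b = 255 gives
 * t = 65025, (t + (t >> 8) + 0x80) >> 8 = 255, and a = b = 0 gives 0, so
 * both OpenGL anchor cases above hold exactly.
 */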
854 static LLVMValueRef
855 lp_build_mul_norm(struct gallivm_state *gallivm,
856 struct lp_type wide_type,
857 LLVMValueRef a, LLVMValueRef b)
858 {
859 LLVMBuilderRef builder = gallivm->builder;
860 struct lp_build_context bld;
861 unsigned n;
862 LLVMValueRef half;
863 LLVMValueRef ab;
864
865 assert(!wide_type.floating);
866 assert(lp_check_value(wide_type, a));
867 assert(lp_check_value(wide_type, b));
868
869 lp_build_context_init(&bld, gallivm, wide_type);
870
871 n = wide_type.width / 2;
872 if (wide_type.sign) {
873 --n;
874 }
875
876 /*
877 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
878 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
879 */
880
881 /*
882 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
883 */
884
885 ab = LLVMBuildMul(builder, a, b, "");
886 ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
887
888 /*
889 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
890 */
891
892 half = lp_build_const_int_vec(gallivm, wide_type, 1 << (n - 1));
893 if (wide_type.sign) {
894 LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
895 LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
896 half = lp_build_select(&bld, sign, minus_half, half);
897 }
898 ab = LLVMBuildAdd(builder, ab, half, "");
899
900 /* Final division */
901 ab = lp_build_shr_imm(&bld, ab, n);
902
903 return ab;
904 }
905
906 /**
907 * Generate a * b
908 */
909 LLVMValueRef
910 lp_build_mul(struct lp_build_context *bld,
911 LLVMValueRef a,
912 LLVMValueRef b)
913 {
914 LLVMBuilderRef builder = bld->gallivm->builder;
915 const struct lp_type type = bld->type;
916 LLVMValueRef shift;
917 LLVMValueRef res;
918
919 assert(lp_check_value(type, a));
920 assert(lp_check_value(type, b));
921
922 if(a == bld->zero)
923 return bld->zero;
924 if(a == bld->one)
925 return b;
926 if(b == bld->zero)
927 return bld->zero;
928 if(b == bld->one)
929 return a;
930 if(a == bld->undef || b == bld->undef)
931 return bld->undef;
932
933 if (!type.floating && !type.fixed && type.norm) {
934 struct lp_type wide_type = lp_wider_type(type);
935 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
936
937 lp_build_unpack2(bld->gallivm, type, wide_type, a, &al, &ah);
938 lp_build_unpack2(bld->gallivm, type, wide_type, b, &bl, &bh);
939
940 /* PMULLW, PSRLW, PADDW */
941 abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
942 abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
943
944 ab = lp_build_pack2(bld->gallivm, wide_type, type, abl, abh);
945
946 return ab;
947 }
948
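/* For fixed point the raw product carries twice as many fraction bits,
 * so shift right by width/2 afterwards to restore the format.
 */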
949 if(type.fixed)
950 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
951 else
952 shift = NULL;
953
954 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
955 if (type.floating)
956 res = LLVMConstFMul(a, b);
957 else
958 res = LLVMConstMul(a, b);
959 if(shift) {
960 if(type.sign)
961 res = LLVMConstAShr(res, shift);
962 else
963 res = LLVMConstLShr(res, shift);
964 }
965 }
966 else {
967 if (type.floating)
968 res = LLVMBuildFMul(builder, a, b, "");
969 else
970 res = LLVMBuildMul(builder, a, b, "");
971 if(shift) {
972 if(type.sign)
973 res = LLVMBuildAShr(builder, res, shift, "");
974 else
975 res = LLVMBuildLShr(builder, res, shift, "");
976 }
977 }
978
979 return res;
980 }
981
982
983 /**
984 * Small vector x scale multiplication optimization.
985 */
986 LLVMValueRef
987 lp_build_mul_imm(struct lp_build_context *bld,
988 LLVMValueRef a,
989 int b)
990 {
991 LLVMBuilderRef builder = bld->gallivm->builder;
992 LLVMValueRef factor;
993
994 assert(lp_check_value(bld->type, a));
995
996 if(b == 0)
997 return bld->zero;
998
999 if(b == 1)
1000 return a;
1001
1002 if(b == -1)
1003 return lp_build_negate(bld, a);
1004
1005 if(b == 2 && bld->type.floating)
1006 return lp_build_add(bld, a, a);
1007
1008 if(util_is_power_of_two(b)) {
1009 unsigned shift = ffs(b) - 1;
1010
1011 if(bld->type.floating) {
1012 #if 0
1013 /*
1014 * Power of two multiplication by directly manipulating the exponent.
1015 *
1016 * XXX: This might not be always faster, it will introduce a small error
1017 * for multiplication by zero, and it will produce wrong results
1018 * for Inf and NaN.
1019 */
1020 unsigned mantissa = lp_mantissa(bld->type);
1021 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1022 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1023 a = LLVMBuildAdd(builder, a, factor, "");
1024 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1025 return a;
1026 #endif
1027 }
1028 else {
1029 factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1030 return LLVMBuildShl(builder, a, factor, "");
1031 }
1032 }
1033
1034 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1035 return lp_build_mul(bld, a, factor);
1036 }
1037
1038
1039 /**
1040 * Generate a / b
1041 */
1042 LLVMValueRef
1043 lp_build_div(struct lp_build_context *bld,
1044 LLVMValueRef a,
1045 LLVMValueRef b)
1046 {
1047 LLVMBuilderRef builder = bld->gallivm->builder;
1048 const struct lp_type type = bld->type;
1049
1050 assert(lp_check_value(type, a));
1051 assert(lp_check_value(type, b));
1052
1053 if(a == bld->zero)
1054 return bld->zero;
1055 if(a == bld->one)
1056 return lp_build_rcp(bld, b);
1057 if(b == bld->zero)
1058 return bld->undef;
1059 if(b == bld->one)
1060 return a;
1061 if(a == bld->undef || b == bld->undef)
1062 return bld->undef;
1063
1064 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1065 if (type.floating)
1066 return LLVMConstFDiv(a, b);
1067 else if (type.sign)
1068 return LLVMConstSDiv(a, b);
1069 else
1070 return LLVMConstUDiv(a, b);
1071 }
1072
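/* On SSE/AVX, multiplying by the reciprocal is considerably faster than a
 * true divide, at a small cost in precision.
 */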
1073 if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1074 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
1075 type.floating)
1076 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1077
1078 if (type.floating)
1079 return LLVMBuildFDiv(builder, a, b, "");
1080 else if (type.sign)
1081 return LLVMBuildSDiv(builder, a, b, "");
1082 else
1083 return LLVMBuildUDiv(builder, a, b, "");
1084 }
1085
1086
1087 /**
1088 * Linear interpolation helper.
1089 *
1090 * @param flags LP_BLD_LERP_WIDE_NORMALIZED if we are interpolating normalized
1091 * values, encoded in integers twice as wide.
1092 *
1093 * @sa http://www.stereopsis.com/doubleblend.html
1094 */
1095 static INLINE LLVMValueRef
1096 lp_build_lerp_simple(struct lp_build_context *bld,
1097 LLVMValueRef x,
1098 LLVMValueRef v0,
1099 LLVMValueRef v1,
1100 unsigned flags)
1101 {
1102 unsigned half_width = bld->type.width/2;
1103 LLVMBuilderRef builder = bld->gallivm->builder;
1104 LLVMValueRef delta;
1105 LLVMValueRef res;
1106
1107 assert(lp_check_value(bld->type, x));
1108 assert(lp_check_value(bld->type, v0));
1109 assert(lp_check_value(bld->type, v1));
1110
1111 delta = lp_build_sub(bld, v1, v0);
1112
1113 if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1114 if (!bld->type.sign) {
1115 if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1116 /*
1117 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1118 * most-significant bit to the least-significant bit, so that
1119 * later we can just divide by 2**n instead of 2**n - 1.
1120 */
1121
1122 x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
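/* E.g. with n = 8 a weight of 255 becomes 255 + (255 >> 7) = 256,
 * so the shift by 8 below maps maximum weight to exactly delta.
 */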
1123 }
1124
1125 /* (x * delta) >> n */
1126 res = lp_build_mul(bld, x, delta);
1127 res = lp_build_shr_imm(bld, res, half_width);
1128 } else {
1129 /*
1130 * The rescaling trick above doesn't work for signed numbers, so
1131 * use the 2**n - 1 division approximation in lp_build_mul_norm
1132 * instead.
1133 */
1134 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1135 res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1136 }
1137 } else {
1138 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1139 res = lp_build_mul(bld, x, delta);
1140 }
1141
1142 res = lp_build_add(bld, v0, res);
1143
1144 if (((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) ||
1145 bld->type.fixed) {
1146 /* We need to mask out the high order bits when lerping 8-bit normalized colors stored in 16 bits */
1147 /* XXX: This step is necessary for lerping 8-bit colors stored in 16 bits,
1148 * but it will be wrong for true fixed point use cases. Basically we need
1149 * a more powerful lp_type, capable of further distinguishing the value
1150 * interpretation from the value storage. */
1151 res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1), "");
1152 }
1153
1154 return res;
1155 }
1156
1157
1158 /**
1159 * Linear interpolation.
1160 */
1161 LLVMValueRef
1162 lp_build_lerp(struct lp_build_context *bld,
1163 LLVMValueRef x,
1164 LLVMValueRef v0,
1165 LLVMValueRef v1,
1166 unsigned flags)
1167 {
1168 const struct lp_type type = bld->type;
1169 LLVMValueRef res;
1170
1171 assert(lp_check_value(type, x));
1172 assert(lp_check_value(type, v0));
1173 assert(lp_check_value(type, v1));
1174
1175 assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1176
1177 if (type.norm) {
1178 struct lp_type wide_type;
1179 struct lp_build_context wide_bld;
1180 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1181
1182 assert(type.length >= 2);
1183
1184 /*
1185 * Create a wider integer type, enough to hold the
1186 * intermediate result of the multiplication.
1187 */
1188 memset(&wide_type, 0, sizeof wide_type);
1189 wide_type.sign = type.sign;
1190 wide_type.width = type.width*2;
1191 wide_type.length = type.length/2;
1192
1193 lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1194
1195 lp_build_unpack2(bld->gallivm, type, wide_type, x, &xl, &xh);
1196 lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1197 lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1198
1199 /*
1200 * Lerp both halves.
1201 */
1202
1203 flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1204
1205 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1206 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1207
1208 res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
1209 } else {
1210 res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1211 }
1212
1213 return res;
1214 }
1215
1216
1217 /**
1218 * Bilinear interpolation.
1219 *
1220 * Value indices are in v_{yx}.
1221 */
1222 LLVMValueRef
1223 lp_build_lerp_2d(struct lp_build_context *bld,
1224 LLVMValueRef x,
1225 LLVMValueRef y,
1226 LLVMValueRef v00,
1227 LLVMValueRef v01,
1228 LLVMValueRef v10,
1229 LLVMValueRef v11,
1230 unsigned flags)
1231 {
1232 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1233 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1234 return lp_build_lerp(bld, y, v0, v1, flags);
1235 }
1236
1237
1238 LLVMValueRef
1239 lp_build_lerp_3d(struct lp_build_context *bld,
1240 LLVMValueRef x,
1241 LLVMValueRef y,
1242 LLVMValueRef z,
1243 LLVMValueRef v000,
1244 LLVMValueRef v001,
1245 LLVMValueRef v010,
1246 LLVMValueRef v011,
1247 LLVMValueRef v100,
1248 LLVMValueRef v101,
1249 LLVMValueRef v110,
1250 LLVMValueRef v111,
1251 unsigned flags)
1252 {
1253 LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1254 LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1255 return lp_build_lerp(bld, z, v0, v1, flags);
1256 }
1257
1258
1259 /**
1260 * Generate min(a, b)
1261 * Do checks for special cases, but NaN behavior is undefined.
1262 */
1263 LLVMValueRef
1264 lp_build_min(struct lp_build_context *bld,
1265 LLVMValueRef a,
1266 LLVMValueRef b)
1267 {
1268 assert(lp_check_value(bld->type, a));
1269 assert(lp_check_value(bld->type, b));
1270
1271 if(a == bld->undef || b == bld->undef)
1272 return bld->undef;
1273
1274 if(a == b)
1275 return a;
1276
1277 if (bld->type.norm) {
1278 if (!bld->type.sign) {
1279 if (a == bld->zero || b == bld->zero) {
1280 return bld->zero;
1281 }
1282 }
1283 if(a == bld->one)
1284 return b;
1285 if(b == bld->one)
1286 return a;
1287 }
1288
1289 return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1290 }
1291
1292
1293 /**
1294 * Generate min(a, b)
1295 * NaN's are handled according to the behavior specified by the
1296 * nan_behavior argument.
1297 */
1298 LLVMValueRef
1299 lp_build_min_ext(struct lp_build_context *bld,
1300 LLVMValueRef a,
1301 LLVMValueRef b,
1302 enum gallivm_nan_behavior nan_behavior)
1303 {
1304 assert(lp_check_value(bld->type, a));
1305 assert(lp_check_value(bld->type, b));
1306
1307 if(a == bld->undef || b == bld->undef)
1308 return bld->undef;
1309
1310 if(a == b)
1311 return a;
1312
1313 if (bld->type.norm) {
1314 if (!bld->type.sign) {
1315 if (a == bld->zero || b == bld->zero) {
1316 return bld->zero;
1317 }
1318 }
1319 if(a == bld->one)
1320 return b;
1321 if(b == bld->one)
1322 return a;
1323 }
1324
1325 return lp_build_min_simple(bld, a, b, nan_behavior);
1326 }
1327
1328 /**
1329 * Generate max(a, b)
1330 * Do checks for special cases, but NaN behavior is undefined.
1331 */
1332 LLVMValueRef
1333 lp_build_max(struct lp_build_context *bld,
1334 LLVMValueRef a,
1335 LLVMValueRef b)
1336 {
1337 assert(lp_check_value(bld->type, a));
1338 assert(lp_check_value(bld->type, b));
1339
1340 if(a == bld->undef || b == bld->undef)
1341 return bld->undef;
1342
1343 if(a == b)
1344 return a;
1345
1346 if(bld->type.norm) {
1347 if(a == bld->one || b == bld->one)
1348 return bld->one;
1349 if (!bld->type.sign) {
1350 if (a == bld->zero) {
1351 return b;
1352 }
1353 if (b == bld->zero) {
1354 return a;
1355 }
1356 }
1357 }
1358
1359 return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1360 }
1361
1362
1363 /**
1364 * Generate max(a, b)
1365 * Checks for special cases.
1366 * NaN's are handled according to the behavior specified by the
1367 * nan_behavior argument.
1368 */
1369 LLVMValueRef
1370 lp_build_max_ext(struct lp_build_context *bld,
1371 LLVMValueRef a,
1372 LLVMValueRef b,
1373 enum gallivm_nan_behavior nan_behavior)
1374 {
1375 assert(lp_check_value(bld->type, a));
1376 assert(lp_check_value(bld->type, b));
1377
1378 if(a == bld->undef || b == bld->undef)
1379 return bld->undef;
1380
1381 if(a == b)
1382 return a;
1383
1384 if(bld->type.norm) {
1385 if(a == bld->one || b == bld->one)
1386 return bld->one;
1387 if (!bld->type.sign) {
1388 if (a == bld->zero) {
1389 return b;
1390 }
1391 if (b == bld->zero) {
1392 return a;
1393 }
1394 }
1395 }
1396
1397 return lp_build_max_simple(bld, a, b, nan_behavior);
1398 }
1399
1400 /**
1401 * Generate clamp(a, min, max)
1402 * Do checks for special cases.
1403 */
1404 LLVMValueRef
1405 lp_build_clamp(struct lp_build_context *bld,
1406 LLVMValueRef a,
1407 LLVMValueRef min,
1408 LLVMValueRef max)
1409 {
1410 assert(lp_check_value(bld->type, a));
1411 assert(lp_check_value(bld->type, min));
1412 assert(lp_check_value(bld->type, max));
1413
1414 a = lp_build_min(bld, a, max);
1415 a = lp_build_max(bld, a, min);
1416 return a;
1417 }
1418
1419
1420 /**
1421 * Generate abs(a)
1422 */
1423 LLVMValueRef
1424 lp_build_abs(struct lp_build_context *bld,
1425 LLVMValueRef a)
1426 {
1427 LLVMBuilderRef builder = bld->gallivm->builder;
1428 const struct lp_type type = bld->type;
1429 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1430
1431 assert(lp_check_value(type, a));
1432
1433 if(!type.sign)
1434 return a;
1435
1436 if(type.floating) {
1437 /* Mask out the sign bit */
1438 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1439 unsigned long long absMask = ~(1ULL << (type.width - 1));
1440 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
1441 a = LLVMBuildBitCast(builder, a, int_vec_type, "");
1442 a = LLVMBuildAnd(builder, a, mask, "");
1443 a = LLVMBuildBitCast(builder, a, vec_type, "");
1444 return a;
1445 }
1446
1447 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
1448 switch(type.width) {
1449 case 8:
1450 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1451 case 16:
1452 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1453 case 32:
1454 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1455 }
1456 }
1457 else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
1458 (gallivm_debug & GALLIVM_DEBUG_PERF) &&
1459 (type.width == 8 || type.width == 16 || type.width == 32)) {
1460 debug_printf("%s: inefficient code, should split vectors manually\n",
1461 __FUNCTION__);
1462 }
1463
1464 return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
1465 }
1466
1467
1468 LLVMValueRef
1469 lp_build_negate(struct lp_build_context *bld,
1470 LLVMValueRef a)
1471 {
1472 LLVMBuilderRef builder = bld->gallivm->builder;
1473
1474 assert(lp_check_value(bld->type, a));
1475
1476 #if HAVE_LLVM >= 0x0207
1477 if (bld->type.floating)
1478 a = LLVMBuildFNeg(builder, a, "");
1479 else
1480 #endif
1481 a = LLVMBuildNeg(builder, a, "");
1482
1483 return a;
1484 }
1485
1486
1487 /** Return -1, 0 or +1 depending on the sign of a */
1488 LLVMValueRef
1489 lp_build_sgn(struct lp_build_context *bld,
1490 LLVMValueRef a)
1491 {
1492 LLVMBuilderRef builder = bld->gallivm->builder;
1493 const struct lp_type type = bld->type;
1494 LLVMValueRef cond;
1495 LLVMValueRef res;
1496
1497 assert(lp_check_value(type, a));
1498
1499 /* Handle non-zero case */
1500 if(!type.sign) {
1501 /* if not zero then sign must be positive */
1502 res = bld->one;
1503 }
1504 else if(type.floating) {
1505 LLVMTypeRef vec_type;
1506 LLVMTypeRef int_type;
1507 LLVMValueRef mask;
1508 LLVMValueRef sign;
1509 LLVMValueRef one;
1510 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1511
1512 int_type = lp_build_int_vec_type(bld->gallivm, type);
1513 vec_type = lp_build_vec_type(bld->gallivm, type);
1514 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1515
1516 /* Take the sign bit and or it into the constant 1.0 */
1517 sign = LLVMBuildBitCast(builder, a, int_type, "");
1518 sign = LLVMBuildAnd(builder, sign, mask, "");
1519 one = LLVMConstBitCast(bld->one, int_type);
1520 res = LLVMBuildOr(builder, sign, one, "");
1521 res = LLVMBuildBitCast(builder, res, vec_type, "");
1522 }
1523 else
1524 {
1525 /* signed int/norm/fixed point */
1526 /* could use psign with sse3 and appropriate vectors here */
1527 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1528 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1529 res = lp_build_select(bld, cond, bld->one, minus_one);
1530 }
1531
1532 /* Handle zero */
1533 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1534 res = lp_build_select(bld, cond, bld->zero, res);
1535
1536 return res;
1537 }
1538
1539
1540 /**
1541 * Set the sign of float vector 'a' according to 'sign'.
1542 * If sign==0, return abs(a).
1543 * If sign==1, return -abs(a);
1544 * Other values for sign produce undefined results.
1545 */
1546 LLVMValueRef
1547 lp_build_set_sign(struct lp_build_context *bld,
1548 LLVMValueRef a, LLVMValueRef sign)
1549 {
1550 LLVMBuilderRef builder = bld->gallivm->builder;
1551 const struct lp_type type = bld->type;
1552 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1553 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1554 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1555 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1556 ~((unsigned long long) 1 << (type.width - 1)));
1557 LLVMValueRef val, res;
1558
1559 assert(type.floating);
1560 assert(lp_check_value(type, a));
1561
1562 /* val = reinterpret_cast<int>(a) */
1563 val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1564 /* val = val & mask */
1565 val = LLVMBuildAnd(builder, val, mask, "");
1566 /* sign = sign << shift */
1567 sign = LLVMBuildShl(builder, sign, shift, "");
1568 /* res = val | sign */
1569 res = LLVMBuildOr(builder, val, sign, "");
1570 /* res = reinterpret_cast<float>(res) */
1571 res = LLVMBuildBitCast(builder, res, vec_type, "");
1572
1573 return res;
1574 }
1575
1576
1577 /**
1578 * Convert vector of (or scalar) int to vector of (or scalar) float.
1579 */
1580 LLVMValueRef
1581 lp_build_int_to_float(struct lp_build_context *bld,
1582 LLVMValueRef a)
1583 {
1584 LLVMBuilderRef builder = bld->gallivm->builder;
1585 const struct lp_type type = bld->type;
1586 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1587
1588 assert(type.floating);
1589
1590 return LLVMBuildSIToFP(builder, a, vec_type, "");
1591 }
1592
1593 static boolean
1594 arch_rounding_available(const struct lp_type type)
1595 {
1596 if ((util_cpu_caps.has_sse4_1 &&
1597 (type.length == 1 || type.width*type.length == 128)) ||
1598 (util_cpu_caps.has_avx && type.width*type.length == 256))
1599 return TRUE;
1600 else if ((util_cpu_caps.has_altivec &&
1601 (type.width == 32 && type.length == 4)))
1602 return TRUE;
1603
1604 return FALSE;
1605 }
1606
1607 enum lp_build_round_mode
1608 {
1609 LP_BUILD_ROUND_NEAREST = 0,
1610 LP_BUILD_ROUND_FLOOR = 1,
1611 LP_BUILD_ROUND_CEIL = 2,
1612 LP_BUILD_ROUND_TRUNCATE = 3
1613 };
1614
1615 /**
1616 * Helper for SSE4.1's ROUNDxx instructions.
1617 *
1618 * NOTE: In SSE4.1's nearest mode, if two values are equally close, the
1619 * result is the even value. That is, rounding 2.5 will be 2.0, and not 3.0.
1620 */
1621 static INLINE LLVMValueRef
1622 lp_build_round_sse41(struct lp_build_context *bld,
1623 LLVMValueRef a,
1624 enum lp_build_round_mode mode)
1625 {
1626 LLVMBuilderRef builder = bld->gallivm->builder;
1627 const struct lp_type type = bld->type;
1628 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1629 const char *intrinsic;
1630 LLVMValueRef res;
1631
1632 assert(type.floating);
1633
1634 assert(lp_check_value(type, a));
1635 assert(util_cpu_caps.has_sse4_1);
1636
1637 if (type.length == 1) {
1638 LLVMTypeRef vec_type;
1639 LLVMValueRef undef;
1640 LLVMValueRef args[3];
1641 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1642
1643 switch(type.width) {
1644 case 32:
1645 intrinsic = "llvm.x86.sse41.round.ss";
1646 break;
1647 case 64:
1648 intrinsic = "llvm.x86.sse41.round.sd";
1649 break;
1650 default:
1651 assert(0);
1652 return bld->undef;
1653 }
1654
1655 vec_type = LLVMVectorType(bld->elem_type, 4);
1656
1657 undef = LLVMGetUndef(vec_type);
1658
1659 args[0] = undef;
1660 args[1] = LLVMBuildInsertElement(builder, undef, a, index0, "");
1661 args[2] = LLVMConstInt(i32t, mode, 0);
1662
1663 res = lp_build_intrinsic(builder, intrinsic,
1664 vec_type, args, Elements(args));
1665
1666 res = LLVMBuildExtractElement(builder, res, index0, "");
1667 }
1668 else {
1669 if (type.width * type.length == 128) {
1670 switch(type.width) {
1671 case 32:
1672 intrinsic = "llvm.x86.sse41.round.ps";
1673 break;
1674 case 64:
1675 intrinsic = "llvm.x86.sse41.round.pd";
1676 break;
1677 default:
1678 assert(0);
1679 return bld->undef;
1680 }
1681 }
1682 else {
1683 assert(type.width * type.length == 256);
1684 assert(util_cpu_caps.has_avx);
1685
1686 switch(type.width) {
1687 case 32:
1688 intrinsic = "llvm.x86.avx.round.ps.256";
1689 break;
1690 case 64:
1691 intrinsic = "llvm.x86.avx.round.pd.256";
1692 break;
1693 default:
1694 assert(0);
1695 return bld->undef;
1696 }
1697 }
1698
1699 res = lp_build_intrinsic_binary(builder, intrinsic,
1700 bld->vec_type, a,
1701 LLVMConstInt(i32t, mode, 0));
1702 }
1703
1704 return res;
1705 }
1706
1707
1708 static INLINE LLVMValueRef
1709 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1710 LLVMValueRef a)
1711 {
1712 LLVMBuilderRef builder = bld->gallivm->builder;
1713 const struct lp_type type = bld->type;
1714 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1715 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1716 const char *intrinsic;
1717 LLVMValueRef res;
1718
1719 assert(type.floating);
1720 /* using the double precision conversions is a bit more complicated */
1721 assert(type.width == 32);
1722
1723 assert(lp_check_value(type, a));
1724 assert(util_cpu_caps.has_sse2);
1725
1726 /* This is relying on MXCSR rounding mode, which should always be nearest. */
1727 if (type.length == 1) {
1728 LLVMTypeRef vec_type;
1729 LLVMValueRef undef;
1730 LLVMValueRef arg;
1731 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1732
1733 vec_type = LLVMVectorType(bld->elem_type, 4);
1734
1735 intrinsic = "llvm.x86.sse.cvtss2si";
1736
1737 undef = LLVMGetUndef(vec_type);
1738
1739 arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
1740
1741 res = lp_build_intrinsic_unary(builder, intrinsic,
1742 ret_type, arg);
1743 }
1744 else {
1745 if (type.width * type.length == 128) {
1746 intrinsic = "llvm.x86.sse2.cvtps2dq";
1747 }
1748 else {
1749 assert(type.width*type.length == 256);
1750 assert(util_cpu_caps.has_avx);
1751
1752 intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
1753 }
1754 res = lp_build_intrinsic_unary(builder, intrinsic,
1755 ret_type, a);
1756 }
1757
1758 return res;
1759 }
1760
1761
1762 /*
1763 * Round using the AltiVec vrfin/vrfim/vrfip/vrfiz instructions, selected by mode. */
1764 static INLINE LLVMValueRef
1765 lp_build_round_altivec(struct lp_build_context *bld,
1766 LLVMValueRef a,
1767 enum lp_build_round_mode mode)
1768 {
1769 LLVMBuilderRef builder = bld->gallivm->builder;
1770 const struct lp_type type = bld->type;
1771 const char *intrinsic = NULL;
1772
1773 assert(type.floating);
1774
1775 assert(lp_check_value(type, a));
1776 assert(util_cpu_caps.has_altivec);
1777
1778 switch (mode) {
1779 case LP_BUILD_ROUND_NEAREST:
1780 intrinsic = "llvm.ppc.altivec.vrfin";
1781 break;
1782 case LP_BUILD_ROUND_FLOOR:
1783 intrinsic = "llvm.ppc.altivec.vrfim";
1784 break;
1785 case LP_BUILD_ROUND_CEIL:
1786 intrinsic = "llvm.ppc.altivec.vrfip";
1787 break;
1788 case LP_BUILD_ROUND_TRUNCATE:
1789 intrinsic = "llvm.ppc.altivec.vrfiz";
1790 break;
1791 }
1792
1793 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1794 }
1795
1796 static INLINE LLVMValueRef
1797 lp_build_round_arch(struct lp_build_context *bld,
1798 LLVMValueRef a,
1799 enum lp_build_round_mode mode)
1800 {
1801 if (util_cpu_caps.has_sse4_1)
1802 return lp_build_round_sse41(bld, a, mode);
1803 else /* (util_cpu_caps.has_altivec) */
1804 return lp_build_round_altivec(bld, a, mode);
1805 }
1806
1807 /**
1808 * Return the integer part of a float (vector) value (== round toward zero).
1809 * The returned value is a float (vector).
1810 * Ex: trunc(-1.5) = -1.0
1811 */
1812 LLVMValueRef
1813 lp_build_trunc(struct lp_build_context *bld,
1814 LLVMValueRef a)
1815 {
1816 LLVMBuilderRef builder = bld->gallivm->builder;
1817 const struct lp_type type = bld->type;
1818
1819 assert(type.floating);
1820 assert(lp_check_value(type, a));
1821
1822 if (arch_rounding_available(type)) {
1823 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
1824 }
1825 else {
1826 const struct lp_type type = bld->type;
1827 struct lp_type inttype;
1828 struct lp_build_context intbld;
1829 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
1830 LLVMValueRef trunc, res, anosign, mask;
1831 LLVMTypeRef int_vec_type = bld->int_vec_type;
1832 LLVMTypeRef vec_type = bld->vec_type;
1833
1834 assert(type.width == 32); /* might want to handle doubles at some point */
1835
1836 inttype = type;
1837 inttype.floating = 0;
1838 lp_build_context_init(&intbld, bld->gallivm, inttype);
1839
1840 /* round by truncation */
1841 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1842 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
1843
1844 /* mask out sign bit */
1845 anosign = lp_build_abs(bld, a);
1846 /*
1847 * mask out all values if anosign > 2^24
1848 * This should work both for large ints (all rounding is no-op for them
1849 * because such floats are always exact) as well as special cases like
1850 * NaNs, Infs (taking advantage of the fact they use max exponent).
1851 * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
1852 */
1853 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1854 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1855 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1856 return lp_build_select(bld, mask, a, res);
1857 }
1858 }
1859
1860
1861 /**
1862 * Return float (vector) rounded to nearest integer (vector). The returned
1863 * value is a float (vector).
1864 * Ex: round(0.9) = 1.0
1865 * Ex: round(-1.5) = -2.0
1866 */
1867 LLVMValueRef
1868 lp_build_round(struct lp_build_context *bld,
1869 LLVMValueRef a)
1870 {
1871 LLVMBuilderRef builder = bld->gallivm->builder;
1872 const struct lp_type type = bld->type;
1873
1874 assert(type.floating);
1875 assert(lp_check_value(type, a));
1876
1877 if (arch_rounding_available(type)) {
1878 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
1879 }
1880 else {
1881 const struct lp_type type = bld->type;
1882 struct lp_type inttype;
1883 struct lp_build_context intbld;
1884 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
1885 LLVMValueRef res, anosign, mask;
1886 LLVMTypeRef int_vec_type = bld->int_vec_type;
1887 LLVMTypeRef vec_type = bld->vec_type;
1888
1889 assert(type.width == 32); /* might want to handle doubles at some point */
1890
1891 inttype = type;
1892 inttype.floating = 0;
1893 lp_build_context_init(&intbld, bld->gallivm, inttype);
1894
1895 res = lp_build_iround(bld, a);
1896 res = LLVMBuildSIToFP(builder, res, vec_type, "");
1897
1898 /* mask out sign bit */
1899 anosign = lp_build_abs(bld, a);
1900 /*
1901 * mask out all values if anosign > 2^24
1902 * This should work both for large ints (all rounding is no-op for them
1903 * because such floats are always exact) as well as special cases like
1904 * NaNs, Infs (taking advantage of the fact they use max exponent).
1905 * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
1906 */
1907 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1908 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1909 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1910 return lp_build_select(bld, mask, a, res);
1911 }
1912 }
1913
1914
1915 /**
1916 * Return floor of float (vector), result is a float (vector)
1917 * Ex: floor(1.1) = 1.0
1918 * Ex: floor(-1.1) = -2.0
1919 */
1920 LLVMValueRef
1921 lp_build_floor(struct lp_build_context *bld,
1922 LLVMValueRef a)
1923 {
1924 LLVMBuilderRef builder = bld->gallivm->builder;
1925 const struct lp_type type = bld->type;
1926
1927 assert(type.floating);
1928 assert(lp_check_value(type, a));
1929
1930 if (arch_rounding_available(type)) {
1931 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
1932 }
1933 else {
1934 const struct lp_type type = bld->type;
1935 struct lp_type inttype;
1936 struct lp_build_context intbld;
1937 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 16777216.0); /* 2^24 */
1938 LLVMValueRef trunc, res, anosign, mask;
1939 LLVMTypeRef int_vec_type = bld->int_vec_type;
1940 LLVMTypeRef vec_type = bld->vec_type;
1941
1942 assert(type.width == 32); /* might want to handle doubles at some point */
1943
1944 inttype = type;
1945 inttype.floating = 0;
1946 lp_build_context_init(&intbld, bld->gallivm, inttype);
1947
1948 /* round by truncation */
1949 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1950 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
1951
1952 if (type.sign) {
1953 LLVMValueRef tmp;
1954
1955 /*
1956 * fix values if rounding is wrong (for non-special cases)
1957 * - this is the case if trunc > a
1958 */
1959 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
1960 /* tmp = trunc > a ? 1.0 : 0.0 */
1961 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
1962 tmp = lp_build_and(&intbld, mask, tmp);
1963 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
1964 res = lp_build_sub(bld, res, tmp);
1965 }
1966
1967 /* mask out sign bit */
1968 anosign = lp_build_abs(bld, a);
1969 /*
1970 * mask out all values if anosign > 2^24
1971 * This should work both for large ints (all rounding is no-op for them
1972 * because such floats are always exact) as well as special cases like
1973 * NaNs, Infs (taking advantage of the fact they use max exponent).
1974 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
1975 */
1976 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1977 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1978 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1979 return lp_build_select(bld, mask, a, res);
1980 }
1981 }
1982
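/*
 * For reference, a scalar sketch of the non-SSE4.1 floor path above
 * (illustrative only; assumes 32-bit IEEE-754 floats):
 *
 *    float ref_floor(float a)
 *    {
 *       float absa = a < 0.0f ? -a : a;
 *       float res  = (float)(int)a;            // truncate toward zero
 *       if (res > a)                           // truncation overshot for negative a,
 *          res -= 1.0f;                        // e.g. trunc(-1.1) = -1.0 -> -2.0
 *       return absa > 16777216.0f ? a : res;   // 2^24 cutoff as above
 *    }
 */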
1983
1984 /**
1985 * Return ceiling of float (vector), returning float (vector).
1986 * Ex: ceil( 1.1) = 2.0
1987 * Ex: ceil(-1.1) = -1.0
1988 */
1989 LLVMValueRef
1990 lp_build_ceil(struct lp_build_context *bld,
1991 LLVMValueRef a)
1992 {
1993 LLVMBuilderRef builder = bld->gallivm->builder;
1994 const struct lp_type type = bld->type;
1995
1996 assert(type.floating);
1997 assert(lp_check_value(type, a));
1998
1999 if (arch_rounding_available(type)) {
2000 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2001 }
2002 else {
2003 const struct lp_type type = bld->type;
2004 struct lp_type inttype;
2005 struct lp_build_context intbld;
2006 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 16777216.0); /* 2^24 */
2007 LLVMValueRef trunc, res, anosign, mask, tmp;
2008 LLVMTypeRef int_vec_type = bld->int_vec_type;
2009 LLVMTypeRef vec_type = bld->vec_type;
2010
2011 assert(type.width == 32); /* might want to handle doubles at some point */
2012
2013 inttype = type;
2014 inttype.floating = 0;
2015 lp_build_context_init(&intbld, bld->gallivm, inttype);
2016
2017 /* round by truncation */
2018 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2019 trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2020
2021 /*
2022 * fix values if rounding is wrong (for non-special cases)
2023 * - this is the case if trunc < a
2024 */
2025 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2026 /* tmp = trunc < a ? 1.0 : 0.0 */
2027 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2028 tmp = lp_build_and(&intbld, mask, tmp);
2029 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2030 res = lp_build_add(bld, trunc, tmp);
2031
2032 /* mask out sign bit */
2033 anosign = lp_build_abs(bld, a);
2034 /*
2035 * mask out all values if anosign > 2^24
2036 * This should work both for large ints (all rounding is no-op for them
2037 * because such floats are always exact) as well as special cases like
2038 * NaNs, Infs (taking advantage of the fact they use max exponent).
2039 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2040 */
2041 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2042 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2043 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2044 return lp_build_select(bld, mask, a, res);
2045 }
2046 }
2047
2048
2049 /**
2050 * Return fractional part of 'a' computed as a - floor(a)
2051 * Typically used in texture coord arithmetic.
2052 */
2053 LLVMValueRef
2054 lp_build_fract(struct lp_build_context *bld,
2055 LLVMValueRef a)
2056 {
2057 assert(bld->type.floating);
2058 return lp_build_sub(bld, a, lp_build_floor(bld, a));
2059 }
2060
2061
2062 /**
2063 * Prevent returning a fractional part of 1.0 for very small negative values of
2064 * 'a' by clamping against 0.99999(9).
2065 */
2066 static inline LLVMValueRef
2067 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2068 {
2069 LLVMValueRef max;
2070
2071 /* this is the largest number smaller than 1.0 representable as float */
2072 max = lp_build_const_vec(bld->gallivm, bld->type,
2073 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2074 return lp_build_min(bld, fract, max);
2075 }
2076
2077
2078 /**
2079 * Same as lp_build_fract, but guarantees that the result is always smaller
2080 * than one.
2081 */
2082 LLVMValueRef
2083 lp_build_fract_safe(struct lp_build_context *bld,
2084 LLVMValueRef a)
2085 {
2086 return clamp_fract(bld, lp_build_fract(bld, a));
2087 }
2088
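/*
 * Why the clamp is needed, in scalar form (illustrative only; assumes
 * <math.h> floorf and 32-bit floats):
 *
 *    float ref_fract_safe(float a)
 *    {
 *       float f = a - floorf(a);                 // for a = -1e-10f this rounds up to 1.0f
 *       const float max_below_one = 0.99999994f; // 1.0 - 2^-24, largest float below 1.0
 *       return f < max_below_one ? f : max_below_one;
 *    }
 *
 * so callers are guaranteed a result strictly smaller than one.
 */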
2089
2090 /**
2091 * Return the integer part of a float (vector) value (== round toward zero).
2092 * The returned value is an integer (vector).
2093 * Ex: itrunc(-1.5) = -1
2094 */
2095 LLVMValueRef
2096 lp_build_itrunc(struct lp_build_context *bld,
2097 LLVMValueRef a)
2098 {
2099 LLVMBuilderRef builder = bld->gallivm->builder;
2100 const struct lp_type type = bld->type;
2101 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2102
2103 assert(type.floating);
2104 assert(lp_check_value(type, a));
2105
2106 return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2107 }
2108
2109
2110 /**
2111 * Return float (vector) rounded to nearest integer (vector). The returned
2112 * value is an integer (vector).
2113 * Ex: iround(0.9) = 1
2114 * Ex: iround(-1.5) = -2
2115 */
2116 LLVMValueRef
2117 lp_build_iround(struct lp_build_context *bld,
2118 LLVMValueRef a)
2119 {
2120 LLVMBuilderRef builder = bld->gallivm->builder;
2121 const struct lp_type type = bld->type;
2122 LLVMTypeRef int_vec_type = bld->int_vec_type;
2123 LLVMValueRef res;
2124
2125 assert(type.floating);
2126
2127 assert(lp_check_value(type, a));
2128
2129 if ((util_cpu_caps.has_sse2 &&
2130 ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2131 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2132 return lp_build_iround_nearest_sse2(bld, a);
2133 }
2134 if (arch_rounding_available(type)) {
2135 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2136 }
2137 else {
2138 LLVMValueRef half;
2139
2140 half = lp_build_const_vec(bld->gallivm, type, 0.5);
2141
2142 if (type.sign) {
2143 LLVMTypeRef vec_type = bld->vec_type;
2144 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2145 (unsigned long long)1 << (type.width - 1));
2146 LLVMValueRef sign;
2147
2148 /* get sign bit */
2149 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2150 sign = LLVMBuildAnd(builder, sign, mask, "");
2151
2152 /* sign * 0.5 */
2153 half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2154 half = LLVMBuildOr(builder, sign, half, "");
2155 half = LLVMBuildBitCast(builder, half, vec_type, "");
2156 }
2157
2158 res = LLVMBuildFAdd(builder, a, half, "");
2159 }
2160
2161 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2162
2163 return res;
2164 }
2165
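/*
 * Scalar sketch of the fallback path above for the signed case (illustrative
 * only): copy the sign of 'a' onto 0.5, add, then truncate.
 *
 *    int ref_iround(float a)
 *    {
 *       float half = a < 0.0f ? -0.5f : 0.5f;
 *       return (int)(a + half);   // truncation now rounds to nearest,
 *                                 // halfway cases away from zero
 *    }
 */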
2166
2167 /**
2168 * Return floor of float (vector), result is an int (vector)
2169 * Ex: ifloor(1.1) = 1
2170 * Ex: ifloor(-1.1) = -2
2171 */
2172 LLVMValueRef
2173 lp_build_ifloor(struct lp_build_context *bld,
2174 LLVMValueRef a)
2175 {
2176 LLVMBuilderRef builder = bld->gallivm->builder;
2177 const struct lp_type type = bld->type;
2178 LLVMTypeRef int_vec_type = bld->int_vec_type;
2179 LLVMValueRef res;
2180
2181 assert(type.floating);
2182 assert(lp_check_value(type, a));
2183
2184 res = a;
2185 if (type.sign) {
2186 if (arch_rounding_available(type)) {
2187 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2188 }
2189 else {
2190 struct lp_type inttype;
2191 struct lp_build_context intbld;
2192 LLVMValueRef trunc, itrunc, mask;
2193
2194 assert(type.floating);
2195 assert(lp_check_value(type, a));
2196
2197 inttype = type;
2198 inttype.floating = 0;
2199 lp_build_context_init(&intbld, bld->gallivm, inttype);
2200
2201 /* round by truncation */
2202 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2203 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2204
2205 /*
2206 * fix values if rounding is wrong (for non-special cases)
2207 * - this is the case if trunc > a
2208 * The results of doing this with NaNs, very large values etc.
2209 * are undefined, but those inputs give undefined results here anyway.
2210 */
2211 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2212 /* cheapie minus one with mask since the mask is minus one / zero */
2213 return lp_build_add(&intbld, itrunc, mask);
2214 }
2215 }
2216
2217 /* convert to int, truncating toward zero */
2218 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2219
2220 return res;
2221 }
2222
2223
2224 /**
2225 * Return ceiling of float (vector), returning int (vector).
2226 * Ex: iceil( 1.1) = 2
2227 * Ex: iceil(-1.1) = -1
2228 */
2229 LLVMValueRef
2230 lp_build_iceil(struct lp_build_context *bld,
2231 LLVMValueRef a)
2232 {
2233 LLVMBuilderRef builder = bld->gallivm->builder;
2234 const struct lp_type type = bld->type;
2235 LLVMTypeRef int_vec_type = bld->int_vec_type;
2236 LLVMValueRef res;
2237
2238 assert(type.floating);
2239 assert(lp_check_value(type, a));
2240
2241 if (arch_rounding_available(type)) {
2242 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2243 }
2244 else {
2245 struct lp_type inttype;
2246 struct lp_build_context intbld;
2247 LLVMValueRef trunc, itrunc, mask;
2248
2249 assert(type.floating);
2250 assert(lp_check_value(type, a));
2251
2252 inttype = type;
2253 inttype.floating = 0;
2254 lp_build_context_init(&intbld, bld->gallivm, inttype);
2255
2256 /* round by truncation */
2257 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2258 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2259
2260 /*
2261 * fix values if rounding is wrong (for non-special cases)
2262 * - this is the case if trunc < a
2263 * The results of doing this with NaNs, very large values etc.
2264 * are undefined, but those inputs give undefined results here anyway.
2265 */
2266 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2267 /* cheapie plus one with mask since the mask is minus one / zero */
2268 return lp_build_sub(&intbld, itrunc, mask);
2269 }
2270
2271 /* convert to int, truncating toward zero */
2272 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2273
2274 return res;
2275 }
2276
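/*
 * The "cheapie" adjustments above rely on lp_build_cmp returning all ones
 * (i.e. -1) for true and 0 for false. Scalar sketch (illustrative only):
 *
 *    int ref_ifloor(float a)
 *    {
 *       int   itrunc = (int)a;
 *       float trunc  = (float)itrunc;
 *       int   mask   = trunc > a ? -1 : 0;   // truncation overshot
 *       return itrunc + mask;                // adding -1 subtracts one
 *    }
 *
 *    int ref_iceil(float a)
 *    {
 *       int   itrunc = (int)a;
 *       float trunc  = (float)itrunc;
 *       int   mask   = trunc < a ? -1 : 0;   // truncation undershot
 *       return itrunc - mask;                // subtracting -1 adds one
 *    }
 */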
2277
2278 /**
2279 * Combined ifloor() & fract().
2280 *
2281 * Preferred to calling the functions separately, as it will ensure that the
2282 * strategy (floor() vs ifloor()) that results in less redundant work is used.
2283 */
2284 void
2285 lp_build_ifloor_fract(struct lp_build_context *bld,
2286 LLVMValueRef a,
2287 LLVMValueRef *out_ipart,
2288 LLVMValueRef *out_fpart)
2289 {
2290 LLVMBuilderRef builder = bld->gallivm->builder;
2291 const struct lp_type type = bld->type;
2292 LLVMValueRef ipart;
2293
2294 assert(type.floating);
2295 assert(lp_check_value(type, a));
2296
2297 if (arch_rounding_available(type)) {
2298 /*
2299 * floor() is easier.
2300 */
2301
2302 ipart = lp_build_floor(bld, a);
2303 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2304 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2305 }
2306 else {
2307 /*
2308 * ifloor() is easier.
2309 */
2310
2311 *out_ipart = lp_build_ifloor(bld, a);
2312 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2313 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2314 }
2315 }
2316
2317
2318 /**
2319 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2320 * always smaller than one.
2321 */
2322 void
2323 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2324 LLVMValueRef a,
2325 LLVMValueRef *out_ipart,
2326 LLVMValueRef *out_fpart)
2327 {
2328 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2329 *out_fpart = clamp_fract(bld, *out_fpart);
2330 }
2331
2332
2333 LLVMValueRef
2334 lp_build_sqrt(struct lp_build_context *bld,
2335 LLVMValueRef a)
2336 {
2337 LLVMBuilderRef builder = bld->gallivm->builder;
2338 const struct lp_type type = bld->type;
2339 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2340 char intrinsic[32];
2341
2342 assert(lp_check_value(type, a));
2343
2344 /* TODO: optimize the constant case */
2345
2346 assert(type.floating);
2347 if (type.length == 1) {
2348 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.f%u", type.width);
2349 }
2350 else {
2351 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
2352 }
2353
2354 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2355 }
2356
2357
2358 /**
2359 * Do one Newton-Raphson step to improve reciprocal precision:
2360 *
2361 * x_{i+1} = x_i * (2 - a * x_i)
2362 *
2363 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2364 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
2365 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
2366 * halo. It would be necessary to clamp the argument to prevent this.
2367 *
2368 * See also:
2369 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2370 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2371 */
2372 static INLINE LLVMValueRef
2373 lp_build_rcp_refine(struct lp_build_context *bld,
2374 LLVMValueRef a,
2375 LLVMValueRef rcp_a)
2376 {
2377 LLVMBuilderRef builder = bld->gallivm->builder;
2378 LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
2379 LLVMValueRef res;
2380
2381 res = LLVMBuildFMul(builder, a, rcp_a, "");
2382 res = LLVMBuildFSub(builder, two, res, "");
2383 res = LLVMBuildFMul(builder, rcp_a, res, "");
2384
2385 return res;
2386 }
2387
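/*
 * Numerically, one such step roughly doubles the number of correct bits of
 * the initial estimate. Scalar sketch plus a worked example (illustrative
 * only):
 *
 *    float ref_rcp_refine(float a, float rcp_a)   // rcp_a ~= 1/a, e.g. from rcpps
 *    {
 *       return rcp_a * (2.0f - a * rcp_a);
 *    }
 *
 * E.g. a = 3.0, rcp_a = 0.333: 0.333 * (2 - 0.999) = 0.333333, already good
 * to ~6 digits. For a = 0 the estimate is +Inf and the step gives
 * Inf * (2 - NaN) = NaN, which is the non-conformant case noted above.
 */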
2388
2389 LLVMValueRef
2390 lp_build_rcp(struct lp_build_context *bld,
2391 LLVMValueRef a)
2392 {
2393 LLVMBuilderRef builder = bld->gallivm->builder;
2394 const struct lp_type type = bld->type;
2395
2396 assert(lp_check_value(type, a));
2397
2398 if(a == bld->zero)
2399 return bld->undef;
2400 if(a == bld->one)
2401 return bld->one;
2402 if(a == bld->undef)
2403 return bld->undef;
2404
2405 assert(type.floating);
2406
2407 if(LLVMIsConstant(a))
2408 return LLVMConstFDiv(bld->one, a);
2409
2410 /*
2411 * We don't use RCPPS because:
2412 * - it only has 10 bits of precision
2413 * - it doesn't even get the reciprocal of 1.0 exactly
2414 * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2415 * - for recent processors the benefit over DIVPS is marginal and case
2416 *   dependent
2417 *
2418 * We could still use it on certain processors if benchmarks show that the
2419 * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
2420 * particular uses that require fewer workarounds.
2421 */
2422
2423 if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2424 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2425 const unsigned num_iterations = 0;
2426 LLVMValueRef res;
2427 unsigned i;
2428 const char *intrinsic = NULL;
2429
2430 if (type.length == 4) {
2431 intrinsic = "llvm.x86.sse.rcp.ps";
2432 }
2433 else {
2434 intrinsic = "llvm.x86.avx.rcp.ps.256";
2435 }
2436
2437 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2438
2439 for (i = 0; i < num_iterations; ++i) {
2440 res = lp_build_rcp_refine(bld, a, res);
2441 }
2442
2443 return res;
2444 }
2445
2446 return LLVMBuildFDiv(builder, bld->one, a, "");
2447 }
2448
2449
2450 /**
2451 * Do one Newton-Raphson step to improve rsqrt precision:
2452 *
2453 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2454 *
2455 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2456 */
2457 static INLINE LLVMValueRef
2458 lp_build_rsqrt_refine(struct lp_build_context *bld,
2459 LLVMValueRef a,
2460 LLVMValueRef rsqrt_a)
2461 {
2462 LLVMBuilderRef builder = bld->gallivm->builder;
2463 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2464 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2465 LLVMValueRef res;
2466
2467 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2468 res = LLVMBuildFMul(builder, a, res, "");
2469 res = LLVMBuildFSub(builder, three, res, "");
2470 res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2471 res = LLVMBuildFMul(builder, half, res, "");
2472
2473 return res;
2474 }
2475
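/*
 * Scalar sketch with a worked example (illustrative only):
 *
 *    float ref_rsqrt_refine(float a, float r)   // r ~= 1/sqrt(a), e.g. from rsqrtps
 *    {
 *       return 0.5f * r * (3.0f - a * r * r);
 *    }
 *
 * E.g. a = 4.0, r = 0.49: 0.5 * 0.49 * (3 - 4 * 0.2401) = 0.4997 versus the
 * exact 0.5; a second step recovers essentially all float bits.
 */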
2476
2477 /**
2478 * Generate 1/sqrt(a).
2479 * Result is undefined for values < 0, infinity for +0.
2480 */
2481 LLVMValueRef
2482 lp_build_rsqrt(struct lp_build_context *bld,
2483 LLVMValueRef a)
2484 {
2485 LLVMBuilderRef builder = bld->gallivm->builder;
2486 const struct lp_type type = bld->type;
2487
2488 assert(lp_check_value(type, a));
2489
2490 assert(type.floating);
2491
2492 /*
2493 * This should be faster but all denormals will end up as infinity.
2494 */
2495 if (0 && lp_build_fast_rsqrt_available(type)) {
2496 const unsigned num_iterations = 1;
2497 LLVMValueRef res;
2498 unsigned i;
2499
2500 /* rsqrt(1.0) != 1.0 here */
2501 res = lp_build_fast_rsqrt(bld, a);
2502
2503 if (num_iterations) {
2504 /*
2505 * Newton-Raphson will result in NaN instead of infinity for zero,
2506 * and NaN instead of zero for infinity.
2507 * Also, need to ensure rsqrt(1.0) == 1.0.
2508 * All numbers smaller than FLT_MIN will result in +infinity
2509 * (rsqrtps treats all denormals as zero).
2510 */
2511 /*
2512 * Certain non-C99 compilers don't know INFINITY and might not support
2513 * hacks to evaluate it at compile time either.
2514 */
2515 const unsigned posinf_int = 0x7F800000;
2516 LLVMValueRef cmp;
2517 LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2518 LLVMValueRef inf = lp_build_const_int_vec(bld->gallivm, type, posinf_int);
2519
2520 inf = LLVMBuildBitCast(builder, inf, lp_build_vec_type(bld->gallivm, type), "");
2521
2522 for (i = 0; i < num_iterations; ++i) {
2523 res = lp_build_rsqrt_refine(bld, a, res);
2524 }
2525 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2526 res = lp_build_select(bld, cmp, inf, res);
2527 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2528 res = lp_build_select(bld, cmp, bld->zero, res);
2529 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2530 res = lp_build_select(bld, cmp, bld->one, res);
2531 }
2532
2533 return res;
2534 }
2535
2536 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2537 }
2538
2539 /**
2540 * Report whether a fast (but inaccurate) rsqrt instruction is available.
2541 * (Callers may want to avoid rsqrt_fast when it is not: x^0.5 can be
2542 * computed as rsqrt_fast(x) * x, but without a native rsqrt that turns
2543 * into sqrt/div/mul, so it is obviously better to just call sqrt,
2544 * skipping both the div and the mul.)
2545 */
2546 boolean
2547 lp_build_fast_rsqrt_available(struct lp_type type)
2548 {
2549 assert(type.floating);
2550
2551 if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2552 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2553 return true;
2554 }
2555 return false;
2556 }
2557
2558
2559 /**
2560 * Generate 1/sqrt(a).
2561 * Result is undefined for values < 0, infinity for +0.
2562 * Precision is limited, only ~10 bits guaranteed
2563 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2564 */
2565 LLVMValueRef
2566 lp_build_fast_rsqrt(struct lp_build_context *bld,
2567 LLVMValueRef a)
2568 {
2569 LLVMBuilderRef builder = bld->gallivm->builder;
2570 const struct lp_type type = bld->type;
2571
2572 assert(lp_check_value(type, a));
2573
2574 if (lp_build_fast_rsqrt_available(type)) {
2575 const char *intrinsic = NULL;
2576
2577 if (type.length == 4) {
2578 intrinsic = "llvm.x86.sse.rsqrt.ps";
2579 }
2580 else {
2581 intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2582 }
2583 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2584 }
2585 else {
2586 debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2587 }
2588 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2589 }
2590
2591
2592 /**
2593 * Generate sin(a) using SSE2
2594 */
2595 LLVMValueRef
2596 lp_build_sin(struct lp_build_context *bld,
2597 LLVMValueRef a)
2598 {
2599 struct gallivm_state *gallivm = bld->gallivm;
2600 LLVMBuilderRef builder = gallivm->builder;
2601 struct lp_type int_type = lp_int_type(bld->type);
2602 LLVMBuilderRef b = builder;
2603
2604 /*
2605 * take the absolute value,
2606 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2607 */
2608
2609 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2610 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2611
2612 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2613 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2614
2615 /*
2616 * extract the sign bit (upper one)
2617 * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
2618 */
2619 LLVMValueRef sig_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2620 LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i");
2621
2622 /*
2623 * scale by 4/Pi
2624 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2625 */
2626
2627 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2628 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2629
2630 /*
2631 * store the integer part of y in mm0
2632 * emm2 = _mm_cvttps_epi32(y);
2633 */
2634
2635 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2636
2637 /*
2638 * j=(j+1) & (~1) (see the cephes sources)
2639 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2640 */
2641
2642 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2643 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2644 /*
2645 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2646 */
2647 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2648 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2649
2650 /*
2651 * y = _mm_cvtepi32_ps(emm2);
2652 */
2653 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2654
2655 /* get the swap sign flag
2656 * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
2657 */
2658 LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2659 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and");
2660
2661 /*
2662 * emm2 = _mm_slli_epi32(emm0, 29);
2663 */
2664 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2665 LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit");
2666
2667 /*
2668 * get the polynomial selection mask
2669 * there is one polynomial for 0 <= x <= Pi/4
2670 * and another one for Pi/4 < x <= Pi/2
2671 * Both branches will be computed.
2672 *
2673 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2674 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2675 */
2676
2677 LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2678 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3");
2679 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2680 int_type, PIPE_FUNC_EQUAL,
2681 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2682 /*
2683 * sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
2684 */
2685 LLVMValueRef sign_bit_1 = LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit");
2686
2687 /*
2688 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2689 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2690 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2691 */
2692 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2693 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2694 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2695
2696 /*
2697 * The magic pass: "Extended precision modular arithmetic"
2698 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2699 * xmm1 = _mm_mul_ps(y, xmm1);
2700 * xmm2 = _mm_mul_ps(y, xmm2);
2701 * xmm3 = _mm_mul_ps(y, xmm3);
2702 */
2703 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
2704 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
2705 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
2706
2707 /*
2708 * x = _mm_add_ps(x, xmm1);
2709 * x = _mm_add_ps(x, xmm2);
2710 * x = _mm_add_ps(x, xmm3);
2711 */
2712
2713 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
2714 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
2715 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
2716
2717 /*
2718 * Evaluate the first polynomial (0 <= x <= Pi/4)
2719 *
2720 * z = _mm_mul_ps(x,x);
2721 */
2722 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2723
2724 /*
2725 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2726 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2727 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2728 */
2729 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2730 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2731 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2732
2733 /*
2734 * y = *(v4sf*)_ps_coscof_p0;
2735 * y = _mm_mul_ps(y, z);
2736 */
2737 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
2738 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
2739 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
2740 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
2741 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2742 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2743
2744
2745 /*
2746 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2747 * y = _mm_sub_ps(y, tmp);
2748 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2749 */
2750 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2751 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2752 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2753 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2754 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2755
2756 /*
2757 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2758 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2759 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2760 */
2761 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2762 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2763 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2764
2765 /*
2766 * Evaluate the second polynomial (Pi/4 < x <= Pi/2)
2767 *
2768 * y2 = *(v4sf*)_ps_sincof_p0;
2769 * y2 = _mm_mul_ps(y2, z);
2770 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2771 * y2 = _mm_mul_ps(y2, z);
2772 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2773 * y2 = _mm_mul_ps(y2, z);
2774 * y2 = _mm_mul_ps(y2, x);
2775 * y2 = _mm_add_ps(y2, x);
2776 */
2777
2778 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
2779 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
2780 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
2781 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
2782 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2783 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
2784 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
2785
2786 /*
2787 * select the correct result from the two polynomials
2788 * xmm3 = poly_mask;
2789 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2790 * y = _mm_andnot_ps(xmm3, y);
2791 * y = _mm_or_ps(y,y2);
2792 */
2793 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2794 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2795 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2796 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
2797 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2798 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
2799
2800 /*
2801 * update the sign
2802 * y = _mm_xor_ps(y, sign_bit);
2803 */
2804 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin");
2805 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
2806 LLVMValueRef isfinite = lp_build_isfinite(bld, a);
2807
2808 /* clamp output to be within [-1, 1] */
2809 y_result = lp_build_clamp(bld, y_result,
2810 lp_build_const_vec(bld->gallivm, bld->type, -1.f),
2811 lp_build_const_vec(bld->gallivm, bld->type, 1.f));
2812 /* If a is -inf, inf or NaN then return NaN */
2813 y_result = lp_build_select(bld, isfinite, y_result,
2814 lp_build_const_vec(bld->gallivm, bld->type, NAN));
2815 return y_result;
2816 }
2817
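/*
 * Scalar sketch of the argument reduction performed above (illustrative only;
 * ref_sin_poly/ref_cos_poly are hypothetical stand-ins for the two minimax
 * polynomials evaluated above, <math.h> is assumed, and the final clamp /
 * NaN selection is omitted):
 *
 *    float ref_sin(float x)
 *    {
 *       int   sign = x < 0.0f;                        // sign handled separately
 *       float ax   = fabsf(x);
 *       int   j    = (int)(ax * 1.27323954473516f);   // |x| * 4/pi
 *       j = (j + 1) & ~1;                             // j = (j+1) & ~1: force j even
 *       float y = (float)j;
 *       // "Extended precision modular arithmetic": ax - j*pi/4 in three steps
 *       ax = ((ax - y * 0.78515625f)
 *                 - y * 2.4187564849853515625e-4f)
 *                 - y * 3.77489497744594108e-8f;
 *       if (j & 4)                                    // octants 4..7 flip the sign
 *          sign = !sign;
 *       float p = (j & 2) ? ref_cos_poly(ax) : ref_sin_poly(ax);
 *       return sign ? -p : p;
 *    }
 */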
2818
2819 /**
2820 * Generate cos(a) using SSE2
2821 */
2822 LLVMValueRef
2823 lp_build_cos(struct lp_build_context *bld,
2824 LLVMValueRef a)
2825 {
2826 struct gallivm_state *gallivm = bld->gallivm;
2827 LLVMBuilderRef builder = gallivm->builder;
2828 struct lp_type int_type = lp_int_type(bld->type);
2829 LLVMBuilderRef b = builder;
2830
2831 /*
2832 * take the absolute value,
2833 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2834 */
2835
2836 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2837 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2838
2839 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2840 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2841
2842 /*
2843 * scale by 4/Pi
2844 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2845 */
2846
2847 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2848 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2849
2850 /*
2851 * store the integer part of y in mm0
2852 * emm2 = _mm_cvttps_epi32(y);
2853 */
2854
2855 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2856
2857 /*
2858 * j=(j+1) & (~1) (see the cephes sources)
2859 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2860 */
2861
2862 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2863 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2864 /*
2865 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2866 */
2867 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2868 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2869
2870 /*
2871 * y = _mm_cvtepi32_ps(emm2);
2872 */
2873 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2874
2875
2876 /*
2877 * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
2878 */
2879 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2880 LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2");
2881
2882
2883 /* get the swap sign flag
2884 * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
2885 */
2886 LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0);
2887 LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not");
2888 LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2889 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and");
2890
2891 /*
2892 * emm2 = _mm_slli_epi32(emm0, 29);
2893 */
2894 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2895 LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit");
2896
2897 /*
2898 * get the polynomial selection mask
2899 * there is one polynomial for 0 <= x <= Pi/4
2900 * and another one for Pi/4 < x <= Pi/2
2901 * Both branches will be computed.
2902 *
2903 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2904 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2905 */
2906
2907 LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2908 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3");
2909 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2910 int_type, PIPE_FUNC_EQUAL,
2911 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2912
2913 /*
2914 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2915 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2916 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2917 */
2918 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2919 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2920 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2921
2922 /*
2923 * The magic pass: "Extended precision modular arithmetic"
2924 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2925 * xmm1 = _mm_mul_ps(y, xmm1);
2926 * xmm2 = _mm_mul_ps(y, xmm2);
2927 * xmm3 = _mm_mul_ps(y, xmm3);
2928 */
2929 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
2930 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
2931 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
2932
2933 /*
2934 * x = _mm_add_ps(x, xmm1);
2935 * x = _mm_add_ps(x, xmm2);
2936 * x = _mm_add_ps(x, xmm3);
2937 */
2938
2939 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
2940 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
2941 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
2942
2943 /*
2944 * Evaluate the first polynomial (0 <= x <= Pi/4)
2945 *
2946 * z = _mm_mul_ps(x,x);
2947 */
2948 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2949
2950 /*
2951 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2952 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2953 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2954 */
2955 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2956 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2957 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2958
2959 /*
2960 * y = *(v4sf*)_ps_coscof_p0;
2961 * y = _mm_mul_ps(y, z);
2962 */
2963 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
2964 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
2965 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
2966 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
2967 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2968 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2969
2970
2971 /*
2972 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2973 * y = _mm_sub_ps(y, tmp);
2974 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2975 */
2976 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2977 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2978 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2979 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2980 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2981
2982 /*
2983 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2984 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2985 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2986 */
2987 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2988 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2989 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2990
2991 /*
2992 * Evaluate the second polynomial (Pi/4 < x <= Pi/2)
2993 *
2994 * y2 = *(v4sf*)_ps_sincof_p0;
2995 * y2 = _mm_mul_ps(y2, z);
2996 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2997 * y2 = _mm_mul_ps(y2, z);
2998 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2999 * y2 = _mm_mul_ps(y2, z);
3000 * y2 = _mm_mul_ps(y2, x);
3001 * y2 = _mm_add_ps(y2, x);
3002 */
3003
3004 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
3005 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
3006 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
3007 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
3008 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
3009 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
3010 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
3011
3012 /*
3013 * select the correct result from the two polynomials
3014 * xmm3 = poly_mask;
3015 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
3016 * y = _mm_andnot_ps(xmm3, y);
3017 * y = _mm_or_ps(y,y2);
3018 */
3019 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
3020 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
3021 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
3022 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
3023 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
3024 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
3025
3026 /*
3027 * update the sign
3028 * y = _mm_xor_ps(y, sign_bit);
3029 */
3030 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sin");
3031 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
3032 LLVMValueRef isfinite = lp_build_isfinite(bld, a);
3033
3034 /* clamp output to be within [-1, 1] */
3035 y_result = lp_build_clamp(bld, y_result,
3036 lp_build_const_vec(bld->gallivm, bld->type, -1.f),
3037 lp_build_const_vec(bld->gallivm, bld->type, 1.f));
3038 /* If a is -inf, inf or NaN then return NaN */
3039 y_result = lp_build_select(bld, isfinite, y_result,
3040 lp_build_const_vec(bld->gallivm, bld->type, NAN));
3041 return y_result;
3042 }
3043
3044
3045 /**
3046 * Generate pow(x, y)
3047 */
3048 LLVMValueRef
3049 lp_build_pow(struct lp_build_context *bld,
3050 LLVMValueRef x,
3051 LLVMValueRef y)
3052 {
3053 /* TODO: optimize the constant case */
3054 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3055 LLVMIsConstant(x) && LLVMIsConstant(y)) {
3056 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3057 __FUNCTION__);
3058 }
3059
3060 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
3061 }
3062
3063
3064 /**
3065 * Generate exp(x)
3066 */
3067 LLVMValueRef
3068 lp_build_exp(struct lp_build_context *bld,
3069 LLVMValueRef x)
3070 {
3071 /* log2(e) = 1/log(2) */
3072 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
3073 1.4426950408889634);
3074
3075 assert(lp_check_value(bld->type, x));
3076
3077 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
3078 }
3079
3080
3081 /**
3082 * Generate log(x)
3083 */
3084 LLVMValueRef
3085 lp_build_log(struct lp_build_context *bld,
3086 LLVMValueRef x)
3087 {
3088 /* log(2) */
3089 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3090 0.69314718055994529);
3091
3092 assert(lp_check_value(bld->type, x));
3093
3094 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
3095 }
3096
3097
3098 /**
3099 * Generate polynomial.
3100 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
3101 */
3102 LLVMValueRef
3103 lp_build_polynomial(struct lp_build_context *bld,
3104 LLVMValueRef x,
3105 const double *coeffs,
3106 unsigned num_coeffs)
3107 {
3108 const struct lp_type type = bld->type;
3109 LLVMValueRef even = NULL, odd = NULL;
3110 LLVMValueRef x2;
3111 unsigned i;
3112
3113 assert(lp_check_value(bld->type, x));
3114
3115 /* TODO: optimize the constant case */
3116 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3117 LLVMIsConstant(x)) {
3118 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3119 __FUNCTION__);
3120 }
3121
3122 /*
3123 * Calculate odd and even terms separately to decrease data dependency
3124 * Ex:
3125 * c[0] + x^2 * c[2] + x^4 * c[4] ...
3126 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
3127 */
3128 x2 = lp_build_mul(bld, x, x);
3129
3130 for (i = num_coeffs; i--; ) {
3131 LLVMValueRef coeff;
3132
3133 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
3134
3135 if (i % 2 == 0) {
3136 if (even)
3137 even = lp_build_add(bld, coeff, lp_build_mul(bld, x2, even));
3138 else
3139 even = coeff;
3140 } else {
3141 if (odd)
3142 odd = lp_build_add(bld, coeff, lp_build_mul(bld, x2, odd));
3143 else
3144 odd = coeff;
3145 }
3146 }
3147
3148 if (odd)
3149 return lp_build_add(bld, lp_build_mul(bld, odd, x), even);
3150 else if (even)
3151 return even;
3152 else
3153 return bld->undef;
3154 }
3155
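/*
 * Scalar sketch of the even/odd split for a 4-coefficient case (illustrative
 * only): both halves are short Horner chains in x^2 that can issue in
 * parallel and are recombined at the end.
 *
 *    // c[0] + c[1]*x + c[2]*x^2 + c[3]*x^3
 *    float ref_poly4(float x, const float c[4])
 *    {
 *       float x2   = x * x;
 *       float even = c[0] + x2 * c[2];
 *       float odd  = c[1] + x2 * c[3];
 *       return odd * x + even;
 *    }
 */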
3156
3157 /**
3158 * Minimax polynomial fit of 2**x, in range [0, 1[
3159 */
3160 const double lp_build_exp2_polynomial[] = {
3161 #if EXP_POLY_DEGREE == 5
3162 1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
3163 0.693153073200168932794,
3164 0.240153617044375388211,
3165 0.0558263180532956664775,
3166 0.00898934009049466391101,
3167 0.00187757667519147912699
3168 #elif EXP_POLY_DEGREE == 4
3169 1.00000259337069434683,
3170 0.693003834469974940458,
3171 0.24144275689150793076,
3172 0.0520114606103070150235,
3173 0.0135341679161270268764
3174 #elif EXP_POLY_DEGREE == 3
3175 0.999925218562710312959,
3176 0.695833540494823811697,
3177 0.226067155427249155588,
3178 0.0780245226406372992967
3179 #elif EXP_POLY_DEGREE == 2
3180 1.00172476321474503578,
3181 0.657636275736077639316,
3182 0.33718943461968720704
3183 #else
3184 #error
3185 #endif
3186 };
3187
3188
3189 void
3190 lp_build_exp2_approx(struct lp_build_context *bld,
3191 LLVMValueRef x,
3192 LLVMValueRef *p_exp2_int_part,
3193 LLVMValueRef *p_frac_part,
3194 LLVMValueRef *p_exp2)
3195 {
3196 LLVMBuilderRef builder = bld->gallivm->builder;
3197 const struct lp_type type = bld->type;
3198 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3199 LLVMValueRef ipart = NULL;
3200 LLVMValueRef fpart = NULL;
3201 LLVMValueRef expipart = NULL;
3202 LLVMValueRef expfpart = NULL;
3203 LLVMValueRef res = NULL;
3204
3205 assert(lp_check_value(bld->type, x));
3206
3207 if(p_exp2_int_part || p_frac_part || p_exp2) {
3208 /* TODO: optimize the constant case */
3209 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3210 LLVMIsConstant(x)) {
3211 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3212 __FUNCTION__);
3213 }
3214
3215 assert(type.floating && type.width == 32);
3216
3217 /* We want to preserve NaN and make sure that for exp2 if x > 128,
3218 * the result is INF and if it's smaller than -126.9 the result is 0 */
3219 x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
3220 GALLIVM_NAN_RETURN_SECOND);
3221 x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999), x,
3222 GALLIVM_NAN_RETURN_SECOND);
3223
3224 /* ipart = floor(x) */
3225 /* fpart = x - ipart */
3226 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3227 }
3228
3229 if(p_exp2_int_part || p_exp2) {
3230 /* expipart = (float) (1 << ipart) */
3231 expipart = LLVMBuildAdd(builder, ipart,
3232 lp_build_const_int_vec(bld->gallivm, type, 127), "");
3233 expipart = LLVMBuildShl(builder, expipart,
3234 lp_build_const_int_vec(bld->gallivm, type, 23), "");
3235 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
3236 }
3237
3238 if(p_exp2) {
3239 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3240 Elements(lp_build_exp2_polynomial));
3241
3242 res = LLVMBuildFMul(builder, expipart, expfpart, "");
3243 }
3244
3245 if(p_exp2_int_part)
3246 *p_exp2_int_part = expipart;
3247
3248 if(p_frac_part)
3249 *p_frac_part = fpart;
3250
3251 if(p_exp2)
3252 *p_exp2 = res;
3253 }
3254
3255
3256 LLVMValueRef
3257 lp_build_exp2(struct lp_build_context *bld,
3258 LLVMValueRef x)
3259 {
3260 LLVMValueRef res;
3261 lp_build_exp2_approx(bld, x, NULL, NULL, &res);
3262 return res;
3263 }
3264
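/*
 * The decomposition above in scalar form (illustrative only; ref_exp2_poly is
 * a hypothetical stand-in for lp_build_exp2_polynomial, <math.h> is assumed,
 * and the NaN-preserving min/max of the real code is reduced to a plain clamp):
 *
 *    float ref_exp2(float x)
 *    {
 *       x = x > 128.0f ? 128.0f : (x < -126.99999f ? -126.99999f : x);
 *       int   ipart = (int)floorf(x);
 *       float fpart = x - (float)ipart;           // in [0, 1)
 *       union { int i; float f; } u;
 *       u.i = (ipart + 127) << 23;                // builds 2^ipart directly
 *       return u.f * ref_exp2_poly(fpart);        // 2^ipart * 2^fpart
 *    }
 */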
3265
3266 /**
3267 * Extract the exponent of an IEEE-754 floating point value.
3268 *
3269 * Optionally apply an integer bias.
3270 *
3271 * Result is an integer value with
3272 *
3273 * ifloor(log2(x)) + bias
3274 */
3275 LLVMValueRef
3276 lp_build_extract_exponent(struct lp_build_context *bld,
3277 LLVMValueRef x,
3278 int bias)
3279 {
3280 LLVMBuilderRef builder = bld->gallivm->builder;
3281 const struct lp_type type = bld->type;
3282 unsigned mantissa = lp_mantissa(type);
3283 LLVMValueRef res;
3284
3285 assert(type.floating);
3286
3287 assert(lp_check_value(bld->type, x));
3288
3289 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3290
3291 res = LLVMBuildLShr(builder, x,
3292 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3293 res = LLVMBuildAnd(builder, res,
3294 lp_build_const_int_vec(bld->gallivm, type, 255), "");
3295 res = LLVMBuildSub(builder, res,
3296 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3297
3298 return res;
3299 }
3300
3301
3302 /**
3303 * Extract the mantissa of a floating point value.
3304 *
3305 * Result is a floating point value with
3306 *
3307 * x / 2**floor(log2(x))
3308 */
3309 LLVMValueRef
3310 lp_build_extract_mantissa(struct lp_build_context *bld,
3311 LLVMValueRef x)
3312 {
3313 LLVMBuilderRef builder = bld->gallivm->builder;
3314 const struct lp_type type = bld->type;
3315 unsigned mantissa = lp_mantissa(type);
3316 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3317 (1ULL << mantissa) - 1);
3318 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3319 LLVMValueRef res;
3320
3321 assert(lp_check_value(bld->type, x));
3322
3323 assert(type.floating);
3324
3325 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3326
3327 /* res = x / 2**ipart */
3328 res = LLVMBuildAnd(builder, x, mantmask, "");
3329 res = LLVMBuildOr(builder, res, one, "");
3330 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3331
3332 return res;
3333 }
3334
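/*
 * Both extractions are plain IEEE-754 single-precision bit twiddling:
 * sign(1) | exponent(8, bias 127) | mantissa(23). Scalar equivalents
 * (illustrative only, helper names made up):
 *
 *    int ref_extract_exponent(float x, int bias)
 *    {
 *       union { float f; unsigned u; } v = { x };
 *       return (int)((v.u >> 23) & 0xff) - 127 + bias;
 *    }
 *
 *    float ref_extract_mantissa(float x)   // x / 2**floor(log2(x)), in [1, 2)
 *    {
 *       union { float f; unsigned u; } v = { x };
 *       v.u = (v.u & 0x007fffff) | 0x3f800000;   // keep mantissa, force exponent of 1.0
 *       return v.f;
 *    }
 */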
3335
3336
3337 /**
3338 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in the range [0, 1/9[
3339 * These coefficients can be generated with
3340 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3341 */
3342 const double lp_build_log2_polynomial[] = {
3343 #if LOG_POLY_DEGREE == 5
3344 2.88539008148777786488L,
3345 0.961796878841293367824L,
3346 0.577058946784739859012L,
3347 0.412914355135828735411L,
3348 0.308591899232910175289L,
3349 0.352376952300281371868L,
3350 #elif LOG_POLY_DEGREE == 4
3351 2.88539009343309178325L,
3352 0.961791550404184197881L,
3353 0.577440339438736392009L,
3354 0.403343858251329912514L,
3355 0.406718052498846252698L,
3356 #elif LOG_POLY_DEGREE == 3
3357 2.88538959748872753838L,
3358 0.961932915889597772928L,
3359 0.571118517972136195241L,
3360 0.493997535084709500285L,
3361 #else
3362 #error
3363 #endif
3364 };
3365
3366 /**
3367 * See http://www.devmaster.net/forums/showthread.php?p=43580
3368 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3369 * http://www.nezumi.demon.co.uk/consult/logx.htm
3370 *
3371 * If handle_edge_cases is true the function will perform computations
3372 * to match the required D3D10+ behavior for each of the edge cases.
3373 * That means that if input is:
3374 * - less than zero (down to and including -inf) then NaN will be returned
3375 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3376 * - +infinity, then +infinity will be returned
3377 * - NaN, then NaN will be returned
3378 *
3379 * Those checks are fairly expensive so if you don't need them make sure
3380 * handle_edge_cases is false.
3381 */
3382 void
3383 lp_build_log2_approx(struct lp_build_context *bld,
3384 LLVMValueRef x,
3385 LLVMValueRef *p_exp,
3386 LLVMValueRef *p_floor_log2,
3387 LLVMValueRef *p_log2,
3388 boolean handle_edge_cases)
3389 {
3390 LLVMBuilderRef builder = bld->gallivm->builder;
3391 const struct lp_type type = bld->type;
3392 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3393 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3394
3395 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3396 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3397 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3398
3399 LLVMValueRef i = NULL;
3400 LLVMValueRef y = NULL;
3401 LLVMValueRef z = NULL;
3402 LLVMValueRef exp = NULL;
3403 LLVMValueRef mant = NULL;
3404 LLVMValueRef logexp = NULL;
3405 LLVMValueRef logmant = NULL;
3406 LLVMValueRef res = NULL;
3407
3408 assert(lp_check_value(bld->type, x));
3409
3410 if(p_exp || p_floor_log2 || p_log2) {
3411 /* TODO: optimize the constant case */
3412 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3413 LLVMIsConstant(x)) {
3414 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3415 __FUNCTION__);
3416 }
3417
3418 assert(type.floating && type.width == 32);
3419
3420 /*
3421 * We don't explicitly handle denormalized numbers. They will yield a
3422 * result in the neighbourhood of -127, which appears to be
3423 * adequate.
3424 */
3425
3426 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3427
3428 /* exp = (float) exponent(x) */
3429 exp = LLVMBuildAnd(builder, i, expmask, "");
3430 }
3431
3432 if(p_floor_log2 || p_log2) {
3433 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3434 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3435 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3436 }
3437
3438 if(p_log2) {
3439 /* mant = 1 + (float) mantissa(x) */
3440 mant = LLVMBuildAnd(builder, i, mantmask, "");
3441 mant = LLVMBuildOr(builder, mant, one, "");
3442 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3443
3444 /* y = (mant - 1) / (mant + 1) */
3445 y = lp_build_div(bld,
3446 lp_build_sub(bld, mant, bld->one),
3447 lp_build_add(bld, mant, bld->one)
3448 );
3449
3450 /* z = y^2 */
3451 z = lp_build_mul(bld, y, y);
3452
3453 /* compute P(z) */
3454 logmant = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3455 Elements(lp_build_log2_polynomial));
3456
3457 /* logmant = y * P(z) */
3458 logmant = lp_build_mul(bld, y, logmant);
3459
3460 res = lp_build_add(bld, logmant, logexp);
3461
3462 if (type.floating && handle_edge_cases) {
3463 LLVMValueRef negmask, infmask, zmask;
3464 negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3465 lp_build_const_vec(bld->gallivm, type, 0.0f));
3466 zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3467 lp_build_const_vec(bld->gallivm, type, 0.0f));
3468 infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3469 lp_build_const_vec(bld->gallivm, type, INFINITY));
3470
3471 /* If x is equal to inf make sure we return inf */
3472 res = lp_build_select(bld, infmask,
3473 lp_build_const_vec(bld->gallivm, type, INFINITY),
3474 res);
3475 /* If x is equal to 0, return -inf */
3476 res = lp_build_select(bld, zmask,
3477 lp_build_const_vec(bld->gallivm, type, -INFINITY),
3478 res);
3479 /* If x is nan or less than 0, return nan */
3480 res = lp_build_select(bld, negmask,
3481 lp_build_const_vec(bld->gallivm, type, NAN),
3482 res);
3483 }
3484 }
3485
3486 if(p_exp) {
3487 exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3488 *p_exp = exp;
3489 }
3490
3491 if(p_floor_log2)
3492 *p_floor_log2 = logexp;
3493
3494 if(p_log2)
3495 *p_log2 = res;
3496 }
3497
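/*
 * The core of the computation above, in scalar form (illustrative only;
 * ref_extract_exponent/ref_extract_mantissa are the sketches given earlier,
 * ref_log2_poly stands for lp_build_log2_polynomial, edge cases omitted):
 *
 *    float ref_log2(float x)
 *    {
 *       int   e = ref_extract_exponent(x, 0);    // floor(log2(x))
 *       float m = ref_extract_mantissa(x);       // x / 2^e, in [1, 2)
 *       float y = (m - 1.0f) / (m + 1.0f);       // in [0, 1/3), so y^2 < 1/9
 *       float z = y * y;
 *       return (float)e + y * ref_log2_poly(z);  // log2(m) = y * P(y^2)
 *    }
 */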
3498
3499 /*
3500 * log2 implementation which doesn't have special code to
3501 * handle edge cases (-inf, 0, inf, NaN). It's faster but
3502 * the results for those cases are undefined.
3503 */
3504 LLVMValueRef
3505 lp_build_log2(struct lp_build_context *bld,
3506 LLVMValueRef x)
3507 {
3508 LLVMValueRef res;
3509 lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
3510 return res;
3511 }
3512
3513 /*
3514 * Version of log2 which handles all edge cases.
3515 * Look at documentation of lp_build_log2_approx for
3516 * description of the behavior for each of the edge cases.
3517 */
3518 LLVMValueRef
3519 lp_build_log2_safe(struct lp_build_context *bld,
3520 LLVMValueRef x)
3521 {
3522 LLVMValueRef res;
3523 lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
3524 return res;
3525 }
3526
3527
3528 /**
3529 * Faster (and less accurate) log2.
3530 *
3531 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3532 *
3533 * Piece-wise linear approximation, with exact results when x is a
3534 * power of two.
3535 *
3536 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3537 */
3538 LLVMValueRef
3539 lp_build_fast_log2(struct lp_build_context *bld,
3540 LLVMValueRef x)
3541 {
3542 LLVMBuilderRef builder = bld->gallivm->builder;
3543 LLVMValueRef ipart;
3544 LLVMValueRef fpart;
3545
3546 assert(lp_check_value(bld->type, x));
3547
3548 assert(bld->type.floating);
3549
3550 /* ipart = floor(log2(x)) - 1 */
3551 ipart = lp_build_extract_exponent(bld, x, -1);
3552 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3553
3554 /* fpart = x / 2**ipart */
3555 fpart = lp_build_extract_mantissa(bld, x);
3556
3557 /* ipart + fpart */
3558 return LLVMBuildFAdd(builder, ipart, fpart, "");
3559 }
3560
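/*
 * Scalar sketch (illustrative only, reusing the extraction helpers sketched
 * earlier):
 *
 *    float ref_fast_log2(float x)
 *    {
 *       int   e = ref_extract_exponent(x, -1);   // floor(log2(x)) - 1
 *       float m = ref_extract_mantissa(x);       // x / 2**floor(log2(x)), in [1, 2)
 *       return (float)e + m;                     // exact when x is a power of two
 *    }
 */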
3561
3562 /**
3563 * Fast implementation of iround(log2(x)).
3564 *
3565 * Not an approximation -- it should give accurate results all the time.
3566 */
3567 LLVMValueRef
3568 lp_build_ilog2(struct lp_build_context *bld,
3569 LLVMValueRef x)
3570 {
3571 LLVMBuilderRef builder = bld->gallivm->builder;
3572 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3573 LLVMValueRef ipart;
3574
3575 assert(bld->type.floating);
3576
3577 assert(lp_check_value(bld->type, x));
3578
3579 /* x * 2^(0.5) i.e., add 0.5 to the log2(x) */
3580 x = LLVMBuildFMul(builder, x, sqrt2, "");
3581
3582 /* ipart = floor(log2(x) + 0.5) */
3583 ipart = lp_build_extract_exponent(bld, x, 0);
3584
3585 return ipart;
3586 }
3587
3588 LLVMValueRef
3589 lp_build_mod(struct lp_build_context *bld,
3590 LLVMValueRef x,
3591 LLVMValueRef y)
3592 {
3593 LLVMBuilderRef builder = bld->gallivm->builder;
3594 LLVMValueRef res;
3595 const struct lp_type type = bld->type;
3596
3597 assert(lp_check_value(type, x));
3598 assert(lp_check_value(type, y));
3599
3600 if (type.floating)
3601 res = LLVMBuildFRem(builder, x, y, "");
3602 else if (type.sign)
3603 res = LLVMBuildSRem(builder, x, y, "");
3604 else
3605 res = LLVMBuildURem(builder, x, y, "");
3606 return res;
3607 }
3608
3609
3610 /*
3611 * For floating inputs it creates and returns a mask
3612 * which is all 1's for channels which are NaN.
3613 * Channels inside x which are not NaN will be 0.
3614 */
3615 LLVMValueRef
3616 lp_build_isnan(struct lp_build_context *bld,
3617 LLVMValueRef x)
3618 {
3619 LLVMValueRef mask;
3620 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3621
3622 assert(bld->type.floating);
3623 assert(lp_check_value(bld->type, x));
3624
3625 mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3626 "isnotnan");
3627 mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3628 mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3629 return mask;
3630 }
3631
3632 /* Returns all 1's for floating point numbers that are
3633 * finite and returns all zeros for -inf,
3634 * +inf and NaN. */
3635 LLVMValueRef
3636 lp_build_isfinite(struct lp_build_context *bld,
3637 LLVMValueRef x)
3638 {
3639 LLVMBuilderRef builder = bld->gallivm->builder;
3640 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3641 struct lp_type int_type = lp_int_type(bld->type);
3642 LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3643 LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3644 0x7f800000);
3645
3646 if (!bld->type.floating) {
3647 return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3648 }
3649 assert(bld->type.floating);
3650 assert(lp_check_value(bld->type, x));
3651 assert(bld->type.width == 32);
3652
3653 intx = LLVMBuildAnd(builder, intx, infornan32, "");
3654 return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3655 intx, infornan32);
3656 }