gallivm,llvmpipe: fix float->srgb conversion to handle NaNs
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in the [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include <float.h>
49
50 #include "util/u_memory.h"
51 #include "util/u_debug.h"
52 #include "util/u_math.h"
53 #include "util/u_string.h"
54 #include "util/u_cpu_detect.h"
55
56 #include "lp_bld_type.h"
57 #include "lp_bld_const.h"
58 #include "lp_bld_init.h"
59 #include "lp_bld_intr.h"
60 #include "lp_bld_logic.h"
61 #include "lp_bld_pack.h"
62 #include "lp_bld_debug.h"
63 #include "lp_bld_bitarit.h"
64 #include "lp_bld_arit.h"
65 #include "lp_bld_flow.h"
66
67
68 #define EXP_POLY_DEGREE 5
69
70 #define LOG_POLY_DEGREE 4
71
72
73 /**
74 * Generate min(a, b)
75 * No checks for special case values (a or b being 0 or 1) are done.
76 * NaNs are handled according to the behavior specified by the
77 * nan_behavior argument.
78 */
79 static LLVMValueRef
80 lp_build_min_simple(struct lp_build_context *bld,
81 LLVMValueRef a,
82 LLVMValueRef b,
83 enum gallivm_nan_behavior nan_behavior)
84 {
85 const struct lp_type type = bld->type;
86 const char *intrinsic = NULL;
87 unsigned intr_size = 0;
88 LLVMValueRef cond;
89
90 assert(lp_check_value(type, a));
91 assert(lp_check_value(type, b));
92
93 /* TODO: optimize the constant case */
94
95 if (type.floating && util_cpu_caps.has_sse) {
96 if (type.width == 32) {
97 if (type.length == 1) {
98 intrinsic = "llvm.x86.sse.min.ss";
99 intr_size = 128;
100 }
101 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
102 intrinsic = "llvm.x86.sse.min.ps";
103 intr_size = 128;
104 }
105 else {
106 intrinsic = "llvm.x86.avx.min.ps.256";
107 intr_size = 256;
108 }
109 }
110 if (type.width == 64 && util_cpu_caps.has_sse2) {
111 if (type.length == 1) {
112 intrinsic = "llvm.x86.sse2.min.sd";
113 intr_size = 128;
114 }
115 else if (type.length == 2 || !util_cpu_caps.has_avx) {
116 intrinsic = "llvm.x86.sse2.min.pd";
117 intr_size = 128;
118 }
119 else {
120 intrinsic = "llvm.x86.avx.min.pd.256";
121 intr_size = 256;
122 }
123 }
124 }
125 else if (type.floating && util_cpu_caps.has_altivec) {
126 if (nan_behavior == GALLIVM_NAN_RETURN_NAN) {
127 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
128 __FUNCTION__);
129 }
130 if (type.width == 32 && type.length == 4) {
131 intrinsic = "llvm.ppc.altivec.vminfp";
132 intr_size = 128;
133 }
134 } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
135 intr_size = 128;
136 if ((type.width == 8 || type.width == 16) &&
137 (type.width * type.length <= 64) &&
138 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
139 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
140 __FUNCTION__);
141 }
142 if (type.width == 8 && !type.sign) {
143 intrinsic = "llvm.x86.sse2.pminu.b";
144 }
145 else if (type.width == 16 && type.sign) {
146 intrinsic = "llvm.x86.sse2.pmins.w";
147 }
148 if (util_cpu_caps.has_sse4_1) {
149 if (type.width == 8 && type.sign) {
150 intrinsic = "llvm.x86.sse41.pminsb";
151 }
152 if (type.width == 16 && !type.sign) {
153 intrinsic = "llvm.x86.sse41.pminuw";
154 }
155 if (type.width == 32 && !type.sign) {
156 intrinsic = "llvm.x86.sse41.pminud";
157 }
158 if (type.width == 32 && type.sign) {
159 intrinsic = "llvm.x86.sse41.pminsd";
160 }
161 }
162 } else if (util_cpu_caps.has_altivec) {
163 intr_size = 128;
164 if (type.width == 8) {
165 if (!type.sign) {
166 intrinsic = "llvm.ppc.altivec.vminub";
167 } else {
168 intrinsic = "llvm.ppc.altivec.vminsb";
169 }
170 } else if (type.width == 16) {
171 if (!type.sign) {
172 intrinsic = "llvm.ppc.altivec.vminuh";
173 } else {
174 intrinsic = "llvm.ppc.altivec.vminsh";
175 }
176 } else if (type.width == 32) {
177 if (!type.sign) {
178 intrinsic = "llvm.ppc.altivec.vminuw";
179 } else {
180 intrinsic = "llvm.ppc.altivec.vminsw";
181 }
182 }
183 }
184
185 if(intrinsic) {
186 /* We need to handle NaNs for floating point numbers. If one of the
187 * inputs is NaN the other should be returned (required by both D3D10+
188 * and OpenCL).
189 * The SSE intrinsics return the second operand in case of NaN by
190 * default, so we need special code to handle those cases.
191 */
192 if (util_cpu_caps.has_sse && type.floating &&
193 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
194 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN) {
195 LLVMValueRef isnan, max;
196 max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
197 type,
198 intr_size, a, b);
199 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
200 isnan = lp_build_isnan(bld, b);
201 return lp_build_select(bld, isnan, a, max);
202 } else {
203 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
204 isnan = lp_build_isnan(bld, a);
205 return lp_build_select(bld, isnan, a, max);
206 }
207 } else {
208 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
209 type,
210 intr_size, a, b);
211 }
212 }
213
214 if (type.floating) {
215 switch (nan_behavior) {
216 case GALLIVM_NAN_RETURN_NAN: {
217 LLVMValueRef isnan = lp_build_isnan(bld, b);
218 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
219 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
220 return lp_build_select(bld, cond, a, b);
221 }
222 break;
223 case GALLIVM_NAN_RETURN_OTHER: {
224 LLVMValueRef isnan = lp_build_isnan(bld, a);
225 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
226 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
227 return lp_build_select(bld, cond, a, b);
228 }
229 break;
230 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
231 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
232 return lp_build_select(bld, cond, a, b);
233 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
234 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
235 return lp_build_select(bld, cond, a, b);
236 break;
237 default:
238 assert(0);
239 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
240 return lp_build_select(bld, cond, a, b);
241 }
242 } else {
243 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
244 return lp_build_select(bld, cond, a, b);
245 }
246 }
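
/*
 * For reference, a scalar sketch of the NaN fixups applied around the SSE
 * intrinsic path above (illustration only; these helpers are not part of
 * this file's API, and "x != x" is used as a header-free NaN test):
 */
static float
example_sse_min(float a, float b)
{
   /* mimics minps: the second operand is returned when either input is NaN */
   return a < b ? a : b;
}

static float
example_min_return_other(float a, float b)
{
   float res = example_sse_min(a, b);   /* if a is NaN this already gives b */
   return (b != b) ? a : res;           /* if b is NaN, take a instead */
}

static float
example_min_return_nan(float a, float b)
{
   float res = example_sse_min(a, b);   /* if b is NaN this already gives b */
   return (a != a) ? a : res;           /* if a is NaN, keep the NaN */
}
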
247
248
249 /**
250 * Generate max(a, b)
251 * No checks for special case values (a or b being 0 or 1) are done.
252 * NaNs are handled according to the behavior specified by the
253 * nan_behavior argument.
254 */
255 static LLVMValueRef
256 lp_build_max_simple(struct lp_build_context *bld,
257 LLVMValueRef a,
258 LLVMValueRef b,
259 enum gallivm_nan_behavior nan_behavior)
260 {
261 const struct lp_type type = bld->type;
262 const char *intrinsic = NULL;
263 unsigned intr_size = 0;
264 LLVMValueRef cond;
265
266 assert(lp_check_value(type, a));
267 assert(lp_check_value(type, b));
268
269 /* TODO: optimize the constant case */
270
271 if (type.floating && util_cpu_caps.has_sse) {
272 if (type.width == 32) {
273 if (type.length == 1) {
274 intrinsic = "llvm.x86.sse.max.ss";
275 intr_size = 128;
276 }
277 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
278 intrinsic = "llvm.x86.sse.max.ps";
279 intr_size = 128;
280 }
281 else {
282 intrinsic = "llvm.x86.avx.max.ps.256";
283 intr_size = 256;
284 }
285 }
286 if (type.width == 64 && util_cpu_caps.has_sse2) {
287 if (type.length == 1) {
288 intrinsic = "llvm.x86.sse2.max.sd";
289 intr_size = 128;
290 }
291 else if (type.length == 2 || !util_cpu_caps.has_avx) {
292 intrinsic = "llvm.x86.sse2.max.pd";
293 intr_size = 128;
294 }
295 else {
296 intrinsic = "llvm.x86.avx.max.pd.256";
297 intr_size = 256;
298 }
299 }
300 }
301 else if (type.floating && util_cpu_caps.has_altivec) {
302 if (nan_behavior == GALLIVM_NAN_RETURN_NAN) {
303 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
304 __FUNCTION__);
305 }
306 if (type.width == 32 && type.length == 4) {
307 intrinsic = "llvm.ppc.altivec.vmaxfp";
308 intr_size = 128;
309 }
310 } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
311 intr_size = 128;
312 if ((type.width == 8 || type.width == 16) &&
313 (type.width * type.length <= 64) &&
314 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
315 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
316 __FUNCTION__);
317 }
318 if (type.width == 8 && !type.sign) {
319 intrinsic = "llvm.x86.sse2.pmaxu.b";
320 intr_size = 128;
321 }
322 else if (type.width == 16 && type.sign) {
323 intrinsic = "llvm.x86.sse2.pmaxs.w";
324 }
325 if (util_cpu_caps.has_sse4_1) {
326 if (type.width == 8 && type.sign) {
327 intrinsic = "llvm.x86.sse41.pmaxsb";
328 }
329 if (type.width == 16 && !type.sign) {
330 intrinsic = "llvm.x86.sse41.pmaxuw";
331 }
332 if (type.width == 32 && !type.sign) {
333 intrinsic = "llvm.x86.sse41.pmaxud";
334 }
335 if (type.width == 32 && type.sign) {
336 intrinsic = "llvm.x86.sse41.pmaxsd";
337 }
338 }
339 } else if (util_cpu_caps.has_altivec) {
340 intr_size = 128;
341 if (type.width == 8) {
342 if (!type.sign) {
343 intrinsic = "llvm.ppc.altivec.vmaxub";
344 } else {
345 intrinsic = "llvm.ppc.altivec.vmaxsb";
346 }
347 } else if (type.width == 16) {
348 if (!type.sign) {
349 intrinsic = "llvm.ppc.altivec.vmaxuh";
350 } else {
351 intrinsic = "llvm.ppc.altivec.vmaxsh";
352 }
353 } else if (type.width == 32) {
354 if (!type.sign) {
355 intrinsic = "llvm.ppc.altivec.vmaxuw";
356 } else {
357 intrinsic = "llvm.ppc.altivec.vmaxsw";
358 }
359 }
360 }
361
362 if(intrinsic) {
363 if (util_cpu_caps.has_sse && type.floating &&
364 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
365 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN) {
366 LLVMValueRef isnan, min;
367 min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
368 type,
369 intr_size, a, b);
370 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
371 isnan = lp_build_isnan(bld, b);
372 return lp_build_select(bld, isnan, a, min);
373 } else {
374 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
375 isnan = lp_build_isnan(bld, a);
376 return lp_build_select(bld, isnan, a, min);
377 }
378 } else {
379 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
380 type,
381 intr_size, a, b);
382 }
383 }
384
385 if (type.floating) {
386 switch (nan_behavior) {
387 case GALLIVM_NAN_RETURN_NAN: {
388 LLVMValueRef isnan = lp_build_isnan(bld, b);
389 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
390 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
391 return lp_build_select(bld, cond, a, b);
392 }
393 break;
394 case GALLIVM_NAN_RETURN_OTHER: {
395 LLVMValueRef isnan = lp_build_isnan(bld, a);
396 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
397 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
398 return lp_build_select(bld, cond, a, b);
399 }
400 break;
401 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
402 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
403 return lp_build_select(bld, cond, a, b);
404 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
405 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
406 return lp_build_select(bld, cond, a, b);
407 break;
408 default:
409 assert(0);
410 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
411 return lp_build_select(bld, cond, a, b);
412 }
413 } else {
414 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
415 return lp_build_select(bld, cond, a, b);
416 }
417 }
418
419
420 /**
421 * Generate 1 - a, or ~a depending on bld->type.
422 */
423 LLVMValueRef
424 lp_build_comp(struct lp_build_context *bld,
425 LLVMValueRef a)
426 {
427 LLVMBuilderRef builder = bld->gallivm->builder;
428 const struct lp_type type = bld->type;
429
430 assert(lp_check_value(type, a));
431
432 if(a == bld->one)
433 return bld->zero;
434 if(a == bld->zero)
435 return bld->one;
436
437 if(type.norm && !type.floating && !type.fixed && !type.sign) {
438 if(LLVMIsConstant(a))
439 return LLVMConstNot(a);
440 else
441 return LLVMBuildNot(builder, a, "");
442 }
443
444 if(LLVMIsConstant(a))
445 if (type.floating)
446 return LLVMConstFSub(bld->one, a);
447 else
448 return LLVMConstSub(bld->one, a);
449 else
450 if (type.floating)
451 return LLVMBuildFSub(builder, bld->one, a, "");
452 else
453 return LLVMBuildSub(builder, bld->one, a, "");
454 }
455
456
457 /**
458 * Generate a + b
459 */
460 LLVMValueRef
461 lp_build_add(struct lp_build_context *bld,
462 LLVMValueRef a,
463 LLVMValueRef b)
464 {
465 LLVMBuilderRef builder = bld->gallivm->builder;
466 const struct lp_type type = bld->type;
467 LLVMValueRef res;
468
469 assert(lp_check_value(type, a));
470 assert(lp_check_value(type, b));
471
472 if(a == bld->zero)
473 return b;
474 if(b == bld->zero)
475 return a;
476 if(a == bld->undef || b == bld->undef)
477 return bld->undef;
478
479 if(bld->type.norm) {
480 const char *intrinsic = NULL;
481
482 if(a == bld->one || b == bld->one)
483 return bld->one;
484
485 if (type.width * type.length == 128 &&
486 !type.floating && !type.fixed) {
487 if(util_cpu_caps.has_sse2) {
488 if(type.width == 8)
489 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
490 if(type.width == 16)
491 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
492 } else if (util_cpu_caps.has_altivec) {
493 if(type.width == 8)
494 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
495 if(type.width == 16)
496 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
497 }
498 }
499
500 if(intrinsic)
501 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
502 }
503
504 /* TODO: handle signed case */
505 if(type.norm && !type.floating && !type.fixed && !type.sign)
506 a = lp_build_min_simple(bld, a, lp_build_comp(bld, b), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
507
508 if(LLVMIsConstant(a) && LLVMIsConstant(b))
509 if (type.floating)
510 res = LLVMConstFAdd(a, b);
511 else
512 res = LLVMConstAdd(a, b);
513 else
514 if (type.floating)
515 res = LLVMBuildFAdd(builder, a, b, "");
516 else
517 res = LLVMBuildAdd(builder, a, b, "");
518
519 /* clamp to ceiling of 1.0 */
520 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
521 res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
522
523 /* XXX clamp to floor of -1 or 0??? */
524
525 return res;
526 }
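
/*
 * A scalar sketch of the unsigned-normalized clamping used above when no
 * saturating intrinsic applies (8-bit case, illustration only): since
 * ~b == 255 - b, clamping a to that bound makes the plain add saturate.
 * The helper name is ours, not part of this file's API.
 */
static unsigned
example_u8_add_sat(unsigned a, unsigned b)    /* a, b in [0, 255] */
{
   unsigned limit = 255 - b;                  /* lp_build_comp(b) for unorm8 */
   if (a > limit)
      a = limit;                              /* lp_build_min_simple(a, ~b) */
   return a + b;                              /* can no longer exceed 255 */
}
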
527
528
529 /** Return the scalar sum of the elements of a.
530 * Callers should avoid this operation whenever possible.
531 */
532 LLVMValueRef
533 lp_build_horizontal_add(struct lp_build_context *bld,
534 LLVMValueRef a)
535 {
536 LLVMBuilderRef builder = bld->gallivm->builder;
537 const struct lp_type type = bld->type;
538 LLVMValueRef index, res;
539 unsigned i, length;
540 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
541 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
542 LLVMValueRef vecres, elem2;
543
544 assert(lp_check_value(type, a));
545
546 if (type.length == 1) {
547 return a;
548 }
549
550 assert(!bld->type.norm);
551
552 /*
553 * For byte vectors one could do much better with psadbw.
554 * We use repeated shuffle/adds here. Note that with multiple vectors
555 * this can be done more efficiently as outlined in the Intel
556 * optimization manual.
557 * Note: could cause data rearrangement if used with smaller element
558 * sizes.
559 */
560
561 vecres = a;
562 length = type.length / 2;
563 while (length > 1) {
564 LLVMValueRef vec1, vec2;
565 for (i = 0; i < length; i++) {
566 shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
567 shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
568 }
569 vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
570 LLVMConstVector(shuffles1, length), "");
571 vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
572 LLVMConstVector(shuffles2, length), "");
573 if (type.floating) {
574 vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
575 }
576 else {
577 vecres = LLVMBuildAdd(builder, vec1, vec2, "");
578 }
579 length = length >> 1;
580 }
581
582 /* always have vector of size 2 here */
583 assert(length == 1);
584
585 index = lp_build_const_int32(bld->gallivm, 0);
586 res = LLVMBuildExtractElement(builder, vecres, index, "");
587 index = lp_build_const_int32(bld->gallivm, 1);
588 elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
589
590 if (type.floating)
591 res = LLVMBuildFAdd(builder, res, elem2, "");
592 else
593 res = LLVMBuildAdd(builder, res, elem2, "");
594
595 return res;
596 }
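
/*
 * Scalar sketch of the reduction performed above (illustration only):
 * repeatedly fold the upper half of the vector onto the lower half until a
 * single element remains. The helper below modifies v in place and assumes
 * a power-of-two length; it is not part of this file's API.
 */
static float
example_horizontal_add(float v[], unsigned length)
{
   unsigned i, half;
   for (half = length / 2; half >= 1; half /= 2) {
      for (i = 0; i < half; i++) {
         v[i] = v[i] + v[i + half];   /* one shuffle + add per iteration */
      }
   }
   return v[0];
}
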
597
598 /**
599 * Return the horizontal sums of 4 float vectors as a float4 vector.
600 * This uses the technique outlined in the Intel Optimization Manual.
601 */
602 static LLVMValueRef
603 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
604 LLVMValueRef src[4])
605 {
606 struct gallivm_state *gallivm = bld->gallivm;
607 LLVMBuilderRef builder = gallivm->builder;
608 LLVMValueRef shuffles[4];
609 LLVMValueRef tmp[4];
610 LLVMValueRef sumtmp[2], shuftmp[2];
611
612 /* lower half of regs */
613 shuffles[0] = lp_build_const_int32(gallivm, 0);
614 shuffles[1] = lp_build_const_int32(gallivm, 1);
615 shuffles[2] = lp_build_const_int32(gallivm, 4);
616 shuffles[3] = lp_build_const_int32(gallivm, 5);
617 tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
618 LLVMConstVector(shuffles, 4), "");
619 tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
620 LLVMConstVector(shuffles, 4), "");
621
622 /* upper half of regs */
623 shuffles[0] = lp_build_const_int32(gallivm, 2);
624 shuffles[1] = lp_build_const_int32(gallivm, 3);
625 shuffles[2] = lp_build_const_int32(gallivm, 6);
626 shuffles[3] = lp_build_const_int32(gallivm, 7);
627 tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
628 LLVMConstVector(shuffles, 4), "");
629 tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
630 LLVMConstVector(shuffles, 4), "");
631
632 sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
633 sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
634
635 shuffles[0] = lp_build_const_int32(gallivm, 0);
636 shuffles[1] = lp_build_const_int32(gallivm, 2);
637 shuffles[2] = lp_build_const_int32(gallivm, 4);
638 shuffles[3] = lp_build_const_int32(gallivm, 6);
639 shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
640 LLVMConstVector(shuffles, 4), "");
641
642 shuffles[0] = lp_build_const_int32(gallivm, 1);
643 shuffles[1] = lp_build_const_int32(gallivm, 3);
644 shuffles[2] = lp_build_const_int32(gallivm, 5);
645 shuffles[3] = lp_build_const_int32(gallivm, 7);
646 shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
647 LLVMConstVector(shuffles, 4), "");
648
649 return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
650 }
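
/*
 * The net effect of the shuffle network above, as a scalar sketch
 * (illustration only; the helper is not part of this file's API):
 * each output lane is the sum of the four lanes of one source vector.
 */
static void
example_horizontal_add4x4f(const float src[4][4], float out[4])
{
   unsigned i;
   for (i = 0; i < 4; i++) {
      out[i] = src[i][0] + src[i][1] + src[i][2] + src[i][3];
   }
}
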
651
652
653 /*
654 * partially horizontally add 2-4 float vectors with length nx4,
655 * i.e. only four adjacent values in each vector will be added,
656 * assuming the values are grouped in fours, which also determines the
657 * output order.
658 *
659 * Return a vector of the same length as the initial vectors,
660 * with the excess elements (if any) being undefined.
661 * The element order is independent of number of input vectors.
662 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
663 * the output order thus will be
664 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
665 */
666 LLVMValueRef
667 lp_build_hadd_partial4(struct lp_build_context *bld,
668 LLVMValueRef vectors[],
669 unsigned num_vecs)
670 {
671 struct gallivm_state *gallivm = bld->gallivm;
672 LLVMBuilderRef builder = gallivm->builder;
673 LLVMValueRef ret_vec;
674 LLVMValueRef tmp[4];
675 const char *intrinsic = NULL;
676
677 assert(num_vecs >= 2 && num_vecs <= 4);
678 assert(bld->type.floating);
679
680 /* only use this with at least 2 vectors, as it is sort of expensive
681 * (depending on cpu) and we always need two horizontal adds anyway,
682 * so a shuffle/add approach might be better.
683 */
684
685 tmp[0] = vectors[0];
686 tmp[1] = vectors[1];
687
688 tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
689 tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
690
691 if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
692 bld->type.length == 4) {
693 intrinsic = "llvm.x86.sse3.hadd.ps";
694 }
695 else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
696 bld->type.length == 8) {
697 intrinsic = "llvm.x86.avx.hadd.ps.256";
698 }
699 if (intrinsic) {
700 tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
701 lp_build_vec_type(gallivm, bld->type),
702 tmp[0], tmp[1]);
703 if (num_vecs > 2) {
704 tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
705 lp_build_vec_type(gallivm, bld->type),
706 tmp[2], tmp[3]);
707 }
708 else {
709 tmp[1] = tmp[0];
710 }
711 return lp_build_intrinsic_binary(builder, intrinsic,
712 lp_build_vec_type(gallivm, bld->type),
713 tmp[0], tmp[1]);
714 }
715
716 if (bld->type.length == 4) {
717 ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
718 }
719 else {
720 LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
721 unsigned j;
722 unsigned num_iter = bld->type.length / 4;
723 struct lp_type parttype = bld->type;
724 parttype.length = 4;
725 for (j = 0; j < num_iter; j++) {
726 LLVMValueRef partsrc[4];
727 unsigned i;
728 for (i = 0; i < 4; i++) {
729 partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
730 }
731 partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
732 }
733 ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
734 }
735 return ret_vec;
736 }
737
738 /**
739 * Generate a - b
740 */
741 LLVMValueRef
742 lp_build_sub(struct lp_build_context *bld,
743 LLVMValueRef a,
744 LLVMValueRef b)
745 {
746 LLVMBuilderRef builder = bld->gallivm->builder;
747 const struct lp_type type = bld->type;
748 LLVMValueRef res;
749
750 assert(lp_check_value(type, a));
751 assert(lp_check_value(type, b));
752
753 if(b == bld->zero)
754 return a;
755 if(a == bld->undef || b == bld->undef)
756 return bld->undef;
757 if(a == b)
758 return bld->zero;
759
760 if(bld->type.norm) {
761 const char *intrinsic = NULL;
762
763 if(b == bld->one)
764 return bld->zero;
765
766 if (type.width * type.length == 128 &&
767 !type.floating && !type.fixed) {
768 if (util_cpu_caps.has_sse2) {
769 if(type.width == 8)
770 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
771 if(type.width == 16)
772 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
773 } else if (util_cpu_caps.has_altivec) {
774 if(type.width == 8)
775 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
776 if(type.width == 16)
777 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
778 }
779 }
780
781 if(intrinsic)
782 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
783 }
784
785 /* TODO: handle signed case */
786 if(type.norm && !type.floating && !type.fixed && !type.sign)
787 a = lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
788
789 if(LLVMIsConstant(a) && LLVMIsConstant(b))
790 if (type.floating)
791 res = LLVMConstFSub(a, b);
792 else
793 res = LLVMConstSub(a, b);
794 else
795 if (type.floating)
796 res = LLVMBuildFSub(builder, a, b, "");
797 else
798 res = LLVMBuildSub(builder, a, b, "");
799
800 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
801 res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
802
803 return res;
804 }
805
806
807
808 /**
809 * Normalized multiplication.
810 *
811 * There are several approaches for (using 8-bit normalized multiplication as
812 * an example):
813 *
814 * - alpha plus one
815 *
816 * makes the following approximation to the division (Sree)
817 *
818 * a*b/255 ~= (a*(b + 1)) >> 8
819 *
820 * which is the fastest method that satisfies the following OpenGL criteria of
821 *
822 * 0*0 = 0 and 255*255 = 255
823 *
824 * - geometric series
825 *
826 * takes the geometric series approximation to the division
827 *
828 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
829 *
830 * in this case just the first two terms are used, to fit in 16-bit arithmetic
831 *
832 * t/255 ~= (t + (t >> 8)) >> 8
833 *
834 * note that just by itself it doesn't satisfy the OpenGL criteria, as
835 * 255*255 yields 254, so the special case b = 255 must be accounted for, or
836 * roundoff must be used.
837 *
838 * - geometric series plus rounding
839 *
840 * when using the geometric series division, instead of truncating the result,
841 * use roundoff in the approximation (Jim Blinn)
842 *
843 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
844 *
845 * achieving exact results.
846 *
847 *
848 *
849 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
850 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
851 * @sa Michael Herf, The "double blend trick", May 2000,
852 * http://www.stereopsis.com/doubleblend.html
853 */
854 static LLVMValueRef
855 lp_build_mul_norm(struct gallivm_state *gallivm,
856 struct lp_type wide_type,
857 LLVMValueRef a, LLVMValueRef b)
858 {
859 LLVMBuilderRef builder = gallivm->builder;
860 struct lp_build_context bld;
861 unsigned n;
862 LLVMValueRef half;
863 LLVMValueRef ab;
864
865 assert(!wide_type.floating);
866 assert(lp_check_value(wide_type, a));
867 assert(lp_check_value(wide_type, b));
868
869 lp_build_context_init(&bld, gallivm, wide_type);
870
871 n = wide_type.width / 2;
872 if (wide_type.sign) {
873 --n;
874 }
875
876 /*
877 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
878 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
879 */
880
881 /*
882 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
883 */
884
885 ab = LLVMBuildMul(builder, a, b, "");
886 ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
887
888 /*
889 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
890 */
891
892 half = lp_build_const_int_vec(gallivm, wide_type, 1 << (n - 1));
893 if (wide_type.sign) {
894 LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
895 LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
896 half = lp_build_select(&bld, sign, minus_half, half);
897 }
898 ab = LLVMBuildAdd(builder, ab, half, "");
899
900 /* Final division */
901 ab = lp_build_shr_imm(&bld, ab, n);
902
903 return ab;
904 }
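
/*
 * A scalar sketch of the unsigned 8-bit case generated above (n = 8, with a
 * 16-bit wide type), following the same operation order as the code; the
 * helper is for illustration only and not part of this file's API:
 */
static unsigned
example_u8_mul_norm(unsigned a, unsigned b)   /* a, b in [0, 255] */
{
   unsigned ab = a * b;                       /* at most 255*255, fits in 16 bits */
   ab = ab + (ab >> 8);                       /* geometric series term */
   ab = ab + 0x80;                            /* roundoff instead of truncation */
   return ab >> 8;                            /* 0*0 -> 0, 255*255 -> 255 */
}
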
905
906 /**
907 * Generate a * b
908 */
909 LLVMValueRef
910 lp_build_mul(struct lp_build_context *bld,
911 LLVMValueRef a,
912 LLVMValueRef b)
913 {
914 LLVMBuilderRef builder = bld->gallivm->builder;
915 const struct lp_type type = bld->type;
916 LLVMValueRef shift;
917 LLVMValueRef res;
918
919 assert(lp_check_value(type, a));
920 assert(lp_check_value(type, b));
921
922 if(a == bld->zero)
923 return bld->zero;
924 if(a == bld->one)
925 return b;
926 if(b == bld->zero)
927 return bld->zero;
928 if(b == bld->one)
929 return a;
930 if(a == bld->undef || b == bld->undef)
931 return bld->undef;
932
933 if (!type.floating && !type.fixed && type.norm) {
934 struct lp_type wide_type = lp_wider_type(type);
935 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
936
937 lp_build_unpack2(bld->gallivm, type, wide_type, a, &al, &ah);
938 lp_build_unpack2(bld->gallivm, type, wide_type, b, &bl, &bh);
939
940 /* PMULLW, PSRLW, PADDW */
941 abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
942 abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
943
944 ab = lp_build_pack2(bld->gallivm, wide_type, type, abl, abh);
945
946 return ab;
947 }
948
949 if(type.fixed)
950 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
951 else
952 shift = NULL;
953
954 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
955 if (type.floating)
956 res = LLVMConstFMul(a, b);
957 else
958 res = LLVMConstMul(a, b);
959 if(shift) {
960 if(type.sign)
961 res = LLVMConstAShr(res, shift);
962 else
963 res = LLVMConstLShr(res, shift);
964 }
965 }
966 else {
967 if (type.floating)
968 res = LLVMBuildFMul(builder, a, b, "");
969 else
970 res = LLVMBuildMul(builder, a, b, "");
971 if(shift) {
972 if(type.sign)
973 res = LLVMBuildAShr(builder, res, shift, "");
974 else
975 res = LLVMBuildLShr(builder, res, shift, "");
976 }
977 }
978
979 return res;
980 }
981
982
983 /**
984 * Small vector x scale multiplication optimization.
985 */
986 LLVMValueRef
987 lp_build_mul_imm(struct lp_build_context *bld,
988 LLVMValueRef a,
989 int b)
990 {
991 LLVMBuilderRef builder = bld->gallivm->builder;
992 LLVMValueRef factor;
993
994 assert(lp_check_value(bld->type, a));
995
996 if(b == 0)
997 return bld->zero;
998
999 if(b == 1)
1000 return a;
1001
1002 if(b == -1)
1003 return lp_build_negate(bld, a);
1004
1005 if(b == 2 && bld->type.floating)
1006 return lp_build_add(bld, a, a);
1007
1008 if(util_is_power_of_two(b)) {
1009 unsigned shift = ffs(b) - 1;
1010
1011 if(bld->type.floating) {
1012 #if 0
1013 /*
1014 * Power of two multiplication by directly manipulating the exponent.
1015 *
1016 * XXX: This might not be always faster, it will introduce a small error
1017 * for multiplication by zero, and it will produce wrong results
1018 * for Inf and NaN.
1019 */
1020 unsigned mantissa = lp_mantissa(bld->type);
1021 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1022 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1023 a = LLVMBuildAdd(builder, a, factor, "");
1024 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1025 return a;
1026 #endif
1027 }
1028 else {
1029 factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1030 return LLVMBuildShl(builder, a, factor, "");
1031 }
1032 }
1033
1034 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1035 return lp_build_mul(bld, a, factor);
1036 }
1037
1038
1039 /**
1040 * Generate a / b
1041 */
1042 LLVMValueRef
1043 lp_build_div(struct lp_build_context *bld,
1044 LLVMValueRef a,
1045 LLVMValueRef b)
1046 {
1047 LLVMBuilderRef builder = bld->gallivm->builder;
1048 const struct lp_type type = bld->type;
1049
1050 assert(lp_check_value(type, a));
1051 assert(lp_check_value(type, b));
1052
1053 if(a == bld->zero)
1054 return bld->zero;
1055 if(a == bld->one)
1056 return lp_build_rcp(bld, b);
1057 if(b == bld->zero)
1058 return bld->undef;
1059 if(b == bld->one)
1060 return a;
1061 if(a == bld->undef || b == bld->undef)
1062 return bld->undef;
1063
1064 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1065 if (type.floating)
1066 return LLVMConstFDiv(a, b);
1067 else if (type.sign)
1068 return LLVMConstSDiv(a, b);
1069 else
1070 return LLVMConstUDiv(a, b);
1071 }
1072
1073 if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1074 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
1075 type.floating)
1076 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1077
1078 if (type.floating)
1079 return LLVMBuildFDiv(builder, a, b, "");
1080 else if (type.sign)
1081 return LLVMBuildSDiv(builder, a, b, "");
1082 else
1083 return LLVMBuildUDiv(builder, a, b, "");
1084 }
1085
1086
1087 /**
1088 * Linear interpolation helper.
1089 *
1090 * @param flags LP_BLD_LERP_WIDE_NORMALIZED when interpolating normalized
1091 * values encoded in integers twice their natural width.
1092 *
1093 * @sa http://www.stereopsis.com/doubleblend.html
1094 */
1095 static INLINE LLVMValueRef
1096 lp_build_lerp_simple(struct lp_build_context *bld,
1097 LLVMValueRef x,
1098 LLVMValueRef v0,
1099 LLVMValueRef v1,
1100 unsigned flags)
1101 {
1102 unsigned half_width = bld->type.width/2;
1103 LLVMBuilderRef builder = bld->gallivm->builder;
1104 LLVMValueRef delta;
1105 LLVMValueRef res;
1106
1107 assert(lp_check_value(bld->type, x));
1108 assert(lp_check_value(bld->type, v0));
1109 assert(lp_check_value(bld->type, v1));
1110
1111 delta = lp_build_sub(bld, v1, v0);
1112
1113 if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1114 if (!bld->type.sign) {
1115 if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1116 /*
1117 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1118 * most-significant-bit to the least-significant-bit, so that
1119 * later we can just divide by 2**n instead of 2**n - 1.
1120 */
1121
1122 x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1123 }
1124
1125 /* (x * delta) >> n */
1126 res = lp_build_mul(bld, x, delta);
1127 res = lp_build_shr_imm(bld, res, half_width);
1128 } else {
1129 /*
1130 * The rescaling trick above doesn't work for signed numbers, so
1131 * use the 2**n - 1 division approximation in lp_build_mul_norm
1132 * instead.
1133 */
1134 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1135 res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1136 }
1137 } else {
1138 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1139 res = lp_build_mul(bld, x, delta);
1140 }
1141
1142 res = lp_build_add(bld, v0, res);
1143
1144 if (((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) ||
1145 bld->type.fixed) {
1146 /* We need to mask out the high order bits when lerping 8-bit normalized colors stored in 16 bits */
1147 /* XXX: This step is necessary for lerping 8-bit colors stored in 16 bits,
1148 * but it will be wrong for true fixed point use cases. Basically we need
1149 * a more powerful lp_type, capable of further distinguishing the value's
1150 * interpretation from the value's storage. */
1151 res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1), "");
1152 }
1153
1154 return res;
1155 }
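
/*
 * Scalar sketch of the unsigned LP_BLD_LERP_WIDE_NORMALIZED path above, for
 * 8-bit values held in a 16-bit wide type (half_width = 8). The final mask
 * is what keeps the modular 16-bit arithmetic correct even when v1 < v0.
 * Illustration only; the helper is not part of this file's API.
 */
static unsigned
example_lerp_u8(unsigned x, unsigned v0, unsigned v1)   /* all in [0, 255] */
{
   unsigned delta = (v1 - v0) & 0xffff;   /* wraps like the 16-bit IR value */
   unsigned res;
   x = x + (x >> 7);                      /* scale [0, 255] to [0, 256] */
   res = ((x * delta) & 0xffff) >> 8;     /* (x * delta) >> 8, 16 bits wide */
   return (v0 + res) & 0xff;              /* add v0, mask out the high bits */
}
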
1156
1157
1158 /**
1159 * Linear interpolation.
1160 */
1161 LLVMValueRef
1162 lp_build_lerp(struct lp_build_context *bld,
1163 LLVMValueRef x,
1164 LLVMValueRef v0,
1165 LLVMValueRef v1,
1166 unsigned flags)
1167 {
1168 const struct lp_type type = bld->type;
1169 LLVMValueRef res;
1170
1171 assert(lp_check_value(type, x));
1172 assert(lp_check_value(type, v0));
1173 assert(lp_check_value(type, v1));
1174
1175 assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1176
1177 if (type.norm) {
1178 struct lp_type wide_type;
1179 struct lp_build_context wide_bld;
1180 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1181
1182 assert(type.length >= 2);
1183
1184 /*
1185 * Create a wider integer type, enough to hold the
1186 * intermediate result of the multiplication.
1187 */
1188 memset(&wide_type, 0, sizeof wide_type);
1189 wide_type.sign = type.sign;
1190 wide_type.width = type.width*2;
1191 wide_type.length = type.length/2;
1192
1193 lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1194
1195 lp_build_unpack2(bld->gallivm, type, wide_type, x, &xl, &xh);
1196 lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1197 lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1198
1199 /*
1200 * Lerp both halves.
1201 */
1202
1203 flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1204
1205 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1206 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1207
1208 res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
1209 } else {
1210 res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1211 }
1212
1213 return res;
1214 }
1215
1216
1217 /**
1218 * Bilinear interpolation.
1219 *
1220 * Value indices are in v_{yx}.
1221 */
1222 LLVMValueRef
1223 lp_build_lerp_2d(struct lp_build_context *bld,
1224 LLVMValueRef x,
1225 LLVMValueRef y,
1226 LLVMValueRef v00,
1227 LLVMValueRef v01,
1228 LLVMValueRef v10,
1229 LLVMValueRef v11,
1230 unsigned flags)
1231 {
1232 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1233 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1234 return lp_build_lerp(bld, y, v0, v1, flags);
1235 }
1236
1237
1238 LLVMValueRef
1239 lp_build_lerp_3d(struct lp_build_context *bld,
1240 LLVMValueRef x,
1241 LLVMValueRef y,
1242 LLVMValueRef z,
1243 LLVMValueRef v000,
1244 LLVMValueRef v001,
1245 LLVMValueRef v010,
1246 LLVMValueRef v011,
1247 LLVMValueRef v100,
1248 LLVMValueRef v101,
1249 LLVMValueRef v110,
1250 LLVMValueRef v111,
1251 unsigned flags)
1252 {
1253 LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1254 LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1255 return lp_build_lerp(bld, z, v0, v1, flags);
1256 }
1257
1258
1259 /**
1260 * Generate min(a, b)
1261 * Do checks for special cases but not for NaNs.
1262 */
1263 LLVMValueRef
1264 lp_build_min(struct lp_build_context *bld,
1265 LLVMValueRef a,
1266 LLVMValueRef b)
1267 {
1268 assert(lp_check_value(bld->type, a));
1269 assert(lp_check_value(bld->type, b));
1270
1271 if(a == bld->undef || b == bld->undef)
1272 return bld->undef;
1273
1274 if(a == b)
1275 return a;
1276
1277 if (bld->type.norm) {
1278 if (!bld->type.sign) {
1279 if (a == bld->zero || b == bld->zero) {
1280 return bld->zero;
1281 }
1282 }
1283 if(a == bld->one)
1284 return b;
1285 if(b == bld->one)
1286 return a;
1287 }
1288
1289 return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1290 }
1291
1292
1293 /**
1294 * Generate min(a, b)
1295 * NaNs are handled according to the behavior specified by the
1296 * nan_behavior argument.
1297 */
1298 LLVMValueRef
1299 lp_build_min_ext(struct lp_build_context *bld,
1300 LLVMValueRef a,
1301 LLVMValueRef b,
1302 enum gallivm_nan_behavior nan_behavior)
1303 {
1304 assert(lp_check_value(bld->type, a));
1305 assert(lp_check_value(bld->type, b));
1306
1307 if(a == bld->undef || b == bld->undef)
1308 return bld->undef;
1309
1310 if(a == b)
1311 return a;
1312
1313 if (bld->type.norm) {
1314 if (!bld->type.sign) {
1315 if (a == bld->zero || b == bld->zero) {
1316 return bld->zero;
1317 }
1318 }
1319 if(a == bld->one)
1320 return b;
1321 if(b == bld->one)
1322 return a;
1323 }
1324
1325 return lp_build_min_simple(bld, a, b, nan_behavior);
1326 }
1327
1328 /**
1329 * Generate max(a, b)
1330 * Do checks for special cases, but NaN behavior is undefined.
1331 */
1332 LLVMValueRef
1333 lp_build_max(struct lp_build_context *bld,
1334 LLVMValueRef a,
1335 LLVMValueRef b)
1336 {
1337 assert(lp_check_value(bld->type, a));
1338 assert(lp_check_value(bld->type, b));
1339
1340 if(a == bld->undef || b == bld->undef)
1341 return bld->undef;
1342
1343 if(a == b)
1344 return a;
1345
1346 if(bld->type.norm) {
1347 if(a == bld->one || b == bld->one)
1348 return bld->one;
1349 if (!bld->type.sign) {
1350 if (a == bld->zero) {
1351 return b;
1352 }
1353 if (b == bld->zero) {
1354 return a;
1355 }
1356 }
1357 }
1358
1359 return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1360 }
1361
1362
1363 /**
1364 * Generate max(a, b)
1365 * Checks for special cases.
1366 * NaNs are handled according to the behavior specified by the
1367 * nan_behavior argument.
1368 */
1369 LLVMValueRef
1370 lp_build_max_ext(struct lp_build_context *bld,
1371 LLVMValueRef a,
1372 LLVMValueRef b,
1373 enum gallivm_nan_behavior nan_behavior)
1374 {
1375 assert(lp_check_value(bld->type, a));
1376 assert(lp_check_value(bld->type, b));
1377
1378 if(a == bld->undef || b == bld->undef)
1379 return bld->undef;
1380
1381 if(a == b)
1382 return a;
1383
1384 if(bld->type.norm) {
1385 if(a == bld->one || b == bld->one)
1386 return bld->one;
1387 if (!bld->type.sign) {
1388 if (a == bld->zero) {
1389 return b;
1390 }
1391 if (b == bld->zero) {
1392 return a;
1393 }
1394 }
1395 }
1396
1397 return lp_build_max_simple(bld, a, b, nan_behavior);
1398 }
1399
1400 /**
1401 * Generate clamp(a, min, max)
1402 * NaN behavior (for any of a, min, max) is undefined.
1403 * Do checks for special cases.
1404 */
1405 LLVMValueRef
1406 lp_build_clamp(struct lp_build_context *bld,
1407 LLVMValueRef a,
1408 LLVMValueRef min,
1409 LLVMValueRef max)
1410 {
1411 assert(lp_check_value(bld->type, a));
1412 assert(lp_check_value(bld->type, min));
1413 assert(lp_check_value(bld->type, max));
1414
1415 a = lp_build_min(bld, a, max);
1416 a = lp_build_max(bld, a, min);
1417 return a;
1418 }
1419
1420
1421 /**
1422 * Generate clamp(a, 0, 1)
1423 * A NaN will get converted to zero.
1424 */
1425 LLVMValueRef
1426 lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
1427 LLVMValueRef a)
1428 {
1429 a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1430 a = lp_build_min(bld, a, bld->one);
1431 return a;
1432 }
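
/*
 * Scalar sketch of the clamp above (illustration only; the helper is not
 * part of this file's API): the ordered greater-than compare used for the
 * max is false for NaN, so a NaN input becomes zero before the min.
 */
static float
example_clamp01_nanzero(float a)
{
   a = (a > 0.0f) ? a : 0.0f;   /* NaN fails the ordered compare -> 0.0 */
   return (a < 1.0f) ? a : 1.0f;
}
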
1433
1434
1435 /**
1436 * Generate abs(a)
1437 */
1438 LLVMValueRef
1439 lp_build_abs(struct lp_build_context *bld,
1440 LLVMValueRef a)
1441 {
1442 LLVMBuilderRef builder = bld->gallivm->builder;
1443 const struct lp_type type = bld->type;
1444 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1445
1446 assert(lp_check_value(type, a));
1447
1448 if(!type.sign)
1449 return a;
1450
1451 if(type.floating) {
1452 /* Mask out the sign bit */
1453 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1454 unsigned long long absMask = ~(1ULL << (type.width - 1));
1455 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
1456 a = LLVMBuildBitCast(builder, a, int_vec_type, "");
1457 a = LLVMBuildAnd(builder, a, mask, "");
1458 a = LLVMBuildBitCast(builder, a, vec_type, "");
1459 return a;
1460 }
1461
1462 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
1463 switch(type.width) {
1464 case 8:
1465 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1466 case 16:
1467 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1468 case 32:
1469 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1470 }
1471 }
1472 else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
1473 (gallivm_debug & GALLIVM_DEBUG_PERF) &&
1474 (type.width == 8 || type.width == 16 || type.width == 32)) {
1475 debug_printf("%s: inefficient code, should split vectors manually\n",
1476 __FUNCTION__);
1477 }
1478
1479 return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
1480 }
1481
1482
1483 LLVMValueRef
1484 lp_build_negate(struct lp_build_context *bld,
1485 LLVMValueRef a)
1486 {
1487 LLVMBuilderRef builder = bld->gallivm->builder;
1488
1489 assert(lp_check_value(bld->type, a));
1490
1491 #if HAVE_LLVM >= 0x0207
1492 if (bld->type.floating)
1493 a = LLVMBuildFNeg(builder, a, "");
1494 else
1495 #endif
1496 a = LLVMBuildNeg(builder, a, "");
1497
1498 return a;
1499 }
1500
1501
1502 /** Return -1, 0 or +1 depending on the sign of a */
1503 LLVMValueRef
1504 lp_build_sgn(struct lp_build_context *bld,
1505 LLVMValueRef a)
1506 {
1507 LLVMBuilderRef builder = bld->gallivm->builder;
1508 const struct lp_type type = bld->type;
1509 LLVMValueRef cond;
1510 LLVMValueRef res;
1511
1512 assert(lp_check_value(type, a));
1513
1514 /* Handle non-zero case */
1515 if(!type.sign) {
1516 /* if not zero then sign must be positive */
1517 res = bld->one;
1518 }
1519 else if(type.floating) {
1520 LLVMTypeRef vec_type;
1521 LLVMTypeRef int_type;
1522 LLVMValueRef mask;
1523 LLVMValueRef sign;
1524 LLVMValueRef one;
1525 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1526
1527 int_type = lp_build_int_vec_type(bld->gallivm, type);
1528 vec_type = lp_build_vec_type(bld->gallivm, type);
1529 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1530
1531 /* Take the sign bit and or it into the constant 1.0 */
1532 sign = LLVMBuildBitCast(builder, a, int_type, "");
1533 sign = LLVMBuildAnd(builder, sign, mask, "");
1534 one = LLVMConstBitCast(bld->one, int_type);
1535 res = LLVMBuildOr(builder, sign, one, "");
1536 res = LLVMBuildBitCast(builder, res, vec_type, "");
1537 }
1538 else
1539 {
1540 /* signed int/norm/fixed point */
1541 /* could use psign with sse3 and appropriate vectors here */
1542 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1543 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1544 res = lp_build_select(bld, cond, bld->one, minus_one);
1545 }
1546
1547 /* Handle zero */
1548 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1549 res = lp_build_select(bld, cond, bld->zero, res);
1550
1551 return res;
1552 }
1553
1554
1555 /**
1556 * Set the sign of float vector 'a' according to 'sign'.
1557 * If sign==0, return abs(a).
1558 * If sign==1, return -abs(a);
1559 * Other values for sign produce undefined results.
1560 */
1561 LLVMValueRef
1562 lp_build_set_sign(struct lp_build_context *bld,
1563 LLVMValueRef a, LLVMValueRef sign)
1564 {
1565 LLVMBuilderRef builder = bld->gallivm->builder;
1566 const struct lp_type type = bld->type;
1567 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1568 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1569 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1570 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1571 ~((unsigned long long) 1 << (type.width - 1)));
1572 LLVMValueRef val, res;
1573
1574 assert(type.floating);
1575 assert(lp_check_value(type, a));
1576
1577 /* val = reinterpret_cast<int>(a) */
1578 val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1579 /* val = val & mask */
1580 val = LLVMBuildAnd(builder, val, mask, "");
1581 /* sign = sign << shift */
1582 sign = LLVMBuildShl(builder, sign, shift, "");
1583 /* res = val | sign */
1584 res = LLVMBuildOr(builder, val, sign, "");
1585 /* res = reinterpret_cast<float>(res) */
1586 res = LLVMBuildBitCast(builder, res, vec_type, "");
1587
1588 return res;
1589 }
1590
1591
1592 /**
1593 * Convert vector of (or scalar) int to vector of (or scalar) float.
1594 */
1595 LLVMValueRef
1596 lp_build_int_to_float(struct lp_build_context *bld,
1597 LLVMValueRef a)
1598 {
1599 LLVMBuilderRef builder = bld->gallivm->builder;
1600 const struct lp_type type = bld->type;
1601 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1602
1603 assert(type.floating);
1604
1605 return LLVMBuildSIToFP(builder, a, vec_type, "");
1606 }
1607
1608 static boolean
1609 arch_rounding_available(const struct lp_type type)
1610 {
1611 if ((util_cpu_caps.has_sse4_1 &&
1612 (type.length == 1 || type.width*type.length == 128)) ||
1613 (util_cpu_caps.has_avx && type.width*type.length == 256))
1614 return TRUE;
1615 else if ((util_cpu_caps.has_altivec &&
1616 (type.width == 32 && type.length == 4)))
1617 return TRUE;
1618
1619 return FALSE;
1620 }
1621
1622 enum lp_build_round_mode
1623 {
1624 LP_BUILD_ROUND_NEAREST = 0,
1625 LP_BUILD_ROUND_FLOOR = 1,
1626 LP_BUILD_ROUND_CEIL = 2,
1627 LP_BUILD_ROUND_TRUNCATE = 3
1628 };
1629
1630 /**
1631 * Helper for SSE4.1's ROUNDxx instructions.
1632 *
1633 * NOTE: In SSE4.1's nearest mode, if two values are equally close, the
1634 * result is the even value. That is, rounding 2.5 will be 2.0, and not 3.0.
1635 */
1636 static INLINE LLVMValueRef
1637 lp_build_round_sse41(struct lp_build_context *bld,
1638 LLVMValueRef a,
1639 enum lp_build_round_mode mode)
1640 {
1641 LLVMBuilderRef builder = bld->gallivm->builder;
1642 const struct lp_type type = bld->type;
1643 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1644 const char *intrinsic;
1645 LLVMValueRef res;
1646
1647 assert(type.floating);
1648
1649 assert(lp_check_value(type, a));
1650 assert(util_cpu_caps.has_sse4_1);
1651
1652 if (type.length == 1) {
1653 LLVMTypeRef vec_type;
1654 LLVMValueRef undef;
1655 LLVMValueRef args[3];
1656 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1657
1658 switch(type.width) {
1659 case 32:
1660 intrinsic = "llvm.x86.sse41.round.ss";
1661 break;
1662 case 64:
1663 intrinsic = "llvm.x86.sse41.round.sd";
1664 break;
1665 default:
1666 assert(0);
1667 return bld->undef;
1668 }
1669
1670 vec_type = LLVMVectorType(bld->elem_type, 4);
1671
1672 undef = LLVMGetUndef(vec_type);
1673
1674 args[0] = undef;
1675 args[1] = LLVMBuildInsertElement(builder, undef, a, index0, "");
1676 args[2] = LLVMConstInt(i32t, mode, 0);
1677
1678 res = lp_build_intrinsic(builder, intrinsic,
1679 vec_type, args, Elements(args));
1680
1681 res = LLVMBuildExtractElement(builder, res, index0, "");
1682 }
1683 else {
1684 if (type.width * type.length == 128) {
1685 switch(type.width) {
1686 case 32:
1687 intrinsic = "llvm.x86.sse41.round.ps";
1688 break;
1689 case 64:
1690 intrinsic = "llvm.x86.sse41.round.pd";
1691 break;
1692 default:
1693 assert(0);
1694 return bld->undef;
1695 }
1696 }
1697 else {
1698 assert(type.width * type.length == 256);
1699 assert(util_cpu_caps.has_avx);
1700
1701 switch(type.width) {
1702 case 32:
1703 intrinsic = "llvm.x86.avx.round.ps.256";
1704 break;
1705 case 64:
1706 intrinsic = "llvm.x86.avx.round.pd.256";
1707 break;
1708 default:
1709 assert(0);
1710 return bld->undef;
1711 }
1712 }
1713
1714 res = lp_build_intrinsic_binary(builder, intrinsic,
1715 bld->vec_type, a,
1716 LLVMConstInt(i32t, mode, 0));
1717 }
1718
1719 return res;
1720 }
1721
1722
1723 static INLINE LLVMValueRef
1724 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1725 LLVMValueRef a)
1726 {
1727 LLVMBuilderRef builder = bld->gallivm->builder;
1728 const struct lp_type type = bld->type;
1729 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1730 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1731 const char *intrinsic;
1732 LLVMValueRef res;
1733
1734 assert(type.floating);
1735 /* using the double precision conversions is a bit more complicated */
1736 assert(type.width == 32);
1737
1738 assert(lp_check_value(type, a));
1739 assert(util_cpu_caps.has_sse2);
1740
1741 /* This relies on the MXCSR rounding mode, which should always be nearest. */
1742 if (type.length == 1) {
1743 LLVMTypeRef vec_type;
1744 LLVMValueRef undef;
1745 LLVMValueRef arg;
1746 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1747
1748 vec_type = LLVMVectorType(bld->elem_type, 4);
1749
1750 intrinsic = "llvm.x86.sse.cvtss2si";
1751
1752 undef = LLVMGetUndef(vec_type);
1753
1754 arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
1755
1756 res = lp_build_intrinsic_unary(builder, intrinsic,
1757 ret_type, arg);
1758 }
1759 else {
1760 if (type.width* type.length == 128) {
1761 intrinsic = "llvm.x86.sse2.cvtps2dq";
1762 }
1763 else {
1764 assert(type.width*type.length == 256);
1765 assert(util_cpu_caps.has_avx);
1766
1767 intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
1768 }
1769 res = lp_build_intrinsic_unary(builder, intrinsic,
1770 ret_type, a);
1771 }
1772
1773 return res;
1774 }
1775
1776
1777 /* Round to an integral float value using the AltiVec vrfi* instructions,
1778 * according to the given rounding mode. */
1779 static INLINE LLVMValueRef
1780 lp_build_round_altivec(struct lp_build_context *bld,
1781 LLVMValueRef a,
1782 enum lp_build_round_mode mode)
1783 {
1784 LLVMBuilderRef builder = bld->gallivm->builder;
1785 const struct lp_type type = bld->type;
1786 const char *intrinsic = NULL;
1787
1788 assert(type.floating);
1789
1790 assert(lp_check_value(type, a));
1791 assert(util_cpu_caps.has_altivec);
1792
1793 switch (mode) {
1794 case LP_BUILD_ROUND_NEAREST:
1795 intrinsic = "llvm.ppc.altivec.vrfin";
1796 break;
1797 case LP_BUILD_ROUND_FLOOR:
1798 intrinsic = "llvm.ppc.altivec.vrfim";
1799 break;
1800 case LP_BUILD_ROUND_CEIL:
1801 intrinsic = "llvm.ppc.altivec.vrfip";
1802 break;
1803 case LP_BUILD_ROUND_TRUNCATE:
1804 intrinsic = "llvm.ppc.altivec.vrfiz";
1805 break;
1806 }
1807
1808 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1809 }
1810
1811 static INLINE LLVMValueRef
1812 lp_build_round_arch(struct lp_build_context *bld,
1813 LLVMValueRef a,
1814 enum lp_build_round_mode mode)
1815 {
1816 if (util_cpu_caps.has_sse4_1)
1817 return lp_build_round_sse41(bld, a, mode);
1818 else /* (util_cpu_caps.has_altivec) */
1819 return lp_build_round_altivec(bld, a, mode);
1820 }
1821
1822 /**
1823 * Return the integer part of a float (vector) value (== round toward zero).
1824 * The returned value is a float (vector).
1825 * Ex: trunc(-1.5) = -1.0
1826 */
1827 LLVMValueRef
1828 lp_build_trunc(struct lp_build_context *bld,
1829 LLVMValueRef a)
1830 {
1831 LLVMBuilderRef builder = bld->gallivm->builder;
1832 const struct lp_type type = bld->type;
1833
1834 assert(type.floating);
1835 assert(lp_check_value(type, a));
1836
1837 if (arch_rounding_available(type)) {
1838 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
1839 }
1840 else {
1841 const struct lp_type type = bld->type;
1842 struct lp_type inttype;
1843 struct lp_build_context intbld;
1844 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
1845 LLVMValueRef trunc, res, anosign, mask;
1846 LLVMTypeRef int_vec_type = bld->int_vec_type;
1847 LLVMTypeRef vec_type = bld->vec_type;
1848
1849 assert(type.width == 32); /* might want to handle doubles at some point */
1850
1851 inttype = type;
1852 inttype.floating = 0;
1853 lp_build_context_init(&intbld, bld->gallivm, inttype);
1854
1855 /* round by truncation */
1856 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1857 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
1858
1859 /* mask out sign bit */
1860 anosign = lp_build_abs(bld, a);
1861 /*
1862 * mask out all values if anosign > 2^24
1863 * This should work both for large ints (all rounding is no-op for them
1864 * because such floats are always exact) as well as special cases like
1865 * NaNs, Infs (taking advantage of the fact they use max exponent).
1866 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
1867 */
1868 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1869 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1870 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1871 return lp_build_select(bld, mask, a, res);
1872 }
1873 }
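
/*
 * Scalar sketch of the non-SSE4.1 fallback above (illustration only; the
 * helper is not part of this file's API). The generated code always computes
 * the int round-trip and then selects, but the selection logic is the same:
 * compare |a| against 2^24 on the float bit pattern, so large values, Infs
 * and NaNs all keep their original value.
 */
static float
example_trunc_fallback(float a)
{
   union { float f; unsigned u; } v, c;
   v.f = a;
   v.u &= 0x7fffffffu;          /* mask out the sign bit: |a| */
   c.f = 16777216.0f;           /* 2^24 */
   if (v.u > c.u) {
      return a;                 /* already integral, or Inf/NaN */
   }
   return (float)(int)a;        /* round toward zero via the int round-trip */
}
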
1874
1875
1876 /**
1877 * Return float (vector) rounded to nearest integer (vector). The returned
1878 * value is a float (vector).
1879 * Ex: round(0.9) = 1.0
1880 * Ex: round(-1.5) = -2.0
1881 */
1882 LLVMValueRef
1883 lp_build_round(struct lp_build_context *bld,
1884 LLVMValueRef a)
1885 {
1886 LLVMBuilderRef builder = bld->gallivm->builder;
1887 const struct lp_type type = bld->type;
1888
1889 assert(type.floating);
1890 assert(lp_check_value(type, a));
1891
1892 if (arch_rounding_available(type)) {
1893 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
1894 }
1895 else {
1896 const struct lp_type type = bld->type;
1897 struct lp_type inttype;
1898 struct lp_build_context intbld;
1899 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
1900 LLVMValueRef res, anosign, mask;
1901 LLVMTypeRef int_vec_type = bld->int_vec_type;
1902 LLVMTypeRef vec_type = bld->vec_type;
1903
1904 assert(type.width == 32); /* might want to handle doubles at some point */
1905
1906 inttype = type;
1907 inttype.floating = 0;
1908 lp_build_context_init(&intbld, bld->gallivm, inttype);
1909
1910 res = lp_build_iround(bld, a);
1911 res = LLVMBuildSIToFP(builder, res, vec_type, "");
1912
1913 /* mask out sign bit */
1914 anosign = lp_build_abs(bld, a);
1915 /*
1916 * mask out all values if anosign > 2^24
1917 * This should work both for large ints (all rounding is no-op for them
1918 * because such floats are always exact) as well as special cases like
1919 * NaNs, Infs (taking advantage of the fact they use max exponent).
1920 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
1921 */
1922 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1923 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1924 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1925 return lp_build_select(bld, mask, a, res);
1926 }
1927 }
1928
1929
1930 /**
1931 * Return floor of float (vector), result is a float (vector)
1932 * Ex: floor(1.1) = 1.0
1933 * Ex: floor(-1.1) = -2.0
1934 */
1935 LLVMValueRef
1936 lp_build_floor(struct lp_build_context *bld,
1937 LLVMValueRef a)
1938 {
1939 LLVMBuilderRef builder = bld->gallivm->builder;
1940 const struct lp_type type = bld->type;
1941
1942 assert(type.floating);
1943 assert(lp_check_value(type, a));
1944
1945 if (arch_rounding_available(type)) {
1946 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
1947 }
1948 else {
1949 const struct lp_type type = bld->type;
1950 struct lp_type inttype;
1951 struct lp_build_context intbld;
1952 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
1953 LLVMValueRef trunc, res, anosign, mask;
1954 LLVMTypeRef int_vec_type = bld->int_vec_type;
1955 LLVMTypeRef vec_type = bld->vec_type;
1956
1957 assert(type.width == 32); /* might want to handle doubles at some point */
1958
1959 inttype = type;
1960 inttype.floating = 0;
1961 lp_build_context_init(&intbld, bld->gallivm, inttype);
1962
1963 /* round by truncation */
1964 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1965 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
1966
1967 if (type.sign) {
1968 LLVMValueRef tmp;
1969
1970 /*
1971 * fix values if rounding is wrong (for non-special cases)
1972 * - this is the case if trunc > a
1973 */
1974 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
1975 /* tmp = trunc > a ? 1.0 : 0.0 */
1976 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
1977 tmp = lp_build_and(&intbld, mask, tmp);
1978 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
1979 res = lp_build_sub(bld, res, tmp);
1980 }
1981
1982 /* mask out sign bit */
1983 anosign = lp_build_abs(bld, a);
1984 /*
1985 * mask out all values if anosign > 2^24
1986 * This should work both for large ints (all rounding is no-op for them
1987 * because such floats are always exact) as well as special cases like
1988 * NaNs, Infs (taking advantage of the fact they use max exponent).
1989 * (2^24 is arbitrary, anything between 2^24 and 2^31 should work.)
1990 */
1991 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1992 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1993 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1994 return lp_build_select(bld, mask, a, res);
1995 }
1996 }
1997
1998
1999 /**
2000 * Return ceiling of float (vector), returning float (vector).
2001 * Ex: ceil( 1.1) = 2.0
2002 * Ex: ceil(-1.1) = -1.0
2003 */
2004 LLVMValueRef
2005 lp_build_ceil(struct lp_build_context *bld,
2006 LLVMValueRef a)
2007 {
2008 LLVMBuilderRef builder = bld->gallivm->builder;
2009 const struct lp_type type = bld->type;
2010
2011 assert(type.floating);
2012 assert(lp_check_value(type, a));
2013
2014 if (arch_rounding_available(type)) {
2015 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2016 }
2017 else {
2018 const struct lp_type type = bld->type;
2019 struct lp_type inttype;
2020 struct lp_build_context intbld;
2021 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
2022 LLVMValueRef trunc, res, anosign, mask, tmp;
2023 LLVMTypeRef int_vec_type = bld->int_vec_type;
2024 LLVMTypeRef vec_type = bld->vec_type;
2025
2026 assert(type.width == 32); /* might want to handle doubles at some point */
2027
2028 inttype = type;
2029 inttype.floating = 0;
2030 lp_build_context_init(&intbld, bld->gallivm, inttype);
2031
2032 /* round by truncation */
2033 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2034 trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2035
2036 /*
2037 * fix values if rounding is wrong (for non-special cases)
2038 * - this is the case if trunc < a
2039 */
2040 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2041 /* tmp = trunc < a ? 1.0 : 0.0 */
2042 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2043 tmp = lp_build_and(&intbld, mask, tmp);
2044 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2045 res = lp_build_add(bld, trunc, tmp);
2046
2047 /* mask out sign bit */
2048 anosign = lp_build_abs(bld, a);
2049 /*
2050 * mask out all values if anosign > 2^24
2051 * This should work both for large ints (all rounding is no-op for them
2052 * because such floats are always exact) as well as special cases like
2053 * NaNs, Infs (taking advantage of the fact they use max exponent).
2054 * (2^24 is arbitrary, anything between 2^24 and 2^31 should work.)
2055 */
2056 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2057 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2058 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2059 return lp_build_select(bld, mask, a, res);
2060 }
2061 }
2062
2063
2064 /**
2065 * Return fractional part of 'a' computed as a - floor(a)
2066 * Typically used in texture coord arithmetic.
2067 */
2068 LLVMValueRef
2069 lp_build_fract(struct lp_build_context *bld,
2070 LLVMValueRef a)
2071 {
2072 assert(bld->type.floating);
2073 return lp_build_sub(bld, a, lp_build_floor(bld, a));
2074 }
2075
2076
2077 /**
2078 * Prevent returning a fractional part of 1.0 for very small negative values of
2079 * 'a' by clamping against 0.99999(9).
2080 */
2081 static inline LLVMValueRef
2082 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2083 {
2084 LLVMValueRef max;
2085
2086 /* this is the largest number smaller than 1.0 representable as float */
2087 max = lp_build_const_vec(bld->gallivm, bld->type,
2088 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2089 return lp_build_min(bld, fract, max);
2090 }
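
/*
 * For 32-bit floats lp_mantissa() is 23, so the clamp value above works out
 * to 1.0 - 2^-24 = 0.99999994f, which is indeed the largest float strictly
 * below 1.0.
 */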
2091
2092
2093 /**
2094 * Same as lp_build_fract, but guarantees that the result is always smaller
2095 * than one.
2096 */
2097 LLVMValueRef
2098 lp_build_fract_safe(struct lp_build_context *bld,
2099 LLVMValueRef a)
2100 {
2101 return clamp_fract(bld, lp_build_fract(bld, a));
2102 }
2103
2104
2105 /**
2106 * Return the integer part of a float (vector) value (== round toward zero).
2107 * The returned value is an integer (vector).
2108 * Ex: itrunc(-1.5) = -1
2109 */
2110 LLVMValueRef
2111 lp_build_itrunc(struct lp_build_context *bld,
2112 LLVMValueRef a)
2113 {
2114 LLVMBuilderRef builder = bld->gallivm->builder;
2115 const struct lp_type type = bld->type;
2116 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2117
2118 assert(type.floating);
2119 assert(lp_check_value(type, a));
2120
2121 return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2122 }
2123
2124
2125 /**
2126 * Return float (vector) rounded to nearest integer (vector). The returned
2127 * value is an integer (vector).
2128 * Ex: iround(0.9) = 1
2129 * Ex: iround(-1.5) = -2
2130 */
2131 LLVMValueRef
2132 lp_build_iround(struct lp_build_context *bld,
2133 LLVMValueRef a)
2134 {
2135 LLVMBuilderRef builder = bld->gallivm->builder;
2136 const struct lp_type type = bld->type;
2137 LLVMTypeRef int_vec_type = bld->int_vec_type;
2138 LLVMValueRef res;
2139
2140 assert(type.floating);
2141
2142 assert(lp_check_value(type, a));
2143
2144 if ((util_cpu_caps.has_sse2 &&
2145 ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2146 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2147 return lp_build_iround_nearest_sse2(bld, a);
2148 }
2149 if (arch_rounding_available(type)) {
2150 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2151 }
2152 else {
2153 LLVMValueRef half;
2154
2155 half = lp_build_const_vec(bld->gallivm, type, 0.5);
2156
2157 if (type.sign) {
2158 LLVMTypeRef vec_type = bld->vec_type;
2159 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2160 (unsigned long long)1 << (type.width - 1));
2161 LLVMValueRef sign;
2162
2163 /* get sign bit */
2164 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2165 sign = LLVMBuildAnd(builder, sign, mask, "");
2166
2167 /* sign * 0.5 */
2168 half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2169 half = LLVMBuildOr(builder, sign, half, "");
2170 half = LLVMBuildBitCast(builder, half, vec_type, "");
2171 }
2172
2173 res = LLVMBuildFAdd(builder, a, half, "");
2174 }
2175
2176 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2177
2178 return res;
2179 }
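
/*
 * Example of the sign-copied 0.5 bias in the fallback above:
 * iround(-1.5) computes -1.5 + (-0.5) = -2.0 and truncates to -2, while
 * iround(1.4) computes 1.4 + 0.5 = 1.9 and truncates to 1.  Note that exact
 * halfway cases round away from zero on this path, whereas the hardware
 * rounding paths typically round ties to even.
 */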
2180
2181
2182 /**
2183 * Return floor of float (vector), result is an int (vector)
2184 * Ex: ifloor(1.1) = 1
2185 * Ex: ifloor(-1.1) = -2
2186 */
2187 LLVMValueRef
2188 lp_build_ifloor(struct lp_build_context *bld,
2189 LLVMValueRef a)
2190 {
2191 LLVMBuilderRef builder = bld->gallivm->builder;
2192 const struct lp_type type = bld->type;
2193 LLVMTypeRef int_vec_type = bld->int_vec_type;
2194 LLVMValueRef res;
2195
2196 assert(type.floating);
2197 assert(lp_check_value(type, a));
2198
2199 res = a;
2200 if (type.sign) {
2201 if (arch_rounding_available(type)) {
2202 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2203 }
2204 else {
2205 struct lp_type inttype;
2206 struct lp_build_context intbld;
2207 LLVMValueRef trunc, itrunc, mask;
2208
2209 assert(type.floating);
2210 assert(lp_check_value(type, a));
2211
2212 inttype = type;
2213 inttype.floating = 0;
2214 lp_build_context_init(&intbld, bld->gallivm, inttype);
2215
2216 /* round by truncation */
2217 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2218 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2219
2220 /*
2221 * fix values if rounding is wrong (for non-special cases)
2222 * - this is the case if trunc > a
2223 * The results of doing this with NaNs, very large values etc.
2224 * are undefined but this seems to be the case anyway.
2225 */
2226 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2227 /* cheapie minus one with mask since the mask is minus one / zero */
2228 return lp_build_add(&intbld, itrunc, mask);
2229 }
2230 }
2231
2232 /* convert to int, truncating toward zero */
2233 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2234
2235 return res;
2236 }
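
/*
 * Example of the "cheapie minus one" above: for a = -1.1 the truncation
 * gives itrunc = -1, the converted -1.0 is greater than -1.1, so the
 * comparison mask is all ones (the integer -1) and itrunc + mask = -2.
 * For a = -2.0 the mask is zero and the result stays -2.
 */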
2237
2238
2239 /**
2240 * Return ceiling of float (vector), returning int (vector).
2241 * Ex: iceil( 1.1) = 2
2242 * Ex: iceil(-1.1) = -1
2243 */
2244 LLVMValueRef
2245 lp_build_iceil(struct lp_build_context *bld,
2246 LLVMValueRef a)
2247 {
2248 LLVMBuilderRef builder = bld->gallivm->builder;
2249 const struct lp_type type = bld->type;
2250 LLVMTypeRef int_vec_type = bld->int_vec_type;
2251 LLVMValueRef res;
2252
2253 assert(type.floating);
2254 assert(lp_check_value(type, a));
2255
2256 if (arch_rounding_available(type)) {
2257 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2258 }
2259 else {
2260 struct lp_type inttype;
2261 struct lp_build_context intbld;
2262 LLVMValueRef trunc, itrunc, mask;
2263
2264 assert(type.floating);
2265 assert(lp_check_value(type, a));
2266
2267 inttype = type;
2268 inttype.floating = 0;
2269 lp_build_context_init(&intbld, bld->gallivm, inttype);
2270
2271 /* round by truncation */
2272 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2273 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2274
2275 /*
2276 * fix values if rounding is wrong (for non-special cases)
2277 * - this is the case if trunc < a
2278 * The results of doing this with NaNs, very large values etc.
2279 * are undefined but this seems to be the case anyway.
2280 */
2281 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2282 /* cheapie plus one with mask since the mask is minus one / zero */
2283 return lp_build_sub(&intbld, itrunc, mask);
2284 }
2285
2286 /* convert to int, truncating toward zero */
2287 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2288
2289 return res;
2290 }
2291
2292
2293 /**
2294 * Combined ifloor() & fract().
2295 *
2296 * Preferred to calling the functions separately, as it will ensure that the
2297 * strategy (floor() vs ifloor()) that results in less redundant work is used.
2298 */
2299 void
2300 lp_build_ifloor_fract(struct lp_build_context *bld,
2301 LLVMValueRef a,
2302 LLVMValueRef *out_ipart,
2303 LLVMValueRef *out_fpart)
2304 {
2305 LLVMBuilderRef builder = bld->gallivm->builder;
2306 const struct lp_type type = bld->type;
2307 LLVMValueRef ipart;
2308
2309 assert(type.floating);
2310 assert(lp_check_value(type, a));
2311
2312 if (arch_rounding_available(type)) {
2313 /*
2314 * floor() is easier.
2315 */
2316
2317 ipart = lp_build_floor(bld, a);
2318 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2319 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2320 }
2321 else {
2322 /*
2323 * ifloor() is easier.
2324 */
2325
2326 *out_ipart = lp_build_ifloor(bld, a);
2327 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2328 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2329 }
2330 }
2331
2332
2333 /**
2334 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2335 * always smaller than one.
2336 */
2337 void
2338 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2339 LLVMValueRef a,
2340 LLVMValueRef *out_ipart,
2341 LLVMValueRef *out_fpart)
2342 {
2343 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2344 *out_fpart = clamp_fract(bld, *out_fpart);
2345 }
2346
2347
2348 LLVMValueRef
2349 lp_build_sqrt(struct lp_build_context *bld,
2350 LLVMValueRef a)
2351 {
2352 LLVMBuilderRef builder = bld->gallivm->builder;
2353 const struct lp_type type = bld->type;
2354 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2355 char intrinsic[32];
2356
2357 assert(lp_check_value(type, a));
2358
2359 /* TODO: optimize the constant case */
2360
2361 assert(type.floating);
2362 if (type.length == 1) {
2363 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.f%u", type.width);
2364 }
2365 else {
2366 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
2367 }
2368
2369 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2370 }
2371
2372
2373 /**
2374 * Do one Newton-Raphson step to improve reciprocal precision:
2375 *
2376 * x_{i+1} = x_i * (2 - a * x_i)
2377 *
2378 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2379 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
2380 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
2381 * halo. It would be necessary to clamp the argument to prevent this.
2382 *
2383 * See also:
2384 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2385 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2386 */
2387 static INLINE LLVMValueRef
2388 lp_build_rcp_refine(struct lp_build_context *bld,
2389 LLVMValueRef a,
2390 LLVMValueRef rcp_a)
2391 {
2392 LLVMBuilderRef builder = bld->gallivm->builder;
2393 LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
2394 LLVMValueRef res;
2395
2396 res = LLVMBuildFMul(builder, a, rcp_a, "");
2397 res = LLVMBuildFSub(builder, two, res, "");
2398 res = LLVMBuildFMul(builder, rcp_a, res, "");
2399
2400 return res;
2401 }
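
/*
 * Convergence sketch for the step above, ignoring rounding of the
 * intermediate operations: if rcp_a = (1 + e)/a for some relative error e,
 * then a * rcp_a = 1 + e and
 *
 *    rcp_a * (2 - a * rcp_a) = (1 + e) * (1 - e) / a = (1 - e^2) / a
 *
 * i.e. the relative error is squared, roughly doubling the number of
 * correct mantissa bits per iteration.
 */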
2402
2403
2404 LLVMValueRef
2405 lp_build_rcp(struct lp_build_context *bld,
2406 LLVMValueRef a)
2407 {
2408 LLVMBuilderRef builder = bld->gallivm->builder;
2409 const struct lp_type type = bld->type;
2410
2411 assert(lp_check_value(type, a));
2412
2413 if(a == bld->zero)
2414 return bld->undef;
2415 if(a == bld->one)
2416 return bld->one;
2417 if(a == bld->undef)
2418 return bld->undef;
2419
2420 assert(type.floating);
2421
2422 if(LLVMIsConstant(a))
2423 return LLVMConstFDiv(bld->one, a);
2424
2425 /*
2426 * We don't use RCPPS because:
2427 * - it only has 10 bits of precision
2428 * - it doesn't even get the reciprocal of 1.0 exactly
2429 * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2430 * - for recent processors the benefit over DIVPS is marginal, and case
2431 * dependent
2432 *
2433 * We could still use it on certain processors if benchmarks show that the
2434 * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
2435 * particular uses that require fewer workarounds.
2436 */
2437
2438 if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2439 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2440 const unsigned num_iterations = 0;
2441 LLVMValueRef res;
2442 unsigned i;
2443 const char *intrinsic = NULL;
2444
2445 if (type.length == 4) {
2446 intrinsic = "llvm.x86.sse.rcp.ps";
2447 }
2448 else {
2449 intrinsic = "llvm.x86.avx.rcp.ps.256";
2450 }
2451
2452 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2453
2454 for (i = 0; i < num_iterations; ++i) {
2455 res = lp_build_rcp_refine(bld, a, res);
2456 }
2457
2458 return res;
2459 }
2460
2461 return LLVMBuildFDiv(builder, bld->one, a, "");
2462 }
2463
2464
2465 /**
2466 * Do one Newton-Raphson step to improve rsqrt precision:
2467 *
2468 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2469 *
2470 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2471 */
2472 static INLINE LLVMValueRef
2473 lp_build_rsqrt_refine(struct lp_build_context *bld,
2474 LLVMValueRef a,
2475 LLVMValueRef rsqrt_a)
2476 {
2477 LLVMBuilderRef builder = bld->gallivm->builder;
2478 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2479 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2480 LLVMValueRef res;
2481
2482 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2483 res = LLVMBuildFMul(builder, a, res, "");
2484 res = LLVMBuildFSub(builder, three, res, "");
2485 res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2486 res = LLVMBuildFMul(builder, half, res, "");
2487
2488 return res;
2489 }
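
/*
 * Convergence sketch for the step above: with rsqrt_a = (1 + e)/sqrt(a),
 * a * rsqrt_a * rsqrt_a = (1 + e)^2 and
 *
 *    0.5 * rsqrt_a * (3 - (1 + e)^2) = (1 - 1.5*e^2 - 0.5*e^3) / sqrt(a)
 *
 * so the first-order error term cancels and the error is again roughly
 * squared per iteration.
 */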
2490
2491
2492 /**
2493 * Generate 1/sqrt(a).
2494 * Result is undefined for values < 0, infinity for +0.
2495 */
2496 LLVMValueRef
2497 lp_build_rsqrt(struct lp_build_context *bld,
2498 LLVMValueRef a)
2499 {
2500 LLVMBuilderRef builder = bld->gallivm->builder;
2501 const struct lp_type type = bld->type;
2502
2503 assert(lp_check_value(type, a));
2504
2505 assert(type.floating);
2506
2507 /*
2508 * This should be faster but all denormals will end up as infinity.
2509 */
2510 if (0 && lp_build_fast_rsqrt_available(type)) {
2511 const unsigned num_iterations = 1;
2512 LLVMValueRef res;
2513 unsigned i;
2514
2515 /* rsqrt(1.0) != 1.0 here */
2516 res = lp_build_fast_rsqrt(bld, a);
2517
2518 if (num_iterations) {
2519 /*
2520 * Newton-Raphson will result in NaN instead of infinity for zero,
2521 * and NaN instead of zero for infinity.
2522 * Also, need to ensure rsqrt(1.0) == 1.0.
2523 * All numbers smaller than FLT_MIN will result in +infinity
2524 * (rsqrtps treats all denormals as zero).
2525 */
2526 /*
2527 * Certain non-C99 compilers don't know INFINITY and might not support
2528 * hacks to evaluate it at compile time either.
2529 */
2530 const unsigned posinf_int = 0x7F800000;
2531 LLVMValueRef cmp;
2532 LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2533 LLVMValueRef inf = lp_build_const_int_vec(bld->gallivm, type, posinf_int);
2534
2535 inf = LLVMBuildBitCast(builder, inf, lp_build_vec_type(bld->gallivm, type), "");
2536
2537 for (i = 0; i < num_iterations; ++i) {
2538 res = lp_build_rsqrt_refine(bld, a, res);
2539 }
2540 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2541 res = lp_build_select(bld, cmp, inf, res);
2542 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2543 res = lp_build_select(bld, cmp, bld->zero, res);
2544 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2545 res = lp_build_select(bld, cmp, bld->one, res);
2546 }
2547
2548 return res;
2549 }
2550
2551 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2552 }
2553
2554 /**
2555 * Returns true if a fast (but inaccurate) rsqrt instruction is available.
2556 * Callers may want to avoid lp_build_fast_rsqrt() when it is not: e.g. when
2557 * calculating x^0.5 as rsqrt_fast(x) * x, a missing fast path would expand
2558 * to sqrt/div/mul, in which case it is better to call sqrt directly and
2559 * skip both the div and the mul.
2560 */
2561 boolean
2562 lp_build_fast_rsqrt_available(struct lp_type type)
2563 {
2564 assert(type.floating);
2565
2566 if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2567 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2568 return true;
2569 }
2570 return false;
2571 }
2572
2573
2574 /**
2575 * Generate 1/sqrt(a).
2576 * Result is undefined for values < 0, infinity for +0.
2577 * Precision is limited, only ~10 bits guaranteed
2578 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2579 */
2580 LLVMValueRef
2581 lp_build_fast_rsqrt(struct lp_build_context *bld,
2582 LLVMValueRef a)
2583 {
2584 LLVMBuilderRef builder = bld->gallivm->builder;
2585 const struct lp_type type = bld->type;
2586
2587 assert(lp_check_value(type, a));
2588
2589 if (lp_build_fast_rsqrt_available(type)) {
2590 const char *intrinsic = NULL;
2591
2592 if (type.length == 4) {
2593 intrinsic = "llvm.x86.sse.rsqrt.ps";
2594 }
2595 else {
2596 intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2597 }
2598 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2599 }
2600 else {
2601 debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2602 }
2603 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2604 }
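
/*
 * Hypothetical caller sketch (not part of this file) showing the trade-off
 * described above for computing x^0.5:
 *
 *    if (lp_build_fast_rsqrt_available(bld->type))
 *       res = lp_build_mul(bld, x, lp_build_fast_rsqrt(bld, x));
 *    else
 *       res = lp_build_sqrt(bld, x);
 *
 * i.e. only use the rsqrt * x trick when the fast instruction exists,
 * otherwise it degenerates into sqrt/div/mul.
 */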
2605
2606
2607 /**
2608 * Generate sin(a) or cos(a) using polynomial approximation.
2609 * TODO: it might be worth recognizing sin and cos of the same source
2610 * (i.e. the d3d10 sincos opcode). Computing both at once would be much
2611 * cheaper than calculating (nearly) everything twice, though it is not
2612 * clear the pattern is common enough to be worth the effort; the scs
2613 * opcode would also benefit from calculating both.
2614 */
2615 static LLVMValueRef
2616 lp_build_sin_or_cos(struct lp_build_context *bld,
2617 LLVMValueRef a,
2618 boolean cos)
2619 {
2620 struct gallivm_state *gallivm = bld->gallivm;
2621 LLVMBuilderRef b = gallivm->builder;
2622 struct lp_type int_type = lp_int_type(bld->type);
2623
2624 /*
2625 * take the absolute value,
2626 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2627 */
2628
2629 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2630 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2631
2632 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2633 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2634
2635 /*
2636 * scale by 4/Pi
2637 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2638 */
2639
2640 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2641 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2642
2643 /*
2644 * store the integer part of y in mm0
2645 * emm2 = _mm_cvttps_epi32(y);
2646 */
2647
2648 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2649
2650 /*
2651 * j=(j+1) & (~1) (see the cephes sources)
2652 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2653 */
2654
2655 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2656 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2657 /*
2658 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2659 */
2660 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2661 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2662
2663 /*
2664 * y = _mm_cvtepi32_ps(emm2);
2665 */
2666 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2667
2668 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2669 LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2670 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2671 LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2672
2673 /*
2674 * Argument used for poly selection and sign bit determination
2675 * is different for sin vs. cos.
2676 */
2677 LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
2678 emm2_and;
2679
2680 LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
2681 LLVMBuildNot(b, emm2_2, ""), ""),
2682 const_29, "sign_bit") :
2683 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
2684 LLVMBuildShl(b, emm2_add,
2685 const_29, ""), ""),
2686 sign_mask, "sign_bit");
2687
2688 /*
2689 * get the polynomial selection mask
2690 * there is one polynomial for 0 <= x <= Pi/4
2691 * and another one for Pi/4 < x <= Pi/2
2692 * Both branches will be computed.
2693 *
2694 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2695 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2696 */
2697
2698 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
2699 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2700 int_type, PIPE_FUNC_EQUAL,
2701 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2702
2703 /*
2704 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2705 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2706 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2707 */
2708 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2709 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2710 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
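
/*
 * Note: DP1 + DP2 + DP3 sums to approximately -Pi/4
 * (-0.7853981633974483), split into pieces of decreasing magnitude
 * (Cody-Waite style), so the subtractions below remove y * Pi/4 from |a|
 * with more precision than a single float constant would allow.
 */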
2711
2712 /*
2713 * The magic pass: "Extended precision modular arithmetic"
2714 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2715 * xmm1 = _mm_mul_ps(y, xmm1);
2716 * xmm2 = _mm_mul_ps(y, xmm2);
2717 * xmm3 = _mm_mul_ps(y, xmm3);
2718 */
2719 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
2720 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
2721 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
2722
2723 /*
2724 * x = _mm_add_ps(x, xmm1);
2725 * x = _mm_add_ps(x, xmm2);
2726 * x = _mm_add_ps(x, xmm3);
2727 */
2728
2729 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
2730 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
2731 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
2732
2733 /*
2734 * Evaluate the first polynomial (0 <= x <= Pi/4)
2735 *
2736 * z = _mm_mul_ps(x,x);
2737 */
2738 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2739
2740 /*
2741 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2742 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2743 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2744 */
2745 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2746 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2747 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2748
2749 /*
2750 * y = *(v4sf*)_ps_coscof_p0;
2751 * y = _mm_mul_ps(y, z);
2752 */
2753 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
2754 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
2755 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
2756 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
2757 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2758 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2759
2760
2761 /*
2762 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2763 * y = _mm_sub_ps(y, tmp);
2764 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2765 */
2766 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2767 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2768 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2769 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2770 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2771
2772 /*
2773 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2774 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2775 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2776 */
2777 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2778 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2779 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2780
2781 /*
2782 * Evaluate the second polynomial (Pi/4 <= x <= Pi/2)
2783 *
2784 * y2 = *(v4sf*)_ps_sincof_p0;
2785 * y2 = _mm_mul_ps(y2, z);
2786 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2787 * y2 = _mm_mul_ps(y2, z);
2788 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2789 * y2 = _mm_mul_ps(y2, z);
2790 * y2 = _mm_mul_ps(y2, x);
2791 * y2 = _mm_add_ps(y2, x);
2792 */
2793
2794 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
2795 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
2796 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
2797 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
2798 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2799 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
2800 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
2801
2802 /*
2803 * select the correct result from the two polynomials
2804 * xmm3 = poly_mask;
2805 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2806 * y = _mm_andnot_ps(xmm3, y);
2807 * y = _mm_or_ps(y,y2);
2808 */
2809 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2810 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2811 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2812 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
2813 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2814 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
2815
2816 /*
2817 * update the sign
2818 * y = _mm_xor_ps(y, sign_bit);
2819 */
2820 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
2821 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
2822
2823 LLVMValueRef isfinite = lp_build_isfinite(bld, a);
2824
2825 /* clamp output to be within [-1, 1] */
2826 y_result = lp_build_clamp(bld, y_result,
2827 lp_build_const_vec(bld->gallivm, bld->type, -1.f),
2828 lp_build_const_vec(bld->gallivm, bld->type, 1.f));
2829 /* If a is -inf, inf or NaN then return NaN */
2830 y_result = lp_build_select(bld, isfinite, y_result,
2831 lp_build_const_vec(bld->gallivm, bld->type, NAN));
2832 return y_result;
2833 }
2834
2835
2836 /**
2837 * Generate sin(a)
2838 */
2839 LLVMValueRef
2840 lp_build_sin(struct lp_build_context *bld,
2841 LLVMValueRef a)
2842 {
2843 return lp_build_sin_or_cos(bld, a, FALSE);
2844 }
2845
2846
2847 /**
2848 * Generate cos(a)
2849 */
2850 LLVMValueRef
2851 lp_build_cos(struct lp_build_context *bld,
2852 LLVMValueRef a)
2853 {
2854 return lp_build_sin_or_cos(bld, a, TRUE);
2855 }
2856
2857
2858 /**
2859 * Generate pow(x, y)
2860 */
2861 LLVMValueRef
2862 lp_build_pow(struct lp_build_context *bld,
2863 LLVMValueRef x,
2864 LLVMValueRef y)
2865 {
2866 /* TODO: optimize the constant case */
2867 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2868 LLVMIsConstant(x) && LLVMIsConstant(y)) {
2869 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2870 __FUNCTION__);
2871 }
2872
2873 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
2874 }
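
/*
 * Quick sanity check of the identity used above: pow(2.0, 10.0) becomes
 * exp2(log2(2.0) * 10.0) = exp2(10.0) = 1024.0.  Since the computation goes
 * through log2, the result is only meaningful for x > 0.
 */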
2875
2876
2877 /**
2878 * Generate exp(x)
2879 */
2880 LLVMValueRef
2881 lp_build_exp(struct lp_build_context *bld,
2882 LLVMValueRef x)
2883 {
2884 /* log2(e) = 1/log(2) */
2885 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
2886 1.4426950408889634);
2887
2888 assert(lp_check_value(bld->type, x));
2889
2890 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
2891 }
2892
2893
2894 /**
2895 * Generate log(x)
2896 * Behavior is undefined with infs, 0s and nans
2897 */
2898 LLVMValueRef
2899 lp_build_log(struct lp_build_context *bld,
2900 LLVMValueRef x)
2901 {
2902 /* log(2) */
2903 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2904 0.69314718055994529);
2905
2906 assert(lp_check_value(bld->type, x));
2907
2908 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
2909 }
2910
2911 /**
2912 * Generate log(x) that handles edge cases (infs, 0s and nans)
2913 */
2914 LLVMValueRef
2915 lp_build_log_safe(struct lp_build_context *bld,
2916 LLVMValueRef x)
2917 {
2918 /* log(2) */
2919 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2920 0.69314718055994529);
2921
2922 assert(lp_check_value(bld->type, x));
2923
2924 return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
2925 }
2926
2927
2928 /**
2929 * Generate polynomial.
2930 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
2931 */
2932 LLVMValueRef
2933 lp_build_polynomial(struct lp_build_context *bld,
2934 LLVMValueRef x,
2935 const double *coeffs,
2936 unsigned num_coeffs)
2937 {
2938 const struct lp_type type = bld->type;
2939 LLVMValueRef even = NULL, odd = NULL;
2940 LLVMValueRef x2;
2941 unsigned i;
2942
2943 assert(lp_check_value(bld->type, x));
2944
2945 /* TODO: optimize the constant case */
2946 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2947 LLVMIsConstant(x)) {
2948 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2949 __FUNCTION__);
2950 }
2951
2952 /*
2953 * Calculate odd and even terms separately to decrease data dependency
2954 * Ex:
2955 * c[0] + x^2 * c[2] + x^4 * c[4] ...
2956 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
2957 */
2958 x2 = lp_build_mul(bld, x, x);
2959
2960 for (i = num_coeffs; i--; ) {
2961 LLVMValueRef coeff;
2962
2963 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
2964
2965 if (i % 2 == 0) {
2966 if (even)
2967 even = lp_build_add(bld, coeff, lp_build_mul(bld, x2, even));
2968 else
2969 even = coeff;
2970 } else {
2971 if (odd)
2972 odd = lp_build_add(bld, coeff, lp_build_mul(bld, x2, odd));
2973 else
2974 odd = coeff;
2975 }
2976 }
2977
2978 if (odd)
2979 return lp_build_add(bld, lp_build_mul(bld, odd, x), even);
2980 else if (even)
2981 return even;
2982 else
2983 return bld->undef;
2984 }
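
/*
 * Illustration of the even/odd split above for four coefficients
 * {c0, c1, c2, c3}:
 *
 *    even = c0 + x2 * c2
 *    odd  = c1 + x2 * c3
 *    res  = odd * x + even
 *
 * which equals c0 + c1*x + c2*x^2 + c3*x^3, but the two Horner chains can
 * be evaluated in parallel, roughly halving the dependency depth.
 */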
2985
2986
2987 /**
2988 * Minimax polynomial fit of 2**x, in range [0, 1[
2989 */
2990 const double lp_build_exp2_polynomial[] = {
2991 #if EXP_POLY_DEGREE == 5
2992 1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
2993 0.693153073200168932794,
2994 0.240153617044375388211,
2995 0.0558263180532956664775,
2996 0.00898934009049466391101,
2997 0.00187757667519147912699
2998 #elif EXP_POLY_DEGREE == 4
2999 1.00000259337069434683,
3000 0.693003834469974940458,
3001 0.24144275689150793076,
3002 0.0520114606103070150235,
3003 0.0135341679161270268764
3004 #elif EXP_POLY_DEGREE == 3
3005 0.999925218562710312959,
3006 0.695833540494823811697,
3007 0.226067155427249155588,
3008 0.0780245226406372992967
3009 #elif EXP_POLY_DEGREE == 2
3010 1.00172476321474503578,
3011 0.657636275736077639316,
3012 0.33718943461968720704
3013 #else
3014 #error
3015 #endif
3016 };
3017
3018
3019 LLVMValueRef
3020 lp_build_exp2(struct lp_build_context *bld,
3021 LLVMValueRef x)
3022 {
3023 LLVMBuilderRef builder = bld->gallivm->builder;
3024 const struct lp_type type = bld->type;
3025 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3026 LLVMValueRef ipart = NULL;
3027 LLVMValueRef fpart = NULL;
3028 LLVMValueRef expipart = NULL;
3029 LLVMValueRef expfpart = NULL;
3030 LLVMValueRef res = NULL;
3031
3032 assert(lp_check_value(bld->type, x));
3033
3034
3035 /* TODO: optimize the constant case */
3036 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3037 LLVMIsConstant(x)) {
3038 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3039 __FUNCTION__);
3040 }
3041
3042 assert(type.floating && type.width == 32);
3043
3044 /* We want to preserve NaN and make sure that for exp2 if x > 128,
3045 * the result is INF and if it's smaller than -126.9 the result is 0. */
3046 x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
3047 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
3048 x = lp_build_max(bld, lp_build_const_vec(bld->gallivm, type, -126.99999), x);
3049
3050 /* ipart = floor(x) */
3051 /* fpart = x - ipart */
3052 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3053
3054
3055
3056 /* expipart = (float) (1 << ipart) */
3057 expipart = LLVMBuildAdd(builder, ipart,
3058 lp_build_const_int_vec(bld->gallivm, type, 127), "");
3059 expipart = LLVMBuildShl(builder, expipart,
3060 lp_build_const_int_vec(bld->gallivm, type, 23), "");
3061 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
3062
3063
3064 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3065 Elements(lp_build_exp2_polynomial));
3066
3067 res = LLVMBuildFMul(builder, expipart, expfpart, "");
3068
3069
3070 return res;
3071 }
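
/*
 * Example of the exponent-bit construction above: for ipart = 3 the code
 * computes (3 + 127) << 23 = 0x41000000, which reinterpreted as a float is
 * 2^3 = 8.0, so expipart * expfpart yields 2^ipart * 2^fpart.
 */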
3072
3073
3074
3075 /**
3076 * Extract the exponent of an IEEE-754 floating point value.
3077 *
3078 * Optionally apply an integer bias.
3079 *
3080 * Result is an integer value with
3081 *
3082 * ifloor(log2(x)) + bias
3083 */
3084 LLVMValueRef
3085 lp_build_extract_exponent(struct lp_build_context *bld,
3086 LLVMValueRef x,
3087 int bias)
3088 {
3089 LLVMBuilderRef builder = bld->gallivm->builder;
3090 const struct lp_type type = bld->type;
3091 unsigned mantissa = lp_mantissa(type);
3092 LLVMValueRef res;
3093
3094 assert(type.floating);
3095
3096 assert(lp_check_value(bld->type, x));
3097
3098 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3099
3100 res = LLVMBuildLShr(builder, x,
3101 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3102 res = LLVMBuildAnd(builder, res,
3103 lp_build_const_int_vec(bld->gallivm, type, 255), "");
3104 res = LLVMBuildSub(builder, res,
3105 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3106
3107 return res;
3108 }
3109
3110
3111 /**
3112 * Extract the mantissa of a floating point value.
3113 *
3114 * Result is a floating point value with
3115 *
3116 * x / 2**floor(log2(x))
3117 */
3118 LLVMValueRef
3119 lp_build_extract_mantissa(struct lp_build_context *bld,
3120 LLVMValueRef x)
3121 {
3122 LLVMBuilderRef builder = bld->gallivm->builder;
3123 const struct lp_type type = bld->type;
3124 unsigned mantissa = lp_mantissa(type);
3125 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3126 (1ULL << mantissa) - 1);
3127 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3128 LLVMValueRef res;
3129
3130 assert(lp_check_value(bld->type, x));
3131
3132 assert(type.floating);
3133
3134 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3135
3136 /* res = x / 2**ipart */
3137 res = LLVMBuildAnd(builder, x, mantmask, "");
3138 res = LLVMBuildOr(builder, res, one, "");
3139 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3140
3141 return res;
3142 }
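
/*
 * Example for the two helpers above: with x = 12.0f (0x41400000),
 * lp_build_extract_exponent(bld, x, 0) yields 3 and
 * lp_build_extract_mantissa(bld, x) yields 1.5f, consistent with
 * 12.0 == 1.5 * 2^3.
 */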
3143
3144
3145
3146 /**
3147 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in the range [0, 1/9[
3148 * These coefficients can be generated with
3149 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3150 */
3151 const double lp_build_log2_polynomial[] = {
3152 #if LOG_POLY_DEGREE == 5
3153 2.88539008148777786488L,
3154 0.961796878841293367824L,
3155 0.577058946784739859012L,
3156 0.412914355135828735411L,
3157 0.308591899232910175289L,
3158 0.352376952300281371868L,
3159 #elif LOG_POLY_DEGREE == 4
3160 2.88539009343309178325L,
3161 0.961791550404184197881L,
3162 0.577440339438736392009L,
3163 0.403343858251329912514L,
3164 0.406718052498846252698L,
3165 #elif LOG_POLY_DEGREE == 3
3166 2.88538959748872753838L,
3167 0.961932915889597772928L,
3168 0.571118517972136195241L,
3169 0.493997535084709500285L,
3170 #else
3171 #error
3172 #endif
3173 };
3174
3175 /**
3176 * See http://www.devmaster.net/forums/showthread.php?p=43580
3177 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3178 * http://www.nezumi.demon.co.uk/consult/logx.htm
3179 *
3180 * If handle_edge_cases is true the function will perform computations
3181 * to match the required D3D10+ behavior for each of the edge cases.
3182 * That means that if input is:
3183 * - less than zero (to and including -inf) then NaN will be returned
3184 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3185 * - +infinity, then +infinity will be returned
3186 * - NaN, then NaN will be returned
3187 *
3188 * Those checks are fairly expensive so if you don't need them make sure
3189 * handle_edge_cases is false.
3190 */
3191 void
3192 lp_build_log2_approx(struct lp_build_context *bld,
3193 LLVMValueRef x,
3194 LLVMValueRef *p_exp,
3195 LLVMValueRef *p_floor_log2,
3196 LLVMValueRef *p_log2,
3197 boolean handle_edge_cases)
3198 {
3199 LLVMBuilderRef builder = bld->gallivm->builder;
3200 const struct lp_type type = bld->type;
3201 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3202 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3203
3204 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3205 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3206 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3207
3208 LLVMValueRef i = NULL;
3209 LLVMValueRef y = NULL;
3210 LLVMValueRef z = NULL;
3211 LLVMValueRef exp = NULL;
3212 LLVMValueRef mant = NULL;
3213 LLVMValueRef logexp = NULL;
3214 LLVMValueRef logmant = NULL;
3215 LLVMValueRef res = NULL;
3216
3217 assert(lp_check_value(bld->type, x));
3218
3219 if(p_exp || p_floor_log2 || p_log2) {
3220 /* TODO: optimize the constant case */
3221 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3222 LLVMIsConstant(x)) {
3223 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3224 __FUNCTION__);
3225 }
3226
3227 assert(type.floating && type.width == 32);
3228
3229 /*
3230 * We don't explicitly handle denormalized numbers. They will yield a
3231 * result in the neighbourhood of -127, which appears to be
3232 * adequate.
3233 */
3234
3235 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3236
3237 /* exp = (float) exponent(x) */
3238 exp = LLVMBuildAnd(builder, i, expmask, "");
3239 }
3240
3241 if(p_floor_log2 || p_log2) {
3242 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3243 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3244 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3245 }
3246
3247 if(p_log2) {
3248 /* mant = 1 + (float) mantissa(x) */
3249 mant = LLVMBuildAnd(builder, i, mantmask, "");
3250 mant = LLVMBuildOr(builder, mant, one, "");
3251 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3252
3253 /* y = (mant - 1) / (mant + 1) */
3254 y = lp_build_div(bld,
3255 lp_build_sub(bld, mant, bld->one),
3256 lp_build_add(bld, mant, bld->one)
3257 );
3258
3259 /* z = y^2 */
3260 z = lp_build_mul(bld, y, y);
3261
3262 /* compute P(z) */
3263 logmant = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3264 Elements(lp_build_log2_polynomial));
3265
3266 /* logmant = y * P(z) */
3267 logmant = lp_build_mul(bld, y, logmant);
3268
3269 res = lp_build_add(bld, logmant, logexp);
3270
3271 if (type.floating && handle_edge_cases) {
3272 LLVMValueRef negmask, infmask, zmask;
3273 negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3274 lp_build_const_vec(bld->gallivm, type, 0.0f));
3275 zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3276 lp_build_const_vec(bld->gallivm, type, 0.0f));
3277 infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3278 lp_build_const_vec(bld->gallivm, type, INFINITY));
3279
3280 /* If x is equal to inf make sure we return inf */
3281 res = lp_build_select(bld, infmask,
3282 lp_build_const_vec(bld->gallivm, type, INFINITY),
3283 res);
3284 /* If x is equal to 0, return -inf */
3285 res = lp_build_select(bld, zmask,
3286 lp_build_const_vec(bld->gallivm, type, -INFINITY),
3287 res);
3288 /* If x is nan or less than 0, return nan */
3289 res = lp_build_select(bld, negmask,
3290 lp_build_const_vec(bld->gallivm, type, NAN),
3291 res);
3292 }
3293 }
3294
3295 if(p_exp) {
3296 exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3297 *p_exp = exp;
3298 }
3299
3300 if(p_floor_log2)
3301 *p_floor_log2 = logexp;
3302
3303 if(p_log2)
3304 *p_log2 = res;
3305 }
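
/*
 * The polynomial evaluated above follows from
 *
 *    ln(mant) = 2 * atanh(y) = 2 * (y + y^3/3 + y^5/5 + ...),
 *    y = (mant - 1) / (mant + 1)
 *
 * divided by ln(2), which is why the leading coefficient of
 * lp_build_log2_polynomial is approximately 2/ln(2) = 2.88539...; the
 * remaining coefficients are a minimax refinement over the reduced range
 * rather than the exact series terms.
 */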
3306
3307
3308 /*
3309 * log2 implementation which doesn't have special code to
3310 * handle edge cases (-inf, 0, inf, NaN). It's faster but
3311 * the results for those cases are undefined.
3312 */
3313 LLVMValueRef
3314 lp_build_log2(struct lp_build_context *bld,
3315 LLVMValueRef x)
3316 {
3317 LLVMValueRef res;
3318 lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
3319 return res;
3320 }
3321
3322 /*
3323 * Version of log2 which handles all edge cases.
3324 * Look at documentation of lp_build_log2_approx for
3325 * description of the behavior for each of the edge cases.
3326 */
3327 LLVMValueRef
3328 lp_build_log2_safe(struct lp_build_context *bld,
3329 LLVMValueRef x)
3330 {
3331 LLVMValueRef res;
3332 lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
3333 return res;
3334 }
3335
3336
3337 /**
3338 * Faster (and less accurate) log2.
3339 *
3340 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3341 *
3342 * Piece-wise linear approximation, with exact results when x is a
3343 * power of two.
3344 *
3345 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3346 */
3347 LLVMValueRef
3348 lp_build_fast_log2(struct lp_build_context *bld,
3349 LLVMValueRef x)
3350 {
3351 LLVMBuilderRef builder = bld->gallivm->builder;
3352 LLVMValueRef ipart;
3353 LLVMValueRef fpart;
3354
3355 assert(lp_check_value(bld->type, x));
3356
3357 assert(bld->type.floating);
3358
3359 /* ipart = floor(log2(x)) - 1 */
3360 ipart = lp_build_extract_exponent(bld, x, -1);
3361 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3362
3363 /* fpart = x / 2**ipart */
3364 fpart = lp_build_extract_mantissa(bld, x);
3365
3366 /* ipart + fpart */
3367 return LLVMBuildFAdd(builder, ipart, fpart, "");
3368 }
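
/*
 * Example for the piece-wise linear approximation above: for x = 8.0 the
 * exponent part is 3 - 1 = 2 and the mantissa part is 1.0, giving exactly
 * 3.0; for x = 12.0 it gives 2 + 1.5 = 3.5 versus the true
 * log2(12) ~= 3.585.  The error is zero at powers of two and largest
 * roughly midway between them.
 */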
3369
3370
3371 /**
3372 * Fast implementation of iround(log2(x)).
3373 *
3374 * Not an approximation -- it should give accurate results all the time.
3375 */
3376 LLVMValueRef
3377 lp_build_ilog2(struct lp_build_context *bld,
3378 LLVMValueRef x)
3379 {
3380 LLVMBuilderRef builder = bld->gallivm->builder;
3381 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3382 LLVMValueRef ipart;
3383
3384 assert(bld->type.floating);
3385
3386 assert(lp_check_value(bld->type, x));
3387
3388 /* x * 2^0.5, i.e. add 0.5 to log2(x) */
3389 x = LLVMBuildFMul(builder, x, sqrt2, "");
3390
3391 /* ipart = floor(log2(x) + 0.5) */
3392 ipart = lp_build_extract_exponent(bld, x, 0);
3393
3394 return ipart;
3395 }
3396
3397 LLVMValueRef
3398 lp_build_mod(struct lp_build_context *bld,
3399 LLVMValueRef x,
3400 LLVMValueRef y)
3401 {
3402 LLVMBuilderRef builder = bld->gallivm->builder;
3403 LLVMValueRef res;
3404 const struct lp_type type = bld->type;
3405
3406 assert(lp_check_value(type, x));
3407 assert(lp_check_value(type, y));
3408
3409 if (type.floating)
3410 res = LLVMBuildFRem(builder, x, y, "");
3411 else if (type.sign)
3412 res = LLVMBuildSRem(builder, x, y, "");
3413 else
3414 res = LLVMBuildURem(builder, x, y, "");
3415 return res;
3416 }
3417
3418
3419 /*
3420 * For floating inputs it creates and returns a mask
3421 * which is all 1's for channels which are NaN.
3422 * Channels inside x which are not NaN will be 0.
3423 */
3424 LLVMValueRef
3425 lp_build_isnan(struct lp_build_context *bld,
3426 LLVMValueRef x)
3427 {
3428 LLVMValueRef mask;
3429 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3430
3431 assert(bld->type.floating);
3432 assert(lp_check_value(bld->type, x));
3433
3434 mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3435 "isnotnan");
3436 mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3437 mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3438 return mask;
3439 }
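
/*
 * The trick above relies on NaN being the only value for which an ordered
 * x == x comparison is false; the SExt then widens the i1 result into the
 * usual all-ones / all-zeros integer mask.
 */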
3440
3441 /* Returns all 1's for floating point numbers that are
3442 * finite, and all zeros for -inf,
3443 * +inf and NaNs. */
3444 LLVMValueRef
3445 lp_build_isfinite(struct lp_build_context *bld,
3446 LLVMValueRef x)
3447 {
3448 LLVMBuilderRef builder = bld->gallivm->builder;
3449 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3450 struct lp_type int_type = lp_int_type(bld->type);
3451 LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3452 LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3453 0x7f800000);
3454
3455 if (!bld->type.floating) {
3456 return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3457 }
3458 assert(bld->type.floating);
3459 assert(lp_check_value(bld->type, x));
3460 assert(bld->type.width == 32);
3461
3462 intx = LLVMBuildAnd(builder, intx, infornan32, "");
3463 return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3464 intx, infornan32);
3465 }
3466
3467 /*
3468 * Returns true if the number is nan or inf and false otherwise.
3469 * The input has to be a floating point vector.
3470 */
3471 LLVMValueRef
3472 lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
3473 const struct lp_type type,
3474 LLVMValueRef x)
3475 {
3476 LLVMBuilderRef builder = gallivm->builder;
3477 struct lp_type int_type = lp_int_type(type);
3478 LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
3479 0x7f800000);
3480 LLVMValueRef ret;
3481
3482 assert(type.floating);
3483
3484 ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
3485 ret = LLVMBuildAnd(builder, ret, const0, "");
3486 ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
3487 ret, const0);
3488
3489 return ret;
3490 }
3491