src/gallium/auxiliary/gallivm/lp_bld_arit.c

   1 /**************************************************************************
   2  *
   3  * Copyright 2009-2010 VMware, Inc.
   4  * All Rights Reserved.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the
   8  * "Software"), to deal in the Software without restriction, including
   9  * without limitation the rights to use, copy, modify, merge, publish,
  10  * distribute, sub license, and/or sell copies of the Software, and to
  11  * permit persons to whom the Software is furnished to do so, subject to
  12  * the following conditions:
  13  *
  14  * The above copyright notice and this permission notice (including the
  15  * next paragraph) shall be included in all copies or substantial portions
  16  * of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  21  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
  22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25  *
  26  **************************************************************************/
  27
  28
  29 /**
  30  * @file
  31  * Helper
  32  *
 * LLVM IR doesn't support all basic arithmetic operations we care about (most
 * notably min/max and saturated operations), and it is often necessary to
 * resort to machine-specific intrinsics directly. The functions here hide all
 * these implementation details from the other modules.
  37  *
  38  * We also do simple expressions simplification here. Reasons are:
  39  * - it is very easy given we have all necessary information readily available
  40  * - LLVM optimization passes fail to simplify several vector expressions
  41  * - We often know value constraints which the optimization passes have no way
  42  *   of knowing, such as when source arguments are known to be in [0, 1] range.
  43  *
  44  * @author Jose Fonseca <jfonseca@vmware.com>
  45  */
  46
  47
  48 #include <float.h>
  49
  50 #include "util/u_memory.h"
  51 #include "util/u_debug.h"
  52 #include "util/u_math.h"
  53 #include "util/u_string.h"
  54 #include "util/u_cpu_detect.h"
  55
  56 #include "lp_bld_type.h"
  57 #include "lp_bld_const.h"
  58 #include "lp_bld_init.h"
  59 #include "lp_bld_intr.h"
  60 #include "lp_bld_logic.h"
  61 #include "lp_bld_pack.h"
  62 #include "lp_bld_debug.h"
  63 #include "lp_bld_bitarit.h"
  64 #include "lp_bld_arit.h"
  65
  66
  67 #define EXP_POLY_DEGREE 5
  68
  69 #define LOG_POLY_DEGREE 4
  70
  71
  72 /**
  73  * Generate min(a, b)
  74  * No checks for special case values of a or b = 1 or 0 are done.
  75  */
  76 static LLVMValueRef
  77 lp_build_min_simple(struct lp_build_context *bld,
  78                     LLVMValueRef a,
  79                     LLVMValueRef b)
  80 {
  81    const struct lp_type type = bld->type;
  82    const char *intrinsic = NULL;
  83    unsigned intr_size = 0;
  84    LLVMValueRef cond;
  85
  86    assert(lp_check_value(type, a));
  87    assert(lp_check_value(type, b));
  88
  89    /* TODO: optimize the constant case */
  90
  91    if (type.floating && util_cpu_caps.has_sse) {
  92       if (type.width == 32) {
  93          if (type.length == 1) {
  94             intrinsic = "llvm.x86.sse.min.ss";
  95             intr_size = 128;
  96          }
  97          else if (type.length <= 4 || !util_cpu_caps.has_avx) {
  98             intrinsic = "llvm.x86.sse.min.ps";
  99             intr_size = 128;
 100          }
 101          else {
 102             intrinsic = "llvm.x86.avx.min.ps.256";
 103             intr_size = 256;
 104          }
 105       }
 106       if (type.width == 64 && util_cpu_caps.has_sse2) {
 107          if (type.length == 1) {
 108             intrinsic = "llvm.x86.sse2.min.sd";
 109             intr_size = 128;
 110          }
 111          else if (type.length == 2 || !util_cpu_caps.has_avx) {
 112             intrinsic = "llvm.x86.sse2.min.pd";
 113             intr_size = 128;
 114          }
 115          else {
 116             intrinsic = "llvm.x86.avx.min.pd.256";
 117             intr_size = 256;
 118          }
 119       }
 120    }
 121    else if (type.floating && util_cpu_caps.has_altivec) {
 122       if (type.width == 32 && type.length == 4) {
 123          intrinsic = "llvm.ppc.altivec.vminfp";
 124          intr_size = 128;
 125       }
 126    } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
 127       intr_size = 128;
 128       if ((type.width == 8 || type.width == 16) &&
 129           (type.width * type.length <= 64) &&
 130           (gallivm_debug & GALLIVM_DEBUG_PERF)) {
 131          debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
 132                       __FUNCTION__);
 133          }
 134       if (type.width == 8 && !type.sign) {
 135          intrinsic = "llvm.x86.sse2.pminu.b";
 136       }
 137       else if (type.width == 16 && type.sign) {
 138          intrinsic = "llvm.x86.sse2.pmins.w";
 139       }
 140       if (util_cpu_caps.has_sse4_1) {
 141          if (type.width == 8 && type.sign) {
 142             intrinsic = "llvm.x86.sse41.pminsb";
 143          }
 144          if (type.width == 16 && !type.sign) {
 145             intrinsic = "llvm.x86.sse41.pminuw";
 146          }
 147          if (type.width == 32 && !type.sign) {
 148             intrinsic = "llvm.x86.sse41.pminud";
 149         }
 150          if (type.width == 32 && type.sign) {
 151             intrinsic = "llvm.x86.sse41.pminsd";
 152          }
 153       }
 154    } else if (util_cpu_caps.has_altivec) {
 155      intr_size = 128;
 156      if (type.width == 8) {
 157        if (!type.sign) {
 158          intrinsic = "llvm.ppc.altivec.vminub";
 159        } else {
 160          intrinsic = "llvm.ppc.altivec.vminsb";
 161        }
 162      } else if (type.width == 16) {
 163        if (!type.sign) {
 164          intrinsic = "llvm.ppc.altivec.vminuh";
 165        } else {
 166          intrinsic = "llvm.ppc.altivec.vminsh";
 167        }
 168      } else if (type.width == 32) {
 169        if (!type.sign) {
 170          intrinsic = "llvm.ppc.altivec.vminuw";
 171        } else {
 172          intrinsic = "llvm.ppc.altivec.vminsw";
 173        }
 174      }
 175    }
 176
 177    if(intrinsic) {
 178       return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
 179                                                  type,
 180                                                  intr_size, a, b);
 181    }
 182
 183    cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
 184    return lp_build_select(bld, cond, a, b);
 185 }
 186
 187
 188 /**
 189  * Generate max(a, b)
 190  * No checks for special case values of a or b = 1 or 0 are done.
 191  */
 192 static LLVMValueRef
 193 lp_build_max_simple(struct lp_build_context *bld,
 194                     LLVMValueRef a,
 195                     LLVMValueRef b)
 196 {
 197    const struct lp_type type = bld->type;
 198    const char *intrinsic = NULL;
 199    unsigned intr_size = 0;
 200    LLVMValueRef cond;
 201
 202    assert(lp_check_value(type, a));
 203    assert(lp_check_value(type, b));
 204
 205    /* TODO: optimize the constant case */
 206
 207    if (type.floating && util_cpu_caps.has_sse) {
 208       if (type.width == 32) {
 209          if (type.length == 1) {
 210             intrinsic = "llvm.x86.sse.max.ss";
 211             intr_size = 128;
 212          }
 213          else if (type.length <= 4 || !util_cpu_caps.has_avx) {
 214             intrinsic = "llvm.x86.sse.max.ps";
 215             intr_size = 128;
 216          }
 217          else {
 218             intrinsic = "llvm.x86.avx.max.ps.256";
 219             intr_size = 256;
 220          }
 221       }
 222       if (type.width == 64 && util_cpu_caps.has_sse2) {
 223          if (type.length == 1) {
 224             intrinsic = "llvm.x86.sse2.max.sd";
 225             intr_size = 128;
 226          }
 227          else if (type.length == 2 || !util_cpu_caps.has_avx) {
 228             intrinsic = "llvm.x86.sse2.max.pd";
 229             intr_size = 128;
 230          }
 231          else {
 232             intrinsic = "llvm.x86.avx.max.pd.256";
 233             intr_size = 256;
 234          }
 235       }
 236    }
 237    else if (type.floating && util_cpu_caps.has_altivec) {
 238       if (type.width == 32 || type.length == 4) {
 239          intrinsic = "llvm.ppc.altivec.vmaxfp";
 240          intr_size = 128;
 241       }
 242    } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
 243       intr_size = 128;
 244       if ((type.width == 8 || type.width == 16) &&
 245           (type.width * type.length <= 64) &&
 246           (gallivm_debug & GALLIVM_DEBUG_PERF)) {
 247          debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
 248                       __FUNCTION__);
 249          }
 250       if (type.width == 8 && !type.sign) {
 251          intrinsic = "llvm.x86.sse2.pmaxu.b";
 252          intr_size = 128;
 253       }
 254       else if (type.width == 16 && type.sign) {
 255          intrinsic = "llvm.x86.sse2.pmaxs.w";
 256       }
 257       if (util_cpu_caps.has_sse4_1) {
 258          if (type.width == 8 && type.sign) {
 259             intrinsic = "llvm.x86.sse41.pmaxsb";
 260          }
 261          if (type.width == 16 && !type.sign) {
 262             intrinsic = "llvm.x86.sse41.pmaxuw";
 263          }
 264          if (type.width == 32 && !type.sign) {
 265             intrinsic = "llvm.x86.sse41.pmaxud";
 266         }
 267          if (type.width == 32 && type.sign) {
 268             intrinsic = "llvm.x86.sse41.pmaxsd";
 269          }
 270       }
 271    } else if (util_cpu_caps.has_altivec) {
 272      intr_size = 128;
 273      if (type.width == 8) {
 274        if (!type.sign) {
 275          intrinsic = "llvm.ppc.altivec.vmaxub";
 276        } else {
 277          intrinsic = "llvm.ppc.altivec.vmaxsb";
 278        }
 279      } else if (type.width == 16) {
 280        if (!type.sign) {
 281          intrinsic = "llvm.ppc.altivec.vmaxuh";
 282        } else {
 283          intrinsic = "llvm.ppc.altivec.vmaxsh";
 284        }
 285      } else if (type.width == 32) {
 286        if (!type.sign) {
 287          intrinsic = "llvm.ppc.altivec.vmaxuw";
 288        } else {
 289          intrinsic = "llvm.ppc.altivec.vmaxsw";
 290        }
 291      }
 292    }
 293
 294    if(intrinsic) {
 295       return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
 296                                                  type,
 297                                                  intr_size, a, b);
 298    }
 299
 300    cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
 301    return lp_build_select(bld, cond, a, b);
 302 }
 303
 304
 305 /**
 306  * Generate 1 - a, or ~a depending on bld->type.
 307  */
 308 LLVMValueRef
 309 lp_build_comp(struct lp_build_context *bld,
 310               LLVMValueRef a)
 311 {
 312    LLVMBuilderRef builder = bld->gallivm->builder;
 313    const struct lp_type type = bld->type;
 314
 315    assert(lp_check_value(type, a));
 316
 317    if(a == bld->one)
 318       return bld->zero;
 319    if(a == bld->zero)
 320       return bld->one;
 321
 322    if(type.norm && !type.floating && !type.fixed && !type.sign) {
 323       if(LLVMIsConstant(a))
 324          return LLVMConstNot(a);
 325       else
 326          return LLVMBuildNot(builder, a, "");
 327    }
 328
 329    if(LLVMIsConstant(a))
 330       if (type.floating)
 331           return LLVMConstFSub(bld->one, a);
 332       else
 333           return LLVMConstSub(bld->one, a);
 334    else
 335       if (type.floating)
 336          return LLVMBuildFSub(builder, bld->one, a, "");
 337       else
 338          return LLVMBuildSub(builder, bld->one, a, "");
 339 }
 340
 341
 342 /**
 343  * Generate a + b
 344  */
 345 LLVMValueRef
 346 lp_build_add(struct lp_build_context *bld,
 347              LLVMValueRef a,
 348              LLVMValueRef b)
 349 {
 350    LLVMBuilderRef builder = bld->gallivm->builder;
 351    const struct lp_type type = bld->type;
 352    LLVMValueRef res;
 353
 354    assert(lp_check_value(type, a));
 355    assert(lp_check_value(type, b));
 356
 357    if(a == bld->zero)
 358       return b;
 359    if(b == bld->zero)
 360       return a;
 361    if(a == bld->undef || b == bld->undef)
 362       return bld->undef;
 363
 364    if(bld->type.norm) {
 365       const char *intrinsic = NULL;
 366
 367       if(a == bld->one || b == bld->one)
 368         return bld->one;
 369
 370       if (type.width * type.length == 128 &&
 371           !type.floating && !type.fixed) {
 372          if(util_cpu_caps.has_sse2) {
 373            if(type.width == 8)
 374              intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
 375            if(type.width == 16)
 376              intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
 377          } else if (util_cpu_caps.has_altivec) {
 378            if(type.width == 8)
 379               intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
 380            if(type.width == 16)
 381               intrinsic = type.sign ? "llvm.ppc.altivec.vaddsws" : "llvm.ppc.altivec.vadduws";
 382          }
 383       }
 384
 385       if(intrinsic)
 386          return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
 387    }
 388
 389    if(LLVMIsConstant(a) && LLVMIsConstant(b))
 390       if (type.floating)
 391          res = LLVMConstFAdd(a, b);
 392       else
 393          res = LLVMConstAdd(a, b);
 394    else
 395       if (type.floating)
 396          res = LLVMBuildFAdd(builder, a, b, "");
 397       else
 398          res = LLVMBuildAdd(builder, a, b, "");
 399
 400    /* clamp to ceiling of 1.0 */
 401    if(bld->type.norm && (bld->type.floating || bld->type.fixed))
 402       res = lp_build_min_simple(bld, res, bld->one);
 403
 404    /* XXX clamp to floor of -1 or 0??? */
 405
 406    return res;
 407 }
 408
 409
 410 /** Return the scalar sum of the elements of a.
 411  * Should avoid this operation whenever possible.
 412  */
 413 LLVMValueRef
 414 lp_build_horizontal_add(struct lp_build_context *bld,
 415                         LLVMValueRef a)
 416 {
 417    LLVMBuilderRef builder = bld->gallivm->builder;
 418    const struct lp_type type = bld->type;
 419    LLVMValueRef index, res;
 420    unsigned i, length;
 421    LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
 422    LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
 423    LLVMValueRef vecres, elem2;
 424
 425    assert(lp_check_value(type, a));
 426
 427    if (type.length == 1) {
 428       return a;
 429    }
 430
 431    assert(!bld->type.norm);
 432
 433    /*
 434     * for byte vectors can do much better with psadbw.
 435     * Using repeated shuffle/adds here. Note with multiple vectors
 436     * this can be done more efficiently as outlined in the intel
 437     * optimization manual.
 438     * Note: could cause data rearrangement if used with smaller element
 439     * sizes.
 440     */
 441
 442    vecres = a;
 443    length = type.length / 2;
 444    while (length > 1) {
 445       LLVMValueRef vec1, vec2;
 446       for (i = 0; i < length; i++) {
 447          shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
 448          shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
 449       }
 450       vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
 451                                     LLVMConstVector(shuffles1, length), "");
 452       vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
 453                                     LLVMConstVector(shuffles2, length), "");
 454       if (type.floating) {
 455          vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
 456       }
 457       else {
 458          vecres = LLVMBuildAdd(builder, vec1, vec2, "");
 459       }
 460       length = length >> 1;
 461    }
 462
 463    /* always have vector of size 2 here */
 464    assert(length == 1);
 465
 466    index = lp_build_const_int32(bld->gallivm, 0);
 467    res = LLVMBuildExtractElement(builder, vecres, index, "");
 468    index = lp_build_const_int32(bld->gallivm, 1);
 469    elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
 470
 471    if (type.floating)
 472       res = LLVMBuildFAdd(builder, res, elem2, "");
 473     else
 474       res = LLVMBuildAdd(builder, res, elem2, "");
 475
 476    return res;
 477 }
 478
 479 /**
 480  * Return the horizontal sums of 4 float vectors as a float4 vector.
 481  * This uses the technique as outlined in Intel Optimization Manual.
 482  */
 483 static LLVMValueRef
 484 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
 485                             LLVMValueRef src[4])
 486 {
 487    struct gallivm_state *gallivm = bld->gallivm;
 488    LLVMBuilderRef builder = gallivm->builder;
 489    LLVMValueRef shuffles[4];
 490    LLVMValueRef tmp[4];
 491    LLVMValueRef sumtmp[2], shuftmp[2];
 492
 493    /* lower half of regs */
 494    shuffles[0] = lp_build_const_int32(gallivm, 0);
 495    shuffles[1] = lp_build_const_int32(gallivm, 1);
 496    shuffles[2] = lp_build_const_int32(gallivm, 4);
 497    shuffles[3] = lp_build_const_int32(gallivm, 5);
 498    tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
 499                                    LLVMConstVector(shuffles, 4), "");
 500    tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
 501                                    LLVMConstVector(shuffles, 4), "");
 502
 503    /* upper half of regs */
 504    shuffles[0] = lp_build_const_int32(gallivm, 2);
 505    shuffles[1] = lp_build_const_int32(gallivm, 3);
 506    shuffles[2] = lp_build_const_int32(gallivm, 6);
 507    shuffles[3] = lp_build_const_int32(gallivm, 7);
 508    tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
 509                                    LLVMConstVector(shuffles, 4), "");
 510    tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
 511                                    LLVMConstVector(shuffles, 4), "");
 512
 513    sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
 514    sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
 515
 516    shuffles[0] = lp_build_const_int32(gallivm, 0);
 517    shuffles[1] = lp_build_const_int32(gallivm, 2);
 518    shuffles[2] = lp_build_const_int32(gallivm, 4);
 519    shuffles[3] = lp_build_const_int32(gallivm, 6);
 520    shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
 521                                        LLVMConstVector(shuffles, 4), "");
 522
 523    shuffles[0] = lp_build_const_int32(gallivm, 1);
 524    shuffles[1] = lp_build_const_int32(gallivm, 3);
 525    shuffles[2] = lp_build_const_int32(gallivm, 5);
 526    shuffles[3] = lp_build_const_int32(gallivm, 7);
 527    shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
 528                                        LLVMConstVector(shuffles, 4), "");
 529
 530    return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
 531 }
 532
 533
 534 /*
 535  * partially horizontally add 2-4 float vectors with length nx4,
 536  * i.e. only four adjacent values in each vector will be added,
 537  * assuming values are really grouped in 4 which also determines
 538  * output order.
 539  *
 540  * Return a vector of the same length as the initial vectors,
 541  * with the excess elements (if any) being undefined.
 542  * The element order is independent of number of input vectors.
 543  * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
 544  * the output order thus will be
 545  * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4z7,undef
 546  */
 547 LLVMValueRef
 548 lp_build_hadd_partial4(struct lp_build_context *bld,
 549                        LLVMValueRef vectors[],
 550                        unsigned num_vecs)
 551 {
 552    struct gallivm_state *gallivm = bld->gallivm;
 553    LLVMBuilderRef builder = gallivm->builder;
 554    LLVMValueRef ret_vec;
 555    LLVMValueRef tmp[4];
 556    const char *intrinsic = NULL;
 557
 558    assert(num_vecs >= 2 && num_vecs <= 4);
 559    assert(bld->type.floating);
 560
 561    /* only use this with at least 2 vectors, as it is sort of expensive
 562     * (depending on cpu) and we always need two horizontal adds anyway,
 563     * so a shuffle/add approach might be better.
 564     */
 565
 566    tmp[0] = vectors[0];
 567    tmp[1] = vectors[1];
 568
 569    tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
 570    tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
 571
 572    if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
 573        bld->type.length == 4) {
 574       intrinsic = "llvm.x86.sse3.hadd.ps";
 575    }
 576    else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
 577             bld->type.length == 8) {
 578       intrinsic = "llvm.x86.avx.hadd.ps.256";
 579    }
 580    if (intrinsic) {
 581       tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
 582                                        lp_build_vec_type(gallivm, bld->type),
 583                                        tmp[0], tmp[1]);
 584       if (num_vecs > 2) {
 585          tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
 586                                           lp_build_vec_type(gallivm, bld->type),
 587                                           tmp[2], tmp[3]);
 588       }
 589       else {
 590          tmp[1] = tmp[0];
 591       }
 592       return lp_build_intrinsic_binary(builder, intrinsic,
 593                                        lp_build_vec_type(gallivm, bld->type),
 594                                        tmp[0], tmp[1]);
 595    }
 596
 597    if (bld->type.length == 4) {
 598       ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
 599    }
 600    else {
 601       LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
 602       unsigned j;
 603       unsigned num_iter = bld->type.length / 4;
 604       struct lp_type parttype = bld->type;
 605       parttype.length = 4;
 606       for (j = 0; j < num_iter; j++) {
 607          LLVMValueRef partsrc[4];
 608          unsigned i;
 609          for (i = 0; i < 4; i++) {
 610             partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
 611          }
 612          partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
 613       }
 614       ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
 615    }
 616    return ret_vec;
 617 }
 618
 619 /**
 620  * Generate a - b
 621  */
 622 LLVMValueRef
 623 lp_build_sub(struct lp_build_context *bld,
 624              LLVMValueRef a,
 625              LLVMValueRef b)
 626 {
 627    LLVMBuilderRef builder = bld->gallivm->builder;
 628    const struct lp_type type = bld->type;
 629    LLVMValueRef res;
 630
 631    assert(lp_check_value(type, a));
 632    assert(lp_check_value(type, b));
 633
 634    if(b == bld->zero)
 635       return a;
 636    if(a == bld->undef || b == bld->undef)
 637       return bld->undef;
 638    if(a == b)
 639       return bld->zero;
 640
 641    if(bld->type.norm) {
 642       const char *intrinsic = NULL;
 643
 644       if(b == bld->one)
 645         return bld->zero;
 646
 647       if (type.width * type.length == 128 &&
 648           !type.floating && !type.fixed) {
 649          if (util_cpu_caps.has_sse2) {
 650            if(type.width == 8)
 651               intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
 652            if(type.width == 16)
 653               intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
 654          } else if (util_cpu_caps.has_altivec) {
 655            if(type.width == 8)
 656               intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
 657            if(type.width == 16)
 658               intrinsic = type.sign ? "llvm.ppc.altivec.vsubsws" : "llvm.ppc.altivec.vsubuws";
 659          }
 660       }
 661
 662       if(intrinsic)
 663          return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
 664    }
 665
 666    if(LLVMIsConstant(a) && LLVMIsConstant(b))
 667       if (type.floating)
 668          res = LLVMConstFSub(a, b);
 669       else
 670          res = LLVMConstSub(a, b);
 671    else
 672       if (type.floating)
 673          res = LLVMBuildFSub(builder, a, b, "");
 674       else
 675          res = LLVMBuildSub(builder, a, b, "");
 676
 677    if(bld->type.norm && (bld->type.floating || bld->type.fixed))
 678       res = lp_build_max_simple(bld, res, bld->zero);
 679
 680    return res;
 681 }
 682
 683
 684
 685 /**
 686  * Normalized multiplication.
 687  *
 688  * There are several approaches for (using 8-bit normalized multiplication as
 689  * an example):
 690  *
 691  * - alpha plus one
 692  *
 693  *     makes the following approximation to the division (Sree)
 694  *
 695  *       a*b/255 ~= (a*(b + 1)) >> 256
 696  *
 697  *     which is the fastest method that satisfies the following OpenGL criteria of
 698  *
 699  *       0*0 = 0 and 255*255 = 255
 700  *
 701  * - geometric series
 702  *
 703  *     takes the geometric series approximation to the division
 704  *
 705  *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
 706  *
 707  *     in this case just the first two terms to fit in 16bit arithmetic
 708  *
 709  *       t/255 ~= (t + (t >> 8)) >> 8
 710  *
 711  *     note that just by itself it doesn't satisfies the OpenGL criteria, as
 712  *     255*255 = 254, so the special case b = 255 must be accounted or roundoff
 713  *     must be used.
 714  *
 715  * - geometric series plus rounding
 716  *
 717  *     when using a geometric series division instead of truncating the result
 718  *     use roundoff in the approximation (Jim Blinn)
 719  *
 720  *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
 721  *
 722  *     achieving the exact results.
 723  *
 724  *
 725  *
 726  * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
 727  *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
 728  * @sa Michael Herf, The "double blend trick", May 2000,
 729  *     http://www.stereopsis.com/doubleblend.html
 730  */
 731 static LLVMValueRef
 732 lp_build_mul_norm(struct gallivm_state *gallivm,
 733                   struct lp_type wide_type,
 734                   LLVMValueRef a, LLVMValueRef b)
 735 {
 736    LLVMBuilderRef builder = gallivm->builder;
 737    struct lp_build_context bld;
 738    unsigned n;
 739    LLVMValueRef half;
 740    LLVMValueRef ab;
 741
 742    assert(!wide_type.floating);
 743    assert(lp_check_value(wide_type, a));
 744    assert(lp_check_value(wide_type, b));
 745
 746    lp_build_context_init(&bld, gallivm, wide_type);
 747
 748    n = wide_type.width / 2;
 749    if (wide_type.sign) {
 750       --n;
 751    }
 752
 753    /*
 754     * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
 755     * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
 756     */
 757
 758    /*
 759     * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
 760     */
 761
 762    ab = LLVMBuildMul(builder, a, b, "");
 763    ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
 764
 765    /*
 766     * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
 767     */
 768
 769    half = lp_build_const_int_vec(gallivm, wide_type, 1 << (n - 1));
 770    if (wide_type.sign) {
 771       LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
 772       LLVMValueRef sign = lp_build_shr_imm(&bld, half, wide_type.width - 1);
 773       half = lp_build_select(&bld, sign, minus_half, half);
 774    }
 775    ab = LLVMBuildAdd(builder, ab, half, "");
 776
 777    /* Final division */
 778    ab = lp_build_shr_imm(&bld, ab, n);
 779
 780    return ab;
 781 }
 782
 783 /**
 784  * Generate a * b
 785  */
 786 LLVMValueRef
 787 lp_build_mul(struct lp_build_context *bld,
 788              LLVMValueRef a,
 789              LLVMValueRef b)
 790 {
 791    LLVMBuilderRef builder = bld->gallivm->builder;
 792    const struct lp_type type = bld->type;
 793    LLVMValueRef shift;
 794    LLVMValueRef res;
 795
 796    assert(lp_check_value(type, a));
 797    assert(lp_check_value(type, b));
 798
 799    if(a == bld->zero)
 800       return bld->zero;
 801    if(a == bld->one)
 802       return b;
 803    if(b == bld->zero)
 804       return bld->zero;
 805    if(b == bld->one)
 806       return a;
 807    if(a == bld->undef || b == bld->undef)
 808       return bld->undef;
 809
 810    if (!type.floating && !type.fixed && type.norm) {
 811       struct lp_type wide_type = lp_wider_type(type);
 812       LLVMValueRef al, ah, bl, bh, abl, abh, ab;
 813
 814       lp_build_unpack2(bld->gallivm, type, wide_type, a, &al, &ah);
 815       lp_build_unpack2(bld->gallivm, type, wide_type, b, &bl, &bh);
 816
 817       /* PMULLW, PSRLW, PADDW */
 818       abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
 819       abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
 820
 821       ab = lp_build_pack2(bld->gallivm, wide_type, type, abl, abh);
 822
 823       return ab;
 824    }
 825
 826    if(type.fixed)
 827       shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
 828    else
 829       shift = NULL;
 830
 831    if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
 832       if (type.floating)
 833          res = LLVMConstFMul(a, b);
 834       else
 835          res = LLVMConstMul(a, b);
 836       if(shift) {
 837          if(type.sign)
 838             res = LLVMConstAShr(res, shift);
 839          else
 840             res = LLVMConstLShr(res, shift);
 841       }
 842    }
 843    else {
 844       if (type.floating)
 845          res = LLVMBuildFMul(builder, a, b, "");
 846       else
 847          res = LLVMBuildMul(builder, a, b, "");
 848       if(shift) {
 849          if(type.sign)
 850             res = LLVMBuildAShr(builder, res, shift, "");
 851          else
 852             res = LLVMBuildLShr(builder, res, shift, "");
 853       }
 854    }
 855
 856    return res;
 857 }
 858
 859
 860 /**
 861  * Small vector x scale multiplication optimization.
 862  */
 863 LLVMValueRef
 864 lp_build_mul_imm(struct lp_build_context *bld,
 865                  LLVMValueRef a,
 866                  int b)
 867 {
 868    LLVMBuilderRef builder = bld->gallivm->builder;
 869    LLVMValueRef factor;
 870
 871    assert(lp_check_value(bld->type, a));
 872
 873    if(b == 0)
 874       return bld->zero;
 875
 876    if(b == 1)
 877       return a;
 878
 879    if(b == -1)
 880       return lp_build_negate(bld, a);
 881
 882    if(b == 2 && bld->type.floating)
 883       return lp_build_add(bld, a, a);
 884
 885    if(util_is_power_of_two(b)) {
 886       unsigned shift = ffs(b) - 1;
 887
 888       if(bld->type.floating) {
 889 #if 0
 890          /*
 891           * Power of two multiplication by directly manipulating the exponent.
 892           *
 893           * XXX: This might not be always faster, it will introduce a small error
 894           * for multiplication by zero, and it will produce wrong results
 895           * for Inf and NaN.
 896           */
 897          unsigned mantissa = lp_mantissa(bld->type);
 898          factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
 899          a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
 900          a = LLVMBuildAdd(builder, a, factor, "");
 901          a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
 902          return a;
 903 #endif
 904       }
 905       else {
 906          factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
 907          return LLVMBuildShl(builder, a, factor, "");
 908       }
 909    }
 910
 911    factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
 912    return lp_build_mul(bld, a, factor);
 913 }
 914
 915
 916 /**
 917  * Generate a / b
 918  */
 919 LLVMValueRef
 920 lp_build_div(struct lp_build_context *bld,
 921              LLVMValueRef a,
 922              LLVMValueRef b)
 923 {
 924    LLVMBuilderRef builder = bld->gallivm->builder;
 925    const struct lp_type type = bld->type;
 926
 927    assert(lp_check_value(type, a));
 928    assert(lp_check_value(type, b));
 929
 930    if(a == bld->zero)
 931       return bld->zero;
 932    if(a == bld->one)
 933       return lp_build_rcp(bld, b);
 934    if(b == bld->zero)
 935       return bld->undef;
 936    if(b == bld->one)
 937       return a;
 938    if(a == bld->undef || b == bld->undef)
 939       return bld->undef;
 940
 941    if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
 942       if (type.floating)
 943          return LLVMConstFDiv(a, b);
 944       else if (type.sign)
 945          return LLVMConstSDiv(a, b);
 946       else
 947          return LLVMConstUDiv(a, b);
 948    }
 949
 950    if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
 951        (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
 952       type.floating)
 953       return lp_build_mul(bld, a, lp_build_rcp(bld, b));
 954
 955    if (type.floating)
 956       return LLVMBuildFDiv(builder, a, b, "");
 957    else if (type.sign)
 958       return LLVMBuildSDiv(builder, a, b, "");
 959    else
 960       return LLVMBuildUDiv(builder, a, b, "");
 961 }
 962
 963
 964 /**
 965  * Linear interpolation helper.
 966  *
 967  * @param normalized whether we are interpolating normalized values,
 968  *        encoded in normalized integers, twice as wide.
 969  *
 970  * @sa http://www.stereopsis.com/doubleblend.html
 971  */
 972 static INLINE LLVMValueRef
 973 lp_build_lerp_simple(struct lp_build_context *bld,
 974                      LLVMValueRef x,
 975                      LLVMValueRef v0,
 976                      LLVMValueRef v1,
 977                      bool normalized)
 978 {
 979    unsigned half_width = bld->type.width/2;
 980    LLVMBuilderRef builder = bld->gallivm->builder;
 981    LLVMValueRef delta;
 982    LLVMValueRef res;
 983
 984    assert(lp_check_value(bld->type, x));
 985    assert(lp_check_value(bld->type, v0));
 986    assert(lp_check_value(bld->type, v1));
 987
 988    delta = lp_build_sub(bld, v1, v0);
 989
 990    if (normalized) {
 991       if (!bld->type.sign) {
 992          /*
 993           * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
 994           * most-significant-bit to the lowest-significant-bit, so that
 995           * later we can just divide by 2**n instead of 2**n - 1.
 996           */
 997          x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
 998
 999          /* (x * delta) >> n */
1000          res = lp_build_mul(bld, x, delta);
1001          res = lp_build_shr_imm(bld, res, half_width);
1002       } else {
1003          /*
1004           * The rescaling trick above doesn't work for signed numbers, so
1005           * use the 2**n - 1 divison approximation in lp_build_mul_norm
1006           * instead.
1007           */
1008          res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1009       }
1010    } else {
1011       res = lp_build_mul(bld, x, delta);
1012    }
1013
1014    res = lp_build_add(bld, v0, res);
1015
1016    if ((normalized && !bld->type.sign) || bld->type.fixed) {
1017       /* We need to mask out the high order bits when lerping 8bit normalized colors stored on 16bits */
1018       /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
1019        * but it will be wrong for true fixed point use cases. Basically we need
1020        * a more powerful lp_type, capable of further distinguishing the values
1021        * interpretation from the value storage. */
1022       res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1), "");
1023    }
1024
1025    return res;
1026 }
1027
1028
1029 /**
1030  * Linear interpolation.
1031  */
1032 LLVMValueRef
1033 lp_build_lerp(struct lp_build_context *bld,
1034               LLVMValueRef x,
1035               LLVMValueRef v0,
1036               LLVMValueRef v1)
1037 {
1038    const struct lp_type type = bld->type;
1039    LLVMValueRef res;
1040
1041    assert(lp_check_value(type, x));
1042    assert(lp_check_value(type, v0));
1043    assert(lp_check_value(type, v1));
1044
1045    if (type.norm) {
1046       struct lp_type wide_type;
1047       struct lp_build_context wide_bld;
1048       LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1049
1050       assert(type.length >= 2);
1051
1052       /*
1053        * Create a wider integer type, enough to hold the
1054        * intermediate result of the multiplication.
1055        */
1056       memset(&wide_type, 0, sizeof wide_type);
1057       wide_type.sign   = type.sign;
1058       wide_type.width  = type.width*2;
1059       wide_type.length = type.length/2;
1060
1061       lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1062
1063       lp_build_unpack2(bld->gallivm, type, wide_type, x,  &xl,  &xh);
1064       lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1065       lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1066
1067       /*
1068        * Lerp both halves.
1069        */
1070
1071       resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, TRUE);
1072       resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, TRUE);
1073
1074       res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
1075    } else {
1076       res = lp_build_lerp_simple(bld, x, v0, v1, FALSE);
1077    }
1078
1079    return res;
1080 }
1081
1082
1083 LLVMValueRef
1084 lp_build_lerp_2d(struct lp_build_context *bld,
1085                  LLVMValueRef x,
1086                  LLVMValueRef y,
1087                  LLVMValueRef v00,
1088                  LLVMValueRef v01,
1089                  LLVMValueRef v10,
1090                  LLVMValueRef v11)
1091 {
1092    LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01);
1093    LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11);
1094    return lp_build_lerp(bld, y, v0, v1);
1095 }
1096
1097
1098 /**
1099  * Generate min(a, b)
1100  * Do checks for special cases.
1101  */
1102 LLVMValueRef
1103 lp_build_min(struct lp_build_context *bld,
1104              LLVMValueRef a,
1105              LLVMValueRef b)
1106 {
1107    assert(lp_check_value(bld->type, a));
1108    assert(lp_check_value(bld->type, b));
1109
1110    if(a == bld->undef || b == bld->undef)
1111       return bld->undef;
1112
1113    if(a == b)
1114       return a;
1115
1116    if (bld->type.norm) {
1117       if (!bld->type.sign) {
1118          if (a == bld->zero || b == bld->zero) {
1119             return bld->zero;
1120          }
1121       }
1122       if(a == bld->one)
1123          return b;
1124       if(b == bld->one)
1125          return a;
1126    }
1127
1128    return lp_build_min_simple(bld, a, b);
1129 }
1130
1131
1132 /**
1133  * Generate max(a, b)
1134  * Do checks for special cases.
1135  */
1136 LLVMValueRef
1137 lp_build_max(struct lp_build_context *bld,
1138              LLVMValueRef a,
1139              LLVMValueRef b)
1140 {
1141    assert(lp_check_value(bld->type, a));
1142    assert(lp_check_value(bld->type, b));
1143
1144    if(a == bld->undef || b == bld->undef)
1145       return bld->undef;
1146
1147    if(a == b)
1148       return a;
1149
1150    if(bld->type.norm) {
1151       if(a == bld->one || b == bld->one)
1152          return bld->one;
1153       if (!bld->type.sign) {
1154          if (a == bld->zero) {
1155             return b;
1156          }
1157          if (b == bld->zero) {
1158             return a;
1159          }
1160       }
1161    }
1162
1163    return lp_build_max_simple(bld, a, b);
1164 }
1165
1166
1167 /**
1168  * Generate clamp(a, min, max)
1169  * Do checks for special cases.
1170  */
1171 LLVMValueRef
1172 lp_build_clamp(struct lp_build_context *bld,
1173                LLVMValueRef a,
1174                LLVMValueRef min,
1175                LLVMValueRef max)
1176 {
1177    assert(lp_check_value(bld->type, a));
1178    assert(lp_check_value(bld->type, min));
1179    assert(lp_check_value(bld->type, max));
1180
1181    a = lp_build_min(bld, a, max);
1182    a = lp_build_max(bld, a, min);
1183    return a;
1184 }
1185
1186
1187 /**
1188  * Generate abs(a)
1189  */
1190 LLVMValueRef
1191 lp_build_abs(struct lp_build_context *bld,
1192              LLVMValueRef a)
1193 {
1194    LLVMBuilderRef builder = bld->gallivm->builder;
1195    const struct lp_type type = bld->type;
1196    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1197
1198    assert(lp_check_value(type, a));
1199
1200    if(!type.sign)
1201       return a;
1202
1203    if(type.floating) {
1204       /* Mask out the sign bit */
1205       LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1206       unsigned long long absMask = ~(1ULL << (type.width - 1));
1207       LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
1208       a = LLVMBuildBitCast(builder, a, int_vec_type, "");
1209       a = LLVMBuildAnd(builder, a, mask, "");
1210       a = LLVMBuildBitCast(builder, a, vec_type, "");
1211       return a;
1212    }
1213
1214    if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
1215       switch(type.width) {
1216       case 8:
1217          return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1218       case 16:
1219          return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1220       case 32:
1221          return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1222       }
1223    }
1224    else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
1225             (gallivm_debug & GALLIVM_DEBUG_PERF) &&
1226             (type.width == 8 || type.width == 16 || type.width == 32)) {
1227       debug_printf("%s: inefficient code, should split vectors manually\n",
1228                    __FUNCTION__);
1229    }
1230
1231    return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
1232 }
1233
1234
1235 LLVMValueRef
1236 lp_build_negate(struct lp_build_context *bld,
1237                 LLVMValueRef a)
1238 {
1239    LLVMBuilderRef builder = bld->gallivm->builder;
1240
1241    assert(lp_check_value(bld->type, a));
1242
1243 #if HAVE_LLVM >= 0x0207
1244    if (bld->type.floating)
1245       a = LLVMBuildFNeg(builder, a, "");
1246    else
1247 #endif
1248       a = LLVMBuildNeg(builder, a, "");
1249
1250    return a;
1251 }
1252
1253
1254 /** Return -1, 0 or +1 depending on the sign of a */
1255 LLVMValueRef
1256 lp_build_sgn(struct lp_build_context *bld,
1257              LLVMValueRef a)
1258 {
1259    LLVMBuilderRef builder = bld->gallivm->builder;
1260    const struct lp_type type = bld->type;
1261    LLVMValueRef cond;
1262    LLVMValueRef res;
1263
1264    assert(lp_check_value(type, a));
1265
1266    /* Handle non-zero case */
1267    if(!type.sign) {
1268       /* if not zero then sign must be positive */
1269       res = bld->one;
1270    }
1271    else if(type.floating) {
1272       LLVMTypeRef vec_type;
1273       LLVMTypeRef int_type;
1274       LLVMValueRef mask;
1275       LLVMValueRef sign;
1276       LLVMValueRef one;
1277       unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1278
1279       int_type = lp_build_int_vec_type(bld->gallivm, type);
1280       vec_type = lp_build_vec_type(bld->gallivm, type);
1281       mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1282
1283       /* Take the sign bit and add it to 1 constant */
1284       sign = LLVMBuildBitCast(builder, a, int_type, "");
1285       sign = LLVMBuildAnd(builder, sign, mask, "");
1286       one = LLVMConstBitCast(bld->one, int_type);
1287       res = LLVMBuildOr(builder, sign, one, "");
1288       res = LLVMBuildBitCast(builder, res, vec_type, "");
1289    }
1290    else
1291    {
1292       /* signed int/norm/fixed point */
1293       /* could use psign with sse3 and appropriate vectors here */
1294       LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1295       cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1296       res = lp_build_select(bld, cond, bld->one, minus_one);
1297    }
1298
1299    /* Handle zero */
1300    cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1301    res = lp_build_select(bld, cond, bld->zero, res);
1302
1303    return res;
1304 }
1305
1306
1307 /**
1308  * Set the sign of float vector 'a' according to 'sign'.
1309  * If sign==0, return abs(a).
1310  * If sign==1, return -abs(a);
1311  * Other values for sign produce undefined results.
1312  */
1313 LLVMValueRef
1314 lp_build_set_sign(struct lp_build_context *bld,
1315                   LLVMValueRef a, LLVMValueRef sign)
1316 {
1317    LLVMBuilderRef builder = bld->gallivm->builder;
1318    const struct lp_type type = bld->type;
1319    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1320    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1321    LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1322    LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1323                              ~((unsigned long long) 1 << (type.width - 1)));
1324    LLVMValueRef val, res;
1325
1326    assert(type.floating);
1327    assert(lp_check_value(type, a));
1328
1329    /* val = reinterpret_cast<int>(a) */
1330    val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1331    /* val = val & mask */
1332    val = LLVMBuildAnd(builder, val, mask, "");
1333    /* sign = sign << shift */
1334    sign = LLVMBuildShl(builder, sign, shift, "");
1335    /* res = val | sign */
1336    res = LLVMBuildOr(builder, val, sign, "");
1337    /* res = reinterpret_cast<float>(res) */
1338    res = LLVMBuildBitCast(builder, res, vec_type, "");
1339
1340    return res;
1341 }
1342
1343
1344 /**
1345  * Convert vector of (or scalar) int to vector of (or scalar) float.
1346  */
1347 LLVMValueRef
1348 lp_build_int_to_float(struct lp_build_context *bld,
1349                       LLVMValueRef a)
1350 {
1351    LLVMBuilderRef builder = bld->gallivm->builder;
1352    const struct lp_type type = bld->type;
1353    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1354
1355    assert(type.floating);
1356
1357    return LLVMBuildSIToFP(builder, a, vec_type, "");
1358 }
1359
1360 static boolean
1361 arch_rounding_available(const struct lp_type type)
1362 {
1363    if ((util_cpu_caps.has_sse4_1 &&
1364        (type.length == 1 || type.width*type.length == 128)) ||
1365        (util_cpu_caps.has_avx && type.width*type.length == 256))
1366       return TRUE;
1367    else if ((util_cpu_caps.has_altivec &&
1368             (type.width == 32 && type.length == 4)))
1369       return TRUE;
1370
1371    return FALSE;
1372 }
1373
/*
 * Rounding modes accepted by the lp_build_round_* helpers.
 *
 * The numeric values are passed unchanged as the i32 mode operand of the
 * SSE4.1/AVX round intrinsics (see lp_build_round_sse41), so they must not
 * be renumbered.
 */
enum lp_build_round_mode
{
   LP_BUILD_ROUND_NEAREST = 0,   /* round to nearest integer */
   LP_BUILD_ROUND_FLOOR = 1,     /* round toward negative infinity */
   LP_BUILD_ROUND_CEIL = 2,      /* round toward positive infinity */
   LP_BUILD_ROUND_TRUNCATE = 3   /* round toward zero */
};
1381
1382 /**
1383  * Helper for SSE4.1's ROUNDxx instructions.
1384  *
1385  * NOTE: In the SSE4.1's nearest mode, if two values are equally close, the
1386  * result is the even value.  That is, rounding 2.5 will be 2.0, and not 3.0.
1387  */
1388 static INLINE LLVMValueRef
1389 lp_build_round_sse41(struct lp_build_context *bld,
1390                      LLVMValueRef a,
1391                      enum lp_build_round_mode mode)
1392 {
1393    LLVMBuilderRef builder = bld->gallivm->builder;
1394    const struct lp_type type = bld->type;
1395    LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1396    const char *intrinsic;
1397    LLVMValueRef res;
1398
1399    assert(type.floating);
1400
1401    assert(lp_check_value(type, a));
1402    assert(util_cpu_caps.has_sse4_1);
1403
1404    if (type.length == 1) {
1405       LLVMTypeRef vec_type;
1406       LLVMValueRef undef;
1407       LLVMValueRef args[3];
1408       LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1409
1410       switch(type.width) {
1411       case 32:
1412          intrinsic = "llvm.x86.sse41.round.ss";
1413          break;
1414       case 64:
1415          intrinsic = "llvm.x86.sse41.round.sd";
1416          break;
1417       default:
1418          assert(0);
1419          return bld->undef;
1420       }
1421
1422       vec_type = LLVMVectorType(bld->elem_type, 4);
1423
1424       undef = LLVMGetUndef(vec_type);
1425
1426       args[0] = undef;
1427       args[1] = LLVMBuildInsertElement(builder, undef, a, index0, "");
1428       args[2] = LLVMConstInt(i32t, mode, 0);
1429
1430       res = lp_build_intrinsic(builder, intrinsic,
1431                                vec_type, args, Elements(args));
1432
1433       res = LLVMBuildExtractElement(builder, res, index0, "");
1434    }
1435    else {
1436       if (type.width * type.length == 128) {
1437          switch(type.width) {
1438          case 32:
1439             intrinsic = "llvm.x86.sse41.round.ps";
1440             break;
1441          case 64:
1442             intrinsic = "llvm.x86.sse41.round.pd";
1443             break;
1444          default:
1445             assert(0);
1446             return bld->undef;
1447          }
1448       }
1449       else {
1450          assert(type.width * type.length == 256);
1451          assert(util_cpu_caps.has_avx);
1452
1453          switch(type.width) {
1454          case 32:
1455             intrinsic = "llvm.x86.avx.round.ps.256";
1456             break;
1457          case 64:
1458             intrinsic = "llvm.x86.avx.round.pd.256";
1459             break;
1460          default:
1461             assert(0);
1462             return bld->undef;
1463          }
1464       }
1465
1466       res = lp_build_intrinsic_binary(builder, intrinsic,
1467                                       bld->vec_type, a,
1468                                       LLVMConstInt(i32t, mode, 0));
1469    }
1470
1471    return res;
1472 }
1473
1474
1475 static INLINE LLVMValueRef
1476 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1477                              LLVMValueRef a)
1478 {
1479    LLVMBuilderRef builder = bld->gallivm->builder;
1480    const struct lp_type type = bld->type;
1481    LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1482    LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1483    const char *intrinsic;
1484    LLVMValueRef res;
1485
1486    assert(type.floating);
1487    /* using the double precision conversions is a bit more complicated */
1488    assert(type.width == 32);
1489
1490    assert(lp_check_value(type, a));
1491    assert(util_cpu_caps.has_sse2);
1492
1493    /* This is relying on MXCSR rounding mode, which should always be nearest. */
1494    if (type.length == 1) {
1495       LLVMTypeRef vec_type;
1496       LLVMValueRef undef;
1497       LLVMValueRef arg;
1498       LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1499
1500       vec_type = LLVMVectorType(bld->elem_type, 4);
1501
1502       intrinsic = "llvm.x86.sse.cvtss2si";
1503
1504       undef = LLVMGetUndef(vec_type);
1505
1506       arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
1507
1508       res = lp_build_intrinsic_unary(builder, intrinsic,
1509                                      ret_type, arg);
1510    }
1511    else {
1512       if (type.width* type.length == 128) {
1513          intrinsic = "llvm.x86.sse2.cvtps2dq";
1514       }
1515       else {
1516          assert(type.width*type.length == 256);
1517          assert(util_cpu_caps.has_avx);
1518
1519          intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
1520       }
1521       res = lp_build_intrinsic_unary(builder, intrinsic,
1522                                      ret_type, a);
1523    }
1524
1525    return res;
1526 }
1527
1528
1529 /*
1530  */
1531 static INLINE LLVMValueRef
1532 lp_build_round_altivec(struct lp_build_context *bld,
1533                        LLVMValueRef a,
1534                        enum lp_build_round_mode mode)
1535 {
1536    LLVMBuilderRef builder = bld->gallivm->builder;
1537    const struct lp_type type = bld->type;
1538    const char *intrinsic = NULL;
1539
1540    assert(type.floating);
1541
1542    assert(lp_check_value(type, a));
1543    assert(util_cpu_caps.has_altivec);
1544
1545    switch (mode) {
1546    case LP_BUILD_ROUND_NEAREST:
1547       intrinsic = "llvm.ppc.altivec.vrfin";
1548       break;
1549    case LP_BUILD_ROUND_FLOOR:
1550       intrinsic = "llvm.ppc.altivec.vrfim";
1551       break;
1552    case LP_BUILD_ROUND_CEIL:
1553       intrinsic = "llvm.ppc.altivec.vrfip";
1554       break;
1555    case LP_BUILD_ROUND_TRUNCATE:
1556       intrinsic = "llvm.ppc.altivec.vrfiz";
1557       break;
1558    }
1559
1560    return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1561 }
1562
1563 static INLINE LLVMValueRef
1564 lp_build_round_arch(struct lp_build_context *bld,
1565                     LLVMValueRef a,
1566                     enum lp_build_round_mode mode)
1567 {
1568    if (util_cpu_caps.has_sse4_1)
1569      return lp_build_round_sse41(bld, a, mode);
1570    else /* (util_cpu_caps.has_altivec) */
1571      return lp_build_round_altivec(bld, a, mode);
1572 }
1573
1574 /**
1575  * Return the integer part of a float (vector) value (== round toward zero).
1576  * The returned value is a float (vector).
1577  * Ex: trunc(-1.5) = -1.0
1578  */
1579 LLVMValueRef
1580 lp_build_trunc(struct lp_build_context *bld,
1581                LLVMValueRef a)
1582 {
1583    LLVMBuilderRef builder = bld->gallivm->builder;
1584    const struct lp_type type = bld->type;
1585
1586    assert(type.floating);
1587    assert(lp_check_value(type, a));
1588
1589    if (arch_rounding_available(type)) {
1590       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
1591    }
1592    else {
1593       LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1594       LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1595       LLVMValueRef res;
1596       res = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1597       res = LLVMBuildSIToFP(builder, res, vec_type, "");
1598       return res;
1599    }
1600 }
1601
1602
1603 /**
1604  * Return float (vector) rounded to nearest integer (vector).  The returned
1605  * value is a float (vector).
1606  * Ex: round(0.9) = 1.0
1607  * Ex: round(-1.5) = -2.0
1608  */
1609 LLVMValueRef
1610 lp_build_round(struct lp_build_context *bld,
1611                LLVMValueRef a)
1612 {
1613    LLVMBuilderRef builder = bld->gallivm->builder;
1614    const struct lp_type type = bld->type;
1615
1616    assert(type.floating);
1617    assert(lp_check_value(type, a));
1618
1619    if (arch_rounding_available(type)) {
1620       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
1621    }
1622    else {
1623       LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1624       LLVMValueRef res;
1625       res = lp_build_iround(bld, a);
1626       res = LLVMBuildSIToFP(builder, res, vec_type, "");
1627       return res;
1628    }
1629 }
1630
1631
1632 /**
1633  * Return floor of float (vector), result is a float (vector)
1634  * Ex: floor(1.1) = 1.0
1635  * Ex: floor(-1.1) = -2.0
1636  */
1637 LLVMValueRef
1638 lp_build_floor(struct lp_build_context *bld,
1639                LLVMValueRef a)
1640 {
1641    LLVMBuilderRef builder = bld->gallivm->builder;
1642    const struct lp_type type = bld->type;
1643
1644    assert(type.floating);
1645    assert(lp_check_value(type, a));
1646
1647    if (arch_rounding_available(type)) {
1648       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
1649    }
1650    else {
1651       LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1652       LLVMValueRef res;
1653       res = lp_build_ifloor(bld, a);
1654       res = LLVMBuildSIToFP(builder, res, vec_type, "");
1655       return res;
1656    }
1657 }
1658
1659
1660 /**
1661  * Return ceiling of float (vector), returning float (vector).
1662  * Ex: ceil( 1.1) = 2.0
1663  * Ex: ceil(-1.1) = -1.0
1664  */
1665 LLVMValueRef
1666 lp_build_ceil(struct lp_build_context *bld,
1667               LLVMValueRef a)
1668 {
1669    LLVMBuilderRef builder = bld->gallivm->builder;
1670    const struct lp_type type = bld->type;
1671
1672    assert(type.floating);
1673    assert(lp_check_value(type, a));
1674
1675    if (arch_rounding_available(type)) {
1676       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
1677    }
1678    else {
1679       LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1680       LLVMValueRef res;
1681       res = lp_build_iceil(bld, a);
1682       res = LLVMBuildSIToFP(builder, res, vec_type, "");
1683       return res;
1684    }
1685 }
1686
1687
1688 /**
1689  * Return fractional part of 'a' computed as a - floor(a)
1690  * Typically used in texture coord arithmetic.
1691  */
1692 LLVMValueRef
1693 lp_build_fract(struct lp_build_context *bld,
1694                LLVMValueRef a)
1695 {
1696    assert(bld->type.floating);
1697    return lp_build_sub(bld, a, lp_build_floor(bld, a));
1698 }
1699
1700
1701 /**
1702  * Prevent returning a fractional part of 1.0 for very small negative values of
1703  * 'a' by clamping against 0.99999(9).
1704  */
1705 static inline LLVMValueRef
1706 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
1707 {
1708    LLVMValueRef max;
1709
1710    /* this is the largest number smaller than 1.0 representable as float */
1711    max = lp_build_const_vec(bld->gallivm, bld->type,
1712                             1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
1713    return lp_build_min(bld, fract, max);
1714 }
1715
1716
1717 /**
1718  * Same as lp_build_fract, but guarantees that the result is always smaller
1719  * than one.
1720  */
1721 LLVMValueRef
1722 lp_build_fract_safe(struct lp_build_context *bld,
1723                     LLVMValueRef a)
1724 {
1725    return clamp_fract(bld, lp_build_fract(bld, a));
1726 }
1727
1728
1729 /**
1730  * Return the integer part of a float (vector) value (== round toward zero).
1731  * The returned value is an integer (vector).
1732  * Ex: itrunc(-1.5) = -1
1733  */
1734 LLVMValueRef
1735 lp_build_itrunc(struct lp_build_context *bld,
1736                 LLVMValueRef a)
1737 {
1738    LLVMBuilderRef builder = bld->gallivm->builder;
1739    const struct lp_type type = bld->type;
1740    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1741
1742    assert(type.floating);
1743    assert(lp_check_value(type, a));
1744
1745    return LLVMBuildFPToSI(builder, a, int_vec_type, "");
1746 }
1747
1748
1749 /**
1750  * Return float (vector) rounded to nearest integer (vector).  The returned
1751  * value is an integer (vector).
1752  * Ex: iround(0.9) = 1
1753  * Ex: iround(-1.5) = -2
1754  */
1755 LLVMValueRef
1756 lp_build_iround(struct lp_build_context *bld,
1757                 LLVMValueRef a)
1758 {
1759    LLVMBuilderRef builder = bld->gallivm->builder;
1760    const struct lp_type type = bld->type;
1761    LLVMTypeRef int_vec_type = bld->int_vec_type;
1762    LLVMValueRef res;
1763
1764    assert(type.floating);
1765
1766    assert(lp_check_value(type, a));
1767
1768    if ((util_cpu_caps.has_sse2 &&
1769        ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
1770        (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
1771       return lp_build_iround_nearest_sse2(bld, a);
1772    }
1773    if (arch_rounding_available(type)) {
1774       res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
1775    }
1776    else {
1777       LLVMValueRef half;
1778
1779       half = lp_build_const_vec(bld->gallivm, type, 0.5);
1780
1781       if (type.sign) {
1782          LLVMTypeRef vec_type = bld->vec_type;
1783          LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1784                                     (unsigned long long)1 << (type.width - 1));
1785          LLVMValueRef sign;
1786
1787          /* get sign bit */
1788          sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
1789          sign = LLVMBuildAnd(builder, sign, mask, "");
1790
1791          /* sign * 0.5 */
1792          half = LLVMBuildBitCast(builder, half, int_vec_type, "");
1793          half = LLVMBuildOr(builder, sign, half, "");
1794          half = LLVMBuildBitCast(builder, half, vec_type, "");
1795       }
1796
1797       res = LLVMBuildFAdd(builder, a, half, "");
1798    }
1799
1800    res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
1801
1802    return res;
1803 }
1804
1805
1806 /**
1807  * Return floor of float (vector), result is an int (vector)
1808  * Ex: ifloor(1.1) = 1.0
1809  * Ex: ifloor(-1.1) = -2.0
1810  */
1811 LLVMValueRef
1812 lp_build_ifloor(struct lp_build_context *bld,
1813                 LLVMValueRef a)
1814 {
1815    LLVMBuilderRef builder = bld->gallivm->builder;
1816    const struct lp_type type = bld->type;
1817    LLVMTypeRef int_vec_type = bld->int_vec_type;
1818    LLVMValueRef res;
1819
1820    assert(type.floating);
1821    assert(lp_check_value(type, a));
1822
1823    res = a;
1824    if (type.sign) {
1825       if (arch_rounding_available(type)) {
1826          res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
1827       }
1828       else {
1829          /* Take the sign bit and add it to 1 constant */
1830          LLVMTypeRef vec_type = bld->vec_type;
1831          unsigned mantissa = lp_mantissa(type);
1832          LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1833                                   (unsigned long long)1 << (type.width - 1));
1834          LLVMValueRef sign;
1835          LLVMValueRef offset;
1836
1837          /* sign = a < 0 ? ~0 : 0 */
1838          sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
1839          sign = LLVMBuildAnd(builder, sign, mask, "");
1840          sign = LLVMBuildAShr(builder, sign,
1841                               lp_build_const_int_vec(bld->gallivm, type,
1842                                                      type.width - 1),
1843                               "ifloor.sign");
1844
1845          /* offset = -0.99999(9)f */
1846          offset = lp_build_const_vec(bld->gallivm, type,
1847                                      -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
1848          offset = LLVMConstBitCast(offset, int_vec_type);
1849
1850          /* offset = a < 0 ? offset : 0.0f */
1851          offset = LLVMBuildAnd(builder, offset, sign, "");
1852          offset = LLVMBuildBitCast(builder, offset, vec_type, "ifloor.offset");
1853
1854          res = LLVMBuildFAdd(builder, res, offset, "ifloor.res");
1855       }
1856    }
1857
1858    /* round to nearest (toward zero) */
1859    res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
1860
1861    return res;
1862 }
1863
1864
1865 /**
1866  * Return ceiling of float (vector), returning int (vector).
1867  * Ex: iceil( 1.1) = 2
1868  * Ex: iceil(-1.1) = -1
1869  */
1870 LLVMValueRef
1871 lp_build_iceil(struct lp_build_context *bld,
1872                LLVMValueRef a)
1873 {
1874    LLVMBuilderRef builder = bld->gallivm->builder;
1875    const struct lp_type type = bld->type;
1876    LLVMTypeRef int_vec_type = bld->int_vec_type;
1877    LLVMValueRef res;
1878
1879    assert(type.floating);
1880    assert(lp_check_value(type, a));
1881
1882    if (arch_rounding_available(type)) {
1883       res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
1884    }
1885    else {
1886       LLVMTypeRef vec_type = bld->vec_type;
1887       unsigned mantissa = lp_mantissa(type);
1888       LLVMValueRef offset;
1889
1890       /* offset = 0.99999(9)f */
1891       offset = lp_build_const_vec(bld->gallivm, type,
1892                                   (double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
1893
1894       if (type.sign) {
1895          LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1896                                 (unsigned long long)1 << (type.width - 1));
1897          LLVMValueRef sign;
1898
1899          /* sign = a < 0 ? 0 : ~0 */
1900          sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
1901          sign = LLVMBuildAnd(builder, sign, mask, "");
1902          sign = LLVMBuildAShr(builder, sign,
1903                               lp_build_const_int_vec(bld->gallivm, type,
1904                                                      type.width - 1),
1905                               "iceil.sign");
1906          sign = LLVMBuildNot(builder, sign, "iceil.not");
1907
1908          /* offset = a < 0 ? 0.0 : offset */
1909          offset = LLVMConstBitCast(offset, int_vec_type);
1910          offset = LLVMBuildAnd(builder, offset, sign, "");
1911          offset = LLVMBuildBitCast(builder, offset, vec_type, "iceil.offset");
1912       }
1913
1914       res = LLVMBuildFAdd(builder, a, offset, "iceil.res");
1915    }
1916
1917    /* round to nearest (toward zero) */
1918    res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
1919
1920    return res;
1921 }
1922
1923
1924 /**
1925  * Combined ifloor() & fract().
1926  *
1927  * Preferred to calling the functions separately, as it will ensure that the
1928  * strategy (floor() vs ifloor()) that results in less redundant work is used.
1929  */
1930 void
1931 lp_build_ifloor_fract(struct lp_build_context *bld,
1932                       LLVMValueRef a,
1933                       LLVMValueRef *out_ipart,
1934                       LLVMValueRef *out_fpart)
1935 {
1936    LLVMBuilderRef builder = bld->gallivm->builder;
1937    const struct lp_type type = bld->type;
1938    LLVMValueRef ipart;
1939
1940    assert(type.floating);
1941    assert(lp_check_value(type, a));
1942
1943    if (arch_rounding_available(type)) {
1944       /*
1945        * floor() is easier.
1946        */
1947
1948       ipart = lp_build_floor(bld, a);
1949       *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
1950       *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
1951    }
1952    else {
1953       /*
1954        * ifloor() is easier.
1955        */
1956
1957       *out_ipart = lp_build_ifloor(bld, a);
1958       ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
1959       *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
1960    }
1961 }
1962
1963
1964 /**
1965  * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
1966  * always smaller than one.
1967  */
1968 void
1969 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
1970                            LLVMValueRef a,
1971                            LLVMValueRef *out_ipart,
1972                            LLVMValueRef *out_fpart)
1973 {
1974    lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
1975    *out_fpart = clamp_fract(bld, *out_fpart);
1976 }
1977
1978
1979 LLVMValueRef
1980 lp_build_sqrt(struct lp_build_context *bld,
1981               LLVMValueRef a)
1982 {
1983    LLVMBuilderRef builder = bld->gallivm->builder;
1984    const struct lp_type type = bld->type;
1985    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1986    char intrinsic[32];
1987
1988    assert(lp_check_value(type, a));
1989
1990    /* TODO: optimize the constant case */
1991
1992    assert(type.floating);
1993    if (type.length == 1) {
1994       util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.f%u", type.width);
1995    }
1996    else {
1997       util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
1998    }
1999
2000    return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2001 }
2002
2003
2004 /**
2005  * Do one Newton-Raphson step to improve reciprocate precision:
2006  *
2007  *   x_{i+1} = x_i * (2 - a * x_i)
2008  *
2009  * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2010  * +/-Inf, giving NaN instead.  Certain applications rely on this behavior,
2011  * such as Google Earth, which does RCP(RSQRT(0.0) when drawing the Earth's
2012  * halo. It would be necessary to clamp the argument to prevent this.
2013  *
2014  * See also:
2015  * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2016  * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2017  */
2018 static INLINE LLVMValueRef
2019 lp_build_rcp_refine(struct lp_build_context *bld,
2020                     LLVMValueRef a,
2021                     LLVMValueRef rcp_a)
2022 {
2023    LLVMBuilderRef builder = bld->gallivm->builder;
2024    LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
2025    LLVMValueRef res;
2026
2027    res = LLVMBuildFMul(builder, a, rcp_a, "");
2028    res = LLVMBuildFSub(builder, two, res, "");
2029    res = LLVMBuildFMul(builder, rcp_a, res, "");
2030
2031    return res;
2032 }
2033
2034
2035 LLVMValueRef
2036 lp_build_rcp(struct lp_build_context *bld,
2037              LLVMValueRef a)
2038 {
2039    LLVMBuilderRef builder = bld->gallivm->builder;
2040    const struct lp_type type = bld->type;
2041
2042    assert(lp_check_value(type, a));
2043
2044    if(a == bld->zero)
2045       return bld->undef;
2046    if(a == bld->one)
2047       return bld->one;
2048    if(a == bld->undef)
2049       return bld->undef;
2050
2051    assert(type.floating);
2052
2053    if(LLVMIsConstant(a))
2054       return LLVMConstFDiv(bld->one, a);
2055
2056    /*
2057     * We don't use RCPPS because:
2058     * - it only has 10bits of precision
2059     * - it doesn't even get the reciprocate of 1.0 exactly
2060     * - doing Newton-Rapshon steps yields wrong (NaN) values for 0.0 or Inf
2061     * - for recent processors the benefit over DIVPS is marginal, a case
2062     *   dependent
2063     *
2064     * We could still use it on certain processors if benchmarks show that the
2065     * RCPPS plus necessary workarounds are still preferrable to DIVPS; or for
2066     * particular uses that require less workarounds.
2067     */
2068
2069    if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2070          (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2071       const unsigned num_iterations = 0;
2072       LLVMValueRef res;
2073       unsigned i;
2074       const char *intrinsic = NULL;
2075
2076       if (type.length == 4) {
2077          intrinsic = "llvm.x86.sse.rcp.ps";
2078       }
2079       else {
2080          intrinsic = "llvm.x86.avx.rcp.ps.256";
2081       }
2082
2083       res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2084
2085       for (i = 0; i < num_iterations; ++i) {
2086          res = lp_build_rcp_refine(bld, a, res);
2087       }
2088
2089       return res;
2090    }
2091
2092    return LLVMBuildFDiv(builder, bld->one, a, "");
2093 }
2094
2095
2096 /**
2097  * Do one Newton-Raphson step to improve rsqrt precision:
2098  *
2099  *   x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2100  *
2101  * See also Intel 64 and IA-32 Architectures Optimization Manual.
2102  */
2103 static INLINE LLVMValueRef
2104 lp_build_rsqrt_refine(struct lp_build_context *bld,
2105                       LLVMValueRef a,
2106                       LLVMValueRef rsqrt_a)
2107 {
2108    LLVMBuilderRef builder = bld->gallivm->builder;
2109    LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2110    LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2111    LLVMValueRef res;
2112
2113    res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2114    res = LLVMBuildFMul(builder, a, res, "");
2115    res = LLVMBuildFSub(builder, three, res, "");
2116    res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2117    res = LLVMBuildFMul(builder, half, res, "");
2118
2119    return res;
2120 }
2121
2122
2123 /**
2124  * Generate 1/sqrt(a).
2125  * Result is undefined for values < 0, infinity for +0.
2126  */
2127 LLVMValueRef
2128 lp_build_rsqrt(struct lp_build_context *bld,
2129                LLVMValueRef a)
2130 {
2131    LLVMBuilderRef builder = bld->gallivm->builder;
2132    const struct lp_type type = bld->type;
2133
2134    assert(lp_check_value(type, a));
2135
2136    assert(type.floating);
2137
2138    /*
2139     * This should be faster but all denormals will end up as infinity.
2140     */
2141    if (0 && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2142         (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))) {
2143       const unsigned num_iterations = 1;
2144       LLVMValueRef res;
2145       unsigned i;
2146       const char *intrinsic = NULL;
2147
2148       if (type.length == 4) {
2149          intrinsic = "llvm.x86.sse.rsqrt.ps";
2150       }
2151       else {
2152          intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2153       }
2154       if (num_iterations) {
2155          /*
2156           * Newton-Raphson will result in NaN instead of infinity for zero,
2157           * and NaN instead of zero for infinity.
2158           * Also, need to ensure rsqrt(1.0) == 1.0.
2159           * All numbers smaller than FLT_MIN will result in +infinity
2160           * (rsqrtps treats all denormals as zero).
2161           */
2162          /*
2163           * Certain non-c99 compilers don't know INFINITY and might not support
2164           * hacks to evaluate it at compile time neither.
2165           */
2166          const unsigned posinf_int = 0x7F800000;
2167          LLVMValueRef cmp;
2168          LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2169          LLVMValueRef inf = lp_build_const_int_vec(bld->gallivm, type, posinf_int);
2170
2171          inf = LLVMBuildBitCast(builder, inf, lp_build_vec_type(bld->gallivm, type), "");
2172
2173          res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2174
2175          for (i = 0; i < num_iterations; ++i) {
2176             res = lp_build_rsqrt_refine(bld, a, res);
2177          }
2178          cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2179          res = lp_build_select(bld, cmp, inf, res);
2180          cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2181          res = lp_build_select(bld, cmp, bld->zero, res);
2182          cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2183          res = lp_build_select(bld, cmp, bld->one, res);
2184       }
2185       else {
2186          /* rsqrt(1.0) != 1.0 here */
2187          res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2188
2189       }
2190
2191       return res;
2192    }
2193
2194    return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2195 }
2196
2197
2198 /**
2199  * Generate sin(a) using SSE2
2200  */
2201 LLVMValueRef
2202 lp_build_sin(struct lp_build_context *bld,
2203              LLVMValueRef a)
2204 {
2205    struct gallivm_state *gallivm = bld->gallivm;
2206    LLVMBuilderRef builder = gallivm->builder;
2207    struct lp_type int_type = lp_int_type(bld->type);
2208    LLVMBuilderRef b = builder;
2209
2210    /*
2211     *  take the absolute value,
2212     *  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2213     */
2214
2215    LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2216    LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2217
2218    LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2219    LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2220
2221    /*
2222     * extract the sign bit (upper one)
2223     * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
2224     */
2225    LLVMValueRef sig_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2226    LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i");
2227
2228    /*
2229     * scale by 4/Pi
2230     * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2231     */
2232
2233    LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2234    LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2235
2236    /*
2237     * store the integer part of y in mm0
2238     * emm2 = _mm_cvttps_epi32(y);
2239     */
2240
2241    LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2242
2243    /*
2244     * j=(j+1) & (~1) (see the cephes sources)
2245     * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2246     */
2247
2248    LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2249    LLVMValueRef emm2_add =  LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2250    /*
2251     * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2252     */
2253    LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2254    LLVMValueRef emm2_and =  LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2255
2256    /*
2257     * y = _mm_cvtepi32_ps(emm2);
2258     */
2259    LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2260
2261    /* get the swap sign flag
2262     * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
2263     */
2264    LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2265    LLVMValueRef emm0_and =  LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and");
2266
2267    /*
2268     * emm2 = _mm_slli_epi32(emm0, 29);
2269     */
2270    LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2271    LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit");
2272
2273    /*
2274     * get the polynom selection mask
2275     * there is one polynom for 0 <= x <= Pi/4
2276     * and another one for Pi/4<x<=Pi/2
2277     * Both branches will be computed.
2278     *
2279     * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2280     * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2281     */
2282
2283    LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2284    LLVMValueRef emm2_3 =  LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3");
2285    LLVMValueRef poly_mask = lp_build_compare(gallivm,
2286                                              int_type, PIPE_FUNC_EQUAL,
2287                                              emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2288    /*
2289     *   sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
2290     */
2291    LLVMValueRef sign_bit_1 =  LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit");
2292
2293    /*
2294     * _PS_CONST(minus_cephes_DP1, -0.78515625);
2295     * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2296     * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2297     */
2298    LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2299    LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2300    LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2301
2302    /*
2303     * The magic pass: "Extended precision modular arithmetic"
2304     * x = ((x - y * DP1) - y * DP2) - y * DP3;
2305     * xmm1 = _mm_mul_ps(y, xmm1);
2306     * xmm2 = _mm_mul_ps(y, xmm2);
2307     * xmm3 = _mm_mul_ps(y, xmm3);
2308     */
2309    LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
2310    LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
2311    LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
2312
2313    /*
2314     * x = _mm_add_ps(x, xmm1);
2315     * x = _mm_add_ps(x, xmm2);
2316     * x = _mm_add_ps(x, xmm3);
2317     */
2318
2319    LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
2320    LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
2321    LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
2322
2323    /*
2324     * Evaluate the first polynom  (0 <= x <= Pi/4)
2325     *
2326     * z = _mm_mul_ps(x,x);
2327     */
2328    LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2329
2330    /*
2331     * _PS_CONST(coscof_p0,  2.443315711809948E-005);
2332     * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2333     * _PS_CONST(coscof_p2,  4.166664568298827E-002);
2334     */
2335    LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2336    LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2337    LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2338
2339    /*
2340     * y = *(v4sf*)_ps_coscof_p0;
2341     * y = _mm_mul_ps(y, z);
2342     */
2343    LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
2344    LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
2345    LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
2346    LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
2347    LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2348    LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2349
2350
2351    /*
2352     * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2353     * y = _mm_sub_ps(y, tmp);
2354     * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2355     */
2356    LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2357    LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2358    LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2359    LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2360    LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2361
2362    /*
2363     * _PS_CONST(sincof_p0, -1.9515295891E-4);
2364     * _PS_CONST(sincof_p1,  8.3321608736E-3);
2365     * _PS_CONST(sincof_p2, -1.6666654611E-1);
2366     */
2367    LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2368    LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2369    LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2370
2371    /*
2372     * Evaluate the second polynom  (Pi/4 <= x <= 0)
2373     *
2374     * y2 = *(v4sf*)_ps_sincof_p0;
2375     * y2 = _mm_mul_ps(y2, z);
2376     * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2377     * y2 = _mm_mul_ps(y2, z);
2378     * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2379     * y2 = _mm_mul_ps(y2, z);
2380     * y2 = _mm_mul_ps(y2, x);
2381     * y2 = _mm_add_ps(y2, x);
2382     */
2383
2384    LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
2385    LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
2386    LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
2387    LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
2388    LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2389    LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
2390    LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
2391
2392    /*
2393     * select the correct result from the two polynoms
2394     * xmm3 = poly_mask;
2395     * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2396     * y = _mm_andnot_ps(xmm3, y);
2397     * y = _mm_add_ps(y,y2);
2398     */
2399    LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2400    LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2401    LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2402    LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0);
2403    LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
2404    LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2405    LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
2406
2407    /*
2408     * update the sign
2409     * y = _mm_xor_ps(y, sign_bit);
2410     */
2411    LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin");
2412    LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
2413    return y_result;
2414 }
2415
2416
2417 /**
2418  * Generate cos(a) using SSE2
2419  */
2420 LLVMValueRef
2421 lp_build_cos(struct lp_build_context *bld,
2422              LLVMValueRef a)
2423 {
2424    struct gallivm_state *gallivm = bld->gallivm;
2425    LLVMBuilderRef builder = gallivm->builder;
2426    struct lp_type int_type = lp_int_type(bld->type);
2427    LLVMBuilderRef b = builder;
2428
2429    /*
2430     *  take the absolute value,
2431     *  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2432     */
2433
2434    LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2435    LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2436
2437    LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2438    LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2439
2440    /*
2441     * scale by 4/Pi
2442     * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2443     */
2444
2445    LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2446    LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2447
2448    /*
2449     * store the integer part of y in mm0
2450     * emm2 = _mm_cvttps_epi32(y);
2451     */
2452
2453    LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2454
2455    /*
2456     * j=(j+1) & (~1) (see the cephes sources)
2457     * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2458     */
2459
2460    LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2461    LLVMValueRef emm2_add =  LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2462    /*
2463     * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2464     */
2465    LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2466    LLVMValueRef emm2_and =  LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2467
2468    /*
2469     * y = _mm_cvtepi32_ps(emm2);
2470     */
2471    LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2472
2473
2474    /*
2475     * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
2476     */
2477    LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2478    LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2");
2479
2480
2481    /* get the swap sign flag
2482     * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
2483     */
2484    LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0);
2485    LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not");
2486    LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2487    LLVMValueRef emm0_and =  LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and");
2488
2489    /*
2490     * emm2 = _mm_slli_epi32(emm0, 29);
2491     */
2492    LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2493    LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit");
2494
2495    /*
2496     * get the polynom selection mask
2497     * there is one polynom for 0 <= x <= Pi/4
2498     * and another one for Pi/4<x<=Pi/2
2499     * Both branches will be computed.
2500     *
2501     * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2502     * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2503     */
2504
2505    LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2506    LLVMValueRef emm2_3 =  LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3");
2507    LLVMValueRef poly_mask = lp_build_compare(gallivm,
2508                                              int_type, PIPE_FUNC_EQUAL,
2509                                              emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2510
2511    /*
2512     * _PS_CONST(minus_cephes_DP1, -0.78515625);
2513     * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2514     * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2515     */
2516    LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2517    LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2518    LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2519
2520    /*
2521     * The magic pass: "Extended precision modular arithmetic"
2522     * x = ((x - y * DP1) - y * DP2) - y * DP3;
2523     * xmm1 = _mm_mul_ps(y, xmm1);
2524     * xmm2 = _mm_mul_ps(y, xmm2);
2525     * xmm3 = _mm_mul_ps(y, xmm3);
2526     */
2527    LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
2528    LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
2529    LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
2530
2531    /*
2532     * x = _mm_add_ps(x, xmm1);
2533     * x = _mm_add_ps(x, xmm2);
2534     * x = _mm_add_ps(x, xmm3);
2535     */
2536
2537    LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
2538    LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
2539    LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
2540
2541    /*
2542     * Evaluate the first polynom  (0 <= x <= Pi/4)
2543     *
2544     * z = _mm_mul_ps(x,x);
2545     */
2546    LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2547
2548    /*
2549     * _PS_CONST(coscof_p0,  2.443315711809948E-005);
2550     * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2551     * _PS_CONST(coscof_p2,  4.166664568298827E-002);
2552     */
2553    LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2554    LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2555    LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2556
2557    /*
2558     * y = *(v4sf*)_ps_coscof_p0;
2559     * y = _mm_mul_ps(y, z);
2560     */
2561    LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
2562    LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
2563    LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
2564    LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
2565    LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2566    LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2567
2568
2569    /*
2570     * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2571     * y = _mm_sub_ps(y, tmp);
2572     * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2573     */
2574    LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2575    LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2576    LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2577    LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2578    LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2579
2580    /*
2581     * _PS_CONST(sincof_p0, -1.9515295891E-4);
2582     * _PS_CONST(sincof_p1,  8.3321608736E-3);
2583     * _PS_CONST(sincof_p2, -1.6666654611E-1);
2584     */
2585    LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2586    LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2587    LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2588
2589    /*
2590     * Evaluate the second polynom  (Pi/4 <= x <= 0)
2591     *
2592     * y2 = *(v4sf*)_ps_sincof_p0;
2593     * y2 = _mm_mul_ps(y2, z);
2594     * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2595     * y2 = _mm_mul_ps(y2, z);
2596     * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2597     * y2 = _mm_mul_ps(y2, z);
2598     * y2 = _mm_mul_ps(y2, x);
2599     * y2 = _mm_add_ps(y2, x);
2600     */
2601
2602    LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
2603    LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
2604    LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
2605    LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
2606    LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2607    LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
2608    LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
2609
2610    /*
2611     * select the correct result from the two polynoms
2612     * xmm3 = poly_mask;
2613     * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2614     * y = _mm_andnot_ps(xmm3, y);
2615     * y = _mm_add_ps(y,y2);
2616     */
2617    LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2618    LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2619    LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2620    LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
2621    LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2622    LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
2623
2624    /*
2625     * update the sign
2626     * y = _mm_xor_ps(y, sign_bit);
2627     */
2628    LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sin");
2629    LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
2630    return y_result;
2631 }
2632
2633
2634 /**
2635  * Generate pow(x, y)
2636  */
2637 LLVMValueRef
2638 lp_build_pow(struct lp_build_context *bld,
2639              LLVMValueRef x,
2640              LLVMValueRef y)
2641 {
2642    /* TODO: optimize the constant case */
2643    if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2644        LLVMIsConstant(x) && LLVMIsConstant(y)) {
2645       debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2646                    __FUNCTION__);
2647    }
2648
2649    return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
2650 }
2651
2652
2653 /**
2654  * Generate exp(x)
2655  */
2656 LLVMValueRef
2657 lp_build_exp(struct lp_build_context *bld,
2658              LLVMValueRef x)
2659 {
2660    /* log2(e) = 1/log(2) */
2661    LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
2662                                            1.4426950408889634);
2663
2664    assert(lp_check_value(bld->type, x));
2665
2666    return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
2667 }
2668
2669
2670 /**
2671  * Generate log(x)
2672  */
2673 LLVMValueRef
2674 lp_build_log(struct lp_build_context *bld,
2675              LLVMValueRef x)
2676 {
2677    /* log(2) */
2678    LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2679                                           0.69314718055994529);
2680
2681    assert(lp_check_value(bld->type, x));
2682
2683    return lp_build_mul(bld, log2, lp_build_log2(bld, x));
2684 }
2685
2686
2687 /**
2688  * Generate polynomial.
2689  * Ex:  coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
2690  */
2691 static LLVMValueRef
2692 lp_build_polynomial(struct lp_build_context *bld,
2693                     LLVMValueRef x,
2694                     const double *coeffs,
2695                     unsigned num_coeffs)
2696 {
2697    const struct lp_type type = bld->type;
2698    LLVMValueRef even = NULL, odd = NULL;
2699    LLVMValueRef x2;
2700    unsigned i;
2701
2702    assert(lp_check_value(bld->type, x));
2703
2704    /* TODO: optimize the constant case */
2705    if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2706        LLVMIsConstant(x)) {
2707       debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2708                    __FUNCTION__);
2709    }
2710
2711    /*
2712     * Calculate odd and even terms seperately to decrease data dependency
2713     * Ex:
2714     *     c[0] + x^2 * c[2] + x^4 * c[4] ...
2715     *     + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
2716     */
2717    x2 = lp_build_mul(bld, x, x);
2718
2719    for (i = num_coeffs; i--; ) {
2720       LLVMValueRef coeff;
2721
2722       coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
2723
2724       if (i % 2 == 0) {
2725          if (even)
2726             even = lp_build_add(bld, coeff, lp_build_mul(bld, x2, even));
2727          else
2728             even = coeff;
2729       } else {
2730          if (odd)
2731             odd = lp_build_add(bld, coeff, lp_build_mul(bld, x2, odd));
2732          else
2733             odd = coeff;
2734       }
2735    }
2736
2737    if (odd)
2738       return lp_build_add(bld, lp_build_mul(bld, odd, x), even);
2739    else if (even)
2740       return even;
2741    else
2742       return bld->undef;
2743 }
2744
2745
2746 /**
2747  * Minimax polynomial fit of 2**x, in range [0, 1[
2748  */
2749 const double lp_build_exp2_polynomial[] = {
2750 #if EXP_POLY_DEGREE == 5
2751    0.999999925063526176901,
2752    0.693153073200168932794,
2753    0.240153617044375388211,
2754    0.0558263180532956664775,
2755    0.00898934009049466391101,
2756    0.00187757667519147912699
2757 #elif EXP_POLY_DEGREE == 4
2758    1.00000259337069434683,
2759    0.693003834469974940458,
2760    0.24144275689150793076,
2761    0.0520114606103070150235,
2762    0.0135341679161270268764
2763 #elif EXP_POLY_DEGREE == 3
2764    0.999925218562710312959,
2765    0.695833540494823811697,
2766    0.226067155427249155588,
2767    0.0780245226406372992967
2768 #elif EXP_POLY_DEGREE == 2
2769    1.00172476321474503578,
2770    0.657636275736077639316,
2771    0.33718943461968720704
2772 #else
2773 #error
2774 #endif
2775 };
2776
2777
2778 void
2779 lp_build_exp2_approx(struct lp_build_context *bld,
2780                      LLVMValueRef x,
2781                      LLVMValueRef *p_exp2_int_part,
2782                      LLVMValueRef *p_frac_part,
2783                      LLVMValueRef *p_exp2)
2784 {
2785    LLVMBuilderRef builder = bld->gallivm->builder;
2786    const struct lp_type type = bld->type;
2787    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2788    LLVMValueRef ipart = NULL;
2789    LLVMValueRef fpart = NULL;
2790    LLVMValueRef expipart = NULL;
2791    LLVMValueRef expfpart = NULL;
2792    LLVMValueRef res = NULL;
2793
2794    assert(lp_check_value(bld->type, x));
2795
2796    if(p_exp2_int_part || p_frac_part || p_exp2) {
2797       /* TODO: optimize the constant case */
2798       if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2799           LLVMIsConstant(x)) {
2800          debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2801                       __FUNCTION__);
2802       }
2803
2804       assert(type.floating && type.width == 32);
2805
2806       x = lp_build_min(bld, x, lp_build_const_vec(bld->gallivm, type,  129.0));
2807       x = lp_build_max(bld, x, lp_build_const_vec(bld->gallivm, type, -126.99999));
2808
2809       /* ipart = floor(x) */
2810       /* fpart = x - ipart */
2811       lp_build_ifloor_fract(bld, x, &ipart, &fpart);
2812    }
2813
2814    if(p_exp2_int_part || p_exp2) {
2815       /* expipart = (float) (1 << ipart) */
2816       expipart = LLVMBuildAdd(builder, ipart,
2817                               lp_build_const_int_vec(bld->gallivm, type, 127), "");
2818       expipart = LLVMBuildShl(builder, expipart,
2819                               lp_build_const_int_vec(bld->gallivm, type, 23), "");
2820       expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
2821    }
2822
2823    if(p_exp2) {
2824       expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
2825                                      Elements(lp_build_exp2_polynomial));
2826
2827       res = LLVMBuildFMul(builder, expipart, expfpart, "");
2828    }
2829
2830    if(p_exp2_int_part)
2831       *p_exp2_int_part = expipart;
2832
2833    if(p_frac_part)
2834       *p_frac_part = fpart;
2835
2836    if(p_exp2)
2837       *p_exp2 = res;
2838 }
2839
2840
2841 LLVMValueRef
2842 lp_build_exp2(struct lp_build_context *bld,
2843               LLVMValueRef x)
2844 {
2845    LLVMValueRef res;
2846    lp_build_exp2_approx(bld, x, NULL, NULL, &res);
2847    return res;
2848 }
2849
2850
2851 /**
2852  * Extract the exponent of a IEEE-754 floating point value.
2853  *
2854  * Optionally apply an integer bias.
2855  *
2856  * Result is an integer value with
2857  *
2858  *   ifloor(log2(x)) + bias
2859  */
2860 LLVMValueRef
2861 lp_build_extract_exponent(struct lp_build_context *bld,
2862                           LLVMValueRef x,
2863                           int bias)
2864 {
2865    LLVMBuilderRef builder = bld->gallivm->builder;
2866    const struct lp_type type = bld->type;
2867    unsigned mantissa = lp_mantissa(type);
2868    LLVMValueRef res;
2869
2870    assert(type.floating);
2871
2872    assert(lp_check_value(bld->type, x));
2873
2874    x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
2875
2876    res = LLVMBuildLShr(builder, x,
2877                        lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
2878    res = LLVMBuildAnd(builder, res,
2879                       lp_build_const_int_vec(bld->gallivm, type, 255), "");
2880    res = LLVMBuildSub(builder, res,
2881                       lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
2882
2883    return res;
2884 }
2885
2886
2887 /**
2888  * Extract the mantissa of the a floating.
2889  *
2890  * Result is a floating point value with
2891  *
2892  *   x / floor(log2(x))
2893  */
2894 LLVMValueRef
2895 lp_build_extract_mantissa(struct lp_build_context *bld,
2896                           LLVMValueRef x)
2897 {
2898    LLVMBuilderRef builder = bld->gallivm->builder;
2899    const struct lp_type type = bld->type;
2900    unsigned mantissa = lp_mantissa(type);
2901    LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
2902                                                   (1ULL << mantissa) - 1);
2903    LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
2904    LLVMValueRef res;
2905
2906    assert(lp_check_value(bld->type, x));
2907
2908    assert(type.floating);
2909
2910    x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
2911
2912    /* res = x / 2**ipart */
2913    res = LLVMBuildAnd(builder, x, mantmask, "");
2914    res = LLVMBuildOr(builder, res, one, "");
2915    res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
2916
2917    return res;
2918 }
2919
2920
2921
2922 /**
2923  * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x) ,for x in range of [0, 1/9[
2924  * These coefficients can be generate with
2925  * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
2926  */
2927 const double lp_build_log2_polynomial[] = {
2928 #if LOG_POLY_DEGREE == 5
2929    2.88539008148777786488L,
2930    0.961796878841293367824L,
2931    0.577058946784739859012L,
2932    0.412914355135828735411L,
2933    0.308591899232910175289L,
2934    0.352376952300281371868L,
2935 #elif LOG_POLY_DEGREE == 4
2936    2.88539009343309178325L,
2937    0.961791550404184197881L,
2938    0.577440339438736392009L,
2939    0.403343858251329912514L,
2940    0.406718052498846252698L,
2941 #elif LOG_POLY_DEGREE == 3
2942    2.88538959748872753838L,
2943    0.961932915889597772928L,
2944    0.571118517972136195241L,
2945    0.493997535084709500285L,
2946 #else
2947 #error
2948 #endif
2949 };
2950
2951 /**
2952  * See http://www.devmaster.net/forums/showthread.php?p=43580
2953  * http://en.wikipedia.org/wiki/Logarithm#Calculation
2954  * http://www.nezumi.demon.co.uk/consult/logx.htm
2955  */
2956 void
2957 lp_build_log2_approx(struct lp_build_context *bld,
2958                      LLVMValueRef x,
2959                      LLVMValueRef *p_exp,
2960                      LLVMValueRef *p_floor_log2,
2961                      LLVMValueRef *p_log2)
2962 {
2963    LLVMBuilderRef builder = bld->gallivm->builder;
2964    const struct lp_type type = bld->type;
2965    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2966    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2967
2968    LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
2969    LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
2970    LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
2971
2972    LLVMValueRef i = NULL;
2973    LLVMValueRef y = NULL;
2974    LLVMValueRef z = NULL;
2975    LLVMValueRef exp = NULL;
2976    LLVMValueRef mant = NULL;
2977    LLVMValueRef logexp = NULL;
2978    LLVMValueRef logmant = NULL;
2979    LLVMValueRef res = NULL;
2980
2981    assert(lp_check_value(bld->type, x));
2982
2983    if(p_exp || p_floor_log2 || p_log2) {
2984       /* TODO: optimize the constant case */
2985       if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2986           LLVMIsConstant(x)) {
2987          debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2988                       __FUNCTION__);
2989       }
2990
2991       assert(type.floating && type.width == 32);
2992
2993       /*
2994        * We don't explicitly handle denormalized numbers. They will yield a
2995        * result in the neighbourhood of -127, which appears to be adequate
2996        * enough.
2997        */
2998
2999       i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3000
3001       /* exp = (float) exponent(x) */
3002       exp = LLVMBuildAnd(builder, i, expmask, "");
3003    }
3004
3005    if(p_floor_log2 || p_log2) {
3006       logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3007       logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3008       logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3009    }
3010
3011    if(p_log2) {
3012       /* mant = 1 + (float) mantissa(x) */
3013       mant = LLVMBuildAnd(builder, i, mantmask, "");
3014       mant = LLVMBuildOr(builder, mant, one, "");
3015       mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3016
3017       /* y = (mant - 1) / (mant + 1) */
3018       y = lp_build_div(bld,
3019          lp_build_sub(bld, mant, bld->one),
3020          lp_build_add(bld, mant, bld->one)
3021       );
3022
3023       /* z = y^2 */
3024       z = lp_build_mul(bld, y, y);
3025
3026       /* compute P(z) */
3027       logmant = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3028                                     Elements(lp_build_log2_polynomial));
3029
3030       /* logmant = y * P(z) */
3031       logmant = lp_build_mul(bld, y, logmant);
3032
3033       res = lp_build_add(bld, logmant, logexp);
3034    }
3035
3036    if(p_exp) {
3037       exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3038       *p_exp = exp;
3039    }
3040
3041    if(p_floor_log2)
3042       *p_floor_log2 = logexp;
3043
3044    if(p_log2)
3045       *p_log2 = res;
3046 }
3047
3048
3049 LLVMValueRef
3050 lp_build_log2(struct lp_build_context *bld,
3051               LLVMValueRef x)
3052 {
3053    LLVMValueRef res;
3054    lp_build_log2_approx(bld, x, NULL, NULL, &res);
3055    return res;
3056 }
3057
3058
3059 /**
3060  * Faster (and less accurate) log2.
3061  *
3062  *    log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3063  *
3064  * Piece-wise linear approximation, with exact results when x is a
3065  * power of two.
3066  *
3067  * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3068  */
3069 LLVMValueRef
3070 lp_build_fast_log2(struct lp_build_context *bld,
3071                    LLVMValueRef x)
3072 {
3073    LLVMBuilderRef builder = bld->gallivm->builder;
3074    LLVMValueRef ipart;
3075    LLVMValueRef fpart;
3076
3077    assert(lp_check_value(bld->type, x));
3078
3079    assert(bld->type.floating);
3080
3081    /* ipart = floor(log2(x)) - 1 */
3082    ipart = lp_build_extract_exponent(bld, x, -1);
3083    ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3084
3085    /* fpart = x / 2**ipart */
3086    fpart = lp_build_extract_mantissa(bld, x);
3087
3088    /* ipart + fpart */
3089    return LLVMBuildFAdd(builder, ipart, fpart, "");
3090 }
3091
3092
3093 /**
3094  * Fast implementation of iround(log2(x)).
3095  *
3096  * Not an approximation -- it should give accurate results all the time.
3097  */
3098 LLVMValueRef
3099 lp_build_ilog2(struct lp_build_context *bld,
3100                LLVMValueRef x)
3101 {
3102    LLVMBuilderRef builder = bld->gallivm->builder;
3103    LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3104    LLVMValueRef ipart;
3105
3106    assert(bld->type.floating);
3107
3108    assert(lp_check_value(bld->type, x));
3109
3110    /* x * 2^(0.5)   i.e., add 0.5 to the log2(x) */
3111    x = LLVMBuildFMul(builder, x, sqrt2, "");
3112
3113    /* ipart = floor(log2(x) + 0.5)  */
3114    ipart = lp_build_extract_exponent(bld, x, 0);
3115
3116    return ipart;
3117 }
3118
3119 LLVMValueRef
3120 lp_build_mod(struct lp_build_context *bld,
3121              LLVMValueRef x,
3122              LLVMValueRef y)
3123 {
3124    LLVMBuilderRef builder = bld->gallivm->builder;
3125    LLVMValueRef res;
3126    const struct lp_type type = bld->type;
3127
3128    assert(lp_check_value(type, x));
3129    assert(lp_check_value(type, y));
3130
3131    if (type.floating)
3132       res = LLVMBuildFRem(builder, x, y, "");
3133    else if (type.sign)
3134       res = LLVMBuildSRem(builder, x, y, "");
3135    else
3136       res = LLVMBuildURem(builder, x, y, "");
3137    return res;
3138 }