src/gallium/auxiliary/gallivm/lp_bld_arit.c

   1 /**************************************************************************
   2  *
   3  * Copyright 2009-2010 VMware, Inc.
   4  * All Rights Reserved.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the
   8  * "Software"), to deal in the Software without restriction, including
   9  * without limitation the rights to use, copy, modify, merge, publish,
  10  * distribute, sub license, and/or sell copies of the Software, and to
  11  * permit persons to whom the Software is furnished to do so, subject to
  12  * the following conditions:
  13  *
  14  * The above copyright notice and this permission notice (including the
  15  * next paragraph) shall be included in all copies or substantial portions
  16  * of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  21  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
  22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25  *
  26  **************************************************************************/
  27
  28
  29 /**
  30  * @file
  31  * Helper
  32  *
 * LLVM IR doesn't support all basic arithmetic operations we care about (most
 * notably min/max and saturated operations), and it is often necessary to
 * resort to machine-specific intrinsics directly. The functions here hide all
 * these implementation details from the other modules.
 *
 * We also do simple expression simplification here. Reasons are:
  39  * - it is very easy given we have all necessary information readily available
  40  * - LLVM optimization passes fail to simplify several vector expressions
  41  * - We often know value constraints which the optimization passes have no way
  42  *   of knowing, such as when source arguments are known to be in [0, 1] range.
  43  *
  44  * @author Jose Fonseca <jfonseca@vmware.com>
  45  */
  46
  47
  48 #include <float.h>
  49
  50 #include "util/u_memory.h"
  51 #include "util/u_debug.h"
  52 #include "util/u_math.h"
  53 #include "util/u_string.h"
  54 #include "util/u_cpu_detect.h"
  55
  56 #include "lp_bld_type.h"
  57 #include "lp_bld_const.h"
  58 #include "lp_bld_init.h"
  59 #include "lp_bld_intr.h"
  60 #include "lp_bld_logic.h"
  61 #include "lp_bld_pack.h"
  62 #include "lp_bld_debug.h"
  63 #include "lp_bld_bitarit.h"
  64 #include "lp_bld_arit.h"
  65
  66
  67 #define EXP_POLY_DEGREE 5
  68
  69 #define LOG_POLY_DEGREE 4
  70
  71
  72 /**
  73  * Generate min(a, b)
  74  * No checks for special case values of a or b = 1 or 0 are done.
  75  */
  76 static LLVMValueRef
  77 lp_build_min_simple(struct lp_build_context *bld,
  78                     LLVMValueRef a,
  79                     LLVMValueRef b)
  80 {
  81    const struct lp_type type = bld->type;
  82    const char *intrinsic = NULL;
  83    unsigned intr_size = 0;
  84    LLVMValueRef cond;
  85
  86    assert(lp_check_value(type, a));
  87    assert(lp_check_value(type, b));
  88
  89    /* TODO: optimize the constant case */
  90
  91    if (type.floating && util_cpu_caps.has_sse) {
  92       if (type.width == 32) {
  93          if (type.length == 1) {
  94             intrinsic = "llvm.x86.sse.min.ss";
  95             intr_size = 128;
  96          }
  97          else if (type.length <= 4 || !util_cpu_caps.has_avx) {
  98             intrinsic = "llvm.x86.sse.min.ps";
  99             intr_size = 128;
 100          }
 101          else {
 102             intrinsic = "llvm.x86.avx.min.ps.256";
 103             intr_size = 256;
 104          }
 105       }
 106       if (type.width == 64 && util_cpu_caps.has_sse2) {
 107          if (type.length == 1) {
 108             intrinsic = "llvm.x86.sse2.min.sd";
 109             intr_size = 128;
 110          }
 111          else if (type.length == 2 || !util_cpu_caps.has_avx) {
 112             intrinsic = "llvm.x86.sse2.min.pd";
 113             intr_size = 128;
 114          }
 115          else {
 116             intrinsic = "llvm.x86.avx.min.pd.256";
 117             intr_size = 256;
 118          }
 119       }
 120    }
 121    else if (type.floating && util_cpu_caps.has_altivec) {
 122       if (type.width == 32 && type.length == 4) {
 123          intrinsic = "llvm.ppc.altivec.vminfp";
 124          intr_size = 128;
 125       }
 126    } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
 127       intr_size = 128;
 128       if ((type.width == 8 || type.width == 16) &&
 129           (type.width * type.length <= 64) &&
 130           (gallivm_debug & GALLIVM_DEBUG_PERF)) {
 131          debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
 132                       __FUNCTION__);
 133          }
 134       if (type.width == 8 && !type.sign) {
 135          intrinsic = "llvm.x86.sse2.pminu.b";
 136       }
 137       else if (type.width == 16 && type.sign) {
 138          intrinsic = "llvm.x86.sse2.pmins.w";
 139       }
 140       if (util_cpu_caps.has_sse4_1) {
 141          if (type.width == 8 && type.sign) {
 142             intrinsic = "llvm.x86.sse41.pminsb";
 143          }
 144          if (type.width == 16 && !type.sign) {
 145             intrinsic = "llvm.x86.sse41.pminuw";
 146          }
 147          if (type.width == 32 && !type.sign) {
 148             intrinsic = "llvm.x86.sse41.pminud";
 149         }
 150          if (type.width == 32 && type.sign) {
 151             intrinsic = "llvm.x86.sse41.pminsd";
 152          }
 153       }
 154    } else if (util_cpu_caps.has_altivec) {
 155      intr_size = 128;
 156      if (type.width == 8) {
 157        if (!type.sign) {
 158          intrinsic = "llvm.ppc.altivec.vminub";
 159        } else {
 160          intrinsic = "llvm.ppc.altivec.vminsb";
 161        }
 162      } else if (type.width == 16) {
 163        if (!type.sign) {
 164          intrinsic = "llvm.ppc.altivec.vminuh";
 165        } else {
 166          intrinsic = "llvm.ppc.altivec.vminsh";
 167        }
 168      } else if (type.width == 32) {
 169        if (!type.sign) {
 170          intrinsic = "llvm.ppc.altivec.vminuw";
 171        } else {
 172          intrinsic = "llvm.ppc.altivec.vminsw";
 173        }
 174      }
 175    }
 176
 177    if(intrinsic) {
 178       return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
 179                                                  type,
 180                                                  intr_size, a, b);
 181    }
 182
 183    cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
 184    return lp_build_select(bld, cond, a, b);
 185 }
 186
 187
 188 /**
 189  * Generate max(a, b)
 190  * No checks for special case values of a or b = 1 or 0 are done.
 191  */
 192 static LLVMValueRef
 193 lp_build_max_simple(struct lp_build_context *bld,
 194                     LLVMValueRef a,
 195                     LLVMValueRef b)
 196 {
 197    const struct lp_type type = bld->type;
 198    const char *intrinsic = NULL;
 199    unsigned intr_size = 0;
 200    LLVMValueRef cond;
 201
 202    assert(lp_check_value(type, a));
 203    assert(lp_check_value(type, b));
 204
 205    /* TODO: optimize the constant case */
 206
 207    if (type.floating && util_cpu_caps.has_sse) {
 208       if (type.width == 32) {
 209          if (type.length == 1) {
 210             intrinsic = "llvm.x86.sse.max.ss";
 211             intr_size = 128;
 212          }
 213          else if (type.length <= 4 || !util_cpu_caps.has_avx) {
 214             intrinsic = "llvm.x86.sse.max.ps";
 215             intr_size = 128;
 216          }
 217          else {
 218             intrinsic = "llvm.x86.avx.max.ps.256";
 219             intr_size = 256;
 220          }
 221       }
 222       if (type.width == 64 && util_cpu_caps.has_sse2) {
 223          if (type.length == 1) {
 224             intrinsic = "llvm.x86.sse2.max.sd";
 225             intr_size = 128;
 226          }
 227          else if (type.length == 2 || !util_cpu_caps.has_avx) {
 228             intrinsic = "llvm.x86.sse2.max.pd";
 229             intr_size = 128;
 230          }
 231          else {
 232             intrinsic = "llvm.x86.avx.max.pd.256";
 233             intr_size = 256;
 234          }
 235       }
 236    }
 237    else if (type.floating && util_cpu_caps.has_altivec) {
 238       if (type.width == 32 || type.length == 4) {
 239          intrinsic = "llvm.ppc.altivec.vmaxfp";
 240          intr_size = 128;
 241       }
 242    } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
 243       intr_size = 128;
 244       if ((type.width == 8 || type.width == 16) &&
 245           (type.width * type.length <= 64) &&
 246           (gallivm_debug & GALLIVM_DEBUG_PERF)) {
 247          debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
 248                       __FUNCTION__);
 249          }
 250       if (type.width == 8 && !type.sign) {
 251          intrinsic = "llvm.x86.sse2.pmaxu.b";
 252          intr_size = 128;
 253       }
 254       else if (type.width == 16 && type.sign) {
 255          intrinsic = "llvm.x86.sse2.pmaxs.w";
 256       }
 257       if (util_cpu_caps.has_sse4_1) {
 258          if (type.width == 8 && type.sign) {
 259             intrinsic = "llvm.x86.sse41.pmaxsb";
 260          }
 261          if (type.width == 16 && !type.sign) {
 262             intrinsic = "llvm.x86.sse41.pmaxuw";
 263          }
 264          if (type.width == 32 && !type.sign) {
 265             intrinsic = "llvm.x86.sse41.pmaxud";
 266         }
 267          if (type.width == 32 && type.sign) {
 268             intrinsic = "llvm.x86.sse41.pmaxsd";
 269          }
 270       }
 271    } else if (util_cpu_caps.has_altivec) {
 272      intr_size = 128;
 273      if (type.width == 8) {
 274        if (!type.sign) {
 275          intrinsic = "llvm.ppc.altivec.vmaxub";
 276        } else {
 277          intrinsic = "llvm.ppc.altivec.vmaxsb";
 278        }
 279      } else if (type.width == 16) {
 280        if (!type.sign) {
 281          intrinsic = "llvm.ppc.altivec.vmaxuh";
 282        } else {
 283          intrinsic = "llvm.ppc.altivec.vmaxsh";
 284        }
 285      } else if (type.width == 32) {
 286        if (!type.sign) {
 287          intrinsic = "llvm.ppc.altivec.vmaxuw";
 288        } else {
 289          intrinsic = "llvm.ppc.altivec.vmaxsw";
 290        }
 291      }
 292    }
 293
 294    if(intrinsic) {
 295       return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
 296                                                  type,
 297                                                  intr_size, a, b);
 298    }
 299
 300    cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
 301    return lp_build_select(bld, cond, a, b);
 302 }
 303
 304
 305 /**
 306  * Generate 1 - a, or ~a depending on bld->type.
 307  */
 308 LLVMValueRef
 309 lp_build_comp(struct lp_build_context *bld,
 310               LLVMValueRef a)
 311 {
 312    LLVMBuilderRef builder = bld->gallivm->builder;
 313    const struct lp_type type = bld->type;
 314
 315    assert(lp_check_value(type, a));
 316
 317    if(a == bld->one)
 318       return bld->zero;
 319    if(a == bld->zero)
 320       return bld->one;
 321
 322    if(type.norm && !type.floating && !type.fixed && !type.sign) {
 323       if(LLVMIsConstant(a))
 324          return LLVMConstNot(a);
 325       else
 326          return LLVMBuildNot(builder, a, "");
 327    }
 328
 329    if(LLVMIsConstant(a))
 330       if (type.floating)
 331           return LLVMConstFSub(bld->one, a);
 332       else
 333           return LLVMConstSub(bld->one, a);
 334    else
 335       if (type.floating)
 336          return LLVMBuildFSub(builder, bld->one, a, "");
 337       else
 338          return LLVMBuildSub(builder, bld->one, a, "");
 339 }
 340
 341
 342 /**
 343  * Generate a + b
 344  */
 345 LLVMValueRef
 346 lp_build_add(struct lp_build_context *bld,
 347              LLVMValueRef a,
 348              LLVMValueRef b)
 349 {
 350    LLVMBuilderRef builder = bld->gallivm->builder;
 351    const struct lp_type type = bld->type;
 352    LLVMValueRef res;
 353
 354    assert(lp_check_value(type, a));
 355    assert(lp_check_value(type, b));
 356
 357    if(a == bld->zero)
 358       return b;
 359    if(b == bld->zero)
 360       return a;
 361    if(a == bld->undef || b == bld->undef)
 362       return bld->undef;
 363
 364    if(bld->type.norm) {
 365       const char *intrinsic = NULL;
 366
 367       if(a == bld->one || b == bld->one)
 368         return bld->one;
 369
 370       if (type.width * type.length == 128 &&
 371           !type.floating && !type.fixed) {
 372          if(util_cpu_caps.has_sse2) {
 373            if(type.width == 8)
 374              intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
 375            if(type.width == 16)
 376              intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
 377          } else if (util_cpu_caps.has_altivec) {
 378            if(type.width == 8)
 379               intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
 380            if(type.width == 16)
 381               intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
 382          }
 383       }
 384
 385       if(intrinsic)
 386          return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
 387    }
 388
 389    if(LLVMIsConstant(a) && LLVMIsConstant(b))
 390       if (type.floating)
 391          res = LLVMConstFAdd(a, b);
 392       else
 393          res = LLVMConstAdd(a, b);
 394    else
 395       if (type.floating)
 396          res = LLVMBuildFAdd(builder, a, b, "");
 397       else
 398          res = LLVMBuildAdd(builder, a, b, "");
 399
 400    /* clamp to ceiling of 1.0 */
 401    if(bld->type.norm && (bld->type.floating || bld->type.fixed))
 402       res = lp_build_min_simple(bld, res, bld->one);
 403
 404    /* XXX clamp to floor of -1 or 0??? */
 405
 406    return res;
 407 }
 408
 409
 410 /** Return the scalar sum of the elements of a.
 411  * Should avoid this operation whenever possible.
 412  */
 413 LLVMValueRef
 414 lp_build_horizontal_add(struct lp_build_context *bld,
 415                         LLVMValueRef a)
 416 {
 417    LLVMBuilderRef builder = bld->gallivm->builder;
 418    const struct lp_type type = bld->type;
 419    LLVMValueRef index, res;
 420    unsigned i, length;
 421    LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
 422    LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
 423    LLVMValueRef vecres, elem2;
 424
 425    assert(lp_check_value(type, a));
 426
 427    if (type.length == 1) {
 428       return a;
 429    }
 430
 431    assert(!bld->type.norm);
 432
 433    /*
 434     * for byte vectors can do much better with psadbw.
 435     * Using repeated shuffle/adds here. Note with multiple vectors
 436     * this can be done more efficiently as outlined in the intel
 437     * optimization manual.
 438     * Note: could cause data rearrangement if used with smaller element
 439     * sizes.
 440     */
 441
 442    vecres = a;
 443    length = type.length / 2;
 444    while (length > 1) {
 445       LLVMValueRef vec1, vec2;
 446       for (i = 0; i < length; i++) {
 447          shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
 448          shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
 449       }
 450       vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
 451                                     LLVMConstVector(shuffles1, length), "");
 452       vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
 453                                     LLVMConstVector(shuffles2, length), "");
 454       if (type.floating) {
 455          vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
 456       }
 457       else {
 458          vecres = LLVMBuildAdd(builder, vec1, vec2, "");
 459       }
 460       length = length >> 1;
 461    }
 462
 463    /* always have vector of size 2 here */
 464    assert(length == 1);
 465
 466    index = lp_build_const_int32(bld->gallivm, 0);
 467    res = LLVMBuildExtractElement(builder, vecres, index, "");
 468    index = lp_build_const_int32(bld->gallivm, 1);
 469    elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
 470
 471    if (type.floating)
 472       res = LLVMBuildFAdd(builder, res, elem2, "");
 473     else
 474       res = LLVMBuildAdd(builder, res, elem2, "");
 475
 476    return res;
 477 }
 478
 479 /**
 480  * Return the horizontal sums of 4 float vectors as a float4 vector.
 481  * This uses the technique as outlined in Intel Optimization Manual.
 482  */
 483 static LLVMValueRef
 484 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
 485                             LLVMValueRef src[4])
 486 {
 487    struct gallivm_state *gallivm = bld->gallivm;
 488    LLVMBuilderRef builder = gallivm->builder;
 489    LLVMValueRef shuffles[4];
 490    LLVMValueRef tmp[4];
 491    LLVMValueRef sumtmp[2], shuftmp[2];
 492
 493    /* lower half of regs */
 494    shuffles[0] = lp_build_const_int32(gallivm, 0);
 495    shuffles[1] = lp_build_const_int32(gallivm, 1);
 496    shuffles[2] = lp_build_const_int32(gallivm, 4);
 497    shuffles[3] = lp_build_const_int32(gallivm, 5);
 498    tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
 499                                    LLVMConstVector(shuffles, 4), "");
 500    tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
 501                                    LLVMConstVector(shuffles, 4), "");
 502
 503    /* upper half of regs */
 504    shuffles[0] = lp_build_const_int32(gallivm, 2);
 505    shuffles[1] = lp_build_const_int32(gallivm, 3);
 506    shuffles[2] = lp_build_const_int32(gallivm, 6);
 507    shuffles[3] = lp_build_const_int32(gallivm, 7);
 508    tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
 509                                    LLVMConstVector(shuffles, 4), "");
 510    tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
 511                                    LLVMConstVector(shuffles, 4), "");
 512
 513    sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
 514    sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
 515
 516    shuffles[0] = lp_build_const_int32(gallivm, 0);
 517    shuffles[1] = lp_build_const_int32(gallivm, 2);
 518    shuffles[2] = lp_build_const_int32(gallivm, 4);
 519    shuffles[3] = lp_build_const_int32(gallivm, 6);
 520    shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
 521                                        LLVMConstVector(shuffles, 4), "");
 522
 523    shuffles[0] = lp_build_const_int32(gallivm, 1);
 524    shuffles[1] = lp_build_const_int32(gallivm, 3);
 525    shuffles[2] = lp_build_const_int32(gallivm, 5);
 526    shuffles[3] = lp_build_const_int32(gallivm, 7);
 527    shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
 528                                        LLVMConstVector(shuffles, 4), "");
 529
 530    return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
 531 }
 532
 533
 534 /*
 535  * partially horizontally add 2-4 float vectors with length nx4,
 536  * i.e. only four adjacent values in each vector will be added,
 537  * assuming values are really grouped in 4 which also determines
 538  * output order.
 539  *
 540  * Return a vector of the same length as the initial vectors,
 541  * with the excess elements (if any) being undefined.
 542  * The element order is independent of number of input vectors.
 543  * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
 544  * the output order thus will be
 545  * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4z7,undef
 546  */
 547 LLVMValueRef
 548 lp_build_hadd_partial4(struct lp_build_context *bld,
 549                        LLVMValueRef vectors[],
 550                        unsigned num_vecs)
 551 {
 552    struct gallivm_state *gallivm = bld->gallivm;
 553    LLVMBuilderRef builder = gallivm->builder;
 554    LLVMValueRef ret_vec;
 555    LLVMValueRef tmp[4];
 556    const char *intrinsic = NULL;
 557
 558    assert(num_vecs >= 2 && num_vecs <= 4);
 559    assert(bld->type.floating);
 560
 561    /* only use this with at least 2 vectors, as it is sort of expensive
 562     * (depending on cpu) and we always need two horizontal adds anyway,
 563     * so a shuffle/add approach might be better.
 564     */
 565
 566    tmp[0] = vectors[0];
 567    tmp[1] = vectors[1];
 568
 569    tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
 570    tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
 571
 572    if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
 573        bld->type.length == 4) {
 574       intrinsic = "llvm.x86.sse3.hadd.ps";
 575    }
 576    else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
 577             bld->type.length == 8) {
 578       intrinsic = "llvm.x86.avx.hadd.ps.256";
 579    }
 580    if (intrinsic) {
 581       tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
 582                                        lp_build_vec_type(gallivm, bld->type),
 583                                        tmp[0], tmp[1]);
 584       if (num_vecs > 2) {
 585          tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
 586                                           lp_build_vec_type(gallivm, bld->type),
 587                                           tmp[2], tmp[3]);
 588       }
 589       else {
 590          tmp[1] = tmp[0];
 591       }
 592       return lp_build_intrinsic_binary(builder, intrinsic,
 593                                        lp_build_vec_type(gallivm, bld->type),
 594                                        tmp[0], tmp[1]);
 595    }
 596
 597    if (bld->type.length == 4) {
 598       ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
 599    }
 600    else {
 601       LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
 602       unsigned j;
 603       unsigned num_iter = bld->type.length / 4;
 604       struct lp_type parttype = bld->type;
 605       parttype.length = 4;
 606       for (j = 0; j < num_iter; j++) {
 607          LLVMValueRef partsrc[4];
 608          unsigned i;
 609          for (i = 0; i < 4; i++) {
 610             partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
 611          }
 612          partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
 613       }
 614       ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
 615    }
 616    return ret_vec;
 617 }
 618
 619 /**
 620  * Generate a - b
 621  */
 622 LLVMValueRef
 623 lp_build_sub(struct lp_build_context *bld,
 624              LLVMValueRef a,
 625              LLVMValueRef b)
 626 {
 627    LLVMBuilderRef builder = bld->gallivm->builder;
 628    const struct lp_type type = bld->type;
 629    LLVMValueRef res;
 630
 631    assert(lp_check_value(type, a));
 632    assert(lp_check_value(type, b));
 633
 634    if(b == bld->zero)
 635       return a;
 636    if(a == bld->undef || b == bld->undef)
 637       return bld->undef;
 638    if(a == b)
 639       return bld->zero;
 640
 641    if(bld->type.norm) {
 642       const char *intrinsic = NULL;
 643
 644       if(b == bld->one)
 645         return bld->zero;
 646
 647       if (type.width * type.length == 128 &&
 648           !type.floating && !type.fixed) {
 649          if (util_cpu_caps.has_sse2) {
 650            if(type.width == 8)
 651               intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
 652            if(type.width == 16)
 653               intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
 654          } else if (util_cpu_caps.has_altivec) {
 655            if(type.width == 8)
 656               intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
 657            if(type.width == 16)
 658               intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
 659          }
 660       }
 661
 662       if(intrinsic)
 663          return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
 664    }
 665
 666    if(LLVMIsConstant(a) && LLVMIsConstant(b))
 667       if (type.floating)
 668          res = LLVMConstFSub(a, b);
 669       else
 670          res = LLVMConstSub(a, b);
 671    else
 672       if (type.floating)
 673          res = LLVMBuildFSub(builder, a, b, "");
 674       else
 675          res = LLVMBuildSub(builder, a, b, "");
 676
 677    if(bld->type.norm && (bld->type.floating || bld->type.fixed))
 678       res = lp_build_max_simple(bld, res, bld->zero);
 679
 680    return res;
 681 }
 682
 683
 684
 685 /**
 686  * Normalized multiplication.
 687  *
 688  * There are several approaches for (using 8-bit normalized multiplication as
 689  * an example):
 690  *
 691  * - alpha plus one
 692  *
 693  *     makes the following approximation to the division (Sree)
 694  *
 695  *       a*b/255 ~= (a*(b + 1)) >> 256
 696  *
 697  *     which is the fastest method that satisfies the following OpenGL criteria of
 698  *
 699  *       0*0 = 0 and 255*255 = 255
 700  *
 701  * - geometric series
 702  *
 703  *     takes the geometric series approximation to the division
 704  *
 705  *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
 706  *
 707  *     in this case just the first two terms to fit in 16bit arithmetic
 708  *
 709  *       t/255 ~= (t + (t >> 8)) >> 8
 710  *
 711  *     note that just by itself it doesn't satisfies the OpenGL criteria, as
 712  *     255*255 = 254, so the special case b = 255 must be accounted or roundoff
 713  *     must be used.
 714  *
 715  * - geometric series plus rounding
 716  *
 717  *     when using a geometric series division instead of truncating the result
 718  *     use roundoff in the approximation (Jim Blinn)
 719  *
 720  *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
 721  *
 722  *     achieving the exact results.
 723  *
 724  *
 725  *
 726  * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
 727  *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
 728  * @sa Michael Herf, The "double blend trick", May 2000,
 729  *     http://www.stereopsis.com/doubleblend.html
 730  */
 731 static LLVMValueRef
 732 lp_build_mul_norm(struct gallivm_state *gallivm,
 733                   struct lp_type wide_type,
 734                   LLVMValueRef a, LLVMValueRef b)
 735 {
 736    LLVMBuilderRef builder = gallivm->builder;
 737    struct lp_build_context bld;
 738    unsigned n;
 739    LLVMValueRef half;
 740    LLVMValueRef ab;
 741
 742    assert(!wide_type.floating);
 743    assert(lp_check_value(wide_type, a));
 744    assert(lp_check_value(wide_type, b));
 745
 746    lp_build_context_init(&bld, gallivm, wide_type);
 747
 748    n = wide_type.width / 2;
 749    if (wide_type.sign) {
 750       --n;
 751    }
 752
 753    /*
 754     * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
 755     * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
 756     */
 757
 758    /*
 759     * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
 760     */
 761
 762    ab = LLVMBuildMul(builder, a, b, "");
 763    ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
 764
 765    /*
 766     * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
 767     */
 768
 769    half = lp_build_const_int_vec(gallivm, wide_type, 1 << (n - 1));
 770    if (wide_type.sign) {
 771       LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
 772       LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
 773       half = lp_build_select(&bld, sign, minus_half, half);
 774    }
 775    ab = LLVMBuildAdd(builder, ab, half, "");
 776
 777    /* Final division */
 778    ab = lp_build_shr_imm(&bld, ab, n);
 779
 780    return ab;
 781 }
 782
 783 /**
 784  * Generate a * b
 785  */
 786 LLVMValueRef
 787 lp_build_mul(struct lp_build_context *bld,
 788              LLVMValueRef a,
 789              LLVMValueRef b)
 790 {
 791    LLVMBuilderRef builder = bld->gallivm->builder;
 792    const struct lp_type type = bld->type;
 793    LLVMValueRef shift;
 794    LLVMValueRef res;
 795
 796    assert(lp_check_value(type, a));
 797    assert(lp_check_value(type, b));
 798
 799    if(a == bld->zero)
 800       return bld->zero;
 801    if(a == bld->one)
 802       return b;
 803    if(b == bld->zero)
 804       return bld->zero;
 805    if(b == bld->one)
 806       return a;
 807    if(a == bld->undef || b == bld->undef)
 808       return bld->undef;
 809
 810    if (!type.floating && !type.fixed && type.norm) {
 811       struct lp_type wide_type = lp_wider_type(type);
 812       LLVMValueRef al, ah, bl, bh, abl, abh, ab;
 813
 814       lp_build_unpack2(bld->gallivm, type, wide_type, a, &al, &ah);
 815       lp_build_unpack2(bld->gallivm, type, wide_type, b, &bl, &bh);
 816
 817       /* PMULLW, PSRLW, PADDW */
 818       abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
 819       abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
 820
 821       ab = lp_build_pack2(bld->gallivm, wide_type, type, abl, abh);
 822
 823       return ab;
 824    }
 825
 826    if(type.fixed)
 827       shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
 828    else
 829       shift = NULL;
 830
 831    if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
 832       if (type.floating)
 833          res = LLVMConstFMul(a, b);
 834       else
 835          res = LLVMConstMul(a, b);
 836       if(shift) {
 837          if(type.sign)
 838             res = LLVMConstAShr(res, shift);
 839          else
 840             res = LLVMConstLShr(res, shift);
 841       }
 842    }
 843    else {
 844       if (type.floating)
 845          res = LLVMBuildFMul(builder, a, b, "");
 846       else
 847          res = LLVMBuildMul(builder, a, b, "");
 848       if(shift) {
 849          if(type.sign)
 850             res = LLVMBuildAShr(builder, res, shift, "");
 851          else
 852             res = LLVMBuildLShr(builder, res, shift, "");
 853       }
 854    }
 855
 856    return res;
 857 }
 858
 859
 860 /**
 861  * Small vector x scale multiplication optimization.
 862  */
 863 LLVMValueRef
 864 lp_build_mul_imm(struct lp_build_context *bld,
 865                  LLVMValueRef a,
 866                  int b)
 867 {
 868    LLVMBuilderRef builder = bld->gallivm->builder;
 869    LLVMValueRef factor;
 870
 871    assert(lp_check_value(bld->type, a));
 872
 873    if(b == 0)
 874       return bld->zero;
 875
 876    if(b == 1)
 877       return a;
 878
 879    if(b == -1)
 880       return lp_build_negate(bld, a);
 881
 882    if(b == 2 && bld->type.floating)
 883       return lp_build_add(bld, a, a);
 884
 885    if(util_is_power_of_two(b)) {
 886       unsigned shift = ffs(b) - 1;
 887
 888       if(bld->type.floating) {
 889 #if 0
 890          /*
 891           * Power of two multiplication by directly manipulating the exponent.
 892           *
 893           * XXX: This might not be always faster, it will introduce a small error
 894           * for multiplication by zero, and it will produce wrong results
 895           * for Inf and NaN.
 896           */
 897          unsigned mantissa = lp_mantissa(bld->type);
 898          factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
 899          a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
 900          a = LLVMBuildAdd(builder, a, factor, "");
 901          a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
 902          return a;
 903 #endif
 904       }
 905       else {
 906          factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
 907          return LLVMBuildShl(builder, a, factor, "");
 908       }
 909    }
 910
 911    factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
 912    return lp_build_mul(bld, a, factor);
 913 }
 914
 915
 916 /**
 917  * Generate a / b
 918  */
 919 LLVMValueRef
 920 lp_build_div(struct lp_build_context *bld,
 921              LLVMValueRef a,
 922              LLVMValueRef b)
 923 {
 924    LLVMBuilderRef builder = bld->gallivm->builder;
 925    const struct lp_type type = bld->type;
 926
 927    assert(lp_check_value(type, a));
 928    assert(lp_check_value(type, b));
 929
 930    if(a == bld->zero)
 931       return bld->zero;
 932    if(a == bld->one)
 933       return lp_build_rcp(bld, b);
 934    if(b == bld->zero)
 935       return bld->undef;
 936    if(b == bld->one)
 937       return a;
 938    if(a == bld->undef || b == bld->undef)
 939       return bld->undef;
 940
 941    if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
 942       if (type.floating)
 943          return LLVMConstFDiv(a, b);
 944       else if (type.sign)
 945          return LLVMConstSDiv(a, b);
 946       else
 947          return LLVMConstUDiv(a, b);
 948    }
 949
 950    if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
 951        (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
 952       type.floating)
 953       return lp_build_mul(bld, a, lp_build_rcp(bld, b));
 954
 955    if (type.floating)
 956       return LLVMBuildFDiv(builder, a, b, "");
 957    else if (type.sign)
 958       return LLVMBuildSDiv(builder, a, b, "");
 959    else
 960       return LLVMBuildUDiv(builder, a, b, "");
 961 }
 962
 963
 964 /**
 965  * Linear interpolation helper.
 966  *
 967  * @param normalized whether we are interpolating normalized values,
 968  *        encoded in normalized integers, twice as wide.
 969  *
 970  * @sa http://www.stereopsis.com/doubleblend.html
 971  */
 972 static INLINE LLVMValueRef
 973 lp_build_lerp_simple(struct lp_build_context *bld,
 974                      LLVMValueRef x,
 975                      LLVMValueRef v0,
 976                      LLVMValueRef v1,
 977                      bool normalized)
 978 {
 979    unsigned half_width = bld->type.width/2;
 980    LLVMBuilderRef builder = bld->gallivm->builder;
 981    LLVMValueRef delta;
 982    LLVMValueRef res;
 983
 984    assert(lp_check_value(bld->type, x));
 985    assert(lp_check_value(bld->type, v0));
 986    assert(lp_check_value(bld->type, v1));
 987
 988    delta = lp_build_sub(bld, v1, v0);
 989
 990    if (normalized) {
 991       if (!bld->type.sign) {
 992          /*
 993           * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
 994           * most-significant-bit to the lowest-significant-bit, so that
 995           * later we can just divide by 2**n instead of 2**n - 1.
 996           */
 997          x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
 998
 999          /* (x * delta) >> n */
1000          res = lp_build_mul(bld, x, delta);
1001          res = lp_build_shr_imm(bld, res, half_width);
1002       } else {
1003          /*
1004           * The rescaling trick above doesn't work for signed numbers, so
1005           * use the 2**n - 1 divison approximation in lp_build_mul_norm
1006           * instead.
1007           */
1008          res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1009       }
1010    } else {
1011       res = lp_build_mul(bld, x, delta);
1012    }
1013
1014    res = lp_build_add(bld, v0, res);
1015
1016    if ((normalized && !bld->type.sign) || bld->type.fixed) {
1017       /* We need to mask out the high order bits when lerping 8bit normalized colors stored on 16bits */
1018       /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
1019        * but it will be wrong for true fixed point use cases. Basically we need
1020        * a more powerful lp_type, capable of further distinguishing the values
1021        * interpretation from the value storage. */
1022       res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1), "");
1023    }
1024
1025    return res;
1026 }
1027
1028
1029 /**
1030  * Linear interpolation.
1031  */
1032 LLVMValueRef
1033 lp_build_lerp(struct lp_build_context *bld,
1034               LLVMValueRef x,
1035               LLVMValueRef v0,
1036               LLVMValueRef v1)
1037 {
1038    const struct lp_type type = bld->type;
1039    LLVMValueRef res;
1040
1041    assert(lp_check_value(type, x));
1042    assert(lp_check_value(type, v0));
1043    assert(lp_check_value(type, v1));
1044
1045    if (type.norm) {
1046       struct lp_type wide_type;
1047       struct lp_build_context wide_bld;
1048       LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1049
1050       assert(type.length >= 2);
1051
1052       /*
1053        * Create a wider integer type, enough to hold the
1054        * intermediate result of the multiplication.
1055        */
1056       memset(&wide_type, 0, sizeof wide_type);
1057       wide_type.sign   = type.sign;
1058       wide_type.width  = type.width*2;
1059       wide_type.length = type.length/2;
1060
1061       lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1062
1063       lp_build_unpack2(bld->gallivm, type, wide_type, x,  &xl,  &xh);
1064       lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1065       lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1066
1067       /*
1068        * Lerp both halves.
1069        */
1070
1071       resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, TRUE);
1072       resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, TRUE);
1073
1074       res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
1075    } else {
1076       res = lp_build_lerp_simple(bld, x, v0, v1, FALSE);
1077    }
1078
1079    return res;
1080 }
1081
1082
1083 LLVMValueRef
1084 lp_build_lerp_2d(struct lp_build_context *bld,
1085                  LLVMValueRef x,
1086                  LLVMValueRef y,
1087                  LLVMValueRef v00,
1088                  LLVMValueRef v01,
1089                  LLVMValueRef v10,
1090                  LLVMValueRef v11)
1091 {
1092    LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01);
1093    LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11);
1094    return lp_build_lerp(bld, y, v0, v1);
1095 }
1096
1097
1098 /**
1099  * Generate min(a, b)
1100  * Do checks for special cases.
1101  */
1102 LLVMValueRef
1103 lp_build_min(struct lp_build_context *bld,
1104              LLVMValueRef a,
1105              LLVMValueRef b)
1106 {
1107    assert(lp_check_value(bld->type, a));
1108    assert(lp_check_value(bld->type, b));
1109
1110    if(a == bld->undef || b == bld->undef)
1111       return bld->undef;
1112
1113    if(a == b)
1114       return a;
1115
1116    if (bld->type.norm) {
1117       if (!bld->type.sign) {
1118          if (a == bld->zero || b == bld->zero) {
1119             return bld->zero;
1120          }
1121       }
1122       if(a == bld->one)
1123          return b;
1124       if(b == bld->one)
1125          return a;
1126    }
1127
1128    return lp_build_min_simple(bld, a, b);
1129 }
1130
1131
1132 /**
1133  * Generate max(a, b)
1134  * Do checks for special cases.
1135  */
1136 LLVMValueRef
1137 lp_build_max(struct lp_build_context *bld,
1138              LLVMValueRef a,
1139              LLVMValueRef b)
1140 {
1141    assert(lp_check_value(bld->type, a));
1142    assert(lp_check_value(bld->type, b));
1143
1144    if(a == bld->undef || b == bld->undef)
1145       return bld->undef;
1146
1147    if(a == b)
1148       return a;
1149
1150    if(bld->type.norm) {
1151       if(a == bld->one || b == bld->one)
1152          return bld->one;
1153       if (!bld->type.sign) {
1154          if (a == bld->zero) {
1155             return b;
1156          }
1157          if (b == bld->zero) {
1158             return a;
1159          }
1160       }
1161    }
1162
1163    return lp_build_max_simple(bld, a, b);
1164 }
1165
1166
1167 /**
1168  * Generate clamp(a, min, max)
1169  * Do checks for special cases.
1170  */
1171 LLVMValueRef
1172 lp_build_clamp(struct lp_build_context *bld,
1173                LLVMValueRef a,
1174                LLVMValueRef min,
1175                LLVMValueRef max)
1176 {
1177    assert(lp_check_value(bld->type, a));
1178    assert(lp_check_value(bld->type, min));
1179    assert(lp_check_value(bld->type, max));
1180
1181    a = lp_build_min(bld, a, max);
1182    a = lp_build_max(bld, a, min);
1183    return a;
1184 }
1185
1186
1187 /**
1188  * Generate abs(a)
1189  */
1190 LLVMValueRef
1191 lp_build_abs(struct lp_build_context *bld,
1192              LLVMValueRef a)
1193 {
1194    LLVMBuilderRef builder = bld->gallivm->builder;
1195    const struct lp_type type = bld->type;
1196    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1197
1198    assert(lp_check_value(type, a));
1199
1200    if(!type.sign)
1201       return a;
1202
1203    if(type.floating) {
1204       /* Mask out the sign bit */
1205       LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1206       unsigned long long absMask = ~(1ULL << (type.width - 1));
1207       LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
1208       a = LLVMBuildBitCast(builder, a, int_vec_type, "");
1209       a = LLVMBuildAnd(builder, a, mask, "");
1210       a = LLVMBuildBitCast(builder, a, vec_type, "");
1211       return a;
1212    }
1213
1214    if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
1215       switch(type.width) {
1216       case 8:
1217          return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1218       case 16:
1219          return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1220       case 32:
1221          return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1222       }
1223    }
1224    else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
1225             (gallivm_debug & GALLIVM_DEBUG_PERF) &&
1226             (type.width == 8 || type.width == 16 || type.width == 32)) {
1227       debug_printf("%s: inefficient code, should split vectors manually\n",
1228                    __FUNCTION__);
1229    }
1230
1231    return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
1232 }
1233
1234
1235 LLVMValueRef
1236 lp_build_negate(struct lp_build_context *bld,
1237                 LLVMValueRef a)
1238 {
1239    LLVMBuilderRef builder = bld->gallivm->builder;
1240
1241    assert(lp_check_value(bld->type, a));
1242
1243 #if HAVE_LLVM >= 0x0207
1244    if (bld->type.floating)
1245       a = LLVMBuildFNeg(builder, a, "");
1246    else
1247 #endif
1248       a = LLVMBuildNeg(builder, a, "");
1249
1250    return a;
1251 }
1252
1253
1254 /** Return -1, 0 or +1 depending on the sign of a */
1255 LLVMValueRef
1256 lp_build_sgn(struct lp_build_context *bld,
1257              LLVMValueRef a)
1258 {
1259    LLVMBuilderRef builder = bld->gallivm->builder;
1260    const struct lp_type type = bld->type;
1261    LLVMValueRef cond;
1262    LLVMValueRef res;
1263
1264    assert(lp_check_value(type, a));
1265
1266    /* Handle non-zero case */
1267    if(!type.sign) {
1268       /* if not zero then sign must be positive */
1269       res = bld->one;
1270    }
1271    else if(type.floating) {
1272       LLVMTypeRef vec_type;
1273       LLVMTypeRef int_type;
1274       LLVMValueRef mask;
1275       LLVMValueRef sign;
1276       LLVMValueRef one;
1277       unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1278
1279       int_type = lp_build_int_vec_type(bld->gallivm, type);
1280       vec_type = lp_build_vec_type(bld->gallivm, type);
1281       mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1282
1283       /* Take the sign bit and add it to 1 constant */
1284       sign = LLVMBuildBitCast(builder, a, int_type, "");
1285       sign = LLVMBuildAnd(builder, sign, mask, "");
1286       one = LLVMConstBitCast(bld->one, int_type);
1287       res = LLVMBuildOr(builder, sign, one, "");
1288       res = LLVMBuildBitCast(builder, res, vec_type, "");
1289    }
1290    else
1291    {
1292       /* signed int/norm/fixed point */
1293       /* could use psign with sse3 and appropriate vectors here */
1294       LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1295       cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1296       res = lp_build_select(bld, cond, bld->one, minus_one);
1297    }
1298
1299    /* Handle zero */
1300    cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1301    res = lp_build_select(bld, cond, bld->zero, res);
1302
1303    return res;
1304 }
1305
1306
1307 /**
1308  * Set the sign of float vector 'a' according to 'sign'.
1309  * If sign==0, return abs(a).
1310  * If sign==1, return -abs(a);
1311  * Other values for sign produce undefined results.
1312  */
1313 LLVMValueRef
1314 lp_build_set_sign(struct lp_build_context *bld,
1315                   LLVMValueRef a, LLVMValueRef sign)
1316 {
1317    LLVMBuilderRef builder = bld->gallivm->builder;
1318    const struct lp_type type = bld->type;
1319    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1320    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1321    LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1322    LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1323                              ~((unsigned long long) 1 << (type.width - 1)));
1324    LLVMValueRef val, res;
1325
1326    assert(type.floating);
1327    assert(lp_check_value(type, a));
1328
1329    /* val = reinterpret_cast<int>(a) */
1330    val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1331    /* val = val & mask */
1332    val = LLVMBuildAnd(builder, val, mask, "");
1333    /* sign = sign << shift */
1334    sign = LLVMBuildShl(builder, sign, shift, "");
1335    /* res = val | sign */
1336    res = LLVMBuildOr(builder, val, sign, "");
1337    /* res = reinterpret_cast<float>(res) */
1338    res = LLVMBuildBitCast(builder, res, vec_type, "");
1339
1340    return res;
1341 }
1342
1343
1344 /**
1345  * Convert vector of (or scalar) int to vector of (or scalar) float.
1346  */
1347 LLVMValueRef
1348 lp_build_int_to_float(struct lp_build_context *bld,
1349                       LLVMValueRef a)
1350 {
1351    LLVMBuilderRef builder = bld->gallivm->builder;
1352    const struct lp_type type = bld->type;
1353    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1354
1355    assert(type.floating);
1356
1357    return LLVMBuildSIToFP(builder, a, vec_type, "");
1358 }
1359
1360 static boolean
1361 arch_rounding_available(const struct lp_type type)
1362 {
1363    if ((util_cpu_caps.has_sse4_1 &&
1364        (type.length == 1 || type.width*type.length == 128)) ||
1365        (util_cpu_caps.has_avx && type.width*type.length == 256))
1366       return TRUE;
1367    else if ((util_cpu_caps.has_altivec &&
1368             (type.width == 32 && type.length == 4)))
1369       return TRUE;
1370
1371    return FALSE;
1372 }
1373
/**
 * Rounding modes accepted by the lp_build_round_* helpers.
 *
 * The numeric values are passed unchanged as the immediate operand of the
 * SSE4.1/AVX round intrinsics, so they must not be renumbered.
 */
enum lp_build_round_mode
{
   /* round to the nearest integer */
   LP_BUILD_ROUND_NEAREST = 0,

   /* round towards negative infinity */
   LP_BUILD_ROUND_FLOOR = 1,

   /* round towards positive infinity */
   LP_BUILD_ROUND_CEIL = 2,

   /* round towards zero */
   LP_BUILD_ROUND_TRUNCATE = 3
};
1381
1382 /**
1383  * Helper for SSE4.1's ROUNDxx instructions.
1384  *
1385  * NOTE: In the SSE4.1's nearest mode, if two values are equally close, the
1386  * result is the even value.  That is, rounding 2.5 will be 2.0, and not 3.0.
1387  */
1388 static INLINE LLVMValueRef
1389 lp_build_round_sse41(struct lp_build_context *bld,
1390                      LLVMValueRef a,
1391                      enum lp_build_round_mode mode)
1392 {
1393    LLVMBuilderRef builder = bld->gallivm->builder;
1394    const struct lp_type type = bld->type;
1395    LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1396    const char *intrinsic;
1397    LLVMValueRef res;
1398
1399    assert(type.floating);
1400
1401    assert(lp_check_value(type, a));
1402    assert(util_cpu_caps.has_sse4_1);
1403
1404    if (type.length == 1) {
1405       LLVMTypeRef vec_type;
1406       LLVMValueRef undef;
1407       LLVMValueRef args[3];
1408       LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1409
1410       switch(type.width) {
1411       case 32:
1412          intrinsic = "llvm.x86.sse41.round.ss";
1413          break;
1414       case 64:
1415          intrinsic = "llvm.x86.sse41.round.sd";
1416          break;
1417       default:
1418          assert(0);
1419          return bld->undef;
1420       }
1421
1422       vec_type = LLVMVectorType(bld->elem_type, 4);
1423
1424       undef = LLVMGetUndef(vec_type);
1425
1426       args[0] = undef;
1427       args[1] = LLVMBuildInsertElement(builder, undef, a, index0, "");
1428       args[2] = LLVMConstInt(i32t, mode, 0);
1429
1430       res = lp_build_intrinsic(builder, intrinsic,
1431                                vec_type, args, Elements(args));
1432
1433       res = LLVMBuildExtractElement(builder, res, index0, "");
1434    }
1435    else {
1436       if (type.width * type.length == 128) {
1437          switch(type.width) {
1438          case 32:
1439             intrinsic = "llvm.x86.sse41.round.ps";
1440             break;
1441          case 64:
1442             intrinsic = "llvm.x86.sse41.round.pd";
1443             break;
1444          default:
1445             assert(0);
1446             return bld->undef;
1447          }
1448       }
1449       else {
1450          assert(type.width * type.length == 256);
1451          assert(util_cpu_caps.has_avx);
1452
1453          switch(type.width) {
1454          case 32:
1455             intrinsic = "llvm.x86.avx.round.ps.256";
1456             break;
1457          case 64:
1458             intrinsic = "llvm.x86.avx.round.pd.256";
1459             break;
1460          default:
1461             assert(0);
1462             return bld->undef;
1463          }
1464       }
1465
1466       res = lp_build_intrinsic_binary(builder, intrinsic,
1467                                       bld->vec_type, a,
1468                                       LLVMConstInt(i32t, mode, 0));
1469    }
1470
1471    return res;
1472 }
1473
1474
1475 static INLINE LLVMValueRef
1476 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1477                              LLVMValueRef a)
1478 {
1479    LLVMBuilderRef builder = bld->gallivm->builder;
1480    const struct lp_type type = bld->type;
1481    LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1482    LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1483    const char *intrinsic;
1484    LLVMValueRef res;
1485
1486    assert(type.floating);
1487    /* using the double precision conversions is a bit more complicated */
1488    assert(type.width == 32);
1489
1490    assert(lp_check_value(type, a));
1491    assert(util_cpu_caps.has_sse2);
1492
1493    /* This is relying on MXCSR rounding mode, which should always be nearest. */
1494    if (type.length == 1) {
1495       LLVMTypeRef vec_type;
1496       LLVMValueRef undef;
1497       LLVMValueRef arg;
1498       LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1499
1500       vec_type = LLVMVectorType(bld->elem_type, 4);
1501
1502       intrinsic = "llvm.x86.sse.cvtss2si";
1503
1504       undef = LLVMGetUndef(vec_type);
1505
1506       arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
1507
1508       res = lp_build_intrinsic_unary(builder, intrinsic,
1509                                      ret_type, arg);
1510    }
1511    else {
1512       if (type.width* type.length == 128) {
1513          intrinsic = "llvm.x86.sse2.cvtps2dq";
1514       }
1515       else {
1516          assert(type.width*type.length == 256);
1517          assert(util_cpu_caps.has_avx);
1518
1519          intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
1520       }
1521       res = lp_build_intrinsic_unary(builder, intrinsic,
1522                                      ret_type, a);
1523    }
1524
1525    return res;
1526 }
1527
1528
1529 /*
1530  */
1531 static INLINE LLVMValueRef
1532 lp_build_round_altivec(struct lp_build_context *bld,
1533                        LLVMValueRef a,
1534                        enum lp_build_round_mode mode)
1535 {
1536    LLVMBuilderRef builder = bld->gallivm->builder;
1537    const struct lp_type type = bld->type;
1538    const char *intrinsic = NULL;
1539
1540    assert(type.floating);
1541
1542    assert(lp_check_value(type, a));
1543    assert(util_cpu_caps.has_altivec);
1544
1545    switch (mode) {
1546    case LP_BUILD_ROUND_NEAREST:
1547       intrinsic = "llvm.ppc.altivec.vrfin";
1548       break;
1549    case LP_BUILD_ROUND_FLOOR:
1550       intrinsic = "llvm.ppc.altivec.vrfim";
1551       break;
1552    case LP_BUILD_ROUND_CEIL:
1553       intrinsic = "llvm.ppc.altivec.vrfip";
1554       break;
1555    case LP_BUILD_ROUND_TRUNCATE:
1556       intrinsic = "llvm.ppc.altivec.vrfiz";
1557       break;
1558    }
1559
1560    return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1561 }
1562
1563 static INLINE LLVMValueRef
1564 lp_build_round_arch(struct lp_build_context *bld,
1565                     LLVMValueRef a,
1566                     enum lp_build_round_mode mode)
1567 {
1568    if (util_cpu_caps.has_sse4_1)
1569      return lp_build_round_sse41(bld, a, mode);
1570    else /* (util_cpu_caps.has_altivec) */
1571      return lp_build_round_altivec(bld, a, mode);
1572 }
1573
1574 /**
1575  * Return the integer part of a float (vector) value (== round toward zero).
1576  * The returned value is a float (vector).
1577  * Ex: trunc(-1.5) = -1.0
1578  */
1579 LLVMValueRef
1580 lp_build_trunc(struct lp_build_context *bld,
1581                LLVMValueRef a)
1582 {
1583    LLVMBuilderRef builder = bld->gallivm->builder;
1584    const struct lp_type type = bld->type;
1585
1586    assert(type.floating);
1587    assert(lp_check_value(type, a));
1588
1589    if (arch_rounding_available(type)) {
1590       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
1591    }
1592    else {
1593       const struct lp_type type = bld->type;
1594       struct lp_type inttype;
1595       struct lp_build_context intbld;
1596       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 2^24);
1597       LLVMValueRef trunc, res, anosign, mask;
1598       LLVMTypeRef int_vec_type = bld->int_vec_type;
1599       LLVMTypeRef vec_type = bld->vec_type;
1600
1601       assert(type.width == 32); /* might want to handle doubles at some point */
1602
1603       inttype = type;
1604       inttype.floating = 0;
1605       lp_build_context_init(&intbld, bld->gallivm, inttype);
1606
1607       /* round by truncation */
1608       trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1609       res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
1610
1611       /* mask out sign bit */
1612       anosign = lp_build_abs(bld, a);
1613       /*
1614        * mask out all values if anosign > 2^24
1615        * This should work both for large ints (all rounding is no-op for them
1616        * because such floats are always exact) as well as special cases like
1617        * NaNs, Infs (taking advantage of the fact they use max exponent).
1618        * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
1619        */
1620       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1621       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1622       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1623       return lp_build_select(bld, mask, a, res);
1624    }
1625 }
1626
1627
1628 /**
1629  * Return float (vector) rounded to nearest integer (vector).  The returned
1630  * value is a float (vector).
1631  * Ex: round(0.9) = 1.0
1632  * Ex: round(-1.5) = -2.0
1633  */
1634 LLVMValueRef
1635 lp_build_round(struct lp_build_context *bld,
1636                LLVMValueRef a)
1637 {
1638    LLVMBuilderRef builder = bld->gallivm->builder;
1639    const struct lp_type type = bld->type;
1640
1641    assert(type.floating);
1642    assert(lp_check_value(type, a));
1643
1644    if (arch_rounding_available(type)) {
1645       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
1646    }
1647    else {
1648       const struct lp_type type = bld->type;
1649       struct lp_type inttype;
1650       struct lp_build_context intbld;
1651       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 2^24);
1652       LLVMValueRef res, anosign, mask;
1653       LLVMTypeRef int_vec_type = bld->int_vec_type;
1654       LLVMTypeRef vec_type = bld->vec_type;
1655
1656       assert(type.width == 32); /* might want to handle doubles at some point */
1657
1658       inttype = type;
1659       inttype.floating = 0;
1660       lp_build_context_init(&intbld, bld->gallivm, inttype);
1661
1662       res = lp_build_iround(bld, a);
1663       res = LLVMBuildSIToFP(builder, res, vec_type, "");
1664
1665       /* mask out sign bit */
1666       anosign = lp_build_abs(bld, a);
1667       /*
1668        * mask out all values if anosign > 2^24
1669        * This should work both for large ints (all rounding is no-op for them
1670        * because such floats are always exact) as well as special cases like
1671        * NaNs, Infs (taking advantage of the fact they use max exponent).
1672        * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
1673        */
1674       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1675       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1676       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1677       return lp_build_select(bld, mask, a, res);
1678    }
1679 }
1680
1681
1682 /**
1683  * Return floor of float (vector), result is a float (vector)
1684  * Ex: floor(1.1) = 1.0
1685  * Ex: floor(-1.1) = -2.0
1686  */
1687 LLVMValueRef
1688 lp_build_floor(struct lp_build_context *bld,
1689                LLVMValueRef a)
1690 {
1691    LLVMBuilderRef builder = bld->gallivm->builder;
1692    const struct lp_type type = bld->type;
1693
1694    assert(type.floating);
1695    assert(lp_check_value(type, a));
1696
1697    if (arch_rounding_available(type)) {
1698       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
1699    }
1700    else {
1701       const struct lp_type type = bld->type;
1702       struct lp_type inttype;
1703       struct lp_build_context intbld;
1704       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 2^24);
1705       LLVMValueRef trunc, res, anosign, mask;
1706       LLVMTypeRef int_vec_type = bld->int_vec_type;
1707       LLVMTypeRef vec_type = bld->vec_type;
1708
1709       assert(type.width == 32); /* might want to handle doubles at some point */
1710
1711       inttype = type;
1712       inttype.floating = 0;
1713       lp_build_context_init(&intbld, bld->gallivm, inttype);
1714
1715       /* round by truncation */
1716       trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1717       res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
1718
1719       if (type.sign) {
1720          LLVMValueRef tmp;
1721
1722          /*
1723           * fix values if rounding is wrong (for non-special cases)
1724           * - this is the case if trunc > a
1725           */
1726          mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
1727          /* tmp = trunc > a ? 1.0 : 0.0 */
1728          tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
1729          tmp = lp_build_and(&intbld, mask, tmp);
1730          tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
1731          res = lp_build_sub(bld, res, tmp);
1732       }
1733
1734       /* mask out sign bit */
1735       anosign = lp_build_abs(bld, a);
1736       /*
1737        * mask out all values if anosign > 2^24
1738        * This should work both for large ints (all rounding is no-op for them
1739        * because such floats are always exact) as well as special cases like
1740        * NaNs, Infs (taking advantage of the fact they use max exponent).
1741        * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
1742        */
1743       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1744       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1745       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1746       return lp_build_select(bld, mask, a, res);
1747    }
1748 }
1749
1750
1751 /**
1752  * Return ceiling of float (vector), returning float (vector).
1753  * Ex: ceil( 1.1) = 2.0
1754  * Ex: ceil(-1.1) = -1.0
1755  */
1756 LLVMValueRef
1757 lp_build_ceil(struct lp_build_context *bld,
1758               LLVMValueRef a)
1759 {
1760    LLVMBuilderRef builder = bld->gallivm->builder;
1761    const struct lp_type type = bld->type;
1762
1763    assert(type.floating);
1764    assert(lp_check_value(type, a));
1765
1766    if (arch_rounding_available(type)) {
1767       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
1768    }
1769    else {
1770       const struct lp_type type = bld->type;
1771       struct lp_type inttype;
1772       struct lp_build_context intbld;
1773       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 2^24);
1774       LLVMValueRef trunc, res, anosign, mask, tmp;
1775       LLVMTypeRef int_vec_type = bld->int_vec_type;
1776       LLVMTypeRef vec_type = bld->vec_type;
1777
1778       assert(type.width == 32); /* might want to handle doubles at some point */
1779
1780       inttype = type;
1781       inttype.floating = 0;
1782       lp_build_context_init(&intbld, bld->gallivm, inttype);
1783
1784       /* round by truncation */
1785       trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1786       trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
1787
1788       /*
1789        * fix values if rounding is wrong (for non-special cases)
1790        * - this is the case if trunc < a
1791        */
1792       mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
1793       /* tmp = trunc < a ? 1.0 : 0.0 */
1794       tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
1795       tmp = lp_build_and(&intbld, mask, tmp);
1796       tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
1797       res = lp_build_add(bld, trunc, tmp);
1798
1799       /* mask out sign bit */
1800       anosign = lp_build_abs(bld, a);
1801       /*
1802        * mask out all values if anosign > 2^24
1803        * This should work both for large ints (all rounding is no-op for them
1804        * because such floats are always exact) as well as special cases like
1805        * NaNs, Infs (taking advantage of the fact they use max exponent).
1806        * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
1807        */
1808       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1809       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1810       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1811       return lp_build_select(bld, mask, a, res);
1812    }
1813 }
1814
1815
1816 /**
1817  * Return fractional part of 'a' computed as a - floor(a)
1818  * Typically used in texture coord arithmetic.
1819  */
1820 LLVMValueRef
1821 lp_build_fract(struct lp_build_context *bld,
1822                LLVMValueRef a)
1823 {
1824    assert(bld->type.floating);
1825    return lp_build_sub(bld, a, lp_build_floor(bld, a));
1826 }
1827
1828
1829 /**
1830  * Prevent returning a fractional part of 1.0 for very small negative values of
1831  * 'a' by clamping against 0.99999(9).
1832  */
1833 static inline LLVMValueRef
1834 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
1835 {
1836    LLVMValueRef max;
1837
1838    /* this is the largest number smaller than 1.0 representable as float */
1839    max = lp_build_const_vec(bld->gallivm, bld->type,
1840                             1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
1841    return lp_build_min(bld, fract, max);
1842 }
1843
1844
1845 /**
1846  * Same as lp_build_fract, but guarantees that the result is always smaller
1847  * than one.
1848  */
1849 LLVMValueRef
1850 lp_build_fract_safe(struct lp_build_context *bld,
1851                     LLVMValueRef a)
1852 {
1853    return clamp_fract(bld, lp_build_fract(bld, a));
1854 }
1855
1856
1857 /**
1858  * Return the integer part of a float (vector) value (== round toward zero).
1859  * The returned value is an integer (vector).
1860  * Ex: itrunc(-1.5) = -1
1861  */
1862 LLVMValueRef
1863 lp_build_itrunc(struct lp_build_context *bld,
1864                 LLVMValueRef a)
1865 {
1866    LLVMBuilderRef builder = bld->gallivm->builder;
1867    const struct lp_type type = bld->type;
1868    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1869
1870    assert(type.floating);
1871    assert(lp_check_value(type, a));
1872
1873    return LLVMBuildFPToSI(builder, a, int_vec_type, "");
1874 }
1875
1876
1877 /**
1878  * Return float (vector) rounded to nearest integer (vector).  The returned
1879  * value is an integer (vector).
1880  * Ex: iround(0.9) = 1
1881  * Ex: iround(-1.5) = -2
1882  */
1883 LLVMValueRef
1884 lp_build_iround(struct lp_build_context *bld,
1885                 LLVMValueRef a)
1886 {
1887    LLVMBuilderRef builder = bld->gallivm->builder;
1888    const struct lp_type type = bld->type;
1889    LLVMTypeRef int_vec_type = bld->int_vec_type;
1890    LLVMValueRef res;
1891
1892    assert(type.floating);
1893
1894    assert(lp_check_value(type, a));
1895
1896    if ((util_cpu_caps.has_sse2 &&
1897        ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
1898        (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
1899       return lp_build_iround_nearest_sse2(bld, a);
1900    }
1901    if (arch_rounding_available(type)) {
1902       res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
1903    }
1904    else {
1905       LLVMValueRef half;
1906
1907       half = lp_build_const_vec(bld->gallivm, type, 0.5);
1908
1909       if (type.sign) {
1910          LLVMTypeRef vec_type = bld->vec_type;
1911          LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1912                                     (unsigned long long)1 << (type.width - 1));
1913          LLVMValueRef sign;
1914
1915          /* get sign bit */
1916          sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
1917          sign = LLVMBuildAnd(builder, sign, mask, "");
1918
1919          /* sign * 0.5 */
1920          half = LLVMBuildBitCast(builder, half, int_vec_type, "");
1921          half = LLVMBuildOr(builder, sign, half, "");
1922          half = LLVMBuildBitCast(builder, half, vec_type, "");
1923       }
1924
1925       res = LLVMBuildFAdd(builder, a, half, "");
1926    }
1927
1928    res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
1929
1930    return res;
1931 }
1932
1933
1934 /**
1935  * Return floor of float (vector), result is an int (vector)
1936  * Ex: ifloor(1.1) = 1.0
1937  * Ex: ifloor(-1.1) = -2.0
1938  */
1939 LLVMValueRef
1940 lp_build_ifloor(struct lp_build_context *bld,
1941                 LLVMValueRef a)
1942 {
1943    LLVMBuilderRef builder = bld->gallivm->builder;
1944    const struct lp_type type = bld->type;
1945    LLVMTypeRef int_vec_type = bld->int_vec_type;
1946    LLVMValueRef res;
1947
1948    assert(type.floating);
1949    assert(lp_check_value(type, a));
1950
1951    res = a;
1952    if (type.sign) {
1953       if (arch_rounding_available(type)) {
1954          res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
1955       }
1956       else {
1957          struct lp_type inttype;
1958          struct lp_build_context intbld;
1959          LLVMValueRef trunc, itrunc, mask;
1960
1961          assert(type.floating);
1962          assert(lp_check_value(type, a));
1963
1964          inttype = type;
1965          inttype.floating = 0;
1966          lp_build_context_init(&intbld, bld->gallivm, inttype);
1967
1968          /* round by truncation */
1969          itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1970          trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
1971
1972          /*
1973           * fix values if rounding is wrong (for non-special cases)
1974           * - this is the case if trunc > a
1975           * The results of doing this with NaNs, very large values etc.
1976           * are undefined but this seems to be the case anyway.
1977           */
1978          mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
1979          /* cheapie minus one with mask since the mask is minus one / zero */
1980          return lp_build_add(&intbld, itrunc, mask);
1981       }
1982    }
1983
1984    /* round to nearest (toward zero) */
1985    res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
1986
1987    return res;
1988 }
1989
1990
1991 /**
1992  * Return ceiling of float (vector), returning int (vector).
1993  * Ex: iceil( 1.1) = 2
1994  * Ex: iceil(-1.1) = -1
1995  */
1996 LLVMValueRef
1997 lp_build_iceil(struct lp_build_context *bld,
1998                LLVMValueRef a)
1999 {
2000    LLVMBuilderRef builder = bld->gallivm->builder;
2001    const struct lp_type type = bld->type;
2002    LLVMTypeRef int_vec_type = bld->int_vec_type;
2003    LLVMValueRef res;
2004
2005    assert(type.floating);
2006    assert(lp_check_value(type, a));
2007
2008    if (arch_rounding_available(type)) {
2009       res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2010    }
2011    else {
2012       struct lp_type inttype;
2013       struct lp_build_context intbld;
2014       LLVMValueRef trunc, itrunc, mask;
2015
2016       assert(type.floating);
2017       assert(lp_check_value(type, a));
2018
2019       inttype = type;
2020       inttype.floating = 0;
2021       lp_build_context_init(&intbld, bld->gallivm, inttype);
2022
2023       /* round by truncation */
2024       itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2025       trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2026
2027       /*
2028        * fix values if rounding is wrong (for non-special cases)
2029        * - this is the case if trunc < a
2030        * The results of doing this with NaNs, very large values etc.
2031        * are undefined but this seems to be the case anyway.
2032        */
2033       mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2034       /* cheapie plus one with mask since the mask is minus one / zero */
2035       return lp_build_sub(&intbld, itrunc, mask);
2036    }
2037
2038    /* round to nearest (toward zero) */
2039    res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2040
2041    return res;
2042 }
2043
2044
2045 /**
2046  * Combined ifloor() & fract().
2047  *
2048  * Preferred to calling the functions separately, as it will ensure that the
2049  * strategy (floor() vs ifloor()) that results in less redundant work is used.
2050  */
2051 void
2052 lp_build_ifloor_fract(struct lp_build_context *bld,
2053                       LLVMValueRef a,
2054                       LLVMValueRef *out_ipart,
2055                       LLVMValueRef *out_fpart)
2056 {
2057    LLVMBuilderRef builder = bld->gallivm->builder;
2058    const struct lp_type type = bld->type;
2059    LLVMValueRef ipart;
2060
2061    assert(type.floating);
2062    assert(lp_check_value(type, a));
2063
2064    if (arch_rounding_available(type)) {
2065       /*
2066        * floor() is easier.
2067        */
2068
2069       ipart = lp_build_floor(bld, a);
2070       *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2071       *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2072    }
2073    else {
2074       /*
2075        * ifloor() is easier.
2076        */
2077
2078       *out_ipart = lp_build_ifloor(bld, a);
2079       ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2080       *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2081    }
2082 }
2083
2084
2085 /**
2086  * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2087  * always smaller than one.
2088  */
2089 void
2090 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2091                            LLVMValueRef a,
2092                            LLVMValueRef *out_ipart,
2093                            LLVMValueRef *out_fpart)
2094 {
2095    lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2096    *out_fpart = clamp_fract(bld, *out_fpart);
2097 }
2098
2099
2100 LLVMValueRef
2101 lp_build_sqrt(struct lp_build_context *bld,
2102               LLVMValueRef a)
2103 {
2104    LLVMBuilderRef builder = bld->gallivm->builder;
2105    const struct lp_type type = bld->type;
2106    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2107    char intrinsic[32];
2108
2109    assert(lp_check_value(type, a));
2110
2111    /* TODO: optimize the constant case */
2112
2113    assert(type.floating);
2114    if (type.length == 1) {
2115       util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.f%u", type.width);
2116    }
2117    else {
2118       util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
2119    }
2120
2121    return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2122 }
2123
2124
2125 /**
2126  * Do one Newton-Raphson step to improve reciprocate precision:
2127  *
2128  *   x_{i+1} = x_i * (2 - a * x_i)
2129  *
2130  * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2131  * +/-Inf, giving NaN instead.  Certain applications rely on this behavior,
2132  * such as Google Earth, which does RCP(RSQRT(0.0) when drawing the Earth's
2133  * halo. It would be necessary to clamp the argument to prevent this.
2134  *
2135  * See also:
2136  * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2137  * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2138  */
2139 static INLINE LLVMValueRef
2140 lp_build_rcp_refine(struct lp_build_context *bld,
2141                     LLVMValueRef a,
2142                     LLVMValueRef rcp_a)
2143 {
2144    LLVMBuilderRef builder = bld->gallivm->builder;
2145    LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
2146    LLVMValueRef res;
2147
2148    res = LLVMBuildFMul(builder, a, rcp_a, "");
2149    res = LLVMBuildFSub(builder, two, res, "");
2150    res = LLVMBuildFMul(builder, rcp_a, res, "");
2151
2152    return res;
2153 }
2154
2155
2156 LLVMValueRef
2157 lp_build_rcp(struct lp_build_context *bld,
2158              LLVMValueRef a)
2159 {
2160    LLVMBuilderRef builder = bld->gallivm->builder;
2161    const struct lp_type type = bld->type;
2162
2163    assert(lp_check_value(type, a));
2164
2165    if(a == bld->zero)
2166       return bld->undef;
2167    if(a == bld->one)
2168       return bld->one;
2169    if(a == bld->undef)
2170       return bld->undef;
2171
2172    assert(type.floating);
2173
2174    if(LLVMIsConstant(a))
2175       return LLVMConstFDiv(bld->one, a);
2176
2177    /*
2178     * We don't use RCPPS because:
2179     * - it only has 10bits of precision
2180     * - it doesn't even get the reciprocate of 1.0 exactly
2181     * - doing Newton-Rapshon steps yields wrong (NaN) values for 0.0 or Inf
2182     * - for recent processors the benefit over DIVPS is marginal, a case
2183     *   dependent
2184     *
2185     * We could still use it on certain processors if benchmarks show that the
2186     * RCPPS plus necessary workarounds are still preferrable to DIVPS; or for
2187     * particular uses that require less workarounds.
2188     */
2189
2190    if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2191          (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2192       const unsigned num_iterations = 0;
2193       LLVMValueRef res;
2194       unsigned i;
2195       const char *intrinsic = NULL;
2196
2197       if (type.length == 4) {
2198          intrinsic = "llvm.x86.sse.rcp.ps";
2199       }
2200       else {
2201          intrinsic = "llvm.x86.avx.rcp.ps.256";
2202       }
2203
2204       res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2205
2206       for (i = 0; i < num_iterations; ++i) {
2207          res = lp_build_rcp_refine(bld, a, res);
2208       }
2209
2210       return res;
2211    }
2212
2213    return LLVMBuildFDiv(builder, bld->one, a, "");
2214 }
2215
2216
2217 /**
2218  * Do one Newton-Raphson step to improve rsqrt precision:
2219  *
2220  *   x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2221  *
2222  * See also Intel 64 and IA-32 Architectures Optimization Manual.
2223  */
2224 static INLINE LLVMValueRef
2225 lp_build_rsqrt_refine(struct lp_build_context *bld,
2226                       LLVMValueRef a,
2227                       LLVMValueRef rsqrt_a)
2228 {
2229    LLVMBuilderRef builder = bld->gallivm->builder;
2230    LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2231    LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2232    LLVMValueRef res;
2233
2234    res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2235    res = LLVMBuildFMul(builder, a, res, "");
2236    res = LLVMBuildFSub(builder, three, res, "");
2237    res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2238    res = LLVMBuildFMul(builder, half, res, "");
2239
2240    return res;
2241 }
2242
2243
2244 /**
2245  * Generate 1/sqrt(a).
2246  * Result is undefined for values < 0, infinity for +0.
2247  */
2248 LLVMValueRef
2249 lp_build_rsqrt(struct lp_build_context *bld,
2250                LLVMValueRef a)
2251 {
2252    LLVMBuilderRef builder = bld->gallivm->builder;
2253    const struct lp_type type = bld->type;
2254
2255    assert(lp_check_value(type, a));
2256
2257    assert(type.floating);
2258
2259    /*
2260     * This should be faster but all denormals will end up as infinity.
2261     */
2262    if (0 && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2263         (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))) {
2264       const unsigned num_iterations = 1;
2265       LLVMValueRef res;
2266       unsigned i;
2267       const char *intrinsic = NULL;
2268
2269       if (type.length == 4) {
2270          intrinsic = "llvm.x86.sse.rsqrt.ps";
2271       }
2272       else {
2273          intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2274       }
2275       if (num_iterations) {
2276          /*
2277           * Newton-Raphson will result in NaN instead of infinity for zero,
2278           * and NaN instead of zero for infinity.
2279           * Also, need to ensure rsqrt(1.0) == 1.0.
2280           * All numbers smaller than FLT_MIN will result in +infinity
2281           * (rsqrtps treats all denormals as zero).
2282           */
2283          /*
2284           * Certain non-c99 compilers don't know INFINITY and might not support
2285           * hacks to evaluate it at compile time neither.
2286           */
2287          const unsigned posinf_int = 0x7F800000;
2288          LLVMValueRef cmp;
2289          LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2290          LLVMValueRef inf = lp_build_const_int_vec(bld->gallivm, type, posinf_int);
2291
2292          inf = LLVMBuildBitCast(builder, inf, lp_build_vec_type(bld->gallivm, type), "");
2293
2294          res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2295
2296          for (i = 0; i < num_iterations; ++i) {
2297             res = lp_build_rsqrt_refine(bld, a, res);
2298          }
2299          cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2300          res = lp_build_select(bld, cmp, inf, res);
2301          cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2302          res = lp_build_select(bld, cmp, bld->zero, res);
2303          cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2304          res = lp_build_select(bld, cmp, bld->one, res);
2305       }
2306       else {
2307          /* rsqrt(1.0) != 1.0 here */
2308          res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2309
2310       }
2311
2312       return res;
2313    }
2314
2315    return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2316 }
2317
2318
2319 /**
2320  * Generate sin(a) using SSE2
2321  */
2322 LLVMValueRef
2323 lp_build_sin(struct lp_build_context *bld,
2324              LLVMValueRef a)
2325 {
2326    struct gallivm_state *gallivm = bld->gallivm;
2327    LLVMBuilderRef builder = gallivm->builder;
2328    struct lp_type int_type = lp_int_type(bld->type);
2329    LLVMBuilderRef b = builder;
2330
2331    /*
2332     *  take the absolute value,
2333     *  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2334     */
2335
2336    LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2337    LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2338
2339    LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2340    LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2341
2342    /*
2343     * extract the sign bit (upper one)
2344     * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
2345     */
2346    LLVMValueRef sig_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2347    LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i");
2348
2349    /*
2350     * scale by 4/Pi
2351     * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2352     */
2353
2354    LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2355    LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2356
2357    /*
2358     * store the integer part of y in mm0
2359     * emm2 = _mm_cvttps_epi32(y);
2360     */
2361
2362    LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2363
2364    /*
2365     * j=(j+1) & (~1) (see the cephes sources)
2366     * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2367     */
2368
2369    LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2370    LLVMValueRef emm2_add =  LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2371    /*
2372     * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2373     */
2374    LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2375    LLVMValueRef emm2_and =  LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2376
2377    /*
2378     * y = _mm_cvtepi32_ps(emm2);
2379     */
2380    LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2381
2382    /* get the swap sign flag
2383     * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
2384     */
2385    LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2386    LLVMValueRef emm0_and =  LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and");
2387
2388    /*
2389     * emm2 = _mm_slli_epi32(emm0, 29);
2390     */
2391    LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2392    LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit");
2393
2394    /*
2395     * get the polynom selection mask
2396     * there is one polynom for 0 <= x <= Pi/4
2397     * and another one for Pi/4<x<=Pi/2
2398     * Both branches will be computed.
2399     *
2400     * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2401     * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2402     */
2403
2404    LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2405    LLVMValueRef emm2_3 =  LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3");
2406    LLVMValueRef poly_mask = lp_build_compare(gallivm,
2407                                              int_type, PIPE_FUNC_EQUAL,
2408                                              emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2409    /*
2410     *   sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
2411     */
2412    LLVMValueRef sign_bit_1 =  LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit");
2413
2414    /*
2415     * _PS_CONST(minus_cephes_DP1, -0.78515625);
2416     * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2417     * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2418     */
2419    LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2420    LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2421    LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2422
2423    /*
2424     * The magic pass: "Extended precision modular arithmetic"
2425     * x = ((x - y * DP1) - y * DP2) - y * DP3;
2426     * xmm1 = _mm_mul_ps(y, xmm1);
2427     * xmm2 = _mm_mul_ps(y, xmm2);
2428     * xmm3 = _mm_mul_ps(y, xmm3);
2429     */
2430    LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
2431    LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
2432    LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
2433
2434    /*
2435     * x = _mm_add_ps(x, xmm1);
2436     * x = _mm_add_ps(x, xmm2);
2437     * x = _mm_add_ps(x, xmm3);
2438     */
2439
2440    LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
2441    LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
2442    LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
2443
2444    /*
2445     * Evaluate the first polynom  (0 <= x <= Pi/4)
2446     *
2447     * z = _mm_mul_ps(x,x);
2448     */
2449    LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2450
2451    /*
2452     * _PS_CONST(coscof_p0,  2.443315711809948E-005);
2453     * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2454     * _PS_CONST(coscof_p2,  4.166664568298827E-002);
2455     */
2456    LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2457    LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2458    LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2459
2460    /*
2461     * y = *(v4sf*)_ps_coscof_p0;
2462     * y = _mm_mul_ps(y, z);
2463     */
2464    LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
2465    LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
2466    LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
2467    LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
2468    LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2469    LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2470
2471
2472    /*
2473     * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2474     * y = _mm_sub_ps(y, tmp);
2475     * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2476     */
2477    LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2478    LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2479    LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2480    LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2481    LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2482
2483    /*
2484     * _PS_CONST(sincof_p0, -1.9515295891E-4);
2485     * _PS_CONST(sincof_p1,  8.3321608736E-3);
2486     * _PS_CONST(sincof_p2, -1.6666654611E-1);
2487     */
2488    LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2489    LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2490    LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2491
2492    /*
2493     * Evaluate the second polynom  (Pi/4 <= x <= 0)
2494     *
2495     * y2 = *(v4sf*)_ps_sincof_p0;
2496     * y2 = _mm_mul_ps(y2, z);
2497     * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2498     * y2 = _mm_mul_ps(y2, z);
2499     * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2500     * y2 = _mm_mul_ps(y2, z);
2501     * y2 = _mm_mul_ps(y2, x);
2502     * y2 = _mm_add_ps(y2, x);
2503     */
2504
2505    LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
2506    LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
2507    LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
2508    LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
2509    LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2510    LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
2511    LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
2512
2513    /*
2514     * select the correct result from the two polynoms
2515     * xmm3 = poly_mask;
2516     * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2517     * y = _mm_andnot_ps(xmm3, y);
2518     * y = _mm_add_ps(y,y2);
2519     */
2520    LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2521    LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2522    LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2523    LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0);
2524    LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
2525    LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2526    LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
2527
2528    /*
2529     * update the sign
2530     * y = _mm_xor_ps(y, sign_bit);
2531     */
2532    LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin");
2533    LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
2534    return y_result;
2535 }
2536
2537
2538 /**
2539  * Generate cos(a) using SSE2
2540  */
2541 LLVMValueRef
2542 lp_build_cos(struct lp_build_context *bld,
2543              LLVMValueRef a)
2544 {
2545    struct gallivm_state *gallivm = bld->gallivm;
2546    LLVMBuilderRef builder = gallivm->builder;
2547    struct lp_type int_type = lp_int_type(bld->type);
2548    LLVMBuilderRef b = builder;
2549
2550    /*
2551     *  take the absolute value,
2552     *  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2553     */
2554
2555    LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2556    LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2557
2558    LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2559    LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2560
2561    /*
2562     * scale by 4/Pi
2563     * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2564     */
2565
2566    LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2567    LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2568
2569    /*
2570     * store the integer part of y in mm0
2571     * emm2 = _mm_cvttps_epi32(y);
2572     */
2573
2574    LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2575
2576    /*
2577     * j=(j+1) & (~1) (see the cephes sources)
2578     * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2579     */
2580
2581    LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2582    LLVMValueRef emm2_add =  LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2583    /*
2584     * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2585     */
2586    LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2587    LLVMValueRef emm2_and =  LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2588
2589    /*
2590     * y = _mm_cvtepi32_ps(emm2);
2591     */
2592    LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2593
2594
2595    /*
2596     * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
2597     */
2598    LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2599    LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2");
2600
2601
2602    /* get the swap sign flag
2603     * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
2604     */
2605    LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0);
2606    LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not");
2607    LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2608    LLVMValueRef emm0_and =  LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and");
2609
2610    /*
2611     * emm2 = _mm_slli_epi32(emm0, 29);
2612     */
2613    LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2614    LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit");
2615
2616    /*
2617     * get the polynom selection mask
2618     * there is one polynom for 0 <= x <= Pi/4
2619     * and another one for Pi/4<x<=Pi/2
2620     * Both branches will be computed.
2621     *
2622     * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2623     * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2624     */
2625
2626    LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2627    LLVMValueRef emm2_3 =  LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3");
2628    LLVMValueRef poly_mask = lp_build_compare(gallivm,
2629                                              int_type, PIPE_FUNC_EQUAL,
2630                                              emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2631
2632    /*
2633     * _PS_CONST(minus_cephes_DP1, -0.78515625);
2634     * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2635     * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2636     */
2637    LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2638    LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2639    LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2640
2641    /*
2642     * The magic pass: "Extended precision modular arithmetic"
2643     * x = ((x - y * DP1) - y * DP2) - y * DP3;
2644     * xmm1 = _mm_mul_ps(y, xmm1);
2645     * xmm2 = _mm_mul_ps(y, xmm2);
2646     * xmm3 = _mm_mul_ps(y, xmm3);
2647     */
2648    LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
2649    LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
2650    LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
2651
2652    /*
2653     * x = _mm_add_ps(x, xmm1);
2654     * x = _mm_add_ps(x, xmm2);
2655     * x = _mm_add_ps(x, xmm3);
2656     */
2657
2658    LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
2659    LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
2660    LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
2661
2662    /*
2663     * Evaluate the first polynom  (0 <= x <= Pi/4)
2664     *
2665     * z = _mm_mul_ps(x,x);
2666     */
2667    LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2668
2669    /*
2670     * _PS_CONST(coscof_p0,  2.443315711809948E-005);
2671     * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2672     * _PS_CONST(coscof_p2,  4.166664568298827E-002);
2673     */
2674    LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2675    LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2676    LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2677
2678    /*
2679     * y = *(v4sf*)_ps_coscof_p0;
2680     * y = _mm_mul_ps(y, z);
2681     */
2682    LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
2683    LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
2684    LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
2685    LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
2686    LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2687    LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2688
2689
2690    /*
2691     * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2692     * y = _mm_sub_ps(y, tmp);
2693     * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2694     */
2695    LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2696    LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2697    LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2698    LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2699    LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2700
2701    /*
2702     * _PS_CONST(sincof_p0, -1.9515295891E-4);
2703     * _PS_CONST(sincof_p1,  8.3321608736E-3);
2704     * _PS_CONST(sincof_p2, -1.6666654611E-1);
2705     */
2706    LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2707    LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2708    LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2709
2710    /*
2711     * Evaluate the second polynom  (Pi/4 <= x <= 0)
2712     *
2713     * y2 = *(v4sf*)_ps_sincof_p0;
2714     * y2 = _mm_mul_ps(y2, z);
2715     * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2716     * y2 = _mm_mul_ps(y2, z);
2717     * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2718     * y2 = _mm_mul_ps(y2, z);
2719     * y2 = _mm_mul_ps(y2, x);
2720     * y2 = _mm_add_ps(y2, x);
2721     */
2722
2723    LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
2724    LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
2725    LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
2726    LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
2727    LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2728    LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
2729    LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
2730
2731    /*
2732     * select the correct result from the two polynoms
2733     * xmm3 = poly_mask;
2734     * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2735     * y = _mm_andnot_ps(xmm3, y);
2736     * y = _mm_add_ps(y,y2);
2737     */
2738    LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2739    LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2740    LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2741    LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
2742    LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2743    LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
2744
2745    /*
2746     * update the sign
2747     * y = _mm_xor_ps(y, sign_bit);
2748     */
2749    LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sin");
2750    LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
2751    return y_result;
2752 }
2753
2754
2755 /**
2756  * Generate pow(x, y)
2757  */
2758 LLVMValueRef
2759 lp_build_pow(struct lp_build_context *bld,
2760              LLVMValueRef x,
2761              LLVMValueRef y)
2762 {
2763    /* TODO: optimize the constant case */
2764    if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2765        LLVMIsConstant(x) && LLVMIsConstant(y)) {
2766       debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2767                    __FUNCTION__);
2768    }
2769
2770    return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
2771 }
2772
2773
2774 /**
2775  * Generate exp(x)
2776  */
2777 LLVMValueRef
2778 lp_build_exp(struct lp_build_context *bld,
2779              LLVMValueRef x)
2780 {
2781    /* log2(e) = 1/log(2) */
2782    LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
2783                                            1.4426950408889634);
2784
2785    assert(lp_check_value(bld->type, x));
2786
2787    return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
2788 }
2789
2790
2791 /**
2792  * Generate log(x)
2793  */
2794 LLVMValueRef
2795 lp_build_log(struct lp_build_context *bld,
2796              LLVMValueRef x)
2797 {
2798    /* log(2) */
2799    LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2800                                           0.69314718055994529);
2801
2802    assert(lp_check_value(bld->type, x));
2803
2804    return lp_build_mul(bld, log2, lp_build_log2(bld, x));
2805 }
2806
2807
2808 /**
2809  * Generate polynomial.
2810  * Ex:  coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
2811  */
2812 static LLVMValueRef
2813 lp_build_polynomial(struct lp_build_context *bld,
2814                     LLVMValueRef x,
2815                     const double *coeffs,
2816                     unsigned num_coeffs)
2817 {
2818    const struct lp_type type = bld->type;
2819    LLVMValueRef even = NULL, odd = NULL;
2820    LLVMValueRef x2;
2821    unsigned i;
2822
2823    assert(lp_check_value(bld->type, x));
2824
2825    /* TODO: optimize the constant case */
2826    if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2827        LLVMIsConstant(x)) {
2828       debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2829                    __FUNCTION__);
2830    }
2831
2832    /*
2833     * Calculate odd and even terms seperately to decrease data dependency
2834     * Ex:
2835     *     c[0] + x^2 * c[2] + x^4 * c[4] ...
2836     *     + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
2837     */
2838    x2 = lp_build_mul(bld, x, x);
2839
2840    for (i = num_coeffs; i--; ) {
2841       LLVMValueRef coeff;
2842
2843       coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
2844
2845       if (i % 2 == 0) {
2846          if (even)
2847             even = lp_build_add(bld, coeff, lp_build_mul(bld, x2, even));
2848          else
2849             even = coeff;
2850       } else {
2851          if (odd)
2852             odd = lp_build_add(bld, coeff, lp_build_mul(bld, x2, odd));
2853          else
2854             odd = coeff;
2855       }
2856    }
2857
2858    if (odd)
2859       return lp_build_add(bld, lp_build_mul(bld, odd, x), even);
2860    else if (even)
2861       return even;
2862    else
2863       return bld->undef;
2864 }
2865
2866
2867 /**
2868  * Minimax polynomial fit of 2**x, in range [0, 1[
2869  */
2870 const double lp_build_exp2_polynomial[] = {
2871 #if EXP_POLY_DEGREE == 5
2872    0.999999925063526176901,
2873    0.693153073200168932794,
2874    0.240153617044375388211,
2875    0.0558263180532956664775,
2876    0.00898934009049466391101,
2877    0.00187757667519147912699
2878 #elif EXP_POLY_DEGREE == 4
2879    1.00000259337069434683,
2880    0.693003834469974940458,
2881    0.24144275689150793076,
2882    0.0520114606103070150235,
2883    0.0135341679161270268764
2884 #elif EXP_POLY_DEGREE == 3
2885    0.999925218562710312959,
2886    0.695833540494823811697,
2887    0.226067155427249155588,
2888    0.0780245226406372992967
2889 #elif EXP_POLY_DEGREE == 2
2890    1.00172476321474503578,
2891    0.657636275736077639316,
2892    0.33718943461968720704
2893 #else
2894 #error
2895 #endif
2896 };
2897
2898
2899 void
2900 lp_build_exp2_approx(struct lp_build_context *bld,
2901                      LLVMValueRef x,
2902                      LLVMValueRef *p_exp2_int_part,
2903                      LLVMValueRef *p_frac_part,
2904                      LLVMValueRef *p_exp2)
2905 {
2906    LLVMBuilderRef builder = bld->gallivm->builder;
2907    const struct lp_type type = bld->type;
2908    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2909    LLVMValueRef ipart = NULL;
2910    LLVMValueRef fpart = NULL;
2911    LLVMValueRef expipart = NULL;
2912    LLVMValueRef expfpart = NULL;
2913    LLVMValueRef res = NULL;
2914
2915    assert(lp_check_value(bld->type, x));
2916
2917    if(p_exp2_int_part || p_frac_part || p_exp2) {
2918       /* TODO: optimize the constant case */
2919       if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2920           LLVMIsConstant(x)) {
2921          debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2922                       __FUNCTION__);
2923       }
2924
2925       assert(type.floating && type.width == 32);
2926
2927       x = lp_build_min(bld, x, lp_build_const_vec(bld->gallivm, type,  129.0));
2928       x = lp_build_max(bld, x, lp_build_const_vec(bld->gallivm, type, -126.99999));
2929
2930       /* ipart = floor(x) */
2931       /* fpart = x - ipart */
2932       lp_build_ifloor_fract(bld, x, &ipart, &fpart);
2933    }
2934
2935    if(p_exp2_int_part || p_exp2) {
2936       /* expipart = (float) (1 << ipart) */
2937       expipart = LLVMBuildAdd(builder, ipart,
2938                               lp_build_const_int_vec(bld->gallivm, type, 127), "");
2939       expipart = LLVMBuildShl(builder, expipart,
2940                               lp_build_const_int_vec(bld->gallivm, type, 23), "");
2941       expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
2942    }
2943
2944    if(p_exp2) {
2945       expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
2946                                      Elements(lp_build_exp2_polynomial));
2947
2948       res = LLVMBuildFMul(builder, expipart, expfpart, "");
2949    }
2950
2951    if(p_exp2_int_part)
2952       *p_exp2_int_part = expipart;
2953
2954    if(p_frac_part)
2955       *p_frac_part = fpart;
2956
2957    if(p_exp2)
2958       *p_exp2 = res;
2959 }
2960
2961
2962 LLVMValueRef
2963 lp_build_exp2(struct lp_build_context *bld,
2964               LLVMValueRef x)
2965 {
2966    LLVMValueRef res;
2967    lp_build_exp2_approx(bld, x, NULL, NULL, &res);
2968    return res;
2969 }
2970
2971
2972 /**
2973  * Extract the exponent of a IEEE-754 floating point value.
2974  *
2975  * Optionally apply an integer bias.
2976  *
2977  * Result is an integer value with
2978  *
2979  *   ifloor(log2(x)) + bias
2980  */
2981 LLVMValueRef
2982 lp_build_extract_exponent(struct lp_build_context *bld,
2983                           LLVMValueRef x,
2984                           int bias)
2985 {
2986    LLVMBuilderRef builder = bld->gallivm->builder;
2987    const struct lp_type type = bld->type;
2988    unsigned mantissa = lp_mantissa(type);
2989    LLVMValueRef res;
2990
2991    assert(type.floating);
2992
2993    assert(lp_check_value(bld->type, x));
2994
2995    x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
2996
2997    res = LLVMBuildLShr(builder, x,
2998                        lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
2999    res = LLVMBuildAnd(builder, res,
3000                       lp_build_const_int_vec(bld->gallivm, type, 255), "");
3001    res = LLVMBuildSub(builder, res,
3002                       lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3003
3004    return res;
3005 }
3006
3007
3008 /**
3009  * Extract the mantissa of the a floating.
3010  *
3011  * Result is a floating point value with
3012  *
3013  *   x / floor(log2(x))
3014  */
3015 LLVMValueRef
3016 lp_build_extract_mantissa(struct lp_build_context *bld,
3017                           LLVMValueRef x)
3018 {
3019    LLVMBuilderRef builder = bld->gallivm->builder;
3020    const struct lp_type type = bld->type;
3021    unsigned mantissa = lp_mantissa(type);
3022    LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3023                                                   (1ULL << mantissa) - 1);
3024    LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3025    LLVMValueRef res;
3026
3027    assert(lp_check_value(bld->type, x));
3028
3029    assert(type.floating);
3030
3031    x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3032
3033    /* res = x / 2**ipart */
3034    res = LLVMBuildAnd(builder, x, mantmask, "");
3035    res = LLVMBuildOr(builder, res, one, "");
3036    res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3037
3038    return res;
3039 }
3040
3041
3042
3043 /**
3044  * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x) ,for x in range of [0, 1/9[
3045  * These coefficients can be generate with
3046  * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3047  */
3048 const double lp_build_log2_polynomial[] = {
3049 #if LOG_POLY_DEGREE == 5
3050    2.88539008148777786488L,
3051    0.961796878841293367824L,
3052    0.577058946784739859012L,
3053    0.412914355135828735411L,
3054    0.308591899232910175289L,
3055    0.352376952300281371868L,
3056 #elif LOG_POLY_DEGREE == 4
3057    2.88539009343309178325L,
3058    0.961791550404184197881L,
3059    0.577440339438736392009L,
3060    0.403343858251329912514L,
3061    0.406718052498846252698L,
3062 #elif LOG_POLY_DEGREE == 3
3063    2.88538959748872753838L,
3064    0.961932915889597772928L,
3065    0.571118517972136195241L,
3066    0.493997535084709500285L,
3067 #else
3068 #error
3069 #endif
3070 };
3071
3072 /**
3073  * See http://www.devmaster.net/forums/showthread.php?p=43580
3074  * http://en.wikipedia.org/wiki/Logarithm#Calculation
3075  * http://www.nezumi.demon.co.uk/consult/logx.htm
3076  */
3077 void
3078 lp_build_log2_approx(struct lp_build_context *bld,
3079                      LLVMValueRef x,
3080                      LLVMValueRef *p_exp,
3081                      LLVMValueRef *p_floor_log2,
3082                      LLVMValueRef *p_log2)
3083 {
3084    LLVMBuilderRef builder = bld->gallivm->builder;
3085    const struct lp_type type = bld->type;
3086    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3087    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3088
3089    LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3090    LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3091    LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3092
3093    LLVMValueRef i = NULL;
3094    LLVMValueRef y = NULL;
3095    LLVMValueRef z = NULL;
3096    LLVMValueRef exp = NULL;
3097    LLVMValueRef mant = NULL;
3098    LLVMValueRef logexp = NULL;
3099    LLVMValueRef logmant = NULL;
3100    LLVMValueRef res = NULL;
3101
3102    assert(lp_check_value(bld->type, x));
3103
3104    if(p_exp || p_floor_log2 || p_log2) {
3105       /* TODO: optimize the constant case */
3106       if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3107           LLVMIsConstant(x)) {
3108          debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3109                       __FUNCTION__);
3110       }
3111
3112       assert(type.floating && type.width == 32);
3113
3114       /*
3115        * We don't explicitly handle denormalized numbers. They will yield a
3116        * result in the neighbourhood of -127, which appears to be adequate
3117        * enough.
3118        */
3119
3120       i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3121
3122       /* exp = (float) exponent(x) */
3123       exp = LLVMBuildAnd(builder, i, expmask, "");
3124    }
3125
3126    if(p_floor_log2 || p_log2) {
3127       logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3128       logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3129       logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3130    }
3131
3132    if(p_log2) {
3133       /* mant = 1 + (float) mantissa(x) */
3134       mant = LLVMBuildAnd(builder, i, mantmask, "");
3135       mant = LLVMBuildOr(builder, mant, one, "");
3136       mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3137
3138       /* y = (mant - 1) / (mant + 1) */
3139       y = lp_build_div(bld,
3140          lp_build_sub(bld, mant, bld->one),
3141          lp_build_add(bld, mant, bld->one)
3142       );
3143
3144       /* z = y^2 */
3145       z = lp_build_mul(bld, y, y);
3146
3147       /* compute P(z) */
3148       logmant = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3149                                     Elements(lp_build_log2_polynomial));
3150
3151       /* logmant = y * P(z) */
3152       logmant = lp_build_mul(bld, y, logmant);
3153
3154       res = lp_build_add(bld, logmant, logexp);
3155    }
3156
3157    if(p_exp) {
3158       exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3159       *p_exp = exp;
3160    }
3161
3162    if(p_floor_log2)
3163       *p_floor_log2 = logexp;
3164
3165    if(p_log2)
3166       *p_log2 = res;
3167 }
3168
3169
3170 LLVMValueRef
3171 lp_build_log2(struct lp_build_context *bld,
3172               LLVMValueRef x)
3173 {
3174    LLVMValueRef res;
3175    lp_build_log2_approx(bld, x, NULL, NULL, &res);
3176    return res;
3177 }
3178
3179
3180 /**
3181  * Faster (and less accurate) log2.
3182  *
3183  *    log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3184  *
3185  * Piece-wise linear approximation, with exact results when x is a
3186  * power of two.
3187  *
3188  * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3189  */
3190 LLVMValueRef
3191 lp_build_fast_log2(struct lp_build_context *bld,
3192                    LLVMValueRef x)
3193 {
3194    LLVMBuilderRef builder = bld->gallivm->builder;
3195    LLVMValueRef ipart;
3196    LLVMValueRef fpart;
3197
3198    assert(lp_check_value(bld->type, x));
3199
3200    assert(bld->type.floating);
3201
3202    /* ipart = floor(log2(x)) - 1 */
3203    ipart = lp_build_extract_exponent(bld, x, -1);
3204    ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3205
3206    /* fpart = x / 2**ipart */
3207    fpart = lp_build_extract_mantissa(bld, x);
3208
3209    /* ipart + fpart */
3210    return LLVMBuildFAdd(builder, ipart, fpart, "");
3211 }
3212
3213
3214 /**
3215  * Fast implementation of iround(log2(x)).
3216  *
3217  * Not an approximation -- it should give accurate results all the time.
3218  */
3219 LLVMValueRef
3220 lp_build_ilog2(struct lp_build_context *bld,
3221                LLVMValueRef x)
3222 {
3223    LLVMBuilderRef builder = bld->gallivm->builder;
3224    LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3225    LLVMValueRef ipart;
3226
3227    assert(bld->type.floating);
3228
3229    assert(lp_check_value(bld->type, x));
3230
3231    /* x * 2^(0.5)   i.e., add 0.5 to the log2(x) */
3232    x = LLVMBuildFMul(builder, x, sqrt2, "");
3233
3234    /* ipart = floor(log2(x) + 0.5)  */
3235    ipart = lp_build_extract_exponent(bld, x, 0);
3236
3237    return ipart;
3238 }
3239
3240 LLVMValueRef
3241 lp_build_mod(struct lp_build_context *bld,
3242              LLVMValueRef x,
3243              LLVMValueRef y)
3244 {
3245    LLVMBuilderRef builder = bld->gallivm->builder;
3246    LLVMValueRef res;
3247    const struct lp_type type = bld->type;
3248
3249    assert(lp_check_value(type, x));
3250    assert(lp_check_value(type, y));
3251
3252    if (type.floating)
3253       res = LLVMBuildFRem(builder, x, y, "");
3254    else if (type.sign)
3255       res = LLVMBuildSRem(builder, x, y, "");
3256    else
3257       res = LLVMBuildURem(builder, x, y, "");
3258    return res;
3259 }