util: remove util_is_pot in favor of util_is_power_of_two
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
 31  * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
 35  * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
 38  * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
 42  *   of knowing, such as when the source arguments are known to be in the [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include "util/u_memory.h"
49 #include "util/u_debug.h"
50 #include "util/u_math.h"
51 #include "util/u_string.h"
52 #include "util/u_cpu_detect.h"
53
54 #include "lp_bld_type.h"
55 #include "lp_bld_const.h"
56 #include "lp_bld_intr.h"
57 #include "lp_bld_logic.h"
58 #include "lp_bld_pack.h"
59 #include "lp_bld_arit.h"
60
61
62 #define EXP_POLY_DEGREE 3
63
64 #define LOG_POLY_DEGREE 5
65
66
67 /**
68 * Generate min(a, b)
 69  * No checks for special-case values of a or b (such as 0 or 1) are done.
70 */
71 static LLVMValueRef
72 lp_build_min_simple(struct lp_build_context *bld,
73 LLVMValueRef a,
74 LLVMValueRef b)
75 {
76 const struct lp_type type = bld->type;
77 const char *intrinsic = NULL;
78 LLVMValueRef cond;
79
80 assert(lp_check_value(type, a));
81 assert(lp_check_value(type, b));
82
83 /* TODO: optimize the constant case */
84
85 if(type.width * type.length == 128) {
86 if(type.floating) {
87 if(type.width == 32 && util_cpu_caps.has_sse)
88 intrinsic = "llvm.x86.sse.min.ps";
89 if(type.width == 64 && util_cpu_caps.has_sse2)
90 intrinsic = "llvm.x86.sse2.min.pd";
91 }
92 else {
93 if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
94 intrinsic = "llvm.x86.sse2.pminu.b";
95 if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
96 intrinsic = "llvm.x86.sse41.pminsb";
97 if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
98 intrinsic = "llvm.x86.sse41.pminuw";
99 if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
100 intrinsic = "llvm.x86.sse2.pmins.w";
101 if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
102 intrinsic = "llvm.x86.sse41.pminud";
103 if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
104 intrinsic = "llvm.x86.sse41.pminsd";
105 }
106 }
107
108 if(intrinsic)
109 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
110
111 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
112 return lp_build_select(bld, cond, a, b);
113 }
114
115
116 /**
117 * Generate max(a, b)
118  * No checks for special-case values of a or b (such as 0 or 1) are done.
119 */
120 static LLVMValueRef
121 lp_build_max_simple(struct lp_build_context *bld,
122 LLVMValueRef a,
123 LLVMValueRef b)
124 {
125 const struct lp_type type = bld->type;
126 const char *intrinsic = NULL;
127 LLVMValueRef cond;
128
129 assert(lp_check_value(type, a));
130 assert(lp_check_value(type, b));
131
132 /* TODO: optimize the constant case */
133
134 if(type.width * type.length == 128) {
135 if(type.floating) {
136 if(type.width == 32 && util_cpu_caps.has_sse)
137 intrinsic = "llvm.x86.sse.max.ps";
138 if(type.width == 64 && util_cpu_caps.has_sse2)
139 intrinsic = "llvm.x86.sse2.max.pd";
140 }
141 else {
142 if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
143 intrinsic = "llvm.x86.sse2.pmaxu.b";
144 if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
145 intrinsic = "llvm.x86.sse41.pmaxsb";
146 if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
147 intrinsic = "llvm.x86.sse41.pmaxuw";
148 if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
149 intrinsic = "llvm.x86.sse2.pmaxs.w";
150 if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
151 intrinsic = "llvm.x86.sse41.pmaxud";
152 if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
153 intrinsic = "llvm.x86.sse41.pmaxsd";
154 }
155 }
156
157 if(intrinsic)
158 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
159
160 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
161 return lp_build_select(bld, cond, a, b);
162 }
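
/*
 * Illustrative sketch (not part of the build; the helper names are
 * hypothetical): when no SSE intrinsic applies, the fallback above reduces
 * min/max to a compare followed by a select, i.e. the scalar equivalents
 * below.
 */
#if 0
static float
min_scalar_model(float a, float b)
{
   return a < b ? a : b;    /* cond = (a < b); select(cond, a, b) */
}

static float
max_scalar_model(float a, float b)
{
   return a > b ? a : b;    /* cond = (a > b); select(cond, a, b) */
}
#endif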
163
164
165 /**
166 * Generate 1 - a, or ~a depending on bld->type.
167 */
168 LLVMValueRef
169 lp_build_comp(struct lp_build_context *bld,
170 LLVMValueRef a)
171 {
172 const struct lp_type type = bld->type;
173
174 assert(lp_check_value(type, a));
175
176 if(a == bld->one)
177 return bld->zero;
178 if(a == bld->zero)
179 return bld->one;
180
181 if(type.norm && !type.floating && !type.fixed && !type.sign) {
182 if(LLVMIsConstant(a))
183 return LLVMConstNot(a);
184 else
185 return LLVMBuildNot(bld->builder, a, "");
186 }
187
188 if(LLVMIsConstant(a))
189 if (type.floating)
190 return LLVMConstFSub(bld->one, a);
191 else
192 return LLVMConstSub(bld->one, a);
193 else
194 if (type.floating)
195 return LLVMBuildFSub(bld->builder, bld->one, a, "");
196 else
197 return LLVMBuildSub(bld->builder, bld->one, a, "");
198 }
199
200
201 /**
202 * Generate a + b
203 */
204 LLVMValueRef
205 lp_build_add(struct lp_build_context *bld,
206 LLVMValueRef a,
207 LLVMValueRef b)
208 {
209 const struct lp_type type = bld->type;
210 LLVMValueRef res;
211
212 assert(lp_check_value(type, a));
213 assert(lp_check_value(type, b));
214
215 if(a == bld->zero)
216 return b;
217 if(b == bld->zero)
218 return a;
219 if(a == bld->undef || b == bld->undef)
220 return bld->undef;
221
222 if(bld->type.norm) {
223 const char *intrinsic = NULL;
224
225 if(a == bld->one || b == bld->one)
226 return bld->one;
227
228 if(util_cpu_caps.has_sse2 &&
229 type.width * type.length == 128 &&
230 !type.floating && !type.fixed) {
231 if(type.width == 8)
232 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
233 if(type.width == 16)
234 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
235 }
236
237 if(intrinsic)
238 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
239 }
240
241 if(LLVMIsConstant(a) && LLVMIsConstant(b))
242 if (type.floating)
243 res = LLVMConstFAdd(a, b);
244 else
245 res = LLVMConstAdd(a, b);
246 else
247 if (type.floating)
248 res = LLVMBuildFAdd(bld->builder, a, b, "");
249 else
250 res = LLVMBuildAdd(bld->builder, a, b, "");
251
252 /* clamp to ceiling of 1.0 */
253 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
254 res = lp_build_min_simple(bld, res, bld->one);
255
256 /* XXX clamp to floor of -1 or 0??? */
257
258 return res;
259 }
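
/*
 * Illustrative sketch (not part of the build; the helper name is
 * hypothetical): for normalized unsigned 8-bit values the PADDUSB path
 * above behaves like this scalar saturating add.
 */
#if 0
static unsigned char
add_u8_sat_scalar_model(unsigned char a, unsigned char b)
{
   unsigned sum = (unsigned)a + (unsigned)b;
   return (unsigned char)(sum > 255 ? 255 : sum);   /* clamp to 1.0 (= 255) */
}
#endif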
260
261
262 /** Return the sum of the elements of a */
263 LLVMValueRef
264 lp_build_sum_vector(struct lp_build_context *bld,
265 LLVMValueRef a)
266 {
267 const struct lp_type type = bld->type;
268 LLVMValueRef index, res;
269 unsigned i;
270
271 assert(lp_check_value(type, a));
272
273 if (a == bld->zero)
274 return bld->zero;
275 if (a == bld->undef)
276 return bld->undef;
277 assert(type.length > 1);
278
279 assert(!bld->type.norm);
280
281 index = LLVMConstInt(LLVMInt32Type(), 0, 0);
282 res = LLVMBuildExtractElement(bld->builder, a, index, "");
283
284 for (i = 1; i < type.length; i++) {
285 index = LLVMConstInt(LLVMInt32Type(), i, 0);
286 if (type.floating)
287 res = LLVMBuildFAdd(bld->builder, res,
288 LLVMBuildExtractElement(bld->builder,
289 a, index, ""),
290 "");
291 else
292 res = LLVMBuildAdd(bld->builder, res,
293 LLVMBuildExtractElement(bld->builder,
294 a, index, ""),
295 "");
296 }
297
298 return res;
299 }
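
/*
 * Illustrative sketch (not part of the build; the helper name is
 * hypothetical) of the horizontal reduction above: elements are extracted
 * one by one and accumulated left to right.
 */
#if 0
static float
sum_vector_scalar_model(const float *a, unsigned length)
{
   float res = a[0];
   unsigned i;
   for (i = 1; i < length; i++)
      res += a[i];
   return res;
}
#endif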
300
301
302 /**
303 * Generate a - b
304 */
305 LLVMValueRef
306 lp_build_sub(struct lp_build_context *bld,
307 LLVMValueRef a,
308 LLVMValueRef b)
309 {
310 const struct lp_type type = bld->type;
311 LLVMValueRef res;
312
313 assert(lp_check_value(type, a));
314 assert(lp_check_value(type, b));
315
316 if(b == bld->zero)
317 return a;
318 if(a == bld->undef || b == bld->undef)
319 return bld->undef;
320 if(a == b)
321 return bld->zero;
322
323 if(bld->type.norm) {
324 const char *intrinsic = NULL;
325
326 if(b == bld->one)
327 return bld->zero;
328
329 if(util_cpu_caps.has_sse2 &&
330 type.width * type.length == 128 &&
331 !type.floating && !type.fixed) {
332 if(type.width == 8)
333 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
334 if(type.width == 16)
335 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
336 }
337
338 if(intrinsic)
339 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
340 }
341
342 if(LLVMIsConstant(a) && LLVMIsConstant(b))
343 if (type.floating)
344 res = LLVMConstFSub(a, b);
345 else
346 res = LLVMConstSub(a, b);
347 else
348 if (type.floating)
349 res = LLVMBuildFSub(bld->builder, a, b, "");
350 else
351 res = LLVMBuildSub(bld->builder, a, b, "");
352
353 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
354 res = lp_build_max_simple(bld, res, bld->zero);
355
356 return res;
357 }
358
359
360 /**
361 * Normalized 8bit multiplication.
362 *
363 * - alpha plus one
364 *
365 * makes the following approximation to the division (Sree)
366 *
367  *     a*b/255 ~= (a*(b + 1)) >> 8
368 *
369 * which is the fastest method that satisfies the following OpenGL criteria
370 *
371 * 0*0 = 0 and 255*255 = 255
372 *
373 * - geometric series
374 *
375 * takes the geometric series approximation to the division
376 *
377 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
378 *
379 * in this case just the first two terms to fit in 16bit arithmetic
380 *
381 * t/255 ~= (t + (t >> 8)) >> 8
382 *
383  *     note that just by itself it doesn't satisfy the OpenGL criteria, as
384  *     255*255 yields 254, so the special case b = 255 must be accounted for,
385  *     or roundoff must be used
386 *
387 * - geometric series plus rounding
388 *
389 * when using a geometric series division instead of truncating the result
390 * use roundoff in the approximation (Jim Blinn)
391 *
392 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
393 *
394  *     achieving exact results
395 *
396 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
397 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
398 * @sa Michael Herf, The "double blend trick", May 2000,
399 * http://www.stereopsis.com/doubleblend.html
400 */
401 static LLVMValueRef
402 lp_build_mul_u8n(LLVMBuilderRef builder,
403 struct lp_type i16_type,
404 LLVMValueRef a, LLVMValueRef b)
405 {
406 LLVMValueRef c8;
407 LLVMValueRef ab;
408
409 assert(!i16_type.floating);
410 assert(lp_check_value(i16_type, a));
411 assert(lp_check_value(i16_type, b));
412
413 c8 = lp_build_const_int_vec(i16_type, 8);
414
415 #if 0
416
417    /* a*b/255 ~= (a*(b + 1)) >> 8 */
418 b = LLVMBuildAdd(builder, b, lp_build_const_int_vec(i16_type, 1), "");
419 ab = LLVMBuildMul(builder, a, b, "");
420
421 #else
422
423 /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */
424 ab = LLVMBuildMul(builder, a, b, "");
425 ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c8, ""), "");
426 ab = LLVMBuildAdd(builder, ab, lp_build_const_int_vec(i16_type, 0x80), "");
427
428 #endif
429
430 ab = LLVMBuildLShr(builder, ab, c8, "");
431
432 return ab;
433 }
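
/*
 * Illustrative sketch (not part of the build; the helper name is
 * hypothetical): Blinn's rounded geometric-series division above, in
 * scalar form. It hits the 0*0 = 0 and 255*255 = 255 endpoints exactly.
 */
#if 0
static unsigned char
mul_u8n_scalar_model(unsigned char a, unsigned char b)
{
   unsigned ab = (unsigned)a * (unsigned)b;
   /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */
   return (unsigned char)((ab + (ab >> 8) + 0x80) >> 8);
}
#endif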
434
435
436 /**
437 * Generate a * b
438 */
439 LLVMValueRef
440 lp_build_mul(struct lp_build_context *bld,
441 LLVMValueRef a,
442 LLVMValueRef b)
443 {
444 const struct lp_type type = bld->type;
445 LLVMValueRef shift;
446 LLVMValueRef res;
447
448 assert(lp_check_value(type, a));
449 assert(lp_check_value(type, b));
450
451 if(a == bld->zero)
452 return bld->zero;
453 if(a == bld->one)
454 return b;
455 if(b == bld->zero)
456 return bld->zero;
457 if(b == bld->one)
458 return a;
459 if(a == bld->undef || b == bld->undef)
460 return bld->undef;
461
462 if(!type.floating && !type.fixed && type.norm) {
463 if(type.width == 8) {
464 struct lp_type i16_type = lp_wider_type(type);
465 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
466
467 lp_build_unpack2(bld->builder, type, i16_type, a, &al, &ah);
468 lp_build_unpack2(bld->builder, type, i16_type, b, &bl, &bh);
469
470 /* PMULLW, PSRLW, PADDW */
471 abl = lp_build_mul_u8n(bld->builder, i16_type, al, bl);
472 abh = lp_build_mul_u8n(bld->builder, i16_type, ah, bh);
473
474 ab = lp_build_pack2(bld->builder, i16_type, type, abl, abh);
475
476 return ab;
477 }
478
479 /* FIXME */
480 assert(0);
481 }
482
483 if(type.fixed)
484 shift = lp_build_const_int_vec(type, type.width/2);
485 else
486 shift = NULL;
487
488 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
489 if (type.floating)
490 res = LLVMConstFMul(a, b);
491 else
492 res = LLVMConstMul(a, b);
493 if(shift) {
494 if(type.sign)
495 res = LLVMConstAShr(res, shift);
496 else
497 res = LLVMConstLShr(res, shift);
498 }
499 }
500 else {
501 if (type.floating)
502 res = LLVMBuildFMul(bld->builder, a, b, "");
503 else
504 res = LLVMBuildMul(bld->builder, a, b, "");
505 if(shift) {
506 if(type.sign)
507 res = LLVMBuildAShr(bld->builder, res, shift, "");
508 else
509 res = LLVMBuildLShr(bld->builder, res, shift, "");
510 }
511 }
512
513 return res;
514 }
515
516
517 /**
518 * Small vector x scale multiplication optimization.
519 */
520 LLVMValueRef
521 lp_build_mul_imm(struct lp_build_context *bld,
522 LLVMValueRef a,
523 int b)
524 {
525 LLVMValueRef factor;
526
527 assert(lp_check_value(bld->type, a));
528
529 if(b == 0)
530 return bld->zero;
531
532 if(b == 1)
533 return a;
534
535 if(b == -1)
536 return lp_build_negate(bld, a);
537
538 if(b == 2 && bld->type.floating)
539 return lp_build_add(bld, a, a);
540
541 if(util_is_power_of_two(b)) {
542 unsigned shift = ffs(b) - 1;
543
544 if(bld->type.floating) {
545 #if 0
546 /*
547 * Power of two multiplication by directly manipulating the mantissa.
548 *
549 * XXX: This might not be always faster, it will introduce a small error
550 * for multiplication by zero, and it will produce wrong results
551 * for Inf and NaN.
552 */
553 unsigned mantissa = lp_mantissa(bld->type);
554 factor = lp_build_const_int_vec(bld->type, (unsigned long long)shift << mantissa);
555 a = LLVMBuildBitCast(bld->builder, a, lp_build_int_vec_type(bld->type), "");
556 a = LLVMBuildAdd(bld->builder, a, factor, "");
557 a = LLVMBuildBitCast(bld->builder, a, lp_build_vec_type(bld->type), "");
558 return a;
559 #endif
560 }
561 else {
562 factor = lp_build_const_vec(bld->type, shift);
563 return LLVMBuildShl(bld->builder, a, factor, "");
564 }
565 }
566
567 factor = lp_build_const_vec(bld->type, (double)b);
568 return lp_build_mul(bld, a, factor);
569 }
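
/*
 * Illustrative sketch (not part of the build; the helper name is
 * hypothetical) of the strength reduction above for the integer case:
 * multiplying by a power of two is a left shift by log2(b) bits.
 */
#if 0
static int
mul_imm_scalar_model(int a, int b)
{
   unsigned shift = 0;
   /* assumes b is a positive power of two, as guarded above */
   while ((1 << shift) != b)
      shift++;
   return a << shift;
}
#endif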
570
571
572 /**
573 * Generate a / b
574 */
575 LLVMValueRef
576 lp_build_div(struct lp_build_context *bld,
577 LLVMValueRef a,
578 LLVMValueRef b)
579 {
580 const struct lp_type type = bld->type;
581
582 assert(lp_check_value(type, a));
583 assert(lp_check_value(type, b));
584
585 if(a == bld->zero)
586 return bld->zero;
587 if(a == bld->one)
588 return lp_build_rcp(bld, b);
589 if(b == bld->zero)
590 return bld->undef;
591 if(b == bld->one)
592 return a;
593 if(a == bld->undef || b == bld->undef)
594 return bld->undef;
595
596 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
597 if (type.floating)
598 return LLVMConstFDiv(a, b);
599 else if (type.sign)
600 return LLVMConstSDiv(a, b);
601 else
602 return LLVMConstUDiv(a, b);
603 }
604
605 if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
606 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
607
608 if (type.floating)
609 return LLVMBuildFDiv(bld->builder, a, b, "");
610 else if (type.sign)
611 return LLVMBuildSDiv(bld->builder, a, b, "");
612 else
613 return LLVMBuildUDiv(bld->builder, a, b, "");
614 }
615
616
617 /**
618 * Linear interpolation.
619 *
620 * This also works for integer values with a few caveats.
621 *
622 * @sa http://www.stereopsis.com/doubleblend.html
623 */
624 LLVMValueRef
625 lp_build_lerp(struct lp_build_context *bld,
626 LLVMValueRef x,
627 LLVMValueRef v0,
628 LLVMValueRef v1)
629 {
630 LLVMValueRef delta;
631 LLVMValueRef res;
632
633 assert(lp_check_value(bld->type, x));
634 assert(lp_check_value(bld->type, v0));
635 assert(lp_check_value(bld->type, v1));
636
637 delta = lp_build_sub(bld, v1, v0);
638
639 res = lp_build_mul(bld, x, delta);
640
641 res = lp_build_add(bld, v0, res);
642
643 if(bld->type.fixed)
644       /* XXX: This step is necessary for lerping 8-bit colors stored in 16 bits,
645 * but it will be wrong for other uses. Basically we need a more
646 * powerful lp_type, capable of further distinguishing the values
647 * interpretation from the value storage. */
648 res = LLVMBuildAnd(bld->builder, res, lp_build_const_int_vec(bld->type, (1 << bld->type.width/2) - 1), "");
649
650 return res;
651 }
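
/*
 * Illustrative sketch (not part of the build; the helper name is
 * hypothetical): the lerp above is computed as v0 + x*(v1 - v0), one
 * multiply instead of the two in (1 - x)*v0 + x*v1.
 */
#if 0
static float
lerp_scalar_model(float x, float v0, float v1)
{
   return v0 + x * (v1 - v0);
}
#endif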
652
653
654 LLVMValueRef
655 lp_build_lerp_2d(struct lp_build_context *bld,
656 LLVMValueRef x,
657 LLVMValueRef y,
658 LLVMValueRef v00,
659 LLVMValueRef v01,
660 LLVMValueRef v10,
661 LLVMValueRef v11)
662 {
663 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01);
664 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11);
665 return lp_build_lerp(bld, y, v0, v1);
666 }
667
668
669 /**
670 * Generate min(a, b)
671 * Do checks for special cases.
672 */
673 LLVMValueRef
674 lp_build_min(struct lp_build_context *bld,
675 LLVMValueRef a,
676 LLVMValueRef b)
677 {
678 assert(lp_check_value(bld->type, a));
679 assert(lp_check_value(bld->type, b));
680
681 if(a == bld->undef || b == bld->undef)
682 return bld->undef;
683
684 if(a == b)
685 return a;
686
687 if(bld->type.norm) {
688 if(a == bld->zero || b == bld->zero)
689 return bld->zero;
690 if(a == bld->one)
691 return b;
692 if(b == bld->one)
693 return a;
694 }
695
696 return lp_build_min_simple(bld, a, b);
697 }
698
699
700 /**
701 * Generate max(a, b)
702 * Do checks for special cases.
703 */
704 LLVMValueRef
705 lp_build_max(struct lp_build_context *bld,
706 LLVMValueRef a,
707 LLVMValueRef b)
708 {
709 assert(lp_check_value(bld->type, a));
710 assert(lp_check_value(bld->type, b));
711
712 if(a == bld->undef || b == bld->undef)
713 return bld->undef;
714
715 if(a == b)
716 return a;
717
718 if(bld->type.norm) {
719 if(a == bld->one || b == bld->one)
720 return bld->one;
721 if(a == bld->zero)
722 return b;
723 if(b == bld->zero)
724 return a;
725 }
726
727 return lp_build_max_simple(bld, a, b);
728 }
729
730
731 /**
732 * Generate clamp(a, min, max)
733 * Do checks for special cases.
734 */
735 LLVMValueRef
736 lp_build_clamp(struct lp_build_context *bld,
737 LLVMValueRef a,
738 LLVMValueRef min,
739 LLVMValueRef max)
740 {
741 assert(lp_check_value(bld->type, a));
742 assert(lp_check_value(bld->type, min));
743 assert(lp_check_value(bld->type, max));
744
745 a = lp_build_min(bld, a, max);
746 a = lp_build_max(bld, a, min);
747 return a;
748 }
749
750
751 /**
752 * Generate abs(a)
753 */
754 LLVMValueRef
755 lp_build_abs(struct lp_build_context *bld,
756 LLVMValueRef a)
757 {
758 const struct lp_type type = bld->type;
759 LLVMTypeRef vec_type = lp_build_vec_type(type);
760
761 assert(lp_check_value(type, a));
762
763 if(!type.sign)
764 return a;
765
766 if(type.floating) {
767 /* Mask out the sign bit */
768 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
769 unsigned long long absMask = ~(1ULL << (type.width - 1));
770 LLVMValueRef mask = lp_build_const_int_vec(type, ((unsigned long long) absMask));
771 a = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
772 a = LLVMBuildAnd(bld->builder, a, mask, "");
773 a = LLVMBuildBitCast(bld->builder, a, vec_type, "");
774 return a;
775 }
776
777 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
778 switch(type.width) {
779 case 8:
780 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
781 case 16:
782 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
783 case 32:
784 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
785 }
786 }
787
788 return lp_build_max(bld, a, LLVMBuildNeg(bld->builder, a, ""));
789 }
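
/*
 * Illustrative sketch (not part of the build; the helper name is
 * hypothetical) of the floating-point path above, assuming 32-bit
 * IEEE-754 floats: clearing the top (sign) bit yields the absolute value.
 */
#if 0
static float
abs_scalar_model(float a)
{
   union { float f; unsigned u; } v;
   v.f = a;
   v.u &= ~(1u << 31);   /* mask out the sign bit */
   return v.f;
}
#endif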
790
791
792 LLVMValueRef
793 lp_build_negate(struct lp_build_context *bld,
794 LLVMValueRef a)
795 {
796 assert(lp_check_value(bld->type, a));
797
798 #if HAVE_LLVM >= 0x0207
799 if (bld->type.floating)
800 a = LLVMBuildFNeg(bld->builder, a, "");
801 else
802 #endif
803 a = LLVMBuildNeg(bld->builder, a, "");
804
805 return a;
806 }
807
808
809 /** Return -1, 0 or +1 depending on the sign of a */
810 LLVMValueRef
811 lp_build_sgn(struct lp_build_context *bld,
812 LLVMValueRef a)
813 {
814 const struct lp_type type = bld->type;
815 LLVMValueRef cond;
816 LLVMValueRef res;
817
818 assert(lp_check_value(type, a));
819
820 /* Handle non-zero case */
821 if(!type.sign) {
822 /* if not zero then sign must be positive */
823 res = bld->one;
824 }
825 else if(type.floating) {
826 LLVMTypeRef vec_type;
827 LLVMTypeRef int_type;
828 LLVMValueRef mask;
829 LLVMValueRef sign;
830 LLVMValueRef one;
831 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
832
833 int_type = lp_build_int_vec_type(type);
834 vec_type = lp_build_vec_type(type);
835 mask = lp_build_const_int_vec(type, maskBit);
836
837 /* Take the sign bit and add it to 1 constant */
838 sign = LLVMBuildBitCast(bld->builder, a, int_type, "");
839 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
840 one = LLVMConstBitCast(bld->one, int_type);
841 res = LLVMBuildOr(bld->builder, sign, one, "");
842 res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
843 }
844 else
845 {
846 LLVMValueRef minus_one = lp_build_const_vec(type, -1.0);
847 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
848 res = lp_build_select(bld, cond, bld->one, minus_one);
849 }
850
851 /* Handle zero */
852 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
853 res = lp_build_select(bld, cond, bld->zero, res);
854
855 return res;
856 }
857
858
859 /**
860 * Set the sign of float vector 'a' according to 'sign'.
861 * If sign==0, return abs(a).
862  * If sign==1, return -abs(a).
863 * Other values for sign produce undefined results.
864 */
865 LLVMValueRef
866 lp_build_set_sign(struct lp_build_context *bld,
867 LLVMValueRef a, LLVMValueRef sign)
868 {
869 const struct lp_type type = bld->type;
870 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
871 LLVMTypeRef vec_type = lp_build_vec_type(type);
872 LLVMValueRef shift = lp_build_const_int_vec(type, type.width - 1);
873 LLVMValueRef mask = lp_build_const_int_vec(type,
874 ~((unsigned long long) 1 << (type.width - 1)));
875 LLVMValueRef val, res;
876
877 assert(type.floating);
878 assert(lp_check_value(type, a));
879
880 /* val = reinterpret_cast<int>(a) */
881 val = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
882 /* val = val & mask */
883 val = LLVMBuildAnd(bld->builder, val, mask, "");
884 /* sign = sign << shift */
885 sign = LLVMBuildShl(bld->builder, sign, shift, "");
886 /* res = val | sign */
887 res = LLVMBuildOr(bld->builder, val, sign, "");
888 /* res = reinterpret_cast<float>(res) */
889 res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
890
891 return res;
892 }
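
/*
 * Illustrative sketch (not part of the build; the helper name is
 * hypothetical) of lp_build_set_sign for 32-bit floats: keep the
 * magnitude bits of 'a' and OR 'sign' into the sign-bit position, so
 * sign==0 gives abs(a) and sign==1 gives -abs(a).
 */
#if 0
static float
set_sign_scalar_model(float a, unsigned sign)
{
   union { float f; unsigned u; } v;
   v.f = a;
   v.u = (v.u & ~(1u << 31)) | (sign << 31);
   return v.f;
}
#endif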
893
894
895 /**
896 * Convert vector of (or scalar) int to vector of (or scalar) float.
897 */
898 LLVMValueRef
899 lp_build_int_to_float(struct lp_build_context *bld,
900 LLVMValueRef a)
901 {
902 const struct lp_type type = bld->type;
903 LLVMTypeRef vec_type = lp_build_vec_type(type);
904
905 assert(type.floating);
906
907 return LLVMBuildSIToFP(bld->builder, a, vec_type, "");
908 }
909
910
911
912 enum lp_build_round_sse41_mode
913 {
914 LP_BUILD_ROUND_SSE41_NEAREST = 0,
915 LP_BUILD_ROUND_SSE41_FLOOR = 1,
916 LP_BUILD_ROUND_SSE41_CEIL = 2,
917 LP_BUILD_ROUND_SSE41_TRUNCATE = 3
918 };
919
920
921 static INLINE LLVMValueRef
922 lp_build_round_sse41(struct lp_build_context *bld,
923 LLVMValueRef a,
924 enum lp_build_round_sse41_mode mode)
925 {
926 const struct lp_type type = bld->type;
927 LLVMTypeRef vec_type = lp_build_vec_type(type);
928 const char *intrinsic;
929
930 assert(type.floating);
931 assert(type.width*type.length == 128);
932 assert(lp_check_value(type, a));
933 assert(util_cpu_caps.has_sse4_1);
934
935 switch(type.width) {
936 case 32:
937 intrinsic = "llvm.x86.sse41.round.ps";
938 break;
939 case 64:
940 intrinsic = "llvm.x86.sse41.round.pd";
941 break;
942 default:
943 assert(0);
944 return bld->undef;
945 }
946
947 return lp_build_intrinsic_binary(bld->builder, intrinsic, vec_type, a,
948 LLVMConstInt(LLVMInt32Type(), mode, 0));
949 }
950
951
952 /**
953 * Return the integer part of a float (vector) value. The returned value is
954 * a float (vector).
955  * Ex: trunc(-1.5) = -1.0
956 */
957 LLVMValueRef
958 lp_build_trunc(struct lp_build_context *bld,
959 LLVMValueRef a)
960 {
961 const struct lp_type type = bld->type;
962
963 assert(type.floating);
964 assert(lp_check_value(type, a));
965
966 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
967 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
968 else {
969 LLVMTypeRef vec_type = lp_build_vec_type(type);
970 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
971 LLVMValueRef res;
972 res = LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
973 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
974 return res;
975 }
976 }
977
978
979 /**
980 * Return float (vector) rounded to nearest integer (vector). The returned
981 * value is a float (vector).
982 * Ex: round(0.9) = 1.0
983 * Ex: round(-1.5) = -2.0
984 */
985 LLVMValueRef
986 lp_build_round(struct lp_build_context *bld,
987 LLVMValueRef a)
988 {
989 const struct lp_type type = bld->type;
990
991 assert(type.floating);
992 assert(lp_check_value(type, a));
993
994 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
995 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
996 else {
997 LLVMTypeRef vec_type = lp_build_vec_type(type);
998 LLVMValueRef res;
999 res = lp_build_iround(bld, a);
1000 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
1001 return res;
1002 }
1003 }
1004
1005
1006 /**
1007 * Return floor of float (vector), result is a float (vector)
1008 * Ex: floor(1.1) = 1.0
1009 * Ex: floor(-1.1) = -2.0
1010 */
1011 LLVMValueRef
1012 lp_build_floor(struct lp_build_context *bld,
1013 LLVMValueRef a)
1014 {
1015 const struct lp_type type = bld->type;
1016
1017 assert(type.floating);
1018 assert(lp_check_value(type, a));
1019
1020 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
1021 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
1022 else {
1023 LLVMTypeRef vec_type = lp_build_vec_type(type);
1024 LLVMValueRef res;
1025 res = lp_build_ifloor(bld, a);
1026 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
1027 return res;
1028 }
1029 }
1030
1031
1032 /**
1033 * Return ceiling of float (vector), returning float (vector).
1034 * Ex: ceil( 1.1) = 2.0
1035 * Ex: ceil(-1.1) = -1.0
1036 */
1037 LLVMValueRef
1038 lp_build_ceil(struct lp_build_context *bld,
1039 LLVMValueRef a)
1040 {
1041 const struct lp_type type = bld->type;
1042
1043 assert(type.floating);
1044 assert(lp_check_value(type, a));
1045
1046 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
1047 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
1048 else {
1049 LLVMTypeRef vec_type = lp_build_vec_type(type);
1050 LLVMValueRef res;
1051 res = lp_build_iceil(bld, a);
1052 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
1053 return res;
1054 }
1055 }
1056
1057
1058 /**
1059 * Return fractional part of 'a' computed as a - floor(a)
1060 * Typically used in texture coord arithmetic.
1061 */
1062 LLVMValueRef
1063 lp_build_fract(struct lp_build_context *bld,
1064 LLVMValueRef a)
1065 {
1066 assert(bld->type.floating);
1067 return lp_build_sub(bld, a, lp_build_floor(bld, a));
1068 }
1069
1070
1071 /**
1072 * Return the integer part of a float (vector) value. The returned value is
1073 * an integer (vector).
1074  * Ex: itrunc(-1.5) = -1
1075 */
1076 LLVMValueRef
1077 lp_build_itrunc(struct lp_build_context *bld,
1078 LLVMValueRef a)
1079 {
1080 const struct lp_type type = bld->type;
1081 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1082
1083 assert(type.floating);
1084 assert(lp_check_value(type, a));
1085
1086 return LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
1087 }
1088
1089
1090 /**
1091 * Return float (vector) rounded to nearest integer (vector). The returned
1092 * value is an integer (vector).
1093 * Ex: iround(0.9) = 1
1094 * Ex: iround(-1.5) = -2
1095 */
1096 LLVMValueRef
1097 lp_build_iround(struct lp_build_context *bld,
1098 LLVMValueRef a)
1099 {
1100 const struct lp_type type = bld->type;
1101 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1102 LLVMValueRef res;
1103
1104 assert(type.floating);
1105
1106 assert(lp_check_value(type, a));
1107
1108 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
1109 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
1110 }
1111 else {
1112 LLVMTypeRef vec_type = lp_build_vec_type(type);
1113 LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
1114 LLVMValueRef sign;
1115 LLVMValueRef half;
1116
1117 /* get sign bit */
1118 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
1119 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
1120
1121 /* sign * 0.5 */
1122 half = lp_build_const_vec(type, 0.5);
1123 half = LLVMBuildBitCast(bld->builder, half, int_vec_type, "");
1124 half = LLVMBuildOr(bld->builder, sign, half, "");
1125 half = LLVMBuildBitCast(bld->builder, half, vec_type, "");
1126
1127 res = LLVMBuildFAdd(bld->builder, a, half, "");
1128 }
1129
1130 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
1131
1132 return res;
1133 }
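
/*
 * Illustrative sketch (not part of the build; the helper name is
 * hypothetical) of the non-SSE4.1 path above, assuming 32-bit floats:
 * copy the sign of 'a' onto 0.5, add, and truncate, which rounds halfway
 * cases away from zero.
 */
#if 0
static int
iround_scalar_model(float a)
{
   union { float f; unsigned u; } x, half;
   x.f = a;
   half.f = 0.5f;
   half.u |= x.u & (1u << 31);   /* half = copysign(0.5, a) */
   return (int)(a + half.f);     /* FPToSI truncates toward zero */
}
#endif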
1134
1135
1136 /**
1137 * Return floor of float (vector), result is an int (vector)
1138  * Ex: ifloor(1.1) = 1
1139  * Ex: ifloor(-1.1) = -2
1140 */
1141 LLVMValueRef
1142 lp_build_ifloor(struct lp_build_context *bld,
1143 LLVMValueRef a)
1144 {
1145 const struct lp_type type = bld->type;
1146 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1147 LLVMValueRef res;
1148
1149 assert(type.floating);
1150 assert(lp_check_value(type, a));
1151
1152 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
1153 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
1154 }
1155 else {
1156 /* Take the sign bit and add it to 1 constant */
1157 LLVMTypeRef vec_type = lp_build_vec_type(type);
1158 unsigned mantissa = lp_mantissa(type);
1159 LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
1160 LLVMValueRef sign;
1161 LLVMValueRef offset;
1162
1163 /* sign = a < 0 ? ~0 : 0 */
1164 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
1165 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
1166 sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "ifloor.sign");
1167
1168 /* offset = -0.99999(9)f */
1169 offset = lp_build_const_vec(type, -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
1170 offset = LLVMConstBitCast(offset, int_vec_type);
1171
1172 /* offset = a < 0 ? offset : 0.0f */
1173 offset = LLVMBuildAnd(bld->builder, offset, sign, "");
1174 offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "ifloor.offset");
1175
1176 res = LLVMBuildFAdd(bld->builder, a, offset, "ifloor.res");
1177 }
1178
1179    /* convert to int, truncating toward zero */
1180 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "ifloor.res");
1181
1182 return res;
1183 }
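
/*
 * Illustrative sketch (not part of the build; the helper name and
 * constant are hypothetical approximations of the logic above): for
 * negative inputs add an offset just shy of -1.0 so that the final
 * truncation toward zero lands on floor(a).
 */
#if 0
static int
ifloor_scalar_model(float a)
{
   /* ~ -((1 << 23) - 10) / (1 << 23), the offset used above */
   float offset = a < 0.0f ? -0.9999988f : 0.0f;
   return (int)(a + offset);
}
#endif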
1184
1185
1186 /**
1187 * Return ceiling of float (vector), returning int (vector).
1188 * Ex: iceil( 1.1) = 2
1189 * Ex: iceil(-1.1) = -1
1190 */
1191 LLVMValueRef
1192 lp_build_iceil(struct lp_build_context *bld,
1193 LLVMValueRef a)
1194 {
1195 const struct lp_type type = bld->type;
1196 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1197 LLVMValueRef res;
1198
1199 assert(type.floating);
1200 assert(lp_check_value(type, a));
1201
1202 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
1203 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
1204 }
1205 else {
1206 LLVMTypeRef vec_type = lp_build_vec_type(type);
1207 unsigned mantissa = lp_mantissa(type);
1208 LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
1209 LLVMValueRef sign;
1210 LLVMValueRef offset;
1211
1212 /* sign = a < 0 ? 0 : ~0 */
1213 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
1214 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
1215 sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "iceil.sign");
1216 sign = LLVMBuildNot(bld->builder, sign, "iceil.not");
1217
1218 /* offset = 0.99999(9)f */
1219 offset = lp_build_const_vec(type, (double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
1220 offset = LLVMConstBitCast(offset, int_vec_type);
1221
1222 /* offset = a < 0 ? 0.0 : offset */
1223 offset = LLVMBuildAnd(bld->builder, offset, sign, "");
1224 offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "iceil.offset");
1225
1226 res = LLVMBuildFAdd(bld->builder, a, offset, "iceil.res");
1227 }
1228
1229    /* convert to int, truncating toward zero */
1230 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "iceil.res");
1231
1232 return res;
1233 }
1234
1235
1236 LLVMValueRef
1237 lp_build_sqrt(struct lp_build_context *bld,
1238 LLVMValueRef a)
1239 {
1240 const struct lp_type type = bld->type;
1241 LLVMTypeRef vec_type = lp_build_vec_type(type);
1242 char intrinsic[32];
1243
1244 assert(lp_check_value(type, a));
1245
1246    /* TODO: optimize the constant case */
1248
1249 assert(type.floating);
1250 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
1251
1252 return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
1253 }
1254
1255
1256 /**
1257  * Do one Newton-Raphson step to improve reciprocal precision:
1258 *
1259 * x_{i+1} = x_i * (2 - a * x_i)
1260 *
1261 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
1262 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
1263  * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
1264 * halo. It would be necessary to clamp the argument to prevent this.
1265 *
1266 * See also:
1267 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
1268 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
1269 */
1270 static INLINE LLVMValueRef
1271 lp_build_rcp_refine(struct lp_build_context *bld,
1272 LLVMValueRef a,
1273 LLVMValueRef rcp_a)
1274 {
1275 LLVMValueRef two = lp_build_const_vec(bld->type, 2.0);
1276 LLVMValueRef res;
1277
1278 res = LLVMBuildFMul(bld->builder, a, rcp_a, "");
1279 res = LLVMBuildFSub(bld->builder, two, res, "");
1280 res = LLVMBuildFMul(bld->builder, rcp_a, res, "");
1281
1282 return res;
1283 }
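
/*
 * Illustrative sketch (not part of the build; the helper name is
 * hypothetical) of the refinement step above: each iteration roughly
 * doubles the number of correct bits in the reciprocal estimate.
 */
#if 0
static float
rcp_refine_scalar_model(float a, float rcp_a)
{
   return rcp_a * (2.0f - a * rcp_a);
}
#endif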
1284
1285
1286 LLVMValueRef
1287 lp_build_rcp(struct lp_build_context *bld,
1288 LLVMValueRef a)
1289 {
1290 const struct lp_type type = bld->type;
1291
1292 assert(lp_check_value(type, a));
1293
1294 if(a == bld->zero)
1295 return bld->undef;
1296 if(a == bld->one)
1297 return bld->one;
1298 if(a == bld->undef)
1299 return bld->undef;
1300
1301 assert(type.floating);
1302
1303 if(LLVMIsConstant(a))
1304 return LLVMConstFDiv(bld->one, a);
1305
1306 /*
1307 * We don't use RCPPS because:
1308    * - it only has 10 bits of precision
1309    * - it doesn't even get the reciprocal of 1.0 exactly
1310    * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
1311    * - for recent processors the benefit over DIVPS is marginal, and case
1312    *   dependent
1313    *
1314    * We could still use it on certain processors if benchmarks show that the
1315    * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
1316    * particular uses that require fewer workarounds.
1317 */
1318
1319 if (FALSE && util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
1320 const unsigned num_iterations = 0;
1321 LLVMValueRef res;
1322 unsigned i;
1323
1324 res = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", bld->vec_type, a);
1325
1326 for (i = 0; i < num_iterations; ++i) {
1327 res = lp_build_rcp_refine(bld, a, res);
1328 }
1329
1330 return res;
1331 }
1332
1333 return LLVMBuildFDiv(bld->builder, bld->one, a, "");
1334 }
1335
1336
1337 /**
1338 * Do one Newton-Raphson step to improve rsqrt precision:
1339 *
1340 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
1341 *
1342 * See also:
1343 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
1344 */
1345 static INLINE LLVMValueRef
1346 lp_build_rsqrt_refine(struct lp_build_context *bld,
1347 LLVMValueRef a,
1348 LLVMValueRef rsqrt_a)
1349 {
1350 LLVMValueRef half = lp_build_const_vec(bld->type, 0.5);
1351 LLVMValueRef three = lp_build_const_vec(bld->type, 3.0);
1352 LLVMValueRef res;
1353
1354 res = LLVMBuildFMul(bld->builder, rsqrt_a, rsqrt_a, "");
1355 res = LLVMBuildFMul(bld->builder, a, res, "");
1356 res = LLVMBuildFSub(bld->builder, three, res, "");
1357 res = LLVMBuildFMul(bld->builder, rsqrt_a, res, "");
1358 res = LLVMBuildFMul(bld->builder, half, res, "");
1359
1360 return res;
1361 }
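
/*
 * Illustrative sketch (not part of the build; the helper name is
 * hypothetical) of the rsqrt refinement step above, in scalar form.
 */
#if 0
static float
rsqrt_refine_scalar_model(float a, float rsqrt_a)
{
   return 0.5f * rsqrt_a * (3.0f - a * rsqrt_a * rsqrt_a);
}
#endif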
1362
1363
1364 /**
1365 * Generate 1/sqrt(a)
1366 */
1367 LLVMValueRef
1368 lp_build_rsqrt(struct lp_build_context *bld,
1369 LLVMValueRef a)
1370 {
1371 const struct lp_type type = bld->type;
1372
1373 assert(lp_check_value(type, a));
1374
1375 assert(type.floating);
1376
1377 if (util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
1378 const unsigned num_iterations = 0;
1379 LLVMValueRef res;
1380 unsigned i;
1381
1382 res = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", bld->vec_type, a);
1383
1384 for (i = 0; i < num_iterations; ++i) {
1385 res = lp_build_rsqrt_refine(bld, a, res);
1386 }
1387
1388 return res;
1389 }
1390
1391 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
1392 }
1393
1394
1395 static INLINE LLVMValueRef
1396 lp_build_const_v4si(unsigned long value)
1397 {
1398 LLVMValueRef element = LLVMConstInt(LLVMInt32Type(), value, 0);
1399 LLVMValueRef elements[4] = { element, element, element, element };
1400 return LLVMConstVector(elements, 4);
1401 }
1402
1403 static INLINE LLVMValueRef
1404 lp_build_const_v4sf(float value)
1405 {
1406 LLVMValueRef element = LLVMConstReal(LLVMFloatType(), value);
1407 LLVMValueRef elements[4] = { element, element, element, element };
1408 return LLVMConstVector(elements, 4);
1409 }
1410
1411
1412 /**
1413 * Generate sin(a) using SSE2
1414 */
1415 LLVMValueRef
1416 lp_build_sin(struct lp_build_context *bld,
1417 LLVMValueRef a)
1418 {
1419 struct lp_type int_type = lp_int_type(bld->type);
1420 LLVMBuilderRef b = bld->builder;
1421 LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatType(), 4);
1422 LLVMTypeRef v4si = LLVMVectorType(LLVMInt32Type(), 4);
1423
1424 /*
1425 * take the absolute value,
1426 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
1427 */
1428
1429 LLVMValueRef inv_sig_mask = lp_build_const_v4si(~0x80000000);
1430 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, v4si, "a_v4si");
1431
1432 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
1433 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, v4sf, "x_abs");
1434
1435 /*
1436 * extract the sign bit (upper one)
1437 * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
1438 */
1439 LLVMValueRef sig_mask = lp_build_const_v4si(0x80000000);
1440 LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i");
1441
1442 /*
1443 * scale by 4/Pi
1444 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
1445 */
1446
1447 LLVMValueRef FOPi = lp_build_const_v4sf(1.27323954473516);
1448 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
1449
1450 /*
1451 * store the integer part of y in mm0
1452 * emm2 = _mm_cvttps_epi32(y);
1453 */
1454
1455 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, v4si, "emm2_i");
1456
1457 /*
1458 * j=(j+1) & (~1) (see the cephes sources)
1459 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
1460 */
1461
1462 LLVMValueRef all_one = lp_build_const_v4si(1);
1463 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
1464 /*
1465 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
1466 */
1467 LLVMValueRef inv_one = lp_build_const_v4si(~1);
1468 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
1469
1470 /*
1471 * y = _mm_cvtepi32_ps(emm2);
1472 */
1473 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, v4sf, "y_2");
1474
1475 /* get the swap sign flag
1476 * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
1477 */
1478 LLVMValueRef pi32_4 = lp_build_const_v4si(4);
1479 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and");
1480
1481 /*
1482 * emm2 = _mm_slli_epi32(emm0, 29);
1483 */
1484 LLVMValueRef const_29 = lp_build_const_v4si(29);
1485 LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit");
1486
1487 /*
1488    * get the polynomial selection mask
1489    * there is one polynomial for 0 <= x <= Pi/4
1490    * and another one for Pi/4 < x <= Pi/2
1491 * Both branches will be computed.
1492 *
1493 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
1494 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
1495 */
1496
1497 LLVMValueRef pi32_2 = lp_build_const_v4si(2);
1498 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3");
1499 LLVMValueRef poly_mask = lp_build_compare(b, int_type, PIPE_FUNC_EQUAL,
1500 emm2_3, lp_build_const_v4si(0));
1501 /*
1502 * sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
1503 */
1504 LLVMValueRef sign_bit_1 = LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit");
1505
1506 /*
1507 * _PS_CONST(minus_cephes_DP1, -0.78515625);
1508 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
1509 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
1510 */
1511 LLVMValueRef DP1 = lp_build_const_v4sf(-0.78515625);
1512 LLVMValueRef DP2 = lp_build_const_v4sf(-2.4187564849853515625e-4);
1513 LLVMValueRef DP3 = lp_build_const_v4sf(-3.77489497744594108e-8);
1514
1515 /*
1516 * The magic pass: "Extended precision modular arithmetic"
1517 * x = ((x - y * DP1) - y * DP2) - y * DP3;
1518 * xmm1 = _mm_mul_ps(y, xmm1);
1519 * xmm2 = _mm_mul_ps(y, xmm2);
1520 * xmm3 = _mm_mul_ps(y, xmm3);
1521 */
1522 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
1523 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
1524 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
1525
1526 /*
1527 * x = _mm_add_ps(x, xmm1);
1528 * x = _mm_add_ps(x, xmm2);
1529 * x = _mm_add_ps(x, xmm3);
1530 */
1531
1532 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
1533 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
1534 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
1535
1536 /*
1537    * Evaluate the first polynomial (0 <= x <= Pi/4)
1538 *
1539 * z = _mm_mul_ps(x,x);
1540 */
1541 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
1542
1543 /*
1544 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
1545 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
1546 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
1547 */
1548 LLVMValueRef coscof_p0 = lp_build_const_v4sf(2.443315711809948E-005);
1549 LLVMValueRef coscof_p1 = lp_build_const_v4sf(-1.388731625493765E-003);
1550 LLVMValueRef coscof_p2 = lp_build_const_v4sf(4.166664568298827E-002);
1551
1552 /*
1553 * y = *(v4sf*)_ps_coscof_p0;
1554 * y = _mm_mul_ps(y, z);
1555 */
1556 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
1557 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
1558 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
1559 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
1560 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
1561 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
1562
1563
1564 /*
1565 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
1566 * y = _mm_sub_ps(y, tmp);
1567 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
1568 */
1569 LLVMValueRef half = lp_build_const_v4sf(0.5);
1570 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
1571 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
1572 LLVMValueRef one = lp_build_const_v4sf(1.0);
1573 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
1574
1575 /*
1576 * _PS_CONST(sincof_p0, -1.9515295891E-4);
1577 * _PS_CONST(sincof_p1, 8.3321608736E-3);
1578 * _PS_CONST(sincof_p2, -1.6666654611E-1);
1579 */
1580 LLVMValueRef sincof_p0 = lp_build_const_v4sf(-1.9515295891E-4);
1581 LLVMValueRef sincof_p1 = lp_build_const_v4sf(8.3321608736E-3);
1582 LLVMValueRef sincof_p2 = lp_build_const_v4sf(-1.6666654611E-1);
1583
1584 /*
1585    * Evaluate the second polynomial (Pi/4 < x <= Pi/2)
1586 *
1587 * y2 = *(v4sf*)_ps_sincof_p0;
1588 * y2 = _mm_mul_ps(y2, z);
1589 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
1590 * y2 = _mm_mul_ps(y2, z);
1591 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
1592 * y2 = _mm_mul_ps(y2, z);
1593 * y2 = _mm_mul_ps(y2, x);
1594 * y2 = _mm_add_ps(y2, x);
1595 */
1596
1597 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
1598 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
1599 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
1600 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
1601 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
1602 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
1603 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
1604
1605 /*
1606    * select the correct result from the two polynomials
1607 * xmm3 = poly_mask;
1608 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
1609 * y = _mm_andnot_ps(xmm3, y);
1610 * y = _mm_add_ps(y,y2);
1611 */
1612 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, v4si, "y2_i");
1613 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, v4si, "y_i");
1614 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
1615 LLVMValueRef inv = lp_build_const_v4si(~0);
1616 LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
1617 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
1618 LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
1619
1620 /*
1621 * update the sign
1622 * y = _mm_xor_ps(y, sign_bit);
1623 */
1624 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin");
1625 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, v4sf, "y_result");
1626 return y_result;
1627 }
1628
1629
1630 /**
1631 * Generate cos(a) using SSE2
1632 */
1633 LLVMValueRef
1634 lp_build_cos(struct lp_build_context *bld,
1635 LLVMValueRef a)
1636 {
1637 struct lp_type int_type = lp_int_type(bld->type);
1638 LLVMBuilderRef b = bld->builder;
1639 LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatType(), 4);
1640 LLVMTypeRef v4si = LLVMVectorType(LLVMInt32Type(), 4);
1641
1642 /*
1643 * take the absolute value,
1644 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
1645 */
1646
1647 LLVMValueRef inv_sig_mask = lp_build_const_v4si(~0x80000000);
1648 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, v4si, "a_v4si");
1649
1650 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
1651 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, v4sf, "x_abs");
1652
1653 /*
1654 * scale by 4/Pi
1655 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
1656 */
1657
1658 LLVMValueRef FOPi = lp_build_const_v4sf(1.27323954473516);
1659 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
1660
1661 /*
1662 * store the integer part of y in mm0
1663 * emm2 = _mm_cvttps_epi32(y);
1664 */
1665
1666 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, v4si, "emm2_i");
1667
1668 /*
1669 * j=(j+1) & (~1) (see the cephes sources)
1670 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
1671 */
1672
1673 LLVMValueRef all_one = lp_build_const_v4si(1);
1674 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
1675 /*
1676 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
1677 */
1678 LLVMValueRef inv_one = lp_build_const_v4si(~1);
1679 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
1680
1681 /*
1682 * y = _mm_cvtepi32_ps(emm2);
1683 */
1684 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, v4sf, "y_2");
1685
1686
1687 /*
1688 * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
1689 */
1690 LLVMValueRef const_2 = lp_build_const_v4si(2);
1691 LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2");
1692
1693
1694 /* get the swap sign flag
1695 * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
1696 */
1697 LLVMValueRef inv = lp_build_const_v4si(~0);
1698 LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not");
1699 LLVMValueRef pi32_4 = lp_build_const_v4si(4);
1700 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and");
1701
1702 /*
1703 * emm2 = _mm_slli_epi32(emm0, 29);
1704 */
1705 LLVMValueRef const_29 = lp_build_const_v4si(29);
1706 LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit");
1707
1708 /*
1709    * get the polynomial selection mask
1710    * there is one polynomial for 0 <= x <= Pi/4
1711    * and another one for Pi/4 < x <= Pi/2
1712 * Both branches will be computed.
1713 *
1714 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
1715 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
1716 */
1717
1718 LLVMValueRef pi32_2 = lp_build_const_v4si(2);
1719 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3");
1720 LLVMValueRef poly_mask = lp_build_compare(b, int_type, PIPE_FUNC_EQUAL,
1721 emm2_3, lp_build_const_v4si(0));
1722
1723 /*
1724 * _PS_CONST(minus_cephes_DP1, -0.78515625);
1725 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
1726 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
1727 */
1728 LLVMValueRef DP1 = lp_build_const_v4sf(-0.78515625);
1729 LLVMValueRef DP2 = lp_build_const_v4sf(-2.4187564849853515625e-4);
1730 LLVMValueRef DP3 = lp_build_const_v4sf(-3.77489497744594108e-8);
1731
1732 /*
1733 * The magic pass: "Extended precision modular arithmetic"
1734 * x = ((x - y * DP1) - y * DP2) - y * DP3;
1735 * xmm1 = _mm_mul_ps(y, xmm1);
1736 * xmm2 = _mm_mul_ps(y, xmm2);
1737 * xmm3 = _mm_mul_ps(y, xmm3);
1738 */
1739 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
1740 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
1741 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
1742
1743 /*
1744 * x = _mm_add_ps(x, xmm1);
1745 * x = _mm_add_ps(x, xmm2);
1746 * x = _mm_add_ps(x, xmm3);
1747 */
1748
1749 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
1750 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
1751 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
1752
1753 /*
1754    * Evaluate the first polynomial (0 <= x <= Pi/4)
1755 *
1756 * z = _mm_mul_ps(x,x);
1757 */
1758 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
1759
1760 /*
1761 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
1762 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
1763 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
1764 */
1765 LLVMValueRef coscof_p0 = lp_build_const_v4sf(2.443315711809948E-005);
1766 LLVMValueRef coscof_p1 = lp_build_const_v4sf(-1.388731625493765E-003);
1767 LLVMValueRef coscof_p2 = lp_build_const_v4sf(4.166664568298827E-002);
1768
1769 /*
1770 * y = *(v4sf*)_ps_coscof_p0;
1771 * y = _mm_mul_ps(y, z);
1772 */
1773 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
1774 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
1775 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
1776 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
1777 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
1778 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
1779
1780
1781 /*
1782 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
1783 * y = _mm_sub_ps(y, tmp);
1784 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
1785 */
1786 LLVMValueRef half = lp_build_const_v4sf(0.5);
1787 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
1788 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
1789 LLVMValueRef one = lp_build_const_v4sf(1.0);
1790 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
1791
1792 /*
1793 * _PS_CONST(sincof_p0, -1.9515295891E-4);
1794 * _PS_CONST(sincof_p1, 8.3321608736E-3);
1795 * _PS_CONST(sincof_p2, -1.6666654611E-1);
1796 */
1797 LLVMValueRef sincof_p0 = lp_build_const_v4sf(-1.9515295891E-4);
1798 LLVMValueRef sincof_p1 = lp_build_const_v4sf(8.3321608736E-3);
1799 LLVMValueRef sincof_p2 = lp_build_const_v4sf(-1.6666654611E-1);
1800
1801 /*
1802    * Evaluate the second polynomial (Pi/4 < x <= Pi/2)
1803 *
1804 * y2 = *(v4sf*)_ps_sincof_p0;
1805 * y2 = _mm_mul_ps(y2, z);
1806 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
1807 * y2 = _mm_mul_ps(y2, z);
1808 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
1809 * y2 = _mm_mul_ps(y2, z);
1810 * y2 = _mm_mul_ps(y2, x);
1811 * y2 = _mm_add_ps(y2, x);
1812 */
1813
1814 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
1815 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
1816 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
1817 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
1818 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
1819 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
1820 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
1821
1822 /*
1823    * select the correct result from the two polynomials
1824 * xmm3 = poly_mask;
1825 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
1826 * y = _mm_andnot_ps(xmm3, y);
1827 * y = _mm_add_ps(y,y2);
1828 */
1829 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, v4si, "y2_i");
1830 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, v4si, "y_i");
1831 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
1832 LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
1833 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
1834 LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
1835
1836 /*
1837 * update the sign
1838 * y = _mm_xor_ps(y, sign_bit);
1839 */
1840 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sin");
1841 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, v4sf, "y_result");
1842 return y_result;
1843 }
1844
1845
1846 /**
1847 * Generate pow(x, y)
1848 */
1849 LLVMValueRef
1850 lp_build_pow(struct lp_build_context *bld,
1851 LLVMValueRef x,
1852 LLVMValueRef y)
1853 {
1854 /* TODO: optimize the constant case */
1855 if(LLVMIsConstant(x) && LLVMIsConstant(y))
1856 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1857 __FUNCTION__);
1858
1859 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
1860 }
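
/*
 * Illustrative sketch (not part of the build; the helper name is
 * hypothetical) of the identity used above: pow(x, y) = exp2(log2(x) * y),
 * valid for x > 0. Uses C99 <math.h> exp2/log2 purely for illustration.
 */
#if 0
static double
pow_scalar_model(double x, double y)
{
   return exp2(log2(x) * y);
}
#endif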
1861
1862
1863 /**
1864 * Generate exp(x)
1865 */
1866 LLVMValueRef
1867 lp_build_exp(struct lp_build_context *bld,
1868 LLVMValueRef x)
1869 {
1870 /* log2(e) = 1/log(2) */
1871 LLVMValueRef log2e = lp_build_const_vec(bld->type, 1.4426950408889634);
1872
1873 assert(lp_check_value(bld->type, x));
1874
1875    return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
1876 }
1877
1878
1879 /**
1880 * Generate log(x)
1881 */
1882 LLVMValueRef
1883 lp_build_log(struct lp_build_context *bld,
1884 LLVMValueRef x)
1885 {
1886 /* log(2) */
1887 LLVMValueRef log2 = lp_build_const_vec(bld->type, 0.69314718055994529);
1888
1889 assert(lp_check_value(bld->type, x));
1890
1891    return lp_build_mul(bld, log2, lp_build_log2(bld, x));
1892 }
1893
1894
1895 /**
1896 * Generate polynomial.
1897 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
1898 */
1899 static LLVMValueRef
1900 lp_build_polynomial(struct lp_build_context *bld,
1901 LLVMValueRef x,
1902 const double *coeffs,
1903 unsigned num_coeffs)
1904 {
1905 const struct lp_type type = bld->type;
1906 LLVMValueRef res = NULL;
1907 unsigned i;
1908
1909 assert(lp_check_value(bld->type, x));
1910
1911 /* TODO: optimize the constant case */
1912 if(LLVMIsConstant(x))
1913 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1914 __FUNCTION__);
1915
1916 for (i = num_coeffs; i--; ) {
1917 LLVMValueRef coeff;
1918
1919 coeff = lp_build_const_vec(type, coeffs[i]);
1920
1921 if(res)
1922 res = lp_build_add(bld, coeff, lp_build_mul(bld, x, res));
1923 else
1924 res = coeff;
1925 }
1926
1927 if(res)
1928 return res;
1929 else
1930 return bld->undef;
1931 }
1932
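/*
 * Scalar reference of the Horner scheme above (a sketch, our naming):
 * walking the coefficients from highest to lowest folds
 * c0 + c1*x + c2*x^2 + ... into nested multiply-adds, one mul and one add
 * per coefficient. (The builder returns undef for an empty table; this
 * sketch just returns 0.)
 */
static inline double
ref_polynomial_scalar(double x, const double *coeffs, unsigned num_coeffs)
{
   double res = 0.0;
   unsigned i;
   for (i = num_coeffs; i--; )
      res = coeffs[i] + x * res;
   return res;
}
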
1933
1934 /**
1935 * Minimax polynomial fit of 2**x, in range [0, 1[
1936 */
1937 const double lp_build_exp2_polynomial[] = {
1938 #if EXP_POLY_DEGREE == 5
1939 0.999999999690134838155,
1940 0.583974334321735217258,
1941 0.164553105719676828492,
1942 0.0292811063701710962255,
1943 0.00354944426657875141846,
1944 0.000296253726543423377365
1945 #elif EXP_POLY_DEGREE == 4
1946 1.00000001502262084505,
1947 0.563586057338685991394,
1948 0.150436017652442413623,
1949 0.0243220604213317927308,
1950 0.0025359088446580436489
1951 #elif EXP_POLY_DEGREE == 3
1952 0.999925218562710312959,
1953 0.695833540494823811697,
1954 0.226067155427249155588,
1955 0.0780245226406372992967
1956 #elif EXP_POLY_DEGREE == 2
1957 1.00172476321474503578,
1958 0.657636275736077639316,
1959 0.33718943461968720704
1960 #else
1961 #error
1962 #endif
1963 };
1964
1965
1966 void
1967 lp_build_exp2_approx(struct lp_build_context *bld,
1968 LLVMValueRef x,
1969 LLVMValueRef *p_exp2_int_part,
1970 LLVMValueRef *p_frac_part,
1971 LLVMValueRef *p_exp2)
1972 {
1973 const struct lp_type type = bld->type;
1974 LLVMTypeRef vec_type = lp_build_vec_type(type);
1975 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1976 LLVMValueRef ipart = NULL;
1977 LLVMValueRef fpart = NULL;
1978 LLVMValueRef expipart = NULL;
1979 LLVMValueRef expfpart = NULL;
1980 LLVMValueRef res = NULL;
1981
1982 assert(lp_check_value(bld->type, x));
1983
1984 if(p_exp2_int_part || p_frac_part || p_exp2) {
1985 /* TODO: optimize the constant case */
1986 if(LLVMIsConstant(x))
1987 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1988 __FUNCTION__);
1989
1990 assert(type.floating && type.width == 32);
1991
1992 x = lp_build_min(bld, x, lp_build_const_vec(type, 129.0));
1993 x = lp_build_max(bld, x, lp_build_const_vec(type, -126.99999));
1994
1995 /* ipart = floor(x) */
1996 ipart = lp_build_floor(bld, x);
1997
1998 /* fpart = x - ipart */
1999 fpart = LLVMBuildFSub(bld->builder, x, ipart, "");
2000 }
2001
2002 if(p_exp2_int_part || p_exp2) {
2003 /* expipart = (float) (1 << ipart) */
2004 ipart = LLVMBuildFPToSI(bld->builder, ipart, int_vec_type, "");
2005 expipart = LLVMBuildAdd(bld->builder, ipart, lp_build_const_int_vec(type, 127), "");
2006 expipart = LLVMBuildShl(bld->builder, expipart, lp_build_const_int_vec(type, 23), "");
2007 expipart = LLVMBuildBitCast(bld->builder, expipart, vec_type, "");
2008 }
2009
2010 if(p_exp2) {
2011 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
2012 Elements(lp_build_exp2_polynomial));
2013
2014 res = LLVMBuildFMul(bld->builder, expipart, expfpart, "");
2015 }
2016
2017 if(p_exp2_int_part)
2018 *p_exp2_int_part = expipart;
2019
2020 if(p_frac_part)
2021 *p_frac_part = fpart;
2022
2023 if(p_exp2)
2024 *p_exp2 = res;
2025 }
2026
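/*
 * Scalar reference of the approximation above (a sketch, our naming;
 * assumes <stdint.h> and <math.h>): split x into integer and fractional
 * parts, build 2^ipart directly in the IEEE-754 exponent field, and
 * evaluate the minimax polynomial on the fraction. The coefficients are
 * the active EXP_POLY_DEGREE == 3 branch of the table above.
 */
static inline float
ref_exp2f_scalar(float x)
{
   union { float f; int32_t i; } expipart;
   float ipart, fpart, expfpart;

   /* same clamping as the vector path */
   if (x > 129.0f)
      x = 129.0f;
   if (x < -126.99999f)
      x = -126.99999f;

   ipart = floorf(x);
   fpart = x - ipart;

   /* 2^ipart: bias the exponent and shift it into bits 30..23 */
   expipart.i = ((int32_t)ipart + 127) << 23;

   /* minimax fit of 2^fpart on [0, 1[ */
   expfpart = 0.999925218562710312959f
            + fpart * (0.695833540494823811697f
            + fpart * (0.226067155427249155588f
            + fpart * 0.0780245226406372992967f));

   return expipart.f * expfpart;
}
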
2027
2028 LLVMValueRef
2029 lp_build_exp2(struct lp_build_context *bld,
2030 LLVMValueRef x)
2031 {
2032 LLVMValueRef res;
2033 lp_build_exp2_approx(bld, x, NULL, NULL, &res);
2034 return res;
2035 }
2036
2037
2038 /**
2039 * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
2040 * These coefficients can be generated with
2041 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
2042 */
2043 const double lp_build_log2_polynomial[] = {
2044 #if LOG_POLY_DEGREE == 6
2045 3.11578814719469302614,
2046 -3.32419399085241980044,
2047 2.59883907202499966007,
2048 -1.23152682416275988241,
2049 0.318212422185251071475,
2050 -0.0344359067839062357313
2051 #elif LOG_POLY_DEGREE == 5
2052 2.8882704548164776201,
2053 -2.52074962577807006663,
2054 1.48116647521213171641,
2055 -0.465725644288844778798,
2056 0.0596515482674574969533
2057 #elif LOG_POLY_DEGREE == 4
2058 2.61761038894603480148,
2059 -1.75647175389045657003,
2060 0.688243882994381274313,
2061 -0.107254423828329604454
2062 #elif LOG_POLY_DEGREE == 3
2063 2.28330284476918490682,
2064 -1.04913055217340124191,
2065 0.204446009836232697516
2066 #else
2067 #error
2068 #endif
2069 };
2070
2071
2072 /**
2073 * See http://www.devmaster.net/forums/showthread.php?p=43580
2074 */
2075 void
2076 lp_build_log2_approx(struct lp_build_context *bld,
2077 LLVMValueRef x,
2078 LLVMValueRef *p_exp,
2079 LLVMValueRef *p_floor_log2,
2080 LLVMValueRef *p_log2)
2081 {
2082 const struct lp_type type = bld->type;
2083 LLVMTypeRef vec_type = lp_build_vec_type(type);
2084 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
2085
2086 LLVMValueRef expmask = lp_build_const_int_vec(type, 0x7f800000);
2087 LLVMValueRef mantmask = lp_build_const_int_vec(type, 0x007fffff);
2088 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
2089
2090 LLVMValueRef i = NULL;
2091 LLVMValueRef exp = NULL;
2092 LLVMValueRef mant = NULL;
2093 LLVMValueRef logexp = NULL;
2094 LLVMValueRef logmant = NULL;
2095 LLVMValueRef res = NULL;
2096
2097 assert(lp_check_value(bld->type, x));
2098
2099 if(p_exp || p_floor_log2 || p_log2) {
2100 /* TODO: optimize the constant case */
2101 if(LLVMIsConstant(x))
2102 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2103 __FUNCTION__);
2104
2105 assert(type.floating && type.width == 32);
2106
2107 i = LLVMBuildBitCast(bld->builder, x, int_vec_type, "");
2108
2109 /* exp = (float) exponent(x) */
2110 exp = LLVMBuildAnd(bld->builder, i, expmask, "");
2111 }
2112
2113 if(p_floor_log2 || p_log2) {
2114 logexp = LLVMBuildLShr(bld->builder, exp, lp_build_const_int_vec(type, 23), "");
2115 logexp = LLVMBuildSub(bld->builder, logexp, lp_build_const_int_vec(type, 127), "");
2116 logexp = LLVMBuildSIToFP(bld->builder, logexp, vec_type, "");
2117 }
2118
2119 if(p_log2) {
2120 /* mant = (float) mantissa(x) */
2121 mant = LLVMBuildAnd(bld->builder, i, mantmask, "");
2122 mant = LLVMBuildOr(bld->builder, mant, one, "");
2123 mant = LLVMBuildBitCast(bld->builder, mant, vec_type, "");
2124
2125 logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
2126 Elements(lp_build_log2_polynomial));
2127
2128 /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0 */
2129 logmant = LLVMBuildFMul(bld->builder, logmant, LLVMBuildFSub(bld->builder, mant, bld->one, ""), "");
2130
2131 res = LLVMBuildFAdd(bld->builder, logmant, logexp, "");
2132 }
2133
2134 if(p_exp) {
2135 exp = LLVMBuildBitCast(bld->builder, exp, vec_type, "");
2136 *p_exp = exp;
2137 }
2138
2139 if(p_floor_log2)
2140 *p_floor_log2 = logexp;
2141
2142 if(p_log2)
2143 *p_log2 = res;
2144 }
2145
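/*
 * Scalar reference of the decomposition above (a sketch, our naming;
 * assumes <stdint.h> and x > 0): the unbiased IEEE-754 exponent is
 * floor(log2(x)), and the mantissa, renormalized to [1, 2[, feeds the
 * minimax fit of log2(m)/(m - 1); the final (mant - 1) factor is what
 * makes log2(1) == 0 exact. For brevity this uses the
 * LOG_POLY_DEGREE == 3 branch of the table above.
 */
static inline float
ref_log2f_scalar(float x)
{
   union { float f; int32_t i; } u, mant;
   float logexp, logmant;

   u.f = x;

   /* exponent bits -> floor(log2(x)) */
   logexp = (float)(((u.i & 0x7f800000) >> 23) - 127);

   /* mantissa bits with an exponent of 0, i.e. a float in [1, 2[ */
   mant.i = (u.i & 0x007fffff) | 0x3f800000;

   logmant = 2.28330284476918490682f
           + mant.f * (-1.04913055217340124191f
           + mant.f * 0.204446009836232697516f);
   logmant *= mant.f - 1.0f;

   return logmant + logexp;
}
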
2146
2147 LLVMValueRef
2148 lp_build_log2(struct lp_build_context *bld,
2149 LLVMValueRef x)
2150 {
2151 LLVMValueRef res;
2152 lp_build_log2_approx(bld, x, NULL, NULL, &res);
2153 return res;
2154 }