gallivm: Refactor the Newton-Raphson steps, and disable once again.
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include "util/u_memory.h"
49 #include "util/u_debug.h"
50 #include "util/u_math.h"
51 #include "util/u_string.h"
52 #include "util/u_cpu_detect.h"
53
54 #include "lp_bld_type.h"
55 #include "lp_bld_const.h"
56 #include "lp_bld_intr.h"
57 #include "lp_bld_logic.h"
58 #include "lp_bld_pack.h"
59 #include "lp_bld_arit.h"
60
61
62 /*
63 * XXX: Increasing eliminates some artifacts, but adds others, most
64 * noticeably corruption in the Earth halo in Google Earth.
65 */
66 #define RCP_NEWTON_STEPS 0
67
68 #define RSQRT_NEWTON_STEPS 0
69
70 #define EXP_POLY_DEGREE 3
71
72 #define LOG_POLY_DEGREE 5
73
74
75 /**
76 * Generate min(a, b)
77 * No checks for the special-case values of a or b (0 or 1) are done.
78 */
79 static LLVMValueRef
80 lp_build_min_simple(struct lp_build_context *bld,
81 LLVMValueRef a,
82 LLVMValueRef b)
83 {
84 const struct lp_type type = bld->type;
85 const char *intrinsic = NULL;
86 LLVMValueRef cond;
87
88 assert(lp_check_value(type, a));
89 assert(lp_check_value(type, b));
90
91 /* TODO: optimize the constant case */
92
93 if(type.width * type.length == 128) {
94 if(type.floating) {
95 if(type.width == 32 && util_cpu_caps.has_sse)
96 intrinsic = "llvm.x86.sse.min.ps";
97 if(type.width == 64 && util_cpu_caps.has_sse2)
98 intrinsic = "llvm.x86.sse2.min.pd";
99 }
100 else {
101 if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
102 intrinsic = "llvm.x86.sse2.pminu.b";
103 if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
104 intrinsic = "llvm.x86.sse41.pminsb";
105 if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
106 intrinsic = "llvm.x86.sse41.pminuw";
107 if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
108 intrinsic = "llvm.x86.sse2.pmins.w";
109 if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
110 intrinsic = "llvm.x86.sse41.pminud";
111 if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
112 intrinsic = "llvm.x86.sse41.pminsd";
113 }
114 }
115
116 if(intrinsic)
117 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
118
119 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
120 return lp_build_select(bld, cond, a, b);
121 }
122
123
124 /**
125 * Generate max(a, b)
126 * No checks for the special-case values of a or b (0 or 1) are done.
127 */
128 static LLVMValueRef
129 lp_build_max_simple(struct lp_build_context *bld,
130 LLVMValueRef a,
131 LLVMValueRef b)
132 {
133 const struct lp_type type = bld->type;
134 const char *intrinsic = NULL;
135 LLVMValueRef cond;
136
137 assert(lp_check_value(type, a));
138 assert(lp_check_value(type, b));
139
140 /* TODO: optimize the constant case */
141
142 if(type.width * type.length == 128) {
143 if(type.floating) {
144 if(type.width == 32 && util_cpu_caps.has_sse)
145 intrinsic = "llvm.x86.sse.max.ps";
146 if(type.width == 64 && util_cpu_caps.has_sse2)
147 intrinsic = "llvm.x86.sse2.max.pd";
148 }
149 else {
150 if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
151 intrinsic = "llvm.x86.sse2.pmaxu.b";
152 if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
153 intrinsic = "llvm.x86.sse41.pmaxsb";
154 if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
155 intrinsic = "llvm.x86.sse41.pmaxuw";
156 if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
157 intrinsic = "llvm.x86.sse2.pmaxs.w";
158 if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
159 intrinsic = "llvm.x86.sse41.pmaxud";
160 if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
161 intrinsic = "llvm.x86.sse41.pmaxsd";
162 }
163 }
164
165 if(intrinsic)
166 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
167
168 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
169 return lp_build_select(bld, cond, a, b);
170 }
171
172
173 /**
174 * Generate 1 - a, or ~a depending on bld->type.
175 */
176 LLVMValueRef
177 lp_build_comp(struct lp_build_context *bld,
178 LLVMValueRef a)
179 {
180 const struct lp_type type = bld->type;
181
182 assert(lp_check_value(type, a));
183
184 if(a == bld->one)
185 return bld->zero;
186 if(a == bld->zero)
187 return bld->one;
188
189 if(type.norm && !type.floating && !type.fixed && !type.sign) {
190 if(LLVMIsConstant(a))
191 return LLVMConstNot(a);
192 else
193 return LLVMBuildNot(bld->builder, a, "");
194 }
195
196 if(LLVMIsConstant(a))
197 if (type.floating)
198 return LLVMConstFSub(bld->one, a);
199 else
200 return LLVMConstSub(bld->one, a);
201 else
202 if (type.floating)
203 return LLVMBuildFSub(bld->builder, bld->one, a, "");
204 else
205 return LLVMBuildSub(bld->builder, bld->one, a, "");
206 }
207
208
209 /**
210 * Generate a + b
211 */
212 LLVMValueRef
213 lp_build_add(struct lp_build_context *bld,
214 LLVMValueRef a,
215 LLVMValueRef b)
216 {
217 const struct lp_type type = bld->type;
218 LLVMValueRef res;
219
220 assert(lp_check_value(type, a));
221 assert(lp_check_value(type, b));
222
223 if(a == bld->zero)
224 return b;
225 if(b == bld->zero)
226 return a;
227 if(a == bld->undef || b == bld->undef)
228 return bld->undef;
229
230 if(bld->type.norm) {
231 const char *intrinsic = NULL;
232
233 if(a == bld->one || b == bld->one)
234 return bld->one;
235
236 if(util_cpu_caps.has_sse2 &&
237 type.width * type.length == 128 &&
238 !type.floating && !type.fixed) {
239 if(type.width == 8)
240 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
241 if(type.width == 16)
242 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
243 }
244
245 if(intrinsic)
246 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
247 }
248
249 if(LLVMIsConstant(a) && LLVMIsConstant(b))
250 if (type.floating)
251 res = LLVMConstFAdd(a, b);
252 else
253 res = LLVMConstAdd(a, b);
254 else
255 if (type.floating)
256 res = LLVMBuildFAdd(bld->builder, a, b, "");
257 else
258 res = LLVMBuildAdd(bld->builder, a, b, "");
259
260 /* clamp to ceiling of 1.0 */
261 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
262 res = lp_build_min_simple(bld, res, bld->one);
263
264 /* XXX clamp to floor of -1 or 0??? */
265
266 return res;
267 }
268
269
270 /** Return the sum of the elements of a */
271 LLVMValueRef
272 lp_build_sum_vector(struct lp_build_context *bld,
273 LLVMValueRef a)
274 {
275 const struct lp_type type = bld->type;
276 LLVMValueRef index, res;
277 unsigned i;
278
279 assert(lp_check_value(type, a));
280
281 if (a == bld->zero)
282 return bld->zero;
283 if (a == bld->undef)
284 return bld->undef;
285 assert(type.length > 1);
286
287 assert(!bld->type.norm);
288
289 index = LLVMConstInt(LLVMInt32Type(), 0, 0);
290 res = LLVMBuildExtractElement(bld->builder, a, index, "");
291
292 for (i = 1; i < type.length; i++) {
293 index = LLVMConstInt(LLVMInt32Type(), i, 0);
294 if (type.floating)
295 res = LLVMBuildFAdd(bld->builder, res,
296 LLVMBuildExtractElement(bld->builder,
297 a, index, ""),
298 "");
299 else
300 res = LLVMBuildAdd(bld->builder, res,
301 LLVMBuildExtractElement(bld->builder,
302 a, index, ""),
303 "");
304 }
305
306 return res;
307 }
308
309
310 /**
311 * Generate a - b
312 */
313 LLVMValueRef
314 lp_build_sub(struct lp_build_context *bld,
315 LLVMValueRef a,
316 LLVMValueRef b)
317 {
318 const struct lp_type type = bld->type;
319 LLVMValueRef res;
320
321 assert(lp_check_value(type, a));
322 assert(lp_check_value(type, b));
323
324 if(b == bld->zero)
325 return a;
326 if(a == bld->undef || b == bld->undef)
327 return bld->undef;
328 if(a == b)
329 return bld->zero;
330
331 if(bld->type.norm) {
332 const char *intrinsic = NULL;
333
334 if(b == bld->one)
335 return bld->zero;
336
337 if(util_cpu_caps.has_sse2 &&
338 type.width * type.length == 128 &&
339 !type.floating && !type.fixed) {
340 if(type.width == 8)
341 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
342 if(type.width == 16)
343 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
344 }
345
346 if(intrinsic)
347 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
348 }
349
350 if(LLVMIsConstant(a) && LLVMIsConstant(b))
351 if (type.floating)
352 res = LLVMConstFSub(a, b);
353 else
354 res = LLVMConstSub(a, b);
355 else
356 if (type.floating)
357 res = LLVMBuildFSub(bld->builder, a, b, "");
358 else
359 res = LLVMBuildSub(bld->builder, a, b, "");
360
361 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
362 res = lp_build_max_simple(bld, res, bld->zero);
363
364 return res;
365 }
366
367
368 /**
369 * Normalized 8bit multiplication.
370 *
371 * - alpha plus one
372 *
373 * makes the following approximation to the division (Sree)
374 *
375 * a*b/255 ~= (a*(b + 1)) >> 8
376 *
377 * which is the fastest method that satisfies the following OpenGL criteria
378 *
379 * 0*0 = 0 and 255*255 = 255
380 *
381 * - geometric series
382 *
383 * takes the geometric series approximation to the division
384 *
385 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) + ...
386 *
387 * in this case just the first two terms to fit in 16bit arithmetic
388 *
389 * t/255 ~= (t + (t >> 8)) >> 8
390 *
391 * note that by itself it doesn't satisfy the OpenGL criteria, as it gives
392 * 255*255 = 254, so the special case b = 255 must be accounted for, or
393 * rounding must be used
394 *
395 * - geometric series plus rounding
396 *
397 * when using a geometric series division instead of truncating the result
398 * use roundoff in the approximation (Jim Blinn)
399 *
400 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
401 *
402 * achieving exact results
403 *
404 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
405 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
406 * @sa Michael Herf, The "double blend trick", May 2000,
407 * http://www.stereopsis.com/doubleblend.html
408 */
409 static LLVMValueRef
410 lp_build_mul_u8n(LLVMBuilderRef builder,
411 struct lp_type i16_type,
412 LLVMValueRef a, LLVMValueRef b)
413 {
414 LLVMValueRef c8;
415 LLVMValueRef ab;
416
417 assert(!i16_type.floating);
418 assert(lp_check_value(i16_type, a));
419 assert(lp_check_value(i16_type, b));
420
421 c8 = lp_build_const_int_vec(i16_type, 8);
422
423 #if 0
424
425 /* a*b/255 ~= (a*(b + 1)) >> 8 */
426 b = LLVMBuildAdd(builder, b, lp_build_const_int_vec(i16_type, 1), "");
427 ab = LLVMBuildMul(builder, a, b, "");
428
429 #else
430
431 /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */
432 ab = LLVMBuildMul(builder, a, b, "");
433 ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c8, ""), "");
434 ab = LLVMBuildAdd(builder, ab, lp_build_const_int_vec(i16_type, 0x80), "");
435
436 #endif
437
438 ab = LLVMBuildLShr(builder, ab, c8, "");
439
440 return ab;
441 }
442
443
444 /**
445 * Generate a * b
446 */
447 LLVMValueRef
448 lp_build_mul(struct lp_build_context *bld,
449 LLVMValueRef a,
450 LLVMValueRef b)
451 {
452 const struct lp_type type = bld->type;
453 LLVMValueRef shift;
454 LLVMValueRef res;
455
456 assert(lp_check_value(type, a));
457 assert(lp_check_value(type, b));
458
459 if(a == bld->zero)
460 return bld->zero;
461 if(a == bld->one)
462 return b;
463 if(b == bld->zero)
464 return bld->zero;
465 if(b == bld->one)
466 return a;
467 if(a == bld->undef || b == bld->undef)
468 return bld->undef;
469
470 if(!type.floating && !type.fixed && type.norm) {
471 if(type.width == 8) {
472 struct lp_type i16_type = lp_wider_type(type);
473 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
474
475 lp_build_unpack2(bld->builder, type, i16_type, a, &al, &ah);
476 lp_build_unpack2(bld->builder, type, i16_type, b, &bl, &bh);
477
478 /* PMULLW, PSRLW, PADDW */
479 abl = lp_build_mul_u8n(bld->builder, i16_type, al, bl);
480 abh = lp_build_mul_u8n(bld->builder, i16_type, ah, bh);
481
482 ab = lp_build_pack2(bld->builder, i16_type, type, abl, abh);
483
484 return ab;
485 }
486
487 /* FIXME */
488 assert(0);
489 }
490
491 if(type.fixed)
492 shift = lp_build_const_int_vec(type, type.width/2);
493 else
494 shift = NULL;
495
496 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
497 if (type.floating)
498 res = LLVMConstFMul(a, b);
499 else
500 res = LLVMConstMul(a, b);
501 if(shift) {
502 if(type.sign)
503 res = LLVMConstAShr(res, shift);
504 else
505 res = LLVMConstLShr(res, shift);
506 }
507 }
508 else {
509 if (type.floating)
510 res = LLVMBuildFMul(bld->builder, a, b, "");
511 else
512 res = LLVMBuildMul(bld->builder, a, b, "");
513 if(shift) {
514 if(type.sign)
515 res = LLVMBuildAShr(bld->builder, res, shift, "");
516 else
517 res = LLVMBuildLShr(bld->builder, res, shift, "");
518 }
519 }
520
521 return res;
522 }
523
524
525 /**
526 * Small vector x scale multiplication optimization.
527 */
528 LLVMValueRef
529 lp_build_mul_imm(struct lp_build_context *bld,
530 LLVMValueRef a,
531 int b)
532 {
533 LLVMValueRef factor;
534
535 assert(lp_check_value(bld->type, a));
536
537 if(b == 0)
538 return bld->zero;
539
540 if(b == 1)
541 return a;
542
543 if(b == -1)
544 return lp_build_negate(bld, a);
545
546 if(b == 2 && bld->type.floating)
547 return lp_build_add(bld, a, a);
548
549 if(util_is_pot(b)) {
550 unsigned shift = ffs(b) - 1;
551
552 if(bld->type.floating) {
553 #if 0
554 /*
555 * Power of two multiplication by directly manipulating the mantissa.
556 *
557 * XXX: This might not be always faster, it will introduce a small error
558 * for multiplication by zero, and it will produce wrong results
559 * for Inf and NaN.
560 */
561 unsigned mantissa = lp_mantissa(bld->type);
562 factor = lp_build_const_int_vec(bld->type, (unsigned long long)shift << mantissa);
563 a = LLVMBuildBitCast(bld->builder, a, lp_build_int_vec_type(bld->type), "");
564 a = LLVMBuildAdd(bld->builder, a, factor, "");
565 a = LLVMBuildBitCast(bld->builder, a, lp_build_vec_type(bld->type), "");
566 return a;
567 #endif
568 }
569 else {
570 factor = lp_build_const_vec(bld->type, shift);
571 return LLVMBuildShl(bld->builder, a, factor, "");
572 }
573 }
574
575 factor = lp_build_const_vec(bld->type, (double)b);
576 return lp_build_mul(bld, a, factor);
577 }
578
579
580 /**
581 * Generate a / b
582 */
583 LLVMValueRef
584 lp_build_div(struct lp_build_context *bld,
585 LLVMValueRef a,
586 LLVMValueRef b)
587 {
588 const struct lp_type type = bld->type;
589
590 assert(lp_check_value(type, a));
591 assert(lp_check_value(type, b));
592
593 if(a == bld->zero)
594 return bld->zero;
595 if(a == bld->one)
596 return lp_build_rcp(bld, b);
597 if(b == bld->zero)
598 return bld->undef;
599 if(b == bld->one)
600 return a;
601 if(a == bld->undef || b == bld->undef)
602 return bld->undef;
603
604 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
605 if (type.floating)
606 return LLVMConstFDiv(a, b);
607 else if (type.sign)
608 return LLVMConstSDiv(a, b);
609 else
610 return LLVMConstUDiv(a, b);
611 }
612
613 if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
614 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
615
616 if (type.floating)
617 return LLVMBuildFDiv(bld->builder, a, b, "");
618 else if (type.sign)
619 return LLVMBuildSDiv(bld->builder, a, b, "");
620 else
621 return LLVMBuildUDiv(bld->builder, a, b, "");
622 }
623
624
625 /**
626 * Linear interpolation.
627 *
628 * This also works for integer values with a few caveats.
629 *
630 * @sa http://www.stereopsis.com/doubleblend.html
631 */
632 LLVMValueRef
633 lp_build_lerp(struct lp_build_context *bld,
634 LLVMValueRef x,
635 LLVMValueRef v0,
636 LLVMValueRef v1)
637 {
638 LLVMValueRef delta;
639 LLVMValueRef res;
640
641 assert(lp_check_value(bld->type, x));
642 assert(lp_check_value(bld->type, v0));
643 assert(lp_check_value(bld->type, v1));
644
645 delta = lp_build_sub(bld, v1, v0);
646
647 res = lp_build_mul(bld, x, delta);
648
649 res = lp_build_add(bld, v0, res);
650
651 if(bld->type.fixed)
652 /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
653 * but it will be wrong for other uses. Basically we need a more
654 * powerful lp_type, capable of further distinguishing the values
655 * interpretation from the value storage. */
656 res = LLVMBuildAnd(bld->builder, res, lp_build_const_int_vec(bld->type, (1 << bld->type.width/2) - 1), "");
657
658 return res;
659 }
660
661
662 LLVMValueRef
663 lp_build_lerp_2d(struct lp_build_context *bld,
664 LLVMValueRef x,
665 LLVMValueRef y,
666 LLVMValueRef v00,
667 LLVMValueRef v01,
668 LLVMValueRef v10,
669 LLVMValueRef v11)
670 {
671 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01);
672 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11);
673 return lp_build_lerp(bld, y, v0, v1);
674 }
675
676
677 /**
678 * Generate min(a, b)
679 * Do checks for special cases.
680 */
681 LLVMValueRef
682 lp_build_min(struct lp_build_context *bld,
683 LLVMValueRef a,
684 LLVMValueRef b)
685 {
686 assert(lp_check_value(bld->type, a));
687 assert(lp_check_value(bld->type, b));
688
689 if(a == bld->undef || b == bld->undef)
690 return bld->undef;
691
692 if(a == b)
693 return a;
694
695 if(bld->type.norm) {
696 if(a == bld->zero || b == bld->zero)
697 return bld->zero;
698 if(a == bld->one)
699 return b;
700 if(b == bld->one)
701 return a;
702 }
703
704 return lp_build_min_simple(bld, a, b);
705 }
706
707
708 /**
709 * Generate max(a, b)
710 * Do checks for special cases.
711 */
712 LLVMValueRef
713 lp_build_max(struct lp_build_context *bld,
714 LLVMValueRef a,
715 LLVMValueRef b)
716 {
717 assert(lp_check_value(bld->type, a));
718 assert(lp_check_value(bld->type, b));
719
720 if(a == bld->undef || b == bld->undef)
721 return bld->undef;
722
723 if(a == b)
724 return a;
725
726 if(bld->type.norm) {
727 if(a == bld->one || b == bld->one)
728 return bld->one;
729 if(a == bld->zero)
730 return b;
731 if(b == bld->zero)
732 return a;
733 }
734
735 return lp_build_max_simple(bld, a, b);
736 }
737
738
739 /**
740 * Generate clamp(a, min, max)
741 * Do checks for special cases.
742 */
743 LLVMValueRef
744 lp_build_clamp(struct lp_build_context *bld,
745 LLVMValueRef a,
746 LLVMValueRef min,
747 LLVMValueRef max)
748 {
749 assert(lp_check_value(bld->type, a));
750 assert(lp_check_value(bld->type, min));
751 assert(lp_check_value(bld->type, max));
752
753 a = lp_build_min(bld, a, max);
754 a = lp_build_max(bld, a, min);
755 return a;
756 }
757
758
759 /**
760 * Generate abs(a)
761 */
762 LLVMValueRef
763 lp_build_abs(struct lp_build_context *bld,
764 LLVMValueRef a)
765 {
766 const struct lp_type type = bld->type;
767 LLVMTypeRef vec_type = lp_build_vec_type(type);
768
769 assert(lp_check_value(type, a));
770
771 if(!type.sign)
772 return a;
773
774 if(type.floating) {
775 /* Mask out the sign bit */
776 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
777 unsigned long long absMask = ~(1ULL << (type.width - 1));
778 LLVMValueRef mask = lp_build_const_int_vec(type, ((unsigned long long) absMask));
779 a = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
780 a = LLVMBuildAnd(bld->builder, a, mask, "");
781 a = LLVMBuildBitCast(bld->builder, a, vec_type, "");
782 return a;
783 }
784
785 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
786 switch(type.width) {
787 case 8:
788 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
789 case 16:
790 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
791 case 32:
792 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
793 }
794 }
795
796 return lp_build_max(bld, a, LLVMBuildNeg(bld->builder, a, ""));
797 }
798
799
800 LLVMValueRef
801 lp_build_negate(struct lp_build_context *bld,
802 LLVMValueRef a)
803 {
804 assert(lp_check_value(bld->type, a));
805
806 #if HAVE_LLVM >= 0x0207
807 if (bld->type.floating)
808 a = LLVMBuildFNeg(bld->builder, a, "");
809 else
810 #endif
811 a = LLVMBuildNeg(bld->builder, a, "");
812
813 return a;
814 }
815
816
817 /** Return -1, 0 or +1 depending on the sign of a */
818 LLVMValueRef
819 lp_build_sgn(struct lp_build_context *bld,
820 LLVMValueRef a)
821 {
822 const struct lp_type type = bld->type;
823 LLVMValueRef cond;
824 LLVMValueRef res;
825
826 assert(lp_check_value(type, a));
827
828 /* Handle non-zero case */
829 if(!type.sign) {
830 /* if not zero then sign must be positive */
831 res = bld->one;
832 }
833 else if(type.floating) {
834 LLVMTypeRef vec_type;
835 LLVMTypeRef int_type;
836 LLVMValueRef mask;
837 LLVMValueRef sign;
838 LLVMValueRef one;
839 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
840
841 int_type = lp_build_int_vec_type(type);
842 vec_type = lp_build_vec_type(type);
843 mask = lp_build_const_int_vec(type, maskBit);
844
845 /* Take the sign bit and add it to 1 constant */
846 sign = LLVMBuildBitCast(bld->builder, a, int_type, "");
847 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
848 one = LLVMConstBitCast(bld->one, int_type);
849 res = LLVMBuildOr(bld->builder, sign, one, "");
850 res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
851 }
852 else
853 {
854 LLVMValueRef minus_one = lp_build_const_vec(type, -1.0);
855 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
856 res = lp_build_select(bld, cond, bld->one, minus_one);
857 }
858
859 /* Handle zero */
860 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
861 res = lp_build_select(bld, cond, bld->zero, res);
862
863 return res;
864 }
865
866
867 /**
868 * Set the sign of float vector 'a' according to 'sign'.
869 * If sign==0, return abs(a).
870 * If sign==1, return -abs(a);
871 * Other values for sign produce undefined results.
872 */
873 LLVMValueRef
874 lp_build_set_sign(struct lp_build_context *bld,
875 LLVMValueRef a, LLVMValueRef sign)
876 {
877 const struct lp_type type = bld->type;
878 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
879 LLVMTypeRef vec_type = lp_build_vec_type(type);
880 LLVMValueRef shift = lp_build_const_int_vec(type, type.width - 1);
881 LLVMValueRef mask = lp_build_const_int_vec(type,
882 ~((unsigned long long) 1 << (type.width - 1)));
883 LLVMValueRef val, res;
884
885 assert(type.floating);
886 assert(lp_check_value(type, a));
887
888 /* val = reinterpret_cast<int>(a) */
889 val = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
890 /* val = val & mask */
891 val = LLVMBuildAnd(bld->builder, val, mask, "");
892 /* sign = sign << shift */
893 sign = LLVMBuildShl(bld->builder, sign, shift, "");
894 /* res = val | sign */
895 res = LLVMBuildOr(bld->builder, val, sign, "");
896 /* res = reinterpret_cast<float>(res) */
897 res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
898
899 return res;
900 }
901
902
903 /**
904 * Convert vector of (or scalar) int to vector of (or scalar) float.
905 */
906 LLVMValueRef
907 lp_build_int_to_float(struct lp_build_context *bld,
908 LLVMValueRef a)
909 {
910 const struct lp_type type = bld->type;
911 LLVMTypeRef vec_type = lp_build_vec_type(type);
912
913 assert(type.floating);
914
915 return LLVMBuildSIToFP(bld->builder, a, vec_type, "");
916 }
917
918
919
920 enum lp_build_round_sse41_mode
921 {
922 LP_BUILD_ROUND_SSE41_NEAREST = 0,
923 LP_BUILD_ROUND_SSE41_FLOOR = 1,
924 LP_BUILD_ROUND_SSE41_CEIL = 2,
925 LP_BUILD_ROUND_SSE41_TRUNCATE = 3
926 };
927
928
929 static INLINE LLVMValueRef
930 lp_build_round_sse41(struct lp_build_context *bld,
931 LLVMValueRef a,
932 enum lp_build_round_sse41_mode mode)
933 {
934 const struct lp_type type = bld->type;
935 LLVMTypeRef vec_type = lp_build_vec_type(type);
936 const char *intrinsic;
937
938 assert(type.floating);
939 assert(type.width*type.length == 128);
940 assert(lp_check_value(type, a));
941 assert(util_cpu_caps.has_sse4_1);
942
943 switch(type.width) {
944 case 32:
945 intrinsic = "llvm.x86.sse41.round.ps";
946 break;
947 case 64:
948 intrinsic = "llvm.x86.sse41.round.pd";
949 break;
950 default:
951 assert(0);
952 return bld->undef;
953 }
954
955 return lp_build_intrinsic_binary(bld->builder, intrinsic, vec_type, a,
956 LLVMConstInt(LLVMInt32Type(), mode, 0));
957 }
958
959
960 /**
961 * Return the integer part of a float (vector) value. The returned value is
962 * a float (vector).
963 * Ex: trunc(-1.5) = -1.0
964 */
965 LLVMValueRef
966 lp_build_trunc(struct lp_build_context *bld,
967 LLVMValueRef a)
968 {
969 const struct lp_type type = bld->type;
970
971 assert(type.floating);
972 assert(lp_check_value(type, a));
973
974 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
975 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
976 else {
977 LLVMTypeRef vec_type = lp_build_vec_type(type);
978 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
979 LLVMValueRef res;
980 res = LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
981 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
982 return res;
983 }
984 }
985
986
987 /**
988 * Return float (vector) rounded to nearest integer (vector). The returned
989 * value is a float (vector).
990 * Ex: round(0.9) = 1.0
991 * Ex: round(-1.5) = -2.0
992 */
993 LLVMValueRef
994 lp_build_round(struct lp_build_context *bld,
995 LLVMValueRef a)
996 {
997 const struct lp_type type = bld->type;
998
999 assert(type.floating);
1000 assert(lp_check_value(type, a));
1001
1002 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
1003 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
1004 else {
1005 LLVMTypeRef vec_type = lp_build_vec_type(type);
1006 LLVMValueRef res;
1007 res = lp_build_iround(bld, a);
1008 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
1009 return res;
1010 }
1011 }
1012
1013
1014 /**
1015 * Return floor of float (vector), result is a float (vector)
1016 * Ex: floor(1.1) = 1.0
1017 * Ex: floor(-1.1) = -2.0
1018 */
1019 LLVMValueRef
1020 lp_build_floor(struct lp_build_context *bld,
1021 LLVMValueRef a)
1022 {
1023 const struct lp_type type = bld->type;
1024
1025 assert(type.floating);
1026 assert(lp_check_value(type, a));
1027
1028 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
1029 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
1030 else {
1031 LLVMTypeRef vec_type = lp_build_vec_type(type);
1032 LLVMValueRef res;
1033 res = lp_build_ifloor(bld, a);
1034 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
1035 return res;
1036 }
1037 }
1038
1039
1040 /**
1041 * Return ceiling of float (vector), returning float (vector).
1042 * Ex: ceil( 1.1) = 2.0
1043 * Ex: ceil(-1.1) = -1.0
1044 */
1045 LLVMValueRef
1046 lp_build_ceil(struct lp_build_context *bld,
1047 LLVMValueRef a)
1048 {
1049 const struct lp_type type = bld->type;
1050
1051 assert(type.floating);
1052 assert(lp_check_value(type, a));
1053
1054 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
1055 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
1056 else {
1057 LLVMTypeRef vec_type = lp_build_vec_type(type);
1058 LLVMValueRef res;
1059 res = lp_build_iceil(bld, a);
1060 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
1061 return res;
1062 }
1063 }
1064
1065
1066 /**
1067 * Return fractional part of 'a' computed as a - floor(a)
1068 * Typically used in texture coord arithmetic.
1069 */
1070 LLVMValueRef
1071 lp_build_fract(struct lp_build_context *bld,
1072 LLVMValueRef a)
1073 {
1074 assert(bld->type.floating);
1075 return lp_build_sub(bld, a, lp_build_floor(bld, a));
1076 }
1077
1078
1079 /**
1080 * Return the integer part of a float (vector) value. The returned value is
1081 * an integer (vector).
1082 * Ex: itrunc(-1.5) = -1
1083 */
1084 LLVMValueRef
1085 lp_build_itrunc(struct lp_build_context *bld,
1086 LLVMValueRef a)
1087 {
1088 const struct lp_type type = bld->type;
1089 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1090
1091 assert(type.floating);
1092 assert(lp_check_value(type, a));
1093
1094 return LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
1095 }
1096
1097
1098 /**
1099 * Return float (vector) rounded to nearest integer (vector). The returned
1100 * value is an integer (vector).
1101 * Ex: iround(0.9) = 1
1102 * Ex: iround(-1.5) = -2
1103 */
1104 LLVMValueRef
1105 lp_build_iround(struct lp_build_context *bld,
1106 LLVMValueRef a)
1107 {
1108 const struct lp_type type = bld->type;
1109 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1110 LLVMValueRef res;
1111
1112 assert(type.floating);
1113
1114 assert(lp_check_value(type, a));
1115
1116 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
1117 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
1118 }
1119 else {
1120 LLVMTypeRef vec_type = lp_build_vec_type(type);
1121 LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
1122 LLVMValueRef sign;
1123 LLVMValueRef half;
1124
1125 /* get sign bit */
1126 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
1127 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
1128
1129 /* sign * 0.5 */
1130 half = lp_build_const_vec(type, 0.5);
1131 half = LLVMBuildBitCast(bld->builder, half, int_vec_type, "");
1132 half = LLVMBuildOr(bld->builder, sign, half, "");
1133 half = LLVMBuildBitCast(bld->builder, half, vec_type, "");
1134
1135 res = LLVMBuildFAdd(bld->builder, a, half, "");
1136 }
1137
1138 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
1139
1140 return res;
1141 }
1142
1143
1144 /**
1145 * Return floor of float (vector), result is an int (vector)
1146 * Ex: ifloor(1.1) = 1
1147 * Ex: ifloor(-1.1) = -2
1148 */
1149 LLVMValueRef
1150 lp_build_ifloor(struct lp_build_context *bld,
1151 LLVMValueRef a)
1152 {
1153 const struct lp_type type = bld->type;
1154 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1155 LLVMValueRef res;
1156
1157 assert(type.floating);
1158 assert(lp_check_value(type, a));
1159
1160 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
1161 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
1162 }
1163 else {
1164 /* Take the sign bit and add it to 1 constant */
1165 LLVMTypeRef vec_type = lp_build_vec_type(type);
1166 unsigned mantissa = lp_mantissa(type);
1167 LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
1168 LLVMValueRef sign;
1169 LLVMValueRef offset;
1170
1171 /* sign = a < 0 ? ~0 : 0 */
1172 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
1173 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
1174 sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "ifloor.sign");
1175
1176 /* offset = -0.99999(9)f */
1177 offset = lp_build_const_vec(type, -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
1178 offset = LLVMConstBitCast(offset, int_vec_type);
1179
1180 /* offset = a < 0 ? offset : 0.0f */
1181 offset = LLVMBuildAnd(bld->builder, offset, sign, "");
1182 offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "ifloor.offset");
1183
1184 res = LLVMBuildFAdd(bld->builder, a, offset, "ifloor.res");
1185 }
1186
1187 /* truncate (round toward zero) */
1188 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "ifloor.res");
1189
1190 return res;
1191 }
1192
1193
1194 /**
1195 * Return ceiling of float (vector), returning int (vector).
1196 * Ex: iceil( 1.1) = 2
1197 * Ex: iceil(-1.1) = -1
1198 */
1199 LLVMValueRef
1200 lp_build_iceil(struct lp_build_context *bld,
1201 LLVMValueRef a)
1202 {
1203 const struct lp_type type = bld->type;
1204 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1205 LLVMValueRef res;
1206
1207 assert(type.floating);
1208 assert(lp_check_value(type, a));
1209
1210 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
1211 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
1212 }
1213 else {
1214 LLVMTypeRef vec_type = lp_build_vec_type(type);
1215 unsigned mantissa = lp_mantissa(type);
1216 LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
1217 LLVMValueRef sign;
1218 LLVMValueRef offset;
1219
1220 /* sign = a < 0 ? 0 : ~0 */
1221 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
1222 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
1223 sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "iceil.sign");
1224 sign = LLVMBuildNot(bld->builder, sign, "iceil.not");
1225
1226 /* offset = 0.99999(9)f */
1227 offset = lp_build_const_vec(type, (double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
1228 offset = LLVMConstBitCast(offset, int_vec_type);
1229
1230 /* offset = a < 0 ? 0.0 : offset */
1231 offset = LLVMBuildAnd(bld->builder, offset, sign, "");
1232 offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "iceil.offset");
1233
1234 res = LLVMBuildFAdd(bld->builder, a, offset, "iceil.res");
1235 }
1236
1237 /* truncate (round toward zero) */
1238 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "iceil.res");
1239
1240 return res;
1241 }
1242
1243
1244 LLVMValueRef
1245 lp_build_sqrt(struct lp_build_context *bld,
1246 LLVMValueRef a)
1247 {
1248 const struct lp_type type = bld->type;
1249 LLVMTypeRef vec_type = lp_build_vec_type(type);
1250 char intrinsic[32];
1251
1252 assert(lp_check_value(type, a));
1253
1254 /* TODO: optimize the constant case */
1256
1257 assert(type.floating);
1258 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
1259
1260 return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
1261 }
1262
1263
1264 /**
1265 * Do one Newton-Raphson step to improve reciprocal precision:
1266 *
1267 * x_{i+1} = x_i * (2 - a * x_i)
1268 *
1269 * See also:
1270 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
1271 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
1272 */
1273 static INLINE LLVMValueRef
1274 lp_build_rcp_refine(struct lp_build_context *bld,
1275 LLVMValueRef a,
1276 LLVMValueRef rcp_a)
1277 {
1278 LLVMValueRef two = lp_build_const_vec(bld->type, 2.0);
1279 LLVMValueRef res;
1280
1281 res = LLVMBuildFMul(bld->builder, a, rcp_a, "");
1282 res = LLVMBuildFSub(bld->builder, two, res, "");
1283 res = LLVMBuildFMul(bld->builder, rcp_a, res, "");
1284
1285 return res;
1286 }
1287
1288
1289 LLVMValueRef
1290 lp_build_rcp(struct lp_build_context *bld,
1291 LLVMValueRef a)
1292 {
1293 const struct lp_type type = bld->type;
1294
1295 assert(lp_check_value(type, a));
1296
1297 if(a == bld->zero)
1298 return bld->undef;
1299 if(a == bld->one)
1300 return bld->one;
1301 if(a == bld->undef)
1302 return bld->undef;
1303
1304 assert(type.floating);
1305
1306 if(LLVMIsConstant(a))
1307 return LLVMConstFDiv(bld->one, a);
1308
1309 if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
1310 LLVMValueRef res;
1311 unsigned i;
1312
1313 res = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", bld->vec_type, a);
1314
1315 for (i = 0; i < RCP_NEWTON_STEPS; ++i) {
1316 res = lp_build_rcp_refine(bld, a, res);
1317 }
1318
1319 return res;
1320 }
1321
1322 return LLVMBuildFDiv(bld->builder, bld->one, a, "");
1323 }
1324
1325
1326 /**
1327 * Do one Newton-Raphson step to improve rsqrt precision:
1328 *
1329 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
1330 *
1331 * See also:
1332 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
1333 */
1334 static INLINE LLVMValueRef
1335 lp_build_rsqrt_refine(struct lp_build_context *bld,
1336 LLVMValueRef a,
1337 LLVMValueRef rsqrt_a)
1338 {
1339 LLVMValueRef half = lp_build_const_vec(bld->type, 0.5);
1340 LLVMValueRef three = lp_build_const_vec(bld->type, 3.0);
1341 LLVMValueRef res;
1342
1343 res = LLVMBuildFMul(bld->builder, rsqrt_a, rsqrt_a, "");
1344 res = LLVMBuildFMul(bld->builder, a, res, "");
1345 res = LLVMBuildFSub(bld->builder, three, res, "");
1346 res = LLVMBuildFMul(bld->builder, rsqrt_a, res, "");
1347 res = LLVMBuildFMul(bld->builder, half, res, "");
1348
1349 return res;
1350 }
1351
1352
1353 /**
1354 * Generate 1/sqrt(a)
1355 */
1356 LLVMValueRef
1357 lp_build_rsqrt(struct lp_build_context *bld,
1358 LLVMValueRef a)
1359 {
1360 const struct lp_type type = bld->type;
1361
1362 assert(lp_check_value(type, a));
1363
1364 assert(type.floating);
1365
1366 if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
1367 LLVMValueRef res;
1368 unsigned i;
1369
1370 res = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", bld->vec_type, a);
1371
1372 for (i = 0; i < RSQRT_NEWTON_STEPS; ++i) {
1373 res = lp_build_rsqrt_refine(bld, a, res);
1374 }
1375
1376 return res;
1377 }
1378
1379 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
1380 }
1381
1382
1383 static inline LLVMValueRef
1384 lp_build_const_v4si(unsigned long value)
1385 {
1386 LLVMValueRef element = LLVMConstInt(LLVMInt32Type(), value, 0);
1387 LLVMValueRef elements[4] = { element, element, element, element };
1388 return LLVMConstVector(elements, 4);
1389 }
1390
1391 static inline LLVMValueRef
1392 lp_build_const_v4sf(float value)
1393 {
1394 LLVMValueRef element = LLVMConstReal(LLVMFloatType(), value);
1395 LLVMValueRef elements[4] = { element, element, element, element };
1396 return LLVMConstVector(elements, 4);
1397 }
1398
1399
1400 /**
1401 * Generate sin(a) using SSE2
1402 */
1403 LLVMValueRef
1404 lp_build_sin(struct lp_build_context *bld,
1405 LLVMValueRef a)
1406 {
1407 struct lp_type int_type = lp_int_type(bld->type);
1408 LLVMBuilderRef b = bld->builder;
1409 LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatType(), 4);
1410 LLVMTypeRef v4si = LLVMVectorType(LLVMInt32Type(), 4);
1411
1412 /*
1413 * take the absolute value,
1414 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
1415 */
1416
1417 LLVMValueRef inv_sig_mask = lp_build_const_v4si(~0x80000000);
1418 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, v4si, "a_v4si");
1419
1420 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
1421 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, v4sf, "x_abs");
1422
1423 /*
1424 * extract the sign bit (upper one)
1425 * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
1426 */
1427 LLVMValueRef sig_mask = lp_build_const_v4si(0x80000000);
1428 LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i");
1429
1430 /*
1431 * scale by 4/Pi
1432 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
1433 */
1434
1435 LLVMValueRef FOPi = lp_build_const_v4sf(1.27323954473516);
1436 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
1437
1438 /*
1439 * store the integer part of y in mm0
1440 * emm2 = _mm_cvttps_epi32(y);
1441 */
1442
1443 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, v4si, "emm2_i");
1444
1445 /*
1446 * j=(j+1) & (~1) (see the cephes sources)
1447 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
1448 */
1449
1450 LLVMValueRef all_one = lp_build_const_v4si(1);
1451 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
1452 /*
1453 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
1454 */
1455 LLVMValueRef inv_one = lp_build_const_v4si(~1);
1456 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
1457
1458 /*
1459 * y = _mm_cvtepi32_ps(emm2);
1460 */
1461 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, v4sf, "y_2");
1462
1463 /* get the swap sign flag
1464 * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
1465 */
1466 LLVMValueRef pi32_4 = lp_build_const_v4si(4);
1467 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and");
1468
1469 /*
1470 * emm2 = _mm_slli_epi32(emm0, 29);
1471 */
1472 LLVMValueRef const_29 = lp_build_const_v4si(29);
1473 LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit");
1474
1475 /*
1476 * get the polynomial selection mask
1477 * there is one polynomial for 0 <= x <= Pi/4
1478 * and another one for Pi/4 < x <= Pi/2
1479 * Both branches will be computed.
1480 *
1481 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
1482 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
1483 */
1484
1485 LLVMValueRef pi32_2 = lp_build_const_v4si(2);
1486 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3");
1487 LLVMValueRef poly_mask = lp_build_compare(b, int_type, PIPE_FUNC_EQUAL,
1488 emm2_3, lp_build_const_v4si(0));
1489 /*
1490 * sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
1491 */
1492 LLVMValueRef sign_bit_1 = LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit");
1493
1494 /*
1495 * _PS_CONST(minus_cephes_DP1, -0.78515625);
1496 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
1497 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
1498 */
1499 LLVMValueRef DP1 = lp_build_const_v4sf(-0.78515625);
1500 LLVMValueRef DP2 = lp_build_const_v4sf(-2.4187564849853515625e-4);
1501 LLVMValueRef DP3 = lp_build_const_v4sf(-3.77489497744594108e-8);
1502
1503 /*
1504 * The magic pass: "Extended precision modular arithmetic"
1505 * x = ((x - y * DP1) - y * DP2) - y * DP3;
1506 * xmm1 = _mm_mul_ps(y, xmm1);
1507 * xmm2 = _mm_mul_ps(y, xmm2);
1508 * xmm3 = _mm_mul_ps(y, xmm3);
1509 */
1510 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
1511 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
1512 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
1513
1514 /*
1515 * x = _mm_add_ps(x, xmm1);
1516 * x = _mm_add_ps(x, xmm2);
1517 * x = _mm_add_ps(x, xmm3);
1518 */
1519
1520 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
1521 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
1522 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
1523
1524 /*
1525 * Evaluate the first polynomial (0 <= x <= Pi/4)
1526 *
1527 * z = _mm_mul_ps(x,x);
1528 */
1529 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
1530
1531 /*
1532 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
1533 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
1534 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
1535 */
1536 LLVMValueRef coscof_p0 = lp_build_const_v4sf(2.443315711809948E-005);
1537 LLVMValueRef coscof_p1 = lp_build_const_v4sf(-1.388731625493765E-003);
1538 LLVMValueRef coscof_p2 = lp_build_const_v4sf(4.166664568298827E-002);
1539
1540 /*
1541 * y = *(v4sf*)_ps_coscof_p0;
1542 * y = _mm_mul_ps(y, z);
1543 */
1544 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
1545 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
1546 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
1547 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
1548 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
1549 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
1550
1551
1552 /*
1553 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
1554 * y = _mm_sub_ps(y, tmp);
1555 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
1556 */
1557 LLVMValueRef half = lp_build_const_v4sf(0.5);
1558 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
1559 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
1560 LLVMValueRef one = lp_build_const_v4sf(1.0);
1561 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
1562
1563 /*
1564 * _PS_CONST(sincof_p0, -1.9515295891E-4);
1565 * _PS_CONST(sincof_p1, 8.3321608736E-3);
1566 * _PS_CONST(sincof_p2, -1.6666654611E-1);
1567 */
1568 LLVMValueRef sincof_p0 = lp_build_const_v4sf(-1.9515295891E-4);
1569 LLVMValueRef sincof_p1 = lp_build_const_v4sf(8.3321608736E-3);
1570 LLVMValueRef sincof_p2 = lp_build_const_v4sf(-1.6666654611E-1);
1571
1572 /*
1573 * Evaluate the second polynomial (Pi/4 <= x <= Pi/2)
1574 *
1575 * y2 = *(v4sf*)_ps_sincof_p0;
1576 * y2 = _mm_mul_ps(y2, z);
1577 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
1578 * y2 = _mm_mul_ps(y2, z);
1579 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
1580 * y2 = _mm_mul_ps(y2, z);
1581 * y2 = _mm_mul_ps(y2, x);
1582 * y2 = _mm_add_ps(y2, x);
1583 */
1584
1585 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
1586 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
1587 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
1588 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
1589 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
1590 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
1591 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
1592
1593 /*
1594 * select the correct result from the two polynomials
1595 * xmm3 = poly_mask;
1596 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
1597 * y = _mm_andnot_ps(xmm3, y);
1598 * y = _mm_add_ps(y,y2);
1599 */
1600 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, v4si, "y2_i");
1601 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, v4si, "y_i");
1602 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
1603 LLVMValueRef inv = lp_build_const_v4si(~0);
1604 LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
1605 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
1606 LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
1607
1608 /*
1609 * update the sign
1610 * y = _mm_xor_ps(y, sign_bit);
1611 */
1612 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin");
1613 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, v4sf, "y_result");
1614 return y_result;
1615 }
1616
1617
1618 /**
1619 * Generate cos(a) using SSE2
1620 */
1621 LLVMValueRef
1622 lp_build_cos(struct lp_build_context *bld,
1623 LLVMValueRef a)
1624 {
1625 struct lp_type int_type = lp_int_type(bld->type);
1626 LLVMBuilderRef b = bld->builder;
1627 LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatType(), 4);
1628 LLVMTypeRef v4si = LLVMVectorType(LLVMInt32Type(), 4);
1629
1630 /*
1631 * take the absolute value,
1632 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
1633 */
1634
1635 LLVMValueRef inv_sig_mask = lp_build_const_v4si(~0x80000000);
1636 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, v4si, "a_v4si");
1637
1638 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
1639 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, v4sf, "x_abs");
1640
1641 /*
1642 * scale by 4/Pi
1643 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
1644 */
1645
1646 LLVMValueRef FOPi = lp_build_const_v4sf(1.27323954473516);
1647 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
1648
1649 /*
1650 * store the integer part of y in mm0
1651 * emm2 = _mm_cvttps_epi32(y);
1652 */
1653
1654 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, v4si, "emm2_i");
1655
1656 /*
1657 * j=(j+1) & (~1) (see the cephes sources)
1658 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
1659 */
1660
1661 LLVMValueRef all_one = lp_build_const_v4si(1);
1662 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
1663 /*
1664 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
1665 */
1666 LLVMValueRef inv_one = lp_build_const_v4si(~1);
1667 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
1668
1669 /*
1670 * y = _mm_cvtepi32_ps(emm2);
1671 */
1672 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, v4sf, "y_2");
1673
1674
1675 /*
1676 * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
1677 */
1678 LLVMValueRef const_2 = lp_build_const_v4si(2);
1679 LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2");
1680
1681
1682 /* get the swap sign flag
1683 * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
1684 */
1685 LLVMValueRef inv = lp_build_const_v4si(~0);
1686 LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not");
1687 LLVMValueRef pi32_4 = lp_build_const_v4si(4);
1688 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and");
1689
1690 /*
1691 * emm2 = _mm_slli_epi32(emm0, 29);
1692 */
1693 LLVMValueRef const_29 = lp_build_const_v4si(29);
1694 LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit");
1695
1696 /*
1697 * get the polynomial selection mask
1698 * there is one polynomial for 0 <= x <= Pi/4
1699 * and another one for Pi/4 < x <= Pi/2
1700 * Both branches will be computed.
1701 *
1702 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
1703 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
1704 */
1705
1706 LLVMValueRef pi32_2 = lp_build_const_v4si(2);
1707 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3");
1708 LLVMValueRef poly_mask = lp_build_compare(b, int_type, PIPE_FUNC_EQUAL,
1709 emm2_3, lp_build_const_v4si(0));
1710
1711 /*
1712 * _PS_CONST(minus_cephes_DP1, -0.78515625);
1713 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
1714 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
1715 */
1716 LLVMValueRef DP1 = lp_build_const_v4sf(-0.78515625);
1717 LLVMValueRef DP2 = lp_build_const_v4sf(-2.4187564849853515625e-4);
1718 LLVMValueRef DP3 = lp_build_const_v4sf(-3.77489497744594108e-8);
1719
1720 /*
1721 * The magic pass: "Extended precision modular arithmetic"
1722 * x = ((x - y * DP1) - y * DP2) - y * DP3;
1723 * xmm1 = _mm_mul_ps(y, xmm1);
1724 * xmm2 = _mm_mul_ps(y, xmm2);
1725 * xmm3 = _mm_mul_ps(y, xmm3);
1726 */
1727 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
1728 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
1729 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
1730
1731 /*
1732 * x = _mm_add_ps(x, xmm1);
1733 * x = _mm_add_ps(x, xmm2);
1734 * x = _mm_add_ps(x, xmm3);
1735 */
1736
1737 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
1738 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
1739 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
1740
1741 /*
1742 * Evaluate the first polynomial (0 <= x <= Pi/4)
1743 *
1744 * z = _mm_mul_ps(x,x);
1745 */
1746 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
1747
1748 /*
1749 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
1750 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
1751 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
1752 */
1753 LLVMValueRef coscof_p0 = lp_build_const_v4sf(2.443315711809948E-005);
1754 LLVMValueRef coscof_p1 = lp_build_const_v4sf(-1.388731625493765E-003);
1755 LLVMValueRef coscof_p2 = lp_build_const_v4sf(4.166664568298827E-002);
1756
1757 /*
1758 * y = *(v4sf*)_ps_coscof_p0;
1759 * y = _mm_mul_ps(y, z);
1760 */
1761 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
1762 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
1763 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
1764 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
1765 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
1766 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
1767
1768
1769 /*
1770 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
1771 * y = _mm_sub_ps(y, tmp);
1772 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
1773 */
1774 LLVMValueRef half = lp_build_const_v4sf(0.5);
1775 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
1776 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
1777 LLVMValueRef one = lp_build_const_v4sf(1.0);
1778 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
1779
1780 /*
1781 * _PS_CONST(sincof_p0, -1.9515295891E-4);
1782 * _PS_CONST(sincof_p1, 8.3321608736E-3);
1783 * _PS_CONST(sincof_p2, -1.6666654611E-1);
1784 */
1785 LLVMValueRef sincof_p0 = lp_build_const_v4sf(-1.9515295891E-4);
1786 LLVMValueRef sincof_p1 = lp_build_const_v4sf(8.3321608736E-3);
1787 LLVMValueRef sincof_p2 = lp_build_const_v4sf(-1.6666654611E-1);
1788
1789 /*
1790 * Evaluate the second polynomial (Pi/4 <= x <= Pi/2)
1791 *
1792 * y2 = *(v4sf*)_ps_sincof_p0;
1793 * y2 = _mm_mul_ps(y2, z);
1794 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
1795 * y2 = _mm_mul_ps(y2, z);
1796 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
1797 * y2 = _mm_mul_ps(y2, z);
1798 * y2 = _mm_mul_ps(y2, x);
1799 * y2 = _mm_add_ps(y2, x);
1800 */
1801
1802 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
1803 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
1804 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
1805 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
1806 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
1807 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
1808 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
1809
1810 /*
1811 * select the correct result from the two polynomials
1812 * xmm3 = poly_mask;
1813 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
1814 * y = _mm_andnot_ps(xmm3, y);
1815 * y = _mm_add_ps(y,y2);
1816 */
1817 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, v4si, "y2_i");
1818 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, v4si, "y_i");
1819 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
1820 LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
1821 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
1822 LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
1823
1824 /*
1825 * update the sign
1826 * y = _mm_xor_ps(y, sign_bit);
1827 */
1828 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sin");
1829 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, v4sf, "y_result");
1830 return y_result;
1831 }
1832
1833
1834 /**
1835 * Generate pow(x, y)
1836 */
1837 LLVMValueRef
1838 lp_build_pow(struct lp_build_context *bld,
1839 LLVMValueRef x,
1840 LLVMValueRef y)
1841 {
1842 /* TODO: optimize the constant case */
1843 if(LLVMIsConstant(x) && LLVMIsConstant(y))
1844 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1845 __FUNCTION__);
1846
1847 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
1848 }
1849
1850
1851 /**
1852 * Generate exp(x)
1853 */
1854 LLVMValueRef
1855 lp_build_exp(struct lp_build_context *bld,
1856 LLVMValueRef x)
1857 {
1858 /* log2(e) = 1/log(2) */
1859 LLVMValueRef log2e = lp_build_const_vec(bld->type, 1.4426950408889634);
1860
1861 assert(lp_check_value(bld->type, x));
1862
1863 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
1864 }
1865
1866
1867 /**
1868 * Generate log(x)
1869 */
1870 LLVMValueRef
1871 lp_build_log(struct lp_build_context *bld,
1872 LLVMValueRef x)
1873 {
1874 /* log(2) */
1875 LLVMValueRef log2 = lp_build_const_vec(bld->type, 0.69314718055994529);
1876
1877 assert(lp_check_value(bld->type, x));
1878
1879 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
1880 }
1881
1882
1883 /**
1884 * Generate polynomial.
1885 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
1886 */
1887 static LLVMValueRef
1888 lp_build_polynomial(struct lp_build_context *bld,
1889 LLVMValueRef x,
1890 const double *coeffs,
1891 unsigned num_coeffs)
1892 {
1893 const struct lp_type type = bld->type;
1894 LLVMValueRef res = NULL;
1895 unsigned i;
1896
1897 assert(lp_check_value(bld->type, x));
1898
1899 /* TODO: optimize the constant case */
1900 if(LLVMIsConstant(x))
1901 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1902 __FUNCTION__);
1903
1904 for (i = num_coeffs; i--; ) {
1905 LLVMValueRef coeff;
1906
1907 coeff = lp_build_const_vec(type, coeffs[i]);
1908
1909 if(res)
1910 res = lp_build_add(bld, coeff, lp_build_mul(bld, x, res));
1911 else
1912 res = coeff;
1913 }
1914
1915 if(res)
1916 return res;
1917 else
1918 return bld->undef;
1919 }
1920
1921
1922 /**
1923 * Minimax polynomial fit of 2**x, in range [0, 1[
1924 */
1925 const double lp_build_exp2_polynomial[] = {
1926 #if EXP_POLY_DEGREE == 5
1927 0.999999999690134838155,
1928 0.583974334321735217258,
1929 0.164553105719676828492,
1930 0.0292811063701710962255,
1931 0.00354944426657875141846,
1932 0.000296253726543423377365
1933 #elif EXP_POLY_DEGREE == 4
1934 1.00000001502262084505,
1935 0.563586057338685991394,
1936 0.150436017652442413623,
1937 0.0243220604213317927308,
1938 0.0025359088446580436489
1939 #elif EXP_POLY_DEGREE == 3
1940 0.999925218562710312959,
1941 0.695833540494823811697,
1942 0.226067155427249155588,
1943 0.0780245226406372992967
1944 #elif EXP_POLY_DEGREE == 2
1945 1.00172476321474503578,
1946 0.657636275736077639316,
1947 0.33718943461968720704
1948 #else
1949 #error
1950 #endif
1951 };
1952
1953
1954 void
1955 lp_build_exp2_approx(struct lp_build_context *bld,
1956 LLVMValueRef x,
1957 LLVMValueRef *p_exp2_int_part,
1958 LLVMValueRef *p_frac_part,
1959 LLVMValueRef *p_exp2)
1960 {
1961 const struct lp_type type = bld->type;
1962 LLVMTypeRef vec_type = lp_build_vec_type(type);
1963 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1964 LLVMValueRef ipart = NULL;
1965 LLVMValueRef fpart = NULL;
1966 LLVMValueRef expipart = NULL;
1967 LLVMValueRef expfpart = NULL;
1968 LLVMValueRef res = NULL;
1969
1970 assert(lp_check_value(bld->type, x));
1971
1972 if(p_exp2_int_part || p_frac_part || p_exp2) {
1973 /* TODO: optimize the constant case */
1974 if(LLVMIsConstant(x))
1975 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1976 __FUNCTION__);
1977
1978 assert(type.floating && type.width == 32);
1979
1980 x = lp_build_min(bld, x, lp_build_const_vec(type, 129.0));
1981 x = lp_build_max(bld, x, lp_build_const_vec(type, -126.99999));
1982
1983 /* ipart = floor(x) */
1984 ipart = lp_build_floor(bld, x);
1985
1986 /* fpart = x - ipart */
1987 fpart = LLVMBuildFSub(bld->builder, x, ipart, "");
1988 }
1989
1990 if(p_exp2_int_part || p_exp2) {
1991 /* expipart = (float) (1 << ipart) */
1992 ipart = LLVMBuildFPToSI(bld->builder, ipart, int_vec_type, "");
1993 expipart = LLVMBuildAdd(bld->builder, ipart, lp_build_const_int_vec(type, 127), "");
1994 expipart = LLVMBuildShl(bld->builder, expipart, lp_build_const_int_vec(type, 23), "");
1995 expipart = LLVMBuildBitCast(bld->builder, expipart, vec_type, "");
1996 }
1997
1998 if(p_exp2) {
1999 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
2000 Elements(lp_build_exp2_polynomial));
2001
2002 res = LLVMBuildFMul(bld->builder, expipart, expfpart, "");
2003 }
2004
2005 if(p_exp2_int_part)
2006 *p_exp2_int_part = expipart;
2007
2008 if(p_frac_part)
2009 *p_frac_part = fpart;
2010
2011 if(p_exp2)
2012 *p_exp2 = res;
2013 }
2014
2015
2016 LLVMValueRef
2017 lp_build_exp2(struct lp_build_context *bld,
2018 LLVMValueRef x)
2019 {
2020 LLVMValueRef res;
2021 lp_build_exp2_approx(bld, x, NULL, NULL, &res);
2022 return res;
2023 }
2024
2025
2026 /**
2027 * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
2028 * These coefficients can be generated with
2029 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
2030 */
2031 const double lp_build_log2_polynomial[] = {
2032 #if LOG_POLY_DEGREE == 6
2033 3.11578814719469302614,
2034 -3.32419399085241980044,
2035 2.59883907202499966007,
2036 -1.23152682416275988241,
2037 0.318212422185251071475,
2038 -0.0344359067839062357313
2039 #elif LOG_POLY_DEGREE == 5
2040 2.8882704548164776201,
2041 -2.52074962577807006663,
2042 1.48116647521213171641,
2043 -0.465725644288844778798,
2044 0.0596515482674574969533
2045 #elif LOG_POLY_DEGREE == 4
2046 2.61761038894603480148,
2047 -1.75647175389045657003,
2048 0.688243882994381274313,
2049 -0.107254423828329604454
2050 #elif LOG_POLY_DEGREE == 3
2051 2.28330284476918490682,
2052 -1.04913055217340124191,
2053 0.204446009836232697516
2054 #else
2055 #error
2056 #endif
2057 };
2058
2059
2060 /**
2061 * See http://www.devmaster.net/forums/showthread.php?p=43580
2062 */
2063 void
2064 lp_build_log2_approx(struct lp_build_context *bld,
2065 LLVMValueRef x,
2066 LLVMValueRef *p_exp,
2067 LLVMValueRef *p_floor_log2,
2068 LLVMValueRef *p_log2)
2069 {
2070 const struct lp_type type = bld->type;
2071 LLVMTypeRef vec_type = lp_build_vec_type(type);
2072 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
2073
2074 LLVMValueRef expmask = lp_build_const_int_vec(type, 0x7f800000);
2075 LLVMValueRef mantmask = lp_build_const_int_vec(type, 0x007fffff);
2076 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
2077
2078 LLVMValueRef i = NULL;
2079 LLVMValueRef exp = NULL;
2080 LLVMValueRef mant = NULL;
2081 LLVMValueRef logexp = NULL;
2082 LLVMValueRef logmant = NULL;
2083 LLVMValueRef res = NULL;
2084
2085 assert(lp_check_value(bld->type, x));
2086
2087 if(p_exp || p_floor_log2 || p_log2) {
2088 /* TODO: optimize the constant case */
2089 if(LLVMIsConstant(x))
2090 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2091 __FUNCTION__);
2092
2093 assert(type.floating && type.width == 32);
2094
2095 i = LLVMBuildBitCast(bld->builder, x, int_vec_type, "");
2096
2097 /* exp = (float) exponent(x) */
2098 exp = LLVMBuildAnd(bld->builder, i, expmask, "");
2099 }
2100
2101 if(p_floor_log2 || p_log2) {
2102 logexp = LLVMBuildLShr(bld->builder, exp, lp_build_const_int_vec(type, 23), "");
2103 logexp = LLVMBuildSub(bld->builder, logexp, lp_build_const_int_vec(type, 127), "");
2104 logexp = LLVMBuildSIToFP(bld->builder, logexp, vec_type, "");
2105 }
2106
2107 if(p_log2) {
2108 /* mant = (float) mantissa(x) */
2109 mant = LLVMBuildAnd(bld->builder, i, mantmask, "");
2110 mant = LLVMBuildOr(bld->builder, mant, one, "");
2111 mant = LLVMBuildBitCast(bld->builder, mant, vec_type, "");
2112
2113 logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
2114 Elements(lp_build_log2_polynomial));
2115
2116 /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0 */
2117 logmant = LLVMBuildFMul(bld->builder, logmant, LLVMBuildFSub(bld->builder, mant, bld->one, ""), "");
2118
2119 res = LLVMBuildFAdd(bld->builder, logmant, logexp, "");
2120 }
2121
2122 if(p_exp) {
2123 exp = LLVMBuildBitCast(bld->builder, exp, vec_type, "");
2124 *p_exp = exp;
2125 }
2126
2127 if(p_floor_log2)
2128 *p_floor_log2 = logexp;
2129
2130 if(p_log2)
2131 *p_log2 = res;
2132 }
2133
2134
2135 LLVMValueRef
2136 lp_build_log2(struct lp_build_context *bld,
2137 LLVMValueRef x)
2138 {
2139 LLVMValueRef res;
2140 lp_build_log2_approx(bld, x, NULL, NULL, &res);
2141 return res;
2142 }