llvmpipe: Don't assume vector is 4 wide in lp_build_sin()/lp_build_cos()
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expression simplification here. The reasons are:
39 * - it is very easy, given that we have all the necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in the [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include "util/u_memory.h"
49 #include "util/u_debug.h"
50 #include "util/u_math.h"
51 #include "util/u_string.h"
52 #include "util/u_cpu_detect.h"
53
54 #include "lp_bld_type.h"
55 #include "lp_bld_const.h"
56 #include "lp_bld_init.h"
57 #include "lp_bld_intr.h"
58 #include "lp_bld_logic.h"
59 #include "lp_bld_pack.h"
60 #include "lp_bld_debug.h"
61 #include "lp_bld_arit.h"
62
63
64 #define EXP_POLY_DEGREE 5
65
66 #define LOG_POLY_DEGREE 5
67
68
69 /**
70 * Generate min(a, b)
71 * No checks for the special-case values a or b = 0 or 1 are performed.
72 */
73 static LLVMValueRef
74 lp_build_min_simple(struct lp_build_context *bld,
75 LLVMValueRef a,
76 LLVMValueRef b)
77 {
78 LLVMBuilderRef builder = bld->gallivm->builder;
79 const struct lp_type type = bld->type;
80 const char *intrinsic = NULL;
81 LLVMValueRef cond;
82
83 assert(lp_check_value(type, a));
84 assert(lp_check_value(type, b));
85
86 /* TODO: optimize the constant case */
87
88 if(type.width * type.length == 128) {
89 if(type.floating) {
90 if(type.width == 32 && util_cpu_caps.has_sse)
91 intrinsic = "llvm.x86.sse.min.ps";
92 if(type.width == 64 && util_cpu_caps.has_sse2)
93 intrinsic = "llvm.x86.sse2.min.pd";
94 }
95 else {
96 if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
97 intrinsic = "llvm.x86.sse2.pminu.b";
98 if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
99 intrinsic = "llvm.x86.sse41.pminsb";
100 if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
101 intrinsic = "llvm.x86.sse41.pminuw";
102 if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
103 intrinsic = "llvm.x86.sse2.pmins.w";
104 if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
105 intrinsic = "llvm.x86.sse41.pminud";
106 if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
107 intrinsic = "llvm.x86.sse41.pminsd";
108 }
109 }
110
111 if(intrinsic)
112 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
113
114 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
115 return lp_build_select(bld, cond, a, b);
116 }
117
118
119 /**
120 * Generate max(a, b)
121 * No checks for the special-case values a or b = 0 or 1 are performed.
122 */
123 static LLVMValueRef
124 lp_build_max_simple(struct lp_build_context *bld,
125 LLVMValueRef a,
126 LLVMValueRef b)
127 {
128 LLVMBuilderRef builder = bld->gallivm->builder;
129 const struct lp_type type = bld->type;
130 const char *intrinsic = NULL;
131 LLVMValueRef cond;
132
133 assert(lp_check_value(type, a));
134 assert(lp_check_value(type, b));
135
136 /* TODO: optimize the constant case */
137
138 if(type.width * type.length == 128) {
139 if(type.floating) {
140 if(type.width == 32 && util_cpu_caps.has_sse)
141 intrinsic = "llvm.x86.sse.max.ps";
142 if(type.width == 64 && util_cpu_caps.has_sse2)
143 intrinsic = "llvm.x86.sse2.max.pd";
144 }
145 else {
146 if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
147 intrinsic = "llvm.x86.sse2.pmaxu.b";
148 if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
149 intrinsic = "llvm.x86.sse41.pmaxsb";
150 if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
151 intrinsic = "llvm.x86.sse41.pmaxuw";
152 if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
153 intrinsic = "llvm.x86.sse2.pmaxs.w";
154 if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
155 intrinsic = "llvm.x86.sse41.pmaxud";
156 if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
157 intrinsic = "llvm.x86.sse41.pmaxsd";
158 }
159 }
160
161 if(intrinsic)
162 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
163
164 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
165 return lp_build_select(bld, cond, a, b);
166 }
167
168
169 /**
170 * Generate 1 - a, or ~a depending on bld->type.
171 */
172 LLVMValueRef
173 lp_build_comp(struct lp_build_context *bld,
174 LLVMValueRef a)
175 {
176 LLVMBuilderRef builder = bld->gallivm->builder;
177 const struct lp_type type = bld->type;
178
179 assert(lp_check_value(type, a));
180
181 if(a == bld->one)
182 return bld->zero;
183 if(a == bld->zero)
184 return bld->one;
185
186 if(type.norm && !type.floating && !type.fixed && !type.sign) {
187 if(LLVMIsConstant(a))
188 return LLVMConstNot(a);
189 else
190 return LLVMBuildNot(builder, a, "");
191 }
192
193 if(LLVMIsConstant(a))
194 if (type.floating)
195 return LLVMConstFSub(bld->one, a);
196 else
197 return LLVMConstSub(bld->one, a);
198 else
199 if (type.floating)
200 return LLVMBuildFSub(builder, bld->one, a, "");
201 else
202 return LLVMBuildSub(builder, bld->one, a, "");
203 }
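
/*
 * Worked example of the ~a shortcut above (illustration): for an 8-bit
 * unsigned normalized type, 1.0 is represented as 0xff, so
 * 1.0 - a == 0xff - a == ~a for every a in [0x00, 0xff].
 */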
204
205
206 /**
207 * Generate a + b
208 */
209 LLVMValueRef
210 lp_build_add(struct lp_build_context *bld,
211 LLVMValueRef a,
212 LLVMValueRef b)
213 {
214 LLVMBuilderRef builder = bld->gallivm->builder;
215 const struct lp_type type = bld->type;
216 LLVMValueRef res;
217
218 assert(lp_check_value(type, a));
219 assert(lp_check_value(type, b));
220
221 if(a == bld->zero)
222 return b;
223 if(b == bld->zero)
224 return a;
225 if(a == bld->undef || b == bld->undef)
226 return bld->undef;
227
228 if(bld->type.norm) {
229 const char *intrinsic = NULL;
230
231 if(a == bld->one || b == bld->one)
232 return bld->one;
233
234 if(util_cpu_caps.has_sse2 &&
235 type.width * type.length == 128 &&
236 !type.floating && !type.fixed) {
237 if(type.width == 8)
238 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
239 if(type.width == 16)
240 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
241 }
242
243 if(intrinsic)
244 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
245 }
246
247 if(LLVMIsConstant(a) && LLVMIsConstant(b))
248 if (type.floating)
249 res = LLVMConstFAdd(a, b);
250 else
251 res = LLVMConstAdd(a, b);
252 else
253 if (type.floating)
254 res = LLVMBuildFAdd(builder, a, b, "");
255 else
256 res = LLVMBuildAdd(builder, a, b, "");
257
258 /* clamp to ceiling of 1.0 */
259 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
260 res = lp_build_min_simple(bld, res, bld->one);
261
262 /* XXX clamp to floor of -1 or 0??? */
263
264 return res;
265 }
266
267
268 /** Return the scalar sum of the elements of a */
269 LLVMValueRef
270 lp_build_sum_vector(struct lp_build_context *bld,
271 LLVMValueRef a)
272 {
273 LLVMBuilderRef builder = bld->gallivm->builder;
274 const struct lp_type type = bld->type;
275 LLVMValueRef index, res;
276 unsigned i;
277
278 assert(lp_check_value(type, a));
279
280 if (type.length == 1) {
281 return a;
282 }
283
284 assert(!bld->type.norm);
285
286 index = lp_build_const_int32(bld->gallivm, 0);
287 res = LLVMBuildExtractElement(builder, a, index, "");
288
289 for (i = 1; i < type.length; i++) {
290 index = lp_build_const_int32(bld->gallivm, i);
291 if (type.floating)
292 res = LLVMBuildFAdd(builder, res,
293 LLVMBuildExtractElement(builder,
294 a, index, ""),
295 "");
296 else
297 res = LLVMBuildAdd(builder, res,
298 LLVMBuildExtractElement(builder,
299 a, index, ""),
300 "");
301 }
302
303 return res;
304 }
305
306
307 /**
308 * Generate a - b
309 */
310 LLVMValueRef
311 lp_build_sub(struct lp_build_context *bld,
312 LLVMValueRef a,
313 LLVMValueRef b)
314 {
315 LLVMBuilderRef builder = bld->gallivm->builder;
316 const struct lp_type type = bld->type;
317 LLVMValueRef res;
318
319 assert(lp_check_value(type, a));
320 assert(lp_check_value(type, b));
321
322 if(b == bld->zero)
323 return a;
324 if(a == bld->undef || b == bld->undef)
325 return bld->undef;
326 if(a == b)
327 return bld->zero;
328
329 if(bld->type.norm) {
330 const char *intrinsic = NULL;
331
332 if(b == bld->one)
333 return bld->zero;
334
335 if(util_cpu_caps.has_sse2 &&
336 type.width * type.length == 128 &&
337 !type.floating && !type.fixed) {
338 if(type.width == 8)
339 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
340 if(type.width == 16)
341 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
342 }
343
344 if(intrinsic)
345 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
346 }
347
348 if(LLVMIsConstant(a) && LLVMIsConstant(b))
349 if (type.floating)
350 res = LLVMConstFSub(a, b);
351 else
352 res = LLVMConstSub(a, b);
353 else
354 if (type.floating)
355 res = LLVMBuildFSub(builder, a, b, "");
356 else
357 res = LLVMBuildSub(builder, a, b, "");
358
359 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
360 res = lp_build_max_simple(bld, res, bld->zero);
361
362 return res;
363 }
364
365
366 /**
367 * Normalized 8bit multiplication.
368 *
369 * - alpha plus one
370 *
371 * makes the following approximation to the division (Sree)
372 *
373 * a*b/255 ~= (a*(b + 1)) >> 8
374 *
375 * which is the fastest method that satisfies the following OpenGL criteria
376 *
377 * 0*0 = 0 and 255*255 = 255
378 *
379 * - geometric series
380 *
381 * takes the geometric series approximation to the division
382 *
383 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
384 *
385 * in this case just the first two terms to fit in 16bit arithmetic
386 *
387 * t/255 ~= (t + (t >> 8)) >> 8
388 *
389 * note that just by itself it doesn't satisfy the OpenGL criteria, as
390 * 255*255 yields 254, so the special case b = 255 must be accounted for, or
391 * rounding must be used
392 *
393 * - geometric series plus rounding
394 *
395 * when using the geometric series division, instead of truncating the
396 * result, use rounding in the approximation (Jim Blinn)
397 *
398 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
399 *
400 * achieving exact results
401 *
402 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
403 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
404 * @sa Michael Herf, The "double blend trick", May 2000,
405 * http://www.stereopsis.com/doubleblend.html
406 */
407 static LLVMValueRef
408 lp_build_mul_u8n(struct gallivm_state *gallivm,
409 struct lp_type i16_type,
410 LLVMValueRef a, LLVMValueRef b)
411 {
412 LLVMBuilderRef builder = gallivm->builder;
413 LLVMValueRef c8;
414 LLVMValueRef ab;
415
416 assert(!i16_type.floating);
417 assert(lp_check_value(i16_type, a));
418 assert(lp_check_value(i16_type, b));
419
420 c8 = lp_build_const_int_vec(gallivm, i16_type, 8);
421
422 #if 0
423
424 /* a*b/255 ~= (a*(b + 1)) >> 8 */
425 b = LLVMBuildAdd(builder, b, lp_build_const_int_vec(gallivm, i16_type, 1), "");
426 ab = LLVMBuildMul(builder, a, b, "");
427
428 #else
429
430 /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */
431 ab = LLVMBuildMul(builder, a, b, "");
432 ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c8, ""), "");
433 ab = LLVMBuildAdd(builder, ab, lp_build_const_int_vec(gallivm, i16_type, 0x80), "");
434
435 #endif
436
437 ab = LLVMBuildLShr(builder, ab, c8, "");
438
439 return ab;
440 }
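
/*
 * Scalar reference for the approximations documented above; a sketch for
 * illustration only (mul_u8n_ref is not part of the real code generator;
 * assumes <stdint.h>).
 */
#if 0
static uint8_t
mul_u8n_ref(uint8_t a, uint8_t b)
{
   unsigned t = (unsigned)a * (unsigned)b;
   /* geometric series plus rounding: t/255 ~= (t + (t >> 8) + 0x80) >> 8,
    * e.g. 255*255 -> 255 and 0*0 -> 0, satisfying the OpenGL criteria */
   return (uint8_t)((t + (t >> 8) + 0x80) >> 8);
}
#endif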
441
442
443 /**
444 * Generate a * b
445 */
446 LLVMValueRef
447 lp_build_mul(struct lp_build_context *bld,
448 LLVMValueRef a,
449 LLVMValueRef b)
450 {
451 LLVMBuilderRef builder = bld->gallivm->builder;
452 const struct lp_type type = bld->type;
453 LLVMValueRef shift;
454 LLVMValueRef res;
455
456 assert(lp_check_value(type, a));
457 assert(lp_check_value(type, b));
458
459 if(a == bld->zero)
460 return bld->zero;
461 if(a == bld->one)
462 return b;
463 if(b == bld->zero)
464 return bld->zero;
465 if(b == bld->one)
466 return a;
467 if(a == bld->undef || b == bld->undef)
468 return bld->undef;
469
470 if(!type.floating && !type.fixed && type.norm) {
471 if(type.width == 8) {
472 struct lp_type i16_type = lp_wider_type(type);
473 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
474
475 lp_build_unpack2(bld->gallivm, type, i16_type, a, &al, &ah);
476 lp_build_unpack2(bld->gallivm, type, i16_type, b, &bl, &bh);
477
478 /* PMULLW, PSRLW, PADDW */
479 abl = lp_build_mul_u8n(bld->gallivm, i16_type, al, bl);
480 abh = lp_build_mul_u8n(bld->gallivm, i16_type, ah, bh);
481
482 ab = lp_build_pack2(bld->gallivm, i16_type, type, abl, abh);
483
484 return ab;
485 }
486
487 /* FIXME */
488 assert(0);
489 }
490
491 if(type.fixed)
492 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
493 else
494 shift = NULL;
495
496 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
497 if (type.floating)
498 res = LLVMConstFMul(a, b);
499 else
500 res = LLVMConstMul(a, b);
501 if(shift) {
502 if(type.sign)
503 res = LLVMConstAShr(res, shift);
504 else
505 res = LLVMConstLShr(res, shift);
506 }
507 }
508 else {
509 if (type.floating)
510 res = LLVMBuildFMul(builder, a, b, "");
511 else
512 res = LLVMBuildMul(builder, a, b, "");
513 if(shift) {
514 if(type.sign)
515 res = LLVMBuildAShr(builder, res, shift, "");
516 else
517 res = LLVMBuildLShr(builder, res, shift, "");
518 }
519 }
520
521 return res;
522 }
523
524
525 /**
526 * Small vector x scale multiplication optimization.
527 */
528 LLVMValueRef
529 lp_build_mul_imm(struct lp_build_context *bld,
530 LLVMValueRef a,
531 int b)
532 {
533 LLVMBuilderRef builder = bld->gallivm->builder;
534 LLVMValueRef factor;
535
536 assert(lp_check_value(bld->type, a));
537
538 if(b == 0)
539 return bld->zero;
540
541 if(b == 1)
542 return a;
543
544 if(b == -1)
545 return lp_build_negate(bld, a);
546
547 if(b == 2 && bld->type.floating)
548 return lp_build_add(bld, a, a);
549
550 if(util_is_power_of_two(b)) {
551 unsigned shift = ffs(b) - 1;
552
553 if(bld->type.floating) {
554 #if 0
555 /*
556 * Power of two multiplication by directly manipulating the mantissa.
557 *
558 * XXX: This might not always be faster; it will introduce a small error
559 * for multiplication by zero, and it will produce wrong results
560 * for Inf and NaN.
561 */
562 unsigned mantissa = lp_mantissa(bld->type);
563 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
564 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->gallivm, bld->type), "");
565 a = LLVMBuildAdd(builder, a, factor, "");
566 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
567 return a;
568 #endif
569 }
570 else {
571 factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
572 return LLVMBuildShl(builder, a, factor, "");
573 }
574 }
575
576 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
577 return lp_build_mul(bld, a, factor);
578 }
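
/*
 * Scalar sketch of the disabled mantissa trick in the #if 0 block above:
 * multiplying a finite, non-zero float by 2^shift is the same as adding
 * shift to its biased exponent field. Illustration only (mul_pow2_ref is
 * a hypothetical name; assumes <stdint.h>; wrong for 0.0, Inf and NaN,
 * as noted above).
 */
#if 0
static float
mul_pow2_ref(float a, unsigned shift)
{
   union { float f; uint32_t i; } v;
   v.f = a;
   v.i += (uint32_t)shift << 23;   /* 23 = mantissa bits of a float */
   return v.f;
}
#endif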
579
580
581 /**
582 * Generate a / b
583 */
584 LLVMValueRef
585 lp_build_div(struct lp_build_context *bld,
586 LLVMValueRef a,
587 LLVMValueRef b)
588 {
589 LLVMBuilderRef builder = bld->gallivm->builder;
590 const struct lp_type type = bld->type;
591
592 assert(lp_check_value(type, a));
593 assert(lp_check_value(type, b));
594
595 if(a == bld->zero)
596 return bld->zero;
597 if(a == bld->one)
598 return lp_build_rcp(bld, b);
599 if(b == bld->zero)
600 return bld->undef;
601 if(b == bld->one)
602 return a;
603 if(a == bld->undef || b == bld->undef)
604 return bld->undef;
605
606 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
607 if (type.floating)
608 return LLVMConstFDiv(a, b);
609 else if (type.sign)
610 return LLVMConstSDiv(a, b);
611 else
612 return LLVMConstUDiv(a, b);
613 }
614
615 if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
616 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
617
618 if (type.floating)
619 return LLVMBuildFDiv(builder, a, b, "");
620 else if (type.sign)
621 return LLVMBuildSDiv(builder, a, b, "");
622 else
623 return LLVMBuildUDiv(builder, a, b, "");
624 }
625
626
627 /**
628 * Linear interpolation -- without any checks.
629 *
630 * @sa http://www.stereopsis.com/doubleblend.html
631 */
632 static INLINE LLVMValueRef
633 lp_build_lerp_simple(struct lp_build_context *bld,
634 LLVMValueRef x,
635 LLVMValueRef v0,
636 LLVMValueRef v1)
637 {
638 LLVMBuilderRef builder = bld->gallivm->builder;
639 LLVMValueRef delta;
640 LLVMValueRef res;
641
642 assert(lp_check_value(bld->type, x));
643 assert(lp_check_value(bld->type, v0));
644 assert(lp_check_value(bld->type, v1));
645
646 delta = lp_build_sub(bld, v1, v0);
647
648 res = lp_build_mul(bld, x, delta);
649
650 res = lp_build_add(bld, v0, res);
651
652 if (bld->type.fixed) {
653 /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
654 * but it will be wrong for other uses. Basically we need a more
655 * powerful lp_type, capable of further distinguishing the values
656 * interpretation from the value storage. */
657 res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(bld->gallivm, bld->type, (1 << bld->type.width/2) - 1), "");
658 }
659
660 return res;
661 }
662
663
664 /**
665 * Linear interpolation.
666 */
667 LLVMValueRef
668 lp_build_lerp(struct lp_build_context *bld,
669 LLVMValueRef x,
670 LLVMValueRef v0,
671 LLVMValueRef v1)
672 {
673 LLVMBuilderRef builder = bld->gallivm->builder;
674 const struct lp_type type = bld->type;
675 LLVMValueRef res;
676
677 assert(lp_check_value(type, x));
678 assert(lp_check_value(type, v0));
679 assert(lp_check_value(type, v1));
680
681 if (type.norm) {
682 struct lp_type wide_type;
683 struct lp_build_context wide_bld;
684 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
685 LLVMValueRef shift;
686
687 assert(type.length >= 2);
688 assert(!type.sign);
689
690 /*
691 * Create a wider type, enough to hold the intermediate result of the
692 * multiplication.
693 */
694 memset(&wide_type, 0, sizeof wide_type);
695 wide_type.fixed = TRUE;
696 wide_type.width = type.width*2;
697 wide_type.length = type.length/2;
698
699 lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
700
701 lp_build_unpack2(bld->gallivm, type, wide_type, x, &xl, &xh);
702 lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
703 lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
704
705 /*
706 * Scale x from [0, 255] to [0, 256]
707 */
708
709 shift = lp_build_const_int_vec(bld->gallivm, wide_type, type.width - 1);
710
711 xl = lp_build_add(&wide_bld, xl,
712 LLVMBuildAShr(builder, xl, shift, ""));
713 xh = lp_build_add(&wide_bld, xh,
714 LLVMBuildAShr(builder, xh, shift, ""));
715
716 /*
717 * Lerp both halves.
718 */
719
720 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l);
721 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h);
722
723 res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
724 } else {
725 res = lp_build_lerp_simple(bld, x, v0, v1);
726 }
727
728 return res;
729 }
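
/*
 * Scalar sketch of the normalized lerp path above (lerp_u8_ref is an
 * illustrative name only). Scaling x from [0, 255] to [0, 256] makes
 * x = 255 select exactly v1. Relies on arithmetic >> of negative ints.
 */
#if 0
static uint8_t
lerp_u8_ref(uint8_t x, uint8_t v0, uint8_t v1)
{
   int xs = x + (x >> 7);              /* [0, 255] -> [0, 256] */
   int delta = (int)v1 - (int)v0;
   return (uint8_t)(v0 + ((xs * delta) >> 8));
}
#endif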
730
731
732 LLVMValueRef
733 lp_build_lerp_2d(struct lp_build_context *bld,
734 LLVMValueRef x,
735 LLVMValueRef y,
736 LLVMValueRef v00,
737 LLVMValueRef v01,
738 LLVMValueRef v10,
739 LLVMValueRef v11)
740 {
741 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01);
742 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11);
743 return lp_build_lerp(bld, y, v0, v1);
744 }
745
746
747 /**
748 * Generate min(a, b)
749 * Do checks for special cases.
750 */
751 LLVMValueRef
752 lp_build_min(struct lp_build_context *bld,
753 LLVMValueRef a,
754 LLVMValueRef b)
755 {
756 assert(lp_check_value(bld->type, a));
757 assert(lp_check_value(bld->type, b));
758
759 if(a == bld->undef || b == bld->undef)
760 return bld->undef;
761
762 if(a == b)
763 return a;
764
765 if(bld->type.norm) {
766 if(a == bld->zero || b == bld->zero)
767 return bld->zero;
768 if(a == bld->one)
769 return b;
770 if(b == bld->one)
771 return a;
772 }
773
774 return lp_build_min_simple(bld, a, b);
775 }
776
777
778 /**
779 * Generate max(a, b)
780 * Do checks for special cases.
781 */
782 LLVMValueRef
783 lp_build_max(struct lp_build_context *bld,
784 LLVMValueRef a,
785 LLVMValueRef b)
786 {
787 assert(lp_check_value(bld->type, a));
788 assert(lp_check_value(bld->type, b));
789
790 if(a == bld->undef || b == bld->undef)
791 return bld->undef;
792
793 if(a == b)
794 return a;
795
796 if(bld->type.norm) {
797 if(a == bld->one || b == bld->one)
798 return bld->one;
799 if(a == bld->zero)
800 return b;
801 if(b == bld->zero)
802 return a;
803 }
804
805 return lp_build_max_simple(bld, a, b);
806 }
807
808
809 /**
810 * Generate clamp(a, min, max)
811 * Do checks for special cases.
812 */
813 LLVMValueRef
814 lp_build_clamp(struct lp_build_context *bld,
815 LLVMValueRef a,
816 LLVMValueRef min,
817 LLVMValueRef max)
818 {
819 assert(lp_check_value(bld->type, a));
820 assert(lp_check_value(bld->type, min));
821 assert(lp_check_value(bld->type, max));
822
823 a = lp_build_min(bld, a, max);
824 a = lp_build_max(bld, a, min);
825 return a;
826 }
827
828
829 /**
830 * Generate abs(a)
831 */
832 LLVMValueRef
833 lp_build_abs(struct lp_build_context *bld,
834 LLVMValueRef a)
835 {
836 LLVMBuilderRef builder = bld->gallivm->builder;
837 const struct lp_type type = bld->type;
838 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
839
840 assert(lp_check_value(type, a));
841
842 if(!type.sign)
843 return a;
844
845 if(type.floating) {
846 /* Mask out the sign bit */
847 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
848 unsigned long long absMask = ~(1ULL << (type.width - 1));
849 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
850 a = LLVMBuildBitCast(builder, a, int_vec_type, "");
851 a = LLVMBuildAnd(builder, a, mask, "");
852 a = LLVMBuildBitCast(builder, a, vec_type, "");
853 return a;
854 }
855
856 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
857 switch(type.width) {
858 case 8:
859 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
860 case 16:
861 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
862 case 32:
863 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
864 }
865 }
866
867 return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
868 }
869
870
871 LLVMValueRef
872 lp_build_negate(struct lp_build_context *bld,
873 LLVMValueRef a)
874 {
875 LLVMBuilderRef builder = bld->gallivm->builder;
876
877 assert(lp_check_value(bld->type, a));
878
879 #if HAVE_LLVM >= 0x0207
880 if (bld->type.floating)
881 a = LLVMBuildFNeg(builder, a, "");
882 else
883 #endif
884 a = LLVMBuildNeg(builder, a, "");
885
886 return a;
887 }
888
889
890 /** Return -1, 0 or +1 depending on the sign of a */
891 LLVMValueRef
892 lp_build_sgn(struct lp_build_context *bld,
893 LLVMValueRef a)
894 {
895 LLVMBuilderRef builder = bld->gallivm->builder;
896 const struct lp_type type = bld->type;
897 LLVMValueRef cond;
898 LLVMValueRef res;
899
900 assert(lp_check_value(type, a));
901
902 /* Handle non-zero case */
903 if(!type.sign) {
904 /* if not zero then sign must be positive */
905 res = bld->one;
906 }
907 else if(type.floating) {
908 LLVMTypeRef vec_type;
909 LLVMTypeRef int_type;
910 LLVMValueRef mask;
911 LLVMValueRef sign;
912 LLVMValueRef one;
913 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
914
915 int_type = lp_build_int_vec_type(bld->gallivm, type);
916 vec_type = lp_build_vec_type(bld->gallivm, type);
917 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
918
919 /* Take the sign bit and or it into the constant 1.0 */
920 sign = LLVMBuildBitCast(builder, a, int_type, "");
921 sign = LLVMBuildAnd(builder, sign, mask, "");
922 one = LLVMConstBitCast(bld->one, int_type);
923 res = LLVMBuildOr(builder, sign, one, "");
924 res = LLVMBuildBitCast(builder, res, vec_type, "");
925 }
926 else
927 {
928 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
929 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
930 res = lp_build_select(bld, cond, bld->one, minus_one);
931 }
932
933 /* Handle zero */
934 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
935 res = lp_build_select(bld, cond, bld->zero, res);
936
937 return res;
938 }
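
/*
 * Scalar sketch of the floating-point path above (sgn_ref is an
 * illustrative name; assumes <math.h>): or the sign bit into the constant
 * 1.0, which is copysignf(1.0f, a), then force the zero case to zero.
 */
#if 0
static float
sgn_ref(float a)
{
   return a == 0.0f ? 0.0f : copysignf(1.0f, a);
}
#endif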
939
940
941 /**
942 * Set the sign of float vector 'a' according to 'sign'.
943 * If sign==0, return abs(a).
944 * If sign==1, return -abs(a);
945 * Other values for sign produce undefined results.
946 */
947 LLVMValueRef
948 lp_build_set_sign(struct lp_build_context *bld,
949 LLVMValueRef a, LLVMValueRef sign)
950 {
951 LLVMBuilderRef builder = bld->gallivm->builder;
952 const struct lp_type type = bld->type;
953 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
954 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
955 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
956 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
957 ~((unsigned long long) 1 << (type.width - 1)));
958 LLVMValueRef val, res;
959
960 assert(type.floating);
961 assert(lp_check_value(type, a));
962
963 /* val = reinterpret_cast<int>(a) */
964 val = LLVMBuildBitCast(builder, a, int_vec_type, "");
965 /* val = val & mask */
966 val = LLVMBuildAnd(builder, val, mask, "");
967 /* sign = sign << shift */
968 sign = LLVMBuildShl(builder, sign, shift, "");
969 /* res = val | sign */
970 res = LLVMBuildOr(builder, val, sign, "");
971 /* res = reinterpret_cast<float>(res) */
972 res = LLVMBuildBitCast(builder, res, vec_type, "");
973
974 return res;
975 }
976
977
978 /**
979 * Convert vector of (or scalar) int to vector of (or scalar) float.
980 */
981 LLVMValueRef
982 lp_build_int_to_float(struct lp_build_context *bld,
983 LLVMValueRef a)
984 {
985 LLVMBuilderRef builder = bld->gallivm->builder;
986 const struct lp_type type = bld->type;
987 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
988
989 assert(type.floating);
990
991 return LLVMBuildSIToFP(builder, a, vec_type, "");
992 }
993
994
995
996 enum lp_build_round_sse41_mode
997 {
998 LP_BUILD_ROUND_SSE41_NEAREST = 0,
999 LP_BUILD_ROUND_SSE41_FLOOR = 1,
1000 LP_BUILD_ROUND_SSE41_CEIL = 2,
1001 LP_BUILD_ROUND_SSE41_TRUNCATE = 3
1002 };
1003
1004
1005 /**
1006 * Helper for SSE4.1's ROUNDxx instructions.
1007 *
1008 * NOTE: In SSE4.1's nearest mode, if two values are equally close, the
1009 * result is the even value. That is, rounding 2.5 yields 2.0, not 3.0.
1010 */
1011 static INLINE LLVMValueRef
1012 lp_build_round_sse41(struct lp_build_context *bld,
1013 LLVMValueRef a,
1014 enum lp_build_round_sse41_mode mode)
1015 {
1016 LLVMBuilderRef builder = bld->gallivm->builder;
1017 const struct lp_type type = bld->type;
1018 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1019 const char *intrinsic;
1020 LLVMValueRef res;
1021
1022 assert(type.floating);
1023
1024 assert(lp_check_value(type, a));
1025 assert(util_cpu_caps.has_sse4_1);
1026
1027 if (type.length == 1) {
1028 LLVMTypeRef vec_type;
1029 LLVMValueRef undef;
1030 LLVMValueRef args[3];
1031 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1032
1033 switch(type.width) {
1034 case 32:
1035 intrinsic = "llvm.x86.sse41.round.ss";
1036 break;
1037 case 64:
1038 intrinsic = "llvm.x86.sse41.round.sd";
1039 break;
1040 default:
1041 assert(0);
1042 return bld->undef;
1043 }
1044
1045 vec_type = LLVMVectorType(bld->elem_type, 4);
1046
1047 undef = LLVMGetUndef(vec_type);
1048
1049 args[0] = undef;
1050 args[1] = LLVMBuildInsertElement(builder, undef, a, index0, "");
1051 args[2] = LLVMConstInt(i32t, mode, 0);
1052
1053 res = lp_build_intrinsic(builder, intrinsic,
1054 vec_type, args, Elements(args));
1055
1056 res = LLVMBuildExtractElement(builder, res, index0, "");
1057 }
1058 else {
1059 assert(type.width*type.length == 128);
1060
1061 switch(type.width) {
1062 case 32:
1063 intrinsic = "llvm.x86.sse41.round.ps";
1064 break;
1065 case 64:
1066 intrinsic = "llvm.x86.sse41.round.pd";
1067 break;
1068 default:
1069 assert(0);
1070 return bld->undef;
1071 }
1072
1073 res = lp_build_intrinsic_binary(builder, intrinsic,
1074 bld->vec_type, a,
1075 LLVMConstInt(i32t, mode, 0));
1076 }
1077
1078 return res;
1079 }
1080
1081
1082 static INLINE LLVMValueRef
1083 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1084 LLVMValueRef a)
1085 {
1086 LLVMBuilderRef builder = bld->gallivm->builder;
1087 const struct lp_type type = bld->type;
1088 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1089 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1090 const char *intrinsic;
1091 LLVMValueRef res;
1092
1093 assert(type.floating);
1094 /* using the double precision conversions is a bit more complicated */
1095 assert(type.width == 32);
1096
1097 assert(lp_check_value(type, a));
1098 assert(util_cpu_caps.has_sse2);
1099
1100 /* This is relying on MXCSR rounding mode, which should always be nearest. */
1101 if (type.length == 1) {
1102 LLVMTypeRef vec_type;
1103 LLVMValueRef undef;
1104 LLVMValueRef arg;
1105 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1106
1107 vec_type = LLVMVectorType(bld->elem_type, 4);
1108
1109 intrinsic = "llvm.x86.sse.cvtss2si";
1110
1111 undef = LLVMGetUndef(vec_type);
1112
1113 arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
1114
1115 res = lp_build_intrinsic_unary(builder, intrinsic,
1116 ret_type, arg);
1117 }
1118 else {
1119 assert(type.width*type.length == 128);
1120
1121 intrinsic = "llvm.x86.sse2.cvtps2dq";
1122
1123 res = lp_build_intrinsic_unary(builder, intrinsic,
1124 ret_type, a);
1125 }
1126
1127 return res;
1128 }
1129
1130
1131 /**
1132 * Return the integer part of a float (vector) value (== round toward zero).
1133 * The returned value is a float (vector).
1134 * Ex: trunc(-1.5) = -1.0
1135 */
1136 LLVMValueRef
1137 lp_build_trunc(struct lp_build_context *bld,
1138 LLVMValueRef a)
1139 {
1140 LLVMBuilderRef builder = bld->gallivm->builder;
1141 const struct lp_type type = bld->type;
1142
1143 assert(type.floating);
1144 assert(lp_check_value(type, a));
1145
1146 if (util_cpu_caps.has_sse4_1 &&
1147 (type.length == 1 || type.width*type.length == 128)) {
1148 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
1149 }
1150 else {
1151 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1152 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1153 LLVMValueRef res;
1154 res = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1155 res = LLVMBuildSIToFP(builder, res, vec_type, "");
1156 return res;
1157 }
1158 }
1159
1160
1161 /**
1162 * Return float (vector) rounded to nearest integer (vector). The returned
1163 * value is a float (vector).
1164 * Ex: round(0.9) = 1.0
1165 * Ex: round(-1.5) = -2.0
1166 */
1167 LLVMValueRef
1168 lp_build_round(struct lp_build_context *bld,
1169 LLVMValueRef a)
1170 {
1171 LLVMBuilderRef builder = bld->gallivm->builder;
1172 const struct lp_type type = bld->type;
1173
1174 assert(type.floating);
1175 assert(lp_check_value(type, a));
1176
1177 if (util_cpu_caps.has_sse4_1 &&
1178 (type.length == 1 || type.width*type.length == 128)) {
1179 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
1180 }
1181 else {
1182 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1183 LLVMValueRef res;
1184 res = lp_build_iround(bld, a);
1185 res = LLVMBuildSIToFP(builder, res, vec_type, "");
1186 return res;
1187 }
1188 }
1189
1190
1191 /**
1192 * Return floor of float (vector), result is a float (vector)
1193 * Ex: floor(1.1) = 1.0
1194 * Ex: floor(-1.1) = -2.0
1195 */
1196 LLVMValueRef
1197 lp_build_floor(struct lp_build_context *bld,
1198 LLVMValueRef a)
1199 {
1200 LLVMBuilderRef builder = bld->gallivm->builder;
1201 const struct lp_type type = bld->type;
1202
1203 assert(type.floating);
1204 assert(lp_check_value(type, a));
1205
1206 if (util_cpu_caps.has_sse4_1 &&
1207 (type.length == 1 || type.width*type.length == 128)) {
1208 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
1209 }
1210 else {
1211 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1212 LLVMValueRef res;
1213 res = lp_build_ifloor(bld, a);
1214 res = LLVMBuildSIToFP(builder, res, vec_type, "");
1215 return res;
1216 }
1217 }
1218
1219
1220 /**
1221 * Return ceiling of float (vector), returning float (vector).
1222 * Ex: ceil( 1.1) = 2.0
1223 * Ex: ceil(-1.1) = -1.0
1224 */
1225 LLVMValueRef
1226 lp_build_ceil(struct lp_build_context *bld,
1227 LLVMValueRef a)
1228 {
1229 LLVMBuilderRef builder = bld->gallivm->builder;
1230 const struct lp_type type = bld->type;
1231
1232 assert(type.floating);
1233 assert(lp_check_value(type, a));
1234
1235 if (util_cpu_caps.has_sse4_1 &&
1236 (type.length == 1 || type.width*type.length == 128)) {
1237 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
1238 }
1239 else {
1240 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1241 LLVMValueRef res;
1242 res = lp_build_iceil(bld, a);
1243 res = LLVMBuildSIToFP(builder, res, vec_type, "");
1244 return res;
1245 }
1246 }
1247
1248
1249 /**
1250 * Return fractional part of 'a' computed as a - floor(a)
1251 * Typically used in texture coord arithmetic.
1252 */
1253 LLVMValueRef
1254 lp_build_fract(struct lp_build_context *bld,
1255 LLVMValueRef a)
1256 {
1257 assert(bld->type.floating);
1258 return lp_build_sub(bld, a, lp_build_floor(bld, a));
1259 }
1260
1261
1262 /**
1263 * Return the integer part of a float (vector) value (== round toward zero).
1264 * The returned value is an integer (vector).
1265 * Ex: itrunc(-1.5) = -1
1266 */
1267 LLVMValueRef
1268 lp_build_itrunc(struct lp_build_context *bld,
1269 LLVMValueRef a)
1270 {
1271 LLVMBuilderRef builder = bld->gallivm->builder;
1272 const struct lp_type type = bld->type;
1273 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1274
1275 assert(type.floating);
1276 assert(lp_check_value(type, a));
1277
1278 return LLVMBuildFPToSI(builder, a, int_vec_type, "");
1279 }
1280
1281
1282 /**
1283 * Return float (vector) rounded to nearest integer (vector). The returned
1284 * value is an integer (vector).
1285 * Ex: iround(0.9) = 1
1286 * Ex: iround(-1.5) = -2
1287 */
1288 LLVMValueRef
1289 lp_build_iround(struct lp_build_context *bld,
1290 LLVMValueRef a)
1291 {
1292 LLVMBuilderRef builder = bld->gallivm->builder;
1293 const struct lp_type type = bld->type;
1294 LLVMTypeRef int_vec_type = bld->int_vec_type;
1295 LLVMValueRef res;
1296
1297 assert(type.floating);
1298
1299 assert(lp_check_value(type, a));
1300
1301 if (util_cpu_caps.has_sse2 &&
1302 ((type.width == 32) && (type.length == 1 || type.length == 4))) {
1303 return lp_build_iround_nearest_sse2(bld, a);
1304 }
1305 else if (util_cpu_caps.has_sse4_1 &&
1306 (type.length == 1 || type.width*type.length == 128)) {
1307 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
1308 }
1309 else {
1310 LLVMValueRef half;
1311
1312 half = lp_build_const_vec(bld->gallivm, type, 0.5);
1313
1314 if (type.sign) {
1315 LLVMTypeRef vec_type = bld->vec_type;
1316 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1317 (unsigned long long)1 << (type.width - 1));
1318 LLVMValueRef sign;
1319
1320 /* get sign bit */
1321 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
1322 sign = LLVMBuildAnd(builder, sign, mask, "");
1323
1324 /* sign * 0.5 */
1325 half = LLVMBuildBitCast(builder, half, int_vec_type, "");
1326 half = LLVMBuildOr(builder, sign, half, "");
1327 half = LLVMBuildBitCast(builder, half, vec_type, "");
1328 }
1329
1330 res = LLVMBuildFAdd(builder, a, half, "");
1331 }
1332
1333 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
1334
1335 return res;
1336 }
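
/*
 * Scalar equivalent of the non-SSE fallback above (iround_ref is an
 * illustrative name): add 0.5 carrying the argument's sign, then let the
 * float-to-int conversion truncate toward zero.
 */
#if 0
static int
iround_ref(float a)
{
   return (int)(a + (a >= 0.0f ? 0.5f : -0.5f));
}
#endif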
1337
1338
1339 /**
1340 * Return floor of float (vector), result is an int (vector)
1341 * Ex: ifloor(1.1) = 1
1342 * Ex: ifloor(-1.1) = -2
1343 */
1344 LLVMValueRef
1345 lp_build_ifloor(struct lp_build_context *bld,
1346 LLVMValueRef a)
1347 {
1348 LLVMBuilderRef builder = bld->gallivm->builder;
1349 const struct lp_type type = bld->type;
1350 LLVMTypeRef int_vec_type = bld->int_vec_type;
1351 LLVMValueRef res;
1352
1353 assert(type.floating);
1354 assert(lp_check_value(type, a));
1355
1356 if (util_cpu_caps.has_sse4_1 &&
1357 (type.length == 1 || type.width*type.length == 128)) {
1358 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
1359 }
1360 else {
1361 res = a;
1362
1363 if (type.sign) {
1364 /* Extract and replicate the sign bit of a */
1365 LLVMTypeRef vec_type = bld->vec_type;
1366 unsigned mantissa = lp_mantissa(type);
1367 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1368 (unsigned long long)1 << (type.width - 1));
1369 LLVMValueRef sign;
1370 LLVMValueRef offset;
1371
1372 /* sign = a < 0 ? ~0 : 0 */
1373 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
1374 sign = LLVMBuildAnd(builder, sign, mask, "");
1375 sign = LLVMBuildAShr(builder, sign,
1376 lp_build_const_int_vec(bld->gallivm, type,
1377 type.width - 1),
1378 "ifloor.sign");
1379
1380 /* offset = -0.99999(9)f */
1381 offset = lp_build_const_vec(bld->gallivm, type,
1382 -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
1383 offset = LLVMConstBitCast(offset, int_vec_type);
1384
1385 /* offset = a < 0 ? offset : 0.0f */
1386 offset = LLVMBuildAnd(builder, offset, sign, "");
1387 offset = LLVMBuildBitCast(builder, offset, vec_type, "ifloor.offset");
1388
1389 res = LLVMBuildFAdd(builder, res, offset, "ifloor.res");
1390 }
1391 }
1392
1393 /* truncate (round toward zero) */
1394 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
1395
1396 return res;
1397 }
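
/*
 * Scalar sketch of the fallback path above (ifloor_ref is an illustrative
 * name): biasing negative inputs by an offset just below 1.0 makes the
 * final truncation toward zero behave like floor().
 */
#if 0
static int
ifloor_ref(float a)
{
   if (a < 0.0f)
      a -= 0.99999881f;   /* approximately (2^23 - 10) / 2^23 */
   return (int)a;
}
#endif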
1398
1399
1400 /**
1401 * Return ceiling of float (vector), returning int (vector).
1402 * Ex: iceil( 1.1) = 2
1403 * Ex: iceil(-1.1) = -1
1404 */
1405 LLVMValueRef
1406 lp_build_iceil(struct lp_build_context *bld,
1407 LLVMValueRef a)
1408 {
1409 LLVMBuilderRef builder = bld->gallivm->builder;
1410 const struct lp_type type = bld->type;
1411 LLVMTypeRef int_vec_type = bld->int_vec_type;
1412 LLVMValueRef res;
1413
1414 assert(type.floating);
1415 assert(lp_check_value(type, a));
1416
1417 if (util_cpu_caps.has_sse4_1 &&
1418 (type.length == 1 || type.width*type.length == 128)) {
1419 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
1420 }
1421 else {
1422 LLVMTypeRef vec_type = bld->vec_type;
1423 unsigned mantissa = lp_mantissa(type);
1424 LLVMValueRef offset;
1425
1426 /* offset = 0.99999(9)f */
1427 offset = lp_build_const_vec(bld->gallivm, type,
1428 (double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
1429
1430 if (type.sign) {
1431 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1432 (unsigned long long)1 << (type.width - 1));
1433 LLVMValueRef sign;
1434
1435 /* sign = a < 0 ? 0 : ~0 */
1436 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
1437 sign = LLVMBuildAnd(builder, sign, mask, "");
1438 sign = LLVMBuildAShr(builder, sign,
1439 lp_build_const_int_vec(bld->gallivm, type,
1440 type.width - 1),
1441 "iceil.sign");
1442 sign = LLVMBuildNot(builder, sign, "iceil.not");
1443
1444 /* offset = a < 0 ? 0.0 : offset */
1445 offset = LLVMConstBitCast(offset, int_vec_type);
1446 offset = LLVMBuildAnd(builder, offset, sign, "");
1447 offset = LLVMBuildBitCast(builder, offset, vec_type, "iceil.offset");
1448 }
1449
1450 res = LLVMBuildFAdd(builder, a, offset, "iceil.res");
1451 }
1452
1453 /* truncate (round toward zero) */
1454 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
1455
1456 return res;
1457 }
1458
1459
1460 /**
1461 * Combined ifloor() & fract().
1462 *
1463 * Preferred to calling the functions separately, as it ensures that the
1464 * strategy (floor() vs ifloor()) that results in less redundant work is used.
1465 */
1466 void
1467 lp_build_ifloor_fract(struct lp_build_context *bld,
1468 LLVMValueRef a,
1469 LLVMValueRef *out_ipart,
1470 LLVMValueRef *out_fpart)
1471 {
1472 LLVMBuilderRef builder = bld->gallivm->builder;
1473 const struct lp_type type = bld->type;
1474 LLVMValueRef ipart;
1475
1476 assert(type.floating);
1477 assert(lp_check_value(type, a));
1478
1479 if (util_cpu_caps.has_sse4_1 &&
1480 (type.length == 1 || type.width*type.length == 128)) {
1481 /*
1482 * floor() is easier.
1483 */
1484
1485 ipart = lp_build_floor(bld, a);
1486 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
1487 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
1488 }
1489 else {
1490 /*
1491 * ifloor() is easier.
1492 */
1493
1494 *out_ipart = lp_build_ifloor(bld, a);
1495 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
1496 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
1497 }
1498 }
1499
1500
1501 LLVMValueRef
1502 lp_build_sqrt(struct lp_build_context *bld,
1503 LLVMValueRef a)
1504 {
1505 LLVMBuilderRef builder = bld->gallivm->builder;
1506 const struct lp_type type = bld->type;
1507 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1508 char intrinsic[32];
1509
1510 assert(lp_check_value(type, a));
1511
1512 /* TODO: optimize the constant case */
1514
1515 assert(type.floating);
1516 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
1517
1518 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
1519 }
1520
1521
1522 /**
1523 * Do one Newton-Raphson step to improve reciprocal precision:
1524 *
1525 * x_{i+1} = x_i * (2 - a * x_i)
1526 *
1527 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
1528 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
1529 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
1530 * halo. It would be necessary to clamp the argument to prevent this.
1531 *
1532 * See also:
1533 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
1534 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
1535 */
1536 static INLINE LLVMValueRef
1537 lp_build_rcp_refine(struct lp_build_context *bld,
1538 LLVMValueRef a,
1539 LLVMValueRef rcp_a)
1540 {
1541 LLVMBuilderRef builder = bld->gallivm->builder;
1542 LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
1543 LLVMValueRef res;
1544
1545 res = LLVMBuildFMul(builder, a, rcp_a, "");
1546 res = LLVMBuildFSub(builder, two, res, "");
1547 res = LLVMBuildFMul(builder, rcp_a, res, "");
1548
1549 return res;
1550 }
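
/*
 * Scalar form of the step above (rcp_refine_ref is an illustrative name).
 * Each Newton-Raphson step roughly doubles the number of correct bits, so
 * one step lifts the crude RCPPS estimate close to single precision.
 */
#if 0
static float
rcp_refine_ref(float a, float rcp_a)
{
   return rcp_a * (2.0f - a * rcp_a);
}
#endif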
1551
1552
1553 LLVMValueRef
1554 lp_build_rcp(struct lp_build_context *bld,
1555 LLVMValueRef a)
1556 {
1557 LLVMBuilderRef builder = bld->gallivm->builder;
1558 const struct lp_type type = bld->type;
1559
1560 assert(lp_check_value(type, a));
1561
1562 if(a == bld->zero)
1563 return bld->undef;
1564 if(a == bld->one)
1565 return bld->one;
1566 if(a == bld->undef)
1567 return bld->undef;
1568
1569 assert(type.floating);
1570
1571 if(LLVMIsConstant(a))
1572 return LLVMConstFDiv(bld->one, a);
1573
1574 /*
1575 * We don't use RCPPS because:
1576 * - it only has 10 bits of precision
1577 * - it doesn't even get the reciprocal of 1.0 exactly
1578 * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
1579 * - for recent processors the benefit over DIVPS is marginal, and case
1580 * dependent
1581 *
1582 * We could still use it on certain processors if benchmarks show that
1583 * RCPPS plus the necessary workarounds is still preferable to DIVPS; or for
1584 * particular uses that require fewer workarounds.
1585 */
1586
1587 if (FALSE && util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
1588 const unsigned num_iterations = 0;
1589 LLVMValueRef res;
1590 unsigned i;
1591
1592 res = lp_build_intrinsic_unary(builder, "llvm.x86.sse.rcp.ps", bld->vec_type, a);
1593
1594 for (i = 0; i < num_iterations; ++i) {
1595 res = lp_build_rcp_refine(bld, a, res);
1596 }
1597
1598 return res;
1599 }
1600
1601 return LLVMBuildFDiv(builder, bld->one, a, "");
1602 }
1603
1604
1605 /**
1606 * Do one Newton-Raphson step to improve rsqrt precision:
1607 *
1608 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
1609 *
1610 * See also:
1611 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
1612 */
1613 static INLINE LLVMValueRef
1614 lp_build_rsqrt_refine(struct lp_build_context *bld,
1615 LLVMValueRef a,
1616 LLVMValueRef rsqrt_a)
1617 {
1618 LLVMBuilderRef builder = bld->gallivm->builder;
1619 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
1620 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
1621 LLVMValueRef res;
1622
1623 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
1624 res = LLVMBuildFMul(builder, a, res, "");
1625 res = LLVMBuildFSub(builder, three, res, "");
1626 res = LLVMBuildFMul(builder, rsqrt_a, res, "");
1627 res = LLVMBuildFMul(builder, half, res, "");
1628
1629 return res;
1630 }
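
/*
 * Scalar form of the step above (rsqrt_refine_ref is an illustrative name),
 * the classic Newton-Raphson iteration for 1/sqrt(a).
 */
#if 0
static float
rsqrt_refine_ref(float a, float rsqrt_a)
{
   return 0.5f * rsqrt_a * (3.0f - a * rsqrt_a * rsqrt_a);
}
#endif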
1631
1632
1633 /**
1634 * Generate 1/sqrt(a)
1635 */
1636 LLVMValueRef
1637 lp_build_rsqrt(struct lp_build_context *bld,
1638 LLVMValueRef a)
1639 {
1640 LLVMBuilderRef builder = bld->gallivm->builder;
1641 const struct lp_type type = bld->type;
1642
1643 assert(lp_check_value(type, a));
1644
1645 assert(type.floating);
1646
1647 if (util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
1648 const unsigned num_iterations = 1;
1649 LLVMValueRef res;
1650 unsigned i;
1651
1652 res = lp_build_intrinsic_unary(builder, "llvm.x86.sse.rsqrt.ps", bld->vec_type, a);
1653
1654 for (i = 0; i < num_iterations; ++i) {
1655 res = lp_build_rsqrt_refine(bld, a, res);
1656 }
1657
1658 return res;
1659 }
1660
1661 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
1662 }
1663
1664
1665 /**
1666 * Generate sin(a) using SSE2
1667 */
1668 LLVMValueRef
1669 lp_build_sin(struct lp_build_context *bld,
1670 LLVMValueRef a)
1671 {
1672 struct gallivm_state *gallivm = bld->gallivm;
1673 LLVMBuilderRef builder = gallivm->builder;
1674 struct lp_type int_type = lp_int_type(bld->type);
1675 LLVMBuilderRef b = builder;
1676
1677 /*
1678 * take the absolute value,
1679 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
1680 */
1681
1682 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
1683 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
1684
1685 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
1686 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
1687
1688 /*
1689 * extract the sign bit (upper one)
1690 * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
1691 */
1692 LLVMValueRef sig_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
1693 LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i");
1694
1695 /*
1696 * scale by 4/Pi
1697 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
1698 */
1699
1700 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
1701 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
1702
1703 /*
1704 * store the integer part of y in mm0
1705 * emm2 = _mm_cvttps_epi32(y);
1706 */
1707
1708 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
1709
1710 /*
1711 * j=(j+1) & (~1) (see the cephes sources)
1712 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
1713 */
1714
1715 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
1716 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
1717 /*
1718 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
1719 */
1720 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
1721 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
1722
1723 /*
1724 * y = _mm_cvtepi32_ps(emm2);
1725 */
1726 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
1727
1728 /* get the swap sign flag
1729 * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
1730 */
1731 LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
1732 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and");
1733
1734 /*
1735 * emm2 = _mm_slli_epi32(emm0, 29);
1736 */
1737 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
1738 LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit");
1739
1740 /*
1741 * get the polynomial selection mask
1742 * there is one polynomial for 0 <= x <= Pi/4
1743 * and another one for Pi/4 < x <= Pi/2
1744 * Both branches will be computed.
1745 *
1746 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
1747 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
1748 */
1749
1750 LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
1751 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3");
1752 LLVMValueRef poly_mask = lp_build_compare(gallivm,
1753 int_type, PIPE_FUNC_EQUAL,
1754 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
1755 /*
1756 * sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
1757 */
1758 LLVMValueRef sign_bit_1 = LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit");
1759
1760 /*
1761 * _PS_CONST(minus_cephes_DP1, -0.78515625);
1762 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
1763 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
1764 */
1765 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
1766 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
1767 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
1768
1769 /*
1770 * The magic pass: "Extended precision modular arithmetic"
1771 * x = ((x - y * DP1) - y * DP2) - y * DP3;
1772 * xmm1 = _mm_mul_ps(y, xmm1);
1773 * xmm2 = _mm_mul_ps(y, xmm2);
1774 * xmm3 = _mm_mul_ps(y, xmm3);
1775 */
1776 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
1777 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
1778 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
1779
1780 /*
1781 * x = _mm_add_ps(x, xmm1);
1782 * x = _mm_add_ps(x, xmm2);
1783 * x = _mm_add_ps(x, xmm3);
1784 */
1785
1786 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
1787 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
1788 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
1789
1790 /*
1791 * Evaluate the first polynomial (0 <= x <= Pi/4)
1792 *
1793 * z = _mm_mul_ps(x,x);
1794 */
1795 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
1796
1797 /*
1798 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
1799 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
1800 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
1801 */
1802 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
1803 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
1804 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
1805
1806 /*
1807 * y = *(v4sf*)_ps_coscof_p0;
1808 * y = _mm_mul_ps(y, z);
1809 */
1810 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
1811 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
1812 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
1813 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
1814 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
1815 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
1816
1817
1818 /*
1819 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
1820 * y = _mm_sub_ps(y, tmp);
1821 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
1822 */
1823 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
1824 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
1825 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_9");
1826 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
1827 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_10");
1828
1829 /*
1830 * _PS_CONST(sincof_p0, -1.9515295891E-4);
1831 * _PS_CONST(sincof_p1, 8.3321608736E-3);
1832 * _PS_CONST(sincof_p2, -1.6666654611E-1);
1833 */
1834 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
1835 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
1836 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
1837
1838 /*
1839 * Evaluate the second polynomial (Pi/4 <= x <= Pi/2)
1840 *
1841 * y2 = *(v4sf*)_ps_sincof_p0;
1842 * y2 = _mm_mul_ps(y2, z);
1843 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
1844 * y2 = _mm_mul_ps(y2, z);
1845 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
1846 * y2 = _mm_mul_ps(y2, z);
1847 * y2 = _mm_mul_ps(y2, x);
1848 * y2 = _mm_add_ps(y2, x);
1849 */
1850
1851 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
1852 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
1853 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
1854 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
1855 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
1856 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
1857 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
1858
1859 /*
1860 * select the correct result from the two polynomials
1861 * xmm3 = poly_mask;
1862 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
1863 * y = _mm_andnot_ps(xmm3, y);
1864 * y = _mm_add_ps(y,y2);
1865 */
1866 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
1867 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
1868 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
1869 LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0);
1870 LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
1871 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
1872 LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
1873
1874 /*
1875 * update the sign
1876 * y = _mm_xor_ps(y, sign_bit);
1877 */
1878 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin");
1879 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
1880 return y_result;
1881 }
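
/*
 * A self-contained scalar sketch of the cephes-style reduction implemented
 * above (sin_ref is an illustrative name, not part of the real code; the
 * constants are the same ones built in lp_build_sin(); assumes <math.h>).
 */
#if 0
static float
sin_ref(float x)
{
   int sign = x < 0.0f;
   float ax = fabsf(x);
   float y, xr, z, r;
   int j;

   /* scale by 4/Pi, then j = (j+1) & ~1 as in the cephes sources */
   j = (int)(ax * 1.27323954473516f);
   j = (j + 1) & ~1;
   y = (float)j;

   /* octants 4..7 flip the sign of the result */
   if (j & 4)
      sign = !sign;

   /* extended precision modular arithmetic: xr = ax - j*Pi/4 */
   xr = ((ax - y * 0.78515625f)
         - y * 2.4187564849853515625e-4f)
         - y * 3.77489497744594108e-8f;
   z = xr * xr;

   if (j & 2) {
      /* cosine polynomial, reduced argument in [0, Pi/4] */
      r = ((2.443315711809948e-5f * z - 1.388731625493765e-3f) * z
           + 4.166664568298827e-2f) * z * z - 0.5f * z + 1.0f;
   }
   else {
      /* sine polynomial, reduced argument in [0, Pi/4] */
      r = ((-1.9515295891e-4f * z + 8.3321608736e-3f) * z
           - 1.6666654611e-1f) * z * xr + xr;
   }

   return sign ? -r : r;
}
#endif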
1882
1883
1884 /**
1885 * Generate cos(a) using SSE2
1886 */
1887 LLVMValueRef
1888 lp_build_cos(struct lp_build_context *bld,
1889 LLVMValueRef a)
1890 {
1891 struct gallivm_state *gallivm = bld->gallivm;
1892 LLVMBuilderRef builder = gallivm->builder;
1893 struct lp_type int_type = lp_int_type(bld->type);
1894 LLVMBuilderRef b = builder;
1895
1896 /*
1897 * take the absolute value,
1898 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
1899 */
1900
1901 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
1902 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
1903
1904 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
1905 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
1906
1907 /*
1908 * scale by 4/Pi
1909 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
1910 */
1911
1912 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
1913 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
1914
1915 /*
1916 * store the integer part of y in mm0
1917 * emm2 = _mm_cvttps_epi32(y);
1918 */
1919
1920 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
1921
1922 /*
1923 * j=(j+1) & (~1) (see the cephes sources)
1924 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
1925 */
1926
1927 LLVMValueRef const_1 = lp_build_const_int_vec(gallivm, bld->type, 1);
1928 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, const_1, "emm2_add");
1929 /*
1930 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
1931 */
1932 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
1933 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
1934
1935 /*
1936 * y = _mm_cvtepi32_ps(emm2);
1937 */
1938 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
1939
1940
1941 /*
1942 * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
1943 */
1944 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
1945 LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2");
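/*
 * Compared with lp_build_sin(), the octant index is shifted down by 2
 * here (and the sign flag below is computed with an and-not): this is
 * the cos(x) == sin(x + Pi/2) phase shift, expressed on the index that
 * counts Pi/4-wide octants.
 */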
1946
1947
1948 /* get the swap sign flag
1949 * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
1950 */
1951 LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0);
1952 LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not");
1953 LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
1954 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and");
1955
1956 /*
1957 * emm2 = _mm_slli_epi32(emm0, 29);
1958 */
1959 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
1960 LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit");
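/*
 * The value 4 sits in bit 2, so shifting left by 29 moves it to bit 31,
 * the sign bit of an IEEE-754 single precision float.
 */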
1961
1962 /*
1963 * get the polynomial selection mask
1964 * there is one polynomial for 0 <= x <= Pi/4
1965 * and another one for Pi/4 < x <= Pi/2
1966 * Both branches will be computed.
1967 *
1968 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
1969 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
1970 */
1971
1972 LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
1973 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3");
1974 LLVMValueRef poly_mask = lp_build_compare(gallivm,
1975 int_type, PIPE_FUNC_EQUAL,
1976 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
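/*
 * lp_build_compare() returns a per-lane mask of all ones where the
 * comparison holds and all zeroes elsewhere, regardless of the vector
 * width, so the polynomial selection below is not tied to 4-wide
 * vectors.
 */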
1977
1978 /*
1979 * _PS_CONST(minus_cephes_DP1, -0.78515625);
1980 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
1981 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
1982 */
1983 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
1984 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
1985 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
1986
1987 /*
1988 * The magic pass: "Extended precision modular arithmetic"
1989 * x = ((x - y * DP1) - y * DP2) - y * DP3;
1990 * xmm1 = _mm_mul_ps(y, xmm1);
1991 * xmm2 = _mm_mul_ps(y, xmm2);
1992 * xmm3 = _mm_mul_ps(y, xmm3);
1993 */
1994 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
1995 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
1996 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
1997
1998 /*
1999 * x = _mm_add_ps(x, xmm1);
2000 * x = _mm_add_ps(x, xmm2);
2001 * x = _mm_add_ps(x, xmm3);
2002 */
2003
2004 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
2005 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
2006 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
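/*
 * DP1 + DP2 + DP3 is -Pi/4 split into three constants of decreasing
 * magnitude (Cody-Waite style argument reduction), so each y * DPn
 * product and the running subtraction keep extra effective precision.
 */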
2007
2008 /*
2009 * Evaluate the first polynomial (0 <= x <= Pi/4)
2010 *
2011 * z = _mm_mul_ps(x,x);
2012 */
2013 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2014
2015 /*
2016 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2017 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2018 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2019 */
2020 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2021 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2022 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2023
2024 /*
2025 * y = *(v4sf*)_ps_coscof_p0;
2026 * y = _mm_mul_ps(y, z);
 * y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
 * y = _mm_mul_ps(y, z);
 * y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
 * y = _mm_mul_ps(y, z);
 * y = _mm_mul_ps(y, z);
2027 */
2028 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
2029 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
2030 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
2031 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
2032 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2033 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2034
2035
2036 /*
2037 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2038 * y = _mm_sub_ps(y, tmp);
2039 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2040 */
2041 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2042 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2043 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_9");
2044 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2045 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_10");
2046
2047 /*
2048 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2049 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2050 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2051 */
2052 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2053 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2054 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2055
2056 /*
2057 * Evaluate the second polynomial (0 <= x <= Pi/4)
2058 *
2059 * y2 = *(v4sf*)_ps_sincof_p0;
2060 * y2 = _mm_mul_ps(y2, z);
2061 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2062 * y2 = _mm_mul_ps(y2, z);
2063 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2064 * y2 = _mm_mul_ps(y2, z);
2065 * y2 = _mm_mul_ps(y2, x);
2066 * y2 = _mm_add_ps(y2, x);
2067 */
2068
2069 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
2070 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
2071 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
2072 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
2073 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2074 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
2075 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
2076
2077 /*
2078 * select the correct result from the two polynomials
2079 * xmm3 = poly_mask;
2080 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2081 * y = _mm_andnot_ps(xmm3, y);
2082 * y = _mm_add_ps(y,y2);
2083 */
2084 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2085 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2086 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2087 LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
2088 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2089 LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
2090
2091 /*
2092 * update the sign
2093 * y = _mm_xor_ps(y, sign_bit);
2094 */
2095 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_cos");
2096 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
2097 return y_result;
2098 }
2099
2100
2101 /**
2102 * Generate pow(x, y)
2103 */
2104 LLVMValueRef
2105 lp_build_pow(struct lp_build_context *bld,
2106 LLVMValueRef x,
2107 LLVMValueRef y)
2108 {
2109 /* TODO: optimize the constant case */
2110 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2111 LLVMIsConstant(x) && LLVMIsConstant(y)) {
2112 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2113 __FUNCTION__);
2114 }
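/*
 * The identity used below is pow(x, y) == exp2(y * log2(x)), which is
 * only meaningful for x > 0; as with GLSL's pow(), other inputs give
 * undefined results.  A hypothetical caller computing x^2.2 (e.g. for
 * gamma correction) might do simply:
 *
 *    LLVMValueRef gamma = lp_build_const_vec(bld->gallivm, bld->type, 2.2);
 *    LLVMValueRef res = lp_build_pow(bld, color, gamma);
 *
 * where `color` stands for any value vector of the matching type (the
 * names are illustrative, not part of this file).
 */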
2115
2116 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
2117 }
2118
2119
2120 /**
2121 * Generate exp(x)
2122 */
2123 LLVMValueRef
2124 lp_build_exp(struct lp_build_context *bld,
2125 LLVMValueRef x)
2126 {
2127 /* log2(e) = 1/log(2) */
2128 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
2129 1.4426950408889634);
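/* exp(x) == 2^(x * log2(e)), so we can defer to lp_build_exp2() */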
2130
2131 assert(lp_check_value(bld->type, x));
2132
2133 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
2134 }
2135
2136
2137 /**
2138 * Generate log(x)
2139 */
2140 LLVMValueRef
2141 lp_build_log(struct lp_build_context *bld,
2142 LLVMValueRef x)
2143 {
2144 /* log(2) */
2145 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2146 0.69314718055994529);
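/* log(x) == log2(x) * log(2), so we can defer to lp_build_log2() */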
2147
2148 assert(lp_check_value(bld->type, x));
2149
2150 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
2151 }
2152
2153
2154 /**
2155 * Generate polynomial.
2156 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
2157 */
2158 static LLVMValueRef
2159 lp_build_polynomial(struct lp_build_context *bld,
2160 LLVMValueRef x,
2161 const double *coeffs,
2162 unsigned num_coeffs)
2163 {
2164 const struct lp_type type = bld->type;
2165 LLVMValueRef res = NULL;
2166 unsigned i;
2167
2168 assert(lp_check_value(bld->type, x));
2169
2170 /* TODO: optimize the constant case */
2171 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2172 LLVMIsConstant(x)) {
2173 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2174 __FUNCTION__);
2175 }
2176
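/*
 * Evaluate in Horner form, highest coefficient first:
 * c0 + x*(c1 + x*(c2 + ...)), one multiply and one add per coefficient.
 */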
2177 for (i = num_coeffs; i--; ) {
2178 LLVMValueRef coeff;
2179
2180 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
2181
2182 if(res)
2183 res = lp_build_add(bld, coeff, lp_build_mul(bld, x, res));
2184 else
2185 res = coeff;
2186 }
2187
2188 if(res)
2189 return res;
2190 else
2191 return bld->undef;
2192 }
2193
2194
2195 /**
2196 * Minimax polynomial fit of 2**x, in range [0, 1[
2197 */
2198 const double lp_build_exp2_polynomial[] = {
2199 #if EXP_POLY_DEGREE == 5
2200 0.999999925063526176901,
2201 0.693153073200168932794,
2202 0.240153617044375388211,
2203 0.0558263180532956664775,
2204 0.00898934009049466391101,
2205 0.00187757667519147912699
2206 #elif EXP_POLY_DEGREE == 4
2207 1.00000259337069434683,
2208 0.693003834469974940458,
2209 0.24144275689150793076,
2210 0.0520114606103070150235,
2211 0.0135341679161270268764
2212 #elif EXP_POLY_DEGREE == 3
2213 0.999925218562710312959,
2214 0.695833540494823811697,
2215 0.226067155427249155588,
2216 0.0780245226406372992967
2217 #elif EXP_POLY_DEGREE == 2
2218 1.00172476321474503578,
2219 0.657636275736077639316,
2220 0.33718943461968720704
2221 #else
2222 #error
2223 #endif
2224 };
2225
2226
2227 void
2228 lp_build_exp2_approx(struct lp_build_context *bld,
2229 LLVMValueRef x,
2230 LLVMValueRef *p_exp2_int_part,
2231 LLVMValueRef *p_frac_part,
2232 LLVMValueRef *p_exp2)
2233 {
2234 LLVMBuilderRef builder = bld->gallivm->builder;
2235 const struct lp_type type = bld->type;
2236 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2237 LLVMValueRef ipart = NULL;
2238 LLVMValueRef fpart = NULL;
2239 LLVMValueRef expipart = NULL;
2240 LLVMValueRef expfpart = NULL;
2241 LLVMValueRef res = NULL;
2242
2243 assert(lp_check_value(bld->type, x));
2244
2245 if(p_exp2_int_part || p_frac_part || p_exp2) {
2246 /* TODO: optimize the constant case */
2247 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2248 LLVMIsConstant(x)) {
2249 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2250 __FUNCTION__);
2251 }
2252
2253 assert(type.floating && type.width == 32);
2254
2255 x = lp_build_min(bld, x, lp_build_const_vec(bld->gallivm, type, 129.0));
2256 x = lp_build_max(bld, x, lp_build_const_vec(bld->gallivm, type, -126.99999));
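/*
 * The clamp above restricts x to roughly the exponent range
 * representable in single precision, so the (ipart + 127) << 23 bit
 * pattern built below stays meaningful.
 */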
2257
2258 /* ipart = floor(x) */
2259 /* fpart = x - ipart */
2260 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
2261 }
2262
2263 if(p_exp2_int_part || p_exp2) {
2264 /* expipart = (float) (1 << ipart) */
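/*
 * i.e. construct 2^ipart directly as an IEEE-754 bit pattern: the
 * biased exponent (ipart + 127) goes into bits 30..23 and the mantissa
 * is left zero.
 */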
2265 expipart = LLVMBuildAdd(builder, ipart,
2266 lp_build_const_int_vec(bld->gallivm, type, 127), "");
2267 expipart = LLVMBuildShl(builder, expipart,
2268 lp_build_const_int_vec(bld->gallivm, type, 23), "");
2269 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
2270 }
2271
2272 if(p_exp2) {
2273 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
2274 Elements(lp_build_exp2_polynomial));
2275
2276 res = LLVMBuildFMul(builder, expipart, expfpart, "");
2277 }
2278
2279 if(p_exp2_int_part)
2280 *p_exp2_int_part = expipart;
2281
2282 if(p_frac_part)
2283 *p_frac_part = fpart;
2284
2285 if(p_exp2)
2286 *p_exp2 = res;
2287 }
2288
2289
2290 LLVMValueRef
2291 lp_build_exp2(struct lp_build_context *bld,
2292 LLVMValueRef x)
2293 {
2294 LLVMValueRef res;
2295 lp_build_exp2_approx(bld, x, NULL, NULL, &res);
2296 return res;
2297 }
2298
2299
2300 /**
2301 * Extract the exponent of an IEEE-754 floating point value.
2302 *
2303 * Optionally apply an integer bias.
2304 *
2305 * Result is an integer value with
2306 *
2307 * ifloor(log2(x)) + bias
2308 */
2309 LLVMValueRef
2310 lp_build_extract_exponent(struct lp_build_context *bld,
2311 LLVMValueRef x,
2312 int bias)
2313 {
2314 LLVMBuilderRef builder = bld->gallivm->builder;
2315 const struct lp_type type = bld->type;
2316 unsigned mantissa = lp_mantissa(type);
2317 LLVMValueRef res;
2318
2319 assert(type.floating);
2320
2321 assert(lp_check_value(bld->type, x));
2322
2323 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
2324
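/*
 * Illustrative example: x = 8.0f is 0x41000000, the shifted exponent
 * field is 130 and 130 - 127 = 3 == ifloor(log2(8)).  Note the 255 mask
 * and 127 bias hardcode the single precision layout.
 */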
2325 res = LLVMBuildLShr(builder, x,
2326 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
2327 res = LLVMBuildAnd(builder, res,
2328 lp_build_const_int_vec(bld->gallivm, type, 255), "");
2329 res = LLVMBuildSub(builder, res,
2330 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
2331
2332 return res;
2333 }
2334
2335
2336 /**
2337 * Extract the mantissa of a floating point value.
2338 *
2339 * Result is a floating point value in [1, 2) with
2340 *
2341 * x / 2**ifloor(log2(x))
2342 */
2343 LLVMValueRef
2344 lp_build_extract_mantissa(struct lp_build_context *bld,
2345 LLVMValueRef x)
2346 {
2347 LLVMBuilderRef builder = bld->gallivm->builder;
2348 const struct lp_type type = bld->type;
2349 unsigned mantissa = lp_mantissa(type);
2350 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
2351 (1ULL << mantissa) - 1);
2352 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
2353 LLVMValueRef res;
2354
2355 assert(lp_check_value(bld->type, x));
2356
2357 assert(type.floating);
2358
2359 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
2360
2361 /* res = x / 2**ipart */
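/*
 * Keep the mantissa bits of x and splice in the exponent bits of 1.0,
 * rescaling x into the [1, 2) interval.
 */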
2362 res = LLVMBuildAnd(builder, x, mantmask, "");
2363 res = LLVMBuildOr(builder, res, one, "");
2364 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
2365
2366 return res;
2367 }
2368
2369
2370
2371 /**
2372 * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
2373 * These coefficients can be generated with
2374 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
2375 */
2376 const double lp_build_log2_polynomial[] = {
2377 #if LOG_POLY_DEGREE == 6
2378 3.11578814719469302614,
2379 -3.32419399085241980044,
2380 2.59883907202499966007,
2381 -1.23152682416275988241,
2382 0.318212422185251071475,
2383 -0.0344359067839062357313
2384 #elif LOG_POLY_DEGREE == 5
2385 2.8882704548164776201,
2386 -2.52074962577807006663,
2387 1.48116647521213171641,
2388 -0.465725644288844778798,
2389 0.0596515482674574969533
2390 #elif LOG_POLY_DEGREE == 4
2391 2.61761038894603480148,
2392 -1.75647175389045657003,
2393 0.688243882994381274313,
2394 -0.107254423828329604454
2395 #elif LOG_POLY_DEGREE == 3
2396 2.28330284476918490682,
2397 -1.04913055217340124191,
2398 0.204446009836232697516
2399 #else
2400 #error
2401 #endif
2402 };
2403
2404
2405 /**
2406 * See http://www.devmaster.net/forums/showthread.php?p=43580
2407 */
2408 void
2409 lp_build_log2_approx(struct lp_build_context *bld,
2410 LLVMValueRef x,
2411 LLVMValueRef *p_exp,
2412 LLVMValueRef *p_floor_log2,
2413 LLVMValueRef *p_log2)
2414 {
2415 LLVMBuilderRef builder = bld->gallivm->builder;
2416 const struct lp_type type = bld->type;
2417 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2418 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2419
2420 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
2421 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
2422 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
2423
2424 LLVMValueRef i = NULL;
2425 LLVMValueRef exp = NULL;
2426 LLVMValueRef mant = NULL;
2427 LLVMValueRef logexp = NULL;
2428 LLVMValueRef logmant = NULL;
2429 LLVMValueRef res = NULL;
2430
2431 assert(lp_check_value(bld->type, x));
2432
2433 if(p_exp || p_floor_log2 || p_log2) {
2434 /* TODO: optimize the constant case */
2435 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2436 LLVMIsConstant(x)) {
2437 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2438 __FUNCTION__);
2439 }
2440
2441 assert(type.floating && type.width == 32);
2442
2443 /*
2444 * We don't explicitly handle denormalized numbers. They will yield a
2445 * result in the neighbourhood of -127, which appears to be
2446 * adequate.
2447 */
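/*
 * Decompose x into 2^logexp * mant with mant in [1, 2), so that
 * log2(x) = logexp + log2(mant); the exponent comes straight from the
 * bits and only log2(mant) needs a polynomial.
 */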
2448
2449 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
2450
2451 /* exp = (float) exponent(x) */
2452 exp = LLVMBuildAnd(builder, i, expmask, "");
2453 }
2454
2455 if(p_floor_log2 || p_log2) {
2456 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
2457 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
2458 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
2459 }
2460
2461 if(p_log2) {
2462 /* mant = (float) mantissa(x) */
2463 mant = LLVMBuildAnd(builder, i, mantmask, "");
2464 mant = LLVMBuildOr(builder, mant, one, "");
2465 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
2466
2467 logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
2468 Elements(lp_build_log2_polynomial));
2469
2470 /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0. */
2471 logmant = LLVMBuildFMul(builder, logmant, LLVMBuildFSub(builder, mant, bld->one, ""), "");
2472
2473 res = LLVMBuildFAdd(builder, logmant, logexp, "");
2474 }
2475
2476 if(p_exp) {
2477 exp = LLVMBuildBitCast(builder, exp, vec_type, "");
2478 *p_exp = exp;
2479 }
2480
2481 if(p_floor_log2)
2482 *p_floor_log2 = logexp;
2483
2484 if(p_log2)
2485 *p_log2 = res;
2486 }
2487
2488
2489 LLVMValueRef
2490 lp_build_log2(struct lp_build_context *bld,
2491 LLVMValueRef x)
2492 {
2493 LLVMValueRef res;
2494 lp_build_log2_approx(bld, x, NULL, NULL, &res);
2495 return res;
2496 }
2497
2498
2499 /**
2500 * Faster (and less accurate) log2.
2501 *
2502 * log2(x) ~= floor(log2(x)) - 1 + x / 2**floor(log2(x))
2503 *
2504 * Piece-wise linear approximation, with exact results when x is a
2505 * power of two.
2506 *
2507 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
2508 */
2509 LLVMValueRef
2510 lp_build_fast_log2(struct lp_build_context *bld,
2511 LLVMValueRef x)
2512 {
2513 LLVMBuilderRef builder = bld->gallivm->builder;
2514 LLVMValueRef ipart;
2515 LLVMValueRef fpart;
2516
2517 assert(lp_check_value(bld->type, x));
2518
2519 assert(bld->type.floating);
2520
2521 /* ipart = floor(log2(x)) - 1 */
2522 ipart = lp_build_extract_exponent(bld, x, -1);
2523 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
2524
2525 /* fpart = x / 2**ipart */
2526 fpart = lp_build_extract_mantissa(bld, x);
2527
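/*
 * Illustrative example: x = 8.0 gives ipart = 3 - 1 = 2 and
 * fpart = 1.0, i.e. exactly log2(8) = 3.0; between powers of two, fpart
 * interpolates linearly in x.
 */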
2528 /* ipart + fpart */
2529 return LLVMBuildFAdd(builder, ipart, fpart, "");
2530 }
2531
2532
2533 /**
2534 * Fast implementation of iround(log2(x)).
2535 *
2536 * Not an approximation -- it should give accurate results all the time.
2537 */
2538 LLVMValueRef
2539 lp_build_ilog2(struct lp_build_context *bld,
2540 LLVMValueRef x)
2541 {
2542 LLVMBuilderRef builder = bld->gallivm->builder;
2543 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
2544 LLVMValueRef ipart;
2545
2546 assert(bld->type.floating);
2547
2548 assert(lp_check_value(bld->type, x));
2549
2550 /* x * 2^0.5, i.e. add 0.5 to log2(x) */
2551 x = LLVMBuildFMul(builder, x, sqrt2, "");
2552
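/*
 * Illustrative example: x = 3.0 becomes ~4.243, whose exponent is 2,
 * matching round(log2(3)) = round(1.585) = 2.
 */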
2553 /* ipart = floor(log2(x) + 0.5) */
2554 ipart = lp_build_extract_exponent(bld, x, 0);
2555
2556 return ipart;
2557 }