llvmpipe: Emit SSE intrinsics based on runtime cpu capability check.
[mesa.git] / src / gallium / drivers / llvmpipe / lp_bld_arit.c
/**************************************************************************
 *
 * Copyright 2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/


/**
 * @file
 * Helper arithmetic functions.
 *
 * LLVM IR doesn't support all basic arithmetic operations we care about (most
 * notably min/max and saturated operations), and it is often necessary to
 * resort to machine-specific intrinsics directly. The functions here hide all
 * these implementation details from the other modules.
 *
 * We also do simple expression simplification here. Reasons are:
 * - it is very easy given we have all necessary information readily available
 * - LLVM optimization passes fail to simplify several vector expressions
 * - We often know value constraints which the optimization passes have no way
 *   of knowing, such as when source arguments are known to be in the [0, 1]
 *   range.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */


#include "util/u_memory.h"
#include "util/u_debug.h"
#include "util/u_string.h"
#include "util/u_cpu_detect.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_intr.h"
#include "lp_bld_logic.h"
#include "lp_bld_arit.h"


/**
 * Generate min(a, b)
 * No checks for special case values of a or b = 1 or 0 are done.
 */
static LLVMValueRef
lp_build_min_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b)
{
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   LLVMValueRef cond;

   /* TODO: optimize the constant case */

   if(type.width * type.length == 128) {
      if(type.floating) {
         if(type.width == 32 && util_cpu_caps.has_sse)
            intrinsic = "llvm.x86.sse.min.ps";
         if(type.width == 64 && util_cpu_caps.has_sse2)
            intrinsic = "llvm.x86.sse2.min.pd";
      }
      else {
         if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
            intrinsic = "llvm.x86.sse2.pminu.b";
         if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
            intrinsic = "llvm.x86.sse41.pminsb";
         if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
            intrinsic = "llvm.x86.sse41.pminuw";
         if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
            intrinsic = "llvm.x86.sse2.pmins.w";
         if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
            intrinsic = "llvm.x86.sse41.pminud";
         if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
            intrinsic = "llvm.x86.sse41.pminsd";
      }
   }

   if(intrinsic)
      return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);

   cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
   return lp_build_select(bld, cond, a, b);
}


/**
 * Generate max(a, b)
 * No checks for special case values of a or b = 1 or 0 are done.
 */
static LLVMValueRef
lp_build_max_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b)
{
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   LLVMValueRef cond;

   /* TODO: optimize the constant case */

   if(type.width * type.length == 128) {
      if(type.floating) {
         if(type.width == 32 && util_cpu_caps.has_sse)
            intrinsic = "llvm.x86.sse.max.ps";
         if(type.width == 64 && util_cpu_caps.has_sse2)
            intrinsic = "llvm.x86.sse2.max.pd";
      }
      else {
         if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
            intrinsic = "llvm.x86.sse2.pmaxu.b";
         if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
            intrinsic = "llvm.x86.sse41.pmaxsb";
         if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
            intrinsic = "llvm.x86.sse41.pmaxuw";
         if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
            intrinsic = "llvm.x86.sse2.pmaxs.w";
         if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
            intrinsic = "llvm.x86.sse41.pmaxud";
         if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
            intrinsic = "llvm.x86.sse41.pmaxsd";
      }
   }

   if(intrinsic)
      return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);

   cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
   return lp_build_select(bld, cond, a, b);
}


/**
 * Generate 1 - a, or ~a depending on bld->type.
 */
LLVMValueRef
lp_build_comp(struct lp_build_context *bld,
              LLVMValueRef a)
{
   const struct lp_type type = bld->type;

   if(a == bld->one)
      return bld->zero;
   if(a == bld->zero)
      return bld->one;

   if(type.norm && !type.floating && !type.fixed && !type.sign) {
      if(LLVMIsConstant(a))
         return LLVMConstNot(a);
      else
         return LLVMBuildNot(bld->builder, a, "");
   }

   if(LLVMIsConstant(a))
      return LLVMConstSub(bld->one, a);
   else
      return LLVMBuildSub(bld->builder, bld->one, a, "");
}


/**
 * Generate a + b
 */
LLVMValueRef
lp_build_add(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   if(a == bld->zero)
      return b;
   if(b == bld->zero)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(bld->type.norm) {
      const char *intrinsic = NULL;

      if(a == bld->one || b == bld->one)
         return bld->one;

      if(util_cpu_caps.has_sse2 &&
         type.width * type.length == 128 &&
         !type.floating && !type.fixed) {
         if(type.width == 8)
            intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
         if(type.width == 16)
            intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
      }

      if(intrinsic)
         return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
   }

   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      res = LLVMConstAdd(a, b);
   else
      res = LLVMBuildAdd(bld->builder, a, b, "");

   /* clamp to ceiling of 1.0 */
   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_min_simple(bld, res, bld->one);

   /* XXX clamp to floor of -1 or 0??? */

   return res;
}


/**
 * Generate a - b
 */
LLVMValueRef
lp_build_sub(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   if(b == bld->zero)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;
   if(a == b)
      return bld->zero;

   if(bld->type.norm) {
      const char *intrinsic = NULL;

      if(b == bld->one)
         return bld->zero;

      if(util_cpu_caps.has_sse2 &&
         type.width * type.length == 128 &&
         !type.floating && !type.fixed) {
         if(type.width == 8)
            intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
         if(type.width == 16)
            intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
      }

      if(intrinsic)
         return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
   }

   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      res = LLVMConstSub(a, b);
   else
      res = LLVMBuildSub(bld->builder, a, b, "");

   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_max_simple(bld, res, bld->zero);

   return res;
}


/**
 * Build shuffle vectors that match PUNPCKLxx and PUNPCKHxx instructions.
 */
static LLVMValueRef
lp_build_unpack_shuffle(unsigned n, unsigned lo_hi)
{
   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
   unsigned i, j;

   assert(n <= LP_MAX_VECTOR_LENGTH);
   assert(lo_hi < 2);

   for(i = 0, j = lo_hi*n/2; i < n; i += 2, ++j) {
      elems[i + 0] = LLVMConstInt(LLVMInt32Type(), 0 + j, 0);
      elems[i + 1] = LLVMConstInt(LLVMInt32Type(), n + j, 0);
   }

   return LLVMConstVector(elems, n);
}
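

/*
 * For illustration (hypothetical instantiations, not calls made in this
 * file): lp_build_unpack_shuffle(8, 0) builds the mask
 * <0, 8, 1, 9, 2, 10, 3, 11>, i.e. the PUNPCKLWD interleave of two
 * 8-element vectors, while lp_build_unpack_shuffle(8, 1) builds
 * <4, 12, 5, 13, 6, 14, 7, 15>, matching PUNPCKHWD.
 */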


/**
 * Build constant int vector of width 'n' and value 'c'.
 */
static LLVMValueRef
lp_build_const_vec(LLVMTypeRef type, unsigned n, long long c)
{
   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
   unsigned i;

   assert(n <= LP_MAX_VECTOR_LENGTH);

   for(i = 0; i < n; ++i)
      elems[i] = LLVMConstInt(type, c, 0);

   return LLVMConstVector(elems, n);
}


/**
 * Normalized 8bit multiplication.
 *
 * - alpha plus one
 *
 *     makes the following approximation to the division (Sree)
 *
 *       a*b/255 ~= (a*(b + 1)) >> 8
 *
 *     which is the fastest method that satisfies the following OpenGL criteria
 *
 *       0*0 = 0 and 255*255 = 255
 *
 * - geometric series
 *
 *     takes the geometric series approximation to the division
 *
 *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
 *
 *     in this case just the first two terms to fit in 16bit arithmetic
 *
 *       t/255 ~= (t + (t >> 8)) >> 8
 *
 *     note that just by itself it doesn't satisfy the OpenGL criteria, as it
 *     gives 255*255 = 254, so the special case b = 255 must be accounted for
 *     or roundoff must be used
 *
 * - geometric series plus rounding
 *
 *     when using a geometric series division instead of truncating the result
 *     use roundoff in the approximation (Jim Blinn)
 *
 *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
 *
 *     achieving exact results
 *
 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
 *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
 * @sa Michael Herf, The "double blend trick", May 2000,
 *     http://www.stereopsis.com/doubleblend.html
 */
static LLVMValueRef
lp_build_mul_u8n(LLVMBuilderRef builder,
                 LLVMValueRef a, LLVMValueRef b)
{
   static LLVMValueRef c01 = NULL;
   static LLVMValueRef c08 = NULL;
   static LLVMValueRef c80 = NULL;
   LLVMValueRef ab;

   if(!c01) c01 = lp_build_const_vec(LLVMInt16Type(), 8, 0x01);
   if(!c08) c08 = lp_build_const_vec(LLVMInt16Type(), 8, 0x08);
   if(!c80) c80 = lp_build_const_vec(LLVMInt16Type(), 8, 0x80);

#if 0

   /* a*b/255 ~= (a*(b + 1)) >> 8 */
   b = LLVMBuildAdd(builder, b, c01, "");
   ab = LLVMBuildMul(builder, a, b, "");

#else

   /* t/255 ~= (t + (t >> 8) + 0x80) >> 8 */
   ab = LLVMBuildMul(builder, a, b, "");
   ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c08, ""), "");
   ab = LLVMBuildAdd(builder, ab, c80, "");

#endif

   ab = LLVMBuildLShr(builder, ab, c08, "");

   return ab;
}
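

/*
 * Illustrative scalar reference of the rounded geometric series above: a
 * minimal sketch, not called by the JIT path, assuming 8-bit unsigned
 * normalized values held in plain unsigned ints. lp_build_mul_u8n() emits
 * exactly this arithmetic, but on <8 x i16> vectors.
 */
static INLINE unsigned
lp_mul_u8n_scalar_ref(unsigned a, unsigned b)
{
   /* t/255 ~= (t + (t >> 8) + 0x80) >> 8 */
   unsigned t = a * b;
   t = t + (t >> 8) + 0x80;
   return t >> 8;
}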


/**
 * Generate a * b
 */
LLVMValueRef
lp_build_mul(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   const struct lp_type type = bld->type;

   if(a == bld->zero)
      return bld->zero;
   if(a == bld->one)
      return b;
   if(b == bld->zero)
      return bld->zero;
   if(b == bld->one)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(!type.floating && !type.fixed && type.norm) {
      if(util_cpu_caps.has_sse2 && type.width == 8 && type.length == 16) {
         LLVMTypeRef i16x8 = LLVMVectorType(LLVMInt16Type(), 8);
         LLVMTypeRef i8x16 = LLVMVectorType(LLVMInt8Type(), 16);
         static LLVMValueRef ml = NULL;
         static LLVMValueRef mh = NULL;
         LLVMValueRef al, ah, bl, bh;
         LLVMValueRef abl, abh;
         LLVMValueRef ab;

         if(!ml) ml = lp_build_unpack_shuffle(16, 0);
         if(!mh) mh = lp_build_unpack_shuffle(16, 1);

         /* PUNPCKLBW, PUNPCKHBW */
         al = LLVMBuildShuffleVector(bld->builder, a, bld->zero, ml, "");
         bl = LLVMBuildShuffleVector(bld->builder, b, bld->zero, ml, "");
         ah = LLVMBuildShuffleVector(bld->builder, a, bld->zero, mh, "");
         bh = LLVMBuildShuffleVector(bld->builder, b, bld->zero, mh, "");

         /* NOP */
         al = LLVMBuildBitCast(bld->builder, al, i16x8, "");
         bl = LLVMBuildBitCast(bld->builder, bl, i16x8, "");
         ah = LLVMBuildBitCast(bld->builder, ah, i16x8, "");
         bh = LLVMBuildBitCast(bld->builder, bh, i16x8, "");

         /* PMULLW, PSRLW, PADDW */
         abl = lp_build_mul_u8n(bld->builder, al, bl);
         abh = lp_build_mul_u8n(bld->builder, ah, bh);

         /* PACKUSWB */
         ab = lp_build_intrinsic_binary(bld->builder, "llvm.x86.sse2.packuswb.128", i16x8, abl, abh);

         /* NOP */
         ab = LLVMBuildBitCast(bld->builder, ab, i8x16, "");

         return ab;
      }

      /* FIXME */
      assert(0);
   }

   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      return LLVMConstMul(a, b);

   return LLVMBuildMul(bld->builder, a, b, "");
}


/**
 * Generate a / b
 */
LLVMValueRef
lp_build_div(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   const struct lp_type type = bld->type;

   if(a == bld->zero)
      return bld->zero;
   if(a == bld->one)
      return lp_build_rcp(bld, b);
   if(b == bld->zero)
      return bld->undef;
   if(b == bld->one)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      return LLVMConstFDiv(a, b);

   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
      return lp_build_mul(bld, a, lp_build_rcp(bld, b));

   return LLVMBuildFDiv(bld->builder, a, b, "");
}


LLVMValueRef
lp_build_lerp(struct lp_build_context *bld,
              LLVMValueRef x,
              LLVMValueRef v0,
              LLVMValueRef v1)
{
   return lp_build_add(bld, v0, lp_build_mul(bld, x, lp_build_sub(bld, v1, v0)));
}


LLVMValueRef
lp_build_lerp_2d(struct lp_build_context *bld,
                 LLVMValueRef x,
                 LLVMValueRef y,
                 LLVMValueRef v00,
                 LLVMValueRef v01,
                 LLVMValueRef v10,
                 LLVMValueRef v11)
{
   LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01);
   LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11);
   return lp_build_lerp(bld, y, v0, v1);
}


/**
 * Generate min(a, b)
 * Do checks for special cases.
 */
LLVMValueRef
lp_build_min(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   if(bld->type.norm) {
      if(a == bld->zero || b == bld->zero)
         return bld->zero;
      if(a == bld->one)
         return b;
      if(b == bld->one)
         return a;
   }

   return lp_build_min_simple(bld, a, b);
}


/**
 * Generate max(a, b)
 * Do checks for special cases.
 */
LLVMValueRef
lp_build_max(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   if(bld->type.norm) {
      if(a == bld->one || b == bld->one)
         return bld->one;
      if(a == bld->zero)
         return b;
      if(b == bld->zero)
         return a;
   }

   return lp_build_max_simple(bld, a, b);
}


/**
 * Generate abs(a)
 */
LLVMValueRef
lp_build_abs(struct lp_build_context *bld,
             LLVMValueRef a)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(type);

   if(!type.sign)
      return a;

   if(type.floating) {
      /* Mask out the sign bit */
      LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
      LLVMValueRef mask = lp_build_int_const_scalar(type, ((unsigned long long)1 << (type.width - 1)) - 1);
      a = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
      a = LLVMBuildAnd(bld->builder, a, mask, "");
      a = LLVMBuildBitCast(bld->builder, a, vec_type, "");
      return a;
   }

   if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
      switch(type.width) {
      case 8:
         return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
      case 16:
         return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
      case 32:
         return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
      }
   }

   return lp_build_max(bld, a, LLVMBuildNeg(bld->builder, a, ""));
}


LLVMValueRef
lp_build_sgn(struct lp_build_context *bld,
             LLVMValueRef a)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(type);
   LLVMValueRef cond;
   LLVMValueRef res;

   /* Handle non-zero case */
   if(!type.sign) {
      /* if not zero then sign must be positive */
      res = bld->one;
   }
   else if(type.floating) {
      /* Take the sign bit and add it to 1 constant */
      LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
      LLVMValueRef mask = lp_build_int_const_scalar(type, (unsigned long long)1 << (type.width - 1));
      LLVMValueRef sign;
      LLVMValueRef one;
      sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
      sign = LLVMBuildAnd(bld->builder, sign, mask, "");
      one = LLVMConstBitCast(bld->one, int_vec_type);
      res = LLVMBuildOr(bld->builder, sign, one, "");
      res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
   }
   else
   {
      LLVMValueRef minus_one = lp_build_const_scalar(type, -1.0);
      cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
      res = lp_build_select(bld, cond, bld->one, minus_one);
   }

   /* Handle zero */
   cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
   res = lp_build_select(bld, cond, bld->zero, res);

   return res;
}


enum lp_build_round_sse41_mode
{
   LP_BUILD_ROUND_SSE41_NEAREST = 0,
   LP_BUILD_ROUND_SSE41_FLOOR = 1,
   LP_BUILD_ROUND_SSE41_CEIL = 2,
   LP_BUILD_ROUND_SSE41_TRUNCATE = 3
};


static INLINE LLVMValueRef
lp_build_round_sse41(struct lp_build_context *bld,
                     LLVMValueRef a,
                     enum lp_build_round_sse41_mode mode)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(type);
   const char *intrinsic;

   assert(type.floating);
   assert(type.width*type.length == 128);

   switch(type.width) {
   case 32:
      intrinsic = "llvm.x86.sse41.round.ps";
      break;
   case 64:
      intrinsic = "llvm.x86.sse41.round.pd";
      break;
   default:
      assert(0);
      return bld->undef;
   }

   return lp_build_intrinsic_binary(bld->builder, intrinsic, vec_type, a,
                                    LLVMConstInt(LLVMInt32Type(), mode, 0));
}


LLVMValueRef
lp_build_round(struct lp_build_context *bld,
               LLVMValueRef a)
{
   const struct lp_type type = bld->type;

   assert(type.floating);

   if(util_cpu_caps.has_sse4_1)
      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);

   /* FIXME */
   assert(0);
   return bld->undef;
}


LLVMValueRef
lp_build_floor(struct lp_build_context *bld,
               LLVMValueRef a)
{
   const struct lp_type type = bld->type;

   assert(type.floating);

   if(util_cpu_caps.has_sse4_1)
      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);

   /* FIXME */
   assert(0);
   return bld->undef;
}


LLVMValueRef
lp_build_ceil(struct lp_build_context *bld,
              LLVMValueRef a)
{
   const struct lp_type type = bld->type;

   assert(type.floating);

   if(util_cpu_caps.has_sse4_1)
      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);

   /* FIXME */
   assert(0);
   return bld->undef;
}


LLVMValueRef
lp_build_trunc(struct lp_build_context *bld,
               LLVMValueRef a)
{
   const struct lp_type type = bld->type;

   assert(type.floating);

   if(util_cpu_caps.has_sse4_1)
      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);

   /* FIXME */
   assert(0);
   return bld->undef;
}


/**
 * Convert to integer, through whichever rounding method is fastest,
 * typically truncating towards zero.
 */
LLVMValueRef
lp_build_int(struct lp_build_context *bld,
             LLVMValueRef a)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);

   assert(type.floating);

   return LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
}


LLVMValueRef
lp_build_ifloor(struct lp_build_context *bld,
                LLVMValueRef a)
{
   a = lp_build_floor(bld, a);
   a = lp_build_int(bld, a);
   return a;
}


LLVMValueRef
lp_build_sqrt(struct lp_build_context *bld,
              LLVMValueRef a)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(type);
   char intrinsic[32];

   /* TODO: optimize the constant case */

   assert(type.floating);
   util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);

   return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
}


LLVMValueRef
lp_build_rcp(struct lp_build_context *bld,
             LLVMValueRef a)
{
   const struct lp_type type = bld->type;

   if(a == bld->zero)
      return bld->undef;
   if(a == bld->one)
      return bld->one;
   if(a == bld->undef)
      return bld->undef;

   assert(type.floating);

   if(LLVMIsConstant(a))
      return LLVMConstFDiv(bld->one, a);

   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
      /* FIXME: improve precision */
      return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type), a);

   return LLVMBuildFDiv(bld->builder, bld->one, a, "");
}


/**
 * Generate 1/sqrt(a)
 */
LLVMValueRef
lp_build_rsqrt(struct lp_build_context *bld,
               LLVMValueRef a)
{
   const struct lp_type type = bld->type;

   assert(type.floating);

   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
      return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", lp_build_vec_type(type), a);

   return lp_build_rcp(bld, lp_build_sqrt(bld, a));
}


/**
 * Generate cos(a)
 */
LLVMValueRef
lp_build_cos(struct lp_build_context *bld,
             LLVMValueRef a)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(type);
   char intrinsic[32];

   /* TODO: optimize the constant case */

   assert(type.floating);
   util_snprintf(intrinsic, sizeof intrinsic, "llvm.cos.v%uf%u", type.length, type.width);

   return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
}


/**
 * Generate sin(a)
 */
LLVMValueRef
lp_build_sin(struct lp_build_context *bld,
             LLVMValueRef a)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(type);
   char intrinsic[32];

   /* TODO: optimize the constant case */

   assert(type.floating);
   util_snprintf(intrinsic, sizeof intrinsic, "llvm.sin.v%uf%u", type.length, type.width);

   return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
}


/**
 * Generate pow(x, y)
 */
LLVMValueRef
lp_build_pow(struct lp_build_context *bld,
             LLVMValueRef x,
             LLVMValueRef y)
{
   /* TODO: optimize the constant case */
   if(LLVMIsConstant(x) && LLVMIsConstant(y))
      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                   __FUNCTION__);

   return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
}


/**
 * Generate exp(x), using exp(x) = exp2(x * log2(e))
 */
LLVMValueRef
lp_build_exp(struct lp_build_context *bld,
             LLVMValueRef x)
{
   /* log2(e) = 1/log(2) */
   LLVMValueRef log2e = lp_build_const_scalar(bld->type, 1.4426950408889634);

   return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
}


/**
 * Generate log(x), using log(x) = log2(x) * log(2)
 */
LLVMValueRef
lp_build_log(struct lp_build_context *bld,
             LLVMValueRef x)
{
   /* log(2) */
   LLVMValueRef log2 = lp_build_const_scalar(bld->type, 0.69314718055994529);

   return lp_build_mul(bld, log2, lp_build_log2(bld, x));
}


#define EXP_POLY_DEGREE 3
#define LOG_POLY_DEGREE 5


/**
 * Generate polynomial.
 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
 */
static LLVMValueRef
lp_build_polynomial(struct lp_build_context *bld,
                    LLVMValueRef x,
                    const double *coeffs,
                    unsigned num_coeffs)
{
   const struct lp_type type = bld->type;
   LLVMValueRef res = NULL;
   unsigned i;

   /* TODO: optimize the constant case */
   if(LLVMIsConstant(x))
      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                   __FUNCTION__);

   for (i = num_coeffs; i--; ) {
      LLVMValueRef coeff = lp_build_const_scalar(type, coeffs[i]);
      if(res)
         res = lp_build_add(bld, coeff, lp_build_mul(bld, x, res));
      else
         res = coeff;
   }

   if(res)
      return res;
   else
      return bld->undef;
}


/**
 * Minimax polynomial fit of 2**x, in range [-0.5, 0.5[
 */
const double lp_build_exp2_polynomial[] = {
#if EXP_POLY_DEGREE == 5
   9.9999994e-1, 6.9315308e-1, 2.4015361e-1, 5.5826318e-2, 8.9893397e-3, 1.8775767e-3
#elif EXP_POLY_DEGREE == 4
   1.0000026, 6.9300383e-1, 2.4144275e-1, 5.2011464e-2, 1.3534167e-2
#elif EXP_POLY_DEGREE == 3
   9.9992520e-1, 6.9583356e-1, 2.2606716e-1, 7.8024521e-2
#elif EXP_POLY_DEGREE == 2
   1.0017247, 6.5763628e-1, 3.3718944e-1
#else
#error
#endif
};


void
lp_build_exp2_approx(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef *p_exp2_int_part,
                     LLVMValueRef *p_frac_part,
                     LLVMValueRef *p_exp2)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
   LLVMValueRef ipart = NULL;
   LLVMValueRef fpart = NULL;
   LLVMValueRef expipart = NULL;
   LLVMValueRef expfpart = NULL;
   LLVMValueRef res = NULL;

   if(p_exp2_int_part || p_frac_part || p_exp2) {
      /* TODO: optimize the constant case */
      if(LLVMIsConstant(x))
         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                      __FUNCTION__);

      assert(type.floating && type.width == 32);

      x = lp_build_min(bld, x, lp_build_const_scalar(type, 129.0));
      x = lp_build_max(bld, x, lp_build_const_scalar(type, -126.99999));

      /* ipart = int(x - 0.5) */
      ipart = LLVMBuildSub(bld->builder, x, lp_build_const_scalar(type, 0.5f), "");
      ipart = LLVMBuildFPToSI(bld->builder, ipart, int_vec_type, "");

      /* fpart = x - ipart */
      fpart = LLVMBuildSIToFP(bld->builder, ipart, vec_type, "");
      fpart = LLVMBuildSub(bld->builder, x, fpart, "");
   }

   if(p_exp2_int_part || p_exp2) {
      /* expipart = (float) (1 << ipart) */
      expipart = LLVMBuildAdd(bld->builder, ipart, lp_build_int_const_scalar(type, 127), "");
      expipart = LLVMBuildShl(bld->builder, expipart, lp_build_int_const_scalar(type, 23), "");
      expipart = LLVMBuildBitCast(bld->builder, expipart, vec_type, "");
   }

   if(p_exp2) {
      expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
                                     Elements(lp_build_exp2_polynomial));

      res = LLVMBuildMul(bld->builder, expipart, expfpart, "");
   }

   if(p_exp2_int_part)
      *p_exp2_int_part = expipart;

   if(p_frac_part)
      *p_frac_part = fpart;

   if(p_exp2)
      *p_exp2 = res;
}


LLVMValueRef
lp_build_exp2(struct lp_build_context *bld,
              LLVMValueRef x)
{
   LLVMValueRef res;
   lp_build_exp2_approx(bld, x, NULL, NULL, &res);
   return res;
}
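

/*
 * Illustrative scalar reference of the exp2 approximation above: a minimal
 * sketch, not part of the JIT path, assuming 32-bit IEEE-754 floats, 32-bit
 * ints, and EXP_POLY_DEGREE == 3. It mirrors the range reduction performed
 * by lp_build_exp2_approx(): 2^x = 2^ipart * 2^fpart, where 2^ipart is
 * assembled directly in the exponent bits and 2^fpart comes from the
 * minimax polynomial, evaluated in Horner form as lp_build_polynomial()
 * does.
 */
static INLINE float
lp_exp2_scalar_ref(float x)
{
   union { float f; int i; } epart;
   const double *p = lp_build_exp2_polynomial;
   int ipart;
   float fpart, expfpart;

   /* same clamping as the vector code */
   if(x > 129.0f) x = 129.0f;
   if(x < -126.99999f) x = -126.99999f;

   /* ipart = int(x - 0.5), fpart = x - ipart (FPToSI also truncates) */
   ipart = (int)(x - 0.5f);
   fpart = x - (float)ipart;

   /* expipart = (float) (1 << ipart), built in the exponent field */
   epart.i = (ipart + 127) << 23;

   /* 2^fpart ~= p[0] + fpart*p[1] + fpart^2*p[2] + fpart^3*p[3] */
   expfpart = (float)(((p[3]*fpart + p[2])*fpart + p[1])*fpart + p[0]);

   return epart.f * expfpart;
}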


/**
 * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
 * These coefficients can be generated with
 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
 */
const double lp_build_log2_polynomial[] = {
#if LOG_POLY_DEGREE == 6
   3.11578814719469302614, -3.32419399085241980044, 2.59883907202499966007, -1.23152682416275988241, 0.318212422185251071475, -0.0344359067839062357313
#elif LOG_POLY_DEGREE == 5
   2.8882704548164776201, -2.52074962577807006663, 1.48116647521213171641, -0.465725644288844778798, 0.0596515482674574969533
#elif LOG_POLY_DEGREE == 4
   2.61761038894603480148, -1.75647175389045657003, 0.688243882994381274313, -0.107254423828329604454
#elif LOG_POLY_DEGREE == 3
   2.28330284476918490682, -1.04913055217340124191, 0.204446009836232697516
#else
#error
#endif
};


/**
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 */
void
lp_build_log2_approx(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef *p_exp,
                     LLVMValueRef *p_floor_log2,
                     LLVMValueRef *p_log2)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);

   LLVMValueRef expmask = lp_build_int_const_scalar(type, 0x7f800000);
   LLVMValueRef mantmask = lp_build_int_const_scalar(type, 0x007fffff);
   LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);

   LLVMValueRef i = NULL;
   LLVMValueRef exp = NULL;
   LLVMValueRef mant = NULL;
   LLVMValueRef logexp = NULL;
   LLVMValueRef logmant = NULL;
   LLVMValueRef res = NULL;

   if(p_exp || p_floor_log2 || p_log2) {
      /* TODO: optimize the constant case */
      if(LLVMIsConstant(x))
         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                      __FUNCTION__);

      assert(type.floating && type.width == 32);

      i = LLVMBuildBitCast(bld->builder, x, int_vec_type, "");

      /* exp = (float) exponent(x) */
      exp = LLVMBuildAnd(bld->builder, i, expmask, "");
   }

   if(p_floor_log2 || p_log2) {
      logexp = LLVMBuildLShr(bld->builder, exp, lp_build_int_const_scalar(type, 23), "");
      logexp = LLVMBuildSub(bld->builder, logexp, lp_build_int_const_scalar(type, 127), "");
      logexp = LLVMBuildSIToFP(bld->builder, logexp, vec_type, "");
   }

   if(p_log2) {
      /* mant = (float) mantissa(x), in [1, 2[ -- reinterpret the bits, don't convert */
      mant = LLVMBuildAnd(bld->builder, i, mantmask, "");
      mant = LLVMBuildOr(bld->builder, mant, one, "");
      mant = LLVMBuildBitCast(bld->builder, mant, vec_type, "");

      logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
                                    Elements(lp_build_log2_polynomial));

      /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0 */
      logmant = LLVMBuildMul(bld->builder, logmant, LLVMBuildSub(bld->builder, mant, bld->one, ""), "");

      res = LLVMBuildAdd(bld->builder, logmant, logexp, "");
   }

   if(p_exp)
      *p_exp = exp;

   if(p_floor_log2)
      *p_floor_log2 = logexp;

   if(p_log2)
      *p_log2 = res;
}


LLVMValueRef
lp_build_log2(struct lp_build_context *bld,
              LLVMValueRef x)
{
   LLVMValueRef res;
   lp_build_log2_approx(bld, x, NULL, NULL, &res);
   return res;
}
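

/*
 * Illustrative scalar reference of the log2 approximation above: a minimal
 * sketch, not part of the JIT path, assuming 32-bit IEEE-754 floats, 32-bit
 * ints, LOG_POLY_DEGREE == 5, and positive finite x. As in
 * lp_build_log2_approx(), x = 2^exp * mant with mant in [1, 2[, so
 * log2(x) = exp + p(mant) * (mant - 1), where p is the minimax fit of
 * log2(mant)/(mant - 1).
 */
static INLINE float
lp_log2_scalar_ref(float x)
{
   union { float f; int i; } u;
   const double *p = lp_build_log2_polynomial;
   float exp, mant, logmant;

   u.f = x;

   /* exp = (float) exponent(x) */
   exp = (float)(((u.i & 0x7f800000) >> 23) - 127);

   /* mant in [1, 2[, by forcing the exponent bits to those of 1.0f */
   u.i = (u.i & 0x007fffff) | 0x3f800000;
   mant = u.f;

   /* p(mant), evaluated in Horner form as lp_build_polynomial() does */
   logmant = (float)((((p[4]*mant + p[3])*mant + p[2])*mant + p[1])*mant + p[0]);

   /* multiplying by (mant - 1) ensures log2(1) == 0 */
   return logmant * (mant - 1.0f) + exp;
}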