src/gallium/drivers/llvmpipe/lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expression simplification here. The reasons are:
39 * - it is very easy, given that we have all the necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - we often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in the [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include "util/u_memory.h"
49 #include "util/u_debug.h"
50 #include "util/u_string.h"
51 #include "util/u_cpu_detect.h"
52
53 #include "lp_bld_type.h"
54 #include "lp_bld_const.h"
55 #include "lp_bld_intr.h"
56 #include "lp_bld_logic.h"
57 #include "lp_bld_pack.h"
58 #include "lp_bld_debug.h"
59 #include "lp_bld_arit.h"
60
61
62 /**
63 * Generate min(a, b)
64 * No checks for the special-case values of a or b (0 or 1) are done.
65 */
66 static LLVMValueRef
67 lp_build_min_simple(struct lp_build_context *bld,
68 LLVMValueRef a,
69 LLVMValueRef b)
70 {
71 const struct lp_type type = bld->type;
72 const char *intrinsic = NULL;
73 LLVMValueRef cond;
74
75 /* TODO: optimize the constant case */
76
77 if(type.width * type.length == 128) {
78 if(type.floating) {
79 if(type.width == 32 && util_cpu_caps.has_sse)
80 intrinsic = "llvm.x86.sse.min.ps";
81 if(type.width == 64 && util_cpu_caps.has_sse2)
82 intrinsic = "llvm.x86.sse2.min.pd";
83 }
84 else {
85 if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
86 intrinsic = "llvm.x86.sse2.pminu.b";
87 if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
88 intrinsic = "llvm.x86.sse41.pminsb";
89 if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
90 intrinsic = "llvm.x86.sse41.pminuw";
91 if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
92 intrinsic = "llvm.x86.sse2.pmins.w";
93 if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
94 intrinsic = "llvm.x86.sse41.pminud";
95 if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
96 intrinsic = "llvm.x86.sse41.pminsd";
97 }
98 }
99
100 if(intrinsic)
101 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
102
103 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
104 return lp_build_select(bld, cond, a, b);
105 }
106
107
108 /**
109 * Generate max(a, b)
110 * No checks for the special-case values of a or b (0 or 1) are done.
111 */
112 static LLVMValueRef
113 lp_build_max_simple(struct lp_build_context *bld,
114 LLVMValueRef a,
115 LLVMValueRef b)
116 {
117 const struct lp_type type = bld->type;
118 const char *intrinsic = NULL;
119 LLVMValueRef cond;
120
121 /* TODO: optimize the constant case */
122
123 if(type.width * type.length == 128) {
124 if(type.floating) {
125 if(type.width == 32 && util_cpu_caps.has_sse)
126 intrinsic = "llvm.x86.sse.max.ps";
127 if(type.width == 64 && util_cpu_caps.has_sse2)
128 intrinsic = "llvm.x86.sse2.max.pd";
129 }
130 else {
131 if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
132 intrinsic = "llvm.x86.sse2.pmaxu.b";
133 if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
134 intrinsic = "llvm.x86.sse41.pmaxsb";
135 if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
136 intrinsic = "llvm.x86.sse41.pmaxuw";
137 if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
138 intrinsic = "llvm.x86.sse2.pmaxs.w";
139 if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
140 intrinsic = "llvm.x86.sse41.pmaxud";
141 if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
142 intrinsic = "llvm.x86.sse41.pmaxsd";
143 }
144 }
145
146 if(intrinsic)
147 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
148
149 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
150 return lp_build_select(bld, cond, a, b);
151 }
152
153
154 /**
155 * Generate 1 - a, or ~a depending on bld->type.
156 */
157 LLVMValueRef
158 lp_build_comp(struct lp_build_context *bld,
159 LLVMValueRef a)
160 {
161 const struct lp_type type = bld->type;
162
163 if(a == bld->one)
164 return bld->zero;
165 if(a == bld->zero)
166 return bld->one;
167
168 if(type.norm && !type.floating && !type.fixed && !type.sign) {
169 if(LLVMIsConstant(a))
170 return LLVMConstNot(a);
171 else
172 return LLVMBuildNot(bld->builder, a, "");
173 }
174
175 if(LLVMIsConstant(a))
176 return LLVMConstSub(bld->one, a);
177 else
178 return LLVMBuildSub(bld->builder, bld->one, a, "");
179 }
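
/*
 * Illustration of the unorm complement above: for an n-bit unsigned
 * normalized type, 1.0 is the all-ones bit pattern, so 1.0 - a is exactly
 * bitwise NOT. A minimal scalar sketch for 8-bit unorm (hypothetical helper,
 * for illustration only):
 *
 *    uint8_t comp_u8n(uint8_t a)
 *    {
 *       return 0xff - a;   // == (uint8_t)~a, since 0xff represents 1.0
 *    }
 */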
180
181
182 /**
183 * Generate a + b
184 */
185 LLVMValueRef
186 lp_build_add(struct lp_build_context *bld,
187 LLVMValueRef a,
188 LLVMValueRef b)
189 {
190 const struct lp_type type = bld->type;
191 LLVMValueRef res;
192
193 if(a == bld->zero)
194 return b;
195 if(b == bld->zero)
196 return a;
197 if(a == bld->undef || b == bld->undef)
198 return bld->undef;
199
200 if(bld->type.norm) {
201 const char *intrinsic = NULL;
202
203 if(a == bld->one || b == bld->one)
204 return bld->one;
205
206 if(util_cpu_caps.has_sse2 &&
207 type.width * type.length == 128 &&
208 !type.floating && !type.fixed) {
209 if(type.width == 8)
210 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
211 if(type.width == 16)
212 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
213 }
214
215 if(intrinsic)
216 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
217 }
218
219 if(LLVMIsConstant(a) && LLVMIsConstant(b))
220 res = LLVMConstAdd(a, b);
221 else
222 res = LLVMBuildAdd(bld->builder, a, b, "");
223
224 /* clamp to ceiling of 1.0 */
225 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
226 res = lp_build_min_simple(bld, res, bld->one);
227
228 /* XXX clamp to floor of -1 or 0??? */
229
230 return res;
231 }
232
233
234 /**
235 * Generate a - b
236 */
237 LLVMValueRef
238 lp_build_sub(struct lp_build_context *bld,
239 LLVMValueRef a,
240 LLVMValueRef b)
241 {
242 const struct lp_type type = bld->type;
243 LLVMValueRef res;
244
245 if(b == bld->zero)
246 return a;
247 if(a == bld->undef || b == bld->undef)
248 return bld->undef;
249 if(a == b)
250 return bld->zero;
251
252 if(bld->type.norm) {
253 const char *intrinsic = NULL;
254
255 if(b == bld->one)
256 return bld->zero;
257
258 if(util_cpu_caps.has_sse2 &&
259 type.width * type.length == 128 &&
260 !type.floating && !type.fixed) {
261 if(type.width == 8)
262 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
263 if(type.width == 16)
264 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
265 }
266
267 if(intrinsic)
268 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
269 }
270
271 if(LLVMIsConstant(a) && LLVMIsConstant(b))
272 res = LLVMConstSub(a, b);
273 else
274 res = LLVMBuildSub(bld->builder, a, b, "");
275
276 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
277 res = lp_build_max_simple(bld, res, bld->zero);
278
279 return res;
280 }
281
282
283 /**
284 * Normalized 8bit multiplication.
285 *
286 * - alpha plus one
287 *
288 * makes the following approximation to the division (Sree)
289 *
290 * a*b/255 ~= (a*(b + 1)) >> 8
291 *
292 * which is the fastest method that satisfies the following OpenGL criteria
293 *
294 * 0*0 = 0 and 255*255 = 255
295 *
296 * - geometric series
297 *
298 * takes the geometric series approximation to the division
299 *
300 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
301 *
302 * in this case just the first two terms to fit in 16bit arithmetic
303 *
304 * t/255 ~= (t + (t >> 8)) >> 8
305 *
306 * note that just by itself it doesn't satisfy the OpenGL criteria, as it
307 * yields 255*255 = 254, so either the special case b = 255 must be accounted
308 * for, or roundoff must be used
309 *
310 * - geometric series plus rounding
311 *
312 * when using a geometric series division instead of truncating the result
313 * use roundoff in the approximation (Jim Blinn)
314 *
315 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
316 *
317 * achieving exact results
318 *
319 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
320 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
321 * @sa Michael Herf, The "double blend trick", May 2000,
322 * http://www.stereopsis.com/doubleblend.html
323 */
324 static LLVMValueRef
325 lp_build_mul_u8n(LLVMBuilderRef builder,
326 struct lp_type i16_type,
327 LLVMValueRef a, LLVMValueRef b)
328 {
329 LLVMValueRef c8;
330 LLVMValueRef ab;
331
332 c8 = lp_build_int_const_scalar(i16_type, 8);
333
334 #if 0
335
336 /* a*b/255 ~= (a*(b + 1)) >> 8 */
337 b = LLVMBuildAdd(builder, b, lp_build_int_const_scalar(i16_type, 1), "");
338 ab = LLVMBuildMul(builder, a, b, "");
339
340 #else
341
342 /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */
343 ab = LLVMBuildMul(builder, a, b, "");
344 ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c8, ""), "");
345 ab = LLVMBuildAdd(builder, ab, lp_build_int_const_scalar(i16_type, 0x80), "");
346
347 #endif
348
349 ab = LLVMBuildLShr(builder, ab, c8, "");
350
351 return ab;
352 }
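
/*
 * A scalar sketch of the rounding variant used above (hypothetical helper,
 * for illustration only; assumes <stdint.h>):
 *
 *    static uint8_t mul_u8n_ref(uint8_t a, uint8_t b)
 *    {
 *       uint32_t t = (uint32_t)a * b;
 *       return (uint8_t)((t + (t >> 8) + 0x80) >> 8);
 *    }
 *
 * This meets the endpoints mul_u8n_ref(0, 0) == 0 and
 * mul_u8n_ref(255, 255) == 255, and stays within one unit of
 * round(a*b/255.0) for all inputs.
 */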
353
354
355 /**
356 * Generate a * b
357 */
358 LLVMValueRef
359 lp_build_mul(struct lp_build_context *bld,
360 LLVMValueRef a,
361 LLVMValueRef b)
362 {
363 const struct lp_type type = bld->type;
364
365 if(a == bld->zero)
366 return bld->zero;
367 if(a == bld->one)
368 return b;
369 if(b == bld->zero)
370 return bld->zero;
371 if(b == bld->one)
372 return a;
373 if(a == bld->undef || b == bld->undef)
374 return bld->undef;
375
376 if(!type.floating && !type.fixed && type.norm) {
377 if(type.width == 8) {
378 struct lp_type i16_type = lp_wider_type(type);
379 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
380
381 lp_build_unpack2(bld->builder, type, i16_type, a, &al, &ah);
382 lp_build_unpack2(bld->builder, type, i16_type, b, &bl, &bh);
383
384 /* PMULLW, PSRLW, PADDW */
385 abl = lp_build_mul_u8n(bld->builder, i16_type, al, bl);
386 abh = lp_build_mul_u8n(bld->builder, i16_type, ah, bh);
387
388 ab = lp_build_pack2(bld->builder, i16_type, type, abl, abh);
389
390 return ab;
391 }
392
393 /* FIXME */
394 assert(0);
395 }
396
397 if(LLVMIsConstant(a) && LLVMIsConstant(b))
398 return LLVMConstMul(a, b);
399
400 return LLVMBuildMul(bld->builder, a, b, "");
401 }
402
403
404 /**
405 * Generate a / b
406 */
407 LLVMValueRef
408 lp_build_div(struct lp_build_context *bld,
409 LLVMValueRef a,
410 LLVMValueRef b)
411 {
412 const struct lp_type type = bld->type;
413
414 if(a == bld->zero)
415 return bld->zero;
416 if(a == bld->one)
417 return lp_build_rcp(bld, b);
418 if(b == bld->zero)
419 return bld->undef;
420 if(b == bld->one)
421 return a;
422 if(a == bld->undef || b == bld->undef)
423 return bld->undef;
424
425 if(LLVMIsConstant(a) && LLVMIsConstant(b))
426 return LLVMConstFDiv(a, b);
427
428 if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
429 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
430
431 return LLVMBuildFDiv(bld->builder, a, b, "");
432 }
433
434
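/**
 * Linear interpolation: v0 + x * (v1 - v0).
 *
 * E.g. with x = 0.25, v0 = 8.0, v1 = 16.0 this yields 8.0 + 0.25 * 8.0 = 10.0.
 */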
435 LLVMValueRef
436 lp_build_lerp(struct lp_build_context *bld,
437 LLVMValueRef x,
438 LLVMValueRef v0,
439 LLVMValueRef v1)
440 {
441 return lp_build_add(bld, v0, lp_build_mul(bld, x, lp_build_sub(bld, v1, v0)));
442 }
443
444
445 LLVMValueRef
446 lp_build_lerp_2d(struct lp_build_context *bld,
447 LLVMValueRef x,
448 LLVMValueRef y,
449 LLVMValueRef v00,
450 LLVMValueRef v01,
451 LLVMValueRef v10,
452 LLVMValueRef v11)
453 {
454 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01);
455 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11);
456 return lp_build_lerp(bld, y, v0, v1);
457 }
458
459
460 /**
461 * Generate min(a, b)
462 * Do checks for special cases.
463 */
464 LLVMValueRef
465 lp_build_min(struct lp_build_context *bld,
466 LLVMValueRef a,
467 LLVMValueRef b)
468 {
469 if(a == bld->undef || b == bld->undef)
470 return bld->undef;
471
472 if(a == b)
473 return a;
474
475 if(bld->type.norm) {
476 if(a == bld->zero || b == bld->zero)
477 return bld->zero;
478 if(a == bld->one)
479 return b;
480 if(b == bld->one)
481 return a;
482 }
483
484 return lp_build_min_simple(bld, a, b);
485 }
486
487
488 /**
489 * Generate max(a, b)
490 * Do checks for special cases.
491 */
492 LLVMValueRef
493 lp_build_max(struct lp_build_context *bld,
494 LLVMValueRef a,
495 LLVMValueRef b)
496 {
497 if(a == bld->undef || b == bld->undef)
498 return bld->undef;
499
500 if(a == b)
501 return a;
502
503 if(bld->type.norm) {
504 if(a == bld->one || b == bld->one)
505 return bld->one;
506 if(a == bld->zero)
507 return b;
508 if(b == bld->zero)
509 return a;
510 }
511
512 return lp_build_max_simple(bld, a, b);
513 }
514
515
516 /**
517 * Generate abs(a)
518 */
519 LLVMValueRef
520 lp_build_abs(struct lp_build_context *bld,
521 LLVMValueRef a)
522 {
523 const struct lp_type type = bld->type;
524 LLVMTypeRef vec_type = lp_build_vec_type(type);
525
526 if(!type.sign)
527 return a;
528
529 if(type.floating) {
530 /* Mask out the sign bit */
531 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
532       LLVMValueRef mask = lp_build_int_const_scalar(type, ((unsigned long long)1 << (type.width - 1)) - 1);
533 a = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
534 a = LLVMBuildAnd(bld->builder, a, mask, "");
535 a = LLVMBuildBitCast(bld->builder, a, vec_type, "");
536 return a;
537 }
538
539 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
540 switch(type.width) {
541 case 8:
542 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
543 case 16:
544 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
545 case 32:
546 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
547 }
548 }
549
550 return lp_build_max(bld, a, LLVMBuildNeg(bld->builder, a, ""));
551 }
552
553
554 LLVMValueRef
555 lp_build_sgn(struct lp_build_context *bld,
556 LLVMValueRef a)
557 {
558 const struct lp_type type = bld->type;
559 LLVMTypeRef vec_type = lp_build_vec_type(type);
560 LLVMValueRef cond;
561 LLVMValueRef res;
562
563 /* Handle non-zero case */
564 if(!type.sign) {
565 /* if not zero then sign must be positive */
566 res = bld->one;
567 }
568 else if(type.floating) {
569       /* Take the sign bit of a and OR it into the bits of the constant 1.0 */
570 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
571 LLVMValueRef mask = lp_build_int_const_scalar(type, (unsigned long long)1 << (type.width - 1));
572 LLVMValueRef sign;
573 LLVMValueRef one;
574 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
575 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
576 one = LLVMConstBitCast(bld->one, int_vec_type);
577 res = LLVMBuildOr(bld->builder, sign, one, "");
578 res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
579 }
580 else
581 {
582 LLVMValueRef minus_one = lp_build_const_scalar(type, -1.0);
583 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
584 res = lp_build_select(bld, cond, bld->one, minus_one);
585 }
586
587 /* Handle zero */
588 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
589    res = lp_build_select(bld, cond, bld->zero, res);
590
591 return res;
592 }
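
/*
 * Worked example for the floating-point branch above: the sign bit of a is
 * OR'd into the bits of the constant 1.0, so a = -3.5 gives -1.0 and
 * a = 3.5 gives 1.0; the final select then maps a == 0 to 0.
 */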
593
594
595 enum lp_build_round_sse41_mode
596 {
597 LP_BUILD_ROUND_SSE41_NEAREST = 0,
598 LP_BUILD_ROUND_SSE41_FLOOR = 1,
599 LP_BUILD_ROUND_SSE41_CEIL = 2,
600 LP_BUILD_ROUND_SSE41_TRUNCATE = 3
601 };
602
603
604 static INLINE LLVMValueRef
605 lp_build_round_sse41(struct lp_build_context *bld,
606 LLVMValueRef a,
607 enum lp_build_round_sse41_mode mode)
608 {
609 const struct lp_type type = bld->type;
610 LLVMTypeRef vec_type = lp_build_vec_type(type);
611 const char *intrinsic;
612
613 assert(type.floating);
614 assert(type.width*type.length == 128);
615 assert(lp_check_value(type, a));
616 assert(util_cpu_caps.has_sse4_1);
617
618 switch(type.width) {
619 case 32:
620 intrinsic = "llvm.x86.sse41.round.ps";
621 break;
622 case 64:
623 intrinsic = "llvm.x86.sse41.round.pd";
624 break;
625 default:
626 assert(0);
627 return bld->undef;
628 }
629
630 return lp_build_intrinsic_binary(bld->builder, intrinsic, vec_type, a,
631 LLVMConstInt(LLVMInt32Type(), mode, 0));
632 }
633
634
635 LLVMValueRef
636 lp_build_trunc(struct lp_build_context *bld,
637 LLVMValueRef a)
638 {
639 const struct lp_type type = bld->type;
640
641 assert(type.floating);
642 assert(lp_check_value(type, a));
643
644 if(util_cpu_caps.has_sse4_1)
645 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
646 else {
647 LLVMTypeRef vec_type = lp_build_vec_type(type);
648 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
649 LLVMValueRef res;
650 res = LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
651 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
652 return res;
653 }
654 }
655
656
657 LLVMValueRef
658 lp_build_round(struct lp_build_context *bld,
659 LLVMValueRef a)
660 {
661 const struct lp_type type = bld->type;
662
663 assert(type.floating);
664 assert(lp_check_value(type, a));
665
666 if(util_cpu_caps.has_sse4_1)
667 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
668 else {
669 LLVMTypeRef vec_type = lp_build_vec_type(type);
670 LLVMValueRef res;
671 res = lp_build_iround(bld, a);
672 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
673 return res;
674 }
675 }
676
677
678 LLVMValueRef
679 lp_build_floor(struct lp_build_context *bld,
680 LLVMValueRef a)
681 {
682 const struct lp_type type = bld->type;
683
684 assert(type.floating);
685
686 if(util_cpu_caps.has_sse4_1)
687 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
688 else {
689 LLVMTypeRef vec_type = lp_build_vec_type(type);
690 LLVMValueRef res;
691 res = lp_build_ifloor(bld, a);
692 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
693 return res;
694 }
695 }
696
697
698 LLVMValueRef
699 lp_build_ceil(struct lp_build_context *bld,
700 LLVMValueRef a)
701 {
702 const struct lp_type type = bld->type;
703
704 assert(type.floating);
705 assert(lp_check_value(type, a));
706
707 if(util_cpu_caps.has_sse4_1)
708 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
709 else {
710 LLVMTypeRef vec_type = lp_build_vec_type(type);
711 LLVMValueRef res;
712 res = lp_build_iceil(bld, a);
713 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
714 return res;
715 }
716 }
717
718
719 /**
720 * Convert to integer using whichever rounding method is fastest,
721 * typically truncating toward zero.
722 */
723 LLVMValueRef
724 lp_build_itrunc(struct lp_build_context *bld,
725 LLVMValueRef a)
726 {
727 const struct lp_type type = bld->type;
728 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
729
730 assert(type.floating);
731 assert(lp_check_value(type, a));
732
733 return LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
734 }
735
736
737 LLVMValueRef
738 lp_build_iround(struct lp_build_context *bld,
739 LLVMValueRef a)
740 {
741 const struct lp_type type = bld->type;
742 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
743 LLVMValueRef res;
744
745 assert(type.floating);
746 assert(lp_check_value(type, a));
747
748 if(util_cpu_caps.has_sse4_1) {
749 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
750 }
751 else {
752 LLVMTypeRef vec_type = lp_build_vec_type(type);
753 LLVMValueRef mask = lp_build_int_const_scalar(type, (unsigned long long)1 << (type.width - 1));
754 LLVMValueRef sign;
755 LLVMValueRef half;
756
757 /* get sign bit */
758 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
759 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
760
761 /* sign * 0.5 */
762 half = lp_build_const_scalar(type, 0.5);
763 half = LLVMBuildBitCast(bld->builder, half, int_vec_type, "");
764 half = LLVMBuildOr(bld->builder, sign, half, "");
765 half = LLVMBuildBitCast(bld->builder, half, vec_type, "");
766
767 res = LLVMBuildAdd(bld->builder, a, half, "");
768 }
769
770 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
771
772 return res;
773 }
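
/*
 * Worked example for the non-SSE4.1 path above (round half away from zero):
 * for a = -2.6 the sign bit makes half = -0.5, so a + half = -3.1, which the
 * final truncation turns into -3; for a = 2.6, 2.6 + 0.5 = 3.1 truncates to 3.
 */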
774
775
776 LLVMValueRef
777 lp_build_ifloor(struct lp_build_context *bld,
778 LLVMValueRef a)
779 {
780 const struct lp_type type = bld->type;
781 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
782 LLVMValueRef res;
783
784 assert(type.floating);
785 assert(lp_check_value(type, a));
786
787 if(util_cpu_caps.has_sse4_1) {
788 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
789 }
790 else {
791 /* Take the sign bit and add it to 1 constant */
792 LLVMTypeRef vec_type = lp_build_vec_type(type);
793 unsigned mantissa = lp_mantissa(type);
794 LLVMValueRef mask = lp_build_int_const_scalar(type, (unsigned long long)1 << (type.width - 1));
795 LLVMValueRef sign;
796 LLVMValueRef offset;
797
798 /* sign = a < 0 ? ~0 : 0 */
799 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
800 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
801 sign = LLVMBuildAShr(bld->builder, sign, lp_build_int_const_scalar(type, type.width - 1), "");
802
803 /* offset = -0.99999(9)f */
804 offset = lp_build_const_scalar(type, -(double)(((unsigned long long)1 << mantissa) - 1)/((unsigned long long)1 << mantissa));
805 offset = LLVMConstBitCast(offset, int_vec_type);
806
807 /* offset = a < 0 ? -0.99999(9)f : 0.0f */
808 offset = LLVMBuildAnd(bld->builder, offset, sign, "");
809 offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "");
810
811 res = LLVMBuildAdd(bld->builder, a, offset, "");
812 }
813
814 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
815
816 return res;
817 }
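
/*
 * Worked example for the non-SSE4.1 path above: for a = -1.3 the sign mask is
 * all ones, so offset = -0.99999(9) and a + offset = -2.29999..., which
 * truncates toward zero to -2 == floor(-1.3); for a >= 0 the offset is masked
 * to 0.0 and plain truncation already equals floor.
 */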
818
819
820 LLVMValueRef
821 lp_build_iceil(struct lp_build_context *bld,
822 LLVMValueRef a)
823 {
824 const struct lp_type type = bld->type;
825 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
826 LLVMValueRef res;
827
828 assert(type.floating);
829 assert(lp_check_value(type, a));
830
831 if(util_cpu_caps.has_sse4_1) {
832 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
833 }
834 else {
835 assert(0);
836 res = bld->undef;
837 }
838
839 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
840
841 return res;
842 }
843
844
845 LLVMValueRef
846 lp_build_sqrt(struct lp_build_context *bld,
847 LLVMValueRef a)
848 {
849 const struct lp_type type = bld->type;
850 LLVMTypeRef vec_type = lp_build_vec_type(type);
851 char intrinsic[32];
852
853    /* TODO: optimize the constant case */
855
856 assert(type.floating);
857 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
858
859 return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
860 }
861
862
863 LLVMValueRef
864 lp_build_rcp(struct lp_build_context *bld,
865 LLVMValueRef a)
866 {
867 const struct lp_type type = bld->type;
868
869 if(a == bld->zero)
870 return bld->undef;
871 if(a == bld->one)
872 return bld->one;
873 if(a == bld->undef)
874 return bld->undef;
875
876 assert(type.floating);
877
878 if(LLVMIsConstant(a))
879 return LLVMConstFDiv(bld->one, a);
880
881 if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
882 /* FIXME: improve precision */
883 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type), a);
884
885 return LLVMBuildFDiv(bld->builder, bld->one, a, "");
886 }
887
888
889 /**
890 * Generate 1/sqrt(a)
891 */
892 LLVMValueRef
893 lp_build_rsqrt(struct lp_build_context *bld,
894 LLVMValueRef a)
895 {
896 const struct lp_type type = bld->type;
897
898 assert(type.floating);
899
900 if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
901 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", lp_build_vec_type(type), a);
902
903 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
904 }
905
906
907 /**
908 * Generate cos(a)
909 */
910 LLVMValueRef
911 lp_build_cos(struct lp_build_context *bld,
912 LLVMValueRef a)
913 {
914 const struct lp_type type = bld->type;
915 LLVMTypeRef vec_type = lp_build_vec_type(type);
916 char intrinsic[32];
917
918 /* TODO: optimize the constant case */
919
920 assert(type.floating);
921 util_snprintf(intrinsic, sizeof intrinsic, "llvm.cos.v%uf%u", type.length, type.width);
922
923 return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
924 }
925
926
927 /**
928 * Generate sin(a)
929 */
930 LLVMValueRef
931 lp_build_sin(struct lp_build_context *bld,
932 LLVMValueRef a)
933 {
934 const struct lp_type type = bld->type;
935 LLVMTypeRef vec_type = lp_build_vec_type(type);
936 char intrinsic[32];
937
938 /* TODO: optimize the constant case */
939
940 assert(type.floating);
941 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sin.v%uf%u", type.length, type.width);
942
943 return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
944 }
945
946
947 /**
948 * Generate pow(x, y)
949 */
950 LLVMValueRef
951 lp_build_pow(struct lp_build_context *bld,
952 LLVMValueRef x,
953 LLVMValueRef y)
954 {
955 /* TODO: optimize the constant case */
956 if(LLVMIsConstant(x) && LLVMIsConstant(y))
957 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
958 __FUNCTION__);
959
960 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
961 }
962
963
964 /**
965 * Generate exp(x)
966 */
967 LLVMValueRef
968 lp_build_exp(struct lp_build_context *bld,
969 LLVMValueRef x)
970 {
971    /* log2(e) = 1/log(2); exp(x) == exp2(log2(e) * x) */
972    LLVMValueRef log2e = lp_build_const_scalar(bld->type, 1.4426950408889634);
973
974    return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
975 }
976
977
978 /**
979 * Generate log(x)
980 */
981 LLVMValueRef
982 lp_build_log(struct lp_build_context *bld,
983 LLVMValueRef x)
984 {
985    /* log(2); log(x) == log(2) * log2(x) */
986    LLVMValueRef log2 = lp_build_const_scalar(bld->type, 0.69314718055994529);
987
988    return lp_build_mul(bld, log2, lp_build_log2(bld, x));
989 }
990
991
992 #define EXP_POLY_DEGREE 3
993 #define LOG_POLY_DEGREE 5
994
995
996 /**
997 * Generate polynomial.
998 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2] + ... (coeffs[0] is the constant term).
999 */
1000 static LLVMValueRef
1001 lp_build_polynomial(struct lp_build_context *bld,
1002 LLVMValueRef x,
1003 const double *coeffs,
1004 unsigned num_coeffs)
1005 {
1006 const struct lp_type type = bld->type;
1007 LLVMValueRef res = NULL;
1008 unsigned i;
1009
1010 /* TODO: optimize the constant case */
1011 if(LLVMIsConstant(x))
1012 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1013 __FUNCTION__);
1014
1015 for (i = num_coeffs; i--; ) {
1016 LLVMValueRef coeff = lp_build_const_scalar(type, coeffs[i]);
1017 if(res)
1018 res = lp_build_add(bld, coeff, lp_build_mul(bld, x, res));
1019 else
1020 res = coeff;
1021 }
1022
1023 if(res)
1024 return res;
1025 else
1026 return bld->undef;
1027 }
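
/*
 * The loop above is Horner's scheme. A scalar sketch of the same evaluation
 * (hypothetical helper, for illustration only):
 *
 *    static double poly_ref(double x, const double *coeffs, unsigned num_coeffs)
 *    {
 *       double res = 0.0;
 *       unsigned i;
 *       for (i = num_coeffs; i--; )
 *          res = coeffs[i] + x * res;
 *       return res;
 *    }
 *
 * which computes coeffs[0] + x*coeffs[1] + x^2*coeffs[2] + ... with one
 * multiply-add per coefficient.
 */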
1028
1029
1030 /**
1031 * Minimax polynomial fit of 2**x, in range [-0.5, 0.5[
1032 */
1033 const double lp_build_exp2_polynomial[] = {
1034 #if EXP_POLY_DEGREE == 5
1035 9.9999994e-1, 6.9315308e-1, 2.4015361e-1, 5.5826318e-2, 8.9893397e-3, 1.8775767e-3
1036 #elif EXP_POLY_DEGREE == 4
1037 1.0000026, 6.9300383e-1, 2.4144275e-1, 5.2011464e-2, 1.3534167e-2
1038 #elif EXP_POLY_DEGREE == 3
1039 9.9992520e-1, 6.9583356e-1, 2.2606716e-1, 7.8024521e-2
1040 #elif EXP_POLY_DEGREE == 2
1041 1.0017247, 6.5763628e-1, 3.3718944e-1
1042 #else
1043 #error
1044 #endif
1045 };
1046
1047
1048 void
1049 lp_build_exp2_approx(struct lp_build_context *bld,
1050 LLVMValueRef x,
1051 LLVMValueRef *p_exp2_int_part,
1052 LLVMValueRef *p_frac_part,
1053 LLVMValueRef *p_exp2)
1054 {
1055 const struct lp_type type = bld->type;
1056 LLVMTypeRef vec_type = lp_build_vec_type(type);
1057 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1058 LLVMValueRef ipart = NULL;
1059 LLVMValueRef fpart = NULL;
1060 LLVMValueRef expipart = NULL;
1061 LLVMValueRef expfpart = NULL;
1062 LLVMValueRef res = NULL;
1063
1064 if(p_exp2_int_part || p_frac_part || p_exp2) {
1065 /* TODO: optimize the constant case */
1066 if(LLVMIsConstant(x))
1067 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1068 __FUNCTION__);
1069
1070 assert(type.floating && type.width == 32);
1071
1072 x = lp_build_min(bld, x, lp_build_const_scalar(type, 129.0));
1073 x = lp_build_max(bld, x, lp_build_const_scalar(type, -126.99999));
1074
1075 /* ipart = int(x - 0.5) */
1076 ipart = LLVMBuildSub(bld->builder, x, lp_build_const_scalar(type, 0.5f), "");
1077 ipart = LLVMBuildFPToSI(bld->builder, ipart, int_vec_type, "");
1078
1079 /* fpart = x - ipart */
1080 fpart = LLVMBuildSIToFP(bld->builder, ipart, vec_type, "");
1081 fpart = LLVMBuildSub(bld->builder, x, fpart, "");
1082 }
1083
1084 if(p_exp2_int_part || p_exp2) {
1085 /* expipart = (float) (1 << ipart) */
1086 expipart = LLVMBuildAdd(bld->builder, ipart, lp_build_int_const_scalar(type, 127), "");
1087 expipart = LLVMBuildShl(bld->builder, expipart, lp_build_int_const_scalar(type, 23), "");
1088 expipart = LLVMBuildBitCast(bld->builder, expipart, vec_type, "");
1089 }
1090
1091 if(p_exp2) {
1092 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
1093 Elements(lp_build_exp2_polynomial));
1094
1095 res = LLVMBuildMul(bld->builder, expipart, expfpart, "");
1096 }
1097
1098 if(p_exp2_int_part)
1099 *p_exp2_int_part = expipart;
1100
1101 if(p_frac_part)
1102 *p_frac_part = fpart;
1103
1104 if(p_exp2)
1105 *p_exp2 = res;
1106 }
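
/*
 * The expipart computation above is the usual IEEE-754 exponent trick. A
 * scalar sketch (hypothetical helper, for illustration only; assumes 32-bit
 * floats, <stdint.h>, <string.h>, and -126 <= i <= 127):
 *
 *    static float exp2i_ref(int i)
 *    {
 *       uint32_t bits = (uint32_t)(i + 127) << 23;   // biased exponent field
 *       float f;
 *       memcpy(&f, &bits, sizeof f);                 // bitcast, not conversion
 *       return f;                                    // == 2^i
 *    }
 *
 * The fractional part is then covered by the minimax polynomial, giving
 * 2^x == exp2i_ref(ipart) * poly(fpart).
 */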
1107
1108
1109 LLVMValueRef
1110 lp_build_exp2(struct lp_build_context *bld,
1111 LLVMValueRef x)
1112 {
1113 LLVMValueRef res;
1114 lp_build_exp2_approx(bld, x, NULL, NULL, &res);
1115 return res;
1116 }
1117
1118
1119 /**
1120 * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
1121 * These coefficients can be generated with
1122 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
1123 */
1124 const double lp_build_log2_polynomial[] = {
1125 #if LOG_POLY_DEGREE == 6
1126 3.11578814719469302614, -3.32419399085241980044, 2.59883907202499966007, -1.23152682416275988241, 0.318212422185251071475, -0.0344359067839062357313
1127 #elif LOG_POLY_DEGREE == 5
1128 2.8882704548164776201, -2.52074962577807006663, 1.48116647521213171641, -0.465725644288844778798, 0.0596515482674574969533
1129 #elif LOG_POLY_DEGREE == 4
1130 2.61761038894603480148, -1.75647175389045657003, 0.688243882994381274313, -0.107254423828329604454
1131 #elif LOG_POLY_DEGREE == 3
1132 2.28330284476918490682, -1.04913055217340124191, 0.204446009836232697516
1133 #else
1134 #error
1135 #endif
1136 };
1137
1138
1139 /**
1140 * See http://www.devmaster.net/forums/showthread.php?p=43580
1141 */
1142 void
1143 lp_build_log2_approx(struct lp_build_context *bld,
1144 LLVMValueRef x,
1145 LLVMValueRef *p_exp,
1146 LLVMValueRef *p_floor_log2,
1147 LLVMValueRef *p_log2)
1148 {
1149 const struct lp_type type = bld->type;
1150 LLVMTypeRef vec_type = lp_build_vec_type(type);
1151 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1152
1153 LLVMValueRef expmask = lp_build_int_const_scalar(type, 0x7f800000);
1154 LLVMValueRef mantmask = lp_build_int_const_scalar(type, 0x007fffff);
1155 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
1156
1157 LLVMValueRef i = NULL;
1158 LLVMValueRef exp = NULL;
1159 LLVMValueRef mant = NULL;
1160 LLVMValueRef logexp = NULL;
1161 LLVMValueRef logmant = NULL;
1162 LLVMValueRef res = NULL;
1163
1164 if(p_exp || p_floor_log2 || p_log2) {
1165 /* TODO: optimize the constant case */
1166 if(LLVMIsConstant(x))
1167 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1168 __FUNCTION__);
1169
1170 assert(type.floating && type.width == 32);
1171
1172 i = LLVMBuildBitCast(bld->builder, x, int_vec_type, "");
1173
1174 /* exp = (float) exponent(x) */
1175 exp = LLVMBuildAnd(bld->builder, i, expmask, "");
1176 }
1177
1178 if(p_floor_log2 || p_log2) {
1179 logexp = LLVMBuildLShr(bld->builder, exp, lp_build_int_const_scalar(type, 23), "");
1180 logexp = LLVMBuildSub(bld->builder, logexp, lp_build_int_const_scalar(type, 127), "");
1181 logexp = LLVMBuildSIToFP(bld->builder, logexp, vec_type, "");
1182 }
1183
1184 if(p_log2) {
1185 /* mant = (float) mantissa(x) */
1186 mant = LLVMBuildAnd(bld->builder, i, mantmask, "");
1187 mant = LLVMBuildOr(bld->builder, mant, one, "");
1188       mant = LLVMBuildBitCast(bld->builder, mant, vec_type, "");
1189
1190 logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
1191 Elements(lp_build_log2_polynomial));
1192
1193       /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0 */
1194       logmant = LLVMBuildMul(bld->builder, logmant, LLVMBuildSub(bld->builder, mant, bld->one, ""), "");
1195
1196 res = LLVMBuildAdd(bld->builder, logmant, logexp, "");
1197 }
1198
1199 if(p_exp)
1200 *p_exp = exp;
1201
1202 if(p_floor_log2)
1203 *p_floor_log2 = logexp;
1204
1205 if(p_log2)
1206 *p_log2 = res;
1207 }
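
/*
 * A scalar sketch of the decomposition used above (hypothetical helper, for
 * illustration only; assumes 32-bit floats, x > 0, <math.h>, <stdint.h> and
 * <string.h>):
 *
 *    static float log2_ref(float x)
 *    {
 *       uint32_t i;
 *       float mant;
 *       int logexp;
 *       memcpy(&i, &x, sizeof i);                   // bitcast
 *       logexp = (int)((i >> 23) & 0xff) - 127;     // unbiased exponent
 *       i = (i & 0x007fffff) | 0x3f800000;          // mantissa scaled into [1, 2)
 *       memcpy(&mant, &i, sizeof mant);
 *       return logexp + log2f(mant);                // the code above instead uses
 *    }                                              // poly(mant) * (mant - 1)
 *
 * so log2(x) == exponent(x) + log2(mantissa(x)).
 */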
1208
1209
1210 LLVMValueRef
1211 lp_build_log2(struct lp_build_context *bld,
1212 LLVMValueRef x)
1213 {
1214 LLVMValueRef res;
1215 lp_build_log2_approx(bld, x, NULL, NULL, &res);
1216 return res;
1217 }