gallivm: Workaround LLVM PR 27332.
[mesa.git] src/gallium/auxiliary/gallivm/lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31  * Helper arithmetic functions.
32  *
33  * LLVM IR doesn't support all the basic arithmetic operations we care about (most
34  * notably min/max and saturated operations), and it is often necessary to
35  * resort to machine-specific intrinsics directly. The functions here hide all
36  * these implementation details from the other modules.
37  *
38  * We also do simple expression simplification here. The reasons are:
39  * - it is very easy, given that we have all the necessary information readily available
40  * - LLVM optimization passes fail to simplify several vector expressions
41  * - we often know value constraints which the optimization passes have no way
42  * of knowing, such as when source arguments are known to be in the [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include <float.h>
49
50 #include "util/u_memory.h"
51 #include "util/u_debug.h"
52 #include "util/u_math.h"
53 #include "util/u_string.h"
54 #include "util/u_cpu_detect.h"
55
56 #include "lp_bld_type.h"
57 #include "lp_bld_const.h"
58 #include "lp_bld_init.h"
59 #include "lp_bld_intr.h"
60 #include "lp_bld_logic.h"
61 #include "lp_bld_pack.h"
62 #include "lp_bld_debug.h"
63 #include "lp_bld_bitarit.h"
64 #include "lp_bld_arit.h"
65 #include "lp_bld_flow.h"
66
67 #if defined(PIPE_ARCH_SSE)
68 #include <xmmintrin.h>
69 #endif
70
71 #ifndef _MM_DENORMALS_ZERO_MASK
72 #define _MM_DENORMALS_ZERO_MASK 0x0040
73 #endif
74
75 #ifndef _MM_FLUSH_ZERO_MASK
76 #define _MM_FLUSH_ZERO_MASK 0x8000
77 #endif
78
79 #define EXP_POLY_DEGREE 5
80
81 #define LOG_POLY_DEGREE 4
82
83
84 /**
85 * Generate min(a, b)
86  * No checks are done for the special-case values a or b equal to 1 or 0.
87  * NaNs are handled according to the behavior specified by the
88 * nan_behavior argument.
89 */
90 static LLVMValueRef
91 lp_build_min_simple(struct lp_build_context *bld,
92 LLVMValueRef a,
93 LLVMValueRef b,
94 enum gallivm_nan_behavior nan_behavior)
95 {
96 const struct lp_type type = bld->type;
97 const char *intrinsic = NULL;
98 unsigned intr_size = 0;
99 LLVMValueRef cond;
100
101 assert(lp_check_value(type, a));
102 assert(lp_check_value(type, b));
103
104 /* TODO: optimize the constant case */
105
106 if (type.floating && util_cpu_caps.has_sse) {
107 if (type.width == 32) {
108 if (type.length == 1) {
109 intrinsic = "llvm.x86.sse.min.ss";
110 intr_size = 128;
111 }
112 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
113 intrinsic = "llvm.x86.sse.min.ps";
114 intr_size = 128;
115 }
116 else {
117 intrinsic = "llvm.x86.avx.min.ps.256";
118 intr_size = 256;
119 }
120 }
121 if (type.width == 64 && util_cpu_caps.has_sse2) {
122 if (type.length == 1) {
123 intrinsic = "llvm.x86.sse2.min.sd";
124 intr_size = 128;
125 }
126 else if (type.length == 2 || !util_cpu_caps.has_avx) {
127 intrinsic = "llvm.x86.sse2.min.pd";
128 intr_size = 128;
129 }
130 else {
131 intrinsic = "llvm.x86.avx.min.pd.256";
132 intr_size = 256;
133 }
134 }
135 }
136 else if (type.floating && util_cpu_caps.has_altivec) {
137 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
138 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
139 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
140 __FUNCTION__);
141 }
142 if (type.width == 32 && type.length == 4) {
143 intrinsic = "llvm.ppc.altivec.vminfp";
144 intr_size = 128;
145 }
146 } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
147 intr_size = 128;
148 if ((type.width == 8 || type.width == 16) &&
149 (type.width * type.length <= 64) &&
150 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
151 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
152 __FUNCTION__);
153 }
154 if (type.width == 8 && !type.sign) {
155 intrinsic = "llvm.x86.sse2.pminu.b";
156 }
157 else if (type.width == 16 && type.sign) {
158 intrinsic = "llvm.x86.sse2.pmins.w";
159 }
160 if (util_cpu_caps.has_sse4_1) {
161 if (type.width == 8 && type.sign) {
162 intrinsic = "llvm.x86.sse41.pminsb";
163 }
164 if (type.width == 16 && !type.sign) {
165 intrinsic = "llvm.x86.sse41.pminuw";
166 }
167 if (type.width == 32 && !type.sign) {
168 intrinsic = "llvm.x86.sse41.pminud";
169 }
170 if (type.width == 32 && type.sign) {
171 intrinsic = "llvm.x86.sse41.pminsd";
172 }
173 }
174 } else if (util_cpu_caps.has_altivec) {
175 intr_size = 128;
176 if (type.width == 8) {
177 if (!type.sign) {
178 intrinsic = "llvm.ppc.altivec.vminub";
179 } else {
180 intrinsic = "llvm.ppc.altivec.vminsb";
181 }
182 } else if (type.width == 16) {
183 if (!type.sign) {
184 intrinsic = "llvm.ppc.altivec.vminuh";
185 } else {
186 intrinsic = "llvm.ppc.altivec.vminsh";
187 }
188 } else if (type.width == 32) {
189 if (!type.sign) {
190 intrinsic = "llvm.ppc.altivec.vminuw";
191 } else {
192 intrinsic = "llvm.ppc.altivec.vminsw";
193 }
194 }
195 }
196
197 if (intrinsic) {
198       /* We need to handle NaNs for floating point numbers. If one of the
199        * inputs is a NaN the other should be returned (required by both D3D10+
200        * and OpenCL).
201        * The SSE intrinsics return the second operand in case of a NaN by
202        * default, so we need special code to handle those cases.
203        */
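      /* Illustrative sketch of the SSE semantics relied on here (minps/maxps
       * and friends return their second operand whenever either input is NaN):
       *   min(NaN, x) -> x      min(x, NaN) -> NaN
       * hence the explicit isnan/select fixup below on top of the intrinsic
       * result.
       */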
204 if (util_cpu_caps.has_sse && type.floating &&
205 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
206 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
207 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
208 LLVMValueRef isnan, min;
209 min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
210 type,
211 intr_size, a, b);
212 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
213 isnan = lp_build_isnan(bld, b);
214 return lp_build_select(bld, isnan, a, min);
215 } else {
216 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
217 isnan = lp_build_isnan(bld, a);
218 return lp_build_select(bld, isnan, a, min);
219 }
220 } else {
221 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
222 type,
223 intr_size, a, b);
224 }
225 }
226
227 if (type.floating) {
228 switch (nan_behavior) {
229 case GALLIVM_NAN_RETURN_NAN: {
230 LLVMValueRef isnan = lp_build_isnan(bld, b);
231 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
232 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
233 return lp_build_select(bld, cond, a, b);
234 }
235 break;
236 case GALLIVM_NAN_RETURN_OTHER: {
237 LLVMValueRef isnan = lp_build_isnan(bld, a);
238 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
239 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
240 return lp_build_select(bld, cond, a, b);
241 }
242 break;
243 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
244 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
245 return lp_build_select(bld, cond, a, b);
246 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
247 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
248 return lp_build_select(bld, cond, b, a);
249 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
250 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
251 return lp_build_select(bld, cond, a, b);
252 break;
253 default:
254 assert(0);
255 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
256 return lp_build_select(bld, cond, a, b);
257 }
258 } else {
259 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
260 return lp_build_select(bld, cond, a, b);
261 }
262 }
263
264
265 /**
266 * Generate max(a, b)
267  * No checks are done for the special-case values a or b equal to 1 or 0.
268  * NaNs are handled according to the behavior specified by the
269 * nan_behavior argument.
270 */
271 static LLVMValueRef
272 lp_build_max_simple(struct lp_build_context *bld,
273 LLVMValueRef a,
274 LLVMValueRef b,
275 enum gallivm_nan_behavior nan_behavior)
276 {
277 const struct lp_type type = bld->type;
278 const char *intrinsic = NULL;
279 unsigned intr_size = 0;
280 LLVMValueRef cond;
281
282 assert(lp_check_value(type, a));
283 assert(lp_check_value(type, b));
284
285 /* TODO: optimize the constant case */
286
287 if (type.floating && util_cpu_caps.has_sse) {
288 if (type.width == 32) {
289 if (type.length == 1) {
290 intrinsic = "llvm.x86.sse.max.ss";
291 intr_size = 128;
292 }
293 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
294 intrinsic = "llvm.x86.sse.max.ps";
295 intr_size = 128;
296 }
297 else {
298 intrinsic = "llvm.x86.avx.max.ps.256";
299 intr_size = 256;
300 }
301 }
302 if (type.width == 64 && util_cpu_caps.has_sse2) {
303 if (type.length == 1) {
304 intrinsic = "llvm.x86.sse2.max.sd";
305 intr_size = 128;
306 }
307 else if (type.length == 2 || !util_cpu_caps.has_avx) {
308 intrinsic = "llvm.x86.sse2.max.pd";
309 intr_size = 128;
310 }
311 else {
312 intrinsic = "llvm.x86.avx.max.pd.256";
313 intr_size = 256;
314 }
315 }
316 }
317 else if (type.floating && util_cpu_caps.has_altivec) {
318 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
319 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
320 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
321 __FUNCTION__);
322 }
323       if (type.width == 32 && type.length == 4) {
324 intrinsic = "llvm.ppc.altivec.vmaxfp";
325 intr_size = 128;
326 }
327 } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
328 intr_size = 128;
329 if ((type.width == 8 || type.width == 16) &&
330 (type.width * type.length <= 64) &&
331 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
332 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
333 __FUNCTION__);
334 }
335 if (type.width == 8 && !type.sign) {
336 intrinsic = "llvm.x86.sse2.pmaxu.b";
337 intr_size = 128;
338 }
339 else if (type.width == 16 && type.sign) {
340 intrinsic = "llvm.x86.sse2.pmaxs.w";
341 }
342 if (util_cpu_caps.has_sse4_1) {
343 if (type.width == 8 && type.sign) {
344 intrinsic = "llvm.x86.sse41.pmaxsb";
345 }
346 if (type.width == 16 && !type.sign) {
347 intrinsic = "llvm.x86.sse41.pmaxuw";
348 }
349 if (type.width == 32 && !type.sign) {
350 intrinsic = "llvm.x86.sse41.pmaxud";
351 }
352 if (type.width == 32 && type.sign) {
353 intrinsic = "llvm.x86.sse41.pmaxsd";
354 }
355 }
356 } else if (util_cpu_caps.has_altivec) {
357 intr_size = 128;
358 if (type.width == 8) {
359 if (!type.sign) {
360 intrinsic = "llvm.ppc.altivec.vmaxub";
361 } else {
362 intrinsic = "llvm.ppc.altivec.vmaxsb";
363 }
364 } else if (type.width == 16) {
365 if (!type.sign) {
366 intrinsic = "llvm.ppc.altivec.vmaxuh";
367 } else {
368 intrinsic = "llvm.ppc.altivec.vmaxsh";
369 }
370 } else if (type.width == 32) {
371 if (!type.sign) {
372 intrinsic = "llvm.ppc.altivec.vmaxuw";
373 } else {
374 intrinsic = "llvm.ppc.altivec.vmaxsw";
375 }
376 }
377 }
378
379 if (intrinsic) {
380 if (util_cpu_caps.has_sse && type.floating &&
381 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
382 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
383 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
384 LLVMValueRef isnan, max;
385 max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
386 type,
387 intr_size, a, b);
388 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
389 isnan = lp_build_isnan(bld, b);
390 return lp_build_select(bld, isnan, a, max);
391 } else {
392 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
393 isnan = lp_build_isnan(bld, a);
394 return lp_build_select(bld, isnan, a, max);
395 }
396 } else {
397 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
398 type,
399 intr_size, a, b);
400 }
401 }
402
403 if (type.floating) {
404 switch (nan_behavior) {
405 case GALLIVM_NAN_RETURN_NAN: {
406 LLVMValueRef isnan = lp_build_isnan(bld, b);
407 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
408 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
409 return lp_build_select(bld, cond, a, b);
410 }
411 break;
412 case GALLIVM_NAN_RETURN_OTHER: {
413 LLVMValueRef isnan = lp_build_isnan(bld, a);
414 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
415 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
416 return lp_build_select(bld, cond, a, b);
417 }
418 break;
419 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
420 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
421 return lp_build_select(bld, cond, a, b);
422 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
423 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
424 return lp_build_select(bld, cond, b, a);
425 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
426 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
427 return lp_build_select(bld, cond, a, b);
428 break;
429 default:
430 assert(0);
431 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
432 return lp_build_select(bld, cond, a, b);
433 }
434 } else {
435 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
436 return lp_build_select(bld, cond, a, b);
437 }
438 }
439
440
441 /**
442 * Generate 1 - a, or ~a depending on bld->type.
443 */
444 LLVMValueRef
445 lp_build_comp(struct lp_build_context *bld,
446 LLVMValueRef a)
447 {
448 LLVMBuilderRef builder = bld->gallivm->builder;
449 const struct lp_type type = bld->type;
450
451 assert(lp_check_value(type, a));
452
453 if(a == bld->one)
454 return bld->zero;
455 if(a == bld->zero)
456 return bld->one;
457
458 if(type.norm && !type.floating && !type.fixed && !type.sign) {
459 if(LLVMIsConstant(a))
460 return LLVMConstNot(a);
461 else
462 return LLVMBuildNot(builder, a, "");
463 }
464
465 if(LLVMIsConstant(a))
466 if (type.floating)
467 return LLVMConstFSub(bld->one, a);
468 else
469 return LLVMConstSub(bld->one, a);
470 else
471 if (type.floating)
472 return LLVMBuildFSub(builder, bld->one, a, "");
473 else
474 return LLVMBuildSub(builder, bld->one, a, "");
475 }
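/*
 * Note on the ~a shortcut above: for an unsigned normalized type, 1.0 is
 * represented by the all-ones bit pattern, so 1 - a and ~a coincide.
 * E.g. for an 8-bit unorm value (purely illustrative): one = 0xff and
 * 0xff - a == ~a for every a in [0x00, 0xff].
 */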
476
477
478 /**
479 * Generate a + b
480 */
481 LLVMValueRef
482 lp_build_add(struct lp_build_context *bld,
483 LLVMValueRef a,
484 LLVMValueRef b)
485 {
486 LLVMBuilderRef builder = bld->gallivm->builder;
487 const struct lp_type type = bld->type;
488 LLVMValueRef res;
489
490 assert(lp_check_value(type, a));
491 assert(lp_check_value(type, b));
492
493 if(a == bld->zero)
494 return b;
495 if(b == bld->zero)
496 return a;
497 if(a == bld->undef || b == bld->undef)
498 return bld->undef;
499
500 if(bld->type.norm) {
501 const char *intrinsic = NULL;
502
503 if(a == bld->one || b == bld->one)
504 return bld->one;
505
506 if (type.width * type.length == 128 &&
507 !type.floating && !type.fixed) {
508 if(util_cpu_caps.has_sse2) {
509 if(type.width == 8)
510 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
511 if(type.width == 16)
512 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
513 } else if (util_cpu_caps.has_altivec) {
514 if(type.width == 8)
515 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
516 if(type.width == 16)
517 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
518 }
519 }
520
521 if (intrinsic)
522 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
523 }
524
525 if(type.norm && !type.floating && !type.fixed) {
526 if (type.sign) {
527 uint64_t sign = (uint64_t)1 << (type.width - 1);
528 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
529 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
530 /* a_clamp_max is the maximum a for positive b,
531 a_clamp_min is the minimum a for negative b. */
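         /* For example (8-bit signed, illustrative values): with a = 100 and
          * b = 50, max_val = 127, so a_clamp_max = min(100, 127 - 50) = 77,
          * and the add below then yields 77 + 50 = 127, i.e. the correctly
          * saturated result.
          */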
532 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
533 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
534 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
535 } else {
536 a = lp_build_min_simple(bld, a, lp_build_comp(bld, b), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
537 }
538 }
539
540 if(LLVMIsConstant(a) && LLVMIsConstant(b))
541 if (type.floating)
542 res = LLVMConstFAdd(a, b);
543 else
544 res = LLVMConstAdd(a, b);
545 else
546 if (type.floating)
547 res = LLVMBuildFAdd(builder, a, b, "");
548 else
549 res = LLVMBuildAdd(builder, a, b, "");
550
551 /* clamp to ceiling of 1.0 */
552 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
553 res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
554
555 /* XXX clamp to floor of -1 or 0??? */
556
557 return res;
558 }
559
560
561 /** Return the scalar sum of the elements of a.
562  * Callers should avoid this operation whenever possible.
563 */
564 LLVMValueRef
565 lp_build_horizontal_add(struct lp_build_context *bld,
566 LLVMValueRef a)
567 {
568 LLVMBuilderRef builder = bld->gallivm->builder;
569 const struct lp_type type = bld->type;
570 LLVMValueRef index, res;
571 unsigned i, length;
572 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
573 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
574 LLVMValueRef vecres, elem2;
575
576 assert(lp_check_value(type, a));
577
578 if (type.length == 1) {
579 return a;
580 }
581
582 assert(!bld->type.norm);
583
584 /*
585     * For byte vectors we could do much better with psadbw.
586     * Using repeated shuffle/adds here. Note that with multiple vectors
587     * this can be done more efficiently as outlined in the Intel
588     * optimization manual.
589 * Note: could cause data rearrangement if used with smaller element
590 * sizes.
591 */
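   /* Illustrative reduction for a 4-wide vector a = <a0 a1 a2 a3>:
    *   vec1 = <a0 a1>, vec2 = <a2 a3>, vecres = <a0+a2  a1+a3>
    * and the final extract/add below then yields (a0+a2) + (a1+a3).
    */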
592
593 vecres = a;
594 length = type.length / 2;
595 while (length > 1) {
596 LLVMValueRef vec1, vec2;
597 for (i = 0; i < length; i++) {
598 shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
599 shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
600 }
601 vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
602 LLVMConstVector(shuffles1, length), "");
603 vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
604 LLVMConstVector(shuffles2, length), "");
605 if (type.floating) {
606 vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
607 }
608 else {
609 vecres = LLVMBuildAdd(builder, vec1, vec2, "");
610 }
611 length = length >> 1;
612 }
613
614 /* always have vector of size 2 here */
615 assert(length == 1);
616
617 index = lp_build_const_int32(bld->gallivm, 0);
618 res = LLVMBuildExtractElement(builder, vecres, index, "");
619 index = lp_build_const_int32(bld->gallivm, 1);
620 elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
621
622 if (type.floating)
623 res = LLVMBuildFAdd(builder, res, elem2, "");
624 else
625 res = LLVMBuildAdd(builder, res, elem2, "");
626
627 return res;
628 }
629
630 /**
631 * Return the horizontal sums of 4 float vectors as a float4 vector.
632  * This uses the technique outlined in the Intel Optimization Manual.
633 */
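/* Data-flow sketch, assuming src[i] = <i0 i1 i2 i3> for i = x, y, z, w:
 *   tmp[0] = <x0 x1 y0 y1>   tmp[1] = <x2 x3 y2 y3>
 *   tmp[2] = <z0 z1 w0 w1>   tmp[3] = <z2 z3 w2 w3>
 *   sumtmp[0] = <x0+x2  x1+x3  y0+y2  y1+y3>
 *   sumtmp[1] = <z0+z2  z1+z3  w0+w2  w1+w3>
 *   shuftmp[0] + shuftmp[1] = <sum(x) sum(y) sum(z) sum(w)>
 */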
634 static LLVMValueRef
635 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
636 LLVMValueRef src[4])
637 {
638 struct gallivm_state *gallivm = bld->gallivm;
639 LLVMBuilderRef builder = gallivm->builder;
640 LLVMValueRef shuffles[4];
641 LLVMValueRef tmp[4];
642 LLVMValueRef sumtmp[2], shuftmp[2];
643
644 /* lower half of regs */
645 shuffles[0] = lp_build_const_int32(gallivm, 0);
646 shuffles[1] = lp_build_const_int32(gallivm, 1);
647 shuffles[2] = lp_build_const_int32(gallivm, 4);
648 shuffles[3] = lp_build_const_int32(gallivm, 5);
649 tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
650 LLVMConstVector(shuffles, 4), "");
651 tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
652 LLVMConstVector(shuffles, 4), "");
653
654 /* upper half of regs */
655 shuffles[0] = lp_build_const_int32(gallivm, 2);
656 shuffles[1] = lp_build_const_int32(gallivm, 3);
657 shuffles[2] = lp_build_const_int32(gallivm, 6);
658 shuffles[3] = lp_build_const_int32(gallivm, 7);
659 tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
660 LLVMConstVector(shuffles, 4), "");
661 tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
662 LLVMConstVector(shuffles, 4), "");
663
664 sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
665 sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
666
667 shuffles[0] = lp_build_const_int32(gallivm, 0);
668 shuffles[1] = lp_build_const_int32(gallivm, 2);
669 shuffles[2] = lp_build_const_int32(gallivm, 4);
670 shuffles[3] = lp_build_const_int32(gallivm, 6);
671 shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
672 LLVMConstVector(shuffles, 4), "");
673
674 shuffles[0] = lp_build_const_int32(gallivm, 1);
675 shuffles[1] = lp_build_const_int32(gallivm, 3);
676 shuffles[2] = lp_build_const_int32(gallivm, 5);
677 shuffles[3] = lp_build_const_int32(gallivm, 7);
678 shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
679 LLVMConstVector(shuffles, 4), "");
680
681 return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
682 }
683
684
685 /*
686 * partially horizontally add 2-4 float vectors with length nx4,
687 * i.e. only four adjacent values in each vector will be added,
688 * assuming values are really grouped in 4 which also determines
689 * output order.
690 *
691 * Return a vector of the same length as the initial vectors,
692 * with the excess elements (if any) being undefined.
693 * The element order is independent of number of input vectors.
694 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
695 * the output order thus will be
696  * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
697 */
698 LLVMValueRef
699 lp_build_hadd_partial4(struct lp_build_context *bld,
700 LLVMValueRef vectors[],
701 unsigned num_vecs)
702 {
703 struct gallivm_state *gallivm = bld->gallivm;
704 LLVMBuilderRef builder = gallivm->builder;
705 LLVMValueRef ret_vec;
706 LLVMValueRef tmp[4];
707 const char *intrinsic = NULL;
708
709 assert(num_vecs >= 2 && num_vecs <= 4);
710 assert(bld->type.floating);
711
712 /* only use this with at least 2 vectors, as it is sort of expensive
713 * (depending on cpu) and we always need two horizontal adds anyway,
714 * so a shuffle/add approach might be better.
715 */
716
717 tmp[0] = vectors[0];
718 tmp[1] = vectors[1];
719
720 tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
721 tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
722
723 if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
724 bld->type.length == 4) {
725 intrinsic = "llvm.x86.sse3.hadd.ps";
726 }
727 else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
728 bld->type.length == 8) {
729 intrinsic = "llvm.x86.avx.hadd.ps.256";
730 }
731 if (intrinsic) {
732 tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
733 lp_build_vec_type(gallivm, bld->type),
734 tmp[0], tmp[1]);
735 if (num_vecs > 2) {
736 tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
737 lp_build_vec_type(gallivm, bld->type),
738 tmp[2], tmp[3]);
739 }
740 else {
741 tmp[1] = tmp[0];
742 }
743 return lp_build_intrinsic_binary(builder, intrinsic,
744 lp_build_vec_type(gallivm, bld->type),
745 tmp[0], tmp[1]);
746 }
747
748 if (bld->type.length == 4) {
749 ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
750 }
751 else {
752 LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
753 unsigned j;
754 unsigned num_iter = bld->type.length / 4;
755 struct lp_type parttype = bld->type;
756 parttype.length = 4;
757 for (j = 0; j < num_iter; j++) {
758 LLVMValueRef partsrc[4];
759 unsigned i;
760 for (i = 0; i < 4; i++) {
761 partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
762 }
763 partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
764 }
765 ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
766 }
767 return ret_vec;
768 }
769
770 /**
771 * Generate a - b
772 */
773 LLVMValueRef
774 lp_build_sub(struct lp_build_context *bld,
775 LLVMValueRef a,
776 LLVMValueRef b)
777 {
778 LLVMBuilderRef builder = bld->gallivm->builder;
779 const struct lp_type type = bld->type;
780 LLVMValueRef res;
781
782 assert(lp_check_value(type, a));
783 assert(lp_check_value(type, b));
784
785 if(b == bld->zero)
786 return a;
787 if(a == bld->undef || b == bld->undef)
788 return bld->undef;
789 if(a == b)
790 return bld->zero;
791
792 if(bld->type.norm) {
793 const char *intrinsic = NULL;
794
795 if(b == bld->one)
796 return bld->zero;
797
798 if (type.width * type.length == 128 &&
799 !type.floating && !type.fixed) {
800 if (util_cpu_caps.has_sse2) {
801 if(type.width == 8)
802 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
803 if(type.width == 16)
804 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
805 } else if (util_cpu_caps.has_altivec) {
806 if(type.width == 8)
807 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
808 if(type.width == 16)
809 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
810 }
811 }
812
813 if (intrinsic)
814 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
815 }
816
817 if(type.norm && !type.floating && !type.fixed) {
818 if (type.sign) {
819 uint64_t sign = (uint64_t)1 << (type.width - 1);
820 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
821 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
822 /* a_clamp_max is the maximum a for negative b,
823 a_clamp_min is the minimum a for positive b. */
824 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
825 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
826 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
827 } else {
828 a = lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
829 }
830 }
831
832 if(LLVMIsConstant(a) && LLVMIsConstant(b))
833 if (type.floating)
834 res = LLVMConstFSub(a, b);
835 else
836 res = LLVMConstSub(a, b);
837 else
838 if (type.floating)
839 res = LLVMBuildFSub(builder, a, b, "");
840 else
841 res = LLVMBuildSub(builder, a, b, "");
842
843 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
844 res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
845
846 return res;
847 }
848
849
850
851 /**
852 * Normalized multiplication.
853 *
854  * There are several approaches (using 8-bit normalized multiplication as
855  * an example):
856 *
857 * - alpha plus one
858 *
859 * makes the following approximation to the division (Sree)
860 *
861  *       a*b/255 ~= (a*(b + 1)) >> 8
862 *
863 * which is the fastest method that satisfies the following OpenGL criteria of
864 *
865 * 0*0 = 0 and 255*255 = 255
866 *
867 * - geometric series
868 *
869 * takes the geometric series approximation to the division
870 *
871 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
872 *
873  *     in this case just the first two terms are taken, to fit in 16-bit arithmetic
874 *
875 * t/255 ~= (t + (t >> 8)) >> 8
876 *
877  *     note that just by itself it doesn't satisfy the OpenGL criteria, as it
878  *     gives 255*255 = 254, so the special case b = 255 must be accounted for,
879  *     or rounding must be used.
880 *
881 * - geometric series plus rounding
882 *
883  *     when using the geometric series division, instead of truncating the
884  *     result use rounding in the approximation (Jim Blinn)
885  *
886  *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
887  *
888  *     which achieves exact results.
889 *
890 *
891 *
892 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
893 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
894 * @sa Michael Herf, The "double blend trick", May 2000,
895 * http://www.stereopsis.com/doubleblend.html
896 */
897 static LLVMValueRef
898 lp_build_mul_norm(struct gallivm_state *gallivm,
899 struct lp_type wide_type,
900 LLVMValueRef a, LLVMValueRef b)
901 {
902 LLVMBuilderRef builder = gallivm->builder;
903 struct lp_build_context bld;
904 unsigned n;
905 LLVMValueRef half;
906 LLVMValueRef ab;
907
908 assert(!wide_type.floating);
909 assert(lp_check_value(wide_type, a));
910 assert(lp_check_value(wide_type, b));
911
912 lp_build_context_init(&bld, gallivm, wide_type);
913
914 n = wide_type.width / 2;
915 if (wide_type.sign) {
916 --n;
917 }
918
919 /*
920 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
921 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
922 */
923
924 /*
925 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
926 */
927
928 ab = LLVMBuildMul(builder, a, b, "");
929 ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
930
931 /*
932 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
933 */
934
935 half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
936 if (wide_type.sign) {
937 LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
938 LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
939 half = lp_build_select(&bld, sign, minus_half, half);
940 }
941 ab = LLVMBuildAdd(builder, ab, half, "");
942
943 /* Final division */
944 ab = lp_build_shr_imm(&bld, ab, n);
945
946 return ab;
947 }
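/*
 * Worked example for the rounded geometric-series formula used above,
 * with n = 8 (unsigned 8-bit normalized values, purely illustrative):
 *
 *   a = b = 255:  a*b = 65025
 *                 65025 + (65025 >> 8) + 0x80 = 65025 + 254 + 128 = 65407
 *                 65407 >> 8 = 255            (exact: 65025/255 = 255)
 *
 *   a = b = 128:  a*b = 16384
 *                 16384 + (16384 >> 8) + 0x80 = 16384 + 64 + 128 = 16576
 *                 16576 >> 8 = 64             (exact: 16384/255 ~= 64.25)
 */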
948
949 /**
950 * Generate a * b
951 */
952 LLVMValueRef
953 lp_build_mul(struct lp_build_context *bld,
954 LLVMValueRef a,
955 LLVMValueRef b)
956 {
957 LLVMBuilderRef builder = bld->gallivm->builder;
958 const struct lp_type type = bld->type;
959 LLVMValueRef shift;
960 LLVMValueRef res;
961
962 assert(lp_check_value(type, a));
963 assert(lp_check_value(type, b));
964
965 if(a == bld->zero)
966 return bld->zero;
967 if(a == bld->one)
968 return b;
969 if(b == bld->zero)
970 return bld->zero;
971 if(b == bld->one)
972 return a;
973 if(a == bld->undef || b == bld->undef)
974 return bld->undef;
975
976 if (!type.floating && !type.fixed && type.norm) {
977 struct lp_type wide_type = lp_wider_type(type);
978 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
979
980 lp_build_unpack2(bld->gallivm, type, wide_type, a, &al, &ah);
981 lp_build_unpack2(bld->gallivm, type, wide_type, b, &bl, &bh);
982
983 /* PMULLW, PSRLW, PADDW */
984 abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
985 abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
986
987 ab = lp_build_pack2(bld->gallivm, wide_type, type, abl, abh);
988
989 return ab;
990 }
991
992 if(type.fixed)
993 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
994 else
995 shift = NULL;
996
997 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
998 if (type.floating)
999 res = LLVMConstFMul(a, b);
1000 else
1001 res = LLVMConstMul(a, b);
1002 if(shift) {
1003 if(type.sign)
1004 res = LLVMConstAShr(res, shift);
1005 else
1006 res = LLVMConstLShr(res, shift);
1007 }
1008 }
1009 else {
1010 if (type.floating)
1011 res = LLVMBuildFMul(builder, a, b, "");
1012 else
1013 res = LLVMBuildMul(builder, a, b, "");
1014 if(shift) {
1015 if(type.sign)
1016 res = LLVMBuildAShr(builder, res, shift, "");
1017 else
1018 res = LLVMBuildLShr(builder, res, shift, "");
1019 }
1020 }
1021
1022 return res;
1023 }
1024
1025
1026 /**
1027 * Small vector x scale multiplication optimization.
1028 */
1029 LLVMValueRef
1030 lp_build_mul_imm(struct lp_build_context *bld,
1031 LLVMValueRef a,
1032 int b)
1033 {
1034 LLVMBuilderRef builder = bld->gallivm->builder;
1035 LLVMValueRef factor;
1036
1037 assert(lp_check_value(bld->type, a));
1038
1039 if(b == 0)
1040 return bld->zero;
1041
1042 if(b == 1)
1043 return a;
1044
1045 if(b == -1)
1046 return lp_build_negate(bld, a);
1047
1048 if(b == 2 && bld->type.floating)
1049 return lp_build_add(bld, a, a);
1050
1051 if(util_is_power_of_two(b)) {
1052 unsigned shift = ffs(b) - 1;
1053
1054 if(bld->type.floating) {
1055 #if 0
1056 /*
1057 * Power of two multiplication by directly manipulating the exponent.
1058 *
1059          * XXX: This might not always be faster, it will introduce a small error
1060 * for multiplication by zero, and it will produce wrong results
1061 * for Inf and NaN.
1062 */
1063 unsigned mantissa = lp_mantissa(bld->type);
1064 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1065 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1066 a = LLVMBuildAdd(builder, a, factor, "");
1067 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1068 return a;
1069 #endif
1070 }
1071 else {
1072 factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1073 return LLVMBuildShl(builder, a, factor, "");
1074 }
1075 }
1076
1077 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1078 return lp_build_mul(bld, a, factor);
1079 }
1080
1081
1082 /**
1083 * Generate a / b
1084 */
1085 LLVMValueRef
1086 lp_build_div(struct lp_build_context *bld,
1087 LLVMValueRef a,
1088 LLVMValueRef b)
1089 {
1090 LLVMBuilderRef builder = bld->gallivm->builder;
1091 const struct lp_type type = bld->type;
1092
1093 assert(lp_check_value(type, a));
1094 assert(lp_check_value(type, b));
1095
1096 if(a == bld->zero)
1097 return bld->zero;
1098 if(a == bld->one && type.floating)
1099 return lp_build_rcp(bld, b);
1100 if(b == bld->zero)
1101 return bld->undef;
1102 if(b == bld->one)
1103 return a;
1104 if(a == bld->undef || b == bld->undef)
1105 return bld->undef;
1106
1107 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1108 if (type.floating)
1109 return LLVMConstFDiv(a, b);
1110 else if (type.sign)
1111 return LLVMConstSDiv(a, b);
1112 else
1113 return LLVMConstUDiv(a, b);
1114 }
1115
1116 if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1117 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
1118 type.floating)
1119 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1120
1121 if (type.floating)
1122 return LLVMBuildFDiv(builder, a, b, "");
1123 else if (type.sign)
1124 return LLVMBuildSDiv(builder, a, b, "");
1125 else
1126 return LLVMBuildUDiv(builder, a, b, "");
1127 }
1128
1129
1130 /**
1131 * Linear interpolation helper.
1132 *
1133  * @param flags LP_BLD_LERP_* flags; LP_BLD_LERP_WIDE_NORMALIZED means we are
1134  *        interpolating normalized values encoded in integers twice as wide.
1135 *
1136 * @sa http://www.stereopsis.com/doubleblend.html
1137 */
1138 static inline LLVMValueRef
1139 lp_build_lerp_simple(struct lp_build_context *bld,
1140 LLVMValueRef x,
1141 LLVMValueRef v0,
1142 LLVMValueRef v1,
1143 unsigned flags)
1144 {
1145 unsigned half_width = bld->type.width/2;
1146 LLVMBuilderRef builder = bld->gallivm->builder;
1147 LLVMValueRef delta;
1148 LLVMValueRef res;
1149
1150 assert(lp_check_value(bld->type, x));
1151 assert(lp_check_value(bld->type, v0));
1152 assert(lp_check_value(bld->type, v1));
1153
1154 delta = lp_build_sub(bld, v1, v0);
1155
1156 if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1157 if (!bld->type.sign) {
1158 if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1159 /*
1160 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1161           * most significant bit to the least significant bit, so that
1162 * later we can just divide by 2**n instead of 2**n - 1.
1163 */
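          /* E.g. with n = 8 (illustrative): x = 255 becomes
           * 255 + (255 >> 7) = 256, so (x * delta) >> 8 returns delta exactly,
           * while x = 0 stays 0.
           */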
1164
1165 x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1166 }
1167
1168 /* (x * delta) >> n */
1169 res = lp_build_mul(bld, x, delta);
1170 res = lp_build_shr_imm(bld, res, half_width);
1171 } else {
1172 /*
1173 * The rescaling trick above doesn't work for signed numbers, so
1174           * use the 2**n - 1 division approximation in lp_build_mul_norm
1175 * instead.
1176 */
1177 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1178 res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1179 }
1180 } else {
1181 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1182 res = lp_build_mul(bld, x, delta);
1183 }
1184
1185 res = lp_build_add(bld, v0, res);
1186
1187 if (((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) ||
1188 bld->type.fixed) {
1189 /* We need to mask out the high order bits when lerping 8bit normalized colors stored on 16bits */
1190 /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
1191 * but it will be wrong for true fixed point use cases. Basically we need
1192 * a more powerful lp_type, capable of further distinguishing the values
1193 * interpretation from the value storage. */
1194 res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1), "");
1195 }
1196
1197 return res;
1198 }
1199
1200
1201 /**
1202 * Linear interpolation.
1203 */
1204 LLVMValueRef
1205 lp_build_lerp(struct lp_build_context *bld,
1206 LLVMValueRef x,
1207 LLVMValueRef v0,
1208 LLVMValueRef v1,
1209 unsigned flags)
1210 {
1211 const struct lp_type type = bld->type;
1212 LLVMValueRef res;
1213
1214 assert(lp_check_value(type, x));
1215 assert(lp_check_value(type, v0));
1216 assert(lp_check_value(type, v1));
1217
1218 assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1219
1220 if (type.norm) {
1221 struct lp_type wide_type;
1222 struct lp_build_context wide_bld;
1223 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1224
1225 assert(type.length >= 2);
1226
1227 /*
1228 * Create a wider integer type, enough to hold the
1229 * intermediate result of the multiplication.
1230 */
1231 memset(&wide_type, 0, sizeof wide_type);
1232 wide_type.sign = type.sign;
1233 wide_type.width = type.width*2;
1234 wide_type.length = type.length/2;
1235
1236 lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1237
1238 lp_build_unpack2(bld->gallivm, type, wide_type, x, &xl, &xh);
1239 lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1240 lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1241
1242 /*
1243 * Lerp both halves.
1244 */
1245
1246 flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1247
1248 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1249 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1250
1251 res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
1252 } else {
1253 res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1254 }
1255
1256 return res;
1257 }
1258
1259
1260 /**
1261 * Bilinear interpolation.
1262 *
1263 * Values indices are in v_{yx}.
1264 */
1265 LLVMValueRef
1266 lp_build_lerp_2d(struct lp_build_context *bld,
1267 LLVMValueRef x,
1268 LLVMValueRef y,
1269 LLVMValueRef v00,
1270 LLVMValueRef v01,
1271 LLVMValueRef v10,
1272 LLVMValueRef v11,
1273 unsigned flags)
1274 {
1275 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1276 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1277 return lp_build_lerp(bld, y, v0, v1, flags);
1278 }
1279
1280
1281 LLVMValueRef
1282 lp_build_lerp_3d(struct lp_build_context *bld,
1283 LLVMValueRef x,
1284 LLVMValueRef y,
1285 LLVMValueRef z,
1286 LLVMValueRef v000,
1287 LLVMValueRef v001,
1288 LLVMValueRef v010,
1289 LLVMValueRef v011,
1290 LLVMValueRef v100,
1291 LLVMValueRef v101,
1292 LLVMValueRef v110,
1293 LLVMValueRef v111,
1294 unsigned flags)
1295 {
1296 LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1297 LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1298 return lp_build_lerp(bld, z, v0, v1, flags);
1299 }
1300
1301
1302 /**
1303 * Generate min(a, b)
1304 * Do checks for special cases but not for nans.
1305 */
1306 LLVMValueRef
1307 lp_build_min(struct lp_build_context *bld,
1308 LLVMValueRef a,
1309 LLVMValueRef b)
1310 {
1311 assert(lp_check_value(bld->type, a));
1312 assert(lp_check_value(bld->type, b));
1313
1314 if(a == bld->undef || b == bld->undef)
1315 return bld->undef;
1316
1317 if(a == b)
1318 return a;
1319
1320 if (bld->type.norm) {
1321 if (!bld->type.sign) {
1322 if (a == bld->zero || b == bld->zero) {
1323 return bld->zero;
1324 }
1325 }
1326 if(a == bld->one)
1327 return b;
1328 if(b == bld->one)
1329 return a;
1330 }
1331
1332 return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1333 }
1334
1335
1336 /**
1337 * Generate min(a, b)
1338  * NaNs are handled according to the behavior specified by the
1339 * nan_behavior argument.
1340 */
1341 LLVMValueRef
1342 lp_build_min_ext(struct lp_build_context *bld,
1343 LLVMValueRef a,
1344 LLVMValueRef b,
1345 enum gallivm_nan_behavior nan_behavior)
1346 {
1347 assert(lp_check_value(bld->type, a));
1348 assert(lp_check_value(bld->type, b));
1349
1350 if(a == bld->undef || b == bld->undef)
1351 return bld->undef;
1352
1353 if(a == b)
1354 return a;
1355
1356 if (bld->type.norm) {
1357 if (!bld->type.sign) {
1358 if (a == bld->zero || b == bld->zero) {
1359 return bld->zero;
1360 }
1361 }
1362 if(a == bld->one)
1363 return b;
1364 if(b == bld->one)
1365 return a;
1366 }
1367
1368 return lp_build_min_simple(bld, a, b, nan_behavior);
1369 }
1370
1371 /**
1372 * Generate max(a, b)
1373 * Do checks for special cases, but NaN behavior is undefined.
1374 */
1375 LLVMValueRef
1376 lp_build_max(struct lp_build_context *bld,
1377 LLVMValueRef a,
1378 LLVMValueRef b)
1379 {
1380 assert(lp_check_value(bld->type, a));
1381 assert(lp_check_value(bld->type, b));
1382
1383 if(a == bld->undef || b == bld->undef)
1384 return bld->undef;
1385
1386 if(a == b)
1387 return a;
1388
1389 if(bld->type.norm) {
1390 if(a == bld->one || b == bld->one)
1391 return bld->one;
1392 if (!bld->type.sign) {
1393 if (a == bld->zero) {
1394 return b;
1395 }
1396 if (b == bld->zero) {
1397 return a;
1398 }
1399 }
1400 }
1401
1402 return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1403 }
1404
1405
1406 /**
1407 * Generate max(a, b)
1408 * Checks for special cases.
1409  * NaNs are handled according to the behavior specified by the
1410 * nan_behavior argument.
1411 */
1412 LLVMValueRef
1413 lp_build_max_ext(struct lp_build_context *bld,
1414 LLVMValueRef a,
1415 LLVMValueRef b,
1416 enum gallivm_nan_behavior nan_behavior)
1417 {
1418 assert(lp_check_value(bld->type, a));
1419 assert(lp_check_value(bld->type, b));
1420
1421 if(a == bld->undef || b == bld->undef)
1422 return bld->undef;
1423
1424 if(a == b)
1425 return a;
1426
1427 if(bld->type.norm) {
1428 if(a == bld->one || b == bld->one)
1429 return bld->one;
1430 if (!bld->type.sign) {
1431 if (a == bld->zero) {
1432 return b;
1433 }
1434 if (b == bld->zero) {
1435 return a;
1436 }
1437 }
1438 }
1439
1440 return lp_build_max_simple(bld, a, b, nan_behavior);
1441 }
1442
1443 /**
1444 * Generate clamp(a, min, max)
1445 * NaN behavior (for any of a, min, max) is undefined.
1446 * Do checks for special cases.
1447 */
1448 LLVMValueRef
1449 lp_build_clamp(struct lp_build_context *bld,
1450 LLVMValueRef a,
1451 LLVMValueRef min,
1452 LLVMValueRef max)
1453 {
1454 assert(lp_check_value(bld->type, a));
1455 assert(lp_check_value(bld->type, min));
1456 assert(lp_check_value(bld->type, max));
1457
1458 a = lp_build_min(bld, a, max);
1459 a = lp_build_max(bld, a, min);
1460 return a;
1461 }
1462
1463
1464 /**
1465 * Generate clamp(a, 0, 1)
1466 * A NaN will get converted to zero.
1467 */
1468 LLVMValueRef
1469 lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
1470 LLVMValueRef a)
1471 {
1472 a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1473 a = lp_build_min(bld, a, bld->one);
1474 return a;
1475 }
1476
1477
1478 /**
1479 * Generate abs(a)
1480 */
1481 LLVMValueRef
1482 lp_build_abs(struct lp_build_context *bld,
1483 LLVMValueRef a)
1484 {
1485 LLVMBuilderRef builder = bld->gallivm->builder;
1486 const struct lp_type type = bld->type;
1487 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1488
1489 assert(lp_check_value(type, a));
1490
1491 if(!type.sign)
1492 return a;
1493
1494 if(type.floating) {
1495 if (0x0306 <= HAVE_LLVM && HAVE_LLVM < 0x0309) {
1496 /* Workaround llvm.org/PR27332 */
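         /* For 32-bit floats (assumed here for illustration) this mask is
          * 0x7fffffff; clearing the sign bit in the integer view of the value
          * yields |a| without any floating point operation.
          */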
1497 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1498 unsigned long long absMask = ~(1ULL << (type.width - 1));
1499 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
1500 a = LLVMBuildBitCast(builder, a, int_vec_type, "");
1501 a = LLVMBuildAnd(builder, a, mask, "");
1502 a = LLVMBuildBitCast(builder, a, vec_type, "");
1503 return a;
1504 } else {
1505 char intrinsic[32];
1506 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
1507 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
1508 }
1509 }
1510
1511 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
1512 switch(type.width) {
1513 case 8:
1514 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1515 case 16:
1516 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1517 case 32:
1518 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1519 }
1520 }
1521 else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
1522 (gallivm_debug & GALLIVM_DEBUG_PERF) &&
1523 (type.width == 8 || type.width == 16 || type.width == 32)) {
1524 debug_printf("%s: inefficient code, should split vectors manually\n",
1525 __FUNCTION__);
1526 }
1527
1528 return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
1529 }
1530
1531
1532 LLVMValueRef
1533 lp_build_negate(struct lp_build_context *bld,
1534 LLVMValueRef a)
1535 {
1536 LLVMBuilderRef builder = bld->gallivm->builder;
1537
1538 assert(lp_check_value(bld->type, a));
1539
1540 if (bld->type.floating)
1541 a = LLVMBuildFNeg(builder, a, "");
1542 else
1543 a = LLVMBuildNeg(builder, a, "");
1544
1545 return a;
1546 }
1547
1548
1549 /** Return -1, 0 or +1 depending on the sign of a */
1550 LLVMValueRef
1551 lp_build_sgn(struct lp_build_context *bld,
1552 LLVMValueRef a)
1553 {
1554 LLVMBuilderRef builder = bld->gallivm->builder;
1555 const struct lp_type type = bld->type;
1556 LLVMValueRef cond;
1557 LLVMValueRef res;
1558
1559 assert(lp_check_value(type, a));
1560
1561 /* Handle non-zero case */
1562 if(!type.sign) {
1563 /* if not zero then sign must be positive */
1564 res = bld->one;
1565 }
1566 else if(type.floating) {
1567 LLVMTypeRef vec_type;
1568 LLVMTypeRef int_type;
1569 LLVMValueRef mask;
1570 LLVMValueRef sign;
1571 LLVMValueRef one;
1572 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1573
1574 int_type = lp_build_int_vec_type(bld->gallivm, type);
1575 vec_type = lp_build_vec_type(bld->gallivm, type);
1576 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1577
1578       /* Take the sign bit and OR it into the 1.0 constant */
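      /* E.g. (32-bit float, illustrative): a = -3.5 has sign bit 0x80000000;
       * OR-ing it into 1.0f (0x3f800000) gives 0xbf800000 = -1.0f, while a
       * positive a leaves the constant at +1.0f.
       */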
1579 sign = LLVMBuildBitCast(builder, a, int_type, "");
1580 sign = LLVMBuildAnd(builder, sign, mask, "");
1581 one = LLVMConstBitCast(bld->one, int_type);
1582 res = LLVMBuildOr(builder, sign, one, "");
1583 res = LLVMBuildBitCast(builder, res, vec_type, "");
1584 }
1585 else
1586 {
1587 /* signed int/norm/fixed point */
1588 /* could use psign with sse3 and appropriate vectors here */
1589 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1590 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1591 res = lp_build_select(bld, cond, bld->one, minus_one);
1592 }
1593
1594 /* Handle zero */
1595 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1596 res = lp_build_select(bld, cond, bld->zero, res);
1597
1598 return res;
1599 }
1600
1601
1602 /**
1603 * Set the sign of float vector 'a' according to 'sign'.
1604 * If sign==0, return abs(a).
1605  * If sign==1, return -abs(a).
1606 * Other values for sign produce undefined results.
1607 */
1608 LLVMValueRef
1609 lp_build_set_sign(struct lp_build_context *bld,
1610 LLVMValueRef a, LLVMValueRef sign)
1611 {
1612 LLVMBuilderRef builder = bld->gallivm->builder;
1613 const struct lp_type type = bld->type;
1614 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1615 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1616 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1617 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1618 ~((unsigned long long) 1 << (type.width - 1)));
1619 LLVMValueRef val, res;
1620
1621 assert(type.floating);
1622 assert(lp_check_value(type, a));
1623
1624 /* val = reinterpret_cast<int>(a) */
1625 val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1626 /* val = val & mask */
1627 val = LLVMBuildAnd(builder, val, mask, "");
1628 /* sign = sign << shift */
1629 sign = LLVMBuildShl(builder, sign, shift, "");
1630 /* res = val | sign */
1631 res = LLVMBuildOr(builder, val, sign, "");
1632 /* res = reinterpret_cast<float>(res) */
1633 res = LLVMBuildBitCast(builder, res, vec_type, "");
1634
1635 return res;
1636 }
1637
1638
1639 /**
1640 * Convert vector of (or scalar) int to vector of (or scalar) float.
1641 */
1642 LLVMValueRef
1643 lp_build_int_to_float(struct lp_build_context *bld,
1644 LLVMValueRef a)
1645 {
1646 LLVMBuilderRef builder = bld->gallivm->builder;
1647 const struct lp_type type = bld->type;
1648 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1649
1650 assert(type.floating);
1651
1652 return LLVMBuildSIToFP(builder, a, vec_type, "");
1653 }
1654
1655 static boolean
1656 arch_rounding_available(const struct lp_type type)
1657 {
1658 if ((util_cpu_caps.has_sse4_1 &&
1659 (type.length == 1 || type.width*type.length == 128)) ||
1660 (util_cpu_caps.has_avx && type.width*type.length == 256))
1661 return TRUE;
1662 else if ((util_cpu_caps.has_altivec &&
1663 (type.width == 32 && type.length == 4)))
1664 return TRUE;
1665
1666 return FALSE;
1667 }
1668
1669 enum lp_build_round_mode
1670 {
1671 LP_BUILD_ROUND_NEAREST = 0,
1672 LP_BUILD_ROUND_FLOOR = 1,
1673 LP_BUILD_ROUND_CEIL = 2,
1674 LP_BUILD_ROUND_TRUNCATE = 3
1675 };
1676
1677 static inline LLVMValueRef
1678 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1679 LLVMValueRef a)
1680 {
1681 LLVMBuilderRef builder = bld->gallivm->builder;
1682 const struct lp_type type = bld->type;
1683 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1684 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1685 const char *intrinsic;
1686 LLVMValueRef res;
1687
1688 assert(type.floating);
1689 /* using the double precision conversions is a bit more complicated */
1690 assert(type.width == 32);
1691
1692 assert(lp_check_value(type, a));
1693 assert(util_cpu_caps.has_sse2);
1694
1695 /* This is relying on MXCSR rounding mode, which should always be nearest. */
1696 if (type.length == 1) {
1697 LLVMTypeRef vec_type;
1698 LLVMValueRef undef;
1699 LLVMValueRef arg;
1700 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1701
1702 vec_type = LLVMVectorType(bld->elem_type, 4);
1703
1704 intrinsic = "llvm.x86.sse.cvtss2si";
1705
1706 undef = LLVMGetUndef(vec_type);
1707
1708 arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
1709
1710 res = lp_build_intrinsic_unary(builder, intrinsic,
1711 ret_type, arg);
1712 }
1713 else {
1714 if (type.width* type.length == 128) {
1715 intrinsic = "llvm.x86.sse2.cvtps2dq";
1716 }
1717 else {
1718 assert(type.width*type.length == 256);
1719 assert(util_cpu_caps.has_avx);
1720
1721 intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
1722 }
1723 res = lp_build_intrinsic_unary(builder, intrinsic,
1724 ret_type, a);
1725 }
1726
1727 return res;
1728 }
1729
1730
1731 /* Round to an integral value (kept as float), using the AltiVec vrfi*
1732  * intrinsics according to the given rounding mode. */
1733 static inline LLVMValueRef
1734 lp_build_round_altivec(struct lp_build_context *bld,
1735 LLVMValueRef a,
1736 enum lp_build_round_mode mode)
1737 {
1738 LLVMBuilderRef builder = bld->gallivm->builder;
1739 const struct lp_type type = bld->type;
1740 const char *intrinsic = NULL;
1741
1742 assert(type.floating);
1743
1744 assert(lp_check_value(type, a));
1745 assert(util_cpu_caps.has_altivec);
1746
1747 (void)type;
1748
1749 switch (mode) {
1750 case LP_BUILD_ROUND_NEAREST:
1751 intrinsic = "llvm.ppc.altivec.vrfin";
1752 break;
1753 case LP_BUILD_ROUND_FLOOR:
1754 intrinsic = "llvm.ppc.altivec.vrfim";
1755 break;
1756 case LP_BUILD_ROUND_CEIL:
1757 intrinsic = "llvm.ppc.altivec.vrfip";
1758 break;
1759 case LP_BUILD_ROUND_TRUNCATE:
1760 intrinsic = "llvm.ppc.altivec.vrfiz";
1761 break;
1762 }
1763
1764 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1765 }
1766
1767 static inline LLVMValueRef
1768 lp_build_round_arch(struct lp_build_context *bld,
1769 LLVMValueRef a,
1770 enum lp_build_round_mode mode)
1771 {
1772 if (util_cpu_caps.has_sse4_1) {
1773 LLVMBuilderRef builder = bld->gallivm->builder;
1774 const struct lp_type type = bld->type;
1775 const char *intrinsic_root;
1776 char intrinsic[32];
1777
1778 assert(type.floating);
1779 assert(lp_check_value(type, a));
1780 (void)type;
1781
1782 switch (mode) {
1783 case LP_BUILD_ROUND_NEAREST:
1784 intrinsic_root = "llvm.nearbyint";
1785 break;
1786 case LP_BUILD_ROUND_FLOOR:
1787 intrinsic_root = "llvm.floor";
1788 break;
1789 case LP_BUILD_ROUND_CEIL:
1790 intrinsic_root = "llvm.ceil";
1791 break;
1792 case LP_BUILD_ROUND_TRUNCATE:
1793 intrinsic_root = "llvm.trunc";
1794 break;
1795 }
1796
1797 lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type);
1798 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1799 }
1800 else /* (util_cpu_caps.has_altivec) */
1801 return lp_build_round_altivec(bld, a, mode);
1802 }
1803
1804 /**
1805 * Return the integer part of a float (vector) value (== round toward zero).
1806 * The returned value is a float (vector).
1807 * Ex: trunc(-1.5) = -1.0
1808 */
1809 LLVMValueRef
1810 lp_build_trunc(struct lp_build_context *bld,
1811 LLVMValueRef a)
1812 {
1813 LLVMBuilderRef builder = bld->gallivm->builder;
1814 const struct lp_type type = bld->type;
1815
1816 assert(type.floating);
1817 assert(lp_check_value(type, a));
1818
1819 if (arch_rounding_available(type)) {
1820 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
1821 }
1822 else {
1823 const struct lp_type type = bld->type;
1824 struct lp_type inttype;
1825 struct lp_build_context intbld;
1826 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
1827 LLVMValueRef trunc, res, anosign, mask;
1828 LLVMTypeRef int_vec_type = bld->int_vec_type;
1829 LLVMTypeRef vec_type = bld->vec_type;
1830
1831 assert(type.width == 32); /* might want to handle doubles at some point */
1832
1833 inttype = type;
1834 inttype.floating = 0;
1835 lp_build_context_init(&intbld, bld->gallivm, inttype);
1836
1837 /* round by truncation */
1838 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1839 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
1840
1841 /* mask out sign bit */
1842 anosign = lp_build_abs(bld, a);
1843 /*
1844 * mask out all values if anosign > 2^24
1845 * This should work both for large ints (all rounding is no-op for them
1846 * because such floats are always exact) as well as special cases like
1847 * NaNs, Infs (taking advantage of the fact they use max exponent).
1848       * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
1849 */
1850 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1851 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1852 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1853 return lp_build_select(bld, mask, a, res);
1854 }
1855 }
1856
1857
1858 /**
1859 * Return float (vector) rounded to nearest integer (vector). The returned
1860 * value is a float (vector).
1861 * Ex: round(0.9) = 1.0
1862 * Ex: round(-1.5) = -2.0
1863 */
1864 LLVMValueRef
1865 lp_build_round(struct lp_build_context *bld,
1866 LLVMValueRef a)
1867 {
1868 LLVMBuilderRef builder = bld->gallivm->builder;
1869 const struct lp_type type = bld->type;
1870
1871 assert(type.floating);
1872 assert(lp_check_value(type, a));
1873
1874 if (arch_rounding_available(type)) {
1875 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
1876 }
1877 else {
1878 const struct lp_type type = bld->type;
1879 struct lp_type inttype;
1880 struct lp_build_context intbld;
1881 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
1882 LLVMValueRef res, anosign, mask;
1883 LLVMTypeRef int_vec_type = bld->int_vec_type;
1884 LLVMTypeRef vec_type = bld->vec_type;
1885
1886 assert(type.width == 32); /* might want to handle doubles at some point */
1887
1888 inttype = type;
1889 inttype.floating = 0;
1890 lp_build_context_init(&intbld, bld->gallivm, inttype);
1891
1892 res = lp_build_iround(bld, a);
1893 res = LLVMBuildSIToFP(builder, res, vec_type, "");
1894
1895 /* mask out sign bit */
1896 anosign = lp_build_abs(bld, a);
1897 /*
1898 * mask out all values if anosign > 2^24
1899        * This should work both for large ints (rounding is a no-op for them
1900        * because such floats are always exact) as well as special cases like
1901        * NaNs, Infs (taking advantage of the fact they use max exponent).
1902        * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
1903 */
1904 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1905 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1906 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1907 return lp_build_select(bld, mask, a, res);
1908 }
1909 }
1910
1911
1912 /**
1913 * Return floor of float (vector), result is a float (vector)
1914 * Ex: floor(1.1) = 1.0
1915 * Ex: floor(-1.1) = -2.0
1916 */
1917 LLVMValueRef
1918 lp_build_floor(struct lp_build_context *bld,
1919 LLVMValueRef a)
1920 {
1921 LLVMBuilderRef builder = bld->gallivm->builder;
1922 const struct lp_type type = bld->type;
1923
1924 assert(type.floating);
1925 assert(lp_check_value(type, a));
1926
1927 if (arch_rounding_available(type)) {
1928 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
1929 }
1930 else {
1931 const struct lp_type type = bld->type;
1932 struct lp_type inttype;
1933 struct lp_build_context intbld;
1934 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
1935 LLVMValueRef trunc, res, anosign, mask;
1936 LLVMTypeRef int_vec_type = bld->int_vec_type;
1937 LLVMTypeRef vec_type = bld->vec_type;
1938
1939 if (type.width != 32) {
1940 char intrinsic[32];
1941 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type);
1942 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
1943 }
1944
1945 assert(type.width == 32); /* might want to handle doubles at some point */
1946
1947 inttype = type;
1948 inttype.floating = 0;
1949 lp_build_context_init(&intbld, bld->gallivm, inttype);
1950
1951 /* round by truncation */
1952 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1953 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
1954
1955 if (type.sign) {
1956 LLVMValueRef tmp;
1957
1958 /*
1959 * fix values if rounding is wrong (for non-special cases)
1960 * - this is the case if trunc > a
1961 */
1962 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
1963 /* tmp = trunc > a ? 1.0 : 0.0 */
1964 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
1965 tmp = lp_build_and(&intbld, mask, tmp);
1966 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
1967 res = lp_build_sub(bld, res, tmp);
1968 }
1969
1970 /* mask out sign bit */
1971 anosign = lp_build_abs(bld, a);
1972 /*
1973 * mask out all values if anosign > 2^24
1974        * This should work both for large ints (rounding is a no-op for them
1975        * because such floats are always exact) as well as special cases like
1976        * NaNs, Infs (taking advantage of the fact they use max exponent).
1977        * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
1978 */
1979 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1980 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1981 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1982 return lp_build_select(bld, mask, a, res);
1983 }
1984 }
1985
1986
1987 /**
1988 * Return ceiling of float (vector), returning float (vector).
1989 * Ex: ceil( 1.1) = 2.0
1990 * Ex: ceil(-1.1) = -1.0
1991 */
1992 LLVMValueRef
1993 lp_build_ceil(struct lp_build_context *bld,
1994 LLVMValueRef a)
1995 {
1996 LLVMBuilderRef builder = bld->gallivm->builder;
1997 const struct lp_type type = bld->type;
1998
1999 assert(type.floating);
2000 assert(lp_check_value(type, a));
2001
2002 if (arch_rounding_available(type)) {
2003 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2004 }
2005 else {
2006 const struct lp_type type = bld->type;
2007 struct lp_type inttype;
2008 struct lp_build_context intbld;
2009 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2010 LLVMValueRef trunc, res, anosign, mask, tmp;
2011 LLVMTypeRef int_vec_type = bld->int_vec_type;
2012 LLVMTypeRef vec_type = bld->vec_type;
2013
2014 if (type.width != 32) {
2015 char intrinsic[32];
2016 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type);
2017 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2018 }
2019
2020 assert(type.width == 32); /* might want to handle doubles at some point */
2021
2022 inttype = type;
2023 inttype.floating = 0;
2024 lp_build_context_init(&intbld, bld->gallivm, inttype);
2025
2026 /* round by truncation */
2027 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2028 trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2029
2030 /*
2031 * fix values if rounding is wrong (for non-special cases)
2032 * - this is the case if trunc < a
2033 */
2034 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2035 /* tmp = trunc < a ? 1.0 : 0.0 */
2036 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2037 tmp = lp_build_and(&intbld, mask, tmp);
2038 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2039 res = lp_build_add(bld, trunc, tmp);
2040
2041 /* mask out sign bit */
2042 anosign = lp_build_abs(bld, a);
2043 /*
2044 * mask out all values if anosign > 2^24
2045        * This should work both for large ints (rounding is a no-op for them
2046        * because such floats are always exact) as well as special cases like
2047        * NaNs, Infs (taking advantage of the fact they use max exponent).
2048        * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2049 */
2050 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2051 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2052 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2053 return lp_build_select(bld, mask, a, res);
2054 }
2055 }
2056
2057
2058 /**
2059 * Return fractional part of 'a' computed as a - floor(a)
2060 * Typically used in texture coord arithmetic.
2061 */
2062 LLVMValueRef
2063 lp_build_fract(struct lp_build_context *bld,
2064 LLVMValueRef a)
2065 {
2066 assert(bld->type.floating);
2067 return lp_build_sub(bld, a, lp_build_floor(bld, a));
2068 }
2069
2070
2071 /**
2072 * Prevent returning a fractional part of 1.0 for very small negative values of
2073 * 'a' by clamping against 0.99999(9).
2074 */
2075 static inline LLVMValueRef
2076 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2077 {
2078 LLVMValueRef max;
2079
2080 /* this is the largest number smaller than 1.0 representable as float */
2081 max = lp_build_const_vec(bld->gallivm, bld->type,
2082 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2083 return lp_build_min(bld, fract, max);
2084 }
2085
2086
2087 /**
2088 * Same as lp_build_fract, but guarantees that the result is always smaller
2089 * than one.
2090 */
2091 LLVMValueRef
2092 lp_build_fract_safe(struct lp_build_context *bld,
2093 LLVMValueRef a)
2094 {
2095 return clamp_fract(bld, lp_build_fract(bld, a));
2096 }
2097
2098
2099 /**
2100 * Return the integer part of a float (vector) value (== round toward zero).
2101 * The returned value is an integer (vector).
2102 * Ex: itrunc(-1.5) = -1
2103 */
2104 LLVMValueRef
2105 lp_build_itrunc(struct lp_build_context *bld,
2106 LLVMValueRef a)
2107 {
2108 LLVMBuilderRef builder = bld->gallivm->builder;
2109 const struct lp_type type = bld->type;
2110 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2111
2112 assert(type.floating);
2113 assert(lp_check_value(type, a));
2114
2115 return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2116 }
2117
2118
2119 /**
2120 * Return float (vector) rounded to nearest integer (vector). The returned
2121 * value is an integer (vector).
2122 * Ex: iround(0.9) = 1
2123 * Ex: iround(-1.5) = -2
2124 */
2125 LLVMValueRef
2126 lp_build_iround(struct lp_build_context *bld,
2127 LLVMValueRef a)
2128 {
2129 LLVMBuilderRef builder = bld->gallivm->builder;
2130 const struct lp_type type = bld->type;
2131 LLVMTypeRef int_vec_type = bld->int_vec_type;
2132 LLVMValueRef res;
2133
2134 assert(type.floating);
2135
2136 assert(lp_check_value(type, a));
2137
2138 if ((util_cpu_caps.has_sse2 &&
2139 ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2140 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2141 return lp_build_iround_nearest_sse2(bld, a);
2142 }
2143 if (arch_rounding_available(type)) {
2144 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2145 }
2146 else {
2147 LLVMValueRef half;
2148
2149 half = lp_build_const_vec(bld->gallivm, type, 0.5);
2150
2151 if (type.sign) {
2152 LLVMTypeRef vec_type = bld->vec_type;
2153 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2154 (unsigned long long)1 << (type.width - 1));
2155 LLVMValueRef sign;
2156
2157 /* get sign bit */
2158 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2159 sign = LLVMBuildAnd(builder, sign, mask, "");
2160
2161 /* sign * 0.5 */
2162 half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2163 half = LLVMBuildOr(builder, sign, half, "");
2164 half = LLVMBuildBitCast(builder, half, vec_type, "");
2165 }
2166
2167 res = LLVMBuildFAdd(builder, a, half, "");
2168 }
2169
2170 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2171
2172 return res;
2173 }
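
/*
 * Rough scalar equivalent of the generic path above (sketch only): bias by
 * half a unit in the direction of the sign, then truncate.
 *
 *    int iround_sketch(float a)
 *    {
 *       float half = copysignf(0.5f, a);   // 0.5 carrying the sign of a
 *       return (int)(a + half);            // truncation finishes the rounding
 *    }
 *
 * The vector code builds "half" by OR-ing the sign bit of a into 0.5.
 */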
2174
2175
2176 /**
2177 * Return floor of float (vector), result is an int (vector)
2178  * Ex: ifloor(1.1) = 1
2179  * Ex: ifloor(-1.1) = -2
2180 */
2181 LLVMValueRef
2182 lp_build_ifloor(struct lp_build_context *bld,
2183 LLVMValueRef a)
2184 {
2185 LLVMBuilderRef builder = bld->gallivm->builder;
2186 const struct lp_type type = bld->type;
2187 LLVMTypeRef int_vec_type = bld->int_vec_type;
2188 LLVMValueRef res;
2189
2190 assert(type.floating);
2191 assert(lp_check_value(type, a));
2192
2193 res = a;
2194 if (type.sign) {
2195 if (arch_rounding_available(type)) {
2196 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2197 }
2198 else {
2199 struct lp_type inttype;
2200 struct lp_build_context intbld;
2201 LLVMValueRef trunc, itrunc, mask;
2202
2203 assert(type.floating);
2204 assert(lp_check_value(type, a));
2205
2206 inttype = type;
2207 inttype.floating = 0;
2208 lp_build_context_init(&intbld, bld->gallivm, inttype);
2209
2210 /* round by truncation */
2211 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2212 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2213
2214 /*
2215 * fix values if rounding is wrong (for non-special cases)
2216 * - this is the case if trunc > a
2217 * The results of doing this with NaNs, very large values etc.
2218 * are undefined but this seems to be the case anyway.
2219 */
2220 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2221 /* cheapie minus one with mask since the mask is minus one / zero */
2222 return lp_build_add(&intbld, itrunc, mask);
2223 }
2224 }
2225
2226    /* convert to integer, rounding toward zero */
2227 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2228
2229 return res;
2230 }
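
/*
 * The "cheapie minus one" above relies on lp_build_cmp() returning all ones
 * (i.e. -1) for true lanes and 0 for false lanes, so adding the mask
 * subtracts one exactly where truncation rounded the wrong way.  Scalar
 * sketch (hypothetical helper, not part of this file):
 *
 *    int ifloor_sketch(float a)
 *    {
 *       int itrunc = (int)a;
 *       int mask = ((float)itrunc > a) ? -1 : 0;
 *       return itrunc + mask;     // itrunc - 1 where truncation went up
 *    }
 */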
2231
2232
2233 /**
2234 * Return ceiling of float (vector), returning int (vector).
2235 * Ex: iceil( 1.1) = 2
2236 * Ex: iceil(-1.1) = -1
2237 */
2238 LLVMValueRef
2239 lp_build_iceil(struct lp_build_context *bld,
2240 LLVMValueRef a)
2241 {
2242 LLVMBuilderRef builder = bld->gallivm->builder;
2243 const struct lp_type type = bld->type;
2244 LLVMTypeRef int_vec_type = bld->int_vec_type;
2245 LLVMValueRef res;
2246
2247 assert(type.floating);
2248 assert(lp_check_value(type, a));
2249
2250 if (arch_rounding_available(type)) {
2251 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2252 }
2253 else {
2254 struct lp_type inttype;
2255 struct lp_build_context intbld;
2256 LLVMValueRef trunc, itrunc, mask;
2257
2258 assert(type.floating);
2259 assert(lp_check_value(type, a));
2260
2261 inttype = type;
2262 inttype.floating = 0;
2263 lp_build_context_init(&intbld, bld->gallivm, inttype);
2264
2265 /* round by truncation */
2266 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2267 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2268
2269 /*
2270 * fix values if rounding is wrong (for non-special cases)
2271 * - this is the case if trunc < a
2272 * The results of doing this with NaNs, very large values etc.
2273 * are undefined but this seems to be the case anyway.
2274 */
2275 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2276 /* cheapie plus one with mask since the mask is minus one / zero */
2277 return lp_build_sub(&intbld, itrunc, mask);
2278 }
2279
2280    /* convert to integer, rounding toward zero */
2281 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2282
2283 return res;
2284 }
2285
2286
2287 /**
2288 * Combined ifloor() & fract().
2289 *
2290  * Preferred over calling the functions separately, as it will pick whichever
2291  * strategy (floor() vs. ifloor()) results in less redundant work.
2292 */
2293 void
2294 lp_build_ifloor_fract(struct lp_build_context *bld,
2295 LLVMValueRef a,
2296 LLVMValueRef *out_ipart,
2297 LLVMValueRef *out_fpart)
2298 {
2299 LLVMBuilderRef builder = bld->gallivm->builder;
2300 const struct lp_type type = bld->type;
2301 LLVMValueRef ipart;
2302
2303 assert(type.floating);
2304 assert(lp_check_value(type, a));
2305
2306 if (arch_rounding_available(type)) {
2307 /*
2308 * floor() is easier.
2309 */
2310
2311 ipart = lp_build_floor(bld, a);
2312 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2313 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2314 }
2315 else {
2316 /*
2317 * ifloor() is easier.
2318 */
2319
2320 *out_ipart = lp_build_ifloor(bld, a);
2321 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2322 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2323 }
2324 }
2325
2326
2327 /**
2328 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2329 * always smaller than one.
2330 */
2331 void
2332 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2333 LLVMValueRef a,
2334 LLVMValueRef *out_ipart,
2335 LLVMValueRef *out_fpart)
2336 {
2337 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2338 *out_fpart = clamp_fract(bld, *out_fpart);
2339 }
2340
2341
2342 LLVMValueRef
2343 lp_build_sqrt(struct lp_build_context *bld,
2344 LLVMValueRef a)
2345 {
2346 LLVMBuilderRef builder = bld->gallivm->builder;
2347 const struct lp_type type = bld->type;
2348 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2349 char intrinsic[32];
2350
2351 assert(lp_check_value(type, a));
2352
2353 assert(type.floating);
2354 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);
2355
2356 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2357 }
2358
2359
2360 /**
2361  * Do one Newton-Raphson step to improve reciprocal precision:
2362 *
2363 * x_{i+1} = x_i * (2 - a * x_i)
2364 *
2365 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2366 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
2367  * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
2368 * halo. It would be necessary to clamp the argument to prevent this.
2369 *
2370 * See also:
2371 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2372 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2373 */
2374 static inline LLVMValueRef
2375 lp_build_rcp_refine(struct lp_build_context *bld,
2376 LLVMValueRef a,
2377 LLVMValueRef rcp_a)
2378 {
2379 LLVMBuilderRef builder = bld->gallivm->builder;
2380 LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
2381 LLVMValueRef res;
2382
2383 res = LLVMBuildFMul(builder, a, rcp_a, "");
2384 res = LLVMBuildFSub(builder, two, res, "");
2385 res = LLVMBuildFMul(builder, rcp_a, res, "");
2386
2387 return res;
2388 }
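
/*
 * Numeric sanity check of the step above (illustrative, not from the
 * source): for a = 3 and an estimate x0 = 0.33, one iteration gives
 *
 *    x1 = 0.33 * (2 - 3 * 0.33) = 0.33 * 1.01 = 0.3333
 *
 * roughly doubling the number of correct digits toward 1/3.
 */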
2389
2390
2391 LLVMValueRef
2392 lp_build_rcp(struct lp_build_context *bld,
2393 LLVMValueRef a)
2394 {
2395 LLVMBuilderRef builder = bld->gallivm->builder;
2396 const struct lp_type type = bld->type;
2397
2398 assert(lp_check_value(type, a));
2399
2400 if(a == bld->zero)
2401 return bld->undef;
2402 if(a == bld->one)
2403 return bld->one;
2404 if(a == bld->undef)
2405 return bld->undef;
2406
2407 assert(type.floating);
2408
2409 if(LLVMIsConstant(a))
2410 return LLVMConstFDiv(bld->one, a);
2411
2412 /*
2413 * We don't use RCPPS because:
2414     * - it only has 10 bits of precision
2415     * - it doesn't even get the reciprocal of 1.0 exactly
2416     * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2417     * - for recent processors the benefit over DIVPS is marginal and case
2418     *   dependent
2419     *
2420     * We could still use it on certain processors if benchmarks show that the
2421     * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
2422     * particular uses that require fewer workarounds.
2423 */
2424
2425 if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2426 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2427 const unsigned num_iterations = 0;
2428 LLVMValueRef res;
2429 unsigned i;
2430 const char *intrinsic = NULL;
2431
2432 if (type.length == 4) {
2433 intrinsic = "llvm.x86.sse.rcp.ps";
2434 }
2435 else {
2436 intrinsic = "llvm.x86.avx.rcp.ps.256";
2437 }
2438
2439 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2440
2441 for (i = 0; i < num_iterations; ++i) {
2442 res = lp_build_rcp_refine(bld, a, res);
2443 }
2444
2445 return res;
2446 }
2447
2448 return LLVMBuildFDiv(builder, bld->one, a, "");
2449 }
2450
2451
2452 /**
2453 * Do one Newton-Raphson step to improve rsqrt precision:
2454 *
2455 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2456 *
2457 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2458 */
2459 static inline LLVMValueRef
2460 lp_build_rsqrt_refine(struct lp_build_context *bld,
2461 LLVMValueRef a,
2462 LLVMValueRef rsqrt_a)
2463 {
2464 LLVMBuilderRef builder = bld->gallivm->builder;
2465 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2466 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2467 LLVMValueRef res;
2468
2469 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2470 res = LLVMBuildFMul(builder, a, res, "");
2471 res = LLVMBuildFSub(builder, three, res, "");
2472 res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2473 res = LLVMBuildFMul(builder, half, res, "");
2474
2475 return res;
2476 }
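
/*
 * Numeric sanity check (illustrative): for a = 4 and an estimate x0 = 0.4,
 * one iteration gives
 *
 *    x1 = 0.5 * 0.4 * (3 - 4 * 0.4 * 0.4) = 0.5 * 0.4 * 2.36 = 0.472
 *
 * moving toward the exact value 0.5.
 */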
2477
2478
2479 /**
2480 * Generate 1/sqrt(a).
2481 * Result is undefined for values < 0, infinity for +0.
2482 */
2483 LLVMValueRef
2484 lp_build_rsqrt(struct lp_build_context *bld,
2485 LLVMValueRef a)
2486 {
2487 const struct lp_type type = bld->type;
2488
2489 assert(lp_check_value(type, a));
2490
2491 assert(type.floating);
2492
2493 /*
2494 * This should be faster but all denormals will end up as infinity.
2495 */
2496 if (0 && lp_build_fast_rsqrt_available(type)) {
2497 const unsigned num_iterations = 1;
2498 LLVMValueRef res;
2499 unsigned i;
2500
2501 /* rsqrt(1.0) != 1.0 here */
2502 res = lp_build_fast_rsqrt(bld, a);
2503
2504 if (num_iterations) {
2505 /*
2506 * Newton-Raphson will result in NaN instead of infinity for zero,
2507 * and NaN instead of zero for infinity.
2508 * Also, need to ensure rsqrt(1.0) == 1.0.
2509 * All numbers smaller than FLT_MIN will result in +infinity
2510 * (rsqrtps treats all denormals as zero).
2511 */
2512 LLVMValueRef cmp;
2513 LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2514 LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);
2515
2516 for (i = 0; i < num_iterations; ++i) {
2517 res = lp_build_rsqrt_refine(bld, a, res);
2518 }
2519 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2520 res = lp_build_select(bld, cmp, inf, res);
2521 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2522 res = lp_build_select(bld, cmp, bld->zero, res);
2523 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2524 res = lp_build_select(bld, cmp, bld->one, res);
2525 }
2526
2527 return res;
2528 }
2529
2530 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2531 }
2532
2533 /**
2534  * Returns true if a fast (but inaccurate) rsqrt instruction is available.
2535  * (Callers may want to avoid calling rsqrt_fast if it's not available:
2536  * e.g. for calculating x^0.5 they may do rsqrt_fast(x) * x, but if that is
2537  * unavailable it would turn into sqrt/div/mul, so it is obviously
2538  * much better to just call sqrt, skipping both div and mul.)
2539 */
2540 boolean
2541 lp_build_fast_rsqrt_available(struct lp_type type)
2542 {
2543 assert(type.floating);
2544
2545 if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2546 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2547 return true;
2548 }
2549 return false;
2550 }
2551
2552
2553 /**
2554 * Generate 1/sqrt(a).
2555 * Result is undefined for values < 0, infinity for +0.
2556 * Precision is limited, only ~10 bits guaranteed
2557 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2558 */
2559 LLVMValueRef
2560 lp_build_fast_rsqrt(struct lp_build_context *bld,
2561 LLVMValueRef a)
2562 {
2563 LLVMBuilderRef builder = bld->gallivm->builder;
2564 const struct lp_type type = bld->type;
2565
2566 assert(lp_check_value(type, a));
2567
2568 if (lp_build_fast_rsqrt_available(type)) {
2569 const char *intrinsic = NULL;
2570
2571 if (type.length == 4) {
2572 intrinsic = "llvm.x86.sse.rsqrt.ps";
2573 }
2574 else {
2575 intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2576 }
2577 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2578 }
2579 else {
2580 debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2581 }
2582 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2583 }
2584
2585
2586 /**
2587 * Generate sin(a) or cos(a) using polynomial approximation.
2588  * TODO: it might be worth recognizing sin and cos with the same source
2589  * (i.e. the d3d10 sincos opcode). Doing both at the same time would be
2590  * much cheaper than calculating (nearly) everything twice...
2591  * Not sure it's common enough to be worth bothering, however; the scs
2592  * opcode could also benefit from calculating both.
2593 */
2594 static LLVMValueRef
2595 lp_build_sin_or_cos(struct lp_build_context *bld,
2596 LLVMValueRef a,
2597 boolean cos)
2598 {
2599 struct gallivm_state *gallivm = bld->gallivm;
2600 LLVMBuilderRef b = gallivm->builder;
2601 struct lp_type int_type = lp_int_type(bld->type);
2602
2603 /*
2604 * take the absolute value,
2605 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2606 */
2607
2608 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2609 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2610
2611 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2612 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2613
2614 /*
2615 * scale by 4/Pi
2616 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2617 */
2618
2619 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2620 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2621
2622 /*
2623 * store the integer part of y in mm0
2624 * emm2 = _mm_cvttps_epi32(y);
2625 */
2626
2627 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2628
2629 /*
2630 * j=(j+1) & (~1) (see the cephes sources)
2631 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2632 */
2633
2634 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2635 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2636 /*
2637 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2638 */
2639 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2640 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2641
2642 /*
2643 * y = _mm_cvtepi32_ps(emm2);
2644 */
2645 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2646
2647 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2648 LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2649 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2650 LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2651
2652 /*
2653 * Argument used for poly selection and sign bit determination
2654 * is different for sin vs. cos.
2655 */
2656 LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
2657 emm2_and;
2658
2659 LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
2660 LLVMBuildNot(b, emm2_2, ""), ""),
2661 const_29, "sign_bit") :
2662 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
2663 LLVMBuildShl(b, emm2_add,
2664 const_29, ""), ""),
2665 sign_mask, "sign_bit");
2666
2667 /*
2668     * get the polynomial selection mask
2669     * there is one polynomial for 0 <= x <= Pi/4
2670     * and another one for Pi/4 < x <= Pi/2
2671 * Both branches will be computed.
2672 *
2673 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2674 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2675 */
2676
2677 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
2678 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2679 int_type, PIPE_FUNC_EQUAL,
2680 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2681
2682 /*
2683 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2684 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2685 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2686 */
2687 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2688 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2689 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2690
2691 /*
2692 * The magic pass: "Extended precision modular arithmetic"
2693 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2694 * xmm1 = _mm_mul_ps(y, xmm1);
2695 * xmm2 = _mm_mul_ps(y, xmm2);
2696 * xmm3 = _mm_mul_ps(y, xmm3);
2697 */
2698 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
2699 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
2700 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
2701
2702 /*
2703 * x = _mm_add_ps(x, xmm1);
2704 * x = _mm_add_ps(x, xmm2);
2705 * x = _mm_add_ps(x, xmm3);
2706 */
2707
2708 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
2709 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
2710 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
2711
2712 /*
2713     * Evaluate the first polynomial (0 <= x <= Pi/4)
2714 *
2715 * z = _mm_mul_ps(x,x);
2716 */
2717 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2718
2719 /*
2720 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2721 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2722 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2723 */
2724 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2725 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2726 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2727
2728 /*
2729 * y = *(v4sf*)_ps_coscof_p0;
2730 * y = _mm_mul_ps(y, z);
2731 */
2732 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
2733 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
2734 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
2735 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
2736 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2737 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2738
2739
2740 /*
2741 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2742 * y = _mm_sub_ps(y, tmp);
2743 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2744 */
2745 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2746 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2747 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2748 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2749 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2750
2751 /*
2752 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2753 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2754 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2755 */
2756 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2757 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2758 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2759
2760 /*
2761     * Evaluate the second polynomial (Pi/4 <= x <= Pi/2)
2762 *
2763 * y2 = *(v4sf*)_ps_sincof_p0;
2764 * y2 = _mm_mul_ps(y2, z);
2765 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2766 * y2 = _mm_mul_ps(y2, z);
2767 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2768 * y2 = _mm_mul_ps(y2, z);
2769 * y2 = _mm_mul_ps(y2, x);
2770 * y2 = _mm_add_ps(y2, x);
2771 */
2772
2773 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
2774 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
2775 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
2776 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
2777 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2778 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
2779 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
2780
2781 /*
2782     * select the correct result from the two polynomials
2783 * xmm3 = poly_mask;
2784 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2785 * y = _mm_andnot_ps(xmm3, y);
2786 * y = _mm_or_ps(y,y2);
2787 */
2788 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2789 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2790 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2791 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
2792 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2793 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
2794
2795 /*
2796 * update the sign
2797 * y = _mm_xor_ps(y, sign_bit);
2798 */
2799 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
2800 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
2801
2802 LLVMValueRef isfinite = lp_build_isfinite(bld, a);
2803
2804 /* clamp output to be within [-1, 1] */
2805 y_result = lp_build_clamp(bld, y_result,
2806 lp_build_const_vec(bld->gallivm, bld->type, -1.f),
2807 lp_build_const_vec(bld->gallivm, bld->type, 1.f));
2808 /* If a is -inf, inf or NaN then return NaN */
2809 y_result = lp_build_select(bld, isfinite, y_result,
2810 lp_build_const_vec(bld->gallivm, bld->type, NAN));
2811 return y_result;
2812 }
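
/*
 * Rough scalar outline of the cephes-style scheme above (sketch only, the
 * octant and sign bookkeeping is simplified):
 *
 *    x = fabsf(a);
 *    j = ((int)(x * 4.0/M_PI) + 1) & ~1;   // even number of Pi/4 units
 *    x = x - j * (float)(M_PI/4.0);        // done in three steps (DP1..DP3)
 *    z = x * x;
 *    r = pick_octant(j) ? cos_poly(z) : x * sin_poly(z);
 *    return restore_sign(r, j, a);         // from the octant and sign of a
 */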
2813
2814
2815 /**
2816 * Generate sin(a)
2817 */
2818 LLVMValueRef
2819 lp_build_sin(struct lp_build_context *bld,
2820 LLVMValueRef a)
2821 {
2822 return lp_build_sin_or_cos(bld, a, FALSE);
2823 }
2824
2825
2826 /**
2827 * Generate cos(a)
2828 */
2829 LLVMValueRef
2830 lp_build_cos(struct lp_build_context *bld,
2831 LLVMValueRef a)
2832 {
2833 return lp_build_sin_or_cos(bld, a, TRUE);
2834 }
2835
2836
2837 /**
2838 * Generate pow(x, y)
2839 */
2840 LLVMValueRef
2841 lp_build_pow(struct lp_build_context *bld,
2842 LLVMValueRef x,
2843 LLVMValueRef y)
2844 {
2845 /* TODO: optimize the constant case */
2846 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2847 LLVMIsConstant(x) && LLVMIsConstant(y)) {
2848 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2849 __FUNCTION__);
2850 }
2851
2852 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
2853 }
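
/*
 * For reference, the identity used above is pow(x, y) = 2^(y * log2(x)),
 * e.g. pow(2, 10) = exp2(10 * log2(2)) = exp2(10) = 1024.  Like the
 * underlying log2, it is only well defined for x > 0.
 */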
2854
2855
2856 /**
2857 * Generate exp(x)
2858 */
2859 LLVMValueRef
2860 lp_build_exp(struct lp_build_context *bld,
2861 LLVMValueRef x)
2862 {
2863 /* log2(e) = 1/log(2) */
2864 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
2865 1.4426950408889634);
2866
2867 assert(lp_check_value(bld->type, x));
2868
2869 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
2870 }
2871
2872
2873 /**
2874 * Generate log(x)
2875 * Behavior is undefined with infs, 0s and nans
2876 */
2877 LLVMValueRef
2878 lp_build_log(struct lp_build_context *bld,
2879 LLVMValueRef x)
2880 {
2881 /* log(2) */
2882 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2883 0.69314718055994529);
2884
2885 assert(lp_check_value(bld->type, x));
2886
2887 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
2888 }
2889
2890 /**
2891 * Generate log(x) that handles edge cases (infs, 0s and nans)
2892 */
2893 LLVMValueRef
2894 lp_build_log_safe(struct lp_build_context *bld,
2895 LLVMValueRef x)
2896 {
2897 /* log(2) */
2898 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2899 0.69314718055994529);
2900
2901 assert(lp_check_value(bld->type, x));
2902
2903 return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
2904 }
2905
2906
2907 /**
2908 * Generate polynomial.
2909 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
2910 */
2911 LLVMValueRef
2912 lp_build_polynomial(struct lp_build_context *bld,
2913 LLVMValueRef x,
2914 const double *coeffs,
2915 unsigned num_coeffs)
2916 {
2917 const struct lp_type type = bld->type;
2918 LLVMValueRef even = NULL, odd = NULL;
2919 LLVMValueRef x2;
2920 unsigned i;
2921
2922 assert(lp_check_value(bld->type, x));
2923
2924 /* TODO: optimize the constant case */
2925 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2926 LLVMIsConstant(x)) {
2927 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2928 __FUNCTION__);
2929 }
2930
2931 /*
2932     * Calculate odd and even terms separately to decrease data dependency
2933 * Ex:
2934 * c[0] + x^2 * c[2] + x^4 * c[4] ...
2935 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
2936 */
2937 x2 = lp_build_mul(bld, x, x);
2938
2939 for (i = num_coeffs; i--; ) {
2940 LLVMValueRef coeff;
2941
2942 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
2943
2944 if (i % 2 == 0) {
2945 if (even)
2946 even = lp_build_add(bld, coeff, lp_build_mul(bld, x2, even));
2947 else
2948 even = coeff;
2949 } else {
2950 if (odd)
2951 odd = lp_build_add(bld, coeff, lp_build_mul(bld, x2, odd));
2952 else
2953 odd = coeff;
2954 }
2955 }
2956
2957 if (odd)
2958 return lp_build_add(bld, lp_build_mul(bld, odd, x), even);
2959 else if (even)
2960 return even;
2961 else
2962 return bld->undef;
2963 }
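
/*
 * For example, with four coefficients the loop above computes (sketch):
 *
 *    even = c[0] + x2 * c[2]
 *    odd  = c[1] + x2 * c[3]
 *    p(x) = odd * x + even
 *
 * i.e. a Horner evaluation split into two independent chains over x2 = x*x,
 * which shortens the dependency chain compared to plain Horner.
 */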
2964
2965
2966 /**
2967 * Minimax polynomial fit of 2**x, in range [0, 1[
2968 */
2969 const double lp_build_exp2_polynomial[] = {
2970 #if EXP_POLY_DEGREE == 5
2971 1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
2972 0.693153073200168932794,
2973 0.240153617044375388211,
2974 0.0558263180532956664775,
2975 0.00898934009049466391101,
2976 0.00187757667519147912699
2977 #elif EXP_POLY_DEGREE == 4
2978 1.00000259337069434683,
2979 0.693003834469974940458,
2980 0.24144275689150793076,
2981 0.0520114606103070150235,
2982 0.0135341679161270268764
2983 #elif EXP_POLY_DEGREE == 3
2984 0.999925218562710312959,
2985 0.695833540494823811697,
2986 0.226067155427249155588,
2987 0.0780245226406372992967
2988 #elif EXP_POLY_DEGREE == 2
2989 1.00172476321474503578,
2990 0.657636275736077639316,
2991 0.33718943461968720704
2992 #else
2993 #error
2994 #endif
2995 };
2996
2997
2998 LLVMValueRef
2999 lp_build_exp2(struct lp_build_context *bld,
3000 LLVMValueRef x)
3001 {
3002 LLVMBuilderRef builder = bld->gallivm->builder;
3003 const struct lp_type type = bld->type;
3004 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3005 LLVMValueRef ipart = NULL;
3006 LLVMValueRef fpart = NULL;
3007 LLVMValueRef expipart = NULL;
3008 LLVMValueRef expfpart = NULL;
3009 LLVMValueRef res = NULL;
3010
3011 assert(lp_check_value(bld->type, x));
3012
3013 /* TODO: optimize the constant case */
3014 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3015 LLVMIsConstant(x)) {
3016 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3017 __FUNCTION__);
3018 }
3019
3020 assert(type.floating && type.width == 32);
3021
3022    /* We want to preserve NaN and make sure that for exp2 if x > 128,
3023 * the result is INF and if it's smaller than -126.9 the result is 0 */
3024 x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
3025 GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3026 x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
3027 x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3028
3029 /* ipart = floor(x) */
3030 /* fpart = x - ipart */
3031 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3032
3033 /* expipart = (float) (1 << ipart) */
3034 expipart = LLVMBuildAdd(builder, ipart,
3035 lp_build_const_int_vec(bld->gallivm, type, 127), "");
3036 expipart = LLVMBuildShl(builder, expipart,
3037 lp_build_const_int_vec(bld->gallivm, type, 23), "");
3038 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
3039
3040 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3041 Elements(lp_build_exp2_polynomial));
3042
3043 res = LLVMBuildFMul(builder, expipart, expfpart, "");
3044
3045 return res;
3046 }
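
/*
 * Outline of the decomposition above (sketch only, bits_to_float() is a
 * hypothetical helper): 2^x = 2^ipart * 2^fpart with ipart = floor(x) and
 * fpart in [0, 1[.  2^ipart is built directly in the IEEE-754 exponent
 * field and 2^fpart comes from the minimax polynomial:
 *
 *    ipart    = ifloor(x);
 *    fpart    = x - ipart;
 *    expipart = bits_to_float((ipart + 127) << 23);  // exact power of two
 *    result   = expipart * poly(fpart);
 */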
3047
3048
3049
3050 /**
3051 * Extract the exponent of a IEEE-754 floating point value.
3052 *
3053 * Optionally apply an integer bias.
3054 *
3055 * Result is an integer value with
3056 *
3057 * ifloor(log2(x)) + bias
3058 */
3059 LLVMValueRef
3060 lp_build_extract_exponent(struct lp_build_context *bld,
3061 LLVMValueRef x,
3062 int bias)
3063 {
3064 LLVMBuilderRef builder = bld->gallivm->builder;
3065 const struct lp_type type = bld->type;
3066 unsigned mantissa = lp_mantissa(type);
3067 LLVMValueRef res;
3068
3069 assert(type.floating);
3070
3071 assert(lp_check_value(bld->type, x));
3072
3073 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3074
3075 res = LLVMBuildLShr(builder, x,
3076 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3077 res = LLVMBuildAnd(builder, res,
3078 lp_build_const_int_vec(bld->gallivm, type, 255), "");
3079 res = LLVMBuildSub(builder, res,
3080 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3081
3082 return res;
3083 }
3084
3085
3086 /**
3087  * Extract the mantissa of a floating point value.
3088 *
3089 * Result is a floating point value with
3090 *
3091  *   x / 2**floor(log2(x))
3092 */
3093 LLVMValueRef
3094 lp_build_extract_mantissa(struct lp_build_context *bld,
3095 LLVMValueRef x)
3096 {
3097 LLVMBuilderRef builder = bld->gallivm->builder;
3098 const struct lp_type type = bld->type;
3099 unsigned mantissa = lp_mantissa(type);
3100 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3101 (1ULL << mantissa) - 1);
3102 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3103 LLVMValueRef res;
3104
3105 assert(lp_check_value(bld->type, x));
3106
3107 assert(type.floating);
3108
3109 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3110
3111 /* res = x / 2**ipart */
3112 res = LLVMBuildAnd(builder, x, mantmask, "");
3113 res = LLVMBuildOr(builder, res, one, "");
3114 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3115
3116 return res;
3117 }
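
/*
 * Both helpers above just pick apart the IEEE-754 single precision layout
 * (1 sign bit, 8 exponent bits, 23 mantissa bits).  Scalar sketch with
 * hypothetical bits_of()/bits_to_float() helpers:
 *
 *    exponent(x) = (int)((bits_of(x) >> 23) & 0xff) - 127
 *    mantissa(x) = bits_to_float((bits_of(x) & 0x7fffff) | bits_of(1.0f))
 *
 * so for normalized positive x, exponent(x) is floor(log2(x)) and
 * mantissa(x) is x scaled into [1, 2[.
 */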
3118
3119
3120
3121 /**
3122  * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in the range [0, 1/9[
3123  * These coefficients can be generated with
3124 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3125 */
3126 const double lp_build_log2_polynomial[] = {
3127 #if LOG_POLY_DEGREE == 5
3128 2.88539008148777786488L,
3129 0.961796878841293367824L,
3130 0.577058946784739859012L,
3131 0.412914355135828735411L,
3132 0.308591899232910175289L,
3133 0.352376952300281371868L,
3134 #elif LOG_POLY_DEGREE == 4
3135 2.88539009343309178325L,
3136 0.961791550404184197881L,
3137 0.577440339438736392009L,
3138 0.403343858251329912514L,
3139 0.406718052498846252698L,
3140 #elif LOG_POLY_DEGREE == 3
3141 2.88538959748872753838L,
3142 0.961932915889597772928L,
3143 0.571118517972136195241L,
3144 0.493997535084709500285L,
3145 #else
3146 #error
3147 #endif
3148 };
3149
3150 /**
3151 * See http://www.devmaster.net/forums/showthread.php?p=43580
3152 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3153 * http://www.nezumi.demon.co.uk/consult/logx.htm
3154 *
3155 * If handle_edge_cases is true the function will perform computations
3156 * to match the required D3D10+ behavior for each of the edge cases.
3157 * That means that if input is:
3158  * - less than zero (down to and including -inf), then NaN will be returned
3159 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3160 * - +infinity, then +infinity will be returned
3161 * - NaN, then NaN will be returned
3162 *
3163 * Those checks are fairly expensive so if you don't need them make sure
3164 * handle_edge_cases is false.
3165 */
3166 void
3167 lp_build_log2_approx(struct lp_build_context *bld,
3168 LLVMValueRef x,
3169 LLVMValueRef *p_exp,
3170 LLVMValueRef *p_floor_log2,
3171 LLVMValueRef *p_log2,
3172 boolean handle_edge_cases)
3173 {
3174 LLVMBuilderRef builder = bld->gallivm->builder;
3175 const struct lp_type type = bld->type;
3176 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3177 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3178
3179 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3180 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3181 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3182
3183 LLVMValueRef i = NULL;
3184 LLVMValueRef y = NULL;
3185 LLVMValueRef z = NULL;
3186 LLVMValueRef exp = NULL;
3187 LLVMValueRef mant = NULL;
3188 LLVMValueRef logexp = NULL;
3189 LLVMValueRef logmant = NULL;
3190 LLVMValueRef res = NULL;
3191
3192 assert(lp_check_value(bld->type, x));
3193
3194 if(p_exp || p_floor_log2 || p_log2) {
3195 /* TODO: optimize the constant case */
3196 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3197 LLVMIsConstant(x)) {
3198 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3199 __FUNCTION__);
3200 }
3201
3202 assert(type.floating && type.width == 32);
3203
3204 /*
3205 * We don't explicitly handle denormalized numbers. They will yield a
3206        * result in the neighbourhood of -127, which appears to be
3207        * adequate.
3208 */
3209
3210 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3211
3212 /* exp = (float) exponent(x) */
3213 exp = LLVMBuildAnd(builder, i, expmask, "");
3214 }
3215
3216 if(p_floor_log2 || p_log2) {
3217 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3218 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3219 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3220 }
3221
3222 if (p_log2) {
3223 /* mant = 1 + (float) mantissa(x) */
3224 mant = LLVMBuildAnd(builder, i, mantmask, "");
3225 mant = LLVMBuildOr(builder, mant, one, "");
3226 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3227
3228 /* y = (mant - 1) / (mant + 1) */
3229 y = lp_build_div(bld,
3230 lp_build_sub(bld, mant, bld->one),
3231 lp_build_add(bld, mant, bld->one)
3232 );
3233
3234 /* z = y^2 */
3235 z = lp_build_mul(bld, y, y);
3236
3237 /* compute P(z) */
3238 logmant = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3239 Elements(lp_build_log2_polynomial));
3240
3241 /* logmant = y * P(z) */
3242 logmant = lp_build_mul(bld, y, logmant);
3243
3244 res = lp_build_add(bld, logmant, logexp);
3245
3246 if (type.floating && handle_edge_cases) {
3247 LLVMValueRef negmask, infmask, zmask;
3248 negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3249 lp_build_const_vec(bld->gallivm, type, 0.0f));
3250 zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3251 lp_build_const_vec(bld->gallivm, type, 0.0f));
3252 infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3253 lp_build_const_vec(bld->gallivm, type, INFINITY));
3254
3255          /* If x is equal to inf make sure we return inf */
3256 res = lp_build_select(bld, infmask,
3257 lp_build_const_vec(bld->gallivm, type, INFINITY),
3258 res);
3259          /* If x is equal to 0, return -inf */
3260 res = lp_build_select(bld, zmask,
3261 lp_build_const_vec(bld->gallivm, type, -INFINITY),
3262 res);
3263 /* If x is nan or less than 0, return nan */
3264 res = lp_build_select(bld, negmask,
3265 lp_build_const_vec(bld->gallivm, type, NAN),
3266 res);
3267 }
3268 }
3269
3270 if (p_exp) {
3271 exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3272 *p_exp = exp;
3273 }
3274
3275 if (p_floor_log2)
3276 *p_floor_log2 = logexp;
3277
3278 if (p_log2)
3279 *p_log2 = res;
3280 }
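
/*
 * The mantissa part above uses the atanh-style series: for m in [1, 2[ and
 * y = (m - 1)/(m + 1) (so y^2 < 1/9, matching the polynomial's fit range),
 *
 *    log2(m) = 2/ln(2) * atanh(y) ~= y * P(y^2)
 *
 * with P the minimax polynomial defined above, and the final result is
 * log2(x) = exponent(x) + log2(mantissa(x)).
 */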
3281
3282
3283 /*
3284 * log2 implementation which doesn't have special code to
3285 * handle edge cases (-inf, 0, inf, NaN). It's faster but
3286 * the results for those cases are undefined.
3287 */
3288 LLVMValueRef
3289 lp_build_log2(struct lp_build_context *bld,
3290 LLVMValueRef x)
3291 {
3292 LLVMValueRef res;
3293 lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
3294 return res;
3295 }
3296
3297 /*
3298 * Version of log2 which handles all edge cases.
3299 * Look at documentation of lp_build_log2_approx for
3300 * description of the behavior for each of the edge cases.
3301 */
3302 LLVMValueRef
3303 lp_build_log2_safe(struct lp_build_context *bld,
3304 LLVMValueRef x)
3305 {
3306 LLVMValueRef res;
3307 lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
3308 return res;
3309 }
3310
3311
3312 /**
3313 * Faster (and less accurate) log2.
3314 *
3315 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3316 *
3317 * Piece-wise linear approximation, with exact results when x is a
3318 * power of two.
3319 *
3320 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3321 */
3322 LLVMValueRef
3323 lp_build_fast_log2(struct lp_build_context *bld,
3324 LLVMValueRef x)
3325 {
3326 LLVMBuilderRef builder = bld->gallivm->builder;
3327 LLVMValueRef ipart;
3328 LLVMValueRef fpart;
3329
3330 assert(lp_check_value(bld->type, x));
3331
3332 assert(bld->type.floating);
3333
3334 /* ipart = floor(log2(x)) - 1 */
3335 ipart = lp_build_extract_exponent(bld, x, -1);
3336 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3337
3338 /* fpart = x / 2**ipart */
3339 fpart = lp_build_extract_mantissa(bld, x);
3340
3341 /* ipart + fpart */
3342 return LLVMBuildFAdd(builder, ipart, fpart, "");
3343 }
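
/*
 * Example of the piecewise linear approximation above (illustrative): for
 * x = 3.0 the exponent is 1 and the mantissa is 1.5, so fast_log2(3.0) =
 * (1 - 1) + 1.5 = 1.5, versus the exact log2(3.0) ~= 1.585.  At powers of
 * two the mantissa is exactly 1.0 and the result is exact.
 */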
3344
3345
3346 /**
3347 * Fast implementation of iround(log2(x)).
3348 *
3349 * Not an approximation -- it should give accurate results all the time.
3350 */
3351 LLVMValueRef
3352 lp_build_ilog2(struct lp_build_context *bld,
3353 LLVMValueRef x)
3354 {
3355 LLVMBuilderRef builder = bld->gallivm->builder;
3356 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3357 LLVMValueRef ipart;
3358
3359 assert(bld->type.floating);
3360
3361 assert(lp_check_value(bld->type, x));
3362
3363 /* x * 2^(0.5) i.e., add 0.5 to the log2(x) */
3364 x = LLVMBuildFMul(builder, x, sqrt2, "");
3365
3366 /* ipart = floor(log2(x) + 0.5) */
3367 ipart = lp_build_extract_exponent(bld, x, 0);
3368
3369 return ipart;
3370 }
3371
3372 LLVMValueRef
3373 lp_build_mod(struct lp_build_context *bld,
3374 LLVMValueRef x,
3375 LLVMValueRef y)
3376 {
3377 LLVMBuilderRef builder = bld->gallivm->builder;
3378 LLVMValueRef res;
3379 const struct lp_type type = bld->type;
3380
3381 assert(lp_check_value(type, x));
3382 assert(lp_check_value(type, y));
3383
3384 if (type.floating)
3385 res = LLVMBuildFRem(builder, x, y, "");
3386 else if (type.sign)
3387 res = LLVMBuildSRem(builder, x, y, "");
3388 else
3389 res = LLVMBuildURem(builder, x, y, "");
3390 return res;
3391 }
3392
3393
3394 /*
3395 * For floating inputs it creates and returns a mask
3396 * which is all 1's for channels which are NaN.
3397 * Channels inside x which are not NaN will be 0.
3398 */
3399 LLVMValueRef
3400 lp_build_isnan(struct lp_build_context *bld,
3401 LLVMValueRef x)
3402 {
3403 LLVMValueRef mask;
3404 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3405
3406 assert(bld->type.floating);
3407 assert(lp_check_value(bld->type, x));
3408
3409 mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3410 "isnotnan");
3411 mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3412 mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3413 return mask;
3414 }
3415
3416 /* Returns all 1's for floating point numbers that are
3417  * finite and returns all zeros for -inf,
3418  * inf and NaNs */
3419 LLVMValueRef
3420 lp_build_isfinite(struct lp_build_context *bld,
3421 LLVMValueRef x)
3422 {
3423 LLVMBuilderRef builder = bld->gallivm->builder;
3424 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3425 struct lp_type int_type = lp_int_type(bld->type);
3426 LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3427 LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3428 0x7f800000);
3429
3430 if (!bld->type.floating) {
3431 return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3432 }
3433 assert(bld->type.floating);
3434 assert(lp_check_value(bld->type, x));
3435 assert(bld->type.width == 32);
3436
3437 intx = LLVMBuildAnd(builder, intx, infornan32, "");
3438 return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3439 intx, infornan32);
3440 }
3441
3442 /*
3443 * Returns true if the number is nan or inf and false otherwise.
3444 * The input has to be a floating point vector.
3445 */
3446 LLVMValueRef
3447 lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
3448 const struct lp_type type,
3449 LLVMValueRef x)
3450 {
3451 LLVMBuilderRef builder = gallivm->builder;
3452 struct lp_type int_type = lp_int_type(type);
3453 LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
3454 0x7f800000);
3455 LLVMValueRef ret;
3456
3457 assert(type.floating);
3458
3459 ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
3460 ret = LLVMBuildAnd(builder, ret, const0, "");
3461 ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
3462 ret, const0);
3463
3464 return ret;
3465 }
3466
3467
3468 LLVMValueRef
3469 lp_build_fpstate_get(struct gallivm_state *gallivm)
3470 {
3471 if (util_cpu_caps.has_sse) {
3472 LLVMBuilderRef builder = gallivm->builder;
3473 LLVMValueRef mxcsr_ptr = lp_build_alloca(
3474 gallivm,
3475 LLVMInt32TypeInContext(gallivm->context),
3476 "mxcsr_ptr");
3477 LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
3478 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3479 lp_build_intrinsic(builder,
3480 "llvm.x86.sse.stmxcsr",
3481 LLVMVoidTypeInContext(gallivm->context),
3482 &mxcsr_ptr8, 1, 0);
3483 return mxcsr_ptr;
3484 }
3485 return 0;
3486 }
3487
3488 void
3489 lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
3490 boolean zero)
3491 {
3492 if (util_cpu_caps.has_sse) {
3493 /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
3494 int daz_ftz = _MM_FLUSH_ZERO_MASK;
3495
3496 LLVMBuilderRef builder = gallivm->builder;
3497 LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
3498 LLVMValueRef mxcsr =
3499 LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");
3500
3501 if (util_cpu_caps.has_daz) {
3502 /* Enable denormals are zero mode */
3503 daz_ftz |= _MM_DENORMALS_ZERO_MASK;
3504 }
3505 if (zero) {
3506 mxcsr = LLVMBuildOr(builder, mxcsr,
3507 LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
3508 } else {
3509 mxcsr = LLVMBuildAnd(builder, mxcsr,
3510 LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
3511 }
3512
3513 LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
3514 lp_build_fpstate_set(gallivm, mxcsr_ptr);
3515 }
3516 }
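
/*
 * Host-side equivalent of the IR generated above, for reference (sketch
 * only, using the standard SSE control register intrinsics):
 *
 *    unsigned mxcsr = _mm_getcsr();
 *    unsigned daz_ftz = _MM_FLUSH_ZERO_MASK | _MM_DENORMALS_ZERO_MASK;
 *    _mm_setcsr(zero ? (mxcsr | daz_ftz) : (mxcsr & ~daz_ftz));
 *
 * with the DAZ bit only included when the CPU actually supports it.
 */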
3517
3518 void
3519 lp_build_fpstate_set(struct gallivm_state *gallivm,
3520 LLVMValueRef mxcsr_ptr)
3521 {
3522 if (util_cpu_caps.has_sse) {
3523 LLVMBuilderRef builder = gallivm->builder;
3524 mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
3525 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3526 lp_build_intrinsic(builder,
3527 "llvm.x86.sse.ldmxcsr",
3528 LLVMVoidTypeInContext(gallivm->context),
3529 &mxcsr_ptr, 1, 0);
3530 }
3531 }