gallivm: eliminate an unnecessary AND with unorm lerps
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include <float.h>
49
50 #include "util/u_memory.h"
51 #include "util/u_debug.h"
52 #include "util/u_math.h"
53 #include "util/u_string.h"
54 #include "util/u_cpu_detect.h"
55
56 #include "lp_bld_type.h"
57 #include "lp_bld_const.h"
58 #include "lp_bld_init.h"
59 #include "lp_bld_intr.h"
60 #include "lp_bld_logic.h"
61 #include "lp_bld_pack.h"
62 #include "lp_bld_debug.h"
63 #include "lp_bld_bitarit.h"
64 #include "lp_bld_arit.h"
65 #include "lp_bld_flow.h"
66
67 #if defined(PIPE_ARCH_SSE)
68 #include <xmmintrin.h>
69 #endif
70
71 #ifndef _MM_DENORMALS_ZERO_MASK
72 #define _MM_DENORMALS_ZERO_MASK 0x0040
73 #endif
74
75 #ifndef _MM_FLUSH_ZERO_MASK
76 #define _MM_FLUSH_ZERO_MASK 0x8000
77 #endif
78
79 #define EXP_POLY_DEGREE 5
80
81 #define LOG_POLY_DEGREE 4
82
83
84 /**
85 * Generate min(a, b)
86 * No checks for special case values of a or b = 1 or 0 are done.
87 * NaN's are handled according to the behavior specified by the
88 * nan_behavior argument.
89 */
90 static LLVMValueRef
91 lp_build_min_simple(struct lp_build_context *bld,
92 LLVMValueRef a,
93 LLVMValueRef b,
94 enum gallivm_nan_behavior nan_behavior)
95 {
96 const struct lp_type type = bld->type;
97 const char *intrinsic = NULL;
98 unsigned intr_size = 0;
99 LLVMValueRef cond;
100
101 assert(lp_check_value(type, a));
102 assert(lp_check_value(type, b));
103
104 /* TODO: optimize the constant case */
105
106 if (type.floating && util_cpu_caps.has_sse) {
107 if (type.width == 32) {
108 if (type.length == 1) {
109 intrinsic = "llvm.x86.sse.min.ss";
110 intr_size = 128;
111 }
112 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
113 intrinsic = "llvm.x86.sse.min.ps";
114 intr_size = 128;
115 }
116 else {
117 intrinsic = "llvm.x86.avx.min.ps.256";
118 intr_size = 256;
119 }
120 }
121 if (type.width == 64 && util_cpu_caps.has_sse2) {
122 if (type.length == 1) {
123 intrinsic = "llvm.x86.sse2.min.sd";
124 intr_size = 128;
125 }
126 else if (type.length == 2 || !util_cpu_caps.has_avx) {
127 intrinsic = "llvm.x86.sse2.min.pd";
128 intr_size = 128;
129 }
130 else {
131 intrinsic = "llvm.x86.avx.min.pd.256";
132 intr_size = 256;
133 }
134 }
135 }
136 else if (type.floating && util_cpu_caps.has_altivec) {
137 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
138 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
139 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
140 __FUNCTION__);
141 }
142 if (type.width == 32 && type.length == 4) {
143 intrinsic = "llvm.ppc.altivec.vminfp";
144 intr_size = 128;
145 }
146 } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
147 intr_size = 128;
148 if ((type.width == 8 || type.width == 16) &&
149 (type.width * type.length <= 64) &&
150 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
151 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
152 __FUNCTION__);
153 }
154 if (type.width == 8 && !type.sign) {
155 intrinsic = "llvm.x86.sse2.pminu.b";
156 }
157 else if (type.width == 16 && type.sign) {
158 intrinsic = "llvm.x86.sse2.pmins.w";
159 }
160 if (util_cpu_caps.has_sse4_1) {
161 if (type.width == 8 && type.sign) {
162 intrinsic = "llvm.x86.sse41.pminsb";
163 }
164 if (type.width == 16 && !type.sign) {
165 intrinsic = "llvm.x86.sse41.pminuw";
166 }
167 if (type.width == 32 && !type.sign) {
168 intrinsic = "llvm.x86.sse41.pminud";
169 }
170 if (type.width == 32 && type.sign) {
171 intrinsic = "llvm.x86.sse41.pminsd";
172 }
173 }
174 } else if (util_cpu_caps.has_altivec) {
175 intr_size = 128;
176 if (type.width == 8) {
177 if (!type.sign) {
178 intrinsic = "llvm.ppc.altivec.vminub";
179 } else {
180 intrinsic = "llvm.ppc.altivec.vminsb";
181 }
182 } else if (type.width == 16) {
183 if (!type.sign) {
184 intrinsic = "llvm.ppc.altivec.vminuh";
185 } else {
186 intrinsic = "llvm.ppc.altivec.vminsh";
187 }
188 } else if (type.width == 32) {
189 if (!type.sign) {
190 intrinsic = "llvm.ppc.altivec.vminuw";
191 } else {
192 intrinsic = "llvm.ppc.altivec.vminsw";
193 }
194 }
195 }
196
197 if (intrinsic) {
198 /* We need to handle nan's for floating point numbers. If one of the
199 * inputs is nan the other should be returned (required by both D3D10+
200 * and OpenCL).
201 * The sse intrinsics return the second operand in case of nan by
202 * default so we need special code to handle those.
203 */
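/*
 * (Editor's illustration: with the raw sse min intrinsic, min(NaN, x) yields x
 * but min(x, NaN) yields NaN, so only one of the two NaN operand cases needs
 * the extra isnan/select fixup below.)
 */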
204 if (util_cpu_caps.has_sse && type.floating &&
205 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
206 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
207 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
208 LLVMValueRef isnan, min;
209 min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
210 type,
211 intr_size, a, b);
212 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
213 isnan = lp_build_isnan(bld, b);
214 return lp_build_select(bld, isnan, a, min);
215 } else {
216 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
217 isnan = lp_build_isnan(bld, a);
218 return lp_build_select(bld, isnan, a, min);
219 }
220 } else {
221 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
222 type,
223 intr_size, a, b);
224 }
225 }
226
227 if (type.floating) {
228 switch (nan_behavior) {
229 case GALLIVM_NAN_RETURN_NAN: {
230 LLVMValueRef isnan = lp_build_isnan(bld, b);
231 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
232 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
233 return lp_build_select(bld, cond, a, b);
234 }
235 break;
236 case GALLIVM_NAN_RETURN_OTHER: {
237 LLVMValueRef isnan = lp_build_isnan(bld, a);
238 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
239 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
240 return lp_build_select(bld, cond, a, b);
241 }
242 break;
243 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
244 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
245 return lp_build_select(bld, cond, a, b);
246 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
247 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
248 return lp_build_select(bld, cond, b, a);
249 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
250 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
251 return lp_build_select(bld, cond, a, b);
252 break;
253 default:
254 assert(0);
255 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
256 return lp_build_select(bld, cond, a, b);
257 }
258 } else {
259 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
260 return lp_build_select(bld, cond, a, b);
261 }
262 }
263
264
265 /**
266 * Generate max(a, b)
267 * No checks for special case values of a or b = 1 or 0 are done.
268 * NaN's are handled according to the behavior specified by the
269 * nan_behavior argument.
270 */
271 static LLVMValueRef
272 lp_build_max_simple(struct lp_build_context *bld,
273 LLVMValueRef a,
274 LLVMValueRef b,
275 enum gallivm_nan_behavior nan_behavior)
276 {
277 const struct lp_type type = bld->type;
278 const char *intrinsic = NULL;
279 unsigned intr_size = 0;
280 LLVMValueRef cond;
281
282 assert(lp_check_value(type, a));
283 assert(lp_check_value(type, b));
284
285 /* TODO: optimize the constant case */
286
287 if (type.floating && util_cpu_caps.has_sse) {
288 if (type.width == 32) {
289 if (type.length == 1) {
290 intrinsic = "llvm.x86.sse.max.ss";
291 intr_size = 128;
292 }
293 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
294 intrinsic = "llvm.x86.sse.max.ps";
295 intr_size = 128;
296 }
297 else {
298 intrinsic = "llvm.x86.avx.max.ps.256";
299 intr_size = 256;
300 }
301 }
302 if (type.width == 64 && util_cpu_caps.has_sse2) {
303 if (type.length == 1) {
304 intrinsic = "llvm.x86.sse2.max.sd";
305 intr_size = 128;
306 }
307 else if (type.length == 2 || !util_cpu_caps.has_avx) {
308 intrinsic = "llvm.x86.sse2.max.pd";
309 intr_size = 128;
310 }
311 else {
312 intrinsic = "llvm.x86.avx.max.pd.256";
313 intr_size = 256;
314 }
315 }
316 }
317 else if (type.floating && util_cpu_caps.has_altivec) {
318 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
319 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
320 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
321 __FUNCTION__);
322 }
323 if (type.width == 32 && type.length == 4) {
324 intrinsic = "llvm.ppc.altivec.vmaxfp";
325 intr_size = 128;
326 }
327 } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
328 intr_size = 128;
329 if ((type.width == 8 || type.width == 16) &&
330 (type.width * type.length <= 64) &&
331 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
332 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
333 __FUNCTION__);
334 }
335 if (type.width == 8 && !type.sign) {
336 intrinsic = "llvm.x86.sse2.pmaxu.b";
337 intr_size = 128;
338 }
339 else if (type.width == 16 && type.sign) {
340 intrinsic = "llvm.x86.sse2.pmaxs.w";
341 }
342 if (util_cpu_caps.has_sse4_1) {
343 if (type.width == 8 && type.sign) {
344 intrinsic = "llvm.x86.sse41.pmaxsb";
345 }
346 if (type.width == 16 && !type.sign) {
347 intrinsic = "llvm.x86.sse41.pmaxuw";
348 }
349 if (type.width == 32 && !type.sign) {
350 intrinsic = "llvm.x86.sse41.pmaxud";
351 }
352 if (type.width == 32 && type.sign) {
353 intrinsic = "llvm.x86.sse41.pmaxsd";
354 }
355 }
356 } else if (util_cpu_caps.has_altivec) {
357 intr_size = 128;
358 if (type.width == 8) {
359 if (!type.sign) {
360 intrinsic = "llvm.ppc.altivec.vmaxub";
361 } else {
362 intrinsic = "llvm.ppc.altivec.vmaxsb";
363 }
364 } else if (type.width == 16) {
365 if (!type.sign) {
366 intrinsic = "llvm.ppc.altivec.vmaxuh";
367 } else {
368 intrinsic = "llvm.ppc.altivec.vmaxsh";
369 }
370 } else if (type.width == 32) {
371 if (!type.sign) {
372 intrinsic = "llvm.ppc.altivec.vmaxuw";
373 } else {
374 intrinsic = "llvm.ppc.altivec.vmaxsw";
375 }
376 }
377 }
378
379 if (intrinsic) {
380 if (util_cpu_caps.has_sse && type.floating &&
381 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
382 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
383 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
384 LLVMValueRef isnan, max;
385 max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
386 type,
387 intr_size, a, b);
388 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
389 isnan = lp_build_isnan(bld, b);
390 return lp_build_select(bld, isnan, a, max);
391 } else {
392 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
393 isnan = lp_build_isnan(bld, a);
394 return lp_build_select(bld, isnan, a, max);
395 }
396 } else {
397 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
398 type,
399 intr_size, a, b);
400 }
401 }
402
403 if (type.floating) {
404 switch (nan_behavior) {
405 case GALLIVM_NAN_RETURN_NAN: {
406 LLVMValueRef isnan = lp_build_isnan(bld, b);
407 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
408 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
409 return lp_build_select(bld, cond, a, b);
410 }
411 break;
412 case GALLIVM_NAN_RETURN_OTHER: {
413 LLVMValueRef isnan = lp_build_isnan(bld, a);
414 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
415 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
416 return lp_build_select(bld, cond, a, b);
417 }
418 break;
419 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
420 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
421 return lp_build_select(bld, cond, a, b);
422 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
423 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
424 return lp_build_select(bld, cond, b, a);
425 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
426 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
427 return lp_build_select(bld, cond, a, b);
428 break;
429 default:
430 assert(0);
431 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
432 return lp_build_select(bld, cond, a, b);
433 }
434 } else {
435 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
436 return lp_build_select(bld, cond, a, b);
437 }
438 }
439
440
441 /**
442 * Generate 1 - a, or ~a depending on bld->type.
443 */
444 LLVMValueRef
445 lp_build_comp(struct lp_build_context *bld,
446 LLVMValueRef a)
447 {
448 LLVMBuilderRef builder = bld->gallivm->builder;
449 const struct lp_type type = bld->type;
450
451 assert(lp_check_value(type, a));
452
453 if(a == bld->one)
454 return bld->zero;
455 if(a == bld->zero)
456 return bld->one;
457
458 if(type.norm && !type.floating && !type.fixed && !type.sign) {
459 if(LLVMIsConstant(a))
460 return LLVMConstNot(a);
461 else
462 return LLVMBuildNot(builder, a, "");
463 }
464
465 if(LLVMIsConstant(a))
466 if (type.floating)
467 return LLVMConstFSub(bld->one, a);
468 else
469 return LLVMConstSub(bld->one, a);
470 else
471 if (type.floating)
472 return LLVMBuildFSub(builder, bld->one, a, "");
473 else
474 return LLVMBuildSub(builder, bld->one, a, "");
475 }
476
477
478 /**
479 * Generate a + b
480 */
481 LLVMValueRef
482 lp_build_add(struct lp_build_context *bld,
483 LLVMValueRef a,
484 LLVMValueRef b)
485 {
486 LLVMBuilderRef builder = bld->gallivm->builder;
487 const struct lp_type type = bld->type;
488 LLVMValueRef res;
489
490 assert(lp_check_value(type, a));
491 assert(lp_check_value(type, b));
492
493 if(a == bld->zero)
494 return b;
495 if(b == bld->zero)
496 return a;
497 if(a == bld->undef || b == bld->undef)
498 return bld->undef;
499
500 if(bld->type.norm) {
501 const char *intrinsic = NULL;
502
503 if(a == bld->one || b == bld->one)
504 return bld->one;
505
506 if (type.width * type.length == 128 &&
507 !type.floating && !type.fixed) {
508 if(util_cpu_caps.has_sse2) {
509 if(type.width == 8)
510 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
511 if(type.width == 16)
512 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
513 } else if (util_cpu_caps.has_altivec) {
514 if(type.width == 8)
515 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
516 if(type.width == 16)
517 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
518 }
519 }
520
521 if (intrinsic)
522 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
523 }
524
525 if(type.norm && !type.floating && !type.fixed) {
526 if (type.sign) {
527 uint64_t sign = (uint64_t)1 << (type.width - 1);
528 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
529 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
530 /* a_clamp_max is the maximum a for positive b,
531 a_clamp_min is the minimum a for negative b. */
532 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
533 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
534 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
535 } else {
536 a = lp_build_min_simple(bld, a, lp_build_comp(bld, b), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
537 }
538 }
539
540 if(LLVMIsConstant(a) && LLVMIsConstant(b))
541 if (type.floating)
542 res = LLVMConstFAdd(a, b);
543 else
544 res = LLVMConstAdd(a, b);
545 else
546 if (type.floating)
547 res = LLVMBuildFAdd(builder, a, b, "");
548 else
549 res = LLVMBuildAdd(builder, a, b, "");
550
551 /* clamp to ceiling of 1.0 */
552 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
553 res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
554
555 /* XXX clamp to floor of -1 or 0??? */
556
557 return res;
558 }
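
/*
 * Editor's note: an illustrative scalar sketch (not part of the build) of the
 * clamp-then-add scheme lp_build_add uses above for signed normalized types
 * when no saturating intrinsic is available, shown for 16-bit values.
 */
#if 0
static inline int
sadd_sat16_ref(int a, int b)   /* a, b assumed in [-32768, 32767] */
{
   if (b > 0 && a > 32767 - b)
      a = 32767 - b;    /* a_clamp_max: largest a that cannot overflow */
   if (b < 0 && a < -32768 - b)
      a = -32768 - b;   /* a_clamp_min: smallest a that cannot underflow */
   return a + b;        /* the plain add can no longer wrap */
}
#endif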
559
560
561 /** Return the scalar sum of the elements of a.
562 * Should avoid this operation whenever possible.
563 */
564 LLVMValueRef
565 lp_build_horizontal_add(struct lp_build_context *bld,
566 LLVMValueRef a)
567 {
568 LLVMBuilderRef builder = bld->gallivm->builder;
569 const struct lp_type type = bld->type;
570 LLVMValueRef index, res;
571 unsigned i, length;
572 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
573 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
574 LLVMValueRef vecres, elem2;
575
576 assert(lp_check_value(type, a));
577
578 if (type.length == 1) {
579 return a;
580 }
581
582 assert(!bld->type.norm);
583
584 /*
585 * for byte vectors we could do much better with psadbw.
586 * Using repeated shuffle/adds here. Note with multiple vectors
587 * this can be done more efficiently as outlined in the intel
588 * optimization manual.
589 * Note: could cause data rearrangement if used with smaller element
590 * sizes.
591 */
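/*
 * (Editor's illustration: for a 4-wide vector [a b c d] the loop below forms
 * [a b] + [c d] = [a+c b+d], and the final two extracts add (a+c) + (b+d).)
 */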
592
593 vecres = a;
594 length = type.length / 2;
595 while (length > 1) {
596 LLVMValueRef vec1, vec2;
597 for (i = 0; i < length; i++) {
598 shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
599 shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
600 }
601 vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
602 LLVMConstVector(shuffles1, length), "");
603 vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
604 LLVMConstVector(shuffles2, length), "");
605 if (type.floating) {
606 vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
607 }
608 else {
609 vecres = LLVMBuildAdd(builder, vec1, vec2, "");
610 }
611 length = length >> 1;
612 }
613
614 /* always have vector of size 2 here */
615 assert(length == 1);
616
617 index = lp_build_const_int32(bld->gallivm, 0);
618 res = LLVMBuildExtractElement(builder, vecres, index, "");
619 index = lp_build_const_int32(bld->gallivm, 1);
620 elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
621
622 if (type.floating)
623 res = LLVMBuildFAdd(builder, res, elem2, "");
624 else
625 res = LLVMBuildAdd(builder, res, elem2, "");
626
627 return res;
628 }
629
630 /**
631 * Return the horizontal sums of 4 float vectors as a float4 vector.
632 * This uses the technique as outlined in Intel Optimization Manual.
633 */
634 static LLVMValueRef
635 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
636 LLVMValueRef src[4])
637 {
638 struct gallivm_state *gallivm = bld->gallivm;
639 LLVMBuilderRef builder = gallivm->builder;
640 LLVMValueRef shuffles[4];
641 LLVMValueRef tmp[4];
642 LLVMValueRef sumtmp[2], shuftmp[2];
643
644 /* lower half of regs */
645 shuffles[0] = lp_build_const_int32(gallivm, 0);
646 shuffles[1] = lp_build_const_int32(gallivm, 1);
647 shuffles[2] = lp_build_const_int32(gallivm, 4);
648 shuffles[3] = lp_build_const_int32(gallivm, 5);
649 tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
650 LLVMConstVector(shuffles, 4), "");
651 tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
652 LLVMConstVector(shuffles, 4), "");
653
654 /* upper half of regs */
655 shuffles[0] = lp_build_const_int32(gallivm, 2);
656 shuffles[1] = lp_build_const_int32(gallivm, 3);
657 shuffles[2] = lp_build_const_int32(gallivm, 6);
658 shuffles[3] = lp_build_const_int32(gallivm, 7);
659 tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
660 LLVMConstVector(shuffles, 4), "");
661 tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
662 LLVMConstVector(shuffles, 4), "");
663
664 sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
665 sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
666
667 shuffles[0] = lp_build_const_int32(gallivm, 0);
668 shuffles[1] = lp_build_const_int32(gallivm, 2);
669 shuffles[2] = lp_build_const_int32(gallivm, 4);
670 shuffles[3] = lp_build_const_int32(gallivm, 6);
671 shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
672 LLVMConstVector(shuffles, 4), "");
673
674 shuffles[0] = lp_build_const_int32(gallivm, 1);
675 shuffles[1] = lp_build_const_int32(gallivm, 3);
676 shuffles[2] = lp_build_const_int32(gallivm, 5);
677 shuffles[3] = lp_build_const_int32(gallivm, 7);
678 shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
679 LLVMConstVector(shuffles, 4), "");
680
681 return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
682 }
683
684
685 /*
686 * partially horizontally add 2-4 float vectors with length nx4,
687 * i.e. only four adjacent values in each vector will be added,
688 * assuming values are really grouped in 4 which also determines
689 * output order.
690 *
691 * Return a vector of the same length as the initial vectors,
692 * with the excess elements (if any) being undefined.
693 * The element order is independent of number of input vectors.
694 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
695 * the output order thus will be
696 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
697 */
698 LLVMValueRef
699 lp_build_hadd_partial4(struct lp_build_context *bld,
700 LLVMValueRef vectors[],
701 unsigned num_vecs)
702 {
703 struct gallivm_state *gallivm = bld->gallivm;
704 LLVMBuilderRef builder = gallivm->builder;
705 LLVMValueRef ret_vec;
706 LLVMValueRef tmp[4];
707 const char *intrinsic = NULL;
708
709 assert(num_vecs >= 2 && num_vecs <= 4);
710 assert(bld->type.floating);
711
712 /* only use this with at least 2 vectors, as it is sort of expensive
713 * (depending on cpu) and we always need two horizontal adds anyway,
714 * so a shuffle/add approach might be better.
715 */
716
717 tmp[0] = vectors[0];
718 tmp[1] = vectors[1];
719
720 tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
721 tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
722
723 if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
724 bld->type.length == 4) {
725 intrinsic = "llvm.x86.sse3.hadd.ps";
726 }
727 else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
728 bld->type.length == 8) {
729 intrinsic = "llvm.x86.avx.hadd.ps.256";
730 }
731 if (intrinsic) {
732 tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
733 lp_build_vec_type(gallivm, bld->type),
734 tmp[0], tmp[1]);
735 if (num_vecs > 2) {
736 tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
737 lp_build_vec_type(gallivm, bld->type),
738 tmp[2], tmp[3]);
739 }
740 else {
741 tmp[1] = tmp[0];
742 }
743 return lp_build_intrinsic_binary(builder, intrinsic,
744 lp_build_vec_type(gallivm, bld->type),
745 tmp[0], tmp[1]);
746 }
747
748 if (bld->type.length == 4) {
749 ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
750 }
751 else {
752 LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
753 unsigned j;
754 unsigned num_iter = bld->type.length / 4;
755 struct lp_type parttype = bld->type;
756 parttype.length = 4;
757 for (j = 0; j < num_iter; j++) {
758 LLVMValueRef partsrc[4];
759 unsigned i;
760 for (i = 0; i < 4; i++) {
761 partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
762 }
763 partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
764 }
765 ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
766 }
767 return ret_vec;
768 }
769
770 /**
771 * Generate a - b
772 */
773 LLVMValueRef
774 lp_build_sub(struct lp_build_context *bld,
775 LLVMValueRef a,
776 LLVMValueRef b)
777 {
778 LLVMBuilderRef builder = bld->gallivm->builder;
779 const struct lp_type type = bld->type;
780 LLVMValueRef res;
781
782 assert(lp_check_value(type, a));
783 assert(lp_check_value(type, b));
784
785 if(b == bld->zero)
786 return a;
787 if(a == bld->undef || b == bld->undef)
788 return bld->undef;
789 if(a == b)
790 return bld->zero;
791
792 if(bld->type.norm) {
793 const char *intrinsic = NULL;
794
795 if(b == bld->one)
796 return bld->zero;
797
798 if (type.width * type.length == 128 &&
799 !type.floating && !type.fixed) {
800 if (util_cpu_caps.has_sse2) {
801 if(type.width == 8)
802 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
803 if(type.width == 16)
804 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
805 } else if (util_cpu_caps.has_altivec) {
806 if(type.width == 8)
807 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
808 if(type.width == 16)
809 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
810 }
811 }
812
813 if (intrinsic)
814 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
815 }
816
817 if(type.norm && !type.floating && !type.fixed) {
818 if (type.sign) {
819 uint64_t sign = (uint64_t)1 << (type.width - 1);
820 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
821 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
822 /* a_clamp_max is the maximum a for negative b,
823 a_clamp_min is the minimum a for positive b. */
824 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
825 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
826 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
827 } else {
828 a = lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
829 }
830 }
831
832 if(LLVMIsConstant(a) && LLVMIsConstant(b))
833 if (type.floating)
834 res = LLVMConstFSub(a, b);
835 else
836 res = LLVMConstSub(a, b);
837 else
838 if (type.floating)
839 res = LLVMBuildFSub(builder, a, b, "");
840 else
841 res = LLVMBuildSub(builder, a, b, "");
842
843 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
844 res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
845
846 return res;
847 }
848
849
850
851 /**
852 * Normalized multiplication.
853 *
854 * There are several approaches for (using 8-bit normalized multiplication as
855 * an example):
856 *
857 * - alpha plus one
858 *
859 * makes the following approximation to the division (Sree)
860 *
861 * a*b/255 ~= (a*(b + 1)) >> 8
862 *
863 * which is the fastest method that satisfies the following OpenGL criteria of
864 *
865 * 0*0 = 0 and 255*255 = 255
866 *
867 * - geometric series
868 *
869 * takes the geometric series approximation to the division
870 *
871 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
872 *
873 * in this case just the first two terms to fit in 16bit arithmetic
874 *
875 * t/255 ~= (t + (t >> 8)) >> 8
876 *
877 * note that just by itself it doesn't satisfy the OpenGL criteria, as
878 * 255*255 = 254, so the special case b = 255 must be accounted for or roundoff
879 * must be used.
880 *
881 * - geometric series plus rounding
882 *
883 * when using a geometric series division, instead of truncating the result,
884 * use roundoff in the approximation (Jim Blinn):
885 *
886 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
887 *
888 * achieving exact results.
889 *
890 *
891 *
892 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
893 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
894 * @sa Michael Herf, The "double blend trick", May 2000,
895 * http://www.stereopsis.com/doubleblend.html
896 */
897 static LLVMValueRef
898 lp_build_mul_norm(struct gallivm_state *gallivm,
899 struct lp_type wide_type,
900 LLVMValueRef a, LLVMValueRef b)
901 {
902 LLVMBuilderRef builder = gallivm->builder;
903 struct lp_build_context bld;
904 unsigned n;
905 LLVMValueRef half;
906 LLVMValueRef ab;
907
908 assert(!wide_type.floating);
909 assert(lp_check_value(wide_type, a));
910 assert(lp_check_value(wide_type, b));
911
912 lp_build_context_init(&bld, gallivm, wide_type);
913
914 n = wide_type.width / 2;
915 if (wide_type.sign) {
916 --n;
917 }
918
919 /*
920 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
921 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
922 */
923
924 /*
925 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
926 */
927
928 ab = LLVMBuildMul(builder, a, b, "");
929 ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
930
931 /*
932 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
933 */
934
935 half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
936 if (wide_type.sign) {
937 LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
938 LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
939 half = lp_build_select(&bld, sign, minus_half, half);
940 }
941 ab = LLVMBuildAdd(builder, ab, half, "");
942
943 /* Final division */
944 ab = lp_build_shr_imm(&bld, ab, n);
945
946 return ab;
947 }
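
/*
 * Editor's note: an illustrative scalar sketch (not part of the build) of the
 * geometric-series-plus-rounding scheme that lp_build_mul_norm above emits,
 * shown for the unsigned 8-bit case (n = 8).
 */
#if 0
static inline unsigned
mul_unorm8_ref(unsigned a, unsigned b)   /* a, b in [0, 255] */
{
   unsigned t = a * b;   /* intermediate product, fits in 16 bits */
   t += t >> 8;          /* geometric series: t/255 ~= (t + (t >> 8)) >> 8 */
   t += 0x80;            /* roundoff term (Jim Blinn) */
   return t >> 8;        /* final division by 2^8 */
}
#endif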
948
949 /**
950 * Generate a * b
951 */
952 LLVMValueRef
953 lp_build_mul(struct lp_build_context *bld,
954 LLVMValueRef a,
955 LLVMValueRef b)
956 {
957 LLVMBuilderRef builder = bld->gallivm->builder;
958 const struct lp_type type = bld->type;
959 LLVMValueRef shift;
960 LLVMValueRef res;
961
962 assert(lp_check_value(type, a));
963 assert(lp_check_value(type, b));
964
965 if(a == bld->zero)
966 return bld->zero;
967 if(a == bld->one)
968 return b;
969 if(b == bld->zero)
970 return bld->zero;
971 if(b == bld->one)
972 return a;
973 if(a == bld->undef || b == bld->undef)
974 return bld->undef;
975
976 if (!type.floating && !type.fixed && type.norm) {
977 struct lp_type wide_type = lp_wider_type(type);
978 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
979
980 lp_build_unpack2(bld->gallivm, type, wide_type, a, &al, &ah);
981 lp_build_unpack2(bld->gallivm, type, wide_type, b, &bl, &bh);
982
983 /* PMULLW, PSRLW, PADDW */
984 abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
985 abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
986
987 ab = lp_build_pack2(bld->gallivm, wide_type, type, abl, abh);
988
989 return ab;
990 }
991
992 if(type.fixed)
993 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
994 else
995 shift = NULL;
996
997 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
998 if (type.floating)
999 res = LLVMConstFMul(a, b);
1000 else
1001 res = LLVMConstMul(a, b);
1002 if(shift) {
1003 if(type.sign)
1004 res = LLVMConstAShr(res, shift);
1005 else
1006 res = LLVMConstLShr(res, shift);
1007 }
1008 }
1009 else {
1010 if (type.floating)
1011 res = LLVMBuildFMul(builder, a, b, "");
1012 else
1013 res = LLVMBuildMul(builder, a, b, "");
1014 if(shift) {
1015 if(type.sign)
1016 res = LLVMBuildAShr(builder, res, shift, "");
1017 else
1018 res = LLVMBuildLShr(builder, res, shift, "");
1019 }
1020 }
1021
1022 return res;
1023 }
1024
1025
1026 /**
1027 * Small vector x scale multiplication optimization.
1028 */
1029 LLVMValueRef
1030 lp_build_mul_imm(struct lp_build_context *bld,
1031 LLVMValueRef a,
1032 int b)
1033 {
1034 LLVMBuilderRef builder = bld->gallivm->builder;
1035 LLVMValueRef factor;
1036
1037 assert(lp_check_value(bld->type, a));
1038
1039 if(b == 0)
1040 return bld->zero;
1041
1042 if(b == 1)
1043 return a;
1044
1045 if(b == -1)
1046 return lp_build_negate(bld, a);
1047
1048 if(b == 2 && bld->type.floating)
1049 return lp_build_add(bld, a, a);
1050
1051 if(util_is_power_of_two(b)) {
1052 unsigned shift = ffs(b) - 1;
1053
1054 if(bld->type.floating) {
1055 #if 0
1056 /*
1057 * Power of two multiplication by directly manipulating the exponent.
1058 *
1059 * XXX: This might not always be faster, it will introduce a small error
1060 * for multiplication by zero, and it will produce wrong results
1061 * for Inf and NaN.
1062 */
1063 unsigned mantissa = lp_mantissa(bld->type);
1064 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1065 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1066 a = LLVMBuildAdd(builder, a, factor, "");
1067 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1068 return a;
1069 #endif
1070 }
1071 else {
1072 factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1073 return LLVMBuildShl(builder, a, factor, "");
1074 }
1075 }
1076
1077 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1078 return lp_build_mul(bld, a, factor);
1079 }
1080
1081
1082 /**
1083 * Generate a / b
1084 */
1085 LLVMValueRef
1086 lp_build_div(struct lp_build_context *bld,
1087 LLVMValueRef a,
1088 LLVMValueRef b)
1089 {
1090 LLVMBuilderRef builder = bld->gallivm->builder;
1091 const struct lp_type type = bld->type;
1092
1093 assert(lp_check_value(type, a));
1094 assert(lp_check_value(type, b));
1095
1096 if(a == bld->zero)
1097 return bld->zero;
1098 if(a == bld->one && type.floating)
1099 return lp_build_rcp(bld, b);
1100 if(b == bld->zero)
1101 return bld->undef;
1102 if(b == bld->one)
1103 return a;
1104 if(a == bld->undef || b == bld->undef)
1105 return bld->undef;
1106
1107 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1108 if (type.floating)
1109 return LLVMConstFDiv(a, b);
1110 else if (type.sign)
1111 return LLVMConstSDiv(a, b);
1112 else
1113 return LLVMConstUDiv(a, b);
1114 }
1115
1116 if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1117 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
1118 type.floating)
1119 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1120
1121 if (type.floating)
1122 return LLVMBuildFDiv(builder, a, b, "");
1123 else if (type.sign)
1124 return LLVMBuildSDiv(builder, a, b, "");
1125 else
1126 return LLVMBuildUDiv(builder, a, b, "");
1127 }
1128
1129
1130 /**
1131 * Linear interpolation helper.
1132 *
1133 * @param flags LP_BLD_LERP_* flags; LP_BLD_LERP_WIDE_NORMALIZED means we are
1134 * interpolating normalized values encoded in integers twice their natural width.
1135 *
1136 * @sa http://www.stereopsis.com/doubleblend.html
1137 */
1138 static inline LLVMValueRef
1139 lp_build_lerp_simple(struct lp_build_context *bld,
1140 LLVMValueRef x,
1141 LLVMValueRef v0,
1142 LLVMValueRef v1,
1143 unsigned flags)
1144 {
1145 unsigned half_width = bld->type.width/2;
1146 LLVMBuilderRef builder = bld->gallivm->builder;
1147 LLVMValueRef delta;
1148 LLVMValueRef res;
1149
1150 assert(lp_check_value(bld->type, x));
1151 assert(lp_check_value(bld->type, v0));
1152 assert(lp_check_value(bld->type, v1));
1153
1154 delta = lp_build_sub(bld, v1, v0);
1155
1156 if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1157 if (!bld->type.sign) {
1158 if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1159 /*
1160 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1161 * most significant bit to the least significant bit, so that
1162 * later we can just divide by 2**n instead of 2**n - 1.
1163 */
1164
1165 x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1166 }
1167
1168 /* (x * delta) >> n */
1169 res = lp_build_mul(bld, x, delta);
1170 res = lp_build_shr_imm(bld, res, half_width);
1171 } else {
1172 /*
1173 * The rescaling trick above doesn't work for signed numbers, so
1174 * use the 2**n - 1 division approximation in lp_build_mul_norm
1175 * instead.
1176 */
1177 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1178 res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1179 }
1180 } else {
1181 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1182 res = lp_build_mul(bld, x, delta);
1183 }
1184
1185 if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
1186 /*
1187 * At this point both res and v0 only use the lower half of the bits,
1188 * the rest is zero. Instead of add / mask, do add with half wide type.
1189 */
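/*
 * (Editor's note: the carry out of each narrow lane is simply dropped by the
 * lane-wise add, which is what an explicit AND with (1 << half_width) - 1
 * would otherwise have to do.)
 */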
1190 struct lp_type narrow_type;
1191 struct lp_build_context narrow_bld;
1192
1193 memset(&narrow_type, 0, sizeof narrow_type);
1194 narrow_type.sign = bld->type.sign;
1195 narrow_type.width = bld->type.width/2;
1196 narrow_type.length = bld->type.length*2;
1197
1198 lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
1199 res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
1200 v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
1201 res = lp_build_add(&narrow_bld, v0, res);
1202 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
1203 } else {
1204 res = lp_build_add(bld, v0, res);
1205
1206 if (bld->type.fixed) {
1207 /*
1208 * We need to mask out the high order bits when lerping 8bit
1209 * normalized colors stored in 16 bits.
1210 */
1211 /* XXX: This step is necessary for lerping 8bit colors stored in
1212 * 16 bits, but it will be wrong for true fixed point use cases.
1213 * Basically we need a more powerful lp_type, capable of further
1214 * distinguishing the values interpretation from the value storage.
1215 */
1216 LLVMValueRef low_bits;
1217 low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
1218 res = LLVMBuildAnd(builder, res, low_bits, "");
1219 }
1220 }
1221
1222 return res;
1223 }
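
/*
 * Editor's note: an illustrative scalar equivalent (not part of the build) of
 * the unsigned normalized path in lp_build_lerp_simple above, for 8-bit
 * values held in wider lanes with weights that are not prescaled.
 */
#if 0
static inline unsigned
lerp_unorm8_ref(unsigned x, unsigned v0, unsigned v1)   /* all in [0, 255] */
{
   x += x >> 7;   /* rescale the weight from [0, 255] to [0, 256] */
   /* algebraically the same as v0 + ((x * (v1 - v0)) >> 8) built above */
   return (v0 * (256 - x) + v1 * x) >> 8;
}
#endif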
1224
1225
1226 /**
1227 * Linear interpolation.
1228 */
1229 LLVMValueRef
1230 lp_build_lerp(struct lp_build_context *bld,
1231 LLVMValueRef x,
1232 LLVMValueRef v0,
1233 LLVMValueRef v1,
1234 unsigned flags)
1235 {
1236 const struct lp_type type = bld->type;
1237 LLVMValueRef res;
1238
1239 assert(lp_check_value(type, x));
1240 assert(lp_check_value(type, v0));
1241 assert(lp_check_value(type, v1));
1242
1243 assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1244
1245 if (type.norm) {
1246 struct lp_type wide_type;
1247 struct lp_build_context wide_bld;
1248 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1249
1250 assert(type.length >= 2);
1251
1252 /*
1253 * Create a wider integer type, enough to hold the
1254 * intermediate result of the multiplication.
1255 */
1256 memset(&wide_type, 0, sizeof wide_type);
1257 wide_type.sign = type.sign;
1258 wide_type.width = type.width*2;
1259 wide_type.length = type.length/2;
1260
1261 lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1262
1263 lp_build_unpack2(bld->gallivm, type, wide_type, x, &xl, &xh);
1264 lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1265 lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1266
1267 /*
1268 * Lerp both halves.
1269 */
1270
1271 flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1272
1273 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1274 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1275
1276 res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
1277 } else {
1278 res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1279 }
1280
1281 return res;
1282 }
1283
1284
1285 /**
1286 * Bilinear interpolation.
1287 *
1288 * Value indices are in v_{yx}.
1289 */
1290 LLVMValueRef
1291 lp_build_lerp_2d(struct lp_build_context *bld,
1292 LLVMValueRef x,
1293 LLVMValueRef y,
1294 LLVMValueRef v00,
1295 LLVMValueRef v01,
1296 LLVMValueRef v10,
1297 LLVMValueRef v11,
1298 unsigned flags)
1299 {
1300 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1301 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1302 return lp_build_lerp(bld, y, v0, v1, flags);
1303 }
1304
1305
1306 LLVMValueRef
1307 lp_build_lerp_3d(struct lp_build_context *bld,
1308 LLVMValueRef x,
1309 LLVMValueRef y,
1310 LLVMValueRef z,
1311 LLVMValueRef v000,
1312 LLVMValueRef v001,
1313 LLVMValueRef v010,
1314 LLVMValueRef v011,
1315 LLVMValueRef v100,
1316 LLVMValueRef v101,
1317 LLVMValueRef v110,
1318 LLVMValueRef v111,
1319 unsigned flags)
1320 {
1321 LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1322 LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1323 return lp_build_lerp(bld, z, v0, v1, flags);
1324 }
1325
1326
1327 /**
1328 * Generate min(a, b)
1329 * Do checks for special cases but not for nans.
1330 */
1331 LLVMValueRef
1332 lp_build_min(struct lp_build_context *bld,
1333 LLVMValueRef a,
1334 LLVMValueRef b)
1335 {
1336 assert(lp_check_value(bld->type, a));
1337 assert(lp_check_value(bld->type, b));
1338
1339 if(a == bld->undef || b == bld->undef)
1340 return bld->undef;
1341
1342 if(a == b)
1343 return a;
1344
1345 if (bld->type.norm) {
1346 if (!bld->type.sign) {
1347 if (a == bld->zero || b == bld->zero) {
1348 return bld->zero;
1349 }
1350 }
1351 if(a == bld->one)
1352 return b;
1353 if(b == bld->one)
1354 return a;
1355 }
1356
1357 return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1358 }
1359
1360
1361 /**
1362 * Generate min(a, b)
1363 * NaN's are handled according to the behavior specified by the
1364 * nan_behavior argument.
1365 */
1366 LLVMValueRef
1367 lp_build_min_ext(struct lp_build_context *bld,
1368 LLVMValueRef a,
1369 LLVMValueRef b,
1370 enum gallivm_nan_behavior nan_behavior)
1371 {
1372 assert(lp_check_value(bld->type, a));
1373 assert(lp_check_value(bld->type, b));
1374
1375 if(a == bld->undef || b == bld->undef)
1376 return bld->undef;
1377
1378 if(a == b)
1379 return a;
1380
1381 if (bld->type.norm) {
1382 if (!bld->type.sign) {
1383 if (a == bld->zero || b == bld->zero) {
1384 return bld->zero;
1385 }
1386 }
1387 if(a == bld->one)
1388 return b;
1389 if(b == bld->one)
1390 return a;
1391 }
1392
1393 return lp_build_min_simple(bld, a, b, nan_behavior);
1394 }
1395
1396 /**
1397 * Generate max(a, b)
1398 * Do checks for special cases, but NaN behavior is undefined.
1399 */
1400 LLVMValueRef
1401 lp_build_max(struct lp_build_context *bld,
1402 LLVMValueRef a,
1403 LLVMValueRef b)
1404 {
1405 assert(lp_check_value(bld->type, a));
1406 assert(lp_check_value(bld->type, b));
1407
1408 if(a == bld->undef || b == bld->undef)
1409 return bld->undef;
1410
1411 if(a == b)
1412 return a;
1413
1414 if(bld->type.norm) {
1415 if(a == bld->one || b == bld->one)
1416 return bld->one;
1417 if (!bld->type.sign) {
1418 if (a == bld->zero) {
1419 return b;
1420 }
1421 if (b == bld->zero) {
1422 return a;
1423 }
1424 }
1425 }
1426
1427 return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1428 }
1429
1430
1431 /**
1432 * Generate max(a, b)
1433 * Checks for special cases.
1434 * NaN's are handled according to the behavior specified by the
1435 * nan_behavior argument.
1436 */
1437 LLVMValueRef
1438 lp_build_max_ext(struct lp_build_context *bld,
1439 LLVMValueRef a,
1440 LLVMValueRef b,
1441 enum gallivm_nan_behavior nan_behavior)
1442 {
1443 assert(lp_check_value(bld->type, a));
1444 assert(lp_check_value(bld->type, b));
1445
1446 if(a == bld->undef || b == bld->undef)
1447 return bld->undef;
1448
1449 if(a == b)
1450 return a;
1451
1452 if(bld->type.norm) {
1453 if(a == bld->one || b == bld->one)
1454 return bld->one;
1455 if (!bld->type.sign) {
1456 if (a == bld->zero) {
1457 return b;
1458 }
1459 if (b == bld->zero) {
1460 return a;
1461 }
1462 }
1463 }
1464
1465 return lp_build_max_simple(bld, a, b, nan_behavior);
1466 }
1467
1468 /**
1469 * Generate clamp(a, min, max)
1470 * NaN behavior (for any of a, min, max) is undefined.
1471 * Do checks for special cases.
1472 */
1473 LLVMValueRef
1474 lp_build_clamp(struct lp_build_context *bld,
1475 LLVMValueRef a,
1476 LLVMValueRef min,
1477 LLVMValueRef max)
1478 {
1479 assert(lp_check_value(bld->type, a));
1480 assert(lp_check_value(bld->type, min));
1481 assert(lp_check_value(bld->type, max));
1482
1483 a = lp_build_min(bld, a, max);
1484 a = lp_build_max(bld, a, min);
1485 return a;
1486 }
1487
1488
1489 /**
1490 * Generate clamp(a, 0, 1)
1491 * A NaN will get converted to zero.
1492 */
1493 LLVMValueRef
1494 lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
1495 LLVMValueRef a)
1496 {
1497 a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1498 a = lp_build_min(bld, a, bld->one);
1499 return a;
1500 }
1501
1502
1503 /**
1504 * Generate abs(a)
1505 */
1506 LLVMValueRef
1507 lp_build_abs(struct lp_build_context *bld,
1508 LLVMValueRef a)
1509 {
1510 LLVMBuilderRef builder = bld->gallivm->builder;
1511 const struct lp_type type = bld->type;
1512 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1513
1514 assert(lp_check_value(type, a));
1515
1516 if(!type.sign)
1517 return a;
1518
1519 if(type.floating) {
1520 if (0x0306 <= HAVE_LLVM && HAVE_LLVM < 0x0309) {
1521 /* Workaround llvm.org/PR27332 */
1522 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1523 unsigned long long absMask = ~(1ULL << (type.width - 1));
1524 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
1525 a = LLVMBuildBitCast(builder, a, int_vec_type, "");
1526 a = LLVMBuildAnd(builder, a, mask, "");
1527 a = LLVMBuildBitCast(builder, a, vec_type, "");
1528 return a;
1529 } else {
1530 char intrinsic[32];
1531 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
1532 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
1533 }
1534 }
1535
1536 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
1537 switch(type.width) {
1538 case 8:
1539 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1540 case 16:
1541 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1542 case 32:
1543 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1544 }
1545 }
1546 else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
1547 (gallivm_debug & GALLIVM_DEBUG_PERF) &&
1548 (type.width == 8 || type.width == 16 || type.width == 32)) {
1549 debug_printf("%s: inefficient code, should split vectors manually\n",
1550 __FUNCTION__);
1551 }
1552
1553 return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
1554 }
1555
1556
1557 LLVMValueRef
1558 lp_build_negate(struct lp_build_context *bld,
1559 LLVMValueRef a)
1560 {
1561 LLVMBuilderRef builder = bld->gallivm->builder;
1562
1563 assert(lp_check_value(bld->type, a));
1564
1565 if (bld->type.floating)
1566 a = LLVMBuildFNeg(builder, a, "");
1567 else
1568 a = LLVMBuildNeg(builder, a, "");
1569
1570 return a;
1571 }
1572
1573
1574 /** Return -1, 0 or +1 depending on the sign of a */
1575 LLVMValueRef
1576 lp_build_sgn(struct lp_build_context *bld,
1577 LLVMValueRef a)
1578 {
1579 LLVMBuilderRef builder = bld->gallivm->builder;
1580 const struct lp_type type = bld->type;
1581 LLVMValueRef cond;
1582 LLVMValueRef res;
1583
1584 assert(lp_check_value(type, a));
1585
1586 /* Handle non-zero case */
1587 if(!type.sign) {
1588 /* if not zero then sign must be positive */
1589 res = bld->one;
1590 }
1591 else if(type.floating) {
1592 LLVMTypeRef vec_type;
1593 LLVMTypeRef int_type;
1594 LLVMValueRef mask;
1595 LLVMValueRef sign;
1596 LLVMValueRef one;
1597 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1598
1599 int_type = lp_build_int_vec_type(bld->gallivm, type);
1600 vec_type = lp_build_vec_type(bld->gallivm, type);
1601 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1602
1603 /* Take the sign bit and OR it into the constant 1.0 */
1604 sign = LLVMBuildBitCast(builder, a, int_type, "");
1605 sign = LLVMBuildAnd(builder, sign, mask, "");
1606 one = LLVMConstBitCast(bld->one, int_type);
1607 res = LLVMBuildOr(builder, sign, one, "");
1608 res = LLVMBuildBitCast(builder, res, vec_type, "");
1609 }
1610 else
1611 {
1612 /* signed int/norm/fixed point */
1613 /* could use psign with sse3 and appropriate vectors here */
1614 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1615 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1616 res = lp_build_select(bld, cond, bld->one, minus_one);
1617 }
1618
1619 /* Handle zero */
1620 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1621 res = lp_build_select(bld, cond, bld->zero, res);
1622
1623 return res;
1624 }
1625
1626
1627 /**
1628 * Set the sign of float vector 'a' according to 'sign'.
1629 * If sign==0, return abs(a).
1630 * If sign==1, return -abs(a);
1631 * Other values for sign produce undefined results.
1632 */
1633 LLVMValueRef
1634 lp_build_set_sign(struct lp_build_context *bld,
1635 LLVMValueRef a, LLVMValueRef sign)
1636 {
1637 LLVMBuilderRef builder = bld->gallivm->builder;
1638 const struct lp_type type = bld->type;
1639 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1640 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1641 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1642 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1643 ~((unsigned long long) 1 << (type.width - 1)));
1644 LLVMValueRef val, res;
1645
1646 assert(type.floating);
1647 assert(lp_check_value(type, a));
1648
1649 /* val = reinterpret_cast<int>(a) */
1650 val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1651 /* val = val & mask */
1652 val = LLVMBuildAnd(builder, val, mask, "");
1653 /* sign = sign << shift */
1654 sign = LLVMBuildShl(builder, sign, shift, "");
1655 /* res = val | sign */
1656 res = LLVMBuildOr(builder, val, sign, "");
1657 /* res = reinterpret_cast<float>(res) */
1658 res = LLVMBuildBitCast(builder, res, vec_type, "");
1659
1660 return res;
1661 }
1662
1663
1664 /**
1665 * Convert vector of (or scalar) int to vector of (or scalar) float.
1666 */
1667 LLVMValueRef
1668 lp_build_int_to_float(struct lp_build_context *bld,
1669 LLVMValueRef a)
1670 {
1671 LLVMBuilderRef builder = bld->gallivm->builder;
1672 const struct lp_type type = bld->type;
1673 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1674
1675 assert(type.floating);
1676
1677 return LLVMBuildSIToFP(builder, a, vec_type, "");
1678 }
1679
1680 static boolean
1681 arch_rounding_available(const struct lp_type type)
1682 {
1683 if ((util_cpu_caps.has_sse4_1 &&
1684 (type.length == 1 || type.width*type.length == 128)) ||
1685 (util_cpu_caps.has_avx && type.width*type.length == 256))
1686 return TRUE;
1687 else if ((util_cpu_caps.has_altivec &&
1688 (type.width == 32 && type.length == 4)))
1689 return TRUE;
1690
1691 return FALSE;
1692 }
1693
1694 enum lp_build_round_mode
1695 {
1696 LP_BUILD_ROUND_NEAREST = 0,
1697 LP_BUILD_ROUND_FLOOR = 1,
1698 LP_BUILD_ROUND_CEIL = 2,
1699 LP_BUILD_ROUND_TRUNCATE = 3
1700 };
1701
1702 static inline LLVMValueRef
1703 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1704 LLVMValueRef a)
1705 {
1706 LLVMBuilderRef builder = bld->gallivm->builder;
1707 const struct lp_type type = bld->type;
1708 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1709 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1710 const char *intrinsic;
1711 LLVMValueRef res;
1712
1713 assert(type.floating);
1714 /* using the double precision conversions is a bit more complicated */
1715 assert(type.width == 32);
1716
1717 assert(lp_check_value(type, a));
1718 assert(util_cpu_caps.has_sse2);
1719
1720 /* This is relying on MXCSR rounding mode, which should always be nearest. */
1721 if (type.length == 1) {
1722 LLVMTypeRef vec_type;
1723 LLVMValueRef undef;
1724 LLVMValueRef arg;
1725 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1726
1727 vec_type = LLVMVectorType(bld->elem_type, 4);
1728
1729 intrinsic = "llvm.x86.sse.cvtss2si";
1730
1731 undef = LLVMGetUndef(vec_type);
1732
1733 arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
1734
1735 res = lp_build_intrinsic_unary(builder, intrinsic,
1736 ret_type, arg);
1737 }
1738 else {
1739 if (type.width* type.length == 128) {
1740 intrinsic = "llvm.x86.sse2.cvtps2dq";
1741 }
1742 else {
1743 assert(type.width*type.length == 256);
1744 assert(util_cpu_caps.has_avx);
1745
1746 intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
1747 }
1748 res = lp_build_intrinsic_unary(builder, intrinsic,
1749 ret_type, a);
1750 }
1751
1752 return res;
1753 }
1754
1755
1756 /* Round a float vector using the AltiVec vrfin/vrfim/vrfip/vrfiz instructions.
1757 */
1758 static inline LLVMValueRef
1759 lp_build_round_altivec(struct lp_build_context *bld,
1760 LLVMValueRef a,
1761 enum lp_build_round_mode mode)
1762 {
1763 LLVMBuilderRef builder = bld->gallivm->builder;
1764 const struct lp_type type = bld->type;
1765 const char *intrinsic = NULL;
1766
1767 assert(type.floating);
1768
1769 assert(lp_check_value(type, a));
1770 assert(util_cpu_caps.has_altivec);
1771
1772 (void)type;
1773
1774 switch (mode) {
1775 case LP_BUILD_ROUND_NEAREST:
1776 intrinsic = "llvm.ppc.altivec.vrfin";
1777 break;
1778 case LP_BUILD_ROUND_FLOOR:
1779 intrinsic = "llvm.ppc.altivec.vrfim";
1780 break;
1781 case LP_BUILD_ROUND_CEIL:
1782 intrinsic = "llvm.ppc.altivec.vrfip";
1783 break;
1784 case LP_BUILD_ROUND_TRUNCATE:
1785 intrinsic = "llvm.ppc.altivec.vrfiz";
1786 break;
1787 }
1788
1789 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1790 }
1791
1792 static inline LLVMValueRef
1793 lp_build_round_arch(struct lp_build_context *bld,
1794 LLVMValueRef a,
1795 enum lp_build_round_mode mode)
1796 {
1797 if (util_cpu_caps.has_sse4_1) {
1798 LLVMBuilderRef builder = bld->gallivm->builder;
1799 const struct lp_type type = bld->type;
1800 const char *intrinsic_root;
1801 char intrinsic[32];
1802
1803 assert(type.floating);
1804 assert(lp_check_value(type, a));
1805 (void)type;
1806
1807 switch (mode) {
1808 case LP_BUILD_ROUND_NEAREST:
1809 intrinsic_root = "llvm.nearbyint";
1810 break;
1811 case LP_BUILD_ROUND_FLOOR:
1812 intrinsic_root = "llvm.floor";
1813 break;
1814 case LP_BUILD_ROUND_CEIL:
1815 intrinsic_root = "llvm.ceil";
1816 break;
1817 case LP_BUILD_ROUND_TRUNCATE:
1818 intrinsic_root = "llvm.trunc";
1819 break;
1820 }
1821
1822 lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type);
1823 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1824 }
1825 else /* (util_cpu_caps.has_altivec) */
1826 return lp_build_round_altivec(bld, a, mode);
1827 }
1828
1829 /**
1830 * Return the integer part of a float (vector) value (== round toward zero).
1831 * The returned value is a float (vector).
1832 * Ex: trunc(-1.5) = -1.0
1833 */
1834 LLVMValueRef
1835 lp_build_trunc(struct lp_build_context *bld,
1836 LLVMValueRef a)
1837 {
1838 LLVMBuilderRef builder = bld->gallivm->builder;
1839 const struct lp_type type = bld->type;
1840
1841 assert(type.floating);
1842 assert(lp_check_value(type, a));
1843
1844 if (arch_rounding_available(type)) {
1845 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
1846 }
1847 else {
1848 const struct lp_type type = bld->type;
1849 struct lp_type inttype;
1850 struct lp_build_context intbld;
1851 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
1852 LLVMValueRef trunc, res, anosign, mask;
1853 LLVMTypeRef int_vec_type = bld->int_vec_type;
1854 LLVMTypeRef vec_type = bld->vec_type;
1855
1856 assert(type.width == 32); /* might want to handle doubles at some point */
1857
1858 inttype = type;
1859 inttype.floating = 0;
1860 lp_build_context_init(&intbld, bld->gallivm, inttype);
1861
1862 /* round by truncation */
1863 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1864 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
1865
1866 /* mask out sign bit */
1867 anosign = lp_build_abs(bld, a);
1868 /*
1869 * mask out all values if anosign > 2^24
1870 * This should work both for large ints (all rounding is no-op for them
1871 * because such floats are always exact) as well as special cases like
1872 * NaNs, Infs (taking advantage of the fact they use max exponent).
1873 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
1874 */
1875 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1876 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1877 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1878 return lp_build_select(bld, mask, a, res);
1879 }
1880 }
1881
1882
1883 /**
1884 * Return float (vector) rounded to nearest integer (vector). The returned
1885 * value is a float (vector).
1886 * Ex: round(0.9) = 1.0
1887 * Ex: round(-1.5) = -2.0
1888 */
1889 LLVMValueRef
1890 lp_build_round(struct lp_build_context *bld,
1891 LLVMValueRef a)
1892 {
1893 LLVMBuilderRef builder = bld->gallivm->builder;
1894 const struct lp_type type = bld->type;
1895
1896 assert(type.floating);
1897 assert(lp_check_value(type, a));
1898
1899 if (arch_rounding_available(type)) {
1900 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
1901 }
1902 else {
1903 const struct lp_type type = bld->type;
1904 struct lp_type inttype;
1905 struct lp_build_context intbld;
1906 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
1907 LLVMValueRef res, anosign, mask;
1908 LLVMTypeRef int_vec_type = bld->int_vec_type;
1909 LLVMTypeRef vec_type = bld->vec_type;
1910
1911 assert(type.width == 32); /* might want to handle doubles at some point */
1912
1913 inttype = type;
1914 inttype.floating = 0;
1915 lp_build_context_init(&intbld, bld->gallivm, inttype);
1916
1917 res = lp_build_iround(bld, a);
1918 res = LLVMBuildSIToFP(builder, res, vec_type, "");
1919
1920 /* mask out sign bit */
1921 anosign = lp_build_abs(bld, a);
1922 /*
1923 * mask out all values if anosign > 2^24
1924 * This should work both for large ints (all rounding is no-op for them
1925 * because such floats are always exact) as well as special cases like
1926 * NaNs, Infs (taking advantage of the fact they use max exponent).
1927 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
1928 */
1929 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1930 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1931 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1932 return lp_build_select(bld, mask, a, res);
1933 }
1934 }
1935
1936
1937 /**
1938 * Return floor of float (vector), result is a float (vector)
1939 * Ex: floor(1.1) = 1.0
1940 * Ex: floor(-1.1) = -2.0
1941 */
1942 LLVMValueRef
1943 lp_build_floor(struct lp_build_context *bld,
1944 LLVMValueRef a)
1945 {
1946 LLVMBuilderRef builder = bld->gallivm->builder;
1947 const struct lp_type type = bld->type;
1948
1949 assert(type.floating);
1950 assert(lp_check_value(type, a));
1951
1952 if (arch_rounding_available(type)) {
1953 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
1954 }
1955 else {
1956 const struct lp_type type = bld->type;
1957 struct lp_type inttype;
1958 struct lp_build_context intbld;
1959 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
1960 LLVMValueRef trunc, res, anosign, mask;
1961 LLVMTypeRef int_vec_type = bld->int_vec_type;
1962 LLVMTypeRef vec_type = bld->vec_type;
1963
1964 if (type.width != 32) {
1965 char intrinsic[32];
1966 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type);
1967 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
1968 }
1969
1970 assert(type.width == 32); /* might want to handle doubles at some point */
1971
1972 inttype = type;
1973 inttype.floating = 0;
1974 lp_build_context_init(&intbld, bld->gallivm, inttype);
1975
1976 /* round by truncation */
1977 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1978 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
1979
1980 if (type.sign) {
1981 LLVMValueRef tmp;
1982
1983 /*
1984 * fix values if rounding is wrong (for non-special cases)
1985 * - this is the case if trunc > a
1986 */
1987 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
1988 /* tmp = trunc > a ? 1.0 : 0.0 */
1989 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
1990 tmp = lp_build_and(&intbld, mask, tmp);
1991 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
1992 res = lp_build_sub(bld, res, tmp);
1993 }
1994
1995 /* mask out sign bit */
1996 anosign = lp_build_abs(bld, a);
1997 /*
1998 * mask out all values if anosign > 2^24
1999 * This should work both for large ints (all rounding is no-op for them
2000 * because such floats are always exact) as well as special cases like
2001 * NaNs, Infs (taking advantage of the fact they use max exponent).
2002 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2003 */
2004 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2005 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2006 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2007 return lp_build_select(bld, mask, a, res);
2008 }
2009 }
2010
2011
2012 /**
2013 * Return ceiling of float (vector), returning float (vector).
2014 * Ex: ceil( 1.1) = 2.0
2015 * Ex: ceil(-1.1) = -1.0
2016 */
2017 LLVMValueRef
2018 lp_build_ceil(struct lp_build_context *bld,
2019 LLVMValueRef a)
2020 {
2021 LLVMBuilderRef builder = bld->gallivm->builder;
2022 const struct lp_type type = bld->type;
2023
2024 assert(type.floating);
2025 assert(lp_check_value(type, a));
2026
2027 if (arch_rounding_available(type)) {
2028 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2029 }
2030 else {
2031 const struct lp_type type = bld->type;
2032 struct lp_type inttype;
2033 struct lp_build_context intbld;
2034 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2035 LLVMValueRef trunc, res, anosign, mask, tmp;
2036 LLVMTypeRef int_vec_type = bld->int_vec_type;
2037 LLVMTypeRef vec_type = bld->vec_type;
2038
2039 if (type.width != 32) {
2040 char intrinsic[32];
2041 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type);
2042 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2043 }
2044
2045 assert(type.width == 32); /* might want to handle doubles at some point */
2046
2047 inttype = type;
2048 inttype.floating = 0;
2049 lp_build_context_init(&intbld, bld->gallivm, inttype);
2050
2051 /* round by truncation */
2052 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2053 trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2054
2055 /*
2056 * fix values if rounding is wrong (for non-special cases)
2057 * - this is the case if trunc < a
2058 */
2059 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2060 /* tmp = trunc < a ? 1.0 : 0.0 */
2061 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2062 tmp = lp_build_and(&intbld, mask, tmp);
2063 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2064 res = lp_build_add(bld, trunc, tmp);
2065
2066 /* mask out sign bit */
2067 anosign = lp_build_abs(bld, a);
2068 /*
2069 * mask out all values if anosign > 2^24
2070 * This should work both for large ints (all rounding is no-op for them
2071 * because such floats are always exact) as well as special cases like
2072 * NaNs, Infs (taking advantage of the fact they use max exponent).
2073 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2074 */
2075 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2076 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2077 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2078 return lp_build_select(bld, mask, a, res);
2079 }
2080 }
2081
2082
2083 /**
2084 * Return fractional part of 'a' computed as a - floor(a)
2085 * Typically used in texture coord arithmetic.
2086 */
2087 LLVMValueRef
2088 lp_build_fract(struct lp_build_context *bld,
2089 LLVMValueRef a)
2090 {
2091 assert(bld->type.floating);
2092 return lp_build_sub(bld, a, lp_build_floor(bld, a));
2093 }
2094
2095
2096 /**
2097 * Prevent returning 1.0 for very small negative values of 'a' by clamping
2098 * against 0.99999(9). (Will also return that value for NaNs.)
2099 */
2100 static inline LLVMValueRef
2101 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2102 {
2103 LLVMValueRef max;
2104
2105 /* this is the largest number smaller than 1.0 representable as float */
2106 max = lp_build_const_vec(bld->gallivm, bld->type,
2107 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2108 return lp_build_min_ext(bld, fract, max,
2109 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
2110 }
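
/*
 * Editor's note: for the common 32-bit float case (23-bit mantissa) the
 * constant above works out to
 *
 *    1.0 - 1.0/(1 << 24) = 1.0 - 2^-24 = 0.99999994...
 *
 * i.e. the largest float strictly below 1.0 (nextafterf(1.0f, 0.0f)).
 */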
2111
2112
2113 /**
2114 * Same as lp_build_fract, but guarantees that the result is always smaller
2115 * than one. Will also return the smaller-than-one value for infs, NaNs.
2116 */
2117 LLVMValueRef
2118 lp_build_fract_safe(struct lp_build_context *bld,
2119 LLVMValueRef a)
2120 {
2121 return clamp_fract(bld, lp_build_fract(bld, a));
2122 }
2123
2124
2125 /**
2126 * Return the integer part of a float (vector) value (== round toward zero).
2127 * The returned value is an integer (vector).
2128 * Ex: itrunc(-1.5) = -1
2129 */
2130 LLVMValueRef
2131 lp_build_itrunc(struct lp_build_context *bld,
2132 LLVMValueRef a)
2133 {
2134 LLVMBuilderRef builder = bld->gallivm->builder;
2135 const struct lp_type type = bld->type;
2136 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2137
2138 assert(type.floating);
2139 assert(lp_check_value(type, a));
2140
2141 return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2142 }
2143
2144
2145 /**
2146 * Return float (vector) rounded to nearest integer (vector). The returned
2147 * value is an integer (vector).
2148 * Ex: iround(0.9) = 1
2149 * Ex: iround(-1.5) = -2
2150 */
2151 LLVMValueRef
2152 lp_build_iround(struct lp_build_context *bld,
2153 LLVMValueRef a)
2154 {
2155 LLVMBuilderRef builder = bld->gallivm->builder;
2156 const struct lp_type type = bld->type;
2157 LLVMTypeRef int_vec_type = bld->int_vec_type;
2158 LLVMValueRef res;
2159
2160 assert(type.floating);
2161
2162 assert(lp_check_value(type, a));
2163
2164 if ((util_cpu_caps.has_sse2 &&
2165 ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2166 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2167 return lp_build_iround_nearest_sse2(bld, a);
2168 }
2169 if (arch_rounding_available(type)) {
2170 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2171 }
2172 else {
2173 LLVMValueRef half;
2174
2175 half = lp_build_const_vec(bld->gallivm, type, 0.5);
2176
2177 if (type.sign) {
2178 LLVMTypeRef vec_type = bld->vec_type;
2179 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2180 (unsigned long long)1 << (type.width - 1));
2181 LLVMValueRef sign;
2182
2183 /* get sign bit */
2184 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2185 sign = LLVMBuildAnd(builder, sign, mask, "");
2186
2187 /* sign * 0.5 */
2188 half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2189 half = LLVMBuildOr(builder, sign, half, "");
2190 half = LLVMBuildBitCast(builder, half, vec_type, "");
2191 }
2192
2193 res = LLVMBuildFAdd(builder, a, half, "");
2194 }
2195
2196 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2197
2198 return res;
2199 }
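
/*
 * Editor's note: scalar sketch of the generic (non-SSE2, no arch rounding)
 * path above, illustrative only:
 *
 *    int iround_fallback(float a)
 *    {
 *       // bias by 0.5 carrying the sign of 'a' (the code ORs the sign
 *       // bit into 0.5 instead of branching), then truncate
 *       float half = a < 0.0f ? -0.5f : 0.5f;
 *       return (int)(a + half);
 *    }
 *
 * Note this rounds halfway cases away from zero, whereas the SSE2 and
 * arch-rounding paths use the default round-to-nearest-even mode.
 */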
2200
2201
2202 /**
2203 * Return floor of float (vector), result is an int (vector)
2204 * Ex: ifloor(1.1) = 1
2205 * Ex: ifloor(-1.1) = -2
2206 */
2207 LLVMValueRef
2208 lp_build_ifloor(struct lp_build_context *bld,
2209 LLVMValueRef a)
2210 {
2211 LLVMBuilderRef builder = bld->gallivm->builder;
2212 const struct lp_type type = bld->type;
2213 LLVMTypeRef int_vec_type = bld->int_vec_type;
2214 LLVMValueRef res;
2215
2216 assert(type.floating);
2217 assert(lp_check_value(type, a));
2218
2219 res = a;
2220 if (type.sign) {
2221 if (arch_rounding_available(type)) {
2222 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2223 }
2224 else {
2225 struct lp_type inttype;
2226 struct lp_build_context intbld;
2227 LLVMValueRef trunc, itrunc, mask;
2228
2229 assert(type.floating);
2230 assert(lp_check_value(type, a));
2231
2232 inttype = type;
2233 inttype.floating = 0;
2234 lp_build_context_init(&intbld, bld->gallivm, inttype);
2235
2236 /* round by truncation */
2237 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2238 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2239
2240 /*
2241 * fix values if rounding is wrong (for non-special cases)
2242 * - this is the case if trunc > a
2243 * The results of doing this with NaNs, very large values etc.
2244 * are undefined but this seems to be the case anyway.
2245 */
2246 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2247 /* cheapie minus one with mask since the mask is minus one / zero */
2248 return lp_build_add(&intbld, itrunc, mask);
2249 }
2250 }
2251
2252 /* convert to int (toward zero; exact since res is already floored or non-negative) */
2253 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2254
2255 return res;
2256 }
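
/*
 * Editor's note: scalar sketch of the signed fallback above. The compare
 * mask is already ~0 (== -1) or 0, so adding it applies the floor
 * correction without a select (illustrative only):
 *
 *    int ifloor_fallback(float a)
 *    {
 *       int itrunc = (int)a;                 // round toward zero
 *       int fixup  = (float)itrunc > a;      // 1 only for negative non-integers
 *       return itrunc - fixup;               // same as itrunc + (-1 or 0)
 *    }
 */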
2257
2258
2259 /**
2260 * Return ceiling of float (vector), returning int (vector).
2261 * Ex: iceil( 1.1) = 2
2262 * Ex: iceil(-1.1) = -1
2263 */
2264 LLVMValueRef
2265 lp_build_iceil(struct lp_build_context *bld,
2266 LLVMValueRef a)
2267 {
2268 LLVMBuilderRef builder = bld->gallivm->builder;
2269 const struct lp_type type = bld->type;
2270 LLVMTypeRef int_vec_type = bld->int_vec_type;
2271 LLVMValueRef res;
2272
2273 assert(type.floating);
2274 assert(lp_check_value(type, a));
2275
2276 if (arch_rounding_available(type)) {
2277 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2278 }
2279 else {
2280 struct lp_type inttype;
2281 struct lp_build_context intbld;
2282 LLVMValueRef trunc, itrunc, mask;
2283
2284 assert(type.floating);
2285 assert(lp_check_value(type, a));
2286
2287 inttype = type;
2288 inttype.floating = 0;
2289 lp_build_context_init(&intbld, bld->gallivm, inttype);
2290
2291 /* round by truncation */
2292 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2293 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2294
2295 /*
2296 * fix values if rounding is wrong (for non-special cases)
2297 * - this is the case if trunc < a
2298 * The results of doing this with NaNs, very large values etc.
2299 * are undefined but this seems to be the case anyway.
2300 */
2301 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2302 /* cheapie plus one with mask since the mask is minus one / zero */
2303 return lp_build_sub(&intbld, itrunc, mask);
2304 }
2305
2306 /* convert to int (toward zero; exact since res already holds the ceiled value) */
2307 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2308
2309 return res;
2310 }
2311
2312
2313 /**
2314 * Combined ifloor() & fract().
2315 *
2316 * Preferred to calling the functions separately, as it will ensure that the
2317 * strategy (floor() vs ifloor()) that results in less redundant work is used.
2318 */
2319 void
2320 lp_build_ifloor_fract(struct lp_build_context *bld,
2321 LLVMValueRef a,
2322 LLVMValueRef *out_ipart,
2323 LLVMValueRef *out_fpart)
2324 {
2325 LLVMBuilderRef builder = bld->gallivm->builder;
2326 const struct lp_type type = bld->type;
2327 LLVMValueRef ipart;
2328
2329 assert(type.floating);
2330 assert(lp_check_value(type, a));
2331
2332 if (arch_rounding_available(type)) {
2333 /*
2334 * floor() is easier.
2335 */
2336
2337 ipart = lp_build_floor(bld, a);
2338 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2339 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2340 }
2341 else {
2342 /*
2343 * ifloor() is easier.
2344 */
2345
2346 *out_ipart = lp_build_ifloor(bld, a);
2347 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2348 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2349 }
2350 }
2351
2352
2353 /**
2354 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2355 * always smaller than one.
2356 */
2357 void
2358 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2359 LLVMValueRef a,
2360 LLVMValueRef *out_ipart,
2361 LLVMValueRef *out_fpart)
2362 {
2363 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2364 *out_fpart = clamp_fract(bld, *out_fpart);
2365 }
2366
2367
2368 LLVMValueRef
2369 lp_build_sqrt(struct lp_build_context *bld,
2370 LLVMValueRef a)
2371 {
2372 LLVMBuilderRef builder = bld->gallivm->builder;
2373 const struct lp_type type = bld->type;
2374 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2375 char intrinsic[32];
2376
2377 assert(lp_check_value(type, a));
2378
2379 assert(type.floating);
2380 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);
2381
2382 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2383 }
2384
2385
2386 /**
2387 * Do one Newton-Raphson step to improve reciprocal precision:
2388 *
2389 * x_{i+1} = x_i * (2 - a * x_i)
2390 *
2391 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2392 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
2393 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
2394 * halo. It would be necessary to clamp the argument to prevent this.
2395 *
2396 * See also:
2397 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2398 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2399 */
2400 static inline LLVMValueRef
2401 lp_build_rcp_refine(struct lp_build_context *bld,
2402 LLVMValueRef a,
2403 LLVMValueRef rcp_a)
2404 {
2405 LLVMBuilderRef builder = bld->gallivm->builder;
2406 LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
2407 LLVMValueRef res;
2408
2409 res = LLVMBuildFMul(builder, a, rcp_a, "");
2410 res = LLVMBuildFSub(builder, two, res, "");
2411 res = LLVMBuildFMul(builder, rcp_a, res, "");
2412
2413 return res;
2414 }
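
/*
 * Editor's note: the step above is Newton-Raphson applied to
 * f(x) = 1/x - a, whose root is 1/a:
 *
 *    x_{i+1} = x_i - f(x_i)/f'(x_i) = x_i * (2 - a * x_i)
 *
 * Each step roughly doubles the number of correct bits, so a single step
 * on the low-precision RCPPS estimate would already approach full single
 * precision. Illustrative scalar form:
 *
 *    float rcp_refine_scalar(float a, float rcp_a)
 *    {
 *       return rcp_a * (2.0f - a * rcp_a);
 *    }
 */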
2415
2416
2417 LLVMValueRef
2418 lp_build_rcp(struct lp_build_context *bld,
2419 LLVMValueRef a)
2420 {
2421 LLVMBuilderRef builder = bld->gallivm->builder;
2422 const struct lp_type type = bld->type;
2423
2424 assert(lp_check_value(type, a));
2425
2426 if(a == bld->zero)
2427 return bld->undef;
2428 if(a == bld->one)
2429 return bld->one;
2430 if(a == bld->undef)
2431 return bld->undef;
2432
2433 assert(type.floating);
2434
2435 if(LLVMIsConstant(a))
2436 return LLVMConstFDiv(bld->one, a);
2437
2438 /*
2439 * We don't use RCPPS because:
2440 * We don't use RCPPS because:
2440 * - it only has 10 bits of precision
2441 * - it doesn't even get the reciprocal of 1.0 exactly
2442 * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2443 * - for recent processors the benefit over DIVPS is marginal, and case
2444 * dependent
2445 *
2446 * We could still use it on certain processors if benchmarks show that the
2447 * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
2448 * particular uses that require fewer workarounds.
2449 */
2450
2451 if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2452 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2453 const unsigned num_iterations = 0;
2454 LLVMValueRef res;
2455 unsigned i;
2456 const char *intrinsic = NULL;
2457
2458 if (type.length == 4) {
2459 intrinsic = "llvm.x86.sse.rcp.ps";
2460 }
2461 else {
2462 intrinsic = "llvm.x86.avx.rcp.ps.256";
2463 }
2464
2465 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2466
2467 for (i = 0; i < num_iterations; ++i) {
2468 res = lp_build_rcp_refine(bld, a, res);
2469 }
2470
2471 return res;
2472 }
2473
2474 return LLVMBuildFDiv(builder, bld->one, a, "");
2475 }
2476
2477
2478 /**
2479 * Do one Newton-Raphson step to improve rsqrt precision:
2480 *
2481 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2482 *
2483 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2484 */
2485 static inline LLVMValueRef
2486 lp_build_rsqrt_refine(struct lp_build_context *bld,
2487 LLVMValueRef a,
2488 LLVMValueRef rsqrt_a)
2489 {
2490 LLVMBuilderRef builder = bld->gallivm->builder;
2491 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2492 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2493 LLVMValueRef res;
2494
2495 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2496 res = LLVMBuildFMul(builder, a, res, "");
2497 res = LLVMBuildFSub(builder, three, res, "");
2498 res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2499 res = LLVMBuildFMul(builder, half, res, "");
2500
2501 return res;
2502 }
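
/*
 * Editor's note: this is Newton-Raphson applied to f(x) = 1/x^2 - a,
 * whose positive root is 1/sqrt(a):
 *
 *    x_{i+1} = x_i - f(x_i)/f'(x_i) = 0.5 * x_i * (3 - a * x_i * x_i)
 *
 * Illustrative scalar form:
 *
 *    float rsqrt_refine_scalar(float a, float r)
 *    {
 *       return 0.5f * r * (3.0f - a * r * r);
 *    }
 */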
2503
2504
2505 /**
2506 * Generate 1/sqrt(a).
2507 * Result is undefined for values < 0, infinity for +0.
2508 */
2509 LLVMValueRef
2510 lp_build_rsqrt(struct lp_build_context *bld,
2511 LLVMValueRef a)
2512 {
2513 const struct lp_type type = bld->type;
2514
2515 assert(lp_check_value(type, a));
2516
2517 assert(type.floating);
2518
2519 /*
2520 * This should be faster but all denormals will end up as infinity.
2521 */
2522 if (0 && lp_build_fast_rsqrt_available(type)) {
2523 const unsigned num_iterations = 1;
2524 LLVMValueRef res;
2525 unsigned i;
2526
2527 /* rsqrt(1.0) != 1.0 here */
2528 res = lp_build_fast_rsqrt(bld, a);
2529
2530 if (num_iterations) {
2531 /*
2532 * Newton-Raphson will result in NaN instead of infinity for zero,
2533 * and NaN instead of zero for infinity.
2534 * Also, need to ensure rsqrt(1.0) == 1.0.
2535 * All numbers smaller than FLT_MIN will result in +infinity
2536 * (rsqrtps treats all denormals as zero).
2537 */
2538 LLVMValueRef cmp;
2539 LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2540 LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);
2541
2542 for (i = 0; i < num_iterations; ++i) {
2543 res = lp_build_rsqrt_refine(bld, a, res);
2544 }
2545 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2546 res = lp_build_select(bld, cmp, inf, res);
2547 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2548 res = lp_build_select(bld, cmp, bld->zero, res);
2549 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2550 res = lp_build_select(bld, cmp, bld->one, res);
2551 }
2552
2553 return res;
2554 }
2555
2556 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2557 }
2558
2559 /**
2560 * Return whether a fast (but inaccurate) rsqrt instruction is available.
2561 * (A caller may want to avoid rsqrt_fast when it is not available:
2562 * e.g. x^0.5 can be computed as rsqrt_fast(x) * x, but if rsqrt has to
2563 * be emulated that becomes sqrt/div/mul, in which case it is obviously
2564 * better to just call sqrt directly, skipping both the div and the mul.)
2565 */
2566 boolean
2567 lp_build_fast_rsqrt_available(struct lp_type type)
2568 {
2569 assert(type.floating);
2570
2571 if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2572 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2573 return true;
2574 }
2575 return false;
2576 }
2577
2578
2579 /**
2580 * Generate 1/sqrt(a).
2581 * Result is undefined for values < 0, infinity for +0.
2582 * Precision is limited, only ~10 bits guaranteed
2583 * (rsqrt(1.0) may not be 1.0, denorms may be flushed to 0).
2584 */
2585 LLVMValueRef
2586 lp_build_fast_rsqrt(struct lp_build_context *bld,
2587 LLVMValueRef a)
2588 {
2589 LLVMBuilderRef builder = bld->gallivm->builder;
2590 const struct lp_type type = bld->type;
2591
2592 assert(lp_check_value(type, a));
2593
2594 if (lp_build_fast_rsqrt_available(type)) {
2595 const char *intrinsic = NULL;
2596
2597 if (type.length == 4) {
2598 intrinsic = "llvm.x86.sse.rsqrt.ps";
2599 }
2600 else {
2601 intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2602 }
2603 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2604 }
2605 else {
2606 debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2607 }
2608 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2609 }
2610
2611
2612 /**
2613 * Generate sin(a) or cos(a) using polynomial approximation.
2614 * TODO: it might be worth recognizing sin and cos with the same source
2615 * (i.e. the d3d10 sincos opcode). Obviously doing both at the same time
2616 * would be way cheaper than calculating (nearly) everything twice...
2617 * Not sure it's common enough to be worth bothering with, however; the
2618 * scs opcode could also benefit from calculating both.
2619 */
2620 static LLVMValueRef
2621 lp_build_sin_or_cos(struct lp_build_context *bld,
2622 LLVMValueRef a,
2623 boolean cos)
2624 {
2625 struct gallivm_state *gallivm = bld->gallivm;
2626 LLVMBuilderRef b = gallivm->builder;
2627 struct lp_type int_type = lp_int_type(bld->type);
2628
2629 /*
2630 * take the absolute value,
2631 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2632 */
2633
2634 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2635 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2636
2637 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2638 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2639
2640 /*
2641 * scale by 4/Pi
2642 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2643 */
2644
2645 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2646 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2647
2648 /*
2649 * store the integer part of y in mm0
2650 * emm2 = _mm_cvttps_epi32(y);
2651 */
2652
2653 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2654
2655 /*
2656 * j=(j+1) & (~1) (see the cephes sources)
2657 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2658 */
2659
2660 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2661 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2662 /*
2663 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2664 */
2665 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2666 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2667
2668 /*
2669 * y = _mm_cvtepi32_ps(emm2);
2670 */
2671 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2672
2673 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2674 LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2675 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2676 LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2677
2678 /*
2679 * Argument used for poly selection and sign bit determination
2680 * is different for sin vs. cos.
2681 */
2682 LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
2683 emm2_and;
2684
2685 LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
2686 LLVMBuildNot(b, emm2_2, ""), ""),
2687 const_29, "sign_bit") :
2688 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
2689 LLVMBuildShl(b, emm2_add,
2690 const_29, ""), ""),
2691 sign_mask, "sign_bit");
2692
2693 /*
2694 * get the polynomial selection mask
2695 * there is one polynomial for 0 <= x <= Pi/4
2696 * and another one for Pi/4 < x <= Pi/2
2697 * Both branches will be computed.
2698 *
2699 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2700 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2701 */
2702
2703 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
2704 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2705 int_type, PIPE_FUNC_EQUAL,
2706 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2707
2708 /*
2709 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2710 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2711 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2712 */
2713 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2714 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2715 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2716
2717 /*
2718 * The magic pass: "Extended precision modular arithmetic"
2719 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2720 * xmm1 = _mm_mul_ps(y, xmm1);
2721 * xmm2 = _mm_mul_ps(y, xmm2);
2722 * xmm3 = _mm_mul_ps(y, xmm3);
2723 */
2724 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
2725 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
2726 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
2727
2728 /*
2729 * x = _mm_add_ps(x, xmm1);
2730 * x = _mm_add_ps(x, xmm2);
2731 * x = _mm_add_ps(x, xmm3);
2732 */
2733
2734 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
2735 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
2736 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
2737
2738 /*
2739 * Evaluate the first polynomial (0 <= x <= Pi/4)
2740 *
2741 * z = _mm_mul_ps(x,x);
2742 */
2743 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2744
2745 /*
2746 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2747 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2748 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2749 */
2750 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2751 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2752 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2753
2754 /*
2755 * y = *(v4sf*)_ps_coscof_p0;
2756 * y = _mm_mul_ps(y, z);
2757 */
2758 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
2759 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
2760 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
2761 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
2762 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2763 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2764
2765
2766 /*
2767 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2768 * y = _mm_sub_ps(y, tmp);
2769 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2770 */
2771 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2772 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2773 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2774 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2775 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2776
2777 /*
2778 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2779 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2780 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2781 */
2782 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2783 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2784 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2785
2786 /*
2787 * Evaluate the second polynomial (Pi/4 < x <= Pi/2)
2788 *
2789 * y2 = *(v4sf*)_ps_sincof_p0;
2790 * y2 = _mm_mul_ps(y2, z);
2791 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2792 * y2 = _mm_mul_ps(y2, z);
2793 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2794 * y2 = _mm_mul_ps(y2, z);
2795 * y2 = _mm_mul_ps(y2, x);
2796 * y2 = _mm_add_ps(y2, x);
2797 */
2798
2799 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
2800 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
2801 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
2802 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
2803 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2804 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
2805 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
2806
2807 /*
2808 * select the correct result from the two polynomials
2809 * xmm3 = poly_mask;
2810 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2811 * y = _mm_andnot_ps(xmm3, y);
2812 * y = _mm_or_ps(y,y2);
2813 */
2814 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2815 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2816 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2817 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
2818 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2819 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
2820
2821 /*
2822 * update the sign
2823 * y = _mm_xor_ps(y, sign_bit);
2824 */
2825 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
2826 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
2827
2828 LLVMValueRef isfinite = lp_build_isfinite(bld, a);
2829
2830 /* clamp output to be within [-1, 1] */
2831 y_result = lp_build_clamp(bld, y_result,
2832 lp_build_const_vec(bld->gallivm, bld->type, -1.f),
2833 lp_build_const_vec(bld->gallivm, bld->type, 1.f));
2834 /* If a is -inf, inf or NaN then return NaN */
2835 y_result = lp_build_select(bld, isfinite, y_result,
2836 lp_build_const_vec(bld->gallivm, bld->type, NAN));
2837 return y_result;
2838 }
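
/*
 * Editor's note: rough scalar outline of the cephes/sse_mathfun scheme
 * used above (illustrative summary, not exact code):
 *
 *    1. x = |a|; j = (int)(x * 4/Pi); j = (j + 1) & ~1;   // nearest even octant
 *    2. x = ((x - j*DP1) - j*DP2) - j*DP3;                // x - j*Pi/4, extended precision
 *    3. pick either the sin or the cos minimax polynomial in z = x*x
 *       according to the octant (poly_mask above);
 *    4. flip the result's sign bit based on the octant and, for sin, the
 *       sign of the original argument.
 *
 * The final clamp and isfinite select are additions on top of that scheme
 * so that |result| <= 1 and non-finite inputs yield NaN.
 */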
2839
2840
2841 /**
2842 * Generate sin(a)
2843 */
2844 LLVMValueRef
2845 lp_build_sin(struct lp_build_context *bld,
2846 LLVMValueRef a)
2847 {
2848 return lp_build_sin_or_cos(bld, a, FALSE);
2849 }
2850
2851
2852 /**
2853 * Generate cos(a)
2854 */
2855 LLVMValueRef
2856 lp_build_cos(struct lp_build_context *bld,
2857 LLVMValueRef a)
2858 {
2859 return lp_build_sin_or_cos(bld, a, TRUE);
2860 }
2861
2862
2863 /**
2864 * Generate pow(x, y)
2865 */
2866 LLVMValueRef
2867 lp_build_pow(struct lp_build_context *bld,
2868 LLVMValueRef x,
2869 LLVMValueRef y)
2870 {
2871 /* TODO: optimize the constant case */
2872 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2873 LLVMIsConstant(x) && LLVMIsConstant(y)) {
2874 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2875 __FUNCTION__);
2876 }
2877
2878 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
2879 }
2880
2881
2882 /**
2883 * Generate exp(x)
2884 */
2885 LLVMValueRef
2886 lp_build_exp(struct lp_build_context *bld,
2887 LLVMValueRef x)
2888 {
2889 /* log2(e) = 1/log(2) */
2890 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
2891 1.4426950408889634);
2892
2893 assert(lp_check_value(bld->type, x));
2894
2895 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
2896 }
2897
2898
2899 /**
2900 * Generate log(x)
2901 * Behavior is undefined with infs, 0s and nans
2902 */
2903 LLVMValueRef
2904 lp_build_log(struct lp_build_context *bld,
2905 LLVMValueRef x)
2906 {
2907 /* log(2) */
2908 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2909 0.69314718055994529);
2910
2911 assert(lp_check_value(bld->type, x));
2912
2913 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
2914 }
2915
2916 /**
2917 * Generate log(x) that handles edge cases (infs, 0s and nans)
2918 */
2919 LLVMValueRef
2920 lp_build_log_safe(struct lp_build_context *bld,
2921 LLVMValueRef x)
2922 {
2923 /* log(2) */
2924 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2925 0.69314718055994529);
2926
2927 assert(lp_check_value(bld->type, x));
2928
2929 return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
2930 }
2931
2932
2933 /**
2934 * Generate polynomial.
2935 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
2936 */
2937 LLVMValueRef
2938 lp_build_polynomial(struct lp_build_context *bld,
2939 LLVMValueRef x,
2940 const double *coeffs,
2941 unsigned num_coeffs)
2942 {
2943 const struct lp_type type = bld->type;
2944 LLVMValueRef even = NULL, odd = NULL;
2945 LLVMValueRef x2;
2946 unsigned i;
2947
2948 assert(lp_check_value(bld->type, x));
2949
2950 /* TODO: optimize the constant case */
2951 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2952 LLVMIsConstant(x)) {
2953 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2954 __FUNCTION__);
2955 }
2956
2957 /*
2958 * Calculate odd and even terms separately to decrease data dependency
2959 * Ex:
2960 * c[0] + x^2 * c[2] + x^4 * c[4] ...
2961 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
2962 */
2963 x2 = lp_build_mul(bld, x, x);
2964
2965 for (i = num_coeffs; i--; ) {
2966 LLVMValueRef coeff;
2967
2968 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
2969
2970 if (i % 2 == 0) {
2971 if (even)
2972 even = lp_build_add(bld, coeff, lp_build_mul(bld, x2, even));
2973 else
2974 even = coeff;
2975 } else {
2976 if (odd)
2977 odd = lp_build_add(bld, coeff, lp_build_mul(bld, x2, odd));
2978 else
2979 odd = coeff;
2980 }
2981 }
2982
2983 if (odd)
2984 return lp_build_add(bld, lp_build_mul(bld, odd, x), even);
2985 else if (even)
2986 return even;
2987 else
2988 return bld->undef;
2989 }
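
/*
 * Editor's note: illustrative scalar version of the split (even/odd)
 * Horner scheme above, for a 4-coefficient polynomial:
 *
 *    float poly4(float x, const float c[4])
 *    {
 *       float x2   = x * x;
 *       float even = c[0] + x2 * c[2];   // even-degree terms
 *       float odd  = c[1] + x2 * c[3];   // odd-degree terms
 *       return even + x * odd;
 *    }
 *
 * The two chains have no dependency on each other, roughly halving the
 * dependent multiply-add depth compared to a single Horner chain.
 */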
2990
2991
2992 /**
2993 * Minimax polynomial fit of 2**x, in range [0, 1[
2994 */
2995 const double lp_build_exp2_polynomial[] = {
2996 #if EXP_POLY_DEGREE == 5
2997 1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
2998 0.693153073200168932794,
2999 0.240153617044375388211,
3000 0.0558263180532956664775,
3001 0.00898934009049466391101,
3002 0.00187757667519147912699
3003 #elif EXP_POLY_DEGREE == 4
3004 1.00000259337069434683,
3005 0.693003834469974940458,
3006 0.24144275689150793076,
3007 0.0520114606103070150235,
3008 0.0135341679161270268764
3009 #elif EXP_POLY_DEGREE == 3
3010 0.999925218562710312959,
3011 0.695833540494823811697,
3012 0.226067155427249155588,
3013 0.0780245226406372992967
3014 #elif EXP_POLY_DEGREE == 2
3015 1.00172476321474503578,
3016 0.657636275736077639316,
3017 0.33718943461968720704
3018 #else
3019 #error
3020 #endif
3021 };
3022
3023
3024 LLVMValueRef
3025 lp_build_exp2(struct lp_build_context *bld,
3026 LLVMValueRef x)
3027 {
3028 LLVMBuilderRef builder = bld->gallivm->builder;
3029 const struct lp_type type = bld->type;
3030 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3031 LLVMValueRef ipart = NULL;
3032 LLVMValueRef fpart = NULL;
3033 LLVMValueRef expipart = NULL;
3034 LLVMValueRef expfpart = NULL;
3035 LLVMValueRef res = NULL;
3036
3037 assert(lp_check_value(bld->type, x));
3038
3039 /* TODO: optimize the constant case */
3040 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3041 LLVMIsConstant(x)) {
3042 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3043 __FUNCTION__);
3044 }
3045
3046 assert(type.floating && type.width == 32);
3047
3048 /* We want to preserve NaN and make sure that for exp2 if x > 128,
3049 * the result is INF and if it's smaller than -126.9 the result is 0 */
3050 x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
3051 GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3052 x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
3053 x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3054
3055 /* ipart = floor(x) */
3056 /* fpart = x - ipart */
3057 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3058
3059 /* expipart = (float) (1 << ipart) */
3060 expipart = LLVMBuildAdd(builder, ipart,
3061 lp_build_const_int_vec(bld->gallivm, type, 127), "");
3062 expipart = LLVMBuildShl(builder, expipart,
3063 lp_build_const_int_vec(bld->gallivm, type, 23), "");
3064 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
3065
3066 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3067 ARRAY_SIZE(lp_build_exp2_polynomial));
3068
3069 res = LLVMBuildFMul(builder, expipart, expfpart, "");
3070
3071 return res;
3072 }
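
/*
 * Editor's note: scalar sketch of the decomposition above, assuming
 * 32-bit floats and x already clamped to [-126.99999, 128]; 'poly' is a
 * hypothetical helper evaluating lp_build_exp2_polynomial (illustrative
 * only):
 *
 *    float exp2_sketch(float x)
 *    {
 *       int   ipart = (int)floorf(x);
 *       float fpart = x - (float)ipart;               // in [0, 1)
 *       union { uint32_t u; float f; } pun;
 *       pun.u = (uint32_t)(ipart + 127) << 23;        // 2^ipart, built directly
 *       float expipart = pun.f;                       //   in the exponent field
 *       float expfpart = poly(fpart);                 // 2^fpart, minimax fit
 *       return expipart * expfpart;
 *    }
 */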
3073
3074
3075
3076 /**
3077 * Extract the exponent of an IEEE-754 floating point value.
3078 *
3079 * Optionally apply an integer bias.
3080 *
3081 * Result is an integer value with
3082 *
3083 * ifloor(log2(x)) + bias
3084 */
3085 LLVMValueRef
3086 lp_build_extract_exponent(struct lp_build_context *bld,
3087 LLVMValueRef x,
3088 int bias)
3089 {
3090 LLVMBuilderRef builder = bld->gallivm->builder;
3091 const struct lp_type type = bld->type;
3092 unsigned mantissa = lp_mantissa(type);
3093 LLVMValueRef res;
3094
3095 assert(type.floating);
3096
3097 assert(lp_check_value(bld->type, x));
3098
3099 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3100
3101 res = LLVMBuildLShr(builder, x,
3102 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3103 res = LLVMBuildAnd(builder, res,
3104 lp_build_const_int_vec(bld->gallivm, type, 255), "");
3105 res = LLVMBuildSub(builder, res,
3106 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3107
3108 return res;
3109 }
3110
3111
3112 /**
3113 * Extract the mantissa of a floating point value.
3114 *
3115 * Result is a floating point value with
3116 *
3117 * x / 2^floor(log2(x))
3118 */
3119 LLVMValueRef
3120 lp_build_extract_mantissa(struct lp_build_context *bld,
3121 LLVMValueRef x)
3122 {
3123 LLVMBuilderRef builder = bld->gallivm->builder;
3124 const struct lp_type type = bld->type;
3125 unsigned mantissa = lp_mantissa(type);
3126 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3127 (1ULL << mantissa) - 1);
3128 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3129 LLVMValueRef res;
3130
3131 assert(lp_check_value(bld->type, x));
3132
3133 assert(type.floating);
3134
3135 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3136
3137 /* res = x / 2**ipart */
3138 res = LLVMBuildAnd(builder, x, mantmask, "");
3139 res = LLVMBuildOr(builder, res, one, "");
3140 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3141
3142 return res;
3143 }
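
/*
 * Editor's note: worked example for the two helpers above, using
 * x = 12.0f (bit pattern 0x41400000):
 *
 *    exponent field: (0x41400000 >> 23) & 0xff = 0x82 = 130
 *    lp_build_extract_exponent(x, 0) -> 130 - 127 = 3        (floor(log2(12)) == 3)
 *    lp_build_extract_mantissa(x)    -> mantissa bits | bits of 1.0
 *                                    -> 0x3fc00000 = 1.5f    (12 / 2^3 == 1.5)
 */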
3144
3145
3146
3147 /**
3148 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in the range [0, 1/9[
3149 * These coefficients can be generated with
3150 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3151 */
3152 const double lp_build_log2_polynomial[] = {
3153 #if LOG_POLY_DEGREE == 5
3154 2.88539008148777786488L,
3155 0.961796878841293367824L,
3156 0.577058946784739859012L,
3157 0.412914355135828735411L,
3158 0.308591899232910175289L,
3159 0.352376952300281371868L,
3160 #elif LOG_POLY_DEGREE == 4
3161 2.88539009343309178325L,
3162 0.961791550404184197881L,
3163 0.577440339438736392009L,
3164 0.403343858251329912514L,
3165 0.406718052498846252698L,
3166 #elif LOG_POLY_DEGREE == 3
3167 2.88538959748872753838L,
3168 0.961932915889597772928L,
3169 0.571118517972136195241L,
3170 0.493997535084709500285L,
3171 #else
3172 #error
3173 #endif
3174 };
3175
3176 /**
3177 * See http://www.devmaster.net/forums/showthread.php?p=43580
3178 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3179 * http://www.nezumi.demon.co.uk/consult/logx.htm
3180 *
3181 * If handle_edge_cases is true the function will perform computations
3182 * to match the required D3D10+ behavior for each of the edge cases.
3183 * That means that if input is:
3184 * - less than zero (down to and including -inf), then NaN will be returned
3185 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3186 * - +infinity, then +infinity will be returned
3187 * - NaN, then NaN will be returned
3188 *
3189 * Those checks are fairly expensive so if you don't need them make sure
3190 * handle_edge_cases is false.
3191 */
3192 void
3193 lp_build_log2_approx(struct lp_build_context *bld,
3194 LLVMValueRef x,
3195 LLVMValueRef *p_exp,
3196 LLVMValueRef *p_floor_log2,
3197 LLVMValueRef *p_log2,
3198 boolean handle_edge_cases)
3199 {
3200 LLVMBuilderRef builder = bld->gallivm->builder;
3201 const struct lp_type type = bld->type;
3202 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3203 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3204
3205 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3206 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3207 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3208
3209 LLVMValueRef i = NULL;
3210 LLVMValueRef y = NULL;
3211 LLVMValueRef z = NULL;
3212 LLVMValueRef exp = NULL;
3213 LLVMValueRef mant = NULL;
3214 LLVMValueRef logexp = NULL;
3215 LLVMValueRef logmant = NULL;
3216 LLVMValueRef res = NULL;
3217
3218 assert(lp_check_value(bld->type, x));
3219
3220 if(p_exp || p_floor_log2 || p_log2) {
3221 /* TODO: optimize the constant case */
3222 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3223 LLVMIsConstant(x)) {
3224 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3225 __FUNCTION__);
3226 }
3227
3228 assert(type.floating && type.width == 32);
3229
3230 /*
3231 * We don't explicitly handle denormalized numbers. They will yield a
3232 * result in the neighbourhood of -127, which appears to be adequate
3233 * enough.
3234 */
3235
3236 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3237
3238 /* exp = (float) exponent(x) */
3239 exp = LLVMBuildAnd(builder, i, expmask, "");
3240 }
3241
3242 if(p_floor_log2 || p_log2) {
3243 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3244 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3245 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3246 }
3247
3248 if (p_log2) {
3249 /* mant = 1 + (float) mantissa(x) */
3250 mant = LLVMBuildAnd(builder, i, mantmask, "");
3251 mant = LLVMBuildOr(builder, mant, one, "");
3252 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3253
3254 /* y = (mant - 1) / (mant + 1) */
3255 y = lp_build_div(bld,
3256 lp_build_sub(bld, mant, bld->one),
3257 lp_build_add(bld, mant, bld->one)
3258 );
3259
3260 /* z = y^2 */
3261 z = lp_build_mul(bld, y, y);
3262
3263 /* compute P(z) */
3264 logmant = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3265 ARRAY_SIZE(lp_build_log2_polynomial));
3266
3267 /* logmant = y * P(z) */
3268 logmant = lp_build_mul(bld, y, logmant);
3269
3270 res = lp_build_add(bld, logmant, logexp);
3271
3272 if (type.floating && handle_edge_cases) {
3273 LLVMValueRef negmask, infmask, zmask;
3274 negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3275 lp_build_const_vec(bld->gallivm, type, 0.0f));
3276 zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3277 lp_build_const_vec(bld->gallivm, type, 0.0f));
3278 infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3279 lp_build_const_vec(bld->gallivm, type, INFINITY));
3280
3281 /* If x is equal to inf make sure we return inf */
3282 res = lp_build_select(bld, infmask,
3283 lp_build_const_vec(bld->gallivm, type, INFINITY),
3284 res);
3285 /* If x is equal to 0, return -inf */
3286 res = lp_build_select(bld, zmask,
3287 lp_build_const_vec(bld->gallivm, type, -INFINITY),
3288 res);
3289 /* If x is nan or less than 0, return nan */
3290 res = lp_build_select(bld, negmask,
3291 lp_build_const_vec(bld->gallivm, type, NAN),
3292 res);
3293 }
3294 }
3295
3296 if (p_exp) {
3297 exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3298 *p_exp = exp;
3299 }
3300
3301 if (p_floor_log2)
3302 *p_floor_log2 = logexp;
3303
3304 if (p_log2)
3305 *p_log2 = res;
3306 }
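
/*
 * Editor's note: the identity behind the mantissa computation above,
 * written out. With x = 2^e * m, m in [1, 2):
 *
 *    log2(x) = e + log2(m)
 *    log2(m) = (2/ln 2) * atanh(y) = y * P(y^2),   y = (m - 1)/(m + 1)
 *
 * where y^2 lies in [0, 1/9) and P is lp_build_log2_polynomial (note its
 * leading coefficient ~2.8854 == 2/ln 2).
 */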
3307
3308
3309 /*
3310 * log2 implementation which doesn't have special code to
3311 * handle edge cases (-inf, 0, inf, NaN). It's faster but
3312 * the results for those cases are undefined.
3313 */
3314 LLVMValueRef
3315 lp_build_log2(struct lp_build_context *bld,
3316 LLVMValueRef x)
3317 {
3318 LLVMValueRef res;
3319 lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
3320 return res;
3321 }
3322
3323 /*
3324 * Version of log2 which handles all edge cases.
3325 * Look at documentation of lp_build_log2_approx for
3326 * description of the behavior for each of the edge cases.
3327 */
3328 LLVMValueRef
3329 lp_build_log2_safe(struct lp_build_context *bld,
3330 LLVMValueRef x)
3331 {
3332 LLVMValueRef res;
3333 lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
3334 return res;
3335 }
3336
3337
3338 /**
3339 * Faster (and less accurate) log2.
3340 *
3341 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3342 *
3343 * Piece-wise linear approximation, with exact results when x is a
3344 * power of two.
3345 *
3346 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3347 */
3348 LLVMValueRef
3349 lp_build_fast_log2(struct lp_build_context *bld,
3350 LLVMValueRef x)
3351 {
3352 LLVMBuilderRef builder = bld->gallivm->builder;
3353 LLVMValueRef ipart;
3354 LLVMValueRef fpart;
3355
3356 assert(lp_check_value(bld->type, x));
3357
3358 assert(bld->type.floating);
3359
3360 /* ipart = floor(log2(x)) - 1 */
3361 ipart = lp_build_extract_exponent(bld, x, -1);
3362 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3363
3364 /* fpart = x / 2**ipart */
3365 fpart = lp_build_extract_mantissa(bld, x);
3366
3367 /* ipart + fpart */
3368 return LLVMBuildFAdd(builder, ipart, fpart, "");
3369 }
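
/*
 * Editor's note: worked example of the approximation above, for
 * x = 12.0f:
 *
 *    ipart = floor(log2(12)) - 1 = 2
 *    fpart = 12 / 2^floor(log2(12)) = 12 / 8 = 1.5
 *    result = 3.5      (exact log2(12) ~= 3.585)
 *
 * For a power of two, e.g. x = 8.0f, fpart == 1.0 and the result is exact.
 */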
3370
3371
3372 /**
3373 * Fast implementation of iround(log2(x)).
3374 *
3375 * Not an approximation -- it should give accurate results all the time.
3376 */
3377 LLVMValueRef
3378 lp_build_ilog2(struct lp_build_context *bld,
3379 LLVMValueRef x)
3380 {
3381 LLVMBuilderRef builder = bld->gallivm->builder;
3382 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3383 LLVMValueRef ipart;
3384
3385 assert(bld->type.floating);
3386
3387 assert(lp_check_value(bld->type, x));
3388
3389 /* x * 2^0.5, i.e. add 0.5 to log2(x) */
3390 x = LLVMBuildFMul(builder, x, sqrt2, "");
3391
3392 /* ipart = floor(log2(x) + 0.5) */
3393 ipart = lp_build_extract_exponent(bld, x, 0);
3394
3395 return ipart;
3396 }
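
/*
 * Editor's note: why the sqrt(2) scaling gives round-to-nearest. Exponent
 * extraction alone yields floor(log2(x)); scaling x by 2^0.5 shifts log2
 * by 0.5, so
 *
 *    floor(log2(x * sqrt(2))) = floor(log2(x) + 0.5) = iround(log2(x))
 *
 * e.g. x = 5.0: log2(5) ~= 2.32, floor(2.32 + 0.5) = 2
 *      x = 6.0: log2(6) ~= 2.58, floor(2.58 + 0.5) = 3
 */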
3397
3398 LLVMValueRef
3399 lp_build_mod(struct lp_build_context *bld,
3400 LLVMValueRef x,
3401 LLVMValueRef y)
3402 {
3403 LLVMBuilderRef builder = bld->gallivm->builder;
3404 LLVMValueRef res;
3405 const struct lp_type type = bld->type;
3406
3407 assert(lp_check_value(type, x));
3408 assert(lp_check_value(type, y));
3409
3410 if (type.floating)
3411 res = LLVMBuildFRem(builder, x, y, "");
3412 else if (type.sign)
3413 res = LLVMBuildSRem(builder, x, y, "");
3414 else
3415 res = LLVMBuildURem(builder, x, y, "");
3416 return res;
3417 }
3418
3419
3420 /*
3421 * For floating inputs it creates and returns a mask
3422 * which is all 1's for channels which are NaN.
3423 * Channels inside x which are not NaN will be 0.
3424 */
3425 LLVMValueRef
3426 lp_build_isnan(struct lp_build_context *bld,
3427 LLVMValueRef x)
3428 {
3429 LLVMValueRef mask;
3430 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3431
3432 assert(bld->type.floating);
3433 assert(lp_check_value(bld->type, x));
3434
3435 mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3436 "isnotnan");
3437 mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3438 mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3439 return mask;
3440 }
3441
3442 /* Returns all 1's for floating point values that are
3443 * finite, and returns all zeros for -inf,
3444 * +inf and NaN. */
3445 LLVMValueRef
3446 lp_build_isfinite(struct lp_build_context *bld,
3447 LLVMValueRef x)
3448 {
3449 LLVMBuilderRef builder = bld->gallivm->builder;
3450 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3451 struct lp_type int_type = lp_int_type(bld->type);
3452 LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3453 LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3454 0x7f800000);
3455
3456 if (!bld->type.floating) {
3457 return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3458 }
3459 assert(bld->type.floating);
3460 assert(lp_check_value(bld->type, x));
3461 assert(bld->type.width == 32);
3462
3463 intx = LLVMBuildAnd(builder, intx, infornan32, "");
3464 return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3465 intx, infornan32);
3466 }
3467
3468 /*
3469 * Returns true if the number is nan or inf and false otherwise.
3470 * The input has to be a floating point vector.
3471 */
3472 LLVMValueRef
3473 lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
3474 const struct lp_type type,
3475 LLVMValueRef x)
3476 {
3477 LLVMBuilderRef builder = gallivm->builder;
3478 struct lp_type int_type = lp_int_type(type);
3479 LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
3480 0x7f800000);
3481 LLVMValueRef ret;
3482
3483 assert(type.floating);
3484
3485 ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
3486 ret = LLVMBuildAnd(builder, ret, const0, "");
3487 ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
3488 ret, const0);
3489
3490 return ret;
3491 }
3492
3493
3494 LLVMValueRef
3495 lp_build_fpstate_get(struct gallivm_state *gallivm)
3496 {
3497 if (util_cpu_caps.has_sse) {
3498 LLVMBuilderRef builder = gallivm->builder;
3499 LLVMValueRef mxcsr_ptr = lp_build_alloca(
3500 gallivm,
3501 LLVMInt32TypeInContext(gallivm->context),
3502 "mxcsr_ptr");
3503 LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
3504 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3505 lp_build_intrinsic(builder,
3506 "llvm.x86.sse.stmxcsr",
3507 LLVMVoidTypeInContext(gallivm->context),
3508 &mxcsr_ptr8, 1, 0);
3509 return mxcsr_ptr;
3510 }
3511 return 0;
3512 }
3513
3514 void
3515 lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
3516 boolean zero)
3517 {
3518 if (util_cpu_caps.has_sse) {
3519 /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
3520 int daz_ftz = _MM_FLUSH_ZERO_MASK;
3521
3522 LLVMBuilderRef builder = gallivm->builder;
3523 LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
3524 LLVMValueRef mxcsr =
3525 LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");
3526
3527 if (util_cpu_caps.has_daz) {
3528 /* Enable denormals-are-zero mode */
3529 daz_ftz |= _MM_DENORMALS_ZERO_MASK;
3530 }
3531 if (zero) {
3532 mxcsr = LLVMBuildOr(builder, mxcsr,
3533 LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
3534 } else {
3535 mxcsr = LLVMBuildAnd(builder, mxcsr,
3536 LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
3537 }
3538
3539 LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
3540 lp_build_fpstate_set(gallivm, mxcsr_ptr);
3541 }
3542 }
3543
3544 void
3545 lp_build_fpstate_set(struct gallivm_state *gallivm,
3546 LLVMValueRef mxcsr_ptr)
3547 {
3548 if (util_cpu_caps.has_sse) {
3549 LLVMBuilderRef builder = gallivm->builder;
3550 mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
3551 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3552 lp_build_intrinsic(builder,
3553 "llvm.x86.sse.ldmxcsr",
3554 LLVMVoidTypeInContext(gallivm->context),
3555 &mxcsr_ptr, 1, 0);
3556 }
3557 }
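
/*
 * Editor's note: a minimal usage sketch for the three MXCSR helpers above
 * (illustrative; this mirrors the typical save / set / restore pattern a
 * caller would emit around FP-heavy generated code):
 *
 *    LLVMValueRef saved = lp_build_fpstate_get(gallivm);
 *    lp_build_fpstate_set_denorms_zero(gallivm, TRUE);
 *    ... emit the code that should run with FTZ/DAZ enabled ...
 *    lp_build_fpstate_set(gallivm, saved);
 *
 * FTZ is MXCSR bit 0x8000 and DAZ is 0x0040, so enabling both sets
 * 0x8040 == 32832, matching the comment in
 * lp_build_fpstate_set_denorms_zero().
 */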