gallivm: Use llvm.fabs.
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in the [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include <float.h>
49
50 #include "util/u_memory.h"
51 #include "util/u_debug.h"
52 #include "util/u_math.h"
53 #include "util/u_string.h"
54 #include "util/u_cpu_detect.h"
55
56 #include "lp_bld_type.h"
57 #include "lp_bld_const.h"
58 #include "lp_bld_init.h"
59 #include "lp_bld_intr.h"
60 #include "lp_bld_logic.h"
61 #include "lp_bld_pack.h"
62 #include "lp_bld_debug.h"
63 #include "lp_bld_bitarit.h"
64 #include "lp_bld_arit.h"
65 #include "lp_bld_flow.h"
66
67 #if defined(PIPE_ARCH_SSE)
68 #include <xmmintrin.h>
69 #endif
70
71 #ifndef _MM_DENORMALS_ZERO_MASK
72 #define _MM_DENORMALS_ZERO_MASK 0x0040
73 #endif
74
75 #ifndef _MM_FLUSH_ZERO_MASK
76 #define _MM_FLUSH_ZERO_MASK 0x8000
77 #endif
78
79 #define EXP_POLY_DEGREE 5
80
81 #define LOG_POLY_DEGREE 4
82
83
84 /**
85 * Generate min(a, b)
86 * No checks for special case values (a or b equal to 0 or 1) are done.
87 * NaN's are handled according to the behavior specified by the
88 * nan_behavior argument.
89 */
90 static LLVMValueRef
91 lp_build_min_simple(struct lp_build_context *bld,
92 LLVMValueRef a,
93 LLVMValueRef b,
94 enum gallivm_nan_behavior nan_behavior)
95 {
96 const struct lp_type type = bld->type;
97 const char *intrinsic = NULL;
98 unsigned intr_size = 0;
99 LLVMValueRef cond;
100
101 assert(lp_check_value(type, a));
102 assert(lp_check_value(type, b));
103
104 /* TODO: optimize the constant case */
105
106 if (type.floating && util_cpu_caps.has_sse) {
107 if (type.width == 32) {
108 if (type.length == 1) {
109 intrinsic = "llvm.x86.sse.min.ss";
110 intr_size = 128;
111 }
112 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
113 intrinsic = "llvm.x86.sse.min.ps";
114 intr_size = 128;
115 }
116 else {
117 intrinsic = "llvm.x86.avx.min.ps.256";
118 intr_size = 256;
119 }
120 }
121 if (type.width == 64 && util_cpu_caps.has_sse2) {
122 if (type.length == 1) {
123 intrinsic = "llvm.x86.sse2.min.sd";
124 intr_size = 128;
125 }
126 else if (type.length == 2 || !util_cpu_caps.has_avx) {
127 intrinsic = "llvm.x86.sse2.min.pd";
128 intr_size = 128;
129 }
130 else {
131 intrinsic = "llvm.x86.avx.min.pd.256";
132 intr_size = 256;
133 }
134 }
135 }
136 else if (type.floating && util_cpu_caps.has_altivec) {
137 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
138 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
139 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
140 __FUNCTION__);
141 }
142 if (type.width == 32 && type.length == 4) {
143 intrinsic = "llvm.ppc.altivec.vminfp";
144 intr_size = 128;
145 }
146 } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
147 intr_size = 128;
148 if ((type.width == 8 || type.width == 16) &&
149 (type.width * type.length <= 64) &&
150 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
151 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
152 __FUNCTION__);
153 }
154 if (type.width == 8 && !type.sign) {
155 intrinsic = "llvm.x86.sse2.pminu.b";
156 }
157 else if (type.width == 16 && type.sign) {
158 intrinsic = "llvm.x86.sse2.pmins.w";
159 }
160 if (util_cpu_caps.has_sse4_1) {
161 if (type.width == 8 && type.sign) {
162 intrinsic = "llvm.x86.sse41.pminsb";
163 }
164 if (type.width == 16 && !type.sign) {
165 intrinsic = "llvm.x86.sse41.pminuw";
166 }
167 if (type.width == 32 && !type.sign) {
168 intrinsic = "llvm.x86.sse41.pminud";
169 }
170 if (type.width == 32 && type.sign) {
171 intrinsic = "llvm.x86.sse41.pminsd";
172 }
173 }
174 } else if (util_cpu_caps.has_altivec) {
175 intr_size = 128;
176 if (type.width == 8) {
177 if (!type.sign) {
178 intrinsic = "llvm.ppc.altivec.vminub";
179 } else {
180 intrinsic = "llvm.ppc.altivec.vminsb";
181 }
182 } else if (type.width == 16) {
183 if (!type.sign) {
184 intrinsic = "llvm.ppc.altivec.vminuh";
185 } else {
186 intrinsic = "llvm.ppc.altivec.vminsh";
187 }
188 } else if (type.width == 32) {
189 if (!type.sign) {
190 intrinsic = "llvm.ppc.altivec.vminuw";
191 } else {
192 intrinsic = "llvm.ppc.altivec.vminsw";
193 }
194 }
195 }
196
197 if (intrinsic) {
198 /* We need to handle NaNs for floating point numbers. If one of the
199 * inputs is a NaN the other should be returned (required by both D3D10+
200 * and OpenCL).
201 * The SSE intrinsics return the second operand when either input is a NaN,
202 * so we need special code to handle those cases.
203 */
204 if (util_cpu_caps.has_sse && type.floating &&
205 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
206 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
207 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
208 LLVMValueRef isnan, min;
209 min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
210 type,
211 intr_size, a, b);
212 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
213 isnan = lp_build_isnan(bld, b);
214 return lp_build_select(bld, isnan, a, min);
215 } else {
216 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
217 isnan = lp_build_isnan(bld, a);
218 return lp_build_select(bld, isnan, a, min);
219 }
220 } else {
221 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
222 type,
223 intr_size, a, b);
224 }
225 }
226
227 if (type.floating) {
228 switch (nan_behavior) {
229 case GALLIVM_NAN_RETURN_NAN: {
230 LLVMValueRef isnan = lp_build_isnan(bld, b);
231 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
232 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
233 return lp_build_select(bld, cond, a, b);
234 }
235 break;
236 case GALLIVM_NAN_RETURN_OTHER: {
237 LLVMValueRef isnan = lp_build_isnan(bld, a);
238 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
239 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
240 return lp_build_select(bld, cond, a, b);
241 }
242 break;
243 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
244 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
245 return lp_build_select(bld, cond, a, b);
246 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
247 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
248 return lp_build_select(bld, cond, b, a);
249 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
250 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
251 return lp_build_select(bld, cond, a, b);
252 break;
253 default:
254 assert(0);
255 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
256 return lp_build_select(bld, cond, a, b);
257 }
258 } else {
259 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
260 return lp_build_select(bld, cond, a, b);
261 }
262 }
263
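/*
 * A minimal scalar sketch (illustration only, kept out of the build) of the
 * NaN fixup described in lp_build_min_simple above: the SSE min returns the
 * second operand when either input is a NaN, so GALLIVM_NAN_RETURN_OTHER is
 * obtained by selecting 'a' whenever 'b' is a NaN.
 */
#if 0
static inline float
ref_min_return_other(float a, float b)
{
   float min = (a < b) ? a : b;   /* same operand choice as the SSE min */
   if (b != b)                    /* b is NaN: return the other operand */
      return a;
   return min;                    /* if a is NaN, min is already b */
}
#endif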
264
265 /**
266 * Generate max(a, b)
267 * No checks for special case values (a or b equal to 0 or 1) are done.
268 * NaN's are handled according to the behavior specified by the
269 * nan_behavior argument.
270 */
271 static LLVMValueRef
272 lp_build_max_simple(struct lp_build_context *bld,
273 LLVMValueRef a,
274 LLVMValueRef b,
275 enum gallivm_nan_behavior nan_behavior)
276 {
277 const struct lp_type type = bld->type;
278 const char *intrinsic = NULL;
279 unsigned intr_size = 0;
280 LLVMValueRef cond;
281
282 assert(lp_check_value(type, a));
283 assert(lp_check_value(type, b));
284
285 /* TODO: optimize the constant case */
286
287 if (type.floating && util_cpu_caps.has_sse) {
288 if (type.width == 32) {
289 if (type.length == 1) {
290 intrinsic = "llvm.x86.sse.max.ss";
291 intr_size = 128;
292 }
293 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
294 intrinsic = "llvm.x86.sse.max.ps";
295 intr_size = 128;
296 }
297 else {
298 intrinsic = "llvm.x86.avx.max.ps.256";
299 intr_size = 256;
300 }
301 }
302 if (type.width == 64 && util_cpu_caps.has_sse2) {
303 if (type.length == 1) {
304 intrinsic = "llvm.x86.sse2.max.sd";
305 intr_size = 128;
306 }
307 else if (type.length == 2 || !util_cpu_caps.has_avx) {
308 intrinsic = "llvm.x86.sse2.max.pd";
309 intr_size = 128;
310 }
311 else {
312 intrinsic = "llvm.x86.avx.max.pd.256";
313 intr_size = 256;
314 }
315 }
316 }
317 else if (type.floating && util_cpu_caps.has_altivec) {
318 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
319 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
320 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
321 __FUNCTION__);
322 }
323 if (type.width == 32 && type.length == 4) {
324 intrinsic = "llvm.ppc.altivec.vmaxfp";
325 intr_size = 128;
326 }
327 } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
328 intr_size = 128;
329 if ((type.width == 8 || type.width == 16) &&
330 (type.width * type.length <= 64) &&
331 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
332 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
333 __FUNCTION__);
334 }
335 if (type.width == 8 && !type.sign) {
336 intrinsic = "llvm.x86.sse2.pmaxu.b";
337 intr_size = 128;
338 }
339 else if (type.width == 16 && type.sign) {
340 intrinsic = "llvm.x86.sse2.pmaxs.w";
341 }
342 if (util_cpu_caps.has_sse4_1) {
343 if (type.width == 8 && type.sign) {
344 intrinsic = "llvm.x86.sse41.pmaxsb";
345 }
346 if (type.width == 16 && !type.sign) {
347 intrinsic = "llvm.x86.sse41.pmaxuw";
348 }
349 if (type.width == 32 && !type.sign) {
350 intrinsic = "llvm.x86.sse41.pmaxud";
351 }
352 if (type.width == 32 && type.sign) {
353 intrinsic = "llvm.x86.sse41.pmaxsd";
354 }
355 }
356 } else if (util_cpu_caps.has_altivec) {
357 intr_size = 128;
358 if (type.width == 8) {
359 if (!type.sign) {
360 intrinsic = "llvm.ppc.altivec.vmaxub";
361 } else {
362 intrinsic = "llvm.ppc.altivec.vmaxsb";
363 }
364 } else if (type.width == 16) {
365 if (!type.sign) {
366 intrinsic = "llvm.ppc.altivec.vmaxuh";
367 } else {
368 intrinsic = "llvm.ppc.altivec.vmaxsh";
369 }
370 } else if (type.width == 32) {
371 if (!type.sign) {
372 intrinsic = "llvm.ppc.altivec.vmaxuw";
373 } else {
374 intrinsic = "llvm.ppc.altivec.vmaxsw";
375 }
376 }
377 }
378
379 if (intrinsic) {
380 if (util_cpu_caps.has_sse && type.floating &&
381 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
382 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
383 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
384 LLVMValueRef isnan, max;
385 max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
386 type,
387 intr_size, a, b);
388 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
389 isnan = lp_build_isnan(bld, b);
390 return lp_build_select(bld, isnan, a, max);
391 } else {
392 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
393 isnan = lp_build_isnan(bld, a);
394 return lp_build_select(bld, isnan, a, max);
395 }
396 } else {
397 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
398 type,
399 intr_size, a, b);
400 }
401 }
402
403 if (type.floating) {
404 switch (nan_behavior) {
405 case GALLIVM_NAN_RETURN_NAN: {
406 LLVMValueRef isnan = lp_build_isnan(bld, b);
407 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
408 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
409 return lp_build_select(bld, cond, a, b);
410 }
411 break;
412 case GALLIVM_NAN_RETURN_OTHER: {
413 LLVMValueRef isnan = lp_build_isnan(bld, a);
414 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
415 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
416 return lp_build_select(bld, cond, a, b);
417 }
418 break;
419 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
420 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
421 return lp_build_select(bld, cond, a, b);
422 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
423 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
424 return lp_build_select(bld, cond, b, a);
425 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
426 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
427 return lp_build_select(bld, cond, a, b);
428 break;
429 default:
430 assert(0);
431 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
432 return lp_build_select(bld, cond, a, b);
433 }
434 } else {
435 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
436 return lp_build_select(bld, cond, a, b);
437 }
438 }
439
440
441 /**
442 * Generate 1 - a, or ~a depending on bld->type.
443 */
444 LLVMValueRef
445 lp_build_comp(struct lp_build_context *bld,
446 LLVMValueRef a)
447 {
448 LLVMBuilderRef builder = bld->gallivm->builder;
449 const struct lp_type type = bld->type;
450
451 assert(lp_check_value(type, a));
452
453 if(a == bld->one)
454 return bld->zero;
455 if(a == bld->zero)
456 return bld->one;
457
458 if(type.norm && !type.floating && !type.fixed && !type.sign) {
459 if(LLVMIsConstant(a))
460 return LLVMConstNot(a);
461 else
462 return LLVMBuildNot(builder, a, "");
463 }
464
465 if(LLVMIsConstant(a))
466 if (type.floating)
467 return LLVMConstFSub(bld->one, a);
468 else
469 return LLVMConstSub(bld->one, a);
470 else
471 if (type.floating)
472 return LLVMBuildFSub(builder, bld->one, a, "");
473 else
474 return LLVMBuildSub(builder, bld->one, a, "");
475 }
476
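/*
 * Illustration only (not built): for unsigned normalized integers the value
 * 1.0 is represented by all bits set, which is why lp_build_comp above can
 * use a bitwise not.  Scalar 8-bit example:
 */
#if 0
static inline uint8_t
ref_comp_unorm8(uint8_t a)
{
   /* 255 represents 1.0, and 255 - a is exactly ~a for 8-bit values */
   return (uint8_t)(255 - a);
}
#endif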
477
478 /**
479 * Generate a + b
480 */
481 LLVMValueRef
482 lp_build_add(struct lp_build_context *bld,
483 LLVMValueRef a,
484 LLVMValueRef b)
485 {
486 LLVMBuilderRef builder = bld->gallivm->builder;
487 const struct lp_type type = bld->type;
488 LLVMValueRef res;
489
490 assert(lp_check_value(type, a));
491 assert(lp_check_value(type, b));
492
493 if(a == bld->zero)
494 return b;
495 if(b == bld->zero)
496 return a;
497 if(a == bld->undef || b == bld->undef)
498 return bld->undef;
499
500 if(bld->type.norm) {
501 const char *intrinsic = NULL;
502
503 if(a == bld->one || b == bld->one)
504 return bld->one;
505
506 if (type.width * type.length == 128 &&
507 !type.floating && !type.fixed) {
508 if(util_cpu_caps.has_sse2) {
509 if(type.width == 8)
510 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
511 if(type.width == 16)
512 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
513 } else if (util_cpu_caps.has_altivec) {
514 if(type.width == 8)
515 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
516 if(type.width == 16)
517 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
518 }
519 }
520
521 if (intrinsic)
522 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
523 }
524
525 if(type.norm && !type.floating && !type.fixed) {
526 if (type.sign) {
527 uint64_t sign = (uint64_t)1 << (type.width - 1);
528 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
529 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
530 /* a_clamp_max is the maximum a for positive b,
531 a_clamp_min is the minimum a for negative b. */
532 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
533 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
534 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
535 } else {
536 a = lp_build_min_simple(bld, a, lp_build_comp(bld, b), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
537 }
538 }
539
540 if(LLVMIsConstant(a) && LLVMIsConstant(b))
541 if (type.floating)
542 res = LLVMConstFAdd(a, b);
543 else
544 res = LLVMConstAdd(a, b);
545 else
546 if (type.floating)
547 res = LLVMBuildFAdd(builder, a, b, "");
548 else
549 res = LLVMBuildAdd(builder, a, b, "");
550
551 /* clamp to ceiling of 1.0 */
552 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
553 res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
554
555 /* XXX clamp to floor of -1 or 0??? */
556
557 return res;
558 }
559
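/*
 * Illustration only (not built): scalar reference for the signed saturating
 * add emulation in lp_build_add above, shown for 8-bit values.  'a' is
 * clamped so that the plain add can no longer overflow.
 */
#if 0
static inline int8_t
ref_padds_b(int8_t a, int8_t b)
{
   if (b > 0) {
      if (a > (int8_t)(127 - b))    /* a_clamp_max: keep a + b <= 127 */
         a = (int8_t)(127 - b);
   } else {
      if (a < (int8_t)(-128 - b))   /* a_clamp_min: keep a + b >= -128 */
         a = (int8_t)(-128 - b);
   }
   return (int8_t)(a + b);
}
#endif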
560
561 /** Return the scalar sum of the elements of a.
562 * Should avoid this operation whenever possible.
563 */
564 LLVMValueRef
565 lp_build_horizontal_add(struct lp_build_context *bld,
566 LLVMValueRef a)
567 {
568 LLVMBuilderRef builder = bld->gallivm->builder;
569 const struct lp_type type = bld->type;
570 LLVMValueRef index, res;
571 unsigned i, length;
572 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
573 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
574 LLVMValueRef vecres, elem2;
575
576 assert(lp_check_value(type, a));
577
578 if (type.length == 1) {
579 return a;
580 }
581
582 assert(!bld->type.norm);
583
584 /*
585 * for byte vectors we can do much better with psadbw.
586 * Using repeated shuffle/adds here. Note with multiple vectors
587 * this can be done more efficiently as outlined in the intel
588 * optimization manual.
589 * Note: could cause data rearrangement if used with smaller element
590 * sizes.
591 */
592
593 vecres = a;
594 length = type.length / 2;
595 while (length > 1) {
596 LLVMValueRef vec1, vec2;
597 for (i = 0; i < length; i++) {
598 shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
599 shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
600 }
601 vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
602 LLVMConstVector(shuffles1, length), "");
603 vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
604 LLVMConstVector(shuffles2, length), "");
605 if (type.floating) {
606 vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
607 }
608 else {
609 vecres = LLVMBuildAdd(builder, vec1, vec2, "");
610 }
611 length = length >> 1;
612 }
613
614 /* always have vector of size 2 here */
615 assert(length == 1);
616
617 index = lp_build_const_int32(bld->gallivm, 0);
618 res = LLVMBuildExtractElement(builder, vecres, index, "");
619 index = lp_build_const_int32(bld->gallivm, 1);
620 elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
621
622 if (type.floating)
623 res = LLVMBuildFAdd(builder, res, elem2, "");
624 else
625 res = LLVMBuildAdd(builder, res, elem2, "");
626
627 return res;
628 }
629
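/*
 * Illustration only (not built): the loop in lp_build_horizontal_add above
 * halves the vector each iteration by adding the upper half onto the lower
 * half.  Scalar reference for a 4-wide vector:
 */
#if 0
static inline float
ref_horizontal_add4(const float v[4])
{
   float t0 = v[0] + v[2];   /* first step: lower half + upper half */
   float t1 = v[1] + v[3];
   return t0 + t1;           /* final step: add the two remaining elements */
}
#endif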
630 /**
631 * Return the horizontal sums of 4 float vectors as a float4 vector.
632 * This uses the technique as outlined in Intel Optimization Manual.
633 */
634 static LLVMValueRef
635 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
636 LLVMValueRef src[4])
637 {
638 struct gallivm_state *gallivm = bld->gallivm;
639 LLVMBuilderRef builder = gallivm->builder;
640 LLVMValueRef shuffles[4];
641 LLVMValueRef tmp[4];
642 LLVMValueRef sumtmp[2], shuftmp[2];
643
644 /* lower half of regs */
645 shuffles[0] = lp_build_const_int32(gallivm, 0);
646 shuffles[1] = lp_build_const_int32(gallivm, 1);
647 shuffles[2] = lp_build_const_int32(gallivm, 4);
648 shuffles[3] = lp_build_const_int32(gallivm, 5);
649 tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
650 LLVMConstVector(shuffles, 4), "");
651 tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
652 LLVMConstVector(shuffles, 4), "");
653
654 /* upper half of regs */
655 shuffles[0] = lp_build_const_int32(gallivm, 2);
656 shuffles[1] = lp_build_const_int32(gallivm, 3);
657 shuffles[2] = lp_build_const_int32(gallivm, 6);
658 shuffles[3] = lp_build_const_int32(gallivm, 7);
659 tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
660 LLVMConstVector(shuffles, 4), "");
661 tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
662 LLVMConstVector(shuffles, 4), "");
663
664 sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
665 sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
666
667 shuffles[0] = lp_build_const_int32(gallivm, 0);
668 shuffles[1] = lp_build_const_int32(gallivm, 2);
669 shuffles[2] = lp_build_const_int32(gallivm, 4);
670 shuffles[3] = lp_build_const_int32(gallivm, 6);
671 shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
672 LLVMConstVector(shuffles, 4), "");
673
674 shuffles[0] = lp_build_const_int32(gallivm, 1);
675 shuffles[1] = lp_build_const_int32(gallivm, 3);
676 shuffles[2] = lp_build_const_int32(gallivm, 5);
677 shuffles[3] = lp_build_const_int32(gallivm, 7);
678 shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
679 LLVMConstVector(shuffles, 4), "");
680
681 return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
682 }
683
684
685 /*
686 * partially horizontally add 2-4 float vectors with length nx4,
687 * i.e. only four adjacent values in each vector will be added,
688 * assuming values are really grouped in 4 which also determines
689 * output order.
690 *
691 * Return a vector of the same length as the initial vectors,
692 * with the excess elements (if any) being undefined.
693 * The element order is independent of number of input vectors.
694 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
695 * the output order thus will be
696 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
697 */
698 LLVMValueRef
699 lp_build_hadd_partial4(struct lp_build_context *bld,
700 LLVMValueRef vectors[],
701 unsigned num_vecs)
702 {
703 struct gallivm_state *gallivm = bld->gallivm;
704 LLVMBuilderRef builder = gallivm->builder;
705 LLVMValueRef ret_vec;
706 LLVMValueRef tmp[4];
707 const char *intrinsic = NULL;
708
709 assert(num_vecs >= 2 && num_vecs <= 4);
710 assert(bld->type.floating);
711
712 /* only use this with at least 2 vectors, as it is sort of expensive
713 * (depending on cpu) and we always need two horizontal adds anyway,
714 * so a shuffle/add approach might be better.
715 */
716
717 tmp[0] = vectors[0];
718 tmp[1] = vectors[1];
719
720 tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
721 tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
722
723 if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
724 bld->type.length == 4) {
725 intrinsic = "llvm.x86.sse3.hadd.ps";
726 }
727 else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
728 bld->type.length == 8) {
729 intrinsic = "llvm.x86.avx.hadd.ps.256";
730 }
731 if (intrinsic) {
732 tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
733 lp_build_vec_type(gallivm, bld->type),
734 tmp[0], tmp[1]);
735 if (num_vecs > 2) {
736 tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
737 lp_build_vec_type(gallivm, bld->type),
738 tmp[2], tmp[3]);
739 }
740 else {
741 tmp[1] = tmp[0];
742 }
743 return lp_build_intrinsic_binary(builder, intrinsic,
744 lp_build_vec_type(gallivm, bld->type),
745 tmp[0], tmp[1]);
746 }
747
748 if (bld->type.length == 4) {
749 ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
750 }
751 else {
752 LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
753 unsigned j;
754 unsigned num_iter = bld->type.length / 4;
755 struct lp_type parttype = bld->type;
756 parttype.length = 4;
757 for (j = 0; j < num_iter; j++) {
758 LLVMValueRef partsrc[4];
759 unsigned i;
760 for (i = 0; i < 4; i++) {
761 partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
762 }
763 partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
764 }
765 ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
766 }
767 return ret_vec;
768 }
769
770 /**
771 * Generate a - b
772 */
773 LLVMValueRef
774 lp_build_sub(struct lp_build_context *bld,
775 LLVMValueRef a,
776 LLVMValueRef b)
777 {
778 LLVMBuilderRef builder = bld->gallivm->builder;
779 const struct lp_type type = bld->type;
780 LLVMValueRef res;
781
782 assert(lp_check_value(type, a));
783 assert(lp_check_value(type, b));
784
785 if(b == bld->zero)
786 return a;
787 if(a == bld->undef || b == bld->undef)
788 return bld->undef;
789 if(a == b)
790 return bld->zero;
791
792 if(bld->type.norm) {
793 const char *intrinsic = NULL;
794
795 if(b == bld->one)
796 return bld->zero;
797
798 if (type.width * type.length == 128 &&
799 !type.floating && !type.fixed) {
800 if (util_cpu_caps.has_sse2) {
801 if(type.width == 8)
802 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
803 if(type.width == 16)
804 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
805 } else if (util_cpu_caps.has_altivec) {
806 if(type.width == 8)
807 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
808 if(type.width == 16)
809 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
810 }
811 }
812
813 if (intrinsic)
814 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
815 }
816
817 if(type.norm && !type.floating && !type.fixed) {
818 if (type.sign) {
819 uint64_t sign = (uint64_t)1 << (type.width - 1);
820 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
821 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
822 /* a_clamp_max is the maximum a for negative b,
823 a_clamp_min is the minimum a for positive b. */
824 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
825 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
826 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
827 } else {
828 a = lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
829 }
830 }
831
832 if(LLVMIsConstant(a) && LLVMIsConstant(b))
833 if (type.floating)
834 res = LLVMConstFSub(a, b);
835 else
836 res = LLVMConstSub(a, b);
837 else
838 if (type.floating)
839 res = LLVMBuildFSub(builder, a, b, "");
840 else
841 res = LLVMBuildSub(builder, a, b, "");
842
843 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
844 res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
845
846 return res;
847 }
848
849
850
851 /**
852 * Normalized multiplication.
853 *
854 * There are several approaches for (using 8-bit normalized multiplication as
855 * an example):
856 *
857 * - alpha plus one
858 *
859 * makes the following approximation to the division (Sree)
860 *
861 * a*b/255 ~= (a*(b + 1)) / 256
862 *
863 * which is the fastest method that satisfies the following OpenGL criteria of
864 *
865 * 0*0 = 0 and 255*255 = 255
866 *
867 * - geometric series
868 *
869 * takes the geometric series approximation to the division
870 *
871 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
872 *
873 * in this case just the first two terms to fit in 16bit arithmetic
874 *
875 * t/255 ~= (t + (t >> 8)) >> 8
876 *
877 * note that just by itself it doesn't satisfy the OpenGL criteria, as
878 * 255*255 yields 254, so the special case b = 255 must be accounted for, or
879 * roundoff must be used.
880 *
881 * - geometric series plus rounding
882 *
883 * when using the geometric series division, instead of truncating the result,
884 * use roundoff in the approximation (Jim Blinn)
885 *
886 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
887 *
888 * achieving exact results.
889 *
890 *
891 *
892 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
893 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
894 * @sa Michael Herf, The "double blend trick", May 2000,
895 * http://www.stereopsis.com/doubleblend.html
896 */
897 static LLVMValueRef
898 lp_build_mul_norm(struct gallivm_state *gallivm,
899 struct lp_type wide_type,
900 LLVMValueRef a, LLVMValueRef b)
901 {
902 LLVMBuilderRef builder = gallivm->builder;
903 struct lp_build_context bld;
904 unsigned n;
905 LLVMValueRef half;
906 LLVMValueRef ab;
907
908 assert(!wide_type.floating);
909 assert(lp_check_value(wide_type, a));
910 assert(lp_check_value(wide_type, b));
911
912 lp_build_context_init(&bld, gallivm, wide_type);
913
914 n = wide_type.width / 2;
915 if (wide_type.sign) {
916 --n;
917 }
918
919 /*
920 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
921 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
922 */
923
924 /*
925 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
926 */
927
928 ab = LLVMBuildMul(builder, a, b, "");
929 ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
930
931 /*
932 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
933 */
934
935 half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
936 if (wide_type.sign) {
937 LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
938 LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
939 half = lp_build_select(&bld, sign, minus_half, half);
940 }
941 ab = LLVMBuildAdd(builder, ab, half, "");
942
943 /* Final division */
944 ab = lp_build_shr_imm(&bld, ab, n);
945
946 return ab;
947 }
948
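/*
 * Illustration only (not built): scalar reference for the unsigned 8-bit
 * case of lp_build_mul_norm above, i.e. the rounded geometric series
 * a*b/255 ~= (a*b + (a*b >> 8) + 0x80) >> 8.
 */
#if 0
static inline uint8_t
ref_mul_norm_u8(uint8_t a, uint8_t b)
{
   unsigned ab = (unsigned)a * b;   /* 16-bit intermediate product */
   ab = ab + (ab >> 8);             /* second term of the geometric series */
   ab = ab + 0x80;                  /* roundoff */
   return (uint8_t)(ab >> 8);       /* final division by 2^8 */
}
#endif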
949 /**
950 * Generate a * b
951 */
952 LLVMValueRef
953 lp_build_mul(struct lp_build_context *bld,
954 LLVMValueRef a,
955 LLVMValueRef b)
956 {
957 LLVMBuilderRef builder = bld->gallivm->builder;
958 const struct lp_type type = bld->type;
959 LLVMValueRef shift;
960 LLVMValueRef res;
961
962 assert(lp_check_value(type, a));
963 assert(lp_check_value(type, b));
964
965 if(a == bld->zero)
966 return bld->zero;
967 if(a == bld->one)
968 return b;
969 if(b == bld->zero)
970 return bld->zero;
971 if(b == bld->one)
972 return a;
973 if(a == bld->undef || b == bld->undef)
974 return bld->undef;
975
976 if (!type.floating && !type.fixed && type.norm) {
977 struct lp_type wide_type = lp_wider_type(type);
978 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
979
980 lp_build_unpack2(bld->gallivm, type, wide_type, a, &al, &ah);
981 lp_build_unpack2(bld->gallivm, type, wide_type, b, &bl, &bh);
982
983 /* PMULLW, PSRLW, PADDW */
984 abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
985 abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
986
987 ab = lp_build_pack2(bld->gallivm, wide_type, type, abl, abh);
988
989 return ab;
990 }
991
992 if(type.fixed)
993 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
994 else
995 shift = NULL;
996
997 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
998 if (type.floating)
999 res = LLVMConstFMul(a, b);
1000 else
1001 res = LLVMConstMul(a, b);
1002 if(shift) {
1003 if(type.sign)
1004 res = LLVMConstAShr(res, shift);
1005 else
1006 res = LLVMConstLShr(res, shift);
1007 }
1008 }
1009 else {
1010 if (type.floating)
1011 res = LLVMBuildFMul(builder, a, b, "");
1012 else
1013 res = LLVMBuildMul(builder, a, b, "");
1014 if(shift) {
1015 if(type.sign)
1016 res = LLVMBuildAShr(builder, res, shift, "");
1017 else
1018 res = LLVMBuildLShr(builder, res, shift, "");
1019 }
1020 }
1021
1022 return res;
1023 }
1024
1025
1026 /**
1027 * Small vector x scale multiplication optimization.
1028 */
1029 LLVMValueRef
1030 lp_build_mul_imm(struct lp_build_context *bld,
1031 LLVMValueRef a,
1032 int b)
1033 {
1034 LLVMBuilderRef builder = bld->gallivm->builder;
1035 LLVMValueRef factor;
1036
1037 assert(lp_check_value(bld->type, a));
1038
1039 if(b == 0)
1040 return bld->zero;
1041
1042 if(b == 1)
1043 return a;
1044
1045 if(b == -1)
1046 return lp_build_negate(bld, a);
1047
1048 if(b == 2 && bld->type.floating)
1049 return lp_build_add(bld, a, a);
1050
1051 if(util_is_power_of_two(b)) {
1052 unsigned shift = ffs(b) - 1;
1053
1054 if(bld->type.floating) {
1055 #if 0
1056 /*
1057 * Power of two multiplication by directly manipulating the exponent.
1058 *
1059 * XXX: This might not be always faster, it will introduce a small error
1060 * for multiplication by zero, and it will produce wrong results
1061 * for Inf and NaN.
1062 */
1063 unsigned mantissa = lp_mantissa(bld->type);
1064 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1065 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1066 a = LLVMBuildAdd(builder, a, factor, "");
1067 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1068 return a;
1069 #endif
1070 }
1071 else {
1072 factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1073 return LLVMBuildShl(builder, a, factor, "");
1074 }
1075 }
1076
1077 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1078 return lp_build_mul(bld, a, factor);
1079 }
1080
1081
1082 /**
1083 * Generate a / b
1084 */
1085 LLVMValueRef
1086 lp_build_div(struct lp_build_context *bld,
1087 LLVMValueRef a,
1088 LLVMValueRef b)
1089 {
1090 LLVMBuilderRef builder = bld->gallivm->builder;
1091 const struct lp_type type = bld->type;
1092
1093 assert(lp_check_value(type, a));
1094 assert(lp_check_value(type, b));
1095
1096 if(a == bld->zero)
1097 return bld->zero;
1098 if(a == bld->one && type.floating)
1099 return lp_build_rcp(bld, b);
1100 if(b == bld->zero)
1101 return bld->undef;
1102 if(b == bld->one)
1103 return a;
1104 if(a == bld->undef || b == bld->undef)
1105 return bld->undef;
1106
1107 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1108 if (type.floating)
1109 return LLVMConstFDiv(a, b);
1110 else if (type.sign)
1111 return LLVMConstSDiv(a, b);
1112 else
1113 return LLVMConstUDiv(a, b);
1114 }
1115
1116 if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1117 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
1118 type.floating)
1119 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1120
1121 if (type.floating)
1122 return LLVMBuildFDiv(builder, a, b, "");
1123 else if (type.sign)
1124 return LLVMBuildSDiv(builder, a, b, "");
1125 else
1126 return LLVMBuildUDiv(builder, a, b, "");
1127 }
1128
1129
1130 /**
1131 * Linear interpolation helper.
1132 *
1133 * @param flags LP_BLD_LERP_x flags. With LP_BLD_LERP_WIDE_NORMALIZED the
1134 * values are normalized integers, encoded twice as wide as the nominal type.
1135 *
1136 * @sa http://www.stereopsis.com/doubleblend.html
1137 */
1138 static inline LLVMValueRef
1139 lp_build_lerp_simple(struct lp_build_context *bld,
1140 LLVMValueRef x,
1141 LLVMValueRef v0,
1142 LLVMValueRef v1,
1143 unsigned flags)
1144 {
1145 unsigned half_width = bld->type.width/2;
1146 LLVMBuilderRef builder = bld->gallivm->builder;
1147 LLVMValueRef delta;
1148 LLVMValueRef res;
1149
1150 assert(lp_check_value(bld->type, x));
1151 assert(lp_check_value(bld->type, v0));
1152 assert(lp_check_value(bld->type, v1));
1153
1154 delta = lp_build_sub(bld, v1, v0);
1155
1156 if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1157 if (!bld->type.sign) {
1158 if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1159 /*
1160 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1161 * most significant bit to the least significant bit, so that
1162 * later we can just divide by 2**n instead of 2**n - 1.
1163 */
1164
1165 x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1166 }
1167
1168 /* (x * delta) >> n */
1169 res = lp_build_mul(bld, x, delta);
1170 res = lp_build_shr_imm(bld, res, half_width);
1171 } else {
1172 /*
1173 * The rescaling trick above doesn't work for signed numbers, so
1174 * use the 2**n - 1 division approximation in lp_build_mul_norm
1175 * instead.
1176 */
1177 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1178 res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1179 }
1180 } else {
1181 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1182 res = lp_build_mul(bld, x, delta);
1183 }
1184
1185 res = lp_build_add(bld, v0, res);
1186
1187 if (((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) ||
1188 bld->type.fixed) {
1189 /* We need to mask out the high order bits when lerping 8-bit normalized colors stored in 16 bits */
1190 /* XXX: This step is necessary for lerping 8-bit colors stored in 16 bits,
1191 * but it will be wrong for true fixed point use cases. Basically we need
1192 * a more powerful lp_type, capable of further distinguishing the values
1193 * interpretation from the value storage. */
1194 res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1), "");
1195 }
1196
1197 return res;
1198 }
1199
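/*
 * Illustration only (not built): scalar reference for the unsigned
 * LP_BLD_LERP_WIDE_NORMALIZED path in lp_build_lerp_simple above, with
 * 8-bit values lerped in 16-bit (or wider) intermediates.
 */
#if 0
static inline uint8_t
ref_lerp_unorm8(uint8_t x, uint8_t v0, uint8_t v1)
{
   int delta = v1 - v0;
   int xs = x + (x >> 7);                        /* rescale [0,255] to [0,256] */
   return (uint8_t)(v0 + ((xs * delta) >> 8));   /* v0 + x*(v1 - v0) */
}
#endif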
1200
1201 /**
1202 * Linear interpolation.
1203 */
1204 LLVMValueRef
1205 lp_build_lerp(struct lp_build_context *bld,
1206 LLVMValueRef x,
1207 LLVMValueRef v0,
1208 LLVMValueRef v1,
1209 unsigned flags)
1210 {
1211 const struct lp_type type = bld->type;
1212 LLVMValueRef res;
1213
1214 assert(lp_check_value(type, x));
1215 assert(lp_check_value(type, v0));
1216 assert(lp_check_value(type, v1));
1217
1218 assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1219
1220 if (type.norm) {
1221 struct lp_type wide_type;
1222 struct lp_build_context wide_bld;
1223 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1224
1225 assert(type.length >= 2);
1226
1227 /*
1228 * Create a wider integer type, enough to hold the
1229 * intermediate result of the multiplication.
1230 */
1231 memset(&wide_type, 0, sizeof wide_type);
1232 wide_type.sign = type.sign;
1233 wide_type.width = type.width*2;
1234 wide_type.length = type.length/2;
1235
1236 lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1237
1238 lp_build_unpack2(bld->gallivm, type, wide_type, x, &xl, &xh);
1239 lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1240 lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1241
1242 /*
1243 * Lerp both halves.
1244 */
1245
1246 flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1247
1248 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1249 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1250
1251 res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
1252 } else {
1253 res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1254 }
1255
1256 return res;
1257 }
1258
1259
1260 /**
1261 * Bilinear interpolation.
1262 *
1263 * Value indices are in v_{yx}.
1264 */
1265 LLVMValueRef
1266 lp_build_lerp_2d(struct lp_build_context *bld,
1267 LLVMValueRef x,
1268 LLVMValueRef y,
1269 LLVMValueRef v00,
1270 LLVMValueRef v01,
1271 LLVMValueRef v10,
1272 LLVMValueRef v11,
1273 unsigned flags)
1274 {
1275 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1276 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1277 return lp_build_lerp(bld, y, v0, v1, flags);
1278 }
1279
1280
1281 LLVMValueRef
1282 lp_build_lerp_3d(struct lp_build_context *bld,
1283 LLVMValueRef x,
1284 LLVMValueRef y,
1285 LLVMValueRef z,
1286 LLVMValueRef v000,
1287 LLVMValueRef v001,
1288 LLVMValueRef v010,
1289 LLVMValueRef v011,
1290 LLVMValueRef v100,
1291 LLVMValueRef v101,
1292 LLVMValueRef v110,
1293 LLVMValueRef v111,
1294 unsigned flags)
1295 {
1296 LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1297 LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1298 return lp_build_lerp(bld, z, v0, v1, flags);
1299 }
1300
1301
1302 /**
1303 * Generate min(a, b)
1304 * Do checks for special cases but not for nans.
1305 */
1306 LLVMValueRef
1307 lp_build_min(struct lp_build_context *bld,
1308 LLVMValueRef a,
1309 LLVMValueRef b)
1310 {
1311 assert(lp_check_value(bld->type, a));
1312 assert(lp_check_value(bld->type, b));
1313
1314 if(a == bld->undef || b == bld->undef)
1315 return bld->undef;
1316
1317 if(a == b)
1318 return a;
1319
1320 if (bld->type.norm) {
1321 if (!bld->type.sign) {
1322 if (a == bld->zero || b == bld->zero) {
1323 return bld->zero;
1324 }
1325 }
1326 if(a == bld->one)
1327 return b;
1328 if(b == bld->one)
1329 return a;
1330 }
1331
1332 return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1333 }
1334
1335
1336 /**
1337 * Generate min(a, b)
1338 * NaN's are handled according to the behavior specified by the
1339 * nan_behavior argument.
1340 */
1341 LLVMValueRef
1342 lp_build_min_ext(struct lp_build_context *bld,
1343 LLVMValueRef a,
1344 LLVMValueRef b,
1345 enum gallivm_nan_behavior nan_behavior)
1346 {
1347 assert(lp_check_value(bld->type, a));
1348 assert(lp_check_value(bld->type, b));
1349
1350 if(a == bld->undef || b == bld->undef)
1351 return bld->undef;
1352
1353 if(a == b)
1354 return a;
1355
1356 if (bld->type.norm) {
1357 if (!bld->type.sign) {
1358 if (a == bld->zero || b == bld->zero) {
1359 return bld->zero;
1360 }
1361 }
1362 if(a == bld->one)
1363 return b;
1364 if(b == bld->one)
1365 return a;
1366 }
1367
1368 return lp_build_min_simple(bld, a, b, nan_behavior);
1369 }
1370
1371 /**
1372 * Generate max(a, b)
1373 * Do checks for special cases, but NaN behavior is undefined.
1374 */
1375 LLVMValueRef
1376 lp_build_max(struct lp_build_context *bld,
1377 LLVMValueRef a,
1378 LLVMValueRef b)
1379 {
1380 assert(lp_check_value(bld->type, a));
1381 assert(lp_check_value(bld->type, b));
1382
1383 if(a == bld->undef || b == bld->undef)
1384 return bld->undef;
1385
1386 if(a == b)
1387 return a;
1388
1389 if(bld->type.norm) {
1390 if(a == bld->one || b == bld->one)
1391 return bld->one;
1392 if (!bld->type.sign) {
1393 if (a == bld->zero) {
1394 return b;
1395 }
1396 if (b == bld->zero) {
1397 return a;
1398 }
1399 }
1400 }
1401
1402 return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1403 }
1404
1405
1406 /**
1407 * Generate max(a, b)
1408 * Checks for special cases.
1409 * NaN's are handled according to the behavior specified by the
1410 * nan_behavior argument.
1411 */
1412 LLVMValueRef
1413 lp_build_max_ext(struct lp_build_context *bld,
1414 LLVMValueRef a,
1415 LLVMValueRef b,
1416 enum gallivm_nan_behavior nan_behavior)
1417 {
1418 assert(lp_check_value(bld->type, a));
1419 assert(lp_check_value(bld->type, b));
1420
1421 if(a == bld->undef || b == bld->undef)
1422 return bld->undef;
1423
1424 if(a == b)
1425 return a;
1426
1427 if(bld->type.norm) {
1428 if(a == bld->one || b == bld->one)
1429 return bld->one;
1430 if (!bld->type.sign) {
1431 if (a == bld->zero) {
1432 return b;
1433 }
1434 if (b == bld->zero) {
1435 return a;
1436 }
1437 }
1438 }
1439
1440 return lp_build_max_simple(bld, a, b, nan_behavior);
1441 }
1442
1443 /**
1444 * Generate clamp(a, min, max)
1445 * NaN behavior (for any of a, min, max) is undefined.
1446 * Do checks for special cases.
1447 */
1448 LLVMValueRef
1449 lp_build_clamp(struct lp_build_context *bld,
1450 LLVMValueRef a,
1451 LLVMValueRef min,
1452 LLVMValueRef max)
1453 {
1454 assert(lp_check_value(bld->type, a));
1455 assert(lp_check_value(bld->type, min));
1456 assert(lp_check_value(bld->type, max));
1457
1458 a = lp_build_min(bld, a, max);
1459 a = lp_build_max(bld, a, min);
1460 return a;
1461 }
1462
1463
1464 /**
1465 * Generate clamp(a, 0, 1)
1466 * A NaN will get converted to zero.
1467 */
1468 LLVMValueRef
1469 lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
1470 LLVMValueRef a)
1471 {
1472 a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1473 a = lp_build_min(bld, a, bld->one);
1474 return a;
1475 }
1476
1477
1478 /**
1479 * Generate abs(a)
1480 */
1481 LLVMValueRef
1482 lp_build_abs(struct lp_build_context *bld,
1483 LLVMValueRef a)
1484 {
1485 LLVMBuilderRef builder = bld->gallivm->builder;
1486 const struct lp_type type = bld->type;
1487 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1488
1489 assert(lp_check_value(type, a));
1490
1491 if(!type.sign)
1492 return a;
1493
1494 if(type.floating) {
1495 char intrinsic[32];
1496 util_snprintf(intrinsic, sizeof intrinsic, "llvm.fabs.v%uf%u", type.length, type.width);
1497 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
1498 }
1499
1500 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
1501 switch(type.width) {
1502 case 8:
1503 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1504 case 16:
1505 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1506 case 32:
1507 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1508 }
1509 }
1510 else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
1511 (gallivm_debug & GALLIVM_DEBUG_PERF) &&
1512 (type.width == 8 || type.width == 16 || type.width == 32)) {
1513 debug_printf("%s: inefficient code, should split vectors manually\n",
1514 __FUNCTION__);
1515 }
1516
1517 return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
1518 }
1519
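/*
 * Illustration only (not built): what the llvm.fabs intrinsic used in
 * lp_build_abs above computes, expressed as a scalar bit operation -- the
 * sign bit is simply cleared.
 */
#if 0
static inline float
ref_fabs(float a)
{
   union { float f; uint32_t i; } u;
   u.f = a;
   u.i &= 0x7fffffffu;   /* clear the sign bit */
   return u.f;
}
#endif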
1520
1521 LLVMValueRef
1522 lp_build_negate(struct lp_build_context *bld,
1523 LLVMValueRef a)
1524 {
1525 LLVMBuilderRef builder = bld->gallivm->builder;
1526
1527 assert(lp_check_value(bld->type, a));
1528
1529 if (bld->type.floating)
1530 a = LLVMBuildFNeg(builder, a, "");
1531 else
1532 a = LLVMBuildNeg(builder, a, "");
1533
1534 return a;
1535 }
1536
1537
1538 /** Return -1, 0 or +1 depending on the sign of a */
1539 LLVMValueRef
1540 lp_build_sgn(struct lp_build_context *bld,
1541 LLVMValueRef a)
1542 {
1543 LLVMBuilderRef builder = bld->gallivm->builder;
1544 const struct lp_type type = bld->type;
1545 LLVMValueRef cond;
1546 LLVMValueRef res;
1547
1548 assert(lp_check_value(type, a));
1549
1550 /* Handle non-zero case */
1551 if(!type.sign) {
1552 /* if not zero then sign must be positive */
1553 res = bld->one;
1554 }
1555 else if(type.floating) {
1556 LLVMTypeRef vec_type;
1557 LLVMTypeRef int_type;
1558 LLVMValueRef mask;
1559 LLVMValueRef sign;
1560 LLVMValueRef one;
1561 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1562
1563 int_type = lp_build_int_vec_type(bld->gallivm, type);
1564 vec_type = lp_build_vec_type(bld->gallivm, type);
1565 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1566
1567 /* Take the sign bit and add it to 1 constant */
1568 sign = LLVMBuildBitCast(builder, a, int_type, "");
1569 sign = LLVMBuildAnd(builder, sign, mask, "");
1570 one = LLVMConstBitCast(bld->one, int_type);
1571 res = LLVMBuildOr(builder, sign, one, "");
1572 res = LLVMBuildBitCast(builder, res, vec_type, "");
1573 }
1574 else
1575 {
1576 /* signed int/norm/fixed point */
1577 /* could use psign with sse3 and appropriate vectors here */
1578 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1579 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1580 res = lp_build_select(bld, cond, bld->one, minus_one);
1581 }
1582
1583 /* Handle zero */
1584 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1585 res = lp_build_select(bld, cond, bld->zero, res);
1586
1587 return res;
1588 }
1589
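/*
 * Illustration only (not built): scalar reference for the floating-point
 * path of lp_build_sgn above -- OR the sign bit of 'a' onto the constant
 * 1.0 to get +/-1.0, then force 0.0 for a == 0.
 */
#if 0
static inline float
ref_sgn(float a)
{
   union { float f; uint32_t i; } u, one;
   if (a == 0.0f)
      return 0.0f;
   u.f = a;
   one.f = 1.0f;
   one.i |= u.i & 0x80000000u;   /* copy the sign bit onto 1.0 */
   return one.f;
}
#endif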
1590
1591 /**
1592 * Set the sign of float vector 'a' according to 'sign'.
1593 * If sign==0, return abs(a).
1594 * If sign==1, return -abs(a);
1595 * Other values for sign produce undefined results.
1596 */
1597 LLVMValueRef
1598 lp_build_set_sign(struct lp_build_context *bld,
1599 LLVMValueRef a, LLVMValueRef sign)
1600 {
1601 LLVMBuilderRef builder = bld->gallivm->builder;
1602 const struct lp_type type = bld->type;
1603 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1604 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1605 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1606 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1607 ~((unsigned long long) 1 << (type.width - 1)));
1608 LLVMValueRef val, res;
1609
1610 assert(type.floating);
1611 assert(lp_check_value(type, a));
1612
1613 /* val = reinterpret_cast<int>(a) */
1614 val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1615 /* val = val & mask */
1616 val = LLVMBuildAnd(builder, val, mask, "");
1617 /* sign = sign << shift */
1618 sign = LLVMBuildShl(builder, sign, shift, "");
1619 /* res = val | sign */
1620 res = LLVMBuildOr(builder, val, sign, "");
1621 /* res = reinterpret_cast<float>(res) */
1622 res = LLVMBuildBitCast(builder, res, vec_type, "");
1623
1624 return res;
1625 }
1626
1627
1628 /**
1629 * Convert vector of (or scalar) int to vector of (or scalar) float.
1630 */
1631 LLVMValueRef
1632 lp_build_int_to_float(struct lp_build_context *bld,
1633 LLVMValueRef a)
1634 {
1635 LLVMBuilderRef builder = bld->gallivm->builder;
1636 const struct lp_type type = bld->type;
1637 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1638
1639 assert(type.floating);
1640
1641 return LLVMBuildSIToFP(builder, a, vec_type, "");
1642 }
1643
1644 static boolean
1645 arch_rounding_available(const struct lp_type type)
1646 {
1647 if ((util_cpu_caps.has_sse4_1 &&
1648 (type.length == 1 || type.width*type.length == 128)) ||
1649 (util_cpu_caps.has_avx && type.width*type.length == 256))
1650 return TRUE;
1651 else if ((util_cpu_caps.has_altivec &&
1652 (type.width == 32 && type.length == 4)))
1653 return TRUE;
1654
1655 return FALSE;
1656 }
1657
1658 enum lp_build_round_mode
1659 {
1660 LP_BUILD_ROUND_NEAREST = 0,
1661 LP_BUILD_ROUND_FLOOR = 1,
1662 LP_BUILD_ROUND_CEIL = 2,
1663 LP_BUILD_ROUND_TRUNCATE = 3
1664 };
1665
1666 /**
1667 * Helper for SSE4.1's ROUNDxx instructions.
1668 *
1669 * NOTE: In SSE4.1's nearest mode, if two values are equally close, the
1670 * result is the even value. That is, rounding 2.5 will be 2.0, and not 3.0.
1671 */
1672 static inline LLVMValueRef
1673 lp_build_nearest_sse41(struct lp_build_context *bld,
1674 LLVMValueRef a)
1675 {
1676 LLVMBuilderRef builder = bld->gallivm->builder;
1677 const struct lp_type type = bld->type;
1678 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1679 LLVMValueRef mode = LLVMConstNull(i32t);
1680 const char *intrinsic;
1681 LLVMValueRef res;
1682
1683 assert(type.floating);
1684
1685 assert(lp_check_value(type, a));
1686 assert(util_cpu_caps.has_sse4_1);
1687
1688 if (type.length == 1) {
1689 LLVMTypeRef vec_type;
1690 LLVMValueRef undef;
1691 LLVMValueRef args[3];
1692 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1693
1694 switch(type.width) {
1695 case 32:
1696 intrinsic = "llvm.x86.sse41.round.ss";
1697 break;
1698 case 64:
1699 intrinsic = "llvm.x86.sse41.round.sd";
1700 break;
1701 default:
1702 assert(0);
1703 return bld->undef;
1704 }
1705
1706 vec_type = LLVMVectorType(bld->elem_type, 4);
1707
1708 undef = LLVMGetUndef(vec_type);
1709
1710 args[0] = undef;
1711 args[1] = LLVMBuildInsertElement(builder, undef, a, index0, "");
1712 args[2] = mode;
1713
1714 res = lp_build_intrinsic(builder, intrinsic,
1715 vec_type, args, Elements(args), 0);
1716
1717 res = LLVMBuildExtractElement(builder, res, index0, "");
1718 }
1719 else {
1720 if (type.width * type.length == 128) {
1721 switch(type.width) {
1722 case 32:
1723 intrinsic = "llvm.x86.sse41.round.ps";
1724 break;
1725 case 64:
1726 intrinsic = "llvm.x86.sse41.round.pd";
1727 break;
1728 default:
1729 assert(0);
1730 return bld->undef;
1731 }
1732 }
1733 else {
1734 assert(type.width * type.length == 256);
1735 assert(util_cpu_caps.has_avx);
1736
1737 switch(type.width) {
1738 case 32:
1739 intrinsic = "llvm.x86.avx.round.ps.256";
1740 break;
1741 case 64:
1742 intrinsic = "llvm.x86.avx.round.pd.256";
1743 break;
1744 default:
1745 assert(0);
1746 return bld->undef;
1747 }
1748 }
1749
1750 res = lp_build_intrinsic_binary(builder, intrinsic,
1751 bld->vec_type, a,
1752 mode);
1753 }
1754
1755 return res;
1756 }
1757
1758
1759 static inline LLVMValueRef
1760 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1761 LLVMValueRef a)
1762 {
1763 LLVMBuilderRef builder = bld->gallivm->builder;
1764 const struct lp_type type = bld->type;
1765 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1766 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1767 const char *intrinsic;
1768 LLVMValueRef res;
1769
1770 assert(type.floating);
1771 /* using the double precision conversions is a bit more complicated */
1772 assert(type.width == 32);
1773
1774 assert(lp_check_value(type, a));
1775 assert(util_cpu_caps.has_sse2);
1776
1777 /* This is relying on MXCSR rounding mode, which should always be nearest. */
1778 if (type.length == 1) {
1779 LLVMTypeRef vec_type;
1780 LLVMValueRef undef;
1781 LLVMValueRef arg;
1782 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1783
1784 vec_type = LLVMVectorType(bld->elem_type, 4);
1785
1786 intrinsic = "llvm.x86.sse.cvtss2si";
1787
1788 undef = LLVMGetUndef(vec_type);
1789
1790 arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
1791
1792 res = lp_build_intrinsic_unary(builder, intrinsic,
1793 ret_type, arg);
1794 }
1795 else {
1796 if (type.width* type.length == 128) {
1797 intrinsic = "llvm.x86.sse2.cvtps2dq";
1798 }
1799 else {
1800 assert(type.width*type.length == 256);
1801 assert(util_cpu_caps.has_avx);
1802
1803 intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
1804 }
1805 res = lp_build_intrinsic_unary(builder, intrinsic,
1806 ret_type, a);
1807 }
1808
1809 return res;
1810 }
1811
1812
1813 /*
1814 */
1815 static inline LLVMValueRef
1816 lp_build_round_altivec(struct lp_build_context *bld,
1817 LLVMValueRef a,
1818 enum lp_build_round_mode mode)
1819 {
1820 LLVMBuilderRef builder = bld->gallivm->builder;
1821 const struct lp_type type = bld->type;
1822 const char *intrinsic = NULL;
1823
1824 assert(type.floating);
1825
1826 assert(lp_check_value(type, a));
1827 assert(util_cpu_caps.has_altivec);
1828
1829 (void)type;
1830
1831 switch (mode) {
1832 case LP_BUILD_ROUND_NEAREST:
1833 intrinsic = "llvm.ppc.altivec.vrfin";
1834 break;
1835 case LP_BUILD_ROUND_FLOOR:
1836 intrinsic = "llvm.ppc.altivec.vrfim";
1837 break;
1838 case LP_BUILD_ROUND_CEIL:
1839 intrinsic = "llvm.ppc.altivec.vrfip";
1840 break;
1841 case LP_BUILD_ROUND_TRUNCATE:
1842 intrinsic = "llvm.ppc.altivec.vrfiz";
1843 break;
1844 }
1845
1846 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1847 }
1848
1849 static inline LLVMValueRef
1850 lp_build_round_arch(struct lp_build_context *bld,
1851 LLVMValueRef a,
1852 enum lp_build_round_mode mode)
1853 {
1854 if (util_cpu_caps.has_sse4_1) {
1855 LLVMBuilderRef builder = bld->gallivm->builder;
1856 const struct lp_type type = bld->type;
1857 const char *intrinsic_root;
1858 char intrinsic[32];
1859
1860 assert(type.floating);
1861 assert(lp_check_value(type, a));
1862 (void)type;
1863
1864 switch (mode) {
1865 case LP_BUILD_ROUND_NEAREST:
1866 if (HAVE_LLVM >= 0x0304) {
1867 intrinsic_root = "llvm.round";
1868 } else {
1869 return lp_build_nearest_sse41(bld, a);
1870 }
1871 break;
1872 case LP_BUILD_ROUND_FLOOR:
1873 intrinsic_root = "llvm.floor";
1874 break;
1875 case LP_BUILD_ROUND_CEIL:
1876 intrinsic_root = "llvm.ceil";
1877 break;
1878 case LP_BUILD_ROUND_TRUNCATE:
1879 intrinsic_root = "llvm.trunc";
1880 break;
1881 }
1882
1883 util_snprintf(intrinsic, sizeof intrinsic, "%s.v%uf%u",
1884 intrinsic_root, type.length, type.width);
1885
1886 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1887 }
1888 else /* (util_cpu_caps.has_altivec) */
1889 return lp_build_round_altivec(bld, a, mode);
1890 }
1891
1892 /**
1893 * Return the integer part of a float (vector) value (== round toward zero).
1894 * The returned value is a float (vector).
1895 * Ex: trunc(-1.5) = -1.0
1896 */
1897 LLVMValueRef
1898 lp_build_trunc(struct lp_build_context *bld,
1899 LLVMValueRef a)
1900 {
1901 LLVMBuilderRef builder = bld->gallivm->builder;
1902 const struct lp_type type = bld->type;
1903
1904 assert(type.floating);
1905 assert(lp_check_value(type, a));
1906
1907 if (arch_rounding_available(type)) {
1908 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
1909 }
1910 else {
1911 const struct lp_type type = bld->type;
1912 struct lp_type inttype;
1913 struct lp_build_context intbld;
1914 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
1915 LLVMValueRef trunc, res, anosign, mask;
1916 LLVMTypeRef int_vec_type = bld->int_vec_type;
1917 LLVMTypeRef vec_type = bld->vec_type;
1918
1919 assert(type.width == 32); /* might want to handle doubles at some point */
1920
1921 inttype = type;
1922 inttype.floating = 0;
1923 lp_build_context_init(&intbld, bld->gallivm, inttype);
1924
1925 /* round by truncation */
1926 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1927 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
1928
1929 /* mask out sign bit */
1930 anosign = lp_build_abs(bld, a);
1931 /*
1932 * mask out all values if anosign > 2^24
1933 * This should work both for large ints (where all rounding is a no-op
1934 * because such floats are always exact) and for special cases like
1935 * NaNs, Infs (taking advantage of the fact they use the max exponent).
1936 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
1937 */
1938 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1939 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1940 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1941 return lp_build_select(bld, mask, a, res);
1942 }
1943 }
1944
1945
1946 /**
1947 * Return float (vector) rounded to nearest integer (vector). The returned
1948 * value is a float (vector).
1949 * Ex: round(0.9) = 1.0
1950 * Ex: round(-1.5) = -2.0
1951 */
1952 LLVMValueRef
1953 lp_build_round(struct lp_build_context *bld,
1954 LLVMValueRef a)
1955 {
1956 LLVMBuilderRef builder = bld->gallivm->builder;
1957 const struct lp_type type = bld->type;
1958
1959 assert(type.floating);
1960 assert(lp_check_value(type, a));
1961
1962 if (arch_rounding_available(type)) {
1963 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
1964 }
1965 else {
1966 const struct lp_type type = bld->type;
1967 struct lp_type inttype;
1968 struct lp_build_context intbld;
1969 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
1970 LLVMValueRef res, anosign, mask;
1971 LLVMTypeRef int_vec_type = bld->int_vec_type;
1972 LLVMTypeRef vec_type = bld->vec_type;
1973
1974 assert(type.width == 32); /* might want to handle doubles at some point */
1975
1976 inttype = type;
1977 inttype.floating = 0;
1978 lp_build_context_init(&intbld, bld->gallivm, inttype);
1979
1980 res = lp_build_iround(bld, a);
1981 res = LLVMBuildSIToFP(builder, res, vec_type, "");
1982
1983 /* mask out sign bit */
1984 anosign = lp_build_abs(bld, a);
1985 /*
1986 * mask out all values if anosign > 2^24
1987 * This should work both for large ints (where all rounding is a no-op
1988 * because such floats are always exact) and for special cases like
1989 * NaNs, Infs (taking advantage of the fact they use the max exponent).
1990 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
1991 */
1992 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1993 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1994 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1995 return lp_build_select(bld, mask, a, res);
1996 }
1997 }
1998
1999
2000 /**
2001 * Return floor of float (vector), result is a float (vector)
2002 * Ex: floor(1.1) = 1.0
2003 * Ex: floor(-1.1) = -2.0
2004 */
2005 LLVMValueRef
2006 lp_build_floor(struct lp_build_context *bld,
2007 LLVMValueRef a)
2008 {
2009 LLVMBuilderRef builder = bld->gallivm->builder;
2010 const struct lp_type type = bld->type;
2011
2012 assert(type.floating);
2013 assert(lp_check_value(type, a));
2014
2015 if (arch_rounding_available(type)) {
2016 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2017 }
2018 else {
2019 const struct lp_type type = bld->type;
2020 struct lp_type inttype;
2021 struct lp_build_context intbld;
2022 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2023 LLVMValueRef trunc, res, anosign, mask;
2024 LLVMTypeRef int_vec_type = bld->int_vec_type;
2025 LLVMTypeRef vec_type = bld->vec_type;
2026
2027 if (type.width != 32) {
2028 char intrinsic[32];
2029 util_snprintf(intrinsic, sizeof intrinsic, "llvm.floor.v%uf%u", type.length, type.width);
2030 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2031 }
2032
2033 assert(type.width == 32); /* might want to handle doubles at some point */
2034
2035 inttype = type;
2036 inttype.floating = 0;
2037 lp_build_context_init(&intbld, bld->gallivm, inttype);
2038
2039 /* round by truncation */
2040 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2041 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2042
2043 if (type.sign) {
2044 LLVMValueRef tmp;
2045
2046 /*
2047 * fix values if rounding is wrong (for non-special cases)
2048 * - this is the case if trunc > a
2049 */
2050 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
2051 /* tmp = trunc > a ? 1.0 : 0.0 */
2052 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2053 tmp = lp_build_and(&intbld, mask, tmp);
2054 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2055 res = lp_build_sub(bld, res, tmp);
2056 }
2057
2058 /* mask out sign bit */
2059 anosign = lp_build_abs(bld, a);
2060 /*
2061 * mask out all values if anosign > 2^24
2062 * This should work both for large ints (where all rounding is a no-op
2063 * because such floats are always exact) and for special cases like
2064 * NaNs, Infs (taking advantage of the fact they use the max exponent).
2065 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2066 */
2067 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2068 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2069 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2070 return lp_build_select(bld, mask, a, res);
2071 }
2072 }
2073
2074
2075 /**
2076 * Return ceiling of float (vector), returning float (vector).
2077 * Ex: ceil( 1.1) = 2.0
2078 * Ex: ceil(-1.1) = -1.0
2079 */
2080 LLVMValueRef
2081 lp_build_ceil(struct lp_build_context *bld,
2082 LLVMValueRef a)
2083 {
2084 LLVMBuilderRef builder = bld->gallivm->builder;
2085 const struct lp_type type = bld->type;
2086
2087 assert(type.floating);
2088 assert(lp_check_value(type, a));
2089
2090 if (arch_rounding_available(type)) {
2091 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2092 }
2093 else {
2094 const struct lp_type type = bld->type;
2095 struct lp_type inttype;
2096 struct lp_build_context intbld;
2097 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2098 LLVMValueRef trunc, res, anosign, mask, tmp;
2099 LLVMTypeRef int_vec_type = bld->int_vec_type;
2100 LLVMTypeRef vec_type = bld->vec_type;
2101
2102 if (type.width != 32) {
2103 char intrinsic[32];
2104 util_snprintf(intrinsic, sizeof intrinsic, "llvm.ceil.v%uf%u", type.length, type.width);
2105 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2106 }
2107
2108 assert(type.width == 32); /* might want to handle doubles at some point */
2109
2110 inttype = type;
2111 inttype.floating = 0;
2112 lp_build_context_init(&intbld, bld->gallivm, inttype);
2113
2114 /* round by truncation */
2115 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2116 trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2117
2118 /*
2119 * fix values if rounding is wrong (for non-special cases)
2120 * - this is the case if trunc < a
2121 */
2122 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2123 /* tmp = trunc < a ? 1.0 : 0.0 */
2124 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2125 tmp = lp_build_and(&intbld, mask, tmp);
2126 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2127 res = lp_build_add(bld, trunc, tmp);
2128
2129 /* mask out sign bit */
2130 anosign = lp_build_abs(bld, a);
2131 /*
2132 * mask out all values if anosign > 2^24
2133 * This should work both for large ints (where all rounding is a no-op
2134 * because such floats are always exact) and for special cases like
2135 * NaNs, Infs (taking advantage of the fact they use the max exponent).
2136 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2137 */
2138 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2139 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2140 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2141 return lp_build_select(bld, mask, a, res);
2142 }
2143 }
2144
2145
2146 /**
2147 * Return fractional part of 'a' computed as a - floor(a)
2148 * Typically used in texture coord arithmetic.
2149 */
2150 LLVMValueRef
2151 lp_build_fract(struct lp_build_context *bld,
2152 LLVMValueRef a)
2153 {
2154 assert(bld->type.floating);
2155 return lp_build_sub(bld, a, lp_build_floor(bld, a));
2156 }
2157
2158
2159 /**
2160 * Prevent returning a fractional part of 1.0 for very small negative values of
2161 * 'a' by clamping against 0.99999(9).
2162 */
2163 static inline LLVMValueRef
2164 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2165 {
2166 LLVMValueRef max;
2167
2168 /* this is the largest number smaller than 1.0 representable as float */
2169 max = lp_build_const_vec(bld->gallivm, bld->type,
2170 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2171 return lp_build_min(bld, fract, max);
2172 }
2173
2174
2175 /**
2176 * Same as lp_build_fract, but guarantees that the result is always smaller
2177 * than one.
2178 */
2179 LLVMValueRef
2180 lp_build_fract_safe(struct lp_build_context *bld,
2181 LLVMValueRef a)
2182 {
2183 return clamp_fract(bld, lp_build_fract(bld, a));
2184 }
2185
2186
2187 /**
2188 * Return the integer part of a float (vector) value (== round toward zero).
2189 * The returned value is an integer (vector).
2190 * Ex: itrunc(-1.5) = -1
2191 */
2192 LLVMValueRef
2193 lp_build_itrunc(struct lp_build_context *bld,
2194 LLVMValueRef a)
2195 {
2196 LLVMBuilderRef builder = bld->gallivm->builder;
2197 const struct lp_type type = bld->type;
2198 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2199
2200 assert(type.floating);
2201 assert(lp_check_value(type, a));
2202
2203 return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2204 }
2205
2206
2207 /**
2208 * Return float (vector) rounded to nearest integer (vector). The returned
2209 * value is an integer (vector).
2210 * Ex: iround(0.9) = 1
2211 * Ex: iround(-1.5) = -2
2212 */
2213 LLVMValueRef
2214 lp_build_iround(struct lp_build_context *bld,
2215 LLVMValueRef a)
2216 {
2217 LLVMBuilderRef builder = bld->gallivm->builder;
2218 const struct lp_type type = bld->type;
2219 LLVMTypeRef int_vec_type = bld->int_vec_type;
2220 LLVMValueRef res;
2221
2222 assert(type.floating);
2223
2224 assert(lp_check_value(type, a));
2225
2226 if ((util_cpu_caps.has_sse2 &&
2227 ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2228 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2229 return lp_build_iround_nearest_sse2(bld, a);
2230 }
2231 if (arch_rounding_available(type)) {
2232 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2233 }
2234 else {
2235 LLVMValueRef half;
2236
2237 half = lp_build_const_vec(bld->gallivm, type, 0.5);
2238
2239 if (type.sign) {
2240 LLVMTypeRef vec_type = bld->vec_type;
2241 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2242 (unsigned long long)1 << (type.width - 1));
2243 LLVMValueRef sign;
2244
2245 /* get sign bit */
2246 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2247 sign = LLVMBuildAnd(builder, sign, mask, "");
2248
2249 /* sign * 0.5 */
2250 half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2251 half = LLVMBuildOr(builder, sign, half, "");
2252 half = LLVMBuildBitCast(builder, half, vec_type, "");
2253 }
2254
2255 res = LLVMBuildFAdd(builder, a, half, "");
2256 }
2257
2258 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2259
2260 return res;
2261 }
2262
2263
2264 /**
2265 * Return floor of float (vector), result is an int (vector)
2266 * Ex: ifloor(1.1) = 1
2267 * Ex: ifloor(-1.1) = -2
2268 */
2269 LLVMValueRef
2270 lp_build_ifloor(struct lp_build_context *bld,
2271 LLVMValueRef a)
2272 {
2273 LLVMBuilderRef builder = bld->gallivm->builder;
2274 const struct lp_type type = bld->type;
2275 LLVMTypeRef int_vec_type = bld->int_vec_type;
2276 LLVMValueRef res;
2277
2278 assert(type.floating);
2279 assert(lp_check_value(type, a));
2280
2281 res = a;
2282 if (type.sign) {
2283 if (arch_rounding_available(type)) {
2284 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2285 }
2286 else {
2287 struct lp_type inttype;
2288 struct lp_build_context intbld;
2289 LLVMValueRef trunc, itrunc, mask;
2290
2291 assert(type.floating);
2292 assert(lp_check_value(type, a));
2293
2294 inttype = type;
2295 inttype.floating = 0;
2296 lp_build_context_init(&intbld, bld->gallivm, inttype);
2297
2298 /* round by truncation */
2299 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2300 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2301
2302 /*
2303 * fix values if rounding is wrong (for non-special cases)
2304 * - this is the case if trunc > a
2305 * The results of doing this with NaNs, very large values etc.
2306 * are undefined but this seems to be the case anyway.
2307 */
2308 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2309 /* cheapie minus one with mask since the mask is minus one / zero */
2310 return lp_build_add(&intbld, itrunc, mask);
2311 }
2312 }
2313
2314 /* round to nearest (toward zero) */
2315 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2316
2317 return res;
2318 }
2319
2320
2321 /**
2322 * Return ceiling of float (vector), returning int (vector).
2323 * Ex: iceil( 1.1) = 2
2324 * Ex: iceil(-1.1) = -1
2325 */
2326 LLVMValueRef
2327 lp_build_iceil(struct lp_build_context *bld,
2328 LLVMValueRef a)
2329 {
2330 LLVMBuilderRef builder = bld->gallivm->builder;
2331 const struct lp_type type = bld->type;
2332 LLVMTypeRef int_vec_type = bld->int_vec_type;
2333 LLVMValueRef res;
2334
2335 assert(type.floating);
2336 assert(lp_check_value(type, a));
2337
2338 if (arch_rounding_available(type)) {
2339 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2340 }
2341 else {
2342 struct lp_type inttype;
2343 struct lp_build_context intbld;
2344 LLVMValueRef trunc, itrunc, mask;
2345
2346 assert(type.floating);
2347 assert(lp_check_value(type, a));
2348
2349 inttype = type;
2350 inttype.floating = 0;
2351 lp_build_context_init(&intbld, bld->gallivm, inttype);
2352
2353 /* round by truncation */
2354 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2355 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2356
2357 /*
2358 * fix values if rounding is wrong (for non-special cases)
2359 * - this is the case if trunc < a
2360 * The results of doing this with NaNs, very large values etc.
2361 * are undefined but this seems to be the case anyway.
2362 */
2363 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2364 /* cheapie plus one with mask since the mask is minus one / zero */
2365 return lp_build_sub(&intbld, itrunc, mask);
2366 }
2367
2368 /* round to nearest (toward zero) */
2369 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2370
2371 return res;
2372 }
2373
2374
2375 /**
2376 * Combined ifloor() & fract().
2377 *
2378 * Preferred to calling the functions separately, as it will ensure that the
2379 * strategy (floor() vs ifloor()) that results in less redundant work is used.
2380 */
2381 void
2382 lp_build_ifloor_fract(struct lp_build_context *bld,
2383 LLVMValueRef a,
2384 LLVMValueRef *out_ipart,
2385 LLVMValueRef *out_fpart)
2386 {
2387 LLVMBuilderRef builder = bld->gallivm->builder;
2388 const struct lp_type type = bld->type;
2389 LLVMValueRef ipart;
2390
2391 assert(type.floating);
2392 assert(lp_check_value(type, a));
2393
2394 if (arch_rounding_available(type)) {
2395 /*
2396 * floor() is easier.
2397 */
2398
2399 ipart = lp_build_floor(bld, a);
2400 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2401 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2402 }
2403 else {
2404 /*
2405 * ifloor() is easier.
2406 */
2407
2408 *out_ipart = lp_build_ifloor(bld, a);
2409 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2410 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2411 }
2412 }
2413
2414
2415 /**
2416 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2417 * always smaller than one.
2418 */
2419 void
2420 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2421 LLVMValueRef a,
2422 LLVMValueRef *out_ipart,
2423 LLVMValueRef *out_fpart)
2424 {
2425 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2426 *out_fpart = clamp_fract(bld, *out_fpart);
2427 }
2428
2429
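/**
 * Generate sqrt(a) via the llvm.sqrt.* intrinsic.
 */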
2430 LLVMValueRef
2431 lp_build_sqrt(struct lp_build_context *bld,
2432 LLVMValueRef a)
2433 {
2434 LLVMBuilderRef builder = bld->gallivm->builder;
2435 const struct lp_type type = bld->type;
2436 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2437 char intrinsic[32];
2438
2439 assert(lp_check_value(type, a));
2440
2441 /* TODO: optimize the constant case */
2442
2443 assert(type.floating);
2444 if (type.length == 1) {
2445 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.f%u", type.width);
2446 }
2447 else {
2448 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
2449 }
2450
2451 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2452 }
2453
2454
2455 /**
2456 * Do one Newton-Raphson step to improve reciprocal precision:
2457 *
2458 * x_{i+1} = x_i * (2 - a * x_i)
2459 *
2460 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2461 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
2462 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
2463 * halo. It would be necessary to clamp the argument to prevent this.
2464 *
2465 * See also:
2466 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2467 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2468 */
2469 static inline LLVMValueRef
2470 lp_build_rcp_refine(struct lp_build_context *bld,
2471 LLVMValueRef a,
2472 LLVMValueRef rcp_a)
2473 {
2474 LLVMBuilderRef builder = bld->gallivm->builder;
2475 LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
2476 LLVMValueRef res;
2477
2478 res = LLVMBuildFMul(builder, a, rcp_a, "");
2479 res = LLVMBuildFSub(builder, two, res, "");
2480 res = LLVMBuildFMul(builder, rcp_a, res, "");
2481
2482 return res;
2483 }
2484
2485
2486 LLVMValueRef
2487 lp_build_rcp(struct lp_build_context *bld,
2488 LLVMValueRef a)
2489 {
2490 LLVMBuilderRef builder = bld->gallivm->builder;
2491 const struct lp_type type = bld->type;
2492
2493 assert(lp_check_value(type, a));
2494
2495 if(a == bld->zero)
2496 return bld->undef;
2497 if(a == bld->one)
2498 return bld->one;
2499 if(a == bld->undef)
2500 return bld->undef;
2501
2502 assert(type.floating);
2503
2504 if(LLVMIsConstant(a))
2505 return LLVMConstFDiv(bld->one, a);
2506
2507 /*
2508 * We don't use RCPPS because:
2509 * - it only has 10 bits of precision
2510 * - it doesn't even get the reciprocal of 1.0 exactly
2511 * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2512 * - for recent processors the benefit over DIVPS is marginal and case
2513 * dependent
2514 *
2515 * We could still use it on certain processors if benchmarks show that the
2516 * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
2517 * particular uses that require fewer workarounds.
2518 */
2519
2520 if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2521 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2522 const unsigned num_iterations = 0;
2523 LLVMValueRef res;
2524 unsigned i;
2525 const char *intrinsic = NULL;
2526
2527 if (type.length == 4) {
2528 intrinsic = "llvm.x86.sse.rcp.ps";
2529 }
2530 else {
2531 intrinsic = "llvm.x86.avx.rcp.ps.256";
2532 }
2533
2534 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2535
2536 for (i = 0; i < num_iterations; ++i) {
2537 res = lp_build_rcp_refine(bld, a, res);
2538 }
2539
2540 return res;
2541 }
2542
2543 return LLVMBuildFDiv(builder, bld->one, a, "");
2544 }
2545
2546
2547 /**
2548 * Do one Newton-Raphson step to improve rsqrt precision:
2549 *
2550 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2551 *
2552 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2553 */
2554 static inline LLVMValueRef
2555 lp_build_rsqrt_refine(struct lp_build_context *bld,
2556 LLVMValueRef a,
2557 LLVMValueRef rsqrt_a)
2558 {
2559 LLVMBuilderRef builder = bld->gallivm->builder;
2560 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2561 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2562 LLVMValueRef res;
2563
2564 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2565 res = LLVMBuildFMul(builder, a, res, "");
2566 res = LLVMBuildFSub(builder, three, res, "");
2567 res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2568 res = LLVMBuildFMul(builder, half, res, "");
2569
2570 return res;
2571 }
2572
2573
2574 /**
2575 * Generate 1/sqrt(a).
2576 * Result is undefined for values < 0, infinity for +0.
2577 */
2578 LLVMValueRef
2579 lp_build_rsqrt(struct lp_build_context *bld,
2580 LLVMValueRef a)
2581 {
2582 const struct lp_type type = bld->type;
2583
2584 assert(lp_check_value(type, a));
2585
2586 assert(type.floating);
2587
2588 /*
2589 * This should be faster but all denormals will end up as infinity.
2590 */
2591 if (0 && lp_build_fast_rsqrt_available(type)) {
2592 const unsigned num_iterations = 1;
2593 LLVMValueRef res;
2594 unsigned i;
2595
2596 /* rsqrt(1.0) != 1.0 here */
2597 res = lp_build_fast_rsqrt(bld, a);
2598
2599 if (num_iterations) {
2600 /*
2601 * Newton-Raphson will result in NaN instead of infinity for zero,
2602 * and NaN instead of zero for infinity.
2603 * Also, need to ensure rsqrt(1.0) == 1.0.
2604 * All numbers smaller than FLT_MIN will result in +infinity
2605 * (rsqrtps treats all denormals as zero).
2606 */
2607 LLVMValueRef cmp;
2608 LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2609 LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);
2610
2611 for (i = 0; i < num_iterations; ++i) {
2612 res = lp_build_rsqrt_refine(bld, a, res);
2613 }
2614 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2615 res = lp_build_select(bld, cmp, inf, res);
2616 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2617 res = lp_build_select(bld, cmp, bld->zero, res);
2618 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2619 res = lp_build_select(bld, cmp, bld->one, res);
2620 }
2621
2622 return res;
2623 }
2624
2625 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2626 }
2627
2628 /**
2629 * Returns whether a fast (inaccurate) rsqrt instruction is available.
2630 * Callers may want to avoid rsqrt_fast when it isn't available: e.g. x^0.5
2631 * can be computed as rsqrt_fast(x) * x, but without a native rsqrt that
2632 * would expand to sqrt/div/mul, so it is obviously better to just call
2633 * sqrt, skipping both the div and the mul.
2634 */
2635 boolean
2636 lp_build_fast_rsqrt_available(struct lp_type type)
2637 {
2638 assert(type.floating);
2639
2640 if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2641 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2642 return true;
2643 }
2644 return false;
2645 }
2646
2647
2648 /**
2649 * Generate 1/sqrt(a).
2650 * Result is undefined for values < 0, infinity for +0.
2651 * Precision is limited, only ~10 bits guaranteed
2652 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2653 */
2654 LLVMValueRef
2655 lp_build_fast_rsqrt(struct lp_build_context *bld,
2656 LLVMValueRef a)
2657 {
2658 LLVMBuilderRef builder = bld->gallivm->builder;
2659 const struct lp_type type = bld->type;
2660
2661 assert(lp_check_value(type, a));
2662
2663 if (lp_build_fast_rsqrt_available(type)) {
2664 const char *intrinsic = NULL;
2665
2666 if (type.length == 4) {
2667 intrinsic = "llvm.x86.sse.rsqrt.ps";
2668 }
2669 else {
2670 intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2671 }
2672 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2673 }
2674 else {
2675 debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2676 }
2677 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2678 }
2679
2680
2681 /**
2682 * Generate sin(a) or cos(a) using polynomial approximation.
2683 * TODO: it might be worth recognizing sin and cos with the same source
2684 * (i.e. the d3d10 sincos opcode). Doing both at the same time would be
2685 * much cheaper than calculating (nearly) everything twice.
2686 * Not sure it's common enough to be worth bothering with, however; the
2687 * scs opcode could also benefit from calculating both.
2688 */
2689 static LLVMValueRef
2690 lp_build_sin_or_cos(struct lp_build_context *bld,
2691 LLVMValueRef a,
2692 boolean cos)
2693 {
2694 struct gallivm_state *gallivm = bld->gallivm;
2695 LLVMBuilderRef b = gallivm->builder;
2696 struct lp_type int_type = lp_int_type(bld->type);
2697
2698 /*
2699 * take the absolute value,
2700 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2701 */
2702
2703 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2704 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2705
2706 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2707 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2708
2709 /*
2710 * scale by 4/Pi
2711 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2712 */
2713
2714 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2715 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2716
2717 /*
2718 * store the integer part of y in mm0
2719 * emm2 = _mm_cvttps_epi32(y);
2720 */
2721
2722 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2723
2724 /*
2725 * j=(j+1) & (~1) (see the cephes sources)
2726 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2727 */
2728
2729 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2730 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2731 /*
2732 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2733 */
2734 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2735 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2736
2737 /*
2738 * y = _mm_cvtepi32_ps(emm2);
2739 */
2740 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2741
2742 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2743 LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2744 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2745 LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2746
2747 /*
2748 * Argument used for poly selection and sign bit determination
2749 * is different for sin vs. cos.
2750 */
2751 LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
2752 emm2_and;
2753
2754 LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
2755 LLVMBuildNot(b, emm2_2, ""), ""),
2756 const_29, "sign_bit") :
2757 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
2758 LLVMBuildShl(b, emm2_add,
2759 const_29, ""), ""),
2760 sign_mask, "sign_bit");
2761
2762 /*
2763 * get the polynomial selection mask
2764 * there is one polynomial for 0 <= x <= Pi/4
2765 * and another one for Pi/4 < x <= Pi/2
2766 * Both branches will be computed.
2767 *
2768 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2769 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2770 */
2771
2772 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
2773 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2774 int_type, PIPE_FUNC_EQUAL,
2775 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2776
2777 /*
2778 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2779 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2780 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2781 */
2782 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2783 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2784 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2785
2786 /*
2787 * The magic pass: "Extended precision modular arithmetic"
2788 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2789 * xmm1 = _mm_mul_ps(y, xmm1);
2790 * xmm2 = _mm_mul_ps(y, xmm2);
2791 * xmm3 = _mm_mul_ps(y, xmm3);
2792 */
2793 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
2794 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
2795 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
2796
2797 /*
2798 * x = _mm_add_ps(x, xmm1);
2799 * x = _mm_add_ps(x, xmm2);
2800 * x = _mm_add_ps(x, xmm3);
2801 */
2802
2803 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
2804 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
2805 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
2806
2807 /*
2808 * Evaluate the first polynomial (0 <= x <= Pi/4)
2809 *
2810 * z = _mm_mul_ps(x,x);
2811 */
2812 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2813
2814 /*
2815 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2816 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2817 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2818 */
2819 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2820 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2821 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2822
2823 /*
2824 * y = *(v4sf*)_ps_coscof_p0;
2825 * y = _mm_mul_ps(y, z);
2826 */
2827 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
2828 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
2829 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
2830 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
2831 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2832 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2833
2834
2835 /*
2836 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2837 * y = _mm_sub_ps(y, tmp);
2838 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2839 */
2840 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2841 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2842 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2843 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2844 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2845
2846 /*
2847 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2848 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2849 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2850 */
2851 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2852 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2853 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2854
2855 /*
2856 * Evaluate the second polynomial (Pi/4 <= x <= 0)
2857 *
2858 * y2 = *(v4sf*)_ps_sincof_p0;
2859 * y2 = _mm_mul_ps(y2, z);
2860 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2861 * y2 = _mm_mul_ps(y2, z);
2862 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2863 * y2 = _mm_mul_ps(y2, z);
2864 * y2 = _mm_mul_ps(y2, x);
2865 * y2 = _mm_add_ps(y2, x);
2866 */
2867
2868 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
2869 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
2870 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
2871 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
2872 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2873 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
2874 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
2875
2876 /*
2877 * select the correct result from the two polynomials
2878 * xmm3 = poly_mask;
2879 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2880 * y = _mm_andnot_ps(xmm3, y);
2881 * y = _mm_or_ps(y,y2);
2882 */
2883 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2884 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2885 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2886 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
2887 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2888 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
2889
2890 /*
2891 * update the sign
2892 * y = _mm_xor_ps(y, sign_bit);
2893 */
2894 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
2895 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
2896
2897 LLVMValueRef isfinite = lp_build_isfinite(bld, a);
2898
2899 /* clamp output to be within [-1, 1] */
2900 y_result = lp_build_clamp(bld, y_result,
2901 lp_build_const_vec(bld->gallivm, bld->type, -1.f),
2902 lp_build_const_vec(bld->gallivm, bld->type, 1.f));
2903 /* If a is -inf, inf or NaN then return NaN */
2904 y_result = lp_build_select(bld, isfinite, y_result,
2905 lp_build_const_vec(bld->gallivm, bld->type, NAN));
2906 return y_result;
2907 }
2908
2909
2910 /**
2911 * Generate sin(a)
2912 */
2913 LLVMValueRef
2914 lp_build_sin(struct lp_build_context *bld,
2915 LLVMValueRef a)
2916 {
2917 return lp_build_sin_or_cos(bld, a, FALSE);
2918 }
2919
2920
2921 /**
2922 * Generate cos(a)
2923 */
2924 LLVMValueRef
2925 lp_build_cos(struct lp_build_context *bld,
2926 LLVMValueRef a)
2927 {
2928 return lp_build_sin_or_cos(bld, a, TRUE);
2929 }
2930
2931
2932 /**
2933 * Generate pow(x, y)
2934 */
2935 LLVMValueRef
2936 lp_build_pow(struct lp_build_context *bld,
2937 LLVMValueRef x,
2938 LLVMValueRef y)
2939 {
2940 /* TODO: optimize the constant case */
2941 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2942 LLVMIsConstant(x) && LLVMIsConstant(y)) {
2943 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2944 __FUNCTION__);
2945 }
2946
2947 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
2948 }
2949
2950
2951 /**
2952 * Generate exp(x)
2953 */
2954 LLVMValueRef
2955 lp_build_exp(struct lp_build_context *bld,
2956 LLVMValueRef x)
2957 {
2958 /* log2(e) = 1/log(2) */
2959 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
2960 1.4426950408889634);
2961
2962 assert(lp_check_value(bld->type, x));
2963
2964 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
2965 }
2966
2967
2968 /**
2969 * Generate log(x)
2970 * Behavior is undefined with infs, 0s and nans
2971 */
2972 LLVMValueRef
2973 lp_build_log(struct lp_build_context *bld,
2974 LLVMValueRef x)
2975 {
2976 /* log(2) */
2977 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2978 0.69314718055994529);
2979
2980 assert(lp_check_value(bld->type, x));
2981
2982 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
2983 }
2984
2985 /**
2986 * Generate log(x) that handles edge cases (infs, 0s and nans)
2987 */
2988 LLVMValueRef
2989 lp_build_log_safe(struct lp_build_context *bld,
2990 LLVMValueRef x)
2991 {
2992 /* log(2) */
2993 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2994 0.69314718055994529);
2995
2996 assert(lp_check_value(bld->type, x));
2997
2998 return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
2999 }
3000
3001
3002 /**
3003 * Generate polynomial.
3004 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
3005 */
3006 LLVMValueRef
3007 lp_build_polynomial(struct lp_build_context *bld,
3008 LLVMValueRef x,
3009 const double *coeffs,
3010 unsigned num_coeffs)
3011 {
3012 const struct lp_type type = bld->type;
3013 LLVMValueRef even = NULL, odd = NULL;
3014 LLVMValueRef x2;
3015 unsigned i;
3016
3017 assert(lp_check_value(bld->type, x));
3018
3019 /* TODO: optimize the constant case */
3020 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3021 LLVMIsConstant(x)) {
3022 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3023 __FUNCTION__);
3024 }
3025
3026 /*
3027 * Calculate odd and even terms separately to decrease data dependency
3028 * Ex:
3029 * c[0] + x^2 * c[2] + x^4 * c[4] ...
3030 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
3031 */
3032 x2 = lp_build_mul(bld, x, x);
3033
3034 for (i = num_coeffs; i--; ) {
3035 LLVMValueRef coeff;
3036
3037 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
3038
3039 if (i % 2 == 0) {
3040 if (even)
3041 even = lp_build_add(bld, coeff, lp_build_mul(bld, x2, even));
3042 else
3043 even = coeff;
3044 } else {
3045 if (odd)
3046 odd = lp_build_add(bld, coeff, lp_build_mul(bld, x2, odd));
3047 else
3048 odd = coeff;
3049 }
3050 }
3051
3052 if (odd)
3053 return lp_build_add(bld, lp_build_mul(bld, odd, x), even);
3054 else if (even)
3055 return even;
3056 else
3057 return bld->undef;
3058 }
3059
3060
3061 /**
3062 * Minimax polynomial fit of 2**x, in range [0, 1[
3063 */
3064 const double lp_build_exp2_polynomial[] = {
3065 #if EXP_POLY_DEGREE == 5
3066 1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
3067 0.693153073200168932794,
3068 0.240153617044375388211,
3069 0.0558263180532956664775,
3070 0.00898934009049466391101,
3071 0.00187757667519147912699
3072 #elif EXP_POLY_DEGREE == 4
3073 1.00000259337069434683,
3074 0.693003834469974940458,
3075 0.24144275689150793076,
3076 0.0520114606103070150235,
3077 0.0135341679161270268764
3078 #elif EXP_POLY_DEGREE == 3
3079 0.999925218562710312959,
3080 0.695833540494823811697,
3081 0.226067155427249155588,
3082 0.0780245226406372992967
3083 #elif EXP_POLY_DEGREE == 2
3084 1.00172476321474503578,
3085 0.657636275736077639316,
3086 0.33718943461968720704
3087 #else
3088 #error
3089 #endif
3090 };
3091
3092
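/**
 * Generate 2^x.
 * Clamps x to [-126.99999, 128] (preserving NaN), splits it into integer and
 * fractional parts, constructs 2^ipart by shifting the biased exponent into
 * place, and evaluates the minimax polynomial above on the fractional part.
 */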
3093 LLVMValueRef
3094 lp_build_exp2(struct lp_build_context *bld,
3095 LLVMValueRef x)
3096 {
3097 LLVMBuilderRef builder = bld->gallivm->builder;
3098 const struct lp_type type = bld->type;
3099 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3100 LLVMValueRef ipart = NULL;
3101 LLVMValueRef fpart = NULL;
3102 LLVMValueRef expipart = NULL;
3103 LLVMValueRef expfpart = NULL;
3104 LLVMValueRef res = NULL;
3105
3106 assert(lp_check_value(bld->type, x));
3107
3108 /* TODO: optimize the constant case */
3109 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3110 LLVMIsConstant(x)) {
3111 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3112 __FUNCTION__);
3113 }
3114
3115 assert(type.floating && type.width == 32);
3116
3117 /* We want to preserve NaN and make sure that for exp2, if x > 128 the
3118 * result is INF, and if it's smaller than -126.9 the result is 0. */
3119 x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
3120 GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3121 x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
3122 x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3123
3124 /* ipart = floor(x) */
3125 /* fpart = x - ipart */
3126 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3127
3128 /* expipart = (float) (1 << ipart) */
3129 expipart = LLVMBuildAdd(builder, ipart,
3130 lp_build_const_int_vec(bld->gallivm, type, 127), "");
3131 expipart = LLVMBuildShl(builder, expipart,
3132 lp_build_const_int_vec(bld->gallivm, type, 23), "");
3133 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
3134
3135 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3136 Elements(lp_build_exp2_polynomial));
3137
3138 res = LLVMBuildFMul(builder, expipart, expfpart, "");
3139
3140 return res;
3141 }
3142
3143
3144
3145 /**
3146 * Extract the exponent of an IEEE-754 floating point value.
3147 *
3148 * Optionally apply an integer bias.
3149 *
3150 * Result is an integer value with
3151 *
3152 * ifloor(log2(x)) + bias
3153 */
3154 LLVMValueRef
3155 lp_build_extract_exponent(struct lp_build_context *bld,
3156 LLVMValueRef x,
3157 int bias)
3158 {
3159 LLVMBuilderRef builder = bld->gallivm->builder;
3160 const struct lp_type type = bld->type;
3161 unsigned mantissa = lp_mantissa(type);
3162 LLVMValueRef res;
3163
3164 assert(type.floating);
3165
3166 assert(lp_check_value(bld->type, x));
3167
3168 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3169
3170 res = LLVMBuildLShr(builder, x,
3171 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3172 res = LLVMBuildAnd(builder, res,
3173 lp_build_const_int_vec(bld->gallivm, type, 255), "");
3174 res = LLVMBuildSub(builder, res,
3175 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3176
3177 return res;
3178 }
3179
3180
3181 /**
3182 * Extract the mantissa of an IEEE-754 floating point value.
3183 *
3184 * Result is a floating point value with
3185 *
3186 * x / 2**floor(log2(x))
3187 */
3188 LLVMValueRef
3189 lp_build_extract_mantissa(struct lp_build_context *bld,
3190 LLVMValueRef x)
3191 {
3192 LLVMBuilderRef builder = bld->gallivm->builder;
3193 const struct lp_type type = bld->type;
3194 unsigned mantissa = lp_mantissa(type);
3195 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3196 (1ULL << mantissa) - 1);
3197 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3198 LLVMValueRef res;
3199
3200 assert(lp_check_value(bld->type, x));
3201
3202 assert(type.floating);
3203
3204 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3205
3206 /* res = x / 2**ipart */
3207 res = LLVMBuildAnd(builder, x, mantmask, "");
3208 res = LLVMBuildOr(builder, res, one, "");
3209 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3210
3211 return res;
3212 }
3213
3214
3215
3216 /**
3217 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in range [0, 1/9[
3218 * These coefficients can be generated with
3219 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3220 */
3221 const double lp_build_log2_polynomial[] = {
3222 #if LOG_POLY_DEGREE == 5
3223 2.88539008148777786488L,
3224 0.961796878841293367824L,
3225 0.577058946784739859012L,
3226 0.412914355135828735411L,
3227 0.308591899232910175289L,
3228 0.352376952300281371868L,
3229 #elif LOG_POLY_DEGREE == 4
3230 2.88539009343309178325L,
3231 0.961791550404184197881L,
3232 0.577440339438736392009L,
3233 0.403343858251329912514L,
3234 0.406718052498846252698L,
3235 #elif LOG_POLY_DEGREE == 3
3236 2.88538959748872753838L,
3237 0.961932915889597772928L,
3238 0.571118517972136195241L,
3239 0.493997535084709500285L,
3240 #else
3241 #error
3242 #endif
3243 };
3244
3245 /**
3246 * See http://www.devmaster.net/forums/showthread.php?p=43580
3247 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3248 * http://www.nezumi.demon.co.uk/consult/logx.htm
3249 *
3250 * If handle_edge_cases is true the function will perform computations
3251 * to match the required D3D10+ behavior for each of the edge cases.
3252 * That means that if input is:
3253 * - less than zero (down to and including -inf) then NaN will be returned
3254 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3255 * - +infinity, then +infinity will be returned
3256 * - NaN, then NaN will be returned
3257 *
3258 * Those checks are fairly expensive so if you don't need them make sure
3259 * handle_edge_cases is false.
3260 */
3261 void
3262 lp_build_log2_approx(struct lp_build_context *bld,
3263 LLVMValueRef x,
3264 LLVMValueRef *p_exp,
3265 LLVMValueRef *p_floor_log2,
3266 LLVMValueRef *p_log2,
3267 boolean handle_edge_cases)
3268 {
3269 LLVMBuilderRef builder = bld->gallivm->builder;
3270 const struct lp_type type = bld->type;
3271 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3272 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3273
3274 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3275 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3276 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3277
3278 LLVMValueRef i = NULL;
3279 LLVMValueRef y = NULL;
3280 LLVMValueRef z = NULL;
3281 LLVMValueRef exp = NULL;
3282 LLVMValueRef mant = NULL;
3283 LLVMValueRef logexp = NULL;
3284 LLVMValueRef logmant = NULL;
3285 LLVMValueRef res = NULL;
3286
3287 assert(lp_check_value(bld->type, x));
3288
3289 if(p_exp || p_floor_log2 || p_log2) {
3290 /* TODO: optimize the constant case */
3291 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3292 LLVMIsConstant(x)) {
3293 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3294 __FUNCTION__);
3295 }
3296
3297 assert(type.floating && type.width == 32);
3298
3299 /*
3300 * We don't explicitly handle denormalized numbers. They will yield a
3301 * result in the neighbourhood of -127, which appears to be
3302 * adequate.
3303 */
3304
3305 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3306
3307 /* exp = (float) exponent(x) */
3308 exp = LLVMBuildAnd(builder, i, expmask, "");
3309 }
3310
3311 if(p_floor_log2 || p_log2) {
3312 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3313 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3314 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3315 }
3316
3317 if (p_log2) {
3318 /* mant = 1 + (float) mantissa(x) */
3319 mant = LLVMBuildAnd(builder, i, mantmask, "");
3320 mant = LLVMBuildOr(builder, mant, one, "");
3321 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3322
3323 /* y = (mant - 1) / (mant + 1) */
3324 y = lp_build_div(bld,
3325 lp_build_sub(bld, mant, bld->one),
3326 lp_build_add(bld, mant, bld->one)
3327 );
3328
3329 /* z = y^2 */
3330 z = lp_build_mul(bld, y, y);
3331
3332 /* compute P(z) */
3333 logmant = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3334 Elements(lp_build_log2_polynomial));
3335
3336 /* logmant = y * P(z) */
3337 logmant = lp_build_mul(bld, y, logmant);
3338
3339 res = lp_build_add(bld, logmant, logexp);
3340
3341 if (type.floating && handle_edge_cases) {
3342 LLVMValueRef negmask, infmask, zmask;
3343 negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3344 lp_build_const_vec(bld->gallivm, type, 0.0f));
3345 zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3346 lp_build_const_vec(bld->gallivm, type, 0.0f));
3347 infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3348 lp_build_const_vec(bld->gallivm, type, INFINITY));
3349
3350 /* If x is equal to inf make sure we return inf */
3351 res = lp_build_select(bld, infmask,
3352 lp_build_const_vec(bld->gallivm, type, INFINITY),
3353 res);
3354 /* If x is equal to 0, return -inf */
3355 res = lp_build_select(bld, zmask,
3356 lp_build_const_vec(bld->gallivm, type, -INFINITY),
3357 res);
3358 /* If x is NaN or less than 0, return NaN */
3359 res = lp_build_select(bld, negmask,
3360 lp_build_const_vec(bld->gallivm, type, NAN),
3361 res);
3362 }
3363 }
3364
3365 if (p_exp) {
3366 exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3367 *p_exp = exp;
3368 }
3369
3370 if (p_floor_log2)
3371 *p_floor_log2 = logexp;
3372
3373 if (p_log2)
3374 *p_log2 = res;
3375 }
3376
3377
3378 /*
3379 * log2 implementation which doesn't have special code to
3380 * handle edge cases (-inf, 0, inf, NaN). It's faster but
3381 * the results for those cases are undefined.
3382 */
3383 LLVMValueRef
3384 lp_build_log2(struct lp_build_context *bld,
3385 LLVMValueRef x)
3386 {
3387 LLVMValueRef res;
3388 lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
3389 return res;
3390 }
3391
3392 /*
3393 * Version of log2 which handles all edge cases.
3394 * Look at documentation of lp_build_log2_approx for
3395 * description of the behavior for each of the edge cases.
3396 */
3397 LLVMValueRef
3398 lp_build_log2_safe(struct lp_build_context *bld,
3399 LLVMValueRef x)
3400 {
3401 LLVMValueRef res;
3402 lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
3403 return res;
3404 }
3405
3406
3407 /**
3408 * Faster (and less accurate) log2.
3409 *
3410 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3411 *
3412 * Piece-wise linear approximation, with exact results when x is a
3413 * power of two.
3414 *
3415 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3416 */
3417 LLVMValueRef
3418 lp_build_fast_log2(struct lp_build_context *bld,
3419 LLVMValueRef x)
3420 {
3421 LLVMBuilderRef builder = bld->gallivm->builder;
3422 LLVMValueRef ipart;
3423 LLVMValueRef fpart;
3424
3425 assert(lp_check_value(bld->type, x));
3426
3427 assert(bld->type.floating);
3428
3429 /* ipart = floor(log2(x)) - 1 */
3430 ipart = lp_build_extract_exponent(bld, x, -1);
3431 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3432
3433 /* fpart = x / 2**ipart */
3434 fpart = lp_build_extract_mantissa(bld, x);
3435
3436 /* ipart + fpart */
3437 return LLVMBuildFAdd(builder, ipart, fpart, "");
3438 }
3439
3440
3441 /**
3442 * Fast implementation of iround(log2(x)).
3443 *
3444 * Not an approximation -- it should give accurate results all the time.
3445 */
3446 LLVMValueRef
3447 lp_build_ilog2(struct lp_build_context *bld,
3448 LLVMValueRef x)
3449 {
3450 LLVMBuilderRef builder = bld->gallivm->builder;
3451 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3452 LLVMValueRef ipart;
3453
3454 assert(bld->type.floating);
3455
3456 assert(lp_check_value(bld->type, x));
3457
3458 /* x * 2^(0.5) i.e., add 0.5 to the log2(x) */
3459 x = LLVMBuildFMul(builder, x, sqrt2, "");
3460
3461 /* ipart = floor(log2(x) + 0.5) */
3462 ipart = lp_build_extract_exponent(bld, x, 0);
3463
3464 return ipart;
3465 }
3466
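/**
 * Generate x mod y, using FRem for floating point types and SRem/URem for
 * signed/unsigned integer types respectively.
 */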
3467 LLVMValueRef
3468 lp_build_mod(struct lp_build_context *bld,
3469 LLVMValueRef x,
3470 LLVMValueRef y)
3471 {
3472 LLVMBuilderRef builder = bld->gallivm->builder;
3473 LLVMValueRef res;
3474 const struct lp_type type = bld->type;
3475
3476 assert(lp_check_value(type, x));
3477 assert(lp_check_value(type, y));
3478
3479 if (type.floating)
3480 res = LLVMBuildFRem(builder, x, y, "");
3481 else if (type.sign)
3482 res = LLVMBuildSRem(builder, x, y, "");
3483 else
3484 res = LLVMBuildURem(builder, x, y, "");
3485 return res;
3486 }
3487
3488
3489 /*
3490 * For floating inputs it creates and returns a mask
3491 * which is all 1's for channels which are NaN.
3492 * Channels inside x which are not NaN will be 0.
3493 */
3494 LLVMValueRef
3495 lp_build_isnan(struct lp_build_context *bld,
3496 LLVMValueRef x)
3497 {
3498 LLVMValueRef mask;
3499 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3500
3501 assert(bld->type.floating);
3502 assert(lp_check_value(bld->type, x));
3503
3504 mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3505 "isnotnan");
3506 mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3507 mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3508 return mask;
3509 }
3510
3511 /* Returns all 1's for floating point numbers that are
3512 * finite, and returns all zeros for -inf,
3513 * inf and NaNs */
3514 LLVMValueRef
3515 lp_build_isfinite(struct lp_build_context *bld,
3516 LLVMValueRef x)
3517 {
3518 LLVMBuilderRef builder = bld->gallivm->builder;
3519 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3520 struct lp_type int_type = lp_int_type(bld->type);
3521 LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3522 LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3523 0x7f800000);
3524
3525 if (!bld->type.floating) {
3526 return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3527 }
3528 assert(bld->type.floating);
3529 assert(lp_check_value(bld->type, x));
3530 assert(bld->type.width == 32);
3531
3532 intx = LLVMBuildAnd(builder, intx, infornan32, "");
3533 return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3534 intx, infornan32);
3535 }
3536
3537 /*
3538 * Returns true if the number is nan or inf and false otherwise.
3539 * The input has to be a floating point vector.
3540 */
3541 LLVMValueRef
3542 lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
3543 const struct lp_type type,
3544 LLVMValueRef x)
3545 {
3546 LLVMBuilderRef builder = gallivm->builder;
3547 struct lp_type int_type = lp_int_type(type);
3548 LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
3549 0x7f800000);
3550 LLVMValueRef ret;
3551
3552 assert(type.floating);
3553
3554 ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
3555 ret = LLVMBuildAnd(builder, ret, const0, "");
3556 ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
3557 ret, const0);
3558
3559 return ret;
3560 }
3561
3562
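/**
 * Save the current x86 FP state (MXCSR) to memory with stmxcsr and return a
 * pointer to it, or 0 when SSE is not available.
 */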
3563 LLVMValueRef
3564 lp_build_fpstate_get(struct gallivm_state *gallivm)
3565 {
3566 if (util_cpu_caps.has_sse) {
3567 LLVMBuilderRef builder = gallivm->builder;
3568 LLVMValueRef mxcsr_ptr = lp_build_alloca(
3569 gallivm,
3570 LLVMInt32TypeInContext(gallivm->context),
3571 "mxcsr_ptr");
3572 LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
3573 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3574 lp_build_intrinsic(builder,
3575 "llvm.x86.sse.stmxcsr",
3576 LLVMVoidTypeInContext(gallivm->context),
3577 &mxcsr_ptr8, 1, 0);
3578 return mxcsr_ptr;
3579 }
3580 return 0;
3581 }
3582
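/**
 * Enable or disable flush-to-zero (and denormals-are-zero where the CPU
 * supports it) by updating the saved MXCSR value and reloading it.
 */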
3583 void
3584 lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
3585 boolean zero)
3586 {
3587 if (util_cpu_caps.has_sse) {
3588 /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
3589 int daz_ftz = _MM_FLUSH_ZERO_MASK;
3590
3591 LLVMBuilderRef builder = gallivm->builder;
3592 LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
3593 LLVMValueRef mxcsr =
3594 LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");
3595
3596 if (util_cpu_caps.has_daz) {
3597 /* Enable denormals are zero mode */
3598 daz_ftz |= _MM_DENORMALS_ZERO_MASK;
3599 }
3600 if (zero) {
3601 mxcsr = LLVMBuildOr(builder, mxcsr,
3602 LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
3603 } else {
3604 mxcsr = LLVMBuildAnd(builder, mxcsr,
3605 LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
3606 }
3607
3608 LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
3609 lp_build_fpstate_set(gallivm, mxcsr_ptr);
3610 }
3611 }
3612
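/**
 * Restore a previously saved x86 FP state (MXCSR) with ldmxcsr.
 */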
3613 void
3614 lp_build_fpstate_set(struct gallivm_state *gallivm,
3615 LLVMValueRef mxcsr_ptr)
3616 {
3617 if (util_cpu_caps.has_sse) {
3618 LLVMBuilderRef builder = gallivm->builder;
3619 mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
3620 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3621 lp_build_intrinsic(builder,
3622 "llvm.x86.sse.ldmxcsr",
3623 LLVMVoidTypeInContext(gallivm->context),
3624 &mxcsr_ptr, 1, 0);
3625 }
3626 }