gallivm: fix somewhat broken NaN behavior for exp2
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34  * notably min/max and saturated operations), and it is often necessary to
35  * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38  * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include <float.h>
49
50 #include "util/u_memory.h"
51 #include "util/u_debug.h"
52 #include "util/u_math.h"
53 #include "util/u_string.h"
54 #include "util/u_cpu_detect.h"
55
56 #include "lp_bld_type.h"
57 #include "lp_bld_const.h"
58 #include "lp_bld_init.h"
59 #include "lp_bld_intr.h"
60 #include "lp_bld_logic.h"
61 #include "lp_bld_pack.h"
62 #include "lp_bld_debug.h"
63 #include "lp_bld_bitarit.h"
64 #include "lp_bld_arit.h"
65 #include "lp_bld_flow.h"
66
67 #if defined(PIPE_ARCH_SSE)
68 #include <xmmintrin.h>
69 #endif
70
71 #ifndef _MM_DENORMALS_ZERO_MASK
72 #define _MM_DENORMALS_ZERO_MASK 0x0040
73 #endif
74
75 #ifndef _MM_FLUSH_ZERO_MASK
76 #define _MM_FLUSH_ZERO_MASK 0x8000
77 #endif
78
79 #define EXP_POLY_DEGREE 5
80
81 #define LOG_POLY_DEGREE 4
82
83
84 /**
85 * Generate min(a, b)
86  * No checks for the special case values of a or b being 1 or 0 are done.
87  * NaNs are handled according to the behavior specified by the
88 * nan_behavior argument.
89 */
90 static LLVMValueRef
91 lp_build_min_simple(struct lp_build_context *bld,
92 LLVMValueRef a,
93 LLVMValueRef b,
94 enum gallivm_nan_behavior nan_behavior)
95 {
96 const struct lp_type type = bld->type;
97 const char *intrinsic = NULL;
98 unsigned intr_size = 0;
99 LLVMValueRef cond;
100
101 assert(lp_check_value(type, a));
102 assert(lp_check_value(type, b));
103
104 /* TODO: optimize the constant case */
105
106 if (type.floating && util_cpu_caps.has_sse) {
107 if (type.width == 32) {
108 if (type.length == 1) {
109 intrinsic = "llvm.x86.sse.min.ss";
110 intr_size = 128;
111 }
112 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
113 intrinsic = "llvm.x86.sse.min.ps";
114 intr_size = 128;
115 }
116 else {
117 intrinsic = "llvm.x86.avx.min.ps.256";
118 intr_size = 256;
119 }
120 }
121 if (type.width == 64 && util_cpu_caps.has_sse2) {
122 if (type.length == 1) {
123 intrinsic = "llvm.x86.sse2.min.sd";
124 intr_size = 128;
125 }
126 else if (type.length == 2 || !util_cpu_caps.has_avx) {
127 intrinsic = "llvm.x86.sse2.min.pd";
128 intr_size = 128;
129 }
130 else {
131 intrinsic = "llvm.x86.avx.min.pd.256";
132 intr_size = 256;
133 }
134 }
135 }
136 else if (type.floating && util_cpu_caps.has_altivec) {
137 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
138 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
139 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
140 __FUNCTION__);
141 }
142 if (type.width == 32 && type.length == 4) {
143 intrinsic = "llvm.ppc.altivec.vminfp";
144 intr_size = 128;
145 }
146 } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
147 intr_size = 128;
148 if ((type.width == 8 || type.width == 16) &&
149 (type.width * type.length <= 64) &&
150 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
151 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
152 __FUNCTION__);
153 }
154 if (type.width == 8 && !type.sign) {
155 intrinsic = "llvm.x86.sse2.pminu.b";
156 }
157 else if (type.width == 16 && type.sign) {
158 intrinsic = "llvm.x86.sse2.pmins.w";
159 }
160 if (util_cpu_caps.has_sse4_1) {
161 if (type.width == 8 && type.sign) {
162 intrinsic = "llvm.x86.sse41.pminsb";
163 }
164 if (type.width == 16 && !type.sign) {
165 intrinsic = "llvm.x86.sse41.pminuw";
166 }
167 if (type.width == 32 && !type.sign) {
168 intrinsic = "llvm.x86.sse41.pminud";
169 }
170 if (type.width == 32 && type.sign) {
171 intrinsic = "llvm.x86.sse41.pminsd";
172 }
173 }
174 } else if (util_cpu_caps.has_altivec) {
175 intr_size = 128;
176 if (type.width == 8) {
177 if (!type.sign) {
178 intrinsic = "llvm.ppc.altivec.vminub";
179 } else {
180 intrinsic = "llvm.ppc.altivec.vminsb";
181 }
182 } else if (type.width == 16) {
183 if (!type.sign) {
184 intrinsic = "llvm.ppc.altivec.vminuh";
185 } else {
186 intrinsic = "llvm.ppc.altivec.vminsh";
187 }
188 } else if (type.width == 32) {
189 if (!type.sign) {
190 intrinsic = "llvm.ppc.altivec.vminuw";
191 } else {
192 intrinsic = "llvm.ppc.altivec.vminsw";
193 }
194 }
195 }
196
197 if(intrinsic) {
198       /* We need to handle NaNs for floating point numbers. If one of the
199        * inputs is NaN the other should be returned (required by both D3D10+
200        * and OpenCL).
201        * The sse intrinsics return the second operand in case of NaN by
202        * default, so we need special code to handle those.
203 */
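      /*
       * Note: minps/maxps compute "a < b ? a : b" / "a > b ? a : b", so if
       * either operand is a NaN the compare is false and the second operand
       * is returned: min(NaN, x) = x but min(x, NaN) = NaN. The
       * nan_behavior-specific selects below compensate for this.
       */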
204 if (util_cpu_caps.has_sse && type.floating &&
205 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
206 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
207 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
208 LLVMValueRef isnan, max;
209 max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
210 type,
211 intr_size, a, b);
212 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
213 isnan = lp_build_isnan(bld, b);
214 return lp_build_select(bld, isnan, a, max);
215 } else {
216 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
217 isnan = lp_build_isnan(bld, a);
218 return lp_build_select(bld, isnan, a, max);
219 }
220 } else {
221 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
222 type,
223 intr_size, a, b);
224 }
225 }
226
227 if (type.floating) {
228 switch (nan_behavior) {
229 case GALLIVM_NAN_RETURN_NAN: {
230 LLVMValueRef isnan = lp_build_isnan(bld, b);
231 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
232 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
233 return lp_build_select(bld, cond, a, b);
234 }
235 break;
236 case GALLIVM_NAN_RETURN_OTHER: {
237 LLVMValueRef isnan = lp_build_isnan(bld, a);
238 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
239 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
240 return lp_build_select(bld, cond, a, b);
241 }
242 break;
243 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
244 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
245 return lp_build_select(bld, cond, a, b);
246 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
247 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
248 return lp_build_select(bld, cond, b, a);
249 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
250 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
251 return lp_build_select(bld, cond, a, b);
252 break;
253 default:
254 assert(0);
255 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
256 return lp_build_select(bld, cond, a, b);
257 }
258 } else {
259 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
260 return lp_build_select(bld, cond, a, b);
261 }
262 }
263
264
265 /**
266 * Generate max(a, b)
267  * No checks for the special case values of a or b being 1 or 0 are done.
268  * NaNs are handled according to the behavior specified by the
269 * nan_behavior argument.
270 */
271 static LLVMValueRef
272 lp_build_max_simple(struct lp_build_context *bld,
273 LLVMValueRef a,
274 LLVMValueRef b,
275 enum gallivm_nan_behavior nan_behavior)
276 {
277 const struct lp_type type = bld->type;
278 const char *intrinsic = NULL;
279 unsigned intr_size = 0;
280 LLVMValueRef cond;
281
282 assert(lp_check_value(type, a));
283 assert(lp_check_value(type, b));
284
285 /* TODO: optimize the constant case */
286
287 if (type.floating && util_cpu_caps.has_sse) {
288 if (type.width == 32) {
289 if (type.length == 1) {
290 intrinsic = "llvm.x86.sse.max.ss";
291 intr_size = 128;
292 }
293 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
294 intrinsic = "llvm.x86.sse.max.ps";
295 intr_size = 128;
296 }
297 else {
298 intrinsic = "llvm.x86.avx.max.ps.256";
299 intr_size = 256;
300 }
301 }
302 if (type.width == 64 && util_cpu_caps.has_sse2) {
303 if (type.length == 1) {
304 intrinsic = "llvm.x86.sse2.max.sd";
305 intr_size = 128;
306 }
307 else if (type.length == 2 || !util_cpu_caps.has_avx) {
308 intrinsic = "llvm.x86.sse2.max.pd";
309 intr_size = 128;
310 }
311 else {
312 intrinsic = "llvm.x86.avx.max.pd.256";
313 intr_size = 256;
314 }
315 }
316 }
317 else if (type.floating && util_cpu_caps.has_altivec) {
318 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
319 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
320 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
321 __FUNCTION__);
322 }
323       if (type.width == 32 && type.length == 4) {
324 intrinsic = "llvm.ppc.altivec.vmaxfp";
325 intr_size = 128;
326 }
327 } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
328 intr_size = 128;
329 if ((type.width == 8 || type.width == 16) &&
330 (type.width * type.length <= 64) &&
331 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
332 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
333 __FUNCTION__);
334 }
335 if (type.width == 8 && !type.sign) {
336 intrinsic = "llvm.x86.sse2.pmaxu.b";
337 intr_size = 128;
338 }
339 else if (type.width == 16 && type.sign) {
340 intrinsic = "llvm.x86.sse2.pmaxs.w";
341 }
342 if (util_cpu_caps.has_sse4_1) {
343 if (type.width == 8 && type.sign) {
344 intrinsic = "llvm.x86.sse41.pmaxsb";
345 }
346 if (type.width == 16 && !type.sign) {
347 intrinsic = "llvm.x86.sse41.pmaxuw";
348 }
349 if (type.width == 32 && !type.sign) {
350 intrinsic = "llvm.x86.sse41.pmaxud";
351 }
352 if (type.width == 32 && type.sign) {
353 intrinsic = "llvm.x86.sse41.pmaxsd";
354 }
355 }
356 } else if (util_cpu_caps.has_altivec) {
357 intr_size = 128;
358 if (type.width == 8) {
359 if (!type.sign) {
360 intrinsic = "llvm.ppc.altivec.vmaxub";
361 } else {
362 intrinsic = "llvm.ppc.altivec.vmaxsb";
363 }
364 } else if (type.width == 16) {
365 if (!type.sign) {
366 intrinsic = "llvm.ppc.altivec.vmaxuh";
367 } else {
368 intrinsic = "llvm.ppc.altivec.vmaxsh";
369 }
370 } else if (type.width == 32) {
371 if (!type.sign) {
372 intrinsic = "llvm.ppc.altivec.vmaxuw";
373 } else {
374 intrinsic = "llvm.ppc.altivec.vmaxsw";
375 }
376 }
377 }
378
379 if(intrinsic) {
380 if (util_cpu_caps.has_sse && type.floating &&
381 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
382 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
383 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
384 LLVMValueRef isnan, min;
385 min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
386 type,
387 intr_size, a, b);
388 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
389 isnan = lp_build_isnan(bld, b);
390 return lp_build_select(bld, isnan, a, min);
391 } else {
392 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
393 isnan = lp_build_isnan(bld, a);
394 return lp_build_select(bld, isnan, a, min);
395 }
396 } else {
397 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
398 type,
399 intr_size, a, b);
400 }
401 }
402
403 if (type.floating) {
404 switch (nan_behavior) {
405 case GALLIVM_NAN_RETURN_NAN: {
406 LLVMValueRef isnan = lp_build_isnan(bld, b);
407 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
408 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
409 return lp_build_select(bld, cond, a, b);
410 }
411 break;
412 case GALLIVM_NAN_RETURN_OTHER: {
413 LLVMValueRef isnan = lp_build_isnan(bld, a);
414 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
415 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
416 return lp_build_select(bld, cond, a, b);
417 }
418 break;
419 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
420 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
421 return lp_build_select(bld, cond, a, b);
422 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
423 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
424 return lp_build_select(bld, cond, b, a);
425 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
426 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
427 return lp_build_select(bld, cond, a, b);
428 break;
429 default:
430 assert(0);
431 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
432 return lp_build_select(bld, cond, a, b);
433 }
434 } else {
435 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
436 return lp_build_select(bld, cond, a, b);
437 }
438 }
439
440
441 /**
442 * Generate 1 - a, or ~a depending on bld->type.
443 */
444 LLVMValueRef
445 lp_build_comp(struct lp_build_context *bld,
446 LLVMValueRef a)
447 {
448 LLVMBuilderRef builder = bld->gallivm->builder;
449 const struct lp_type type = bld->type;
450
451 assert(lp_check_value(type, a));
452
453 if(a == bld->one)
454 return bld->zero;
455 if(a == bld->zero)
456 return bld->one;
457
458 if(type.norm && !type.floating && !type.fixed && !type.sign) {
459 if(LLVMIsConstant(a))
460 return LLVMConstNot(a);
461 else
462 return LLVMBuildNot(builder, a, "");
463 }
464
465 if(LLVMIsConstant(a))
466 if (type.floating)
467 return LLVMConstFSub(bld->one, a);
468 else
469 return LLVMConstSub(bld->one, a);
470 else
471 if (type.floating)
472 return LLVMBuildFSub(builder, bld->one, a, "");
473 else
474 return LLVMBuildSub(builder, bld->one, a, "");
475 }
476
477
478 /**
479 * Generate a + b
480 */
481 LLVMValueRef
482 lp_build_add(struct lp_build_context *bld,
483 LLVMValueRef a,
484 LLVMValueRef b)
485 {
486 LLVMBuilderRef builder = bld->gallivm->builder;
487 const struct lp_type type = bld->type;
488 LLVMValueRef res;
489
490 assert(lp_check_value(type, a));
491 assert(lp_check_value(type, b));
492
493 if(a == bld->zero)
494 return b;
495 if(b == bld->zero)
496 return a;
497 if(a == bld->undef || b == bld->undef)
498 return bld->undef;
499
500 if(bld->type.norm) {
501 const char *intrinsic = NULL;
502
503 if(a == bld->one || b == bld->one)
504 return bld->one;
505
506 if (type.width * type.length == 128 &&
507 !type.floating && !type.fixed) {
508 if(util_cpu_caps.has_sse2) {
509 if(type.width == 8)
510 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
511 if(type.width == 16)
512 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
513 } else if (util_cpu_caps.has_altivec) {
514 if(type.width == 8)
515 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
516 if(type.width == 16)
517 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
518 }
519 }
520
521 if(intrinsic)
522 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
523 }
524
525 /* TODO: handle signed case */
526 if(type.norm && !type.floating && !type.fixed && !type.sign)
527 a = lp_build_min_simple(bld, a, lp_build_comp(bld, b), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
528
529 if(LLVMIsConstant(a) && LLVMIsConstant(b))
530 if (type.floating)
531 res = LLVMConstFAdd(a, b);
532 else
533 res = LLVMConstAdd(a, b);
534 else
535 if (type.floating)
536 res = LLVMBuildFAdd(builder, a, b, "");
537 else
538 res = LLVMBuildAdd(builder, a, b, "");
539
540 /* clamp to ceiling of 1.0 */
541 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
542 res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
543
544 /* XXX clamp to floor of -1 or 0??? */
545
546 return res;
547 }
548
549
550 /** Return the scalar sum of the elements of a.
551 * Should avoid this operation whenever possible.
552 */
553 LLVMValueRef
554 lp_build_horizontal_add(struct lp_build_context *bld,
555 LLVMValueRef a)
556 {
557 LLVMBuilderRef builder = bld->gallivm->builder;
558 const struct lp_type type = bld->type;
559 LLVMValueRef index, res;
560 unsigned i, length;
561 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
562 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
563 LLVMValueRef vecres, elem2;
564
565 assert(lp_check_value(type, a));
566
567 if (type.length == 1) {
568 return a;
569 }
570
571 assert(!bld->type.norm);
572
573 /*
574     * for byte vectors we could do much better with psadbw.
575 * Using repeated shuffle/adds here. Note with multiple vectors
576 * this can be done more efficiently as outlined in the intel
577 * optimization manual.
578 * Note: could cause data rearrangement if used with smaller element
579 * sizes.
580 */
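   /*
    * Sketch of the reduction below for a length-4 vector {a0,a1,a2,a3}:
    *   vec1 = {a0,a1}, vec2 = {a2,a3}  ->  vecres = {a0+a2, a1+a3}
    * after which the two remaining elements are extracted and added.
    */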
581
582 vecres = a;
583 length = type.length / 2;
584 while (length > 1) {
585 LLVMValueRef vec1, vec2;
586 for (i = 0; i < length; i++) {
587 shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
588 shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
589 }
590 vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
591 LLVMConstVector(shuffles1, length), "");
592 vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
593 LLVMConstVector(shuffles2, length), "");
594 if (type.floating) {
595 vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
596 }
597 else {
598 vecres = LLVMBuildAdd(builder, vec1, vec2, "");
599 }
600 length = length >> 1;
601 }
602
603 /* always have vector of size 2 here */
604 assert(length == 1);
605
606 index = lp_build_const_int32(bld->gallivm, 0);
607 res = LLVMBuildExtractElement(builder, vecres, index, "");
608 index = lp_build_const_int32(bld->gallivm, 1);
609 elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
610
611 if (type.floating)
612 res = LLVMBuildFAdd(builder, res, elem2, "");
613 else
614 res = LLVMBuildAdd(builder, res, elem2, "");
615
616 return res;
617 }
618
619 /**
620 * Return the horizontal sums of 4 float vectors as a float4 vector.
621 * This uses the technique as outlined in Intel Optimization Manual.
622 */
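/*
 * Data flow of the shuffles/adds below, for inputs a, b, c, d (each a float4):
 *   tmp[0] = {a0,a1,b0,b1}   tmp[1] = {a2,a3,b2,b3}
 *   tmp[2] = {c0,c1,d0,d1}   tmp[3] = {c2,c3,d2,d3}
 *   sumtmp[0] = {a0+a2, a1+a3, b0+b2, b1+b3}
 *   sumtmp[1] = {c0+c2, c1+c3, d0+d2, d1+d3}
 *   result    = {sum(a), sum(b), sum(c), sum(d)}
 */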
623 static LLVMValueRef
624 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
625 LLVMValueRef src[4])
626 {
627 struct gallivm_state *gallivm = bld->gallivm;
628 LLVMBuilderRef builder = gallivm->builder;
629 LLVMValueRef shuffles[4];
630 LLVMValueRef tmp[4];
631 LLVMValueRef sumtmp[2], shuftmp[2];
632
633 /* lower half of regs */
634 shuffles[0] = lp_build_const_int32(gallivm, 0);
635 shuffles[1] = lp_build_const_int32(gallivm, 1);
636 shuffles[2] = lp_build_const_int32(gallivm, 4);
637 shuffles[3] = lp_build_const_int32(gallivm, 5);
638 tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
639 LLVMConstVector(shuffles, 4), "");
640 tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
641 LLVMConstVector(shuffles, 4), "");
642
643 /* upper half of regs */
644 shuffles[0] = lp_build_const_int32(gallivm, 2);
645 shuffles[1] = lp_build_const_int32(gallivm, 3);
646 shuffles[2] = lp_build_const_int32(gallivm, 6);
647 shuffles[3] = lp_build_const_int32(gallivm, 7);
648 tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
649 LLVMConstVector(shuffles, 4), "");
650 tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
651 LLVMConstVector(shuffles, 4), "");
652
653 sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
654 sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
655
656 shuffles[0] = lp_build_const_int32(gallivm, 0);
657 shuffles[1] = lp_build_const_int32(gallivm, 2);
658 shuffles[2] = lp_build_const_int32(gallivm, 4);
659 shuffles[3] = lp_build_const_int32(gallivm, 6);
660 shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
661 LLVMConstVector(shuffles, 4), "");
662
663 shuffles[0] = lp_build_const_int32(gallivm, 1);
664 shuffles[1] = lp_build_const_int32(gallivm, 3);
665 shuffles[2] = lp_build_const_int32(gallivm, 5);
666 shuffles[3] = lp_build_const_int32(gallivm, 7);
667 shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
668 LLVMConstVector(shuffles, 4), "");
669
670 return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
671 }
672
673
674 /*
675 * partially horizontally add 2-4 float vectors with length nx4,
676 * i.e. only four adjacent values in each vector will be added,
677 * assuming values are really grouped in 4 which also determines
678 * output order.
679 *
680 * Return a vector of the same length as the initial vectors,
681 * with the excess elements (if any) being undefined.
682 * The element order is independent of number of input vectors.
683 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
684 * the output order thus will be
685  * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
686 */
687 LLVMValueRef
688 lp_build_hadd_partial4(struct lp_build_context *bld,
689 LLVMValueRef vectors[],
690 unsigned num_vecs)
691 {
692 struct gallivm_state *gallivm = bld->gallivm;
693 LLVMBuilderRef builder = gallivm->builder;
694 LLVMValueRef ret_vec;
695 LLVMValueRef tmp[4];
696 const char *intrinsic = NULL;
697
698 assert(num_vecs >= 2 && num_vecs <= 4);
699 assert(bld->type.floating);
700
701 /* only use this with at least 2 vectors, as it is sort of expensive
702 * (depending on cpu) and we always need two horizontal adds anyway,
703 * so a shuffle/add approach might be better.
704 */
705
706 tmp[0] = vectors[0];
707 tmp[1] = vectors[1];
708
709 tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
710 tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
711
712 if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
713 bld->type.length == 4) {
714 intrinsic = "llvm.x86.sse3.hadd.ps";
715 }
716 else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
717 bld->type.length == 8) {
718 intrinsic = "llvm.x86.avx.hadd.ps.256";
719 }
720 if (intrinsic) {
721 tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
722 lp_build_vec_type(gallivm, bld->type),
723 tmp[0], tmp[1]);
724 if (num_vecs > 2) {
725 tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
726 lp_build_vec_type(gallivm, bld->type),
727 tmp[2], tmp[3]);
728 }
729 else {
730 tmp[1] = tmp[0];
731 }
732 return lp_build_intrinsic_binary(builder, intrinsic,
733 lp_build_vec_type(gallivm, bld->type),
734 tmp[0], tmp[1]);
735 }
736
737 if (bld->type.length == 4) {
738 ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
739 }
740 else {
741 LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
742 unsigned j;
743 unsigned num_iter = bld->type.length / 4;
744 struct lp_type parttype = bld->type;
745 parttype.length = 4;
746 for (j = 0; j < num_iter; j++) {
747 LLVMValueRef partsrc[4];
748 unsigned i;
749 for (i = 0; i < 4; i++) {
750 partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
751 }
752 partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
753 }
754 ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
755 }
756 return ret_vec;
757 }
758
759 /**
760 * Generate a - b
761 */
762 LLVMValueRef
763 lp_build_sub(struct lp_build_context *bld,
764 LLVMValueRef a,
765 LLVMValueRef b)
766 {
767 LLVMBuilderRef builder = bld->gallivm->builder;
768 const struct lp_type type = bld->type;
769 LLVMValueRef res;
770
771 assert(lp_check_value(type, a));
772 assert(lp_check_value(type, b));
773
774 if(b == bld->zero)
775 return a;
776 if(a == bld->undef || b == bld->undef)
777 return bld->undef;
778 if(a == b)
779 return bld->zero;
780
781 if(bld->type.norm) {
782 const char *intrinsic = NULL;
783
784 if(b == bld->one)
785 return bld->zero;
786
787 if (type.width * type.length == 128 &&
788 !type.floating && !type.fixed) {
789 if (util_cpu_caps.has_sse2) {
790 if(type.width == 8)
791 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
792 if(type.width == 16)
793 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
794 } else if (util_cpu_caps.has_altivec) {
795 if(type.width == 8)
796 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
797 if(type.width == 16)
798 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
799 }
800 }
801
802 if(intrinsic)
803 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
804 }
805
806 /* TODO: handle signed case */
807 if(type.norm && !type.floating && !type.fixed && !type.sign)
808 a = lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
809
810 if(LLVMIsConstant(a) && LLVMIsConstant(b))
811 if (type.floating)
812 res = LLVMConstFSub(a, b);
813 else
814 res = LLVMConstSub(a, b);
815 else
816 if (type.floating)
817 res = LLVMBuildFSub(builder, a, b, "");
818 else
819 res = LLVMBuildSub(builder, a, b, "");
820
821 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
822 res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
823
824 return res;
825 }
826
827
828
829 /**
830 * Normalized multiplication.
831 *
832 * There are several approaches for (using 8-bit normalized multiplication as
833 * an example):
834 *
835 * - alpha plus one
836 *
837 * makes the following approximation to the division (Sree)
838 *
839  *     a*b/255 ~= (a*(b + 1)) >> 8
840 *
841 * which is the fastest method that satisfies the following OpenGL criteria of
842 *
843 * 0*0 = 0 and 255*255 = 255
844 *
845 * - geometric series
846 *
847 * takes the geometric series approximation to the division
848 *
849 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
850 *
851 * in this case just the first two terms to fit in 16bit arithmetic
852 *
853 * t/255 ~= (t + (t >> 8)) >> 8
854 *
855  * note that just by itself it doesn't satisfy the OpenGL criteria, as
856  * 255*255 = 254, so the special case b = 255 must be accounted for or roundoff
857  * must be used.
858 *
859 * - geometric series plus rounding
860 *
861 * when using a geometric series division instead of truncating the result
862 * use roundoff in the approximation (Jim Blinn)
863 *
864 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
865 *
866  * achieving exact results.
867 *
868 *
869 *
870 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
871 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
872 * @sa Michael Herf, The "double blend trick", May 2000,
873 * http://www.stereopsis.com/doubleblend.html
874 */
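/*
 * Purely illustrative scalar equivalent of the 8-bit case above (a sketch of
 * the "geometric series plus rounding" formula, not used by the code below,
 * hence kept under #if 0):
 */
#if 0
static inline unsigned
mul_norm_u8_sketch(unsigned a, unsigned b)
{
   unsigned t = a * b;                  /* t in [0, 255*255] */
   return (t + (t >> 8) + 0x80) >> 8;   /* e.g. 255*255: (65025+254+128)>>8 = 255 */
}
#endif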
875 static LLVMValueRef
876 lp_build_mul_norm(struct gallivm_state *gallivm,
877 struct lp_type wide_type,
878 LLVMValueRef a, LLVMValueRef b)
879 {
880 LLVMBuilderRef builder = gallivm->builder;
881 struct lp_build_context bld;
882 unsigned n;
883 LLVMValueRef half;
884 LLVMValueRef ab;
885
886 assert(!wide_type.floating);
887 assert(lp_check_value(wide_type, a));
888 assert(lp_check_value(wide_type, b));
889
890 lp_build_context_init(&bld, gallivm, wide_type);
891
892 n = wide_type.width / 2;
893 if (wide_type.sign) {
894 --n;
895 }
896
897 /*
898 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
899 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
900 */
901
902 /*
903 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
904 */
905
906 ab = LLVMBuildMul(builder, a, b, "");
907 ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
908
909 /*
910 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
911 */
912
913 half = lp_build_const_int_vec(gallivm, wide_type, 1 << (n - 1));
914 if (wide_type.sign) {
915 LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
916 LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
917 half = lp_build_select(&bld, sign, minus_half, half);
918 }
919 ab = LLVMBuildAdd(builder, ab, half, "");
920
921 /* Final division */
922 ab = lp_build_shr_imm(&bld, ab, n);
923
924 return ab;
925 }
926
927 /**
928 * Generate a * b
929 */
930 LLVMValueRef
931 lp_build_mul(struct lp_build_context *bld,
932 LLVMValueRef a,
933 LLVMValueRef b)
934 {
935 LLVMBuilderRef builder = bld->gallivm->builder;
936 const struct lp_type type = bld->type;
937 LLVMValueRef shift;
938 LLVMValueRef res;
939
940 assert(lp_check_value(type, a));
941 assert(lp_check_value(type, b));
942
943 if(a == bld->zero)
944 return bld->zero;
945 if(a == bld->one)
946 return b;
947 if(b == bld->zero)
948 return bld->zero;
949 if(b == bld->one)
950 return a;
951 if(a == bld->undef || b == bld->undef)
952 return bld->undef;
953
954 if (!type.floating && !type.fixed && type.norm) {
955 struct lp_type wide_type = lp_wider_type(type);
956 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
957
958 lp_build_unpack2(bld->gallivm, type, wide_type, a, &al, &ah);
959 lp_build_unpack2(bld->gallivm, type, wide_type, b, &bl, &bh);
960
961 /* PMULLW, PSRLW, PADDW */
962 abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
963 abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
964
965 ab = lp_build_pack2(bld->gallivm, wide_type, type, abl, abh);
966
967 return ab;
968 }
969
970 if(type.fixed)
971 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
972 else
973 shift = NULL;
974
975 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
976 if (type.floating)
977 res = LLVMConstFMul(a, b);
978 else
979 res = LLVMConstMul(a, b);
980 if(shift) {
981 if(type.sign)
982 res = LLVMConstAShr(res, shift);
983 else
984 res = LLVMConstLShr(res, shift);
985 }
986 }
987 else {
988 if (type.floating)
989 res = LLVMBuildFMul(builder, a, b, "");
990 else
991 res = LLVMBuildMul(builder, a, b, "");
992 if(shift) {
993 if(type.sign)
994 res = LLVMBuildAShr(builder, res, shift, "");
995 else
996 res = LLVMBuildLShr(builder, res, shift, "");
997 }
998 }
999
1000 return res;
1001 }
1002
1003
1004 /**
1005 * Small vector x scale multiplication optimization.
1006 */
1007 LLVMValueRef
1008 lp_build_mul_imm(struct lp_build_context *bld,
1009 LLVMValueRef a,
1010 int b)
1011 {
1012 LLVMBuilderRef builder = bld->gallivm->builder;
1013 LLVMValueRef factor;
1014
1015 assert(lp_check_value(bld->type, a));
1016
1017 if(b == 0)
1018 return bld->zero;
1019
1020 if(b == 1)
1021 return a;
1022
1023 if(b == -1)
1024 return lp_build_negate(bld, a);
1025
1026 if(b == 2 && bld->type.floating)
1027 return lp_build_add(bld, a, a);
1028
1029 if(util_is_power_of_two(b)) {
1030 unsigned shift = ffs(b) - 1;
1031
1032 if(bld->type.floating) {
1033 #if 0
1034 /*
1035 * Power of two multiplication by directly manipulating the exponent.
1036 *
1037 * XXX: This might not be always faster, it will introduce a small error
1038 * for multiplication by zero, and it will produce wrong results
1039 * for Inf and NaN.
1040 */
1041 unsigned mantissa = lp_mantissa(bld->type);
1042 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1043 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1044 a = LLVMBuildAdd(builder, a, factor, "");
1045 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1046 return a;
1047 #endif
1048 }
1049 else {
1050 factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1051 return LLVMBuildShl(builder, a, factor, "");
1052 }
1053 }
1054
1055 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1056 return lp_build_mul(bld, a, factor);
1057 }
1058
1059
1060 /**
1061 * Generate a / b
1062 */
1063 LLVMValueRef
1064 lp_build_div(struct lp_build_context *bld,
1065 LLVMValueRef a,
1066 LLVMValueRef b)
1067 {
1068 LLVMBuilderRef builder = bld->gallivm->builder;
1069 const struct lp_type type = bld->type;
1070
1071 assert(lp_check_value(type, a));
1072 assert(lp_check_value(type, b));
1073
1074 if(a == bld->zero)
1075 return bld->zero;
1076 if(a == bld->one)
1077 return lp_build_rcp(bld, b);
1078 if(b == bld->zero)
1079 return bld->undef;
1080 if(b == bld->one)
1081 return a;
1082 if(a == bld->undef || b == bld->undef)
1083 return bld->undef;
1084
1085 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1086 if (type.floating)
1087 return LLVMConstFDiv(a, b);
1088 else if (type.sign)
1089 return LLVMConstSDiv(a, b);
1090 else
1091 return LLVMConstUDiv(a, b);
1092 }
1093
1094 if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1095 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
1096 type.floating)
1097 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1098
1099 if (type.floating)
1100 return LLVMBuildFDiv(builder, a, b, "");
1101 else if (type.sign)
1102 return LLVMBuildSDiv(builder, a, b, "");
1103 else
1104 return LLVMBuildUDiv(builder, a, b, "");
1105 }
1106
1107
1108 /**
1109 * Linear interpolation helper.
1110 *
1111  * @param flags  LP_BLD_LERP_x flags; LP_BLD_LERP_WIDE_NORMALIZED means we are
1112  *        interpolating normalized values encoded in integers twice as wide.
1113 *
1114 * @sa http://www.stereopsis.com/doubleblend.html
1115 */
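/*
 * In essence this computes res = v0 + x*(v1 - v0); with
 * LP_BLD_LERP_WIDE_NORMALIZED the weighted delta is additionally divided by
 * 2**(width/2) after the rescaling of x described inside the function.
 */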
1116 static INLINE LLVMValueRef
1117 lp_build_lerp_simple(struct lp_build_context *bld,
1118 LLVMValueRef x,
1119 LLVMValueRef v0,
1120 LLVMValueRef v1,
1121 unsigned flags)
1122 {
1123 unsigned half_width = bld->type.width/2;
1124 LLVMBuilderRef builder = bld->gallivm->builder;
1125 LLVMValueRef delta;
1126 LLVMValueRef res;
1127
1128 assert(lp_check_value(bld->type, x));
1129 assert(lp_check_value(bld->type, v0));
1130 assert(lp_check_value(bld->type, v1));
1131
1132 delta = lp_build_sub(bld, v1, v0);
1133
1134 if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1135 if (!bld->type.sign) {
1136 if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1137 /*
1138 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1139             * most significant bit to the least significant bit, so that
1140 * later we can just divide by 2**n instead of 2**n - 1.
1141 */
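            /*
             * E.g. with n = 8: x = 255 becomes 255 + (255 >> 7) = 256, so a
             * full weight yields (256 * delta) >> 8 = delta exactly, while
             * x = 0 stays 0.
             */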
1142
1143 x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1144 }
1145
1146 /* (x * delta) >> n */
1147 res = lp_build_mul(bld, x, delta);
1148 res = lp_build_shr_imm(bld, res, half_width);
1149 } else {
1150 /*
1151 * The rescaling trick above doesn't work for signed numbers, so
1152             * use the 2**n - 1 division approximation in lp_build_mul_norm
1153 * instead.
1154 */
1155 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1156 res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1157 }
1158 } else {
1159 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1160 res = lp_build_mul(bld, x, delta);
1161 }
1162
1163 res = lp_build_add(bld, v0, res);
1164
1165 if (((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) ||
1166 bld->type.fixed) {
1167       /* We need to mask out the high order bits when lerping 8-bit normalized colors stored in 16 bits */
1168       /* XXX: This step is necessary for lerping 8-bit colors stored in 16 bits,
1169 * but it will be wrong for true fixed point use cases. Basically we need
1170 * a more powerful lp_type, capable of further distinguishing the values
1171 * interpretation from the value storage. */
1172 res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1), "");
1173 }
1174
1175 return res;
1176 }
1177
1178
1179 /**
1180 * Linear interpolation.
1181 */
1182 LLVMValueRef
1183 lp_build_lerp(struct lp_build_context *bld,
1184 LLVMValueRef x,
1185 LLVMValueRef v0,
1186 LLVMValueRef v1,
1187 unsigned flags)
1188 {
1189 const struct lp_type type = bld->type;
1190 LLVMValueRef res;
1191
1192 assert(lp_check_value(type, x));
1193 assert(lp_check_value(type, v0));
1194 assert(lp_check_value(type, v1));
1195
1196 assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1197
1198 if (type.norm) {
1199 struct lp_type wide_type;
1200 struct lp_build_context wide_bld;
1201 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1202
1203 assert(type.length >= 2);
1204
1205 /*
1206 * Create a wider integer type, enough to hold the
1207 * intermediate result of the multiplication.
1208 */
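      /* E.g. 8-bit x 16 lanes becomes 16-bit x 8 lanes for each half. */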
1209 memset(&wide_type, 0, sizeof wide_type);
1210 wide_type.sign = type.sign;
1211 wide_type.width = type.width*2;
1212 wide_type.length = type.length/2;
1213
1214 lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1215
1216 lp_build_unpack2(bld->gallivm, type, wide_type, x, &xl, &xh);
1217 lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1218 lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1219
1220 /*
1221 * Lerp both halves.
1222 */
1223
1224 flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1225
1226 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1227 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1228
1229 res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
1230 } else {
1231 res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1232 }
1233
1234 return res;
1235 }
1236
1237
1238 /**
1239 * Bilinear interpolation.
1240 *
1241  * Value indices are in v_{yx}.
1242 */
1243 LLVMValueRef
1244 lp_build_lerp_2d(struct lp_build_context *bld,
1245 LLVMValueRef x,
1246 LLVMValueRef y,
1247 LLVMValueRef v00,
1248 LLVMValueRef v01,
1249 LLVMValueRef v10,
1250 LLVMValueRef v11,
1251 unsigned flags)
1252 {
1253 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1254 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1255 return lp_build_lerp(bld, y, v0, v1, flags);
1256 }
1257
1258
1259 LLVMValueRef
1260 lp_build_lerp_3d(struct lp_build_context *bld,
1261 LLVMValueRef x,
1262 LLVMValueRef y,
1263 LLVMValueRef z,
1264 LLVMValueRef v000,
1265 LLVMValueRef v001,
1266 LLVMValueRef v010,
1267 LLVMValueRef v011,
1268 LLVMValueRef v100,
1269 LLVMValueRef v101,
1270 LLVMValueRef v110,
1271 LLVMValueRef v111,
1272 unsigned flags)
1273 {
1274 LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1275 LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1276 return lp_build_lerp(bld, z, v0, v1, flags);
1277 }
1278
1279
1280 /**
1281 * Generate min(a, b)
1282 * Do checks for special cases but not for nans.
1283 */
1284 LLVMValueRef
1285 lp_build_min(struct lp_build_context *bld,
1286 LLVMValueRef a,
1287 LLVMValueRef b)
1288 {
1289 assert(lp_check_value(bld->type, a));
1290 assert(lp_check_value(bld->type, b));
1291
1292 if(a == bld->undef || b == bld->undef)
1293 return bld->undef;
1294
1295 if(a == b)
1296 return a;
1297
1298 if (bld->type.norm) {
1299 if (!bld->type.sign) {
1300 if (a == bld->zero || b == bld->zero) {
1301 return bld->zero;
1302 }
1303 }
1304 if(a == bld->one)
1305 return b;
1306 if(b == bld->one)
1307 return a;
1308 }
1309
1310 return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1311 }
1312
1313
1314 /**
1315 * Generate min(a, b)
1316 * NaN's are handled according to the behavior specified by the
1317 * nan_behavior argument.
1318 */
1319 LLVMValueRef
1320 lp_build_min_ext(struct lp_build_context *bld,
1321 LLVMValueRef a,
1322 LLVMValueRef b,
1323 enum gallivm_nan_behavior nan_behavior)
1324 {
1325 assert(lp_check_value(bld->type, a));
1326 assert(lp_check_value(bld->type, b));
1327
1328 if(a == bld->undef || b == bld->undef)
1329 return bld->undef;
1330
1331 if(a == b)
1332 return a;
1333
1334 if (bld->type.norm) {
1335 if (!bld->type.sign) {
1336 if (a == bld->zero || b == bld->zero) {
1337 return bld->zero;
1338 }
1339 }
1340 if(a == bld->one)
1341 return b;
1342 if(b == bld->one)
1343 return a;
1344 }
1345
1346 return lp_build_min_simple(bld, a, b, nan_behavior);
1347 }
1348
1349 /**
1350 * Generate max(a, b)
1351 * Do checks for special cases, but NaN behavior is undefined.
1352 */
1353 LLVMValueRef
1354 lp_build_max(struct lp_build_context *bld,
1355 LLVMValueRef a,
1356 LLVMValueRef b)
1357 {
1358 assert(lp_check_value(bld->type, a));
1359 assert(lp_check_value(bld->type, b));
1360
1361 if(a == bld->undef || b == bld->undef)
1362 return bld->undef;
1363
1364 if(a == b)
1365 return a;
1366
1367 if(bld->type.norm) {
1368 if(a == bld->one || b == bld->one)
1369 return bld->one;
1370 if (!bld->type.sign) {
1371 if (a == bld->zero) {
1372 return b;
1373 }
1374 if (b == bld->zero) {
1375 return a;
1376 }
1377 }
1378 }
1379
1380 return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1381 }
1382
1383
1384 /**
1385 * Generate max(a, b)
1386 * Checks for special cases.
1387 * NaN's are handled according to the behavior specified by the
1388 * nan_behavior argument.
1389 */
1390 LLVMValueRef
1391 lp_build_max_ext(struct lp_build_context *bld,
1392 LLVMValueRef a,
1393 LLVMValueRef b,
1394 enum gallivm_nan_behavior nan_behavior)
1395 {
1396 assert(lp_check_value(bld->type, a));
1397 assert(lp_check_value(bld->type, b));
1398
1399 if(a == bld->undef || b == bld->undef)
1400 return bld->undef;
1401
1402 if(a == b)
1403 return a;
1404
1405 if(bld->type.norm) {
1406 if(a == bld->one || b == bld->one)
1407 return bld->one;
1408 if (!bld->type.sign) {
1409 if (a == bld->zero) {
1410 return b;
1411 }
1412 if (b == bld->zero) {
1413 return a;
1414 }
1415 }
1416 }
1417
1418 return lp_build_max_simple(bld, a, b, nan_behavior);
1419 }
1420
1421 /**
1422 * Generate clamp(a, min, max)
1423 * NaN behavior (for any of a, min, max) is undefined.
1424 * Do checks for special cases.
1425 */
1426 LLVMValueRef
1427 lp_build_clamp(struct lp_build_context *bld,
1428 LLVMValueRef a,
1429 LLVMValueRef min,
1430 LLVMValueRef max)
1431 {
1432 assert(lp_check_value(bld->type, a));
1433 assert(lp_check_value(bld->type, min));
1434 assert(lp_check_value(bld->type, max));
1435
1436 a = lp_build_min(bld, a, max);
1437 a = lp_build_max(bld, a, min);
1438 return a;
1439 }
1440
1441
1442 /**
1443 * Generate clamp(a, 0, 1)
1444 * A NaN will get converted to zero.
1445 */
1446 LLVMValueRef
1447 lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
1448 LLVMValueRef a)
1449 {
1450 a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1451 a = lp_build_min(bld, a, bld->one);
1452 return a;
1453 }
1454
1455
1456 /**
1457 * Generate abs(a)
1458 */
1459 LLVMValueRef
1460 lp_build_abs(struct lp_build_context *bld,
1461 LLVMValueRef a)
1462 {
1463 LLVMBuilderRef builder = bld->gallivm->builder;
1464 const struct lp_type type = bld->type;
1465 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1466
1467 assert(lp_check_value(type, a));
1468
1469 if(!type.sign)
1470 return a;
1471
1472 if(type.floating) {
1473 /* Mask out the sign bit */
1474 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1475 unsigned long long absMask = ~(1ULL << (type.width - 1));
1476 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
1477 a = LLVMBuildBitCast(builder, a, int_vec_type, "");
1478 a = LLVMBuildAnd(builder, a, mask, "");
1479 a = LLVMBuildBitCast(builder, a, vec_type, "");
1480 return a;
1481 }
1482
1483 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
1484 switch(type.width) {
1485 case 8:
1486 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1487 case 16:
1488 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1489 case 32:
1490 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1491 }
1492 }
1493 else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
1494 (gallivm_debug & GALLIVM_DEBUG_PERF) &&
1495 (type.width == 8 || type.width == 16 || type.width == 32)) {
1496 debug_printf("%s: inefficient code, should split vectors manually\n",
1497 __FUNCTION__);
1498 }
1499
1500 return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
1501 }
1502
1503
1504 LLVMValueRef
1505 lp_build_negate(struct lp_build_context *bld,
1506 LLVMValueRef a)
1507 {
1508 LLVMBuilderRef builder = bld->gallivm->builder;
1509
1510 assert(lp_check_value(bld->type, a));
1511
1512 if (bld->type.floating)
1513 a = LLVMBuildFNeg(builder, a, "");
1514 else
1515 a = LLVMBuildNeg(builder, a, "");
1516
1517 return a;
1518 }
1519
1520
1521 /** Return -1, 0 or +1 depending on the sign of a */
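/*
 * For floats this ORs the sign bit of 'a' into the bit pattern of the
 * constant 1.0, e.g. a = -3.0f: 0x80000000 | 0x3f800000 = 0xbf800000 = -1.0f;
 * the zero case is then patched up with a separate select at the end.
 */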
1522 LLVMValueRef
1523 lp_build_sgn(struct lp_build_context *bld,
1524 LLVMValueRef a)
1525 {
1526 LLVMBuilderRef builder = bld->gallivm->builder;
1527 const struct lp_type type = bld->type;
1528 LLVMValueRef cond;
1529 LLVMValueRef res;
1530
1531 assert(lp_check_value(type, a));
1532
1533 /* Handle non-zero case */
1534 if(!type.sign) {
1535 /* if not zero then sign must be positive */
1536 res = bld->one;
1537 }
1538 else if(type.floating) {
1539 LLVMTypeRef vec_type;
1540 LLVMTypeRef int_type;
1541 LLVMValueRef mask;
1542 LLVMValueRef sign;
1543 LLVMValueRef one;
1544 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1545
1546 int_type = lp_build_int_vec_type(bld->gallivm, type);
1547 vec_type = lp_build_vec_type(bld->gallivm, type);
1548 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1549
1550 /* Take the sign bit and add it to 1 constant */
1551 sign = LLVMBuildBitCast(builder, a, int_type, "");
1552 sign = LLVMBuildAnd(builder, sign, mask, "");
1553 one = LLVMConstBitCast(bld->one, int_type);
1554 res = LLVMBuildOr(builder, sign, one, "");
1555 res = LLVMBuildBitCast(builder, res, vec_type, "");
1556 }
1557 else
1558 {
1559 /* signed int/norm/fixed point */
1560 /* could use psign with sse3 and appropriate vectors here */
1561 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1562 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1563 res = lp_build_select(bld, cond, bld->one, minus_one);
1564 }
1565
1566 /* Handle zero */
1567 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1568 res = lp_build_select(bld, cond, bld->zero, res);
1569
1570 return res;
1571 }
1572
1573
1574 /**
1575 * Set the sign of float vector 'a' according to 'sign'.
1576 * If sign==0, return abs(a).
1577 * If sign==1, return -abs(a);
1578 * Other values for sign produce undefined results.
1579 */
1580 LLVMValueRef
1581 lp_build_set_sign(struct lp_build_context *bld,
1582 LLVMValueRef a, LLVMValueRef sign)
1583 {
1584 LLVMBuilderRef builder = bld->gallivm->builder;
1585 const struct lp_type type = bld->type;
1586 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1587 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1588 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1589 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1590 ~((unsigned long long) 1 << (type.width - 1)));
1591 LLVMValueRef val, res;
1592
1593 assert(type.floating);
1594 assert(lp_check_value(type, a));
1595
1596 /* val = reinterpret_cast<int>(a) */
1597 val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1598 /* val = val & mask */
1599 val = LLVMBuildAnd(builder, val, mask, "");
1600 /* sign = sign << shift */
1601 sign = LLVMBuildShl(builder, sign, shift, "");
1602 /* res = val | sign */
1603 res = LLVMBuildOr(builder, val, sign, "");
1604 /* res = reinterpret_cast<float>(res) */
1605 res = LLVMBuildBitCast(builder, res, vec_type, "");
1606
1607 return res;
1608 }
1609
1610
1611 /**
1612 * Convert vector of (or scalar) int to vector of (or scalar) float.
1613 */
1614 LLVMValueRef
1615 lp_build_int_to_float(struct lp_build_context *bld,
1616 LLVMValueRef a)
1617 {
1618 LLVMBuilderRef builder = bld->gallivm->builder;
1619 const struct lp_type type = bld->type;
1620 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1621
1622 assert(type.floating);
1623
1624 return LLVMBuildSIToFP(builder, a, vec_type, "");
1625 }
1626
1627 static boolean
1628 arch_rounding_available(const struct lp_type type)
1629 {
1630 if ((util_cpu_caps.has_sse4_1 &&
1631 (type.length == 1 || type.width*type.length == 128)) ||
1632 (util_cpu_caps.has_avx && type.width*type.length == 256))
1633 return TRUE;
1634 else if ((util_cpu_caps.has_altivec &&
1635 (type.width == 32 && type.length == 4)))
1636 return TRUE;
1637
1638 return FALSE;
1639 }
1640
1641 enum lp_build_round_mode
1642 {
1643 LP_BUILD_ROUND_NEAREST = 0,
1644 LP_BUILD_ROUND_FLOOR = 1,
1645 LP_BUILD_ROUND_CEIL = 2,
1646 LP_BUILD_ROUND_TRUNCATE = 3
1647 };
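/*
 * Note: these values match the rounding-mode bits of the SSE4.1
 * ROUNDPS/ROUNDPD immediate, so lp_build_round_sse41() can pass the mode
 * through unchanged.
 */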
1648
1649 /**
1650 * Helper for SSE4.1's ROUNDxx instructions.
1651 *
1652  * NOTE: In SSE4.1's nearest mode, if two values are equally close, the
1653 * result is the even value. That is, rounding 2.5 will be 2.0, and not 3.0.
1654 */
1655 static INLINE LLVMValueRef
1656 lp_build_round_sse41(struct lp_build_context *bld,
1657 LLVMValueRef a,
1658 enum lp_build_round_mode mode)
1659 {
1660 LLVMBuilderRef builder = bld->gallivm->builder;
1661 const struct lp_type type = bld->type;
1662 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1663 const char *intrinsic;
1664 LLVMValueRef res;
1665
1666 assert(type.floating);
1667
1668 assert(lp_check_value(type, a));
1669 assert(util_cpu_caps.has_sse4_1);
1670
1671 if (type.length == 1) {
1672 LLVMTypeRef vec_type;
1673 LLVMValueRef undef;
1674 LLVMValueRef args[3];
1675 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1676
1677 switch(type.width) {
1678 case 32:
1679 intrinsic = "llvm.x86.sse41.round.ss";
1680 break;
1681 case 64:
1682 intrinsic = "llvm.x86.sse41.round.sd";
1683 break;
1684 default:
1685 assert(0);
1686 return bld->undef;
1687 }
1688
1689 vec_type = LLVMVectorType(bld->elem_type, 4);
1690
1691 undef = LLVMGetUndef(vec_type);
1692
1693 args[0] = undef;
1694 args[1] = LLVMBuildInsertElement(builder, undef, a, index0, "");
1695 args[2] = LLVMConstInt(i32t, mode, 0);
1696
1697 res = lp_build_intrinsic(builder, intrinsic,
1698 vec_type, args, Elements(args));
1699
1700 res = LLVMBuildExtractElement(builder, res, index0, "");
1701 }
1702 else {
1703 if (type.width * type.length == 128) {
1704 switch(type.width) {
1705 case 32:
1706 intrinsic = "llvm.x86.sse41.round.ps";
1707 break;
1708 case 64:
1709 intrinsic = "llvm.x86.sse41.round.pd";
1710 break;
1711 default:
1712 assert(0);
1713 return bld->undef;
1714 }
1715 }
1716 else {
1717 assert(type.width * type.length == 256);
1718 assert(util_cpu_caps.has_avx);
1719
1720 switch(type.width) {
1721 case 32:
1722 intrinsic = "llvm.x86.avx.round.ps.256";
1723 break;
1724 case 64:
1725 intrinsic = "llvm.x86.avx.round.pd.256";
1726 break;
1727 default:
1728 assert(0);
1729 return bld->undef;
1730 }
1731 }
1732
1733 res = lp_build_intrinsic_binary(builder, intrinsic,
1734 bld->vec_type, a,
1735 LLVMConstInt(i32t, mode, 0));
1736 }
1737
1738 return res;
1739 }
1740
1741
1742 static INLINE LLVMValueRef
1743 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1744 LLVMValueRef a)
1745 {
1746 LLVMBuilderRef builder = bld->gallivm->builder;
1747 const struct lp_type type = bld->type;
1748 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1749 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1750 const char *intrinsic;
1751 LLVMValueRef res;
1752
1753 assert(type.floating);
1754 /* using the double precision conversions is a bit more complicated */
1755 assert(type.width == 32);
1756
1757 assert(lp_check_value(type, a));
1758 assert(util_cpu_caps.has_sse2);
1759
1760 /* This is relying on MXCSR rounding mode, which should always be nearest. */
1761 if (type.length == 1) {
1762 LLVMTypeRef vec_type;
1763 LLVMValueRef undef;
1764 LLVMValueRef arg;
1765 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1766
1767 vec_type = LLVMVectorType(bld->elem_type, 4);
1768
1769 intrinsic = "llvm.x86.sse.cvtss2si";
1770
1771 undef = LLVMGetUndef(vec_type);
1772
1773 arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
1774
1775 res = lp_build_intrinsic_unary(builder, intrinsic,
1776 ret_type, arg);
1777 }
1778 else {
1779 if (type.width* type.length == 128) {
1780 intrinsic = "llvm.x86.sse2.cvtps2dq";
1781 }
1782 else {
1783 assert(type.width*type.length == 256);
1784 assert(util_cpu_caps.has_avx);
1785
1786 intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
1787 }
1788 res = lp_build_intrinsic_unary(builder, intrinsic,
1789 ret_type, a);
1790 }
1791
1792 return res;
1793 }
1794
1795
1796 /*
1797 */
1798 static INLINE LLVMValueRef
1799 lp_build_round_altivec(struct lp_build_context *bld,
1800 LLVMValueRef a,
1801 enum lp_build_round_mode mode)
1802 {
1803 LLVMBuilderRef builder = bld->gallivm->builder;
1804 const struct lp_type type = bld->type;
1805 const char *intrinsic = NULL;
1806
1807 assert(type.floating);
1808
1809 assert(lp_check_value(type, a));
1810 assert(util_cpu_caps.has_altivec);
1811
1812 switch (mode) {
1813 case LP_BUILD_ROUND_NEAREST:
1814 intrinsic = "llvm.ppc.altivec.vrfin";
1815 break;
1816 case LP_BUILD_ROUND_FLOOR:
1817 intrinsic = "llvm.ppc.altivec.vrfim";
1818 break;
1819 case LP_BUILD_ROUND_CEIL:
1820 intrinsic = "llvm.ppc.altivec.vrfip";
1821 break;
1822 case LP_BUILD_ROUND_TRUNCATE:
1823 intrinsic = "llvm.ppc.altivec.vrfiz";
1824 break;
1825 }
1826
1827 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1828 }
1829
1830 static INLINE LLVMValueRef
1831 lp_build_round_arch(struct lp_build_context *bld,
1832 LLVMValueRef a,
1833 enum lp_build_round_mode mode)
1834 {
1835 if (util_cpu_caps.has_sse4_1)
1836 return lp_build_round_sse41(bld, a, mode);
1837 else /* (util_cpu_caps.has_altivec) */
1838 return lp_build_round_altivec(bld, a, mode);
1839 }
1840
1841 /**
1842 * Return the integer part of a float (vector) value (== round toward zero).
1843 * The returned value is a float (vector).
1844 * Ex: trunc(-1.5) = -1.0
1845 */
1846 LLVMValueRef
1847 lp_build_trunc(struct lp_build_context *bld,
1848 LLVMValueRef a)
1849 {
1850 LLVMBuilderRef builder = bld->gallivm->builder;
1851 const struct lp_type type = bld->type;
1852
1853 assert(type.floating);
1854 assert(lp_check_value(type, a));
1855
1856 if (arch_rounding_available(type)) {
1857 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
1858 }
1859 else {
1860 const struct lp_type type = bld->type;
1861 struct lp_type inttype;
1862 struct lp_build_context intbld;
1863       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 16777216.0); /* 2^24 */
1864 LLVMValueRef trunc, res, anosign, mask;
1865 LLVMTypeRef int_vec_type = bld->int_vec_type;
1866 LLVMTypeRef vec_type = bld->vec_type;
1867
1868 assert(type.width == 32); /* might want to handle doubles at some point */
1869
1870 inttype = type;
1871 inttype.floating = 0;
1872 lp_build_context_init(&intbld, bld->gallivm, inttype);
1873
1874 /* round by truncation */
1875 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1876 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
1877
1878 /* mask out sign bit */
1879 anosign = lp_build_abs(bld, a);
1880 /*
1881 * mask out all values if anosign > 2^24
1882 * This should work both for large ints (all rounding is no-op for them
1883 * because such floats are always exact) as well as special cases like
1884 * NaNs, Infs (taking advantage of the fact they use max exponent).
1885 * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
1886 */
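      /*
       * (Single precision floats carry a 24-bit significand, so any value
       * with magnitude >= 2^24 = 16777216.0 is already an integer and
       * returning the input unchanged is correct there as well.)
       */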
1887 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1888 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1889 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1890 return lp_build_select(bld, mask, a, res);
1891 }
1892 }
1893
1894
1895 /**
1896 * Return float (vector) rounded to nearest integer (vector). The returned
1897 * value is a float (vector).
1898 * Ex: round(0.9) = 1.0
1899 * Ex: round(-1.5) = -2.0
1900 */
1901 LLVMValueRef
1902 lp_build_round(struct lp_build_context *bld,
1903 LLVMValueRef a)
1904 {
1905 LLVMBuilderRef builder = bld->gallivm->builder;
1906 const struct lp_type type = bld->type;
1907
1908 assert(type.floating);
1909 assert(lp_check_value(type, a));
1910
1911 if (arch_rounding_available(type)) {
1912 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
1913 }
1914 else {
1915 const struct lp_type type = bld->type;
1916 struct lp_type inttype;
1917 struct lp_build_context intbld;
1918       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 16777216.0); /* 2^24 */
1919 LLVMValueRef res, anosign, mask;
1920 LLVMTypeRef int_vec_type = bld->int_vec_type;
1921 LLVMTypeRef vec_type = bld->vec_type;
1922
1923 assert(type.width == 32); /* might want to handle doubles at some point */
1924
1925 inttype = type;
1926 inttype.floating = 0;
1927 lp_build_context_init(&intbld, bld->gallivm, inttype);
1928
1929 res = lp_build_iround(bld, a);
1930 res = LLVMBuildSIToFP(builder, res, vec_type, "");
1931
1932 /* mask out sign bit */
1933 anosign = lp_build_abs(bld, a);
1934 /*
1935 * mask out all values if anosign > 2^24
1936 * This should work both for large ints (all rounding is no-op for them
1937 * because such floats are always exact) as well as special cases like
1938 * NaNs, Infs (taking advantage of the fact they use max exponent).
1939 * (2^24 is arbitrary, anything between 2^24 and 2^31 should work.)
1940 */
1941 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1942 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1943 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1944 return lp_build_select(bld, mask, a, res);
1945 }
1946 }
1947
1948
1949 /**
1950 * Return floor of float (vector), result is a float (vector)
1951 * Ex: floor(1.1) = 1.0
1952 * Ex: floor(-1.1) = -2.0
1953 */
1954 LLVMValueRef
1955 lp_build_floor(struct lp_build_context *bld,
1956 LLVMValueRef a)
1957 {
1958 LLVMBuilderRef builder = bld->gallivm->builder;
1959 const struct lp_type type = bld->type;
1960
1961 assert(type.floating);
1962 assert(lp_check_value(type, a));
1963
1964 if (arch_rounding_available(type)) {
1965 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
1966 }
1967 else {
1968 const struct lp_type type = bld->type;
1969 struct lp_type inttype;
1970 struct lp_build_context intbld;
1971 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
1972 LLVMValueRef trunc, res, anosign, mask;
1973 LLVMTypeRef int_vec_type = bld->int_vec_type;
1974 LLVMTypeRef vec_type = bld->vec_type;
1975
1976 assert(type.width == 32); /* might want to handle doubles at some point */
1977
1978 inttype = type;
1979 inttype.floating = 0;
1980 lp_build_context_init(&intbld, bld->gallivm, inttype);
1981
1982 /* round by truncation */
1983 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1984 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
1985
1986 if (type.sign) {
1987 LLVMValueRef tmp;
1988
1989 /*
1990 * fix values if rounding is wrong (for non-special cases)
1991 * - this is the case if trunc > a
1992 */
1993 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
1994 /* tmp = trunc > a ? 1.0 : 0.0 */
1995 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
1996 tmp = lp_build_and(&intbld, mask, tmp);
1997 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
1998 res = lp_build_sub(bld, res, tmp);
1999 }
2000
2001 /* mask out sign bit */
2002 anosign = lp_build_abs(bld, a);
2003 /*
2004 * mask out all values if anosign > 2^24
2005 * This should work both for large ints (all rounding is no-op for them
2006 * because such floats are always exact) as well as special cases like
2007 * NaNs, Infs (taking advantage of the fact they use max exponent).
2008 * (2^24 is arbitrary, anything between 2^24 and 2^31 should work.)
2009 */
2010 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2011 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2012 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2013 return lp_build_select(bld, mask, a, res);
2014 }
2015 }
2016
2017
2018 /**
2019 * Return ceiling of float (vector), returning float (vector).
2020 * Ex: ceil( 1.1) = 2.0
2021 * Ex: ceil(-1.1) = -1.0
2022 */
2023 LLVMValueRef
2024 lp_build_ceil(struct lp_build_context *bld,
2025 LLVMValueRef a)
2026 {
2027 LLVMBuilderRef builder = bld->gallivm->builder;
2028 const struct lp_type type = bld->type;
2029
2030 assert(type.floating);
2031 assert(lp_check_value(type, a));
2032
2033 if (arch_rounding_available(type)) {
2034 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2035 }
2036 else {
2037 const struct lp_type type = bld->type;
2038 struct lp_type inttype;
2039 struct lp_build_context intbld;
2040 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
2041 LLVMValueRef trunc, res, anosign, mask, tmp;
2042 LLVMTypeRef int_vec_type = bld->int_vec_type;
2043 LLVMTypeRef vec_type = bld->vec_type;
2044
2045 assert(type.width == 32); /* might want to handle doubles at some point */
2046
2047 inttype = type;
2048 inttype.floating = 0;
2049 lp_build_context_init(&intbld, bld->gallivm, inttype);
2050
2051 /* round by truncation */
2052 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2053 trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2054
2055 /*
2056 * fix values if rounding is wrong (for non-special cases)
2057 * - this is the case if trunc < a
2058 */
2059 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2060 /* tmp = trunc < a ? 1.0 : 0.0 */
2061 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2062 tmp = lp_build_and(&intbld, mask, tmp);
2063 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2064 res = lp_build_add(bld, trunc, tmp);
2065
2066 /* mask out sign bit */
2067 anosign = lp_build_abs(bld, a);
2068 /*
2069 * mask out all values if anosign > 2^24
2070 * This should work both for large ints (all rounding is no-op for them
2071 * because such floats are always exact) as well as special cases like
2072 * NaNs, Infs (taking advantage of the fact they use max exponent).
2073 * (2^24 is arbitrary, anything between 2^24 and 2^31 should work.)
2074 */
2075 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2076 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2077 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2078 return lp_build_select(bld, mask, a, res);
2079 }
2080 }
2081
2082
2083 /**
2084 * Return fractional part of 'a' computed as a - floor(a)
2085 * Typically used in texture coord arithmetic.
2086 */
2087 LLVMValueRef
2088 lp_build_fract(struct lp_build_context *bld,
2089 LLVMValueRef a)
2090 {
2091 assert(bld->type.floating);
2092 return lp_build_sub(bld, a, lp_build_floor(bld, a));
2093 }
2094
2095
2096 /**
2097 * Prevent returning a fractional part of 1.0 for very small negative values of
2098 * 'a' by clamping against 0.99999(9).
2099 */
2100 static inline LLVMValueRef
2101 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2102 {
2103 LLVMValueRef max;
2104
2105 /* this is the largest number smaller than 1.0 representable as float */
2106 max = lp_build_const_vec(bld->gallivm, bld->type,
2107 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2108 return lp_build_min(bld, fract, max);
2109 }
2110
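/*
 * Worked example (editorial addition): for 32-bit floats the clamp value is
 * 1.0 - 1.0/2^24 = 0.99999994..., the largest float below 1.0. Without it,
 * something like fract(-1e-9) would compute -1e-9 - (-1.0) and round up to
 * exactly 1.0, which callers relying on fract() < 1.0 cannot handle.
 */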
2111
2112 /**
2113 * Same as lp_build_fract, but guarantees that the result is always smaller
2114 * than one.
2115 */
2116 LLVMValueRef
2117 lp_build_fract_safe(struct lp_build_context *bld,
2118 LLVMValueRef a)
2119 {
2120 return clamp_fract(bld, lp_build_fract(bld, a));
2121 }
2122
2123
2124 /**
2125 * Return the integer part of a float (vector) value (== round toward zero).
2126 * The returned value is an integer (vector).
2127 * Ex: itrunc(-1.5) = -1
2128 */
2129 LLVMValueRef
2130 lp_build_itrunc(struct lp_build_context *bld,
2131 LLVMValueRef a)
2132 {
2133 LLVMBuilderRef builder = bld->gallivm->builder;
2134 const struct lp_type type = bld->type;
2135 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2136
2137 assert(type.floating);
2138 assert(lp_check_value(type, a));
2139
2140 return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2141 }
2142
2143
2144 /**
2145 * Return float (vector) rounded to nearest integer (vector). The returned
2146 * value is an integer (vector).
2147 * Ex: iround(0.9) = 1
2148 * Ex: iround(-1.5) = -2
2149 */
2150 LLVMValueRef
2151 lp_build_iround(struct lp_build_context *bld,
2152 LLVMValueRef a)
2153 {
2154 LLVMBuilderRef builder = bld->gallivm->builder;
2155 const struct lp_type type = bld->type;
2156 LLVMTypeRef int_vec_type = bld->int_vec_type;
2157 LLVMValueRef res;
2158
2159 assert(type.floating);
2160
2161 assert(lp_check_value(type, a));
2162
2163 if ((util_cpu_caps.has_sse2 &&
2164 ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2165 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2166 return lp_build_iround_nearest_sse2(bld, a);
2167 }
2168 if (arch_rounding_available(type)) {
2169 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2170 }
2171 else {
2172 LLVMValueRef half;
2173
2174 half = lp_build_const_vec(bld->gallivm, type, 0.5);
2175
2176 if (type.sign) {
2177 LLVMTypeRef vec_type = bld->vec_type;
2178 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2179 (unsigned long long)1 << (type.width - 1));
2180 LLVMValueRef sign;
2181
2182 /* get sign bit */
2183 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2184 sign = LLVMBuildAnd(builder, sign, mask, "");
2185
2186 /* sign * 0.5 */
2187 half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2188 half = LLVMBuildOr(builder, sign, half, "");
2189 half = LLVMBuildBitCast(builder, half, vec_type, "");
2190 }
2191
2192 res = LLVMBuildFAdd(builder, a, half, "");
2193 }
2194
2195 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2196
2197 return res;
2198 }
2199
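/*
 * Illustrative scalar sketch (editorial addition) of the fallback path in
 * lp_build_iround above: for signed types the sign bit of 'a' is OR'd onto
 * 0.5 so the addition always moves away from zero before truncating, e.g.
 * iround(-1.5) = (int)(-1.5 + -0.5) = -2 and iround(0.9) = (int)(0.9 + 0.5) = 1.
 *
 *    int iround_emul(float a)
 *    {
 *       return (int)(a + copysignf(0.5f, a));
 *    }
 */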
2200
2201 /**
2202 * Return floor of float (vector), result is an int (vector)
2203 * Ex: ifloor(1.1) = 1
2204 * Ex: ifloor(-1.1) = -2
2205 */
2206 LLVMValueRef
2207 lp_build_ifloor(struct lp_build_context *bld,
2208 LLVMValueRef a)
2209 {
2210 LLVMBuilderRef builder = bld->gallivm->builder;
2211 const struct lp_type type = bld->type;
2212 LLVMTypeRef int_vec_type = bld->int_vec_type;
2213 LLVMValueRef res;
2214
2215 assert(type.floating);
2216 assert(lp_check_value(type, a));
2217
2218 res = a;
2219 if (type.sign) {
2220 if (arch_rounding_available(type)) {
2221 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2222 }
2223 else {
2224 struct lp_type inttype;
2225 struct lp_build_context intbld;
2226 LLVMValueRef trunc, itrunc, mask;
2227
2228 assert(type.floating);
2229 assert(lp_check_value(type, a));
2230
2231 inttype = type;
2232 inttype.floating = 0;
2233 lp_build_context_init(&intbld, bld->gallivm, inttype);
2234
2235 /* round by truncation */
2236 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2237 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2238
2239 /*
2240 * fix values if rounding is wrong (for non-special cases)
2241 * - this is the case if trunc > a
2242 * The results of doing this with NaNs, very large values etc.
2243 * are undefined but this seems to be the case anyway.
2244 */
2245 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2246 /* cheapie minus one with mask since the mask is minus one / zero */
2247 return lp_build_add(&intbld, itrunc, mask);
2248 }
2249 }
2250
2251 /* round to nearest (toward zero) */
2252 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2253
2254 return res;
2255 }
2256
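/*
 * Worked example (editorial addition) for the non-arch-rounding path above:
 * for a = -1.1, itrunc = -1 and trunc = -1.0 > a, so the comparison mask is
 * all ones, i.e. the integer value -1; adding it gives -1 + (-1) = -2, the
 * expected ifloor(-1.1). For a = 1.1, trunc = 1.0 <= a, the mask is 0 and
 * the truncated value is returned unchanged.
 */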
2257
2258 /**
2259 * Return ceiling of float (vector), returning int (vector).
2260 * Ex: iceil( 1.1) = 2
2261 * Ex: iceil(-1.1) = -1
2262 */
2263 LLVMValueRef
2264 lp_build_iceil(struct lp_build_context *bld,
2265 LLVMValueRef a)
2266 {
2267 LLVMBuilderRef builder = bld->gallivm->builder;
2268 const struct lp_type type = bld->type;
2269 LLVMTypeRef int_vec_type = bld->int_vec_type;
2270 LLVMValueRef res;
2271
2272 assert(type.floating);
2273 assert(lp_check_value(type, a));
2274
2275 if (arch_rounding_available(type)) {
2276 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2277 }
2278 else {
2279 struct lp_type inttype;
2280 struct lp_build_context intbld;
2281 LLVMValueRef trunc, itrunc, mask;
2282
2283 assert(type.floating);
2284 assert(lp_check_value(type, a));
2285
2286 inttype = type;
2287 inttype.floating = 0;
2288 lp_build_context_init(&intbld, bld->gallivm, inttype);
2289
2290 /* round by truncation */
2291 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2292 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2293
2294 /*
2295 * fix values if rounding is wrong (for non-special cases)
2296 * - this is the case if trunc < a
2297 * The results of doing this with NaNs, very large values etc.
2298 * are undefined but this seems to be the case anyway.
2299 */
2300 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2301 /* cheapie plus one with mask since the mask is minus one / zero */
2302 return lp_build_sub(&intbld, itrunc, mask);
2303 }
2304
2305 /* round to nearest (toward zero) */
2306 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2307
2308 return res;
2309 }
2310
2311
2312 /**
2313 * Combined ifloor() & fract().
2314 *
2315 * Preferred to calling the functions separately, as it will ensure that the
2316 * strategy (floor() vs ifloor()) that results in less redundant work is used.
2317 */
2318 void
2319 lp_build_ifloor_fract(struct lp_build_context *bld,
2320 LLVMValueRef a,
2321 LLVMValueRef *out_ipart,
2322 LLVMValueRef *out_fpart)
2323 {
2324 LLVMBuilderRef builder = bld->gallivm->builder;
2325 const struct lp_type type = bld->type;
2326 LLVMValueRef ipart;
2327
2328 assert(type.floating);
2329 assert(lp_check_value(type, a));
2330
2331 if (arch_rounding_available(type)) {
2332 /*
2333 * floor() is easier.
2334 */
2335
2336 ipart = lp_build_floor(bld, a);
2337 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2338 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2339 }
2340 else {
2341 /*
2342 * ifloor() is easier.
2343 */
2344
2345 *out_ipart = lp_build_ifloor(bld, a);
2346 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2347 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2348 }
2349 }
2350
2351
2352 /**
2353 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2354 * always smaller than one.
2355 */
2356 void
2357 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2358 LLVMValueRef a,
2359 LLVMValueRef *out_ipart,
2360 LLVMValueRef *out_fpart)
2361 {
2362 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2363 *out_fpart = clamp_fract(bld, *out_fpart);
2364 }
2365
2366
2367 LLVMValueRef
2368 lp_build_sqrt(struct lp_build_context *bld,
2369 LLVMValueRef a)
2370 {
2371 LLVMBuilderRef builder = bld->gallivm->builder;
2372 const struct lp_type type = bld->type;
2373 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2374 char intrinsic[32];
2375
2376 assert(lp_check_value(type, a));
2377
2378 /* TODO: optimize the constant case */
2379
2380 assert(type.floating);
2381 if (type.length == 1) {
2382 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.f%u", type.width);
2383 }
2384 else {
2385 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
2386 }
2387
2388 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2389 }
2390
2391
2392 /**
2393 * Do one Newton-Raphson step to improve reciprocal precision:
2394 *
2395 * x_{i+1} = x_i * (2 - a * x_i)
2396 *
2397 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2398 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
2399 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
2400 * halo. It would be necessary to clamp the argument to prevent this.
2401 *
2402 * See also:
2403 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2404 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2405 */
2406 static INLINE LLVMValueRef
2407 lp_build_rcp_refine(struct lp_build_context *bld,
2408 LLVMValueRef a,
2409 LLVMValueRef rcp_a)
2410 {
2411 LLVMBuilderRef builder = bld->gallivm->builder;
2412 LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
2413 LLVMValueRef res;
2414
2415 res = LLVMBuildFMul(builder, a, rcp_a, "");
2416 res = LLVMBuildFSub(builder, two, res, "");
2417 res = LLVMBuildFMul(builder, rcp_a, res, "");
2418
2419 return res;
2420 }
2421
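/*
 * Worked example (editorial addition): refining an approximate reciprocal of
 * a = 3.0. Starting from x_0 = 0.3 (roughly what RCPPS might return),
 * x_1 = 0.3 * (2 - 3.0 * 0.3) = 0.33, and one more step gives
 * x_2 = 0.33 * (2 - 3.0 * 0.33) = 0.3333, converging quadratically toward 1/3.
 */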
2422
2423 LLVMValueRef
2424 lp_build_rcp(struct lp_build_context *bld,
2425 LLVMValueRef a)
2426 {
2427 LLVMBuilderRef builder = bld->gallivm->builder;
2428 const struct lp_type type = bld->type;
2429
2430 assert(lp_check_value(type, a));
2431
2432 if(a == bld->zero)
2433 return bld->undef;
2434 if(a == bld->one)
2435 return bld->one;
2436 if(a == bld->undef)
2437 return bld->undef;
2438
2439 assert(type.floating);
2440
2441 if(LLVMIsConstant(a))
2442 return LLVMConstFDiv(bld->one, a);
2443
2444 /*
2445 * We don't use RCPPS because:
2446 * - it only has 10 bits of precision
2447 * - it doesn't even get the reciprocal of 1.0 exactly
2448 * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2449 * - for recent processors the benefit over DIVPS is marginal, and case
2450 * dependent
2451 *
2452 * We could still use it on certain processors if benchmarks show that the
2453 * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
2454 * particular uses that require fewer workarounds.
2455 */
2456
2457 if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2458 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2459 const unsigned num_iterations = 0;
2460 LLVMValueRef res;
2461 unsigned i;
2462 const char *intrinsic = NULL;
2463
2464 if (type.length == 4) {
2465 intrinsic = "llvm.x86.sse.rcp.ps";
2466 }
2467 else {
2468 intrinsic = "llvm.x86.avx.rcp.ps.256";
2469 }
2470
2471 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2472
2473 for (i = 0; i < num_iterations; ++i) {
2474 res = lp_build_rcp_refine(bld, a, res);
2475 }
2476
2477 return res;
2478 }
2479
2480 return LLVMBuildFDiv(builder, bld->one, a, "");
2481 }
2482
2483
2484 /**
2485 * Do one Newton-Raphson step to improve rsqrt precision:
2486 *
2487 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2488 *
2489 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2490 */
2491 static INLINE LLVMValueRef
2492 lp_build_rsqrt_refine(struct lp_build_context *bld,
2493 LLVMValueRef a,
2494 LLVMValueRef rsqrt_a)
2495 {
2496 LLVMBuilderRef builder = bld->gallivm->builder;
2497 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2498 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2499 LLVMValueRef res;
2500
2501 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2502 res = LLVMBuildFMul(builder, a, res, "");
2503 res = LLVMBuildFSub(builder, three, res, "");
2504 res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2505 res = LLVMBuildFMul(builder, half, res, "");
2506
2507 return res;
2508 }
2509
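/*
 * Worked example (editorial addition): refining an approximate rsqrt of
 * a = 4.0. Starting from x_0 = 0.51,
 * x_1 = 0.5 * 0.51 * (3.0 - 4.0 * 0.51 * 0.51) = 0.4997 (approximately),
 * already very close to the exact value 0.5.
 */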
2510
2511 /**
2512 * Generate 1/sqrt(a).
2513 * Result is undefined for values < 0, infinity for +0.
2514 */
2515 LLVMValueRef
2516 lp_build_rsqrt(struct lp_build_context *bld,
2517 LLVMValueRef a)
2518 {
2519 LLVMBuilderRef builder = bld->gallivm->builder;
2520 const struct lp_type type = bld->type;
2521
2522 assert(lp_check_value(type, a));
2523
2524 assert(type.floating);
2525
2526 /*
2527 * This should be faster but all denormals will end up as infinity.
2528 */
2529 if (0 && lp_build_fast_rsqrt_available(type)) {
2530 const unsigned num_iterations = 1;
2531 LLVMValueRef res;
2532 unsigned i;
2533
2534 /* rsqrt(1.0) != 1.0 here */
2535 res = lp_build_fast_rsqrt(bld, a);
2536
2537 if (num_iterations) {
2538 /*
2539 * Newton-Raphson will result in NaN instead of infinity for zero,
2540 * and NaN instead of zero for infinity.
2541 * Also, need to ensure rsqrt(1.0) == 1.0.
2542 * All numbers smaller than FLT_MIN will result in +infinity
2543 * (rsqrtps treats all denormals as zero).
2544 */
2545 /*
2546 * Certain non-C99 compilers don't know INFINITY and might not support
2547 * hacks to evaluate it at compile time either.
2548 */
2549 const unsigned posinf_int = 0x7F800000;
2550 LLVMValueRef cmp;
2551 LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2552 LLVMValueRef inf = lp_build_const_int_vec(bld->gallivm, type, posinf_int);
2553
2554 inf = LLVMBuildBitCast(builder, inf, lp_build_vec_type(bld->gallivm, type), "");
2555
2556 for (i = 0; i < num_iterations; ++i) {
2557 res = lp_build_rsqrt_refine(bld, a, res);
2558 }
2559 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2560 res = lp_build_select(bld, cmp, inf, res);
2561 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2562 res = lp_build_select(bld, cmp, bld->zero, res);
2563 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2564 res = lp_build_select(bld, cmp, bld->one, res);
2565 }
2566
2567 return res;
2568 }
2569
2570 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2571 }
2572
2573 /**
2574 * Report whether a fast (but inaccurate) rsqrt instruction is available.
2575 * (The caller may want to avoid calling rsqrt_fast if it's not available;
2576 * e.g. for calculating x^0.5 it may do rsqrt_fast(x) * x, but if rsqrt is
2577 * unavailable that would turn into sqrt/div/mul, so it's obviously much
2578 * better to just call sqrt directly, skipping both div and mul.)
2579 */
2580 boolean
2581 lp_build_fast_rsqrt_available(struct lp_type type)
2582 {
2583 assert(type.floating);
2584
2585 if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2586 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2587 return true;
2588 }
2589 return false;
2590 }
2591
2592
2593 /**
2594 * Generate 1/sqrt(a).
2595 * Result is undefined for values < 0, infinity for +0.
2596 * Precision is limited, only ~10 bits guaranteed
2597 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2598 */
2599 LLVMValueRef
2600 lp_build_fast_rsqrt(struct lp_build_context *bld,
2601 LLVMValueRef a)
2602 {
2603 LLVMBuilderRef builder = bld->gallivm->builder;
2604 const struct lp_type type = bld->type;
2605
2606 assert(lp_check_value(type, a));
2607
2608 if (lp_build_fast_rsqrt_available(type)) {
2609 const char *intrinsic = NULL;
2610
2611 if (type.length == 4) {
2612 intrinsic = "llvm.x86.sse.rsqrt.ps";
2613 }
2614 else {
2615 intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2616 }
2617 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2618 }
2619 else {
2620 debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2621 }
2622 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2623 }
2624
2625
2626 /**
2627 * Generate sin(a) or cos(a) using polynomial approximation.
2628 * TODO: it might be worth recognizing sin and cos using the same source
2629 * (i.e. the d3d10 sincos opcode). Obviously doing both at the same time
2630 * would be way cheaper than calculating (nearly) everything twice...
2631 * Not sure it's common enough to be worth bothering with, however; the
2632 * scs opcode could also benefit from calculating both, though.
2633 */
2634 static LLVMValueRef
2635 lp_build_sin_or_cos(struct lp_build_context *bld,
2636 LLVMValueRef a,
2637 boolean cos)
2638 {
2639 struct gallivm_state *gallivm = bld->gallivm;
2640 LLVMBuilderRef b = gallivm->builder;
2641 struct lp_type int_type = lp_int_type(bld->type);
2642
2643 /*
2644 * take the absolute value,
2645 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2646 */
2647
2648 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2649 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2650
2651 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2652 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2653
2654 /*
2655 * scale by 4/Pi
2656 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2657 */
2658
2659 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2660 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2661
2662 /*
2663 * store the integer part of y in mm0
2664 * emm2 = _mm_cvttps_epi32(y);
2665 */
2666
2667 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2668
2669 /*
2670 * j=(j+1) & (~1) (see the cephes sources)
2671 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2672 */
2673
2674 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2675 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2676 /*
2677 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2678 */
2679 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2680 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2681
2682 /*
2683 * y = _mm_cvtepi32_ps(emm2);
2684 */
2685 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2686
2687 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2688 LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2689 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2690 LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2691
2692 /*
2693 * Argument used for poly selection and sign bit determination
2694 * is different for sin vs. cos.
2695 */
2696 LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
2697 emm2_and;
2698
2699 LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
2700 LLVMBuildNot(b, emm2_2, ""), ""),
2701 const_29, "sign_bit") :
2702 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
2703 LLVMBuildShl(b, emm2_add,
2704 const_29, ""), ""),
2705 sign_mask, "sign_bit");
2706
2707 /*
2708 * get the polynomial selection mask
2709 * there is one polynomial for 0 <= x <= Pi/4
2710 * and another one for Pi/4 < x <= Pi/2
2711 * Both branches will be computed.
2712 *
2713 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2714 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2715 */
2716
2717 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
2718 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2719 int_type, PIPE_FUNC_EQUAL,
2720 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2721
2722 /*
2723 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2724 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2725 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2726 */
2727 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2728 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2729 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2730
2731 /*
2732 * The magic pass: "Extended precision modular arithmetic"
2733 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2734 * xmm1 = _mm_mul_ps(y, xmm1);
2735 * xmm2 = _mm_mul_ps(y, xmm2);
2736 * xmm3 = _mm_mul_ps(y, xmm3);
2737 */
2738 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
2739 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
2740 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
2741
2742 /*
2743 * x = _mm_add_ps(x, xmm1);
2744 * x = _mm_add_ps(x, xmm2);
2745 * x = _mm_add_ps(x, xmm3);
2746 */
2747
2748 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
2749 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
2750 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
2751
2752 /*
2753 * Evaluate the first polynomial (0 <= x <= Pi/4)
2754 *
2755 * z = _mm_mul_ps(x,x);
2756 */
2757 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2758
2759 /*
2760 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2761 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2762 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2763 */
2764 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2765 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2766 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2767
2768 /*
2769 * y = *(v4sf*)_ps_coscof_p0;
2770 * y = _mm_mul_ps(y, z);
2771 */
2772 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
2773 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
2774 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
2775 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
2776 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2777 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2778
2779
2780 /*
2781 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2782 * y = _mm_sub_ps(y, tmp);
2783 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2784 */
2785 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2786 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2787 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2788 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2789 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2790
2791 /*
2792 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2793 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2794 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2795 */
2796 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2797 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2798 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2799
2800 /*
2801 * Evaluate the second polynomial (Pi/4 < x <= Pi/2)
2802 *
2803 * y2 = *(v4sf*)_ps_sincof_p0;
2804 * y2 = _mm_mul_ps(y2, z);
2805 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2806 * y2 = _mm_mul_ps(y2, z);
2807 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2808 * y2 = _mm_mul_ps(y2, z);
2809 * y2 = _mm_mul_ps(y2, x);
2810 * y2 = _mm_add_ps(y2, x);
2811 */
2812
2813 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
2814 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
2815 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
2816 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
2817 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2818 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
2819 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
2820
2821 /*
2822 * select the correct result from the two polynomials
2823 * xmm3 = poly_mask;
2824 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2825 * y = _mm_andnot_ps(xmm3, y);
2826 * y = _mm_or_ps(y,y2);
2827 */
2828 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2829 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2830 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2831 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
2832 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2833 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
2834
2835 /*
2836 * update the sign
2837 * y = _mm_xor_ps(y, sign_bit);
2838 */
2839 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
2840 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
2841
2842 LLVMValueRef isfinite = lp_build_isfinite(bld, a);
2843
2844 /* clamp output to be within [-1, 1] */
2845 y_result = lp_build_clamp(bld, y_result,
2846 lp_build_const_vec(bld->gallivm, bld->type, -1.f),
2847 lp_build_const_vec(bld->gallivm, bld->type, 1.f));
2848 /* If a is -inf, inf or NaN then return NaN */
2849 y_result = lp_build_select(bld, isfinite, y_result,
2850 lp_build_const_vec(bld->gallivm, bld->type, NAN));
2851 return y_result;
2852 }
2853
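/*
 * Editorial summary of the routine above: following the cephes/sse_mathfun
 * approach, |a| is multiplied by 4/Pi and reduced to an even integer j, so
 * the remainder x - j*Pi/4 lies in roughly [-Pi/4, Pi/4]; the quadrant bits
 * of j select between the sine-like and cosine-like polynomials and
 * determine the sign of the result, and non-finite inputs are mapped to NaN
 * at the end.
 */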
2854
2855 /**
2856 * Generate sin(a)
2857 */
2858 LLVMValueRef
2859 lp_build_sin(struct lp_build_context *bld,
2860 LLVMValueRef a)
2861 {
2862 return lp_build_sin_or_cos(bld, a, FALSE);
2863 }
2864
2865
2866 /**
2867 * Generate cos(a)
2868 */
2869 LLVMValueRef
2870 lp_build_cos(struct lp_build_context *bld,
2871 LLVMValueRef a)
2872 {
2873 return lp_build_sin_or_cos(bld, a, TRUE);
2874 }
2875
2876
2877 /**
2878 * Generate pow(x, y)
2879 */
2880 LLVMValueRef
2881 lp_build_pow(struct lp_build_context *bld,
2882 LLVMValueRef x,
2883 LLVMValueRef y)
2884 {
2885 /* TODO: optimize the constant case */
2886 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2887 LLVMIsConstant(x) && LLVMIsConstant(y)) {
2888 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2889 __FUNCTION__);
2890 }
2891
2892 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
2893 }
2894
2895
2896 /**
2897 * Generate exp(x)
2898 */
2899 LLVMValueRef
2900 lp_build_exp(struct lp_build_context *bld,
2901 LLVMValueRef x)
2902 {
2903 /* log2(e) = 1/log(2) */
2904 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
2905 1.4426950408889634);
2906
2907 assert(lp_check_value(bld->type, x));
2908
2909 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
2910 }
2911
2912
2913 /**
2914 * Generate log(x)
2915 * Behavior is undefined with infs, 0s and nans
2916 */
2917 LLVMValueRef
2918 lp_build_log(struct lp_build_context *bld,
2919 LLVMValueRef x)
2920 {
2921 /* log(2) */
2922 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2923 0.69314718055994529);
2924
2925 assert(lp_check_value(bld->type, x));
2926
2927 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
2928 }
2929
2930 /**
2931 * Generate log(x) that handles edge cases (infs, 0s and nans)
2932 */
2933 LLVMValueRef
2934 lp_build_log_safe(struct lp_build_context *bld,
2935 LLVMValueRef x)
2936 {
2937 /* log(2) */
2938 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2939 0.69314718055994529);
2940
2941 assert(lp_check_value(bld->type, x));
2942
2943 return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
2944 }
2945
2946
2947 /**
2948 * Generate polynomial.
2949 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
2950 */
2951 LLVMValueRef
2952 lp_build_polynomial(struct lp_build_context *bld,
2953 LLVMValueRef x,
2954 const double *coeffs,
2955 unsigned num_coeffs)
2956 {
2957 const struct lp_type type = bld->type;
2958 LLVMValueRef even = NULL, odd = NULL;
2959 LLVMValueRef x2;
2960 unsigned i;
2961
2962 assert(lp_check_value(bld->type, x));
2963
2964 /* TODO: optimize the constant case */
2965 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2966 LLVMIsConstant(x)) {
2967 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2968 __FUNCTION__);
2969 }
2970
2971 /*
2972 * Calculate odd and even terms separately to decrease data dependency
2973 * Ex:
2974 * c[0] + x^2 * c[2] + x^4 * c[4] ...
2975 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
2976 */
2977 x2 = lp_build_mul(bld, x, x);
2978
2979 for (i = num_coeffs; i--; ) {
2980 LLVMValueRef coeff;
2981
2982 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
2983
2984 if (i % 2 == 0) {
2985 if (even)
2986 even = lp_build_add(bld, coeff, lp_build_mul(bld, x2, even));
2987 else
2988 even = coeff;
2989 } else {
2990 if (odd)
2991 odd = lp_build_add(bld, coeff, lp_build_mul(bld, x2, odd));
2992 else
2993 odd = coeff;
2994 }
2995 }
2996
2997 if (odd)
2998 return lp_build_add(bld, lp_build_mul(bld, odd, x), even);
2999 else if (even)
3000 return even;
3001 else
3002 return bld->undef;
3003 }
3004
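/*
 * Illustrative scalar sketch (editorial addition) of the even/odd split used
 * above, for a degree-4 polynomial c0 + c1*x + c2*x^2 + c3*x^3 + c4*x^4:
 *
 *    float x2   = x * x;
 *    float even = c0 + x2 * (c2 + x2 * c4);
 *    float odd  = c1 + x2 * c3;
 *    float res  = even + x * odd;
 *
 * The even and odd Horner chains have no data dependency on each other, so
 * they can be evaluated in parallel before the final combine.
 */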
3005
3006 /**
3007 * Minimax polynomial fit of 2**x, in range [0, 1[
3008 */
3009 const double lp_build_exp2_polynomial[] = {
3010 #if EXP_POLY_DEGREE == 5
3011 1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
3012 0.693153073200168932794,
3013 0.240153617044375388211,
3014 0.0558263180532956664775,
3015 0.00898934009049466391101,
3016 0.00187757667519147912699
3017 #elif EXP_POLY_DEGREE == 4
3018 1.00000259337069434683,
3019 0.693003834469974940458,
3020 0.24144275689150793076,
3021 0.0520114606103070150235,
3022 0.0135341679161270268764
3023 #elif EXP_POLY_DEGREE == 3
3024 0.999925218562710312959,
3025 0.695833540494823811697,
3026 0.226067155427249155588,
3027 0.0780245226406372992967
3028 #elif EXP_POLY_DEGREE == 2
3029 1.00172476321474503578,
3030 0.657636275736077639316,
3031 0.33718943461968720704
3032 #else
3033 #error
3034 #endif
3035 };
3036
3037
3038 LLVMValueRef
3039 lp_build_exp2(struct lp_build_context *bld,
3040 LLVMValueRef x)
3041 {
3042 LLVMBuilderRef builder = bld->gallivm->builder;
3043 const struct lp_type type = bld->type;
3044 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3045 LLVMValueRef ipart = NULL;
3046 LLVMValueRef fpart = NULL;
3047 LLVMValueRef expipart = NULL;
3048 LLVMValueRef expfpart = NULL;
3049 LLVMValueRef res = NULL;
3050
3051 assert(lp_check_value(bld->type, x));
3052
3053 /* TODO: optimize the constant case */
3054 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3055 LLVMIsConstant(x)) {
3056 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3057 __FUNCTION__);
3058 }
3059
3060 assert(type.floating && type.width == 32);
3061
3062 /* We want to preserve NaN and make sure that for exp2 if x > 128,
3063 * the result is INF and if it's smaller than -126.9 the result is 0 */
3064 x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
3065 GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3066 x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
3067 x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3068
3069 /* ipart = floor(x) */
3070 /* fpart = x - ipart */
3071 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3072
3073 /* expipart = (float) (1 << ipart) */
3074 expipart = LLVMBuildAdd(builder, ipart,
3075 lp_build_const_int_vec(bld->gallivm, type, 127), "");
3076 expipart = LLVMBuildShl(builder, expipart,
3077 lp_build_const_int_vec(bld->gallivm, type, 23), "");
3078 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
3079
3080 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3081 Elements(lp_build_exp2_polynomial));
3082
3083 res = LLVMBuildFMul(builder, expipart, expfpart, "");
3084
3085 return res;
3086 }
3087
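/*
 * Worked example (editorial addition) for the 2^ipart construction above:
 * for ipart = 3, (3 + 127) << 23 = 0x41000000, whose bit pattern as a float
 * is exactly 8.0 = 2^3. The polynomial then supplies 2^fpart for fpart in
 * [0, 1[, and the product of the two gives 2^x.
 */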
3088
3089
3090 /**
3091 * Extract the exponent of an IEEE-754 floating point value.
3092 *
3093 * Optionally apply an integer bias.
3094 *
3095 * Result is an integer value with
3096 *
3097 * ifloor(log2(x)) + bias
3098 */
3099 LLVMValueRef
3100 lp_build_extract_exponent(struct lp_build_context *bld,
3101 LLVMValueRef x,
3102 int bias)
3103 {
3104 LLVMBuilderRef builder = bld->gallivm->builder;
3105 const struct lp_type type = bld->type;
3106 unsigned mantissa = lp_mantissa(type);
3107 LLVMValueRef res;
3108
3109 assert(type.floating);
3110
3111 assert(lp_check_value(bld->type, x));
3112
3113 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3114
3115 res = LLVMBuildLShr(builder, x,
3116 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3117 res = LLVMBuildAnd(builder, res,
3118 lp_build_const_int_vec(bld->gallivm, type, 255), "");
3119 res = LLVMBuildSub(builder, res,
3120 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3121
3122 return res;
3123 }
3124
3125
3126 /**
3127 * Extract the mantissa of an IEEE-754 floating point value.
3128 *
3129 * Result is a floating point value with
3130 *
3131 * x / 2**floor(log2(x))
3132 */
3133 LLVMValueRef
3134 lp_build_extract_mantissa(struct lp_build_context *bld,
3135 LLVMValueRef x)
3136 {
3137 LLVMBuilderRef builder = bld->gallivm->builder;
3138 const struct lp_type type = bld->type;
3139 unsigned mantissa = lp_mantissa(type);
3140 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3141 (1ULL << mantissa) - 1);
3142 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3143 LLVMValueRef res;
3144
3145 assert(lp_check_value(bld->type, x));
3146
3147 assert(type.floating);
3148
3149 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3150
3151 /* res = x / 2**ipart */
3152 res = LLVMBuildAnd(builder, x, mantmask, "");
3153 res = LLVMBuildOr(builder, res, one, "");
3154 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3155
3156 return res;
3157 }
3158
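/*
 * Worked example (editorial addition): for x = 12.0 (0x41400000) the stored
 * exponent field is 130, so lp_build_extract_exponent(x, 0) returns 3, and
 * lp_build_extract_mantissa replaces the exponent bits with those of 1.0,
 * yielding 1.5 = 12.0 / 2^3.
 */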
3159
3160
3161 /**
3162 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in the range [0, 1/9[
3163 * These coefficients can be generated with
3164 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3165 */
3166 const double lp_build_log2_polynomial[] = {
3167 #if LOG_POLY_DEGREE == 5
3168 2.88539008148777786488L,
3169 0.961796878841293367824L,
3170 0.577058946784739859012L,
3171 0.412914355135828735411L,
3172 0.308591899232910175289L,
3173 0.352376952300281371868L,
3174 #elif LOG_POLY_DEGREE == 4
3175 2.88539009343309178325L,
3176 0.961791550404184197881L,
3177 0.577440339438736392009L,
3178 0.403343858251329912514L,
3179 0.406718052498846252698L,
3180 #elif LOG_POLY_DEGREE == 3
3181 2.88538959748872753838L,
3182 0.961932915889597772928L,
3183 0.571118517972136195241L,
3184 0.493997535084709500285L,
3185 #else
3186 #error
3187 #endif
3188 };
3189
3190 /**
3191 * See http://www.devmaster.net/forums/showthread.php?p=43580
3192 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3193 * http://www.nezumi.demon.co.uk/consult/logx.htm
3194 *
3195 * If handle_edge_cases is true the function will perform computations
3196 * to match the required D3D10+ behavior for each of the edge cases.
3197 * That means that if input is:
3198 * - less than zero (to and including -inf) then NaN will be returned
3199 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3200 * - +infinity, then +infinity will be returned
3201 * - NaN, then NaN will be returned
3202 *
3203 * Those checks are fairly expensive so if you don't need them make sure
3204 * handle_edge_cases is false.
3205 */
3206 void
3207 lp_build_log2_approx(struct lp_build_context *bld,
3208 LLVMValueRef x,
3209 LLVMValueRef *p_exp,
3210 LLVMValueRef *p_floor_log2,
3211 LLVMValueRef *p_log2,
3212 boolean handle_edge_cases)
3213 {
3214 LLVMBuilderRef builder = bld->gallivm->builder;
3215 const struct lp_type type = bld->type;
3216 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3217 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3218
3219 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3220 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3221 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3222
3223 LLVMValueRef i = NULL;
3224 LLVMValueRef y = NULL;
3225 LLVMValueRef z = NULL;
3226 LLVMValueRef exp = NULL;
3227 LLVMValueRef mant = NULL;
3228 LLVMValueRef logexp = NULL;
3229 LLVMValueRef logmant = NULL;
3230 LLVMValueRef res = NULL;
3231
3232 assert(lp_check_value(bld->type, x));
3233
3234 if(p_exp || p_floor_log2 || p_log2) {
3235 /* TODO: optimize the constant case */
3236 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3237 LLVMIsConstant(x)) {
3238 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3239 __FUNCTION__);
3240 }
3241
3242 assert(type.floating && type.width == 32);
3243
3244 /*
3245 * We don't explicitly handle denormalized numbers. They will yield a
3246 * result in the neighbourhood of -127, which appears to be
3247 * adequate.
3248 */
3249
3250 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3251
3252 /* exp = (float) exponent(x) */
3253 exp = LLVMBuildAnd(builder, i, expmask, "");
3254 }
3255
3256 if(p_floor_log2 || p_log2) {
3257 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3258 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3259 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3260 }
3261
3262 if(p_log2) {
3263 /* mant = 1 + (float) mantissa(x) */
3264 mant = LLVMBuildAnd(builder, i, mantmask, "");
3265 mant = LLVMBuildOr(builder, mant, one, "");
3266 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3267
3268 /* y = (mant - 1) / (mant + 1) */
3269 y = lp_build_div(bld,
3270 lp_build_sub(bld, mant, bld->one),
3271 lp_build_add(bld, mant, bld->one)
3272 );
3273
3274 /* z = y^2 */
3275 z = lp_build_mul(bld, y, y);
3276
3277 /* compute P(z) */
3278 logmant = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3279 Elements(lp_build_log2_polynomial));
3280
3281 /* logmant = y * P(z) */
3282 logmant = lp_build_mul(bld, y, logmant);
3283
3284 res = lp_build_add(bld, logmant, logexp);
3285
3286 if (type.floating && handle_edge_cases) {
3287 LLVMValueRef negmask, infmask, zmask;
3288 negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3289 lp_build_const_vec(bld->gallivm, type, 0.0f));
3290 zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3291 lp_build_const_vec(bld->gallivm, type, 0.0f));
3292 infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3293 lp_build_const_vec(bld->gallivm, type, INFINITY));
3294
3295 /* If x is equal to inf make sure we return inf */
3296 res = lp_build_select(bld, infmask,
3297 lp_build_const_vec(bld->gallivm, type, INFINITY),
3298 res);
3299 /* If x is equal to 0, return -inf */
3300 res = lp_build_select(bld, zmask,
3301 lp_build_const_vec(bld->gallivm, type, -INFINITY),
3302 res);
3303 /* If x is nan or less than 0, return nan */
3304 res = lp_build_select(bld, negmask,
3305 lp_build_const_vec(bld->gallivm, type, NAN),
3306 res);
3307 }
3308 }
3309
3310 if(p_exp) {
3311 exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3312 *p_exp = exp;
3313 }
3314
3315 if(p_floor_log2)
3316 *p_floor_log2 = logexp;
3317
3318 if(p_log2)
3319 *p_log2 = res;
3320 }
3321
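/*
 * Editorial note on the mantissa polynomial above: it relies on the identity
 * log2(m) = 2/ln(2) * atanh((m - 1)/(m + 1)); with y = (m - 1)/(m + 1) the
 * atanh series only has odd powers of y, which is why y * P(y^2) is a good
 * fit for log2 of the mantissa over [1, 2) (note the leading coefficient
 * 2.8853900... is 2/ln(2)).
 */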
3322
3323 /*
3324 * log2 implementation which doesn't have special code to
3325 * handle edge cases (-inf, 0, inf, NaN). It's faster but
3326 * the results for those cases are undefined.
3327 */
3328 LLVMValueRef
3329 lp_build_log2(struct lp_build_context *bld,
3330 LLVMValueRef x)
3331 {
3332 LLVMValueRef res;
3333 lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
3334 return res;
3335 }
3336
3337 /*
3338 * Version of log2 which handles all edge cases.
3339 * Look at documentation of lp_build_log2_approx for
3340 * description of the behavior for each of the edge cases.
3341 */
3342 LLVMValueRef
3343 lp_build_log2_safe(struct lp_build_context *bld,
3344 LLVMValueRef x)
3345 {
3346 LLVMValueRef res;
3347 lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
3348 return res;
3349 }
3350
3351
3352 /**
3353 * Faster (and less accurate) log2.
3354 *
3355 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3356 *
3357 * Piece-wise linear approximation, with exact results when x is a
3358 * power of two.
3359 *
3360 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3361 */
3362 LLVMValueRef
3363 lp_build_fast_log2(struct lp_build_context *bld,
3364 LLVMValueRef x)
3365 {
3366 LLVMBuilderRef builder = bld->gallivm->builder;
3367 LLVMValueRef ipart;
3368 LLVMValueRef fpart;
3369
3370 assert(lp_check_value(bld->type, x));
3371
3372 assert(bld->type.floating);
3373
3374 /* ipart = floor(log2(x)) - 1 */
3375 ipart = lp_build_extract_exponent(bld, x, -1);
3376 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3377
3378 /* fpart = x / 2**ipart */
3379 fpart = lp_build_extract_mantissa(bld, x);
3380
3381 /* ipart + fpart */
3382 return LLVMBuildFAdd(builder, ipart, fpart, "");
3383 }
3384
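/*
 * Worked example (editorial addition): for x = 8.0, ipart = 3 - 1 = 2 and
 * fpart = 8.0 / 2^3 = 1.0, so the result is exactly 3.0. For x = 6.0,
 * ipart = 1 and fpart = 1.5, giving 2.5 versus the exact log2(6) = 2.585,
 * showing the piece-wise linear nature of the approximation.
 */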
3385
3386 /**
3387 * Fast implementation of iround(log2(x)).
3388 *
3389 * Not an approximation -- it should give accurate results all the time.
3390 */
3391 LLVMValueRef
3392 lp_build_ilog2(struct lp_build_context *bld,
3393 LLVMValueRef x)
3394 {
3395 LLVMBuilderRef builder = bld->gallivm->builder;
3396 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3397 LLVMValueRef ipart;
3398
3399 assert(bld->type.floating);
3400
3401 assert(lp_check_value(bld->type, x));
3402
3403 /* x * 2^(0.5) i.e., add 0.5 to the log2(x) */
3404 x = LLVMBuildFMul(builder, x, sqrt2, "");
3405
3406 /* ipart = floor(log2(x) + 0.5) */
3407 ipart = lp_build_extract_exponent(bld, x, 0);
3408
3409 return ipart;
3410 }
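
/*
 * Worked example (editorial addition) for lp_build_ilog2 above: ilog2(5.0)
 * should round log2(5) = 2.32 to 2. Multiplying by sqrt(2) first gives 7.07,
 * whose exponent field is still 2; for 6.0 (log2 = 2.585) the product is
 * 8.49, whose exponent is 3, so extracting the exponent after the multiply
 * behaves like iround(log2(x)).
 */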
3411
3412 LLVMValueRef
3413 lp_build_mod(struct lp_build_context *bld,
3414 LLVMValueRef x,
3415 LLVMValueRef y)
3416 {
3417 LLVMBuilderRef builder = bld->gallivm->builder;
3418 LLVMValueRef res;
3419 const struct lp_type type = bld->type;
3420
3421 assert(lp_check_value(type, x));
3422 assert(lp_check_value(type, y));
3423
3424 if (type.floating)
3425 res = LLVMBuildFRem(builder, x, y, "");
3426 else if (type.sign)
3427 res = LLVMBuildSRem(builder, x, y, "");
3428 else
3429 res = LLVMBuildURem(builder, x, y, "");
3430 return res;
3431 }
3432
3433
3434 /*
3435 * For floating inputs it creates and returns a mask
3436 * which is all 1's for channels which are NaN.
3437 * Channels inside x which are not NaN will be 0.
3438 */
3439 LLVMValueRef
3440 lp_build_isnan(struct lp_build_context *bld,
3441 LLVMValueRef x)
3442 {
3443 LLVMValueRef mask;
3444 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3445
3446 assert(bld->type.floating);
3447 assert(lp_check_value(bld->type, x));
3448
3449 mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3450 "isnotnan");
3451 mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3452 mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3453 return mask;
3454 }
3455
3456 /* Returns all 1's for floating point numbers that are
3457 * finite, and returns all zeros for -inf,
3458 * inf and NaNs. */
3459 LLVMValueRef
3460 lp_build_isfinite(struct lp_build_context *bld,
3461 LLVMValueRef x)
3462 {
3463 LLVMBuilderRef builder = bld->gallivm->builder;
3464 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3465 struct lp_type int_type = lp_int_type(bld->type);
3466 LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3467 LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3468 0x7f800000);
3469
3470 if (!bld->type.floating) {
3471 return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3472 }
3473 assert(bld->type.floating);
3474 assert(lp_check_value(bld->type, x));
3475 assert(bld->type.width == 32);
3476
3477 intx = LLVMBuildAnd(builder, intx, infornan32, "");
3478 return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3479 intx, infornan32);
3480 }
3481
3482 /*
3483 * Returns true if the number is nan or inf and false otherwise.
3484 * The input has to be a floating point vector.
3485 */
3486 LLVMValueRef
3487 lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
3488 const struct lp_type type,
3489 LLVMValueRef x)
3490 {
3491 LLVMBuilderRef builder = gallivm->builder;
3492 struct lp_type int_type = lp_int_type(type);
3493 LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
3494 0x7f800000);
3495 LLVMValueRef ret;
3496
3497 assert(type.floating);
3498
3499 ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
3500 ret = LLVMBuildAnd(builder, ret, const0, "");
3501 ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
3502 ret, const0);
3503
3504 return ret;
3505 }
3506
3507
3508 LLVMValueRef
3509 lp_build_fpstate_get(struct gallivm_state *gallivm)
3510 {
3511 if (util_cpu_caps.has_sse) {
3512 LLVMBuilderRef builder = gallivm->builder;
3513 LLVMValueRef mxcsr_ptr = lp_build_alloca(
3514 gallivm,
3515 LLVMInt32TypeInContext(gallivm->context),
3516 "mxcsr_ptr");
3517 LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
3518 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3519 lp_build_intrinsic(builder,
3520 "llvm.x86.sse.stmxcsr",
3521 LLVMVoidTypeInContext(gallivm->context),
3522 &mxcsr_ptr8, 1);
3523 return mxcsr_ptr;
3524 }
3525 return 0;
3526 }
3527
3528 void
3529 lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
3530 boolean zero)
3531 {
3532 if (util_cpu_caps.has_sse) {
3533 /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
3534 int daz_ftz = _MM_FLUSH_ZERO_MASK;
3535
3536 LLVMBuilderRef builder = gallivm->builder;
3537 LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
3538 LLVMValueRef mxcsr =
3539 LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");
3540
3541 if (util_cpu_caps.has_daz) {
3542 /* Enable the denormals-are-zero mode */
3543 daz_ftz |= _MM_DENORMALS_ZERO_MASK;
3544 }
3545 if (zero) {
3546 mxcsr = LLVMBuildOr(builder, mxcsr,
3547 LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
3548 } else {
3549 mxcsr = LLVMBuildAnd(builder, mxcsr,
3550 LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
3551 }
3552
3553 LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
3554 lp_build_fpstate_set(gallivm, mxcsr_ptr);
3555 }
3556 }
3557
3558 void
3559 lp_build_fpstate_set(struct gallivm_state *gallivm,
3560 LLVMValueRef mxcsr_ptr)
3561 {
3562 if (util_cpu_caps.has_sse) {
3563 LLVMBuilderRef builder = gallivm->builder;
3564 mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
3565 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3566 lp_build_intrinsic(builder,
3567 "llvm.x86.sse.ldmxcsr",
3568 LLVMVoidTypeInContext(gallivm->context),
3569 &mxcsr_ptr, 1);
3570 }
3571 }
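
/*
 * Usage note (editorial addition): a typical pattern with the fpstate
 * helpers above is to save the MXCSR state, force denormals to zero around
 * a block of generated FP code, and restore it afterwards, e.g.:
 *
 *    LLVMValueRef saved = lp_build_fpstate_get(gallivm);
 *    lp_build_fpstate_set_denorms_zero(gallivm, TRUE);
 *    ... emit FP code ...
 *    lp_build_fpstate_set(gallivm, saved);
 */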