1 /**************************************************************************
3 * Copyright 2009-2010 VMware, Inc.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **************************************************************************/
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
38 * We also do simple expressions simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
44 * @author Jose Fonseca <jfonseca@vmware.com>
50 #include <llvm/Config/llvm-config.h>
52 #include "util/u_memory.h"
53 #include "util/u_debug.h"
54 #include "util/u_math.h"
55 #include "util/u_cpu_detect.h"
57 #include "lp_bld_type.h"
58 #include "lp_bld_const.h"
59 #include "lp_bld_init.h"
60 #include "lp_bld_intr.h"
61 #include "lp_bld_logic.h"
62 #include "lp_bld_pack.h"
63 #include "lp_bld_debug.h"
64 #include "lp_bld_bitarit.h"
65 #include "lp_bld_arit.h"
66 #include "lp_bld_flow.h"
68 #if defined(PIPE_ARCH_SSE)
69 #include <xmmintrin.h>
72 #ifndef _MM_DENORMALS_ZERO_MASK
73 #define _MM_DENORMALS_ZERO_MASK 0x0040
76 #ifndef _MM_FLUSH_ZERO_MASK
77 #define _MM_FLUSH_ZERO_MASK 0x8000
80 #define EXP_POLY_DEGREE 5
82 #define LOG_POLY_DEGREE 4
87 * No checks for special case values of a or b = 1 or 0 are done.
88 * NaN's are handled according to the behavior specified by the
89 * nan_behavior argument.
92 lp_build_min_simple(struct lp_build_context
*bld
,
95 enum gallivm_nan_behavior nan_behavior
)
97 const struct lp_type type
= bld
->type
;
98 const char *intrinsic
= NULL
;
99 unsigned intr_size
= 0;
102 assert(lp_check_value(type
, a
));
103 assert(lp_check_value(type
, b
));
105 /* TODO: optimize the constant case */
107 if (type
.floating
&& util_cpu_caps
.has_sse
) {
108 if (type
.width
== 32) {
109 if (type
.length
== 1) {
110 intrinsic
= "llvm.x86.sse.min.ss";
113 else if (type
.length
<= 4 || !util_cpu_caps
.has_avx
) {
114 intrinsic
= "llvm.x86.sse.min.ps";
118 intrinsic
= "llvm.x86.avx.min.ps.256";
122 if (type
.width
== 64 && util_cpu_caps
.has_sse2
) {
123 if (type
.length
== 1) {
124 intrinsic
= "llvm.x86.sse2.min.sd";
127 else if (type
.length
== 2 || !util_cpu_caps
.has_avx
) {
128 intrinsic
= "llvm.x86.sse2.min.pd";
132 intrinsic
= "llvm.x86.avx.min.pd.256";
137 else if (type
.floating
&& util_cpu_caps
.has_altivec
) {
138 if (nan_behavior
== GALLIVM_NAN_RETURN_NAN
||
139 nan_behavior
== GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN
) {
140 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
143 if (type
.width
== 32 && type
.length
== 4) {
144 intrinsic
= "llvm.ppc.altivec.vminfp";
147 } else if (util_cpu_caps
.has_altivec
) {
149 if (type
.width
== 8) {
151 intrinsic
= "llvm.ppc.altivec.vminub";
153 intrinsic
= "llvm.ppc.altivec.vminsb";
155 } else if (type
.width
== 16) {
157 intrinsic
= "llvm.ppc.altivec.vminuh";
159 intrinsic
= "llvm.ppc.altivec.vminsh";
161 } else if (type
.width
== 32) {
163 intrinsic
= "llvm.ppc.altivec.vminuw";
165 intrinsic
= "llvm.ppc.altivec.vminsw";
171 /* We need to handle nan's for floating point numbers. If one of the
172 * inputs is nan the other should be returned (required by both D3D10+
174 * The sse intrinsics return the second operator in case of nan by
175 * default so we need to special code to handle those.
177 if (util_cpu_caps
.has_sse
&& type
.floating
&&
178 nan_behavior
!= GALLIVM_NAN_BEHAVIOR_UNDEFINED
&&
179 nan_behavior
!= GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN
&&
180 nan_behavior
!= GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN
) {
181 LLVMValueRef isnan
, min
;
182 min
= lp_build_intrinsic_binary_anylength(bld
->gallivm
, intrinsic
,
185 if (nan_behavior
== GALLIVM_NAN_RETURN_OTHER
) {
186 isnan
= lp_build_isnan(bld
, b
);
187 return lp_build_select(bld
, isnan
, a
, min
);
189 assert(nan_behavior
== GALLIVM_NAN_RETURN_NAN
);
190 isnan
= lp_build_isnan(bld
, a
);
191 return lp_build_select(bld
, isnan
, a
, min
);
194 return lp_build_intrinsic_binary_anylength(bld
->gallivm
, intrinsic
,
201 switch (nan_behavior
) {
202 case GALLIVM_NAN_RETURN_NAN
: {
203 LLVMValueRef isnan
= lp_build_isnan(bld
, b
);
204 cond
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, a
, b
);
205 cond
= LLVMBuildXor(bld
->gallivm
->builder
, cond
, isnan
, "");
206 return lp_build_select(bld
, cond
, a
, b
);
209 case GALLIVM_NAN_RETURN_OTHER
: {
210 LLVMValueRef isnan
= lp_build_isnan(bld
, a
);
211 cond
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, a
, b
);
212 cond
= LLVMBuildXor(bld
->gallivm
->builder
, cond
, isnan
, "");
213 return lp_build_select(bld
, cond
, a
, b
);
216 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN
:
217 cond
= lp_build_cmp_ordered(bld
, PIPE_FUNC_LESS
, a
, b
);
218 return lp_build_select(bld
, cond
, a
, b
);
219 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN
:
220 cond
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, b
, a
);
221 return lp_build_select(bld
, cond
, b
, a
);
222 case GALLIVM_NAN_BEHAVIOR_UNDEFINED
:
223 cond
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, a
, b
);
224 return lp_build_select(bld
, cond
, a
, b
);
228 cond
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, a
, b
);
229 return lp_build_select(bld
, cond
, a
, b
);
232 cond
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, a
, b
);
233 return lp_build_select(bld
, cond
, a
, b
);
239 lp_build_fmuladd(LLVMBuilderRef builder
,
244 LLVMTypeRef type
= LLVMTypeOf(a
);
245 assert(type
== LLVMTypeOf(b
));
246 assert(type
== LLVMTypeOf(c
));
249 lp_format_intrinsic(intrinsic
, sizeof intrinsic
, "llvm.fmuladd", type
);
250 LLVMValueRef args
[] = { a
, b
, c
};
251 return lp_build_intrinsic(builder
, intrinsic
, type
, args
, 3, 0);
257 * No checks for special case values of a or b = 1 or 0 are done.
258 * NaN's are handled according to the behavior specified by the
259 * nan_behavior argument.
262 lp_build_max_simple(struct lp_build_context
*bld
,
265 enum gallivm_nan_behavior nan_behavior
)
267 const struct lp_type type
= bld
->type
;
268 const char *intrinsic
= NULL
;
269 unsigned intr_size
= 0;
272 assert(lp_check_value(type
, a
));
273 assert(lp_check_value(type
, b
));
275 /* TODO: optimize the constant case */
277 if (type
.floating
&& util_cpu_caps
.has_sse
) {
278 if (type
.width
== 32) {
279 if (type
.length
== 1) {
280 intrinsic
= "llvm.x86.sse.max.ss";
283 else if (type
.length
<= 4 || !util_cpu_caps
.has_avx
) {
284 intrinsic
= "llvm.x86.sse.max.ps";
288 intrinsic
= "llvm.x86.avx.max.ps.256";
292 if (type
.width
== 64 && util_cpu_caps
.has_sse2
) {
293 if (type
.length
== 1) {
294 intrinsic
= "llvm.x86.sse2.max.sd";
297 else if (type
.length
== 2 || !util_cpu_caps
.has_avx
) {
298 intrinsic
= "llvm.x86.sse2.max.pd";
302 intrinsic
= "llvm.x86.avx.max.pd.256";
307 else if (type
.floating
&& util_cpu_caps
.has_altivec
) {
308 if (nan_behavior
== GALLIVM_NAN_RETURN_NAN
||
309 nan_behavior
== GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN
) {
310 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
313 if (type
.width
== 32 || type
.length
== 4) {
314 intrinsic
= "llvm.ppc.altivec.vmaxfp";
317 } else if (util_cpu_caps
.has_altivec
) {
319 if (type
.width
== 8) {
321 intrinsic
= "llvm.ppc.altivec.vmaxub";
323 intrinsic
= "llvm.ppc.altivec.vmaxsb";
325 } else if (type
.width
== 16) {
327 intrinsic
= "llvm.ppc.altivec.vmaxuh";
329 intrinsic
= "llvm.ppc.altivec.vmaxsh";
331 } else if (type
.width
== 32) {
333 intrinsic
= "llvm.ppc.altivec.vmaxuw";
335 intrinsic
= "llvm.ppc.altivec.vmaxsw";
341 if (util_cpu_caps
.has_sse
&& type
.floating
&&
342 nan_behavior
!= GALLIVM_NAN_BEHAVIOR_UNDEFINED
&&
343 nan_behavior
!= GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN
&&
344 nan_behavior
!= GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN
) {
345 LLVMValueRef isnan
, max
;
346 max
= lp_build_intrinsic_binary_anylength(bld
->gallivm
, intrinsic
,
349 if (nan_behavior
== GALLIVM_NAN_RETURN_OTHER
) {
350 isnan
= lp_build_isnan(bld
, b
);
351 return lp_build_select(bld
, isnan
, a
, max
);
353 assert(nan_behavior
== GALLIVM_NAN_RETURN_NAN
);
354 isnan
= lp_build_isnan(bld
, a
);
355 return lp_build_select(bld
, isnan
, a
, max
);
358 return lp_build_intrinsic_binary_anylength(bld
->gallivm
, intrinsic
,
365 switch (nan_behavior
) {
366 case GALLIVM_NAN_RETURN_NAN
: {
367 LLVMValueRef isnan
= lp_build_isnan(bld
, b
);
368 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, b
);
369 cond
= LLVMBuildXor(bld
->gallivm
->builder
, cond
, isnan
, "");
370 return lp_build_select(bld
, cond
, a
, b
);
373 case GALLIVM_NAN_RETURN_OTHER
: {
374 LLVMValueRef isnan
= lp_build_isnan(bld
, a
);
375 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, b
);
376 cond
= LLVMBuildXor(bld
->gallivm
->builder
, cond
, isnan
, "");
377 return lp_build_select(bld
, cond
, a
, b
);
380 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN
:
381 cond
= lp_build_cmp_ordered(bld
, PIPE_FUNC_GREATER
, a
, b
);
382 return lp_build_select(bld
, cond
, a
, b
);
383 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN
:
384 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, b
, a
);
385 return lp_build_select(bld
, cond
, b
, a
);
386 case GALLIVM_NAN_BEHAVIOR_UNDEFINED
:
387 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, b
);
388 return lp_build_select(bld
, cond
, a
, b
);
392 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, b
);
393 return lp_build_select(bld
, cond
, a
, b
);
396 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, b
);
397 return lp_build_select(bld
, cond
, a
, b
);
403 * Generate 1 - a, or ~a depending on bld->type.
406 lp_build_comp(struct lp_build_context
*bld
,
409 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
410 const struct lp_type type
= bld
->type
;
412 assert(lp_check_value(type
, a
));
419 if(type
.norm
&& !type
.floating
&& !type
.fixed
&& !type
.sign
) {
420 if(LLVMIsConstant(a
))
421 return LLVMConstNot(a
);
423 return LLVMBuildNot(builder
, a
, "");
426 if(LLVMIsConstant(a
))
428 return LLVMConstFSub(bld
->one
, a
);
430 return LLVMConstSub(bld
->one
, a
);
433 return LLVMBuildFSub(builder
, bld
->one
, a
, "");
435 return LLVMBuildSub(builder
, bld
->one
, a
, "");
443 lp_build_add(struct lp_build_context
*bld
,
447 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
448 const struct lp_type type
= bld
->type
;
451 assert(lp_check_value(type
, a
));
452 assert(lp_check_value(type
, b
));
458 if (a
== bld
->undef
|| b
== bld
->undef
)
462 const char *intrinsic
= NULL
;
464 if (!type
.sign
&& (a
== bld
->one
|| b
== bld
->one
))
467 if (!type
.floating
&& !type
.fixed
) {
468 if (LLVM_VERSION_MAJOR
>= 9) {
470 intrinsic
= type
.sign
? "llvm.sadd.sat" : "llvm.uadd.sat";
471 lp_format_intrinsic(intrin
, sizeof intrin
, intrinsic
, bld
->vec_type
);
472 return lp_build_intrinsic_binary(builder
, intrin
, bld
->vec_type
, a
, b
);
474 if (type
.width
* type
.length
== 128) {
475 if (util_cpu_caps
.has_sse2
) {
477 intrinsic
= type
.sign
? "llvm.x86.sse2.padds.b" :
478 LLVM_VERSION_MAJOR
< 8 ? "llvm.x86.sse2.paddus.b" : NULL
;
479 if (type
.width
== 16)
480 intrinsic
= type
.sign
? "llvm.x86.sse2.padds.w" :
481 LLVM_VERSION_MAJOR
< 8 ? "llvm.x86.sse2.paddus.w" : NULL
;
482 } else if (util_cpu_caps
.has_altivec
) {
484 intrinsic
= type
.sign
? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
485 if (type
.width
== 16)
486 intrinsic
= type
.sign
? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
489 if (type
.width
* type
.length
== 256) {
490 if (util_cpu_caps
.has_avx2
) {
492 intrinsic
= type
.sign
? "llvm.x86.avx2.padds.b" :
493 LLVM_VERSION_MAJOR
< 8 ? "llvm.x86.avx2.paddus.b" : NULL
;
494 if (type
.width
== 16)
495 intrinsic
= type
.sign
? "llvm.x86.avx2.padds.w" :
496 LLVM_VERSION_MAJOR
< 8 ? "llvm.x86.avx2.paddus.w" : NULL
;
502 return lp_build_intrinsic_binary(builder
, intrinsic
, lp_build_vec_type(bld
->gallivm
, bld
->type
), a
, b
);
505 if(type
.norm
&& !type
.floating
&& !type
.fixed
) {
507 uint64_t sign
= (uint64_t)1 << (type
.width
- 1);
508 LLVMValueRef max_val
= lp_build_const_int_vec(bld
->gallivm
, type
, sign
- 1);
509 LLVMValueRef min_val
= lp_build_const_int_vec(bld
->gallivm
, type
, sign
);
510 /* a_clamp_max is the maximum a for positive b,
511 a_clamp_min is the minimum a for negative b. */
512 LLVMValueRef a_clamp_max
= lp_build_min_simple(bld
, a
, LLVMBuildSub(builder
, max_val
, b
, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED
);
513 LLVMValueRef a_clamp_min
= lp_build_max_simple(bld
, a
, LLVMBuildSub(builder
, min_val
, b
, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED
);
514 a
= lp_build_select(bld
, lp_build_cmp(bld
, PIPE_FUNC_GREATER
, b
, bld
->zero
), a_clamp_max
, a_clamp_min
);
518 if(LLVMIsConstant(a
) && LLVMIsConstant(b
))
520 res
= LLVMConstFAdd(a
, b
);
522 res
= LLVMConstAdd(a
, b
);
525 res
= LLVMBuildFAdd(builder
, a
, b
, "");
527 res
= LLVMBuildAdd(builder
, a
, b
, "");
529 /* clamp to ceiling of 1.0 */
530 if(bld
->type
.norm
&& (bld
->type
.floating
|| bld
->type
.fixed
))
531 res
= lp_build_min_simple(bld
, res
, bld
->one
, GALLIVM_NAN_BEHAVIOR_UNDEFINED
);
533 if (type
.norm
&& !type
.floating
&& !type
.fixed
) {
536 * newer llvm versions no longer support the intrinsics, but recognize
537 * the pattern. Since auto-upgrade of intrinsics doesn't work for jit
538 * code, it is important we match the pattern llvm uses (and pray llvm
539 * doesn't change it - and hope they decide on the same pattern for
540 * all backends supporting it...).
541 * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
542 * interfere with llvm's ability to recognize the pattern but seems
544 * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
546 LLVMValueRef overflowed
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, res
);
547 res
= lp_build_select(bld
, overflowed
,
548 LLVMConstAllOnes(bld
->int_vec_type
), res
);
552 /* XXX clamp to floor of -1 or 0??? */
558 /** Return the scalar sum of the elements of a.
559 * Should avoid this operation whenever possible.
562 lp_build_horizontal_add(struct lp_build_context
*bld
,
565 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
566 const struct lp_type type
= bld
->type
;
567 LLVMValueRef index
, res
;
569 LLVMValueRef shuffles1
[LP_MAX_VECTOR_LENGTH
/ 2];
570 LLVMValueRef shuffles2
[LP_MAX_VECTOR_LENGTH
/ 2];
571 LLVMValueRef vecres
, elem2
;
573 assert(lp_check_value(type
, a
));
575 if (type
.length
== 1) {
579 assert(!bld
->type
.norm
);
582 * for byte vectors can do much better with psadbw.
583 * Using repeated shuffle/adds here. Note with multiple vectors
584 * this can be done more efficiently as outlined in the intel
585 * optimization manual.
586 * Note: could cause data rearrangement if used with smaller element
591 length
= type
.length
/ 2;
593 LLVMValueRef vec1
, vec2
;
594 for (i
= 0; i
< length
; i
++) {
595 shuffles1
[i
] = lp_build_const_int32(bld
->gallivm
, i
);
596 shuffles2
[i
] = lp_build_const_int32(bld
->gallivm
, i
+ length
);
598 vec1
= LLVMBuildShuffleVector(builder
, vecres
, vecres
,
599 LLVMConstVector(shuffles1
, length
), "");
600 vec2
= LLVMBuildShuffleVector(builder
, vecres
, vecres
,
601 LLVMConstVector(shuffles2
, length
), "");
603 vecres
= LLVMBuildFAdd(builder
, vec1
, vec2
, "");
606 vecres
= LLVMBuildAdd(builder
, vec1
, vec2
, "");
608 length
= length
>> 1;
611 /* always have vector of size 2 here */
614 index
= lp_build_const_int32(bld
->gallivm
, 0);
615 res
= LLVMBuildExtractElement(builder
, vecres
, index
, "");
616 index
= lp_build_const_int32(bld
->gallivm
, 1);
617 elem2
= LLVMBuildExtractElement(builder
, vecres
, index
, "");
620 res
= LLVMBuildFAdd(builder
, res
, elem2
, "");
622 res
= LLVMBuildAdd(builder
, res
, elem2
, "");
628 * Return the horizontal sums of 4 float vectors as a float4 vector.
629 * This uses the technique as outlined in Intel Optimization Manual.
632 lp_build_horizontal_add4x4f(struct lp_build_context
*bld
,
635 struct gallivm_state
*gallivm
= bld
->gallivm
;
636 LLVMBuilderRef builder
= gallivm
->builder
;
637 LLVMValueRef shuffles
[4];
639 LLVMValueRef sumtmp
[2], shuftmp
[2];
641 /* lower half of regs */
642 shuffles
[0] = lp_build_const_int32(gallivm
, 0);
643 shuffles
[1] = lp_build_const_int32(gallivm
, 1);
644 shuffles
[2] = lp_build_const_int32(gallivm
, 4);
645 shuffles
[3] = lp_build_const_int32(gallivm
, 5);
646 tmp
[0] = LLVMBuildShuffleVector(builder
, src
[0], src
[1],
647 LLVMConstVector(shuffles
, 4), "");
648 tmp
[2] = LLVMBuildShuffleVector(builder
, src
[2], src
[3],
649 LLVMConstVector(shuffles
, 4), "");
651 /* upper half of regs */
652 shuffles
[0] = lp_build_const_int32(gallivm
, 2);
653 shuffles
[1] = lp_build_const_int32(gallivm
, 3);
654 shuffles
[2] = lp_build_const_int32(gallivm
, 6);
655 shuffles
[3] = lp_build_const_int32(gallivm
, 7);
656 tmp
[1] = LLVMBuildShuffleVector(builder
, src
[0], src
[1],
657 LLVMConstVector(shuffles
, 4), "");
658 tmp
[3] = LLVMBuildShuffleVector(builder
, src
[2], src
[3],
659 LLVMConstVector(shuffles
, 4), "");
661 sumtmp
[0] = LLVMBuildFAdd(builder
, tmp
[0], tmp
[1], "");
662 sumtmp
[1] = LLVMBuildFAdd(builder
, tmp
[2], tmp
[3], "");
664 shuffles
[0] = lp_build_const_int32(gallivm
, 0);
665 shuffles
[1] = lp_build_const_int32(gallivm
, 2);
666 shuffles
[2] = lp_build_const_int32(gallivm
, 4);
667 shuffles
[3] = lp_build_const_int32(gallivm
, 6);
668 shuftmp
[0] = LLVMBuildShuffleVector(builder
, sumtmp
[0], sumtmp
[1],
669 LLVMConstVector(shuffles
, 4), "");
671 shuffles
[0] = lp_build_const_int32(gallivm
, 1);
672 shuffles
[1] = lp_build_const_int32(gallivm
, 3);
673 shuffles
[2] = lp_build_const_int32(gallivm
, 5);
674 shuffles
[3] = lp_build_const_int32(gallivm
, 7);
675 shuftmp
[1] = LLVMBuildShuffleVector(builder
, sumtmp
[0], sumtmp
[1],
676 LLVMConstVector(shuffles
, 4), "");
678 return LLVMBuildFAdd(builder
, shuftmp
[0], shuftmp
[1], "");
683 * partially horizontally add 2-4 float vectors with length nx4,
684 * i.e. only four adjacent values in each vector will be added,
685 * assuming values are really grouped in 4 which also determines
688 * Return a vector of the same length as the initial vectors,
689 * with the excess elements (if any) being undefined.
690 * The element order is independent of number of input vectors.
691 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
692 * the output order thus will be
693 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4z7,undef
696 lp_build_hadd_partial4(struct lp_build_context
*bld
,
697 LLVMValueRef vectors
[],
700 struct gallivm_state
*gallivm
= bld
->gallivm
;
701 LLVMBuilderRef builder
= gallivm
->builder
;
702 LLVMValueRef ret_vec
;
704 const char *intrinsic
= NULL
;
706 assert(num_vecs
>= 2 && num_vecs
<= 4);
707 assert(bld
->type
.floating
);
709 /* only use this with at least 2 vectors, as it is sort of expensive
710 * (depending on cpu) and we always need two horizontal adds anyway,
711 * so a shuffle/add approach might be better.
717 tmp
[2] = num_vecs
> 2 ? vectors
[2] : vectors
[0];
718 tmp
[3] = num_vecs
> 3 ? vectors
[3] : vectors
[0];
720 if (util_cpu_caps
.has_sse3
&& bld
->type
.width
== 32 &&
721 bld
->type
.length
== 4) {
722 intrinsic
= "llvm.x86.sse3.hadd.ps";
724 else if (util_cpu_caps
.has_avx
&& bld
->type
.width
== 32 &&
725 bld
->type
.length
== 8) {
726 intrinsic
= "llvm.x86.avx.hadd.ps.256";
729 tmp
[0] = lp_build_intrinsic_binary(builder
, intrinsic
,
730 lp_build_vec_type(gallivm
, bld
->type
),
733 tmp
[1] = lp_build_intrinsic_binary(builder
, intrinsic
,
734 lp_build_vec_type(gallivm
, bld
->type
),
740 return lp_build_intrinsic_binary(builder
, intrinsic
,
741 lp_build_vec_type(gallivm
, bld
->type
),
745 if (bld
->type
.length
== 4) {
746 ret_vec
= lp_build_horizontal_add4x4f(bld
, tmp
);
749 LLVMValueRef partres
[LP_MAX_VECTOR_LENGTH
/4];
751 unsigned num_iter
= bld
->type
.length
/ 4;
752 struct lp_type parttype
= bld
->type
;
754 for (j
= 0; j
< num_iter
; j
++) {
755 LLVMValueRef partsrc
[4];
757 for (i
= 0; i
< 4; i
++) {
758 partsrc
[i
] = lp_build_extract_range(gallivm
, tmp
[i
], j
*4, 4);
760 partres
[j
] = lp_build_horizontal_add4x4f(bld
, partsrc
);
762 ret_vec
= lp_build_concat(gallivm
, partres
, parttype
, num_iter
);
771 lp_build_sub(struct lp_build_context
*bld
,
775 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
776 const struct lp_type type
= bld
->type
;
779 assert(lp_check_value(type
, a
));
780 assert(lp_check_value(type
, b
));
784 if (a
== bld
->undef
|| b
== bld
->undef
)
790 const char *intrinsic
= NULL
;
792 if (!type
.sign
&& b
== bld
->one
)
795 if (!type
.floating
&& !type
.fixed
) {
796 if (LLVM_VERSION_MAJOR
>= 9) {
798 intrinsic
= type
.sign
? "llvm.ssub.sat" : "llvm.usub.sat";
799 lp_format_intrinsic(intrin
, sizeof intrin
, intrinsic
, bld
->vec_type
);
800 return lp_build_intrinsic_binary(builder
, intrin
, bld
->vec_type
, a
, b
);
802 if (type
.width
* type
.length
== 128) {
803 if (util_cpu_caps
.has_sse2
) {
805 intrinsic
= type
.sign
? "llvm.x86.sse2.psubs.b" :
806 LLVM_VERSION_MAJOR
< 8 ? "llvm.x86.sse2.psubus.b" : NULL
;
807 if (type
.width
== 16)
808 intrinsic
= type
.sign
? "llvm.x86.sse2.psubs.w" :
809 LLVM_VERSION_MAJOR
< 8 ? "llvm.x86.sse2.psubus.w" : NULL
;
810 } else if (util_cpu_caps
.has_altivec
) {
812 intrinsic
= type
.sign
? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
813 if (type
.width
== 16)
814 intrinsic
= type
.sign
? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
817 if (type
.width
* type
.length
== 256) {
818 if (util_cpu_caps
.has_avx2
) {
820 intrinsic
= type
.sign
? "llvm.x86.avx2.psubs.b" :
821 LLVM_VERSION_MAJOR
< 8 ? "llvm.x86.avx2.psubus.b" : NULL
;
822 if (type
.width
== 16)
823 intrinsic
= type
.sign
? "llvm.x86.avx2.psubs.w" :
824 LLVM_VERSION_MAJOR
< 8 ? "llvm.x86.avx2.psubus.w" : NULL
;
830 return lp_build_intrinsic_binary(builder
, intrinsic
, lp_build_vec_type(bld
->gallivm
, bld
->type
), a
, b
);
833 if(type
.norm
&& !type
.floating
&& !type
.fixed
) {
835 uint64_t sign
= (uint64_t)1 << (type
.width
- 1);
836 LLVMValueRef max_val
= lp_build_const_int_vec(bld
->gallivm
, type
, sign
- 1);
837 LLVMValueRef min_val
= lp_build_const_int_vec(bld
->gallivm
, type
, sign
);
838 /* a_clamp_max is the maximum a for negative b,
839 a_clamp_min is the minimum a for positive b. */
840 LLVMValueRef a_clamp_max
= lp_build_min_simple(bld
, a
, LLVMBuildAdd(builder
, max_val
, b
, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED
);
841 LLVMValueRef a_clamp_min
= lp_build_max_simple(bld
, a
, LLVMBuildAdd(builder
, min_val
, b
, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED
);
842 a
= lp_build_select(bld
, lp_build_cmp(bld
, PIPE_FUNC_GREATER
, b
, bld
->zero
), a_clamp_min
, a_clamp_max
);
845 * This must match llvm pattern for saturated unsigned sub.
846 * (lp_build_max_simple actually does the job with its current
847 * definition but do it explicitly here.)
848 * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
849 * interfere with llvm's ability to recognize the pattern but seems
851 * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
853 LLVMValueRef no_ov
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, b
);
854 a
= lp_build_select(bld
, no_ov
, a
, b
);
858 if(LLVMIsConstant(a
) && LLVMIsConstant(b
))
860 res
= LLVMConstFSub(a
, b
);
862 res
= LLVMConstSub(a
, b
);
865 res
= LLVMBuildFSub(builder
, a
, b
, "");
867 res
= LLVMBuildSub(builder
, a
, b
, "");
869 if(bld
->type
.norm
&& (bld
->type
.floating
|| bld
->type
.fixed
))
870 res
= lp_build_max_simple(bld
, res
, bld
->zero
, GALLIVM_NAN_BEHAVIOR_UNDEFINED
);
878 * Normalized multiplication.
880 * There are several approaches for (using 8-bit normalized multiplication as
885 * makes the following approximation to the division (Sree)
887 * a*b/255 ~= (a*(b + 1)) >> 256
889 * which is the fastest method that satisfies the following OpenGL criteria of
891 * 0*0 = 0 and 255*255 = 255
895 * takes the geometric series approximation to the division
897 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
899 * in this case just the first two terms to fit in 16bit arithmetic
901 * t/255 ~= (t + (t >> 8)) >> 8
903 * note that just by itself it doesn't satisfies the OpenGL criteria, as
904 * 255*255 = 254, so the special case b = 255 must be accounted or roundoff
907 * - geometric series plus rounding
909 * when using a geometric series division instead of truncating the result
910 * use roundoff in the approximation (Jim Blinn)
912 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
914 * achieving the exact results.
918 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
919 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
920 * @sa Michael Herf, The "double blend trick", May 2000,
921 * http://www.stereopsis.com/doubleblend.html
924 lp_build_mul_norm(struct gallivm_state
*gallivm
,
925 struct lp_type wide_type
,
926 LLVMValueRef a
, LLVMValueRef b
)
928 LLVMBuilderRef builder
= gallivm
->builder
;
929 struct lp_build_context bld
;
934 assert(!wide_type
.floating
);
935 assert(lp_check_value(wide_type
, a
));
936 assert(lp_check_value(wide_type
, b
));
938 lp_build_context_init(&bld
, gallivm
, wide_type
);
940 n
= wide_type
.width
/ 2;
941 if (wide_type
.sign
) {
946 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
947 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
951 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
954 ab
= LLVMBuildMul(builder
, a
, b
, "");
955 ab
= LLVMBuildAdd(builder
, ab
, lp_build_shr_imm(&bld
, ab
, n
), "");
958 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
961 half
= lp_build_const_int_vec(gallivm
, wide_type
, 1LL << (n
- 1));
962 if (wide_type
.sign
) {
963 LLVMValueRef minus_half
= LLVMBuildNeg(builder
, half
, "");
964 LLVMValueRef sign
= lp_build_shr_imm(&bld
, ab
, wide_type
.width
- 1);
965 half
= lp_build_select(&bld
, sign
, minus_half
, half
);
967 ab
= LLVMBuildAdd(builder
, ab
, half
, "");
970 ab
= lp_build_shr_imm(&bld
, ab
, n
);
979 lp_build_mul(struct lp_build_context
*bld
,
983 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
984 const struct lp_type type
= bld
->type
;
988 assert(lp_check_value(type
, a
));
989 assert(lp_check_value(type
, b
));
999 if(a
== bld
->undef
|| b
== bld
->undef
)
1002 if (!type
.floating
&& !type
.fixed
&& type
.norm
) {
1003 struct lp_type wide_type
= lp_wider_type(type
);
1004 LLVMValueRef al
, ah
, bl
, bh
, abl
, abh
, ab
;
1006 lp_build_unpack2_native(bld
->gallivm
, type
, wide_type
, a
, &al
, &ah
);
1007 lp_build_unpack2_native(bld
->gallivm
, type
, wide_type
, b
, &bl
, &bh
);
1009 /* PMULLW, PSRLW, PADDW */
1010 abl
= lp_build_mul_norm(bld
->gallivm
, wide_type
, al
, bl
);
1011 abh
= lp_build_mul_norm(bld
->gallivm
, wide_type
, ah
, bh
);
1013 ab
= lp_build_pack2_native(bld
->gallivm
, wide_type
, type
, abl
, abh
);
1019 shift
= lp_build_const_int_vec(bld
->gallivm
, type
, type
.width
/2);
1023 if(LLVMIsConstant(a
) && LLVMIsConstant(b
)) {
1025 res
= LLVMConstFMul(a
, b
);
1027 res
= LLVMConstMul(a
, b
);
1030 res
= LLVMConstAShr(res
, shift
);
1032 res
= LLVMConstLShr(res
, shift
);
1037 res
= LLVMBuildFMul(builder
, a
, b
, "");
1039 res
= LLVMBuildMul(builder
, a
, b
, "");
1042 res
= LLVMBuildAShr(builder
, res
, shift
, "");
1044 res
= LLVMBuildLShr(builder
, res
, shift
, "");
1052 * Widening mul, valid for 32x32 bit -> 64bit only.
1053 * Result is low 32bits, high bits returned in res_hi.
1055 * Emits code that is meant to be compiled for the host CPU.
1058 lp_build_mul_32_lohi_cpu(struct lp_build_context
*bld
,
1061 LLVMValueRef
*res_hi
)
1063 struct gallivm_state
*gallivm
= bld
->gallivm
;
1064 LLVMBuilderRef builder
= gallivm
->builder
;
1066 assert(bld
->type
.width
== 32);
1067 assert(bld
->type
.floating
== 0);
1068 assert(bld
->type
.fixed
== 0);
1069 assert(bld
->type
.norm
== 0);
1072 * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces
1073 * for x86 simd is atrocious (even if the high bits weren't required),
1074 * trying to handle real 64bit inputs (which of course can't happen due
1075 * to using 64bit umul with 32bit numbers zero-extended to 64bit, but
1076 * apparently llvm does not recognize this widening mul). This includes 6
1077 * (instead of 2) pmuludq plus extra adds and shifts
1078 * The same story applies to signed mul, albeit fixing this requires sse41.
1079 * https://llvm.org/bugs/show_bug.cgi?id=30845
1080 * So, whip up our own code, albeit only for length 4 and 8 (which
1081 * should be good enough)...
1082 * FIXME: For llvm >= 7.0 we should match the autoupgrade pattern
1083 * (bitcast/and/mul/shuffle for unsigned, bitcast/shl/ashr/mul/shuffle
1084 * for signed), which the fallback code does not, without this llvm
1085 * will likely still produce atrocious code.
1087 if (LLVM_VERSION_MAJOR
< 7 &&
1088 (bld
->type
.length
== 4 || bld
->type
.length
== 8) &&
1089 ((util_cpu_caps
.has_sse2
&& (bld
->type
.sign
== 0)) ||
1090 util_cpu_caps
.has_sse4_1
)) {
1091 const char *intrinsic
= NULL
;
1092 LLVMValueRef aeven
, aodd
, beven
, bodd
, muleven
, mulodd
;
1093 LLVMValueRef shuf
[LP_MAX_VECTOR_WIDTH
/ 32], shuf_vec
;
1094 struct lp_type type_wide
= lp_wider_type(bld
->type
);
1095 LLVMTypeRef wider_type
= lp_build_vec_type(gallivm
, type_wide
);
1097 for (i
= 0; i
< bld
->type
.length
; i
+= 2) {
1098 shuf
[i
] = lp_build_const_int32(gallivm
, i
+1);
1099 shuf
[i
+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm
->context
));
1101 shuf_vec
= LLVMConstVector(shuf
, bld
->type
.length
);
1104 aodd
= LLVMBuildShuffleVector(builder
, aeven
, bld
->undef
, shuf_vec
, "");
1105 bodd
= LLVMBuildShuffleVector(builder
, beven
, bld
->undef
, shuf_vec
, "");
1107 if (util_cpu_caps
.has_avx2
&& bld
->type
.length
== 8) {
1108 if (bld
->type
.sign
) {
1109 intrinsic
= "llvm.x86.avx2.pmul.dq";
1111 intrinsic
= "llvm.x86.avx2.pmulu.dq";
1113 muleven
= lp_build_intrinsic_binary(builder
, intrinsic
,
1114 wider_type
, aeven
, beven
);
1115 mulodd
= lp_build_intrinsic_binary(builder
, intrinsic
,
1116 wider_type
, aodd
, bodd
);
1119 /* for consistent naming look elsewhere... */
1120 if (bld
->type
.sign
) {
1121 intrinsic
= "llvm.x86.sse41.pmuldq";
1123 intrinsic
= "llvm.x86.sse2.pmulu.dq";
1126 * XXX If we only have AVX but not AVX2 this is a pain.
1127 * lp_build_intrinsic_binary_anylength() can't handle it
1128 * (due to src and dst type not being identical).
1130 if (bld
->type
.length
== 8) {
1131 LLVMValueRef aevenlo
, aevenhi
, bevenlo
, bevenhi
;
1132 LLVMValueRef aoddlo
, aoddhi
, boddlo
, boddhi
;
1133 LLVMValueRef muleven2
[2], mulodd2
[2];
1134 struct lp_type type_wide_half
= type_wide
;
1135 LLVMTypeRef wtype_half
;
1136 type_wide_half
.length
= 2;
1137 wtype_half
= lp_build_vec_type(gallivm
, type_wide_half
);
1138 aevenlo
= lp_build_extract_range(gallivm
, aeven
, 0, 4);
1139 aevenhi
= lp_build_extract_range(gallivm
, aeven
, 4, 4);
1140 bevenlo
= lp_build_extract_range(gallivm
, beven
, 0, 4);
1141 bevenhi
= lp_build_extract_range(gallivm
, beven
, 4, 4);
1142 aoddlo
= lp_build_extract_range(gallivm
, aodd
, 0, 4);
1143 aoddhi
= lp_build_extract_range(gallivm
, aodd
, 4, 4);
1144 boddlo
= lp_build_extract_range(gallivm
, bodd
, 0, 4);
1145 boddhi
= lp_build_extract_range(gallivm
, bodd
, 4, 4);
1146 muleven2
[0] = lp_build_intrinsic_binary(builder
, intrinsic
,
1147 wtype_half
, aevenlo
, bevenlo
);
1148 mulodd2
[0] = lp_build_intrinsic_binary(builder
, intrinsic
,
1149 wtype_half
, aoddlo
, boddlo
);
1150 muleven2
[1] = lp_build_intrinsic_binary(builder
, intrinsic
,
1151 wtype_half
, aevenhi
, bevenhi
);
1152 mulodd2
[1] = lp_build_intrinsic_binary(builder
, intrinsic
,
1153 wtype_half
, aoddhi
, boddhi
);
1154 muleven
= lp_build_concat(gallivm
, muleven2
, type_wide_half
, 2);
1155 mulodd
= lp_build_concat(gallivm
, mulodd2
, type_wide_half
, 2);
1159 muleven
= lp_build_intrinsic_binary(builder
, intrinsic
,
1160 wider_type
, aeven
, beven
);
1161 mulodd
= lp_build_intrinsic_binary(builder
, intrinsic
,
1162 wider_type
, aodd
, bodd
);
1165 muleven
= LLVMBuildBitCast(builder
, muleven
, bld
->vec_type
, "");
1166 mulodd
= LLVMBuildBitCast(builder
, mulodd
, bld
->vec_type
, "");
1168 for (i
= 0; i
< bld
->type
.length
; i
+= 2) {
1169 shuf
[i
] = lp_build_const_int32(gallivm
, i
+ 1);
1170 shuf
[i
+1] = lp_build_const_int32(gallivm
, i
+ 1 + bld
->type
.length
);
1172 shuf_vec
= LLVMConstVector(shuf
, bld
->type
.length
);
1173 *res_hi
= LLVMBuildShuffleVector(builder
, muleven
, mulodd
, shuf_vec
, "");
1175 for (i
= 0; i
< bld
->type
.length
; i
+= 2) {
1176 shuf
[i
] = lp_build_const_int32(gallivm
, i
);
1177 shuf
[i
+1] = lp_build_const_int32(gallivm
, i
+ bld
->type
.length
);
1179 shuf_vec
= LLVMConstVector(shuf
, bld
->type
.length
);
1180 return LLVMBuildShuffleVector(builder
, muleven
, mulodd
, shuf_vec
, "");
1183 return lp_build_mul_32_lohi(bld
, a
, b
, res_hi
);
1189 * Widening mul, valid for 32x32 bit -> 64bit only.
1190 * Result is low 32bits, high bits returned in res_hi.
1192 * Emits generic code.
1195 lp_build_mul_32_lohi(struct lp_build_context
*bld
,
1198 LLVMValueRef
*res_hi
)
1200 struct gallivm_state
*gallivm
= bld
->gallivm
;
1201 LLVMBuilderRef builder
= gallivm
->builder
;
1202 LLVMValueRef tmp
, shift
, res_lo
;
1203 struct lp_type type_tmp
;
1204 LLVMTypeRef wide_type
, narrow_type
;
1206 type_tmp
= bld
->type
;
1207 narrow_type
= lp_build_vec_type(gallivm
, type_tmp
);
1208 type_tmp
.width
*= 2;
1209 wide_type
= lp_build_vec_type(gallivm
, type_tmp
);
1210 shift
= lp_build_const_vec(gallivm
, type_tmp
, 32);
1212 if (bld
->type
.sign
) {
1213 a
= LLVMBuildSExt(builder
, a
, wide_type
, "");
1214 b
= LLVMBuildSExt(builder
, b
, wide_type
, "");
1216 a
= LLVMBuildZExt(builder
, a
, wide_type
, "");
1217 b
= LLVMBuildZExt(builder
, b
, wide_type
, "");
1219 tmp
= LLVMBuildMul(builder
, a
, b
, "");
1221 res_lo
= LLVMBuildTrunc(builder
, tmp
, narrow_type
, "");
1223 /* Since we truncate anyway, LShr and AShr are equivalent. */
1224 tmp
= LLVMBuildLShr(builder
, tmp
, shift
, "");
1225 *res_hi
= LLVMBuildTrunc(builder
, tmp
, narrow_type
, "");
1233 lp_build_mad(struct lp_build_context
*bld
,
1238 const struct lp_type type
= bld
->type
;
1239 if (type
.floating
) {
1240 return lp_build_fmuladd(bld
->gallivm
->builder
, a
, b
, c
);
1242 return lp_build_add(bld
, lp_build_mul(bld
, a
, b
), c
);
1248 * Small vector x scale multiplication optimization.
1251 lp_build_mul_imm(struct lp_build_context
*bld
,
1255 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1256 LLVMValueRef factor
;
1258 assert(lp_check_value(bld
->type
, a
));
1267 return lp_build_negate(bld
, a
);
1269 if(b
== 2 && bld
->type
.floating
)
1270 return lp_build_add(bld
, a
, a
);
1272 if(util_is_power_of_two_or_zero(b
)) {
1273 unsigned shift
= ffs(b
) - 1;
1275 if(bld
->type
.floating
) {
1278 * Power of two multiplication by directly manipulating the exponent.
1280 * XXX: This might not be always faster, it will introduce a small error
1281 * for multiplication by zero, and it will produce wrong results
1284 unsigned mantissa
= lp_mantissa(bld
->type
);
1285 factor
= lp_build_const_int_vec(bld
->gallivm
, bld
->type
, (unsigned long long)shift
<< mantissa
);
1286 a
= LLVMBuildBitCast(builder
, a
, lp_build_int_vec_type(bld
->type
), "");
1287 a
= LLVMBuildAdd(builder
, a
, factor
, "");
1288 a
= LLVMBuildBitCast(builder
, a
, lp_build_vec_type(bld
->gallivm
, bld
->type
), "");
1293 factor
= lp_build_const_vec(bld
->gallivm
, bld
->type
, shift
);
1294 return LLVMBuildShl(builder
, a
, factor
, "");
1298 factor
= lp_build_const_vec(bld
->gallivm
, bld
->type
, (double)b
);
1299 return lp_build_mul(bld
, a
, factor
);
1307 lp_build_div(struct lp_build_context
*bld
,
1311 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1312 const struct lp_type type
= bld
->type
;
1314 assert(lp_check_value(type
, a
));
1315 assert(lp_check_value(type
, b
));
1319 if(a
== bld
->one
&& type
.floating
)
1320 return lp_build_rcp(bld
, b
);
1325 if(a
== bld
->undef
|| b
== bld
->undef
)
1328 if(LLVMIsConstant(a
) && LLVMIsConstant(b
)) {
1330 return LLVMConstFDiv(a
, b
);
1332 return LLVMConstSDiv(a
, b
);
1334 return LLVMConstUDiv(a
, b
);
1337 /* fast rcp is disabled (just uses div), so makes no sense to try that */
1339 ((util_cpu_caps
.has_sse
&& type
.width
== 32 && type
.length
== 4) ||
1340 (util_cpu_caps
.has_avx
&& type
.width
== 32 && type
.length
== 8)) &&
1342 return lp_build_mul(bld
, a
, lp_build_rcp(bld
, b
));
1345 return LLVMBuildFDiv(builder
, a
, b
, "");
1347 return LLVMBuildSDiv(builder
, a
, b
, "");
1349 return LLVMBuildUDiv(builder
, a
, b
, "");
1354 * Linear interpolation helper.
1356 * @param normalized whether we are interpolating normalized values,
1357 * encoded in normalized integers, twice as wide.
1359 * @sa http://www.stereopsis.com/doubleblend.html
1361 static inline LLVMValueRef
1362 lp_build_lerp_simple(struct lp_build_context
*bld
,
1368 unsigned half_width
= bld
->type
.width
/2;
1369 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1373 assert(lp_check_value(bld
->type
, x
));
1374 assert(lp_check_value(bld
->type
, v0
));
1375 assert(lp_check_value(bld
->type
, v1
));
1377 delta
= lp_build_sub(bld
, v1
, v0
);
1379 if (bld
->type
.floating
) {
1381 return lp_build_mad(bld
, x
, delta
, v0
);
1384 if (flags
& LP_BLD_LERP_WIDE_NORMALIZED
) {
1385 if (!bld
->type
.sign
) {
1386 if (!(flags
& LP_BLD_LERP_PRESCALED_WEIGHTS
)) {
1388 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1389 * most-significant-bit to the lowest-significant-bit, so that
1390 * later we can just divide by 2**n instead of 2**n - 1.
1393 x
= lp_build_add(bld
, x
, lp_build_shr_imm(bld
, x
, half_width
- 1));
1396 /* (x * delta) >> n */
1397 res
= lp_build_mul(bld
, x
, delta
);
1398 res
= lp_build_shr_imm(bld
, res
, half_width
);
1401 * The rescaling trick above doesn't work for signed numbers, so
1402 * use the 2**n - 1 divison approximation in lp_build_mul_norm
1405 assert(!(flags
& LP_BLD_LERP_PRESCALED_WEIGHTS
));
1406 res
= lp_build_mul_norm(bld
->gallivm
, bld
->type
, x
, delta
);
1409 assert(!(flags
& LP_BLD_LERP_PRESCALED_WEIGHTS
));
1410 res
= lp_build_mul(bld
, x
, delta
);
1413 if ((flags
& LP_BLD_LERP_WIDE_NORMALIZED
) && !bld
->type
.sign
) {
1415 * At this point both res and v0 only use the lower half of the bits,
1416 * the rest is zero. Instead of add / mask, do add with half wide type.
1418 struct lp_type narrow_type
;
1419 struct lp_build_context narrow_bld
;
1421 memset(&narrow_type
, 0, sizeof narrow_type
);
1422 narrow_type
.sign
= bld
->type
.sign
;
1423 narrow_type
.width
= bld
->type
.width
/2;
1424 narrow_type
.length
= bld
->type
.length
*2;
1426 lp_build_context_init(&narrow_bld
, bld
->gallivm
, narrow_type
);
1427 res
= LLVMBuildBitCast(builder
, res
, narrow_bld
.vec_type
, "");
1428 v0
= LLVMBuildBitCast(builder
, v0
, narrow_bld
.vec_type
, "");
1429 res
= lp_build_add(&narrow_bld
, v0
, res
);
1430 res
= LLVMBuildBitCast(builder
, res
, bld
->vec_type
, "");
1432 res
= lp_build_add(bld
, v0
, res
);
1434 if (bld
->type
.fixed
) {
1436 * We need to mask out the high order bits when lerping 8bit
1437 * normalized colors stored on 16bits
1439 /* XXX: This step is necessary for lerping 8bit colors stored on
1440 * 16bits, but it will be wrong for true fixed point use cases.
1441 * Basically we need a more powerful lp_type, capable of further
1442 * distinguishing the values interpretation from the value storage.
1444 LLVMValueRef low_bits
;
1445 low_bits
= lp_build_const_int_vec(bld
->gallivm
, bld
->type
, (1 << half_width
) - 1);
1446 res
= LLVMBuildAnd(builder
, res
, low_bits
, "");
1455 * Linear interpolation.
1458 lp_build_lerp(struct lp_build_context
*bld
,
1464 const struct lp_type type
= bld
->type
;
1467 assert(lp_check_value(type
, x
));
1468 assert(lp_check_value(type
, v0
));
1469 assert(lp_check_value(type
, v1
));
1471 assert(!(flags
& LP_BLD_LERP_WIDE_NORMALIZED
));
1474 struct lp_type wide_type
;
1475 struct lp_build_context wide_bld
;
1476 LLVMValueRef xl
, xh
, v0l
, v0h
, v1l
, v1h
, resl
, resh
;
1478 assert(type
.length
>= 2);
1481 * Create a wider integer type, enough to hold the
1482 * intermediate result of the multiplication.
1484 memset(&wide_type
, 0, sizeof wide_type
);
1485 wide_type
.sign
= type
.sign
;
1486 wide_type
.width
= type
.width
*2;
1487 wide_type
.length
= type
.length
/2;
1489 lp_build_context_init(&wide_bld
, bld
->gallivm
, wide_type
);
1491 lp_build_unpack2_native(bld
->gallivm
, type
, wide_type
, x
, &xl
, &xh
);
1492 lp_build_unpack2_native(bld
->gallivm
, type
, wide_type
, v0
, &v0l
, &v0h
);
1493 lp_build_unpack2_native(bld
->gallivm
, type
, wide_type
, v1
, &v1l
, &v1h
);
1499 flags
|= LP_BLD_LERP_WIDE_NORMALIZED
;
1501 resl
= lp_build_lerp_simple(&wide_bld
, xl
, v0l
, v1l
, flags
);
1502 resh
= lp_build_lerp_simple(&wide_bld
, xh
, v0h
, v1h
, flags
);
1504 res
= lp_build_pack2_native(bld
->gallivm
, wide_type
, type
, resl
, resh
);
1506 res
= lp_build_lerp_simple(bld
, x
, v0
, v1
, flags
);
1514 * Bilinear interpolation.
1516 * Values indices are in v_{yx}.
1519 lp_build_lerp_2d(struct lp_build_context
*bld
,
1528 LLVMValueRef v0
= lp_build_lerp(bld
, x
, v00
, v01
, flags
);
1529 LLVMValueRef v1
= lp_build_lerp(bld
, x
, v10
, v11
, flags
);
1530 return lp_build_lerp(bld
, y
, v0
, v1
, flags
);
1535 lp_build_lerp_3d(struct lp_build_context
*bld
,
1549 LLVMValueRef v0
= lp_build_lerp_2d(bld
, x
, y
, v000
, v001
, v010
, v011
, flags
);
1550 LLVMValueRef v1
= lp_build_lerp_2d(bld
, x
, y
, v100
, v101
, v110
, v111
, flags
);
1551 return lp_build_lerp(bld
, z
, v0
, v1
, flags
);
1556 * Generate min(a, b)
1557 * Do checks for special cases but not for nans.
1560 lp_build_min(struct lp_build_context
*bld
,
1564 assert(lp_check_value(bld
->type
, a
));
1565 assert(lp_check_value(bld
->type
, b
));
1567 if(a
== bld
->undef
|| b
== bld
->undef
)
1573 if (bld
->type
.norm
) {
1574 if (!bld
->type
.sign
) {
1575 if (a
== bld
->zero
|| b
== bld
->zero
) {
1585 return lp_build_min_simple(bld
, a
, b
, GALLIVM_NAN_BEHAVIOR_UNDEFINED
);
1590 * Generate min(a, b)
1591 * NaN's are handled according to the behavior specified by the
1592 * nan_behavior argument.
1595 lp_build_min_ext(struct lp_build_context
*bld
,
1598 enum gallivm_nan_behavior nan_behavior
)
1600 assert(lp_check_value(bld
->type
, a
));
1601 assert(lp_check_value(bld
->type
, b
));
1603 if(a
== bld
->undef
|| b
== bld
->undef
)
1609 if (bld
->type
.norm
) {
1610 if (!bld
->type
.sign
) {
1611 if (a
== bld
->zero
|| b
== bld
->zero
) {
1621 return lp_build_min_simple(bld
, a
, b
, nan_behavior
);
1625 * Generate max(a, b)
1626 * Do checks for special cases, but NaN behavior is undefined.
1629 lp_build_max(struct lp_build_context
*bld
,
1633 assert(lp_check_value(bld
->type
, a
));
1634 assert(lp_check_value(bld
->type
, b
));
1636 if(a
== bld
->undef
|| b
== bld
->undef
)
1642 if(bld
->type
.norm
) {
1643 if(a
== bld
->one
|| b
== bld
->one
)
1645 if (!bld
->type
.sign
) {
1646 if (a
== bld
->zero
) {
1649 if (b
== bld
->zero
) {
1655 return lp_build_max_simple(bld
, a
, b
, GALLIVM_NAN_BEHAVIOR_UNDEFINED
);
1660 * Generate max(a, b)
1661 * Checks for special cases.
1662 * NaN's are handled according to the behavior specified by the
1663 * nan_behavior argument.
1666 lp_build_max_ext(struct lp_build_context
*bld
,
1669 enum gallivm_nan_behavior nan_behavior
)
1671 assert(lp_check_value(bld
->type
, a
));
1672 assert(lp_check_value(bld
->type
, b
));
1674 if(a
== bld
->undef
|| b
== bld
->undef
)
1680 if(bld
->type
.norm
) {
1681 if(a
== bld
->one
|| b
== bld
->one
)
1683 if (!bld
->type
.sign
) {
1684 if (a
== bld
->zero
) {
1687 if (b
== bld
->zero
) {
1693 return lp_build_max_simple(bld
, a
, b
, nan_behavior
);
1697 * Generate clamp(a, min, max)
1698 * NaN behavior (for any of a, min, max) is undefined.
1699 * Do checks for special cases.
1702 lp_build_clamp(struct lp_build_context
*bld
,
1707 assert(lp_check_value(bld
->type
, a
));
1708 assert(lp_check_value(bld
->type
, min
));
1709 assert(lp_check_value(bld
->type
, max
));
1711 a
= lp_build_min(bld
, a
, max
);
1712 a
= lp_build_max(bld
, a
, min
);
1718 * Generate clamp(a, 0, 1)
1719 * A NaN will get converted to zero.
1722 lp_build_clamp_zero_one_nanzero(struct lp_build_context
*bld
,
1725 a
= lp_build_max_ext(bld
, a
, bld
->zero
, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN
);
1726 a
= lp_build_min(bld
, a
, bld
->one
);
1735 lp_build_abs(struct lp_build_context
*bld
,
1738 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1739 const struct lp_type type
= bld
->type
;
1740 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
1742 assert(lp_check_value(type
, a
));
1749 lp_format_intrinsic(intrinsic
, sizeof intrinsic
, "llvm.fabs", vec_type
);
1750 return lp_build_intrinsic_unary(builder
, intrinsic
, vec_type
, a
);
1753 if(type
.width
*type
.length
== 128 && util_cpu_caps
.has_ssse3
&& LLVM_VERSION_MAJOR
< 6) {
1754 switch(type
.width
) {
1756 return lp_build_intrinsic_unary(builder
, "llvm.x86.ssse3.pabs.b.128", vec_type
, a
);
1758 return lp_build_intrinsic_unary(builder
, "llvm.x86.ssse3.pabs.w.128", vec_type
, a
);
1760 return lp_build_intrinsic_unary(builder
, "llvm.x86.ssse3.pabs.d.128", vec_type
, a
);
1763 else if (type
.width
*type
.length
== 256 && util_cpu_caps
.has_avx2
&& LLVM_VERSION_MAJOR
< 6) {
1764 switch(type
.width
) {
1766 return lp_build_intrinsic_unary(builder
, "llvm.x86.avx2.pabs.b", vec_type
, a
);
1768 return lp_build_intrinsic_unary(builder
, "llvm.x86.avx2.pabs.w", vec_type
, a
);
1770 return lp_build_intrinsic_unary(builder
, "llvm.x86.avx2.pabs.d", vec_type
, a
);
1774 return lp_build_select(bld
, lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, bld
->zero
),
1775 a
, LLVMBuildNeg(builder
, a
, ""));
1780 lp_build_negate(struct lp_build_context
*bld
,
1783 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1785 assert(lp_check_value(bld
->type
, a
));
1787 if (bld
->type
.floating
)
1788 a
= LLVMBuildFNeg(builder
, a
, "");
1790 a
= LLVMBuildNeg(builder
, a
, "");
1796 /** Return -1, 0 or +1 depending on the sign of a */
1798 lp_build_sgn(struct lp_build_context
*bld
,
1801 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1802 const struct lp_type type
= bld
->type
;
1806 assert(lp_check_value(type
, a
));
1808 /* Handle non-zero case */
1810 /* if not zero then sign must be positive */
1813 else if(type
.floating
) {
1814 LLVMTypeRef vec_type
;
1815 LLVMTypeRef int_type
;
1819 unsigned long long maskBit
= (unsigned long long)1 << (type
.width
- 1);
1821 int_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
1822 vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
1823 mask
= lp_build_const_int_vec(bld
->gallivm
, type
, maskBit
);
1825 /* Take the sign bit and add it to 1 constant */
1826 sign
= LLVMBuildBitCast(builder
, a
, int_type
, "");
1827 sign
= LLVMBuildAnd(builder
, sign
, mask
, "");
1828 one
= LLVMConstBitCast(bld
->one
, int_type
);
1829 res
= LLVMBuildOr(builder
, sign
, one
, "");
1830 res
= LLVMBuildBitCast(builder
, res
, vec_type
, "");
1834 /* signed int/norm/fixed point */
1835 /* could use psign with sse3 and appropriate vectors here */
1836 LLVMValueRef minus_one
= lp_build_const_vec(bld
->gallivm
, type
, -1.0);
1837 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, bld
->zero
);
1838 res
= lp_build_select(bld
, cond
, bld
->one
, minus_one
);
1842 cond
= lp_build_cmp(bld
, PIPE_FUNC_EQUAL
, a
, bld
->zero
);
1843 res
= lp_build_select(bld
, cond
, bld
->zero
, res
);
1850 * Set the sign of float vector 'a' according to 'sign'.
1851 * If sign==0, return abs(a).
1852 * If sign==1, return -abs(a);
1853 * Other values for sign produce undefined results.
1856 lp_build_set_sign(struct lp_build_context
*bld
,
1857 LLVMValueRef a
, LLVMValueRef sign
)
1859 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1860 const struct lp_type type
= bld
->type
;
1861 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
1862 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
1863 LLVMValueRef shift
= lp_build_const_int_vec(bld
->gallivm
, type
, type
.width
- 1);
1864 LLVMValueRef mask
= lp_build_const_int_vec(bld
->gallivm
, type
,
1865 ~((unsigned long long) 1 << (type
.width
- 1)));
1866 LLVMValueRef val
, res
;
1868 assert(type
.floating
);
1869 assert(lp_check_value(type
, a
));
1871 /* val = reinterpret_cast<int>(a) */
1872 val
= LLVMBuildBitCast(builder
, a
, int_vec_type
, "");
1873 /* val = val & mask */
1874 val
= LLVMBuildAnd(builder
, val
, mask
, "");
1875 /* sign = sign << shift */
1876 sign
= LLVMBuildShl(builder
, sign
, shift
, "");
1877 /* res = val | sign */
1878 res
= LLVMBuildOr(builder
, val
, sign
, "");
1879 /* res = reinterpret_cast<float>(res) */
1880 res
= LLVMBuildBitCast(builder
, res
, vec_type
, "");
1887 * Convert vector of (or scalar) int to vector of (or scalar) float.
1890 lp_build_int_to_float(struct lp_build_context
*bld
,
1893 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1894 const struct lp_type type
= bld
->type
;
1895 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
1897 assert(type
.floating
);
1899 return LLVMBuildSIToFP(builder
, a
, vec_type
, "");
1903 arch_rounding_available(const struct lp_type type
)
1905 if ((util_cpu_caps
.has_sse4_1
&&
1906 (type
.length
== 1 || type
.width
*type
.length
== 128)) ||
1907 (util_cpu_caps
.has_avx
&& type
.width
*type
.length
== 256) ||
1908 (util_cpu_caps
.has_avx512f
&& type
.width
*type
.length
== 512))
1910 else if ((util_cpu_caps
.has_altivec
&&
1911 (type
.width
== 32 && type
.length
== 4)))
1913 else if (util_cpu_caps
.has_neon
)
1919 enum lp_build_round_mode
1921 LP_BUILD_ROUND_NEAREST
= 0,
1922 LP_BUILD_ROUND_FLOOR
= 1,
1923 LP_BUILD_ROUND_CEIL
= 2,
1924 LP_BUILD_ROUND_TRUNCATE
= 3
1927 static inline LLVMValueRef
1928 lp_build_iround_nearest_sse2(struct lp_build_context
*bld
,
1931 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1932 const struct lp_type type
= bld
->type
;
1933 LLVMTypeRef i32t
= LLVMInt32TypeInContext(bld
->gallivm
->context
);
1934 LLVMTypeRef ret_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
1935 const char *intrinsic
;
1938 assert(type
.floating
);
1939 /* using the double precision conversions is a bit more complicated */
1940 assert(type
.width
== 32);
1942 assert(lp_check_value(type
, a
));
1943 assert(util_cpu_caps
.has_sse2
);
1945 /* This is relying on MXCSR rounding mode, which should always be nearest. */
1946 if (type
.length
== 1) {
1947 LLVMTypeRef vec_type
;
1950 LLVMValueRef index0
= LLVMConstInt(i32t
, 0, 0);
1952 vec_type
= LLVMVectorType(bld
->elem_type
, 4);
1954 intrinsic
= "llvm.x86.sse.cvtss2si";
1956 undef
= LLVMGetUndef(vec_type
);
1958 arg
= LLVMBuildInsertElement(builder
, undef
, a
, index0
, "");
1960 res
= lp_build_intrinsic_unary(builder
, intrinsic
,
1964 if (type
.width
* type
.length
== 128) {
1965 intrinsic
= "llvm.x86.sse2.cvtps2dq";
1968 assert(type
.width
*type
.length
== 256);
1969 assert(util_cpu_caps
.has_avx
);
1971 intrinsic
= "llvm.x86.avx.cvt.ps2dq.256";
1973 res
= lp_build_intrinsic_unary(builder
, intrinsic
,
1983 static inline LLVMValueRef
1984 lp_build_round_altivec(struct lp_build_context
*bld
,
1986 enum lp_build_round_mode mode
)
1988 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1989 const struct lp_type type
= bld
->type
;
1990 const char *intrinsic
= NULL
;
1992 assert(type
.floating
);
1994 assert(lp_check_value(type
, a
));
1995 assert(util_cpu_caps
.has_altivec
);
2000 case LP_BUILD_ROUND_NEAREST
:
2001 intrinsic
= "llvm.ppc.altivec.vrfin";
2003 case LP_BUILD_ROUND_FLOOR
:
2004 intrinsic
= "llvm.ppc.altivec.vrfim";
2006 case LP_BUILD_ROUND_CEIL
:
2007 intrinsic
= "llvm.ppc.altivec.vrfip";
2009 case LP_BUILD_ROUND_TRUNCATE
:
2010 intrinsic
= "llvm.ppc.altivec.vrfiz";
2014 return lp_build_intrinsic_unary(builder
, intrinsic
, bld
->vec_type
, a
);
2017 static inline LLVMValueRef
2018 lp_build_round_arch(struct lp_build_context
*bld
,
2020 enum lp_build_round_mode mode
)
2022 if (util_cpu_caps
.has_sse4_1
|| util_cpu_caps
.has_neon
) {
2023 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2024 const struct lp_type type
= bld
->type
;
2025 const char *intrinsic_root
;
2028 assert(type
.floating
);
2029 assert(lp_check_value(type
, a
));
2033 case LP_BUILD_ROUND_NEAREST
:
2034 intrinsic_root
= "llvm.nearbyint";
2036 case LP_BUILD_ROUND_FLOOR
:
2037 intrinsic_root
= "llvm.floor";
2039 case LP_BUILD_ROUND_CEIL
:
2040 intrinsic_root
= "llvm.ceil";
2042 case LP_BUILD_ROUND_TRUNCATE
:
2043 intrinsic_root
= "llvm.trunc";
2047 lp_format_intrinsic(intrinsic
, sizeof intrinsic
, intrinsic_root
, bld
->vec_type
);
2048 return lp_build_intrinsic_unary(builder
, intrinsic
, bld
->vec_type
, a
);
2050 else /* (util_cpu_caps.has_altivec) */
2051 return lp_build_round_altivec(bld
, a
, mode
);
2055 * Return the integer part of a float (vector) value (== round toward zero).
2056 * The returned value is a float (vector).
2057 * Ex: trunc(-1.5) = -1.0
2060 lp_build_trunc(struct lp_build_context
*bld
,
2063 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2064 const struct lp_type type
= bld
->type
;
2066 assert(type
.floating
);
2067 assert(lp_check_value(type
, a
));
2069 if (arch_rounding_available(type
)) {
2070 return lp_build_round_arch(bld
, a
, LP_BUILD_ROUND_TRUNCATE
);
2073 const struct lp_type type
= bld
->type
;
2074 struct lp_type inttype
;
2075 struct lp_build_context intbld
;
2076 LLVMValueRef cmpval
= lp_build_const_vec(bld
->gallivm
, type
, 1<<24);
2077 LLVMValueRef trunc
, res
, anosign
, mask
;
2078 LLVMTypeRef int_vec_type
= bld
->int_vec_type
;
2079 LLVMTypeRef vec_type
= bld
->vec_type
;
2081 assert(type
.width
== 32); /* might want to handle doubles at some point */
2084 inttype
.floating
= 0;
2085 lp_build_context_init(&intbld
, bld
->gallivm
, inttype
);
2087 /* round by truncation */
2088 trunc
= LLVMBuildFPToSI(builder
, a
, int_vec_type
, "");
2089 res
= LLVMBuildSIToFP(builder
, trunc
, vec_type
, "floor.trunc");
2091 /* mask out sign bit */
2092 anosign
= lp_build_abs(bld
, a
);
2094 * mask out all values if anosign > 2^24
2095 * This should work both for large ints (all rounding is no-op for them
2096 * because such floats are always exact) as well as special cases like
2097 * NaNs, Infs (taking advantage of the fact they use max exponent).
2098 * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
2100 anosign
= LLVMBuildBitCast(builder
, anosign
, int_vec_type
, "");
2101 cmpval
= LLVMBuildBitCast(builder
, cmpval
, int_vec_type
, "");
2102 mask
= lp_build_cmp(&intbld
, PIPE_FUNC_GREATER
, anosign
, cmpval
);
2103 return lp_build_select(bld
, mask
, a
, res
);

/**
 * Return float (vector) rounded to nearest integer (vector). The returned
 * value is a float (vector).
 * Ex: round(0.9) = 1.0
 * Ex: round(-1.5) = -2.0
 */
LLVMValueRef
lp_build_round(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
   }
   else {
      const struct lp_type type = bld->type;
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
      LLVMValueRef res, anosign, mask;
      LLVMTypeRef int_vec_type = bld->int_vec_type;
      LLVMTypeRef vec_type = bld->vec_type;

      assert(type.width == 32); /* might want to handle doubles at some point */

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      res = lp_build_iround(bld, a);
      res = LLVMBuildSIToFP(builder, res, vec_type, "");

      /* mask out sign bit */
      anosign = lp_build_abs(bld, a);
      /*
       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is no-op for them
       * because such floats are always exact) as well as special cases like
       * NaNs, Infs (taking advantage of the fact they use max exponent).
       * (2^24 is arbitrary, anything between 2^24 and 2^31 should work.)
       */
      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
      return lp_build_select(bld, mask, a, res);
   }
}

/**
 * Return floor of float (vector), result is a float (vector).
 * Ex: floor(1.1) = 1.0
 * Ex: floor(-1.1) = -2.0
 */
LLVMValueRef
lp_build_floor(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
   }
   else {
      const struct lp_type type = bld->type;
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
      LLVMValueRef trunc, res, anosign, mask;
      LLVMTypeRef int_vec_type = bld->int_vec_type;
      LLVMTypeRef vec_type = bld->vec_type;

      if (type.width != 32) {
         char intrinsic[32];
         lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type);
         return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
      }

      assert(type.width == 32); /* might want to handle doubles at some point */

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      /* round by truncation */
      trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");

      if (type.sign) {
         LLVMValueRef tmp;

         /*
          * fix values if rounding is wrong (for non-special cases)
          * - this is the case if trunc > a
          */
         mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
         /* tmp = trunc > a ? 1.0 : 0.0 */
         tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
         tmp = lp_build_and(&intbld, mask, tmp);
         tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
         res = lp_build_sub(bld, res, tmp);
      }

      /* mask out sign bit */
      anosign = lp_build_abs(bld, a);
      /*
       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is no-op for them
       * because such floats are always exact) as well as special cases like
       * NaNs, Infs (taking advantage of the fact they use max exponent).
       * (2^24 is arbitrary, anything between 2^24 and 2^31 should work.)
       */
      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
      return lp_build_select(bld, mask, a, res);
   }
}

/**
 * Return ceiling of float (vector), returning float (vector).
 * Ex: ceil( 1.1) = 2.0
 * Ex: ceil(-1.1) = -1.0
 */
LLVMValueRef
lp_build_ceil(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
   }
   else {
      const struct lp_type type = bld->type;
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
      LLVMValueRef trunc, res, anosign, mask, tmp;
      LLVMTypeRef int_vec_type = bld->int_vec_type;
      LLVMTypeRef vec_type = bld->vec_type;

      if (type.width != 32) {
         char intrinsic[32];
         lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type);
         return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
      }

      assert(type.width == 32); /* might want to handle doubles at some point */

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      /* round by truncation */
      trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");

      /*
       * fix values if rounding is wrong (for non-special cases)
       * - this is the case if trunc < a
       */
      mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
      /* tmp = trunc < a ? 1.0 : 0.0 */
      tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
      tmp = lp_build_and(&intbld, mask, tmp);
      tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
      res = lp_build_add(bld, trunc, tmp);

      /* mask out sign bit */
      anosign = lp_build_abs(bld, a);
      /*
       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is no-op for them
       * because such floats are always exact) as well as special cases like
       * NaNs, Infs (taking advantage of the fact they use max exponent).
       * (2^24 is arbitrary, anything between 2^24 and 2^31 should work.)
       */
      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
      return lp_build_select(bld, mask, a, res);
   }
}

/**
 * Return fractional part of 'a' computed as a - floor(a).
 * Typically used in texture coord arithmetic.
 */
LLVMValueRef
lp_build_fract(struct lp_build_context *bld,
               LLVMValueRef a)
{
   assert(bld->type.floating);
   return lp_build_sub(bld, a, lp_build_floor(bld, a));
}
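
/*
 * Worked example (illustrative, not from the original source): for
 * a = -1e-9, floor(a) = -1.0 and a - floor(a) = 0.999999999, which rounds
 * up to 1.0 in single precision -- hence the clamping variant below.
 */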

/**
 * Prevent returning 1.0 for very small negative values of 'a' by clamping
 * against 0.99999(9). (Will also return that value for NaNs.)
 */
static inline LLVMValueRef
clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
{
   LLVMValueRef max;

   /* this is the largest number smaller than 1.0 representable as float */
   max = lp_build_const_vec(bld->gallivm, bld->type,
                            1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
   return lp_build_min_ext(bld, fract, max,
                           GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
}
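
/*
 * For 32-bit floats lp_mantissa() is 23, so the clamp constant above
 * evaluates to 1.0 - 2^-24 = 0.99999994..., the largest float strictly
 * below 1.0.
 */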

/**
 * Same as lp_build_fract, but guarantees that the result is always smaller
 * than one. Will also return the smaller-than-one value for infs, NaNs.
 */
LLVMValueRef
lp_build_fract_safe(struct lp_build_context *bld,
                    LLVMValueRef a)
{
   return clamp_fract(bld, lp_build_fract(bld, a));
}

/**
 * Return the integer part of a float (vector) value (== round toward zero).
 * The returned value is an integer (vector).
 * Ex: itrunc(-1.5) = -1
 */
LLVMValueRef
lp_build_itrunc(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);

   assert(type.floating);
   assert(lp_check_value(type, a));

   return LLVMBuildFPToSI(builder, a, int_vec_type, "");
}

/**
 * Return float (vector) rounded to nearest integer (vector). The returned
 * value is an integer (vector).
 * Ex: iround(0.9) = 1
 * Ex: iround(-1.5) = -2
 */
LLVMValueRef
lp_build_iround(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if ((util_cpu_caps.has_sse2 &&
        ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
       (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
      return lp_build_iround_nearest_sse2(bld, a);
   }
   if (arch_rounding_available(type)) {
      res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
   }
   else {
      LLVMValueRef half;

      half = lp_build_const_vec(bld->gallivm, type, nextafterf(0.5, 0.0));

      if (type.sign) {
         LLVMTypeRef vec_type = bld->vec_type;
         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
                                    (unsigned long long)1 << (type.width - 1));
         LLVMValueRef sign;

         /* get sign bit */
         sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
         sign = LLVMBuildAnd(builder, sign, mask, "");

         /* sign * 0.5 */
         half = LLVMBuildBitCast(builder, half, int_vec_type, "");
         half = LLVMBuildOr(builder, sign, half, "");
         half = LLVMBuildBitCast(builder, half, vec_type, "");
      }

      res = LLVMBuildFAdd(builder, a, half, "");
   }

   res = LLVMBuildFPToSI(builder, res, int_vec_type, "");

   return res;
}
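
/*
 * Illustrative note on the fallback path above: half is
 * nextafterf(0.5, 0.0) = 0.49999997 rather than 0.5, since e.g. for
 * a = 0.49999997 the sum a + 0.5 would round up to 1.0 and iround would
 * wrongly yield 1, while with the nudged constant the sum stays below 1.0.
 * OR-ing the sign bit of 'a' into half effectively computes
 * a + copysign(half, a), so negative inputs round away from zero
 * symmetrically before the final truncating conversion.
 */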

/**
 * Return floor of float (vector), result is an int (vector).
 * Ex: ifloor(1.1) = 1
 * Ex: ifloor(-1.1) = -2
 */
LLVMValueRef
lp_build_ifloor(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   res = a;
   if (type.sign) {
      if (arch_rounding_available(type)) {
         res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
      }
      else {
         struct lp_type inttype;
         struct lp_build_context intbld;
         LLVMValueRef trunc, itrunc, mask;

         assert(type.floating);
         assert(lp_check_value(type, a));

         inttype = type;
         inttype.floating = 0;
         lp_build_context_init(&intbld, bld->gallivm, inttype);

         /* round by truncation */
         itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
         trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");

         /*
          * fix values if rounding is wrong (for non-special cases)
          * - this is the case if trunc > a
          * The results of doing this with NaNs, very large values etc.
          * are undefined but this seems to be the case anyway.
          */
         mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
         /* cheapie minus one with mask since the mask is minus one / zero */
         return lp_build_add(&intbld, itrunc, mask);
      }
   }

   /* round to nearest (toward zero) */
   res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");

   return res;
}
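
/*
 * Worked example for the mask trick above (illustrative): lp_build_cmp
 * yields all-ones (i.e. integer -1) per lane where the condition holds.
 * For a = -1.1, itrunc = -1 and trunc = -1.0 > a, so the lane mask is -1
 * and itrunc + mask = -2 = ifloor(-1.1). For positive or integral inputs
 * the mask is 0 and the truncated value passes through unchanged.
 */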

/**
 * Return ceiling of float (vector), returning int (vector).
 * Ex: iceil( 1.1) = 2
 * Ex: iceil(-1.1) = -1
 */
LLVMValueRef
lp_build_iceil(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
   }
   else {
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef trunc, itrunc, mask;

      assert(type.floating);
      assert(lp_check_value(type, a));

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      /* round by truncation */
      itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");

      /*
       * fix values if rounding is wrong (for non-special cases)
       * - this is the case if trunc < a
       * The results of doing this with NaNs, very large values etc.
       * are undefined but this seems to be the case anyway.
       */
      mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
      /* cheapie plus one with mask since the mask is minus one / zero */
      return lp_build_sub(&intbld, itrunc, mask);
   }

   /* round to nearest (toward zero) */
   res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");

   return res;
}

/**
 * Combined ifloor() & fract().
 *
 * Preferred to calling the functions separately, as it will ensure that the
 * strategy (floor() vs ifloor()) that results in less redundant work is used.
 */
void
lp_build_ifloor_fract(struct lp_build_context *bld,
                      LLVMValueRef a,
                      LLVMValueRef *out_ipart,
                      LLVMValueRef *out_fpart)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef ipart;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      /*
       * floor() is easier.
       */

      ipart = lp_build_floor(bld, a);
      *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
      *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
   }
   else {
      /*
       * ifloor() is easier.
       */

      *out_ipart = lp_build_ifloor(bld, a);
      ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
      *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
   }
}

/**
 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
 * always smaller than one.
 */
void
lp_build_ifloor_fract_safe(struct lp_build_context *bld,
                           LLVMValueRef a,
                           LLVMValueRef *out_ipart,
                           LLVMValueRef *out_fpart)
{
   lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
   *out_fpart = clamp_fract(bld, *out_fpart);
}

LLVMValueRef
lp_build_sqrt(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   char intrinsic[32];

   assert(lp_check_value(type, a));

   assert(type.floating);
   lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);

   return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
}

/**
 * Do one Newton-Raphson step to improve reciprocal precision:
 *
 *   x_{i+1} = x_i + x_i * (1 - a * x_i)
 *
 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
 * halo. It would be necessary to clamp the argument to prevent this.
 *
 * See also:
 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
 */
static inline LLVMValueRef
lp_build_rcp_refine(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef rcp_a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef neg_a;
   LLVMValueRef res;

   neg_a = LLVMBuildFNeg(builder, a, "");
   res = lp_build_fmuladd(builder, neg_a, rcp_a, bld->one);
   res = lp_build_fmuladd(builder, res, rcp_a, rcp_a);

   return res;
}
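
/*
 * Worked example (illustrative): for a = 3 and the estimate x0 = 0.3,
 * x1 = 0.3 + 0.3 * (1 - 3 * 0.3) = 0.33, and one more step gives 0.3333,
 * converging quadratically towards 1/3.
 */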

LLVMValueRef
lp_build_rcp(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   assert(type.floating);

   if(LLVMIsConstant(a))
      return LLVMConstFDiv(bld->one, a);

   /*
    * We don't use RCPPS because:
    * - it only has 10bits of precision
    * - it doesn't even get the reciprocal of 1.0 exactly
    * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
    * - for recent processors the benefit over DIVPS is marginal, and case
    *   dependent
    *
    * We could still use it on certain processors if benchmarks show that the
    * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
    * particular uses that require less workarounds.
    */

   if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
         (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
      const unsigned num_iterations = 0;
      LLVMValueRef res;
      unsigned i;
      const char *intrinsic = NULL;

      if (type.length == 4) {
         intrinsic = "llvm.x86.sse.rcp.ps";
      }
      else {
         intrinsic = "llvm.x86.avx.rcp.ps.256";
      }

      res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);

      for (i = 0; i < num_iterations; ++i) {
         res = lp_build_rcp_refine(bld, a, res);
      }

      return res;
   }

   return LLVMBuildFDiv(builder, bld->one, a, "");
}

/**
 * Do one Newton-Raphson step to improve rsqrt precision:
 *
 *   x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
 *
 * See also Intel 64 and IA-32 Architectures Optimization Manual.
 */
static inline LLVMValueRef
lp_build_rsqrt_refine(struct lp_build_context *bld,
                      LLVMValueRef a,
                      LLVMValueRef rsqrt_a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
   LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
   LLVMValueRef res;

   res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
   res = LLVMBuildFMul(builder, a, res, "");
   res = LLVMBuildFSub(builder, three, res, "");
   res = LLVMBuildFMul(builder, rsqrt_a, res, "");
   res = LLVMBuildFMul(builder, half, res, "");

   return res;
}
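
/*
 * Worked example (illustrative): for a = 4 and the estimate x0 = 0.6,
 * x1 = 0.5 * 0.6 * (3 - 4 * 0.36) = 0.468 and the next step gives ~0.497,
 * converging towards 1/sqrt(4) = 0.5.
 */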

/**
 * Generate 1/sqrt(a).
 * Result is undefined for values < 0, infinity for +0.
 */
LLVMValueRef
lp_build_rsqrt(struct lp_build_context *bld,
               LLVMValueRef a)
{
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   assert(type.floating);

   /*
    * This should be faster but all denormals will end up as infinity.
    */
   if (0 && lp_build_fast_rsqrt_available(type)) {
      const unsigned num_iterations = 1;
      LLVMValueRef res;
      unsigned i;

      /* rsqrt(1.0) != 1.0 here */
      res = lp_build_fast_rsqrt(bld, a);

      if (num_iterations) {
         /*
          * Newton-Raphson will result in NaN instead of infinity for zero,
          * and NaN instead of zero for infinity.
          * Also, need to ensure rsqrt(1.0) == 1.0.
          * All numbers smaller than FLT_MIN will result in +infinity
          * (rsqrtps treats all denormals as zero).
          */
         LLVMValueRef cmp;
         LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
         LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);

         for (i = 0; i < num_iterations; ++i) {
            res = lp_build_rsqrt_refine(bld, a, res);
         }
         cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
         res = lp_build_select(bld, cmp, inf, res);
         cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
         res = lp_build_select(bld, cmp, bld->zero, res);
         cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
         res = lp_build_select(bld, cmp, bld->one, res);
      }

      return res;
   }

   return lp_build_rcp(bld, lp_build_sqrt(bld, a));
}

/**
 * If there's a fast (inaccurate) rsqrt instruction available
 * (caller may want to avoid to call rsqrt_fast if it's not available,
 * i.e. for calculating x^0.5 it may do rsqrt_fast(x) * x but if
 * unavailable it would result in sqrt/div/mul so obviously
 * much better to just call sqrt, skipping both div and mul).
 */
boolean
lp_build_fast_rsqrt_available(struct lp_type type)
{
   assert(type.floating);

   if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
       (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
      return TRUE;
   }
   return FALSE;
}
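
/*
 * Sketch of the caller pattern described above (illustrative, not code from
 * this file): when the fast instruction is available, sqrt(x) for x > 0 can
 * be had as lp_build_fast_rsqrt(bld, x) * x, since x / sqrt(x) == sqrt(x).
 * Without it that expression would expand to a full sqrt plus div plus mul,
 * so calling lp_build_sqrt() directly is cheaper.
 */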

/**
 * Generate 1/sqrt(a).
 * Result is undefined for values < 0, infinity for +0.
 * Precision is limited, only ~10 bits guaranteed
 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
 */
LLVMValueRef
lp_build_fast_rsqrt(struct lp_build_context *bld,
                    LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   if (lp_build_fast_rsqrt_available(type)) {
      const char *intrinsic = NULL;

      if (type.length == 4) {
         intrinsic = "llvm.x86.sse.rsqrt.ps";
      }
      else {
         intrinsic = "llvm.x86.avx.rsqrt.ps.256";
      }
      return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
   }
   else {
      debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
   }
   return lp_build_rcp(bld, lp_build_sqrt(bld, a));
}

/**
 * Generate sin(a) or cos(a) using polynomial approximation.
 * TODO: it might be worth recognizing sin and cos using same source
 * (i.e. d3d10 sincos opcode). Obviously doing both at the same time
 * would be way cheaper than calculating (nearly) everything twice...
 * Not sure it's common enough to be worth bothering however, scs
 * opcode could also benefit from calculating both though.
 */
static LLVMValueRef
lp_build_sin_or_cos(struct lp_build_context *bld,
                    LLVMValueRef a,
                    boolean cos)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef b = gallivm->builder;
   struct lp_type int_type = lp_int_type(bld->type);

   /*
    *  take the absolute value,
    *  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
    */

   LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");

   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");

   /*
    * scale by 4/Pi
    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
    */

   LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");

   /*
    * store the integer part of y in mm0
    * emm2 = _mm_cvttps_epi32(y);
    */

   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");

   /*
    * j=(j+1) & (~1) (see the cephes sources)
    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
    */

   LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
   LLVMValueRef emm2_add =  LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
   /*
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
    */
   LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
   LLVMValueRef emm2_and =  LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");

   /*
    * y = _mm_cvtepi32_ps(emm2);
    */
   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");

   LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
   LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
   LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
   LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);

   /*
    * Argument used for poly selection and sign bit determination
    * is different for sin vs. cos.
    */
   LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
                               emm2_and;

   LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
                                              LLVMBuildNot(b, emm2_2, ""), ""),
                                              const_29, "sign_bit") :
                                 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
                                              LLVMBuildShl(b, emm2_add,
                                                           const_29, ""), ""),
                                              sign_mask, "sign_bit");

   /*
    * get the polynomial selection mask
    * there is one polynomial for 0 <= x <= Pi/4
    * and another one for Pi/4 < x <= Pi/2
    * Both branches will be computed.
    *
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
    */

   LLVMValueRef emm2_3 =  LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
   LLVMValueRef poly_mask = lp_build_compare(gallivm,
                                             int_type, PIPE_FUNC_EQUAL,
                                             emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));

   /*
    * _PS_CONST(minus_cephes_DP1, -0.78515625);
    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
    */
   LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
   LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
   LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);

   /*
    * The magic pass: "Extended precision modular arithmetic"
    * x = ((x - y * DP1) - y * DP2) - y * DP3;
    */
   LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
   LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
   LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);

   /*
    * Evaluate the first polynomial  (0 <= x <= Pi/4)
    *
    * z = _mm_mul_ps(x,x);
    */
   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");

   /*
    * _PS_CONST(coscof_p0,  2.443315711809948E-005);
    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
    * _PS_CONST(coscof_p2,  4.166664568298827E-002);
    */
   LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
   LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
   LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);

   /*
    * y = *(v4sf*)_ps_coscof_p0;
    * y = _mm_mul_ps(y, z);
    */
   LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
   LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");

   /*
    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
    * y = _mm_sub_ps(y, tmp);
    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
    */
   LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
   LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");

   /*
    * _PS_CONST(sincof_p0, -1.9515295891E-4);
    * _PS_CONST(sincof_p1,  8.3321608736E-3);
    * _PS_CONST(sincof_p2, -1.6666654611E-1);
    */
   LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
   LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
   LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);

   /*
    * Evaluate the second polynomial  (Pi/4 <= x <= Pi/2)
    *
    * y2 = *(v4sf*)_ps_sincof_p0;
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_mul_ps(y2, x);
    * y2 = _mm_add_ps(y2, x);
    */

   LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
   LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
   LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);

   /*
    * select the correct result from the two polynomials
    *
    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
    * y = _mm_andnot_ps(xmm3, y);
    * y = _mm_or_ps(y,y2);
    */
   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
   LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
   LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");

   /*
    * update the sign
    * y = _mm_xor_ps(y, sign_bit);
    */
   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");

   LLVMValueRef isfinite = lp_build_isfinite(bld, a);

   /* clamp output to be within [-1, 1] */
   y_result = lp_build_clamp(bld, y_result,
                             lp_build_const_vec(bld->gallivm, bld->type, -1.f),
                             lp_build_const_vec(bld->gallivm, bld->type, 1.f));
   /* If a is -inf, inf or NaN then return NaN */
   y_result = lp_build_select(bld, isfinite, y_result,
                              lp_build_const_vec(bld->gallivm, bld->type, NAN));
   return y_result;
}
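
/*
 * Worked example of the reduction above (illustrative): for a = 3.0,
 * a * 4/Pi ~= 3.82, so j = 3 and (j + 1) & ~1 = 4. The reduced argument
 * becomes 3.0 - 4 * Pi/4 ~= -0.1416, the quadrant bits select the sine
 * polynomial, and the sign logic flips the result, giving
 * -sin(-0.1416) ~= 0.1411 == sin(3.0).
 */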

/**
 * Generate sin(a)
 */
LLVMValueRef
lp_build_sin(struct lp_build_context *bld,
             LLVMValueRef a)
{
   return lp_build_sin_or_cos(bld, a, FALSE);
}


/**
 * Generate cos(a)
 */
LLVMValueRef
lp_build_cos(struct lp_build_context *bld,
             LLVMValueRef a)
{
   return lp_build_sin_or_cos(bld, a, TRUE);
}

/**
 * Generate pow(x, y)
 */
LLVMValueRef
lp_build_pow(struct lp_build_context *bld,
             LLVMValueRef x,
             LLVMValueRef y)
{
   /* TODO: optimize the constant case */
   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
       LLVMIsConstant(x) && LLVMIsConstant(y)) {
      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                   __FUNCTION__);
   }

   return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
}
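
/*
 * Identity used above: pow(x, y) = 2^(y * log2(x)), e.g.
 * pow(2.0, 10.0) = exp2(10 * log2(2)) = exp2(10) = 1024. This is only
 * meaningful for x > 0, since lp_build_log2 is undefined for non-positive
 * inputs.
 */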

LLVMValueRef
lp_build_exp(struct lp_build_context *bld,
             LLVMValueRef x)
{
   /* log2(e) = 1/log(2) */
   LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
                                           1.4426950408889634);

   assert(lp_check_value(bld->type, x));

   return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
}

/**
 * Generate log(x).
 * Behavior is undefined with infs, 0s and nans
 */
LLVMValueRef
lp_build_log(struct lp_build_context *bld,
             LLVMValueRef x)
{
   /* log(2) */
   LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
                                          0.69314718055994529);

   assert(lp_check_value(bld->type, x));

   return lp_build_mul(bld, log2, lp_build_log2(bld, x));
}

/**
 * Generate log(x) that handles edge cases (infs, 0s and nans)
 */
LLVMValueRef
lp_build_log_safe(struct lp_build_context *bld,
                  LLVMValueRef x)
{
   /* log(2) */
   LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
                                          0.69314718055994529);

   assert(lp_check_value(bld->type, x));

   return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
}

/**
 * Generate polynomial.
 * Ex:  coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
 */
static LLVMValueRef
lp_build_polynomial(struct lp_build_context *bld,
                    LLVMValueRef x,
                    const double *coeffs,
                    unsigned num_coeffs)
{
   const struct lp_type type = bld->type;
   LLVMValueRef even = NULL, odd = NULL;
   LLVMValueRef x2;
   unsigned i;

   assert(lp_check_value(bld->type, x));

   /* TODO: optimize the constant case */
   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
       LLVMIsConstant(x)) {
      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                   __FUNCTION__);
   }

   /*
    * Calculate odd and even terms separately to decrease data dependency
    * Ex:
    *     c[0] + x^2 * c[2] + x^4 * c[4] ...
    *     + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
    */
   x2 = lp_build_mul(bld, x, x);

   for (i = num_coeffs; i--; ) {
      LLVMValueRef coeff;

      coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);

      if (i % 2 == 0) {
         if (even)
            even = lp_build_mad(bld, x2, even, coeff);
         else
            even = coeff;
      } else {
         if (odd)
            odd = lp_build_mad(bld, x2, odd, coeff);
         else
            odd = coeff;
      }
   }

   if (odd)
      return lp_build_mad(bld, odd, x, even);
   else
      return even;
}
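
/*
 * Worked example of the even/odd split above (illustrative): for
 * c0 + c1*x + c2*x^2 + c3*x^3 the loop builds even = c0 + c2*x2 and
 * odd = c1 + c3*x2 with x2 = x*x, and the final mad computes
 * odd*x + even, so the two Horner chains can issue independently.
 */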

/**
 * Minimax polynomial fit of 2**x, in range [0, 1[
 */
const double lp_build_exp2_polynomial[] = {
#if EXP_POLY_DEGREE == 5
   1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
   0.693153073200168932794,
   0.240153617044375388211,
   0.0558263180532956664775,
   0.00898934009049466391101,
   0.00187757667519147912699
#elif EXP_POLY_DEGREE == 4
   1.00000259337069434683,
   0.693003834469974940458,
   0.24144275689150793076,
   0.0520114606103070150235,
   0.0135341679161270268764
#elif EXP_POLY_DEGREE == 3
   0.999925218562710312959,
   0.695833540494823811697,
   0.226067155427249155588,
   0.0780245226406372992967
#elif EXP_POLY_DEGREE == 2
   1.00172476321474503578,
   0.657636275736077639316,
   0.33718943461968720704
#else
#error
#endif
};

LLVMValueRef
lp_build_exp2(struct lp_build_context *bld,
              LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMValueRef ipart = NULL;
   LLVMValueRef fpart = NULL;
   LLVMValueRef expipart = NULL;
   LLVMValueRef expfpart = NULL;
   LLVMValueRef res = NULL;

   assert(lp_check_value(bld->type, x));

   /* TODO: optimize the constant case */
   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
       LLVMIsConstant(x)) {
      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                   __FUNCTION__);
   }

   assert(type.floating && type.width == 32);

   /* We want to preserve NaN and make sure that for exp2 if x > 128,
    * the result is INF and if it's smaller than -126.9 the result is 0 */
   x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
                        GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
   x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
                        x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);

   /* ipart = floor(x) */
   /* fpart = x - ipart */
   lp_build_ifloor_fract(bld, x, &ipart, &fpart);

   /* expipart = (float) (1 << ipart) */
   expipart = LLVMBuildAdd(builder, ipart,
                           lp_build_const_int_vec(bld->gallivm, type, 127), "");
   expipart = LLVMBuildShl(builder, expipart,
                           lp_build_const_int_vec(bld->gallivm, type, 23), "");
   expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");

   expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
                                  ARRAY_SIZE(lp_build_exp2_polynomial));

   res = LLVMBuildFMul(builder, expipart, expfpart, "");

   return res;
}
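
/*
 * Illustrative note on the bit trick above: for 32-bit floats the biased
 * exponent lives in bits 23..30, so (ipart + 127) << 23 reinterpreted as a
 * float is exactly 2^ipart (e.g. ipart = 3 gives 130 << 23 == 0x41000000
 * == 8.0f). The polynomial then supplies 2^fpart for fpart in [0, 1).
 */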

/**
 * Extract the exponent of a IEEE-754 floating point value.
 *
 * Optionally apply an integer bias.
 *
 * Result is an integer value with
 *
 *   ifloor(log2(x)) + bias
 */
LLVMValueRef
lp_build_extract_exponent(struct lp_build_context *bld,
                          LLVMValueRef x,
                          int bias)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   unsigned mantissa = lp_mantissa(type);
   LLVMValueRef res;

   assert(type.floating);

   assert(lp_check_value(bld->type, x));

   x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");

   res = LLVMBuildLShr(builder, x,
                       lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
   res = LLVMBuildAnd(builder, res,
                      lp_build_const_int_vec(bld->gallivm, type, 255), "");
   res = LLVMBuildSub(builder, res,
                      lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");

   return res;
}
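
/*
 * Worked example (illustrative): x = 8.0f has bit pattern 0x41000000, so
 * (x >> 23) & 255 = 130 and 130 - 127 = 3 = ifloor(log2(8.0)). A bias of
 * -1 would yield 2 instead.
 */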

/**
 * Extract the mantissa of a IEEE-754 floating point value.
 *
 * Result is a floating point value with
 *
 *   x / 2**floor(log2(x))
 */
LLVMValueRef
lp_build_extract_mantissa(struct lp_build_context *bld,
                          LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   unsigned mantissa = lp_mantissa(type);
   LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
                                                  (1ULL << mantissa) - 1);
   LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
   LLVMValueRef res;

   assert(lp_check_value(bld->type, x));

   assert(type.floating);

   x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");

   /* res = x / 2**ipart */
   res = LLVMBuildAnd(builder, x, mantmask, "");
   res = LLVMBuildOr(builder, res, one, "");
   res = LLVMBuildBitCast(builder, res, bld->vec_type, "");

   return res;
}
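
/*
 * Worked example (illustrative): x = 12.0f = 1.5 * 2^3. Masking off the
 * exponent bits and OR-ing in the exponent of 1.0 yields 1.5f, i.e. the
 * mantissa normalized into [1, 2).
 */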

/**
 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in range of [0, 1/9[
 * These coefficients can be generated with
 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
 */
const double lp_build_log2_polynomial[] = {
#if LOG_POLY_DEGREE == 5
   2.88539008148777786488L,
   0.961796878841293367824L,
   0.577058946784739859012L,
   0.412914355135828735411L,
   0.308591899232910175289L,
   0.352376952300281371868L,
#elif LOG_POLY_DEGREE == 4
   2.88539009343309178325L,
   0.961791550404184197881L,
   0.577440339438736392009L,
   0.403343858251329912514L,
   0.406718052498846252698L,
#elif LOG_POLY_DEGREE == 3
   2.88538959748872753838L,
   0.961932915889597772928L,
   0.571118517972136195241L,
   0.493997535084709500285L,
#else
#error
#endif
};

/**
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 * http://en.wikipedia.org/wiki/Logarithm#Calculation
 * http://www.nezumi.demon.co.uk/consult/logx.htm
 *
 * If handle_edge_cases is true the function will perform computations
 * to match the required D3D10+ behavior for each of the edge cases.
 * That means that if input is:
 * - less than zero (to and including -inf) then NaN will be returned
 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
 * - +infinity, then +infinity will be returned
 * - NaN, then NaN will be returned
 *
 * Those checks are fairly expensive so if you don't need them make sure
 * handle_edge_cases is false.
 */
void
lp_build_log2_approx(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef *p_exp,
                     LLVMValueRef *p_floor_log2,
                     LLVMValueRef *p_log2,
                     boolean handle_edge_cases)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);

   LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
   LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
   LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);

   LLVMValueRef i = NULL;
   LLVMValueRef y = NULL;
   LLVMValueRef z = NULL;
   LLVMValueRef exp = NULL;
   LLVMValueRef mant = NULL;
   LLVMValueRef logexp = NULL;
   LLVMValueRef p_z = NULL;
   LLVMValueRef res = NULL;

   assert(lp_check_value(bld->type, x));

   if(p_exp || p_floor_log2 || p_log2) {
      /* TODO: optimize the constant case */
      if (gallivm_debug & GALLIVM_DEBUG_PERF &&
          LLVMIsConstant(x)) {
         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                      __FUNCTION__);
      }

      assert(type.floating && type.width == 32);

      /*
       * We don't explicitly handle denormalized numbers. They will yield a
       * result in the neighbourhood of -127, which appears to be adequate
       * enough.
       */

      i = LLVMBuildBitCast(builder, x, int_vec_type, "");

      /* exp = (float) exponent(x) */
      exp = LLVMBuildAnd(builder, i, expmask, "");
   }

   if(p_floor_log2 || p_log2) {
      logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
      logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
      logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
   }

   if (p_log2) {
      /* mant = 1 + (float) mantissa(x) */
      mant = LLVMBuildAnd(builder, i, mantmask, "");
      mant = LLVMBuildOr(builder, mant, one, "");
      mant = LLVMBuildBitCast(builder, mant, vec_type, "");

      /* y = (mant - 1) / (mant + 1) */
      y = lp_build_div(bld,
                       lp_build_sub(bld, mant, bld->one),
                       lp_build_add(bld, mant, bld->one));

      /* z = y^2 */
      z = lp_build_mul(bld, y, y);

      /* compute P(z) */
      p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
                                ARRAY_SIZE(lp_build_log2_polynomial));

      /* y * P(z) + logexp */
      res = lp_build_mad(bld, y, p_z, logexp);

      if (type.floating && handle_edge_cases) {
         LLVMValueRef negmask, infmask, zmask;
         negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
                                lp_build_const_vec(bld->gallivm, type, 0.0f));
         zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
                              lp_build_const_vec(bld->gallivm, type, 0.0f));
         infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
                                lp_build_const_vec(bld->gallivm, type, INFINITY));

         /* If x is equal to inf make sure we return inf */
         res = lp_build_select(bld, infmask,
                               lp_build_const_vec(bld->gallivm, type, INFINITY),
                               res);
         /* If x is equal to 0, return -inf */
         res = lp_build_select(bld, zmask,
                               lp_build_const_vec(bld->gallivm, type, -INFINITY),
                               res);
         /* If x is NaN or less than 0, return NaN */
         res = lp_build_select(bld, negmask,
                               lp_build_const_vec(bld->gallivm, type, NAN),
                               res);
      }
   }

   if (p_exp) {
      exp = LLVMBuildBitCast(builder, exp, vec_type, "");
      *p_exp = exp;
   }

   if (p_floor_log2)
      *p_floor_log2 = logexp;

   if (p_log2)
      *p_log2 = res;
}
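
/*
 * Illustrative note on the series used above: ln(m) = 2 * atanh(y) with
 * y = (m - 1) / (m + 1), so log2(m) = y * P(y^2) for a suitable minimax
 * polynomial P. E.g. for m = 1.5, y = 0.2 and y * P(y^2) ~= 0.585 ==
 * log2(1.5), to which the float exponent is then added.
 */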

/**
 * log2 implementation which doesn't have special code to
 * handle edge cases (-inf, 0, inf, NaN). It's faster but
 * the results for those cases are undefined.
 */
LLVMValueRef
lp_build_log2(struct lp_build_context *bld,
              LLVMValueRef x)
{
   LLVMValueRef res;
   lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
   return res;
}

/**
 * Version of log2 which handles all edge cases.
 * Look at documentation of lp_build_log2_approx for
 * description of the behavior for each of the edge cases.
 */
LLVMValueRef
lp_build_log2_safe(struct lp_build_context *bld,
                   LLVMValueRef x)
{
   LLVMValueRef res;
   lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
   return res;
}

/**
 * Faster (and less accurate) log2.
 *
 *    log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
 *
 * Piece-wise linear approximation, with exact results when x is a power of
 * two.
 *
 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
 */
LLVMValueRef
lp_build_fast_log2(struct lp_build_context *bld,
                   LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef ipart;
   LLVMValueRef fpart;

   assert(lp_check_value(bld->type, x));

   assert(bld->type.floating);

   /* ipart = floor(log2(x)) - 1 */
   ipart = lp_build_extract_exponent(bld, x, -1);
   ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");

   /* fpart = x / 2**ipart */
   fpart = lp_build_extract_mantissa(bld, x);

   /* ipart + fpart */
   return LLVMBuildFAdd(builder, ipart, fpart, "");
}
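
/*
 * Worked example (illustrative): x = 12.0 gives
 * ipart = floor(log2(12)) - 1 = 2 and fpart = 1.5, so the result is 3.5
 * versus the exact log2(12) ~= 3.585; for powers of two
 * (e.g. x = 8.0: 2 + 1.0 = 3.0) the result is exact.
 */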

/**
 * Fast implementation of iround(log2(x)).
 *
 * Not an approximation -- it should give accurate results all the time.
 */
LLVMValueRef
lp_build_ilog2(struct lp_build_context *bld,
               LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
   LLVMValueRef ipart;

   assert(bld->type.floating);

   assert(lp_check_value(bld->type, x));

   /* x * 2^(0.5)   i.e., add 0.5 to the log2(x) */
   x = LLVMBuildFMul(builder, x, sqrt2, "");

   /* ipart = floor(log2(x) + 0.5)  */
   ipart = lp_build_extract_exponent(bld, x, 0);

   return ipart;
}
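
/*
 * Illustrative note: multiplying by sqrt(2) shifts the exponent-extraction
 * thresholds to the geometric midpoints between powers of two. E.g. x = 5.0
 * (log2 ~= 2.32) becomes 7.07, whose exponent is 2, while x = 6.0
 * (log2 ~= 2.58) becomes 8.49, whose exponent is 3 -- matching
 * iround(log2(x)) in both cases.
 */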

LLVMValueRef
lp_build_mod(struct lp_build_context *bld,
             LLVMValueRef x,
             LLVMValueRef y)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef res;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, x));
   assert(lp_check_value(type, y));

   if (type.floating)
      res = LLVMBuildFRem(builder, x, y, "");
   else if (type.sign)
      res = LLVMBuildSRem(builder, x, y, "");
   else
      res = LLVMBuildURem(builder, x, y, "");
   return res;
}

/*
 * For floating inputs it creates and returns a mask
 * which is all 1's for channels which are NaN.
 * Channels inside x which are not NaN will be 0.
 */
LLVMValueRef
lp_build_isnan(struct lp_build_context *bld,
               LLVMValueRef x)
{
   LLVMValueRef mask;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);

   assert(bld->type.floating);
   assert(lp_check_value(bld->type, x));

   mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
                        "isnotnan");
   mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
   mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
   return mask;
}
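
/*
 * Note (illustrative): the ordered self-comparison x == x is false exactly
 * for NaN lanes, so negating it and sign-extending the i1 result produces
 * the all-ones / all-zeros integer mask described above.
 */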

/* Returns all 1's for floating point numbers that are
 * finite numbers and returns all zeros for -inf,
 * inf and NaNs */
LLVMValueRef
lp_build_isfinite(struct lp_build_context *bld,
                  LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
   struct lp_type int_type = lp_int_type(bld->type);
   LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
   LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
                                                    0x7f800000);

   if (!bld->type.floating) {
      return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
   }
   assert(bld->type.floating);
   assert(lp_check_value(bld->type, x));
   assert(bld->type.width == 32);

   intx = LLVMBuildAnd(builder, intx, infornan32, "");
   return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
                           intx, infornan32);
}
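
/*
 * Illustrative note: 0x7f800000 is the exponent field of a 32-bit float;
 * a value is non-finite (inf or NaN) exactly when all of its exponent bits
 * are set, so comparing the masked bits for inequality with the mask yields
 * the finite-lanes mask.
 */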

/**
 * Returns true if the number is nan or inf and false otherwise.
 * The input has to be a floating point vector.
 */
LLVMValueRef
lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
                       const struct lp_type type,
                       LLVMValueRef x)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type int_type = lp_int_type(type);
   LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
                                                0x7f800000);
   LLVMValueRef ret;

   assert(type.floating);

   ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
   ret = LLVMBuildAnd(builder, ret, const0, "");
   ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
                          ret, const0);

   return ret;
}

LLVMValueRef
lp_build_fpstate_get(struct gallivm_state *gallivm)
{
   if (util_cpu_caps.has_sse) {
      LLVMBuilderRef builder = gallivm->builder;
      LLVMValueRef mxcsr_ptr = lp_build_alloca(
         gallivm,
         LLVMInt32TypeInContext(gallivm->context),
         "mxcsr_ptr");
      LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
          LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
      lp_build_intrinsic(builder,
                         "llvm.x86.sse.stmxcsr",
                         LLVMVoidTypeInContext(gallivm->context),
                         &mxcsr_ptr8, 1, 0);
      return mxcsr_ptr;
   }
   return 0;
}

void
lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
                                  boolean zero)
{
   if (util_cpu_caps.has_sse) {
      /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
      int daz_ftz = _MM_FLUSH_ZERO_MASK;

      LLVMBuilderRef builder = gallivm->builder;
      LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
      LLVMValueRef mxcsr =
         LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");

      if (util_cpu_caps.has_daz) {
         /* Enable denormals are zero mode */
         daz_ftz |= _MM_DENORMALS_ZERO_MASK;
      }
      if (zero) {
         mxcsr = LLVMBuildOr(builder, mxcsr,
                             LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
      } else {
         mxcsr = LLVMBuildAnd(builder, mxcsr,
                              LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
      }

      LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
      lp_build_fpstate_set(gallivm, mxcsr_ptr);
   }
}

void
lp_build_fpstate_set(struct gallivm_state *gallivm,
                     LLVMValueRef mxcsr_ptr)
{
   if (util_cpu_caps.has_sse) {
      LLVMBuilderRef builder = gallivm->builder;

      mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
                     LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
      lp_build_intrinsic(builder,
                         "llvm.x86.sse.ldmxcsr",
                         LLVMVoidTypeInContext(gallivm->context),
                         &mxcsr_ptr, 1, 0);
   }
}