1 /**************************************************************************
3 * Copyright 2009-2010 VMware, Inc.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **************************************************************************/
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
38 * We also do simple expressions simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
44 * @author Jose Fonseca <jfonseca@vmware.com>
50 #include "util/u_memory.h"
51 #include "util/u_debug.h"
52 #include "util/u_math.h"
53 #include "util/u_cpu_detect.h"
55 #include "lp_bld_type.h"
56 #include "lp_bld_const.h"
57 #include "lp_bld_init.h"
58 #include "lp_bld_intr.h"
59 #include "lp_bld_logic.h"
60 #include "lp_bld_pack.h"
61 #include "lp_bld_debug.h"
62 #include "lp_bld_bitarit.h"
63 #include "lp_bld_arit.h"
64 #include "lp_bld_flow.h"
66 #if defined(PIPE_ARCH_SSE)
67 #include <xmmintrin.h>
70 #ifndef _MM_DENORMALS_ZERO_MASK
71 #define _MM_DENORMALS_ZERO_MASK 0x0040
74 #ifndef _MM_FLUSH_ZERO_MASK
75 #define _MM_FLUSH_ZERO_MASK 0x8000
78 #define EXP_POLY_DEGREE 5
80 #define LOG_POLY_DEGREE 4
85 * No checks for special case values of a or b = 1 or 0 are done.
86 * NaN's are handled according to the behavior specified by the
87 * nan_behavior argument.
90 lp_build_min_simple(struct lp_build_context
*bld
,
93 enum gallivm_nan_behavior nan_behavior
)
95 const struct lp_type type
= bld
->type
;
96 const char *intrinsic
= NULL
;
97 unsigned intr_size
= 0;
100 assert(lp_check_value(type
, a
));
101 assert(lp_check_value(type
, b
));
103 /* TODO: optimize the constant case */
105 if (type
.floating
&& util_cpu_caps
.has_sse
) {
106 if (type
.width
== 32) {
107 if (type
.length
== 1) {
108 intrinsic
= "llvm.x86.sse.min.ss";
111 else if (type
.length
<= 4 || !util_cpu_caps
.has_avx
) {
112 intrinsic
= "llvm.x86.sse.min.ps";
116 intrinsic
= "llvm.x86.avx.min.ps.256";
120 if (type
.width
== 64 && util_cpu_caps
.has_sse2
) {
121 if (type
.length
== 1) {
122 intrinsic
= "llvm.x86.sse2.min.sd";
125 else if (type
.length
== 2 || !util_cpu_caps
.has_avx
) {
126 intrinsic
= "llvm.x86.sse2.min.pd";
130 intrinsic
= "llvm.x86.avx.min.pd.256";
135 else if (type
.floating
&& util_cpu_caps
.has_altivec
) {
136 if (nan_behavior
== GALLIVM_NAN_RETURN_NAN
||
137 nan_behavior
== GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN
) {
138 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
141 if (type
.width
== 32 && type
.length
== 4) {
142 intrinsic
= "llvm.ppc.altivec.vminfp";
145 } else if (HAVE_LLVM
< 0x0309 &&
146 util_cpu_caps
.has_avx2
&& type
.length
> 4) {
148 switch (type
.width
) {
150 intrinsic
= type
.sign
? "llvm.x86.avx2.pmins.b" : "llvm.x86.avx2.pminu.b";
153 intrinsic
= type
.sign
? "llvm.x86.avx2.pmins.w" : "llvm.x86.avx2.pminu.w";
156 intrinsic
= type
.sign
? "llvm.x86.avx2.pmins.d" : "llvm.x86.avx2.pminu.d";
159 } else if (HAVE_LLVM
< 0x0309 &&
160 util_cpu_caps
.has_sse2
&& type
.length
>= 2) {
162 if ((type
.width
== 8 || type
.width
== 16) &&
163 (type
.width
* type
.length
<= 64) &&
164 (gallivm_debug
& GALLIVM_DEBUG_PERF
)) {
165 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
168 if (type
.width
== 8 && !type
.sign
) {
169 intrinsic
= "llvm.x86.sse2.pminu.b";
171 else if (type
.width
== 16 && type
.sign
) {
172 intrinsic
= "llvm.x86.sse2.pmins.w";
174 if (util_cpu_caps
.has_sse4_1
) {
175 if (type
.width
== 8 && type
.sign
) {
176 intrinsic
= "llvm.x86.sse41.pminsb";
178 if (type
.width
== 16 && !type
.sign
) {
179 intrinsic
= "llvm.x86.sse41.pminuw";
181 if (type
.width
== 32 && !type
.sign
) {
182 intrinsic
= "llvm.x86.sse41.pminud";
184 if (type
.width
== 32 && type
.sign
) {
185 intrinsic
= "llvm.x86.sse41.pminsd";
188 } else if (util_cpu_caps
.has_altivec
) {
190 if (type
.width
== 8) {
192 intrinsic
= "llvm.ppc.altivec.vminub";
194 intrinsic
= "llvm.ppc.altivec.vminsb";
196 } else if (type
.width
== 16) {
198 intrinsic
= "llvm.ppc.altivec.vminuh";
200 intrinsic
= "llvm.ppc.altivec.vminsh";
202 } else if (type
.width
== 32) {
204 intrinsic
= "llvm.ppc.altivec.vminuw";
206 intrinsic
= "llvm.ppc.altivec.vminsw";
212 /* We need to handle nan's for floating point numbers. If one of the
213 * inputs is nan the other should be returned (required by both D3D10+
215 * The sse intrinsics return the second operator in case of nan by
216 * default so we need to special code to handle those.
218 if (util_cpu_caps
.has_sse
&& type
.floating
&&
219 nan_behavior
!= GALLIVM_NAN_BEHAVIOR_UNDEFINED
&&
220 nan_behavior
!= GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN
&&
221 nan_behavior
!= GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN
) {
222 LLVMValueRef isnan
, min
;
223 min
= lp_build_intrinsic_binary_anylength(bld
->gallivm
, intrinsic
,
226 if (nan_behavior
== GALLIVM_NAN_RETURN_OTHER
) {
227 isnan
= lp_build_isnan(bld
, b
);
228 return lp_build_select(bld
, isnan
, a
, min
);
230 assert(nan_behavior
== GALLIVM_NAN_RETURN_NAN
);
231 isnan
= lp_build_isnan(bld
, a
);
232 return lp_build_select(bld
, isnan
, a
, min
);
235 return lp_build_intrinsic_binary_anylength(bld
->gallivm
, intrinsic
,
242 switch (nan_behavior
) {
243 case GALLIVM_NAN_RETURN_NAN
: {
244 LLVMValueRef isnan
= lp_build_isnan(bld
, b
);
245 cond
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, a
, b
);
246 cond
= LLVMBuildXor(bld
->gallivm
->builder
, cond
, isnan
, "");
247 return lp_build_select(bld
, cond
, a
, b
);
250 case GALLIVM_NAN_RETURN_OTHER
: {
251 LLVMValueRef isnan
= lp_build_isnan(bld
, a
);
252 cond
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, a
, b
);
253 cond
= LLVMBuildXor(bld
->gallivm
->builder
, cond
, isnan
, "");
254 return lp_build_select(bld
, cond
, a
, b
);
257 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN
:
258 cond
= lp_build_cmp_ordered(bld
, PIPE_FUNC_LESS
, a
, b
);
259 return lp_build_select(bld
, cond
, a
, b
);
260 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN
:
261 cond
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, b
, a
);
262 return lp_build_select(bld
, cond
, b
, a
);
263 case GALLIVM_NAN_BEHAVIOR_UNDEFINED
:
264 cond
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, a
, b
);
265 return lp_build_select(bld
, cond
, a
, b
);
269 cond
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, a
, b
);
270 return lp_build_select(bld
, cond
, a
, b
);
273 cond
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, a
, b
);
274 return lp_build_select(bld
, cond
, a
, b
);
280 lp_build_fmuladd(LLVMBuilderRef builder
,
285 LLVMTypeRef type
= LLVMTypeOf(a
);
286 assert(type
== LLVMTypeOf(b
));
287 assert(type
== LLVMTypeOf(c
));
288 if (HAVE_LLVM
< 0x0304) {
289 /* XXX: LLVM 3.3 does not breakdown llvm.fmuladd into mul+add when FMA is
290 * not supported, and instead it falls-back to a C function.
292 return LLVMBuildFAdd(builder
, LLVMBuildFMul(builder
, a
, b
, ""), c
, "");
295 lp_format_intrinsic(intrinsic
, sizeof intrinsic
, "llvm.fmuladd", type
);
296 LLVMValueRef args
[] = { a
, b
, c
};
297 return lp_build_intrinsic(builder
, intrinsic
, type
, args
, 3, 0);
303 * No checks for special case values of a or b = 1 or 0 are done.
304 * NaN's are handled according to the behavior specified by the
305 * nan_behavior argument.
308 lp_build_max_simple(struct lp_build_context
*bld
,
311 enum gallivm_nan_behavior nan_behavior
)
313 const struct lp_type type
= bld
->type
;
314 const char *intrinsic
= NULL
;
315 unsigned intr_size
= 0;
318 assert(lp_check_value(type
, a
));
319 assert(lp_check_value(type
, b
));
321 /* TODO: optimize the constant case */
323 if (type
.floating
&& util_cpu_caps
.has_sse
) {
324 if (type
.width
== 32) {
325 if (type
.length
== 1) {
326 intrinsic
= "llvm.x86.sse.max.ss";
329 else if (type
.length
<= 4 || !util_cpu_caps
.has_avx
) {
330 intrinsic
= "llvm.x86.sse.max.ps";
334 intrinsic
= "llvm.x86.avx.max.ps.256";
338 if (type
.width
== 64 && util_cpu_caps
.has_sse2
) {
339 if (type
.length
== 1) {
340 intrinsic
= "llvm.x86.sse2.max.sd";
343 else if (type
.length
== 2 || !util_cpu_caps
.has_avx
) {
344 intrinsic
= "llvm.x86.sse2.max.pd";
348 intrinsic
= "llvm.x86.avx.max.pd.256";
353 else if (type
.floating
&& util_cpu_caps
.has_altivec
) {
354 if (nan_behavior
== GALLIVM_NAN_RETURN_NAN
||
355 nan_behavior
== GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN
) {
356 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
359 if (type
.width
== 32 || type
.length
== 4) {
360 intrinsic
= "llvm.ppc.altivec.vmaxfp";
363 } else if (HAVE_LLVM
< 0x0309 &&
364 util_cpu_caps
.has_avx2
&& type
.length
> 4) {
366 switch (type
.width
) {
368 intrinsic
= type
.sign
? "llvm.x86.avx2.pmaxs.b" : "llvm.x86.avx2.pmaxu.b";
371 intrinsic
= type
.sign
? "llvm.x86.avx2.pmaxs.w" : "llvm.x86.avx2.pmaxu.w";
374 intrinsic
= type
.sign
? "llvm.x86.avx2.pmaxs.d" : "llvm.x86.avx2.pmaxu.d";
377 } else if (HAVE_LLVM
< 0x0309 &&
378 util_cpu_caps
.has_sse2
&& type
.length
>= 2) {
380 if ((type
.width
== 8 || type
.width
== 16) &&
381 (type
.width
* type
.length
<= 64) &&
382 (gallivm_debug
& GALLIVM_DEBUG_PERF
)) {
383 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
386 if (type
.width
== 8 && !type
.sign
) {
387 intrinsic
= "llvm.x86.sse2.pmaxu.b";
390 else if (type
.width
== 16 && type
.sign
) {
391 intrinsic
= "llvm.x86.sse2.pmaxs.w";
393 if (util_cpu_caps
.has_sse4_1
) {
394 if (type
.width
== 8 && type
.sign
) {
395 intrinsic
= "llvm.x86.sse41.pmaxsb";
397 if (type
.width
== 16 && !type
.sign
) {
398 intrinsic
= "llvm.x86.sse41.pmaxuw";
400 if (type
.width
== 32 && !type
.sign
) {
401 intrinsic
= "llvm.x86.sse41.pmaxud";
403 if (type
.width
== 32 && type
.sign
) {
404 intrinsic
= "llvm.x86.sse41.pmaxsd";
407 } else if (util_cpu_caps
.has_altivec
) {
409 if (type
.width
== 8) {
411 intrinsic
= "llvm.ppc.altivec.vmaxub";
413 intrinsic
= "llvm.ppc.altivec.vmaxsb";
415 } else if (type
.width
== 16) {
417 intrinsic
= "llvm.ppc.altivec.vmaxuh";
419 intrinsic
= "llvm.ppc.altivec.vmaxsh";
421 } else if (type
.width
== 32) {
423 intrinsic
= "llvm.ppc.altivec.vmaxuw";
425 intrinsic
= "llvm.ppc.altivec.vmaxsw";
431 if (util_cpu_caps
.has_sse
&& type
.floating
&&
432 nan_behavior
!= GALLIVM_NAN_BEHAVIOR_UNDEFINED
&&
433 nan_behavior
!= GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN
&&
434 nan_behavior
!= GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN
) {
435 LLVMValueRef isnan
, max
;
436 max
= lp_build_intrinsic_binary_anylength(bld
->gallivm
, intrinsic
,
439 if (nan_behavior
== GALLIVM_NAN_RETURN_OTHER
) {
440 isnan
= lp_build_isnan(bld
, b
);
441 return lp_build_select(bld
, isnan
, a
, max
);
443 assert(nan_behavior
== GALLIVM_NAN_RETURN_NAN
);
444 isnan
= lp_build_isnan(bld
, a
);
445 return lp_build_select(bld
, isnan
, a
, max
);
448 return lp_build_intrinsic_binary_anylength(bld
->gallivm
, intrinsic
,
455 switch (nan_behavior
) {
456 case GALLIVM_NAN_RETURN_NAN
: {
457 LLVMValueRef isnan
= lp_build_isnan(bld
, b
);
458 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, b
);
459 cond
= LLVMBuildXor(bld
->gallivm
->builder
, cond
, isnan
, "");
460 return lp_build_select(bld
, cond
, a
, b
);
463 case GALLIVM_NAN_RETURN_OTHER
: {
464 LLVMValueRef isnan
= lp_build_isnan(bld
, a
);
465 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, b
);
466 cond
= LLVMBuildXor(bld
->gallivm
->builder
, cond
, isnan
, "");
467 return lp_build_select(bld
, cond
, a
, b
);
470 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN
:
471 cond
= lp_build_cmp_ordered(bld
, PIPE_FUNC_GREATER
, a
, b
);
472 return lp_build_select(bld
, cond
, a
, b
);
473 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN
:
474 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, b
, a
);
475 return lp_build_select(bld
, cond
, b
, a
);
476 case GALLIVM_NAN_BEHAVIOR_UNDEFINED
:
477 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, b
);
478 return lp_build_select(bld
, cond
, a
, b
);
482 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, b
);
483 return lp_build_select(bld
, cond
, a
, b
);
486 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, b
);
487 return lp_build_select(bld
, cond
, a
, b
);
493 * Generate 1 - a, or ~a depending on bld->type.
496 lp_build_comp(struct lp_build_context
*bld
,
499 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
500 const struct lp_type type
= bld
->type
;
502 assert(lp_check_value(type
, a
));
509 if(type
.norm
&& !type
.floating
&& !type
.fixed
&& !type
.sign
) {
510 if(LLVMIsConstant(a
))
511 return LLVMConstNot(a
);
513 return LLVMBuildNot(builder
, a
, "");
516 if(LLVMIsConstant(a
))
518 return LLVMConstFSub(bld
->one
, a
);
520 return LLVMConstSub(bld
->one
, a
);
523 return LLVMBuildFSub(builder
, bld
->one
, a
, "");
525 return LLVMBuildSub(builder
, bld
->one
, a
, "");
533 lp_build_add(struct lp_build_context
*bld
,
537 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
538 const struct lp_type type
= bld
->type
;
541 assert(lp_check_value(type
, a
));
542 assert(lp_check_value(type
, b
));
548 if (a
== bld
->undef
|| b
== bld
->undef
)
552 const char *intrinsic
= NULL
;
554 if (!type
.sign
&& (a
== bld
->one
|| b
== bld
->one
))
557 if (!type
.floating
&& !type
.fixed
) {
558 if (type
.width
* type
.length
== 128) {
559 if (util_cpu_caps
.has_sse2
) {
561 intrinsic
= type
.sign
? "llvm.x86.sse2.padds.b" :
562 HAVE_LLVM
< 0x0800 ? "llvm.x86.sse2.paddus.b" : NULL
;
563 if (type
.width
== 16)
564 intrinsic
= type
.sign
? "llvm.x86.sse2.padds.w" :
565 HAVE_LLVM
< 0x0800 ? "llvm.x86.sse2.paddus.w" : NULL
;
566 } else if (util_cpu_caps
.has_altivec
) {
568 intrinsic
= type
.sign
? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
569 if (type
.width
== 16)
570 intrinsic
= type
.sign
? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
573 if (type
.width
* type
.length
== 256) {
574 if (util_cpu_caps
.has_avx2
) {
576 intrinsic
= type
.sign
? "llvm.x86.avx2.padds.b" :
577 HAVE_LLVM
< 0x0800 ? "llvm.x86.avx2.paddus.b" : NULL
;
578 if (type
.width
== 16)
579 intrinsic
= type
.sign
? "llvm.x86.avx2.padds.w" :
580 HAVE_LLVM
< 0x0800 ? "llvm.x86.avx2.paddus.w" : NULL
;
586 return lp_build_intrinsic_binary(builder
, intrinsic
, lp_build_vec_type(bld
->gallivm
, bld
->type
), a
, b
);
589 if(type
.norm
&& !type
.floating
&& !type
.fixed
) {
591 uint64_t sign
= (uint64_t)1 << (type
.width
- 1);
592 LLVMValueRef max_val
= lp_build_const_int_vec(bld
->gallivm
, type
, sign
- 1);
593 LLVMValueRef min_val
= lp_build_const_int_vec(bld
->gallivm
, type
, sign
);
594 /* a_clamp_max is the maximum a for positive b,
595 a_clamp_min is the minimum a for negative b. */
596 LLVMValueRef a_clamp_max
= lp_build_min_simple(bld
, a
, LLVMBuildSub(builder
, max_val
, b
, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED
);
597 LLVMValueRef a_clamp_min
= lp_build_max_simple(bld
, a
, LLVMBuildSub(builder
, min_val
, b
, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED
);
598 a
= lp_build_select(bld
, lp_build_cmp(bld
, PIPE_FUNC_GREATER
, b
, bld
->zero
), a_clamp_max
, a_clamp_min
);
602 if(LLVMIsConstant(a
) && LLVMIsConstant(b
))
604 res
= LLVMConstFAdd(a
, b
);
606 res
= LLVMConstAdd(a
, b
);
609 res
= LLVMBuildFAdd(builder
, a
, b
, "");
611 res
= LLVMBuildAdd(builder
, a
, b
, "");
613 /* clamp to ceiling of 1.0 */
614 if(bld
->type
.norm
&& (bld
->type
.floating
|| bld
->type
.fixed
))
615 res
= lp_build_min_simple(bld
, res
, bld
->one
, GALLIVM_NAN_BEHAVIOR_UNDEFINED
);
617 if (type
.norm
&& !type
.floating
&& !type
.fixed
) {
620 * newer llvm versions no longer support the intrinsics, but recognize
621 * the pattern. Since auto-upgrade of intrinsics doesn't work for jit
622 * code, it is important we match the pattern llvm uses (and pray llvm
623 * doesn't change it - and hope they decide on the same pattern for
624 * all backends supporting it...).
625 * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
626 * interfere with llvm's ability to recognize the pattern but seems
629 LLVMValueRef overflowed
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, res
);
630 res
= lp_build_select(bld
, overflowed
,
631 LLVMConstAllOnes(bld
->int_vec_type
), res
);
635 /* XXX clamp to floor of -1 or 0??? */
641 /** Return the scalar sum of the elements of a.
642 * Should avoid this operation whenever possible.
645 lp_build_horizontal_add(struct lp_build_context
*bld
,
648 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
649 const struct lp_type type
= bld
->type
;
650 LLVMValueRef index
, res
;
652 LLVMValueRef shuffles1
[LP_MAX_VECTOR_LENGTH
/ 2];
653 LLVMValueRef shuffles2
[LP_MAX_VECTOR_LENGTH
/ 2];
654 LLVMValueRef vecres
, elem2
;
656 assert(lp_check_value(type
, a
));
658 if (type
.length
== 1) {
662 assert(!bld
->type
.norm
);
665 * for byte vectors can do much better with psadbw.
666 * Using repeated shuffle/adds here. Note with multiple vectors
667 * this can be done more efficiently as outlined in the intel
668 * optimization manual.
669 * Note: could cause data rearrangement if used with smaller element
674 length
= type
.length
/ 2;
676 LLVMValueRef vec1
, vec2
;
677 for (i
= 0; i
< length
; i
++) {
678 shuffles1
[i
] = lp_build_const_int32(bld
->gallivm
, i
);
679 shuffles2
[i
] = lp_build_const_int32(bld
->gallivm
, i
+ length
);
681 vec1
= LLVMBuildShuffleVector(builder
, vecres
, vecres
,
682 LLVMConstVector(shuffles1
, length
), "");
683 vec2
= LLVMBuildShuffleVector(builder
, vecres
, vecres
,
684 LLVMConstVector(shuffles2
, length
), "");
686 vecres
= LLVMBuildFAdd(builder
, vec1
, vec2
, "");
689 vecres
= LLVMBuildAdd(builder
, vec1
, vec2
, "");
691 length
= length
>> 1;
694 /* always have vector of size 2 here */
697 index
= lp_build_const_int32(bld
->gallivm
, 0);
698 res
= LLVMBuildExtractElement(builder
, vecres
, index
, "");
699 index
= lp_build_const_int32(bld
->gallivm
, 1);
700 elem2
= LLVMBuildExtractElement(builder
, vecres
, index
, "");
703 res
= LLVMBuildFAdd(builder
, res
, elem2
, "");
705 res
= LLVMBuildAdd(builder
, res
, elem2
, "");
711 * Return the horizontal sums of 4 float vectors as a float4 vector.
712 * This uses the technique as outlined in Intel Optimization Manual.
715 lp_build_horizontal_add4x4f(struct lp_build_context
*bld
,
718 struct gallivm_state
*gallivm
= bld
->gallivm
;
719 LLVMBuilderRef builder
= gallivm
->builder
;
720 LLVMValueRef shuffles
[4];
722 LLVMValueRef sumtmp
[2], shuftmp
[2];
724 /* lower half of regs */
725 shuffles
[0] = lp_build_const_int32(gallivm
, 0);
726 shuffles
[1] = lp_build_const_int32(gallivm
, 1);
727 shuffles
[2] = lp_build_const_int32(gallivm
, 4);
728 shuffles
[3] = lp_build_const_int32(gallivm
, 5);
729 tmp
[0] = LLVMBuildShuffleVector(builder
, src
[0], src
[1],
730 LLVMConstVector(shuffles
, 4), "");
731 tmp
[2] = LLVMBuildShuffleVector(builder
, src
[2], src
[3],
732 LLVMConstVector(shuffles
, 4), "");
734 /* upper half of regs */
735 shuffles
[0] = lp_build_const_int32(gallivm
, 2);
736 shuffles
[1] = lp_build_const_int32(gallivm
, 3);
737 shuffles
[2] = lp_build_const_int32(gallivm
, 6);
738 shuffles
[3] = lp_build_const_int32(gallivm
, 7);
739 tmp
[1] = LLVMBuildShuffleVector(builder
, src
[0], src
[1],
740 LLVMConstVector(shuffles
, 4), "");
741 tmp
[3] = LLVMBuildShuffleVector(builder
, src
[2], src
[3],
742 LLVMConstVector(shuffles
, 4), "");
744 sumtmp
[0] = LLVMBuildFAdd(builder
, tmp
[0], tmp
[1], "");
745 sumtmp
[1] = LLVMBuildFAdd(builder
, tmp
[2], tmp
[3], "");
747 shuffles
[0] = lp_build_const_int32(gallivm
, 0);
748 shuffles
[1] = lp_build_const_int32(gallivm
, 2);
749 shuffles
[2] = lp_build_const_int32(gallivm
, 4);
750 shuffles
[3] = lp_build_const_int32(gallivm
, 6);
751 shuftmp
[0] = LLVMBuildShuffleVector(builder
, sumtmp
[0], sumtmp
[1],
752 LLVMConstVector(shuffles
, 4), "");
754 shuffles
[0] = lp_build_const_int32(gallivm
, 1);
755 shuffles
[1] = lp_build_const_int32(gallivm
, 3);
756 shuffles
[2] = lp_build_const_int32(gallivm
, 5);
757 shuffles
[3] = lp_build_const_int32(gallivm
, 7);
758 shuftmp
[1] = LLVMBuildShuffleVector(builder
, sumtmp
[0], sumtmp
[1],
759 LLVMConstVector(shuffles
, 4), "");
761 return LLVMBuildFAdd(builder
, shuftmp
[0], shuftmp
[1], "");
766 * partially horizontally add 2-4 float vectors with length nx4,
767 * i.e. only four adjacent values in each vector will be added,
768 * assuming values are really grouped in 4 which also determines
771 * Return a vector of the same length as the initial vectors,
772 * with the excess elements (if any) being undefined.
773 * The element order is independent of number of input vectors.
774 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
775 * the output order thus will be
776 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4z7,undef
779 lp_build_hadd_partial4(struct lp_build_context
*bld
,
780 LLVMValueRef vectors
[],
783 struct gallivm_state
*gallivm
= bld
->gallivm
;
784 LLVMBuilderRef builder
= gallivm
->builder
;
785 LLVMValueRef ret_vec
;
787 const char *intrinsic
= NULL
;
789 assert(num_vecs
>= 2 && num_vecs
<= 4);
790 assert(bld
->type
.floating
);
792 /* only use this with at least 2 vectors, as it is sort of expensive
793 * (depending on cpu) and we always need two horizontal adds anyway,
794 * so a shuffle/add approach might be better.
800 tmp
[2] = num_vecs
> 2 ? vectors
[2] : vectors
[0];
801 tmp
[3] = num_vecs
> 3 ? vectors
[3] : vectors
[0];
803 if (util_cpu_caps
.has_sse3
&& bld
->type
.width
== 32 &&
804 bld
->type
.length
== 4) {
805 intrinsic
= "llvm.x86.sse3.hadd.ps";
807 else if (util_cpu_caps
.has_avx
&& bld
->type
.width
== 32 &&
808 bld
->type
.length
== 8) {
809 intrinsic
= "llvm.x86.avx.hadd.ps.256";
812 tmp
[0] = lp_build_intrinsic_binary(builder
, intrinsic
,
813 lp_build_vec_type(gallivm
, bld
->type
),
816 tmp
[1] = lp_build_intrinsic_binary(builder
, intrinsic
,
817 lp_build_vec_type(gallivm
, bld
->type
),
823 return lp_build_intrinsic_binary(builder
, intrinsic
,
824 lp_build_vec_type(gallivm
, bld
->type
),
828 if (bld
->type
.length
== 4) {
829 ret_vec
= lp_build_horizontal_add4x4f(bld
, tmp
);
832 LLVMValueRef partres
[LP_MAX_VECTOR_LENGTH
/4];
834 unsigned num_iter
= bld
->type
.length
/ 4;
835 struct lp_type parttype
= bld
->type
;
837 for (j
= 0; j
< num_iter
; j
++) {
838 LLVMValueRef partsrc
[4];
840 for (i
= 0; i
< 4; i
++) {
841 partsrc
[i
] = lp_build_extract_range(gallivm
, tmp
[i
], j
*4, 4);
843 partres
[j
] = lp_build_horizontal_add4x4f(bld
, partsrc
);
845 ret_vec
= lp_build_concat(gallivm
, partres
, parttype
, num_iter
);
854 lp_build_sub(struct lp_build_context
*bld
,
858 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
859 const struct lp_type type
= bld
->type
;
862 assert(lp_check_value(type
, a
));
863 assert(lp_check_value(type
, b
));
867 if (a
== bld
->undef
|| b
== bld
->undef
)
873 const char *intrinsic
= NULL
;
875 if (!type
.sign
&& b
== bld
->one
)
878 if (!type
.floating
&& !type
.fixed
) {
879 if (type
.width
* type
.length
== 128) {
880 if (util_cpu_caps
.has_sse2
) {
882 intrinsic
= type
.sign
? "llvm.x86.sse2.psubs.b" :
883 HAVE_LLVM
< 0x0800 ? "llvm.x86.sse2.psubus.b" : NULL
;
884 if (type
.width
== 16)
885 intrinsic
= type
.sign
? "llvm.x86.sse2.psubs.w" :
886 HAVE_LLVM
< 0x0800 ? "llvm.x86.sse2.psubus.w" : NULL
;
887 } else if (util_cpu_caps
.has_altivec
) {
889 intrinsic
= type
.sign
? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
890 if (type
.width
== 16)
891 intrinsic
= type
.sign
? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
894 if (type
.width
* type
.length
== 256) {
895 if (util_cpu_caps
.has_avx2
) {
897 intrinsic
= type
.sign
? "llvm.x86.avx2.psubs.b" :
898 HAVE_LLVM
< 0x0800 ? "llvm.x86.avx2.psubus.b" : NULL
;
899 if (type
.width
== 16)
900 intrinsic
= type
.sign
? "llvm.x86.avx2.psubs.w" :
901 HAVE_LLVM
< 0x0800 ? "llvm.x86.avx2.psubus.w" : NULL
;
907 return lp_build_intrinsic_binary(builder
, intrinsic
, lp_build_vec_type(bld
->gallivm
, bld
->type
), a
, b
);
910 if(type
.norm
&& !type
.floating
&& !type
.fixed
) {
912 uint64_t sign
= (uint64_t)1 << (type
.width
- 1);
913 LLVMValueRef max_val
= lp_build_const_int_vec(bld
->gallivm
, type
, sign
- 1);
914 LLVMValueRef min_val
= lp_build_const_int_vec(bld
->gallivm
, type
, sign
);
915 /* a_clamp_max is the maximum a for negative b,
916 a_clamp_min is the minimum a for positive b. */
917 LLVMValueRef a_clamp_max
= lp_build_min_simple(bld
, a
, LLVMBuildAdd(builder
, max_val
, b
, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED
);
918 LLVMValueRef a_clamp_min
= lp_build_max_simple(bld
, a
, LLVMBuildAdd(builder
, min_val
, b
, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED
);
919 a
= lp_build_select(bld
, lp_build_cmp(bld
, PIPE_FUNC_GREATER
, b
, bld
->zero
), a_clamp_min
, a_clamp_max
);
922 * This must match llvm pattern for saturated unsigned sub.
923 * (lp_build_max_simple actually does the job with its current
924 * definition but do it explicitly here.)
925 * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
926 * interfere with llvm's ability to recognize the pattern but seems
929 LLVMValueRef no_ov
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, b
);
930 a
= lp_build_select(bld
, no_ov
, a
, b
);
934 if(LLVMIsConstant(a
) && LLVMIsConstant(b
))
936 res
= LLVMConstFSub(a
, b
);
938 res
= LLVMConstSub(a
, b
);
941 res
= LLVMBuildFSub(builder
, a
, b
, "");
943 res
= LLVMBuildSub(builder
, a
, b
, "");
945 if(bld
->type
.norm
&& (bld
->type
.floating
|| bld
->type
.fixed
))
946 res
= lp_build_max_simple(bld
, res
, bld
->zero
, GALLIVM_NAN_BEHAVIOR_UNDEFINED
);
954 * Normalized multiplication.
956 * There are several approaches for (using 8-bit normalized multiplication as
961 * makes the following approximation to the division (Sree)
963 * a*b/255 ~= (a*(b + 1)) >> 256
965 * which is the fastest method that satisfies the following OpenGL criteria of
967 * 0*0 = 0 and 255*255 = 255
971 * takes the geometric series approximation to the division
973 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
975 * in this case just the first two terms to fit in 16bit arithmetic
977 * t/255 ~= (t + (t >> 8)) >> 8
979 * note that just by itself it doesn't satisfies the OpenGL criteria, as
980 * 255*255 = 254, so the special case b = 255 must be accounted or roundoff
983 * - geometric series plus rounding
985 * when using a geometric series division instead of truncating the result
986 * use roundoff in the approximation (Jim Blinn)
988 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
990 * achieving the exact results.
994 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
995 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
996 * @sa Michael Herf, The "double blend trick", May 2000,
997 * http://www.stereopsis.com/doubleblend.html
1000 lp_build_mul_norm(struct gallivm_state
*gallivm
,
1001 struct lp_type wide_type
,
1002 LLVMValueRef a
, LLVMValueRef b
)
1004 LLVMBuilderRef builder
= gallivm
->builder
;
1005 struct lp_build_context bld
;
1010 assert(!wide_type
.floating
);
1011 assert(lp_check_value(wide_type
, a
));
1012 assert(lp_check_value(wide_type
, b
));
1014 lp_build_context_init(&bld
, gallivm
, wide_type
);
1016 n
= wide_type
.width
/ 2;
1017 if (wide_type
.sign
) {
1022 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
1023 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
1027 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
1030 ab
= LLVMBuildMul(builder
, a
, b
, "");
1031 ab
= LLVMBuildAdd(builder
, ab
, lp_build_shr_imm(&bld
, ab
, n
), "");
1034 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
1037 half
= lp_build_const_int_vec(gallivm
, wide_type
, 1LL << (n
- 1));
1038 if (wide_type
.sign
) {
1039 LLVMValueRef minus_half
= LLVMBuildNeg(builder
, half
, "");
1040 LLVMValueRef sign
= lp_build_shr_imm(&bld
, ab
, wide_type
.width
- 1);
1041 half
= lp_build_select(&bld
, sign
, minus_half
, half
);
1043 ab
= LLVMBuildAdd(builder
, ab
, half
, "");
1045 /* Final division */
1046 ab
= lp_build_shr_imm(&bld
, ab
, n
);
1055 lp_build_mul(struct lp_build_context
*bld
,
1059 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1060 const struct lp_type type
= bld
->type
;
1064 assert(lp_check_value(type
, a
));
1065 assert(lp_check_value(type
, b
));
1075 if(a
== bld
->undef
|| b
== bld
->undef
)
1078 if (!type
.floating
&& !type
.fixed
&& type
.norm
) {
1079 struct lp_type wide_type
= lp_wider_type(type
);
1080 LLVMValueRef al
, ah
, bl
, bh
, abl
, abh
, ab
;
1082 lp_build_unpack2_native(bld
->gallivm
, type
, wide_type
, a
, &al
, &ah
);
1083 lp_build_unpack2_native(bld
->gallivm
, type
, wide_type
, b
, &bl
, &bh
);
1085 /* PMULLW, PSRLW, PADDW */
1086 abl
= lp_build_mul_norm(bld
->gallivm
, wide_type
, al
, bl
);
1087 abh
= lp_build_mul_norm(bld
->gallivm
, wide_type
, ah
, bh
);
1089 ab
= lp_build_pack2_native(bld
->gallivm
, wide_type
, type
, abl
, abh
);
1095 shift
= lp_build_const_int_vec(bld
->gallivm
, type
, type
.width
/2);
1099 if(LLVMIsConstant(a
) && LLVMIsConstant(b
)) {
1101 res
= LLVMConstFMul(a
, b
);
1103 res
= LLVMConstMul(a
, b
);
1106 res
= LLVMConstAShr(res
, shift
);
1108 res
= LLVMConstLShr(res
, shift
);
1113 res
= LLVMBuildFMul(builder
, a
, b
, "");
1115 res
= LLVMBuildMul(builder
, a
, b
, "");
1118 res
= LLVMBuildAShr(builder
, res
, shift
, "");
1120 res
= LLVMBuildLShr(builder
, res
, shift
, "");
1128 * Widening mul, valid for 32x32 bit -> 64bit only.
1129 * Result is low 32bits, high bits returned in res_hi.
1131 * Emits code that is meant to be compiled for the host CPU.
1134 lp_build_mul_32_lohi_cpu(struct lp_build_context
*bld
,
1137 LLVMValueRef
*res_hi
)
1139 struct gallivm_state
*gallivm
= bld
->gallivm
;
1140 LLVMBuilderRef builder
= gallivm
->builder
;
1142 assert(bld
->type
.width
== 32);
1143 assert(bld
->type
.floating
== 0);
1144 assert(bld
->type
.fixed
== 0);
1145 assert(bld
->type
.norm
== 0);
1148 * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces
1149 * for x86 simd is atrocious (even if the high bits weren't required),
1150 * trying to handle real 64bit inputs (which of course can't happen due
1151 * to using 64bit umul with 32bit numbers zero-extended to 64bit, but
1152 * apparently llvm does not recognize this widening mul). This includes 6
1153 * (instead of 2) pmuludq plus extra adds and shifts
1154 * The same story applies to signed mul, albeit fixing this requires sse41.
1155 * https://llvm.org/bugs/show_bug.cgi?id=30845
1156 * So, whip up our own code, albeit only for length 4 and 8 (which
1157 * should be good enough)...
1159 if ((bld
->type
.length
== 4 || bld
->type
.length
== 8) &&
1160 ((util_cpu_caps
.has_sse2
&& (bld
->type
.sign
== 0)) ||
1161 util_cpu_caps
.has_sse4_1
)) {
1162 const char *intrinsic
= NULL
;
1163 LLVMValueRef aeven
, aodd
, beven
, bodd
, muleven
, mulodd
;
1164 LLVMValueRef shuf
[LP_MAX_VECTOR_WIDTH
/ 32], shuf_vec
;
1165 struct lp_type type_wide
= lp_wider_type(bld
->type
);
1166 LLVMTypeRef wider_type
= lp_build_vec_type(gallivm
, type_wide
);
1168 for (i
= 0; i
< bld
->type
.length
; i
+= 2) {
1169 shuf
[i
] = lp_build_const_int32(gallivm
, i
+1);
1170 shuf
[i
+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm
->context
));
1172 shuf_vec
= LLVMConstVector(shuf
, bld
->type
.length
);
1175 aodd
= LLVMBuildShuffleVector(builder
, aeven
, bld
->undef
, shuf_vec
, "");
1176 bodd
= LLVMBuildShuffleVector(builder
, beven
, bld
->undef
, shuf_vec
, "");
1178 if (util_cpu_caps
.has_avx2
&& bld
->type
.length
== 8) {
1179 if (bld
->type
.sign
) {
1180 intrinsic
= "llvm.x86.avx2.pmul.dq";
1182 intrinsic
= "llvm.x86.avx2.pmulu.dq";
1184 muleven
= lp_build_intrinsic_binary(builder
, intrinsic
,
1185 wider_type
, aeven
, beven
);
1186 mulodd
= lp_build_intrinsic_binary(builder
, intrinsic
,
1187 wider_type
, aodd
, bodd
);
1190 /* for consistent naming look elsewhere... */
1191 if (bld
->type
.sign
) {
1192 intrinsic
= "llvm.x86.sse41.pmuldq";
1194 intrinsic
= "llvm.x86.sse2.pmulu.dq";
1197 * XXX If we only have AVX but not AVX2 this is a pain.
1198 * lp_build_intrinsic_binary_anylength() can't handle it
1199 * (due to src and dst type not being identical).
1201 if (bld
->type
.length
== 8) {
1202 LLVMValueRef aevenlo
, aevenhi
, bevenlo
, bevenhi
;
1203 LLVMValueRef aoddlo
, aoddhi
, boddlo
, boddhi
;
1204 LLVMValueRef muleven2
[2], mulodd2
[2];
1205 struct lp_type type_wide_half
= type_wide
;
1206 LLVMTypeRef wtype_half
;
1207 type_wide_half
.length
= 2;
1208 wtype_half
= lp_build_vec_type(gallivm
, type_wide_half
);
1209 aevenlo
= lp_build_extract_range(gallivm
, aeven
, 0, 4);
1210 aevenhi
= lp_build_extract_range(gallivm
, aeven
, 4, 4);
1211 bevenlo
= lp_build_extract_range(gallivm
, beven
, 0, 4);
1212 bevenhi
= lp_build_extract_range(gallivm
, beven
, 4, 4);
1213 aoddlo
= lp_build_extract_range(gallivm
, aodd
, 0, 4);
1214 aoddhi
= lp_build_extract_range(gallivm
, aodd
, 4, 4);
1215 boddlo
= lp_build_extract_range(gallivm
, bodd
, 0, 4);
1216 boddhi
= lp_build_extract_range(gallivm
, bodd
, 4, 4);
1217 muleven2
[0] = lp_build_intrinsic_binary(builder
, intrinsic
,
1218 wtype_half
, aevenlo
, bevenlo
);
1219 mulodd2
[0] = lp_build_intrinsic_binary(builder
, intrinsic
,
1220 wtype_half
, aoddlo
, boddlo
);
1221 muleven2
[1] = lp_build_intrinsic_binary(builder
, intrinsic
,
1222 wtype_half
, aevenhi
, bevenhi
);
1223 mulodd2
[1] = lp_build_intrinsic_binary(builder
, intrinsic
,
1224 wtype_half
, aoddhi
, boddhi
);
1225 muleven
= lp_build_concat(gallivm
, muleven2
, type_wide_half
, 2);
1226 mulodd
= lp_build_concat(gallivm
, mulodd2
, type_wide_half
, 2);
1230 muleven
= lp_build_intrinsic_binary(builder
, intrinsic
,
1231 wider_type
, aeven
, beven
);
1232 mulodd
= lp_build_intrinsic_binary(builder
, intrinsic
,
1233 wider_type
, aodd
, bodd
);
1236 muleven
= LLVMBuildBitCast(builder
, muleven
, bld
->vec_type
, "");
1237 mulodd
= LLVMBuildBitCast(builder
, mulodd
, bld
->vec_type
, "");
1239 for (i
= 0; i
< bld
->type
.length
; i
+= 2) {
1240 shuf
[i
] = lp_build_const_int32(gallivm
, i
+ 1);
1241 shuf
[i
+1] = lp_build_const_int32(gallivm
, i
+ 1 + bld
->type
.length
);
1243 shuf_vec
= LLVMConstVector(shuf
, bld
->type
.length
);
1244 *res_hi
= LLVMBuildShuffleVector(builder
, muleven
, mulodd
, shuf_vec
, "");
1246 for (i
= 0; i
< bld
->type
.length
; i
+= 2) {
1247 shuf
[i
] = lp_build_const_int32(gallivm
, i
);
1248 shuf
[i
+1] = lp_build_const_int32(gallivm
, i
+ bld
->type
.length
);
1250 shuf_vec
= LLVMConstVector(shuf
, bld
->type
.length
);
1251 return LLVMBuildShuffleVector(builder
, muleven
, mulodd
, shuf_vec
, "");
1254 return lp_build_mul_32_lohi(bld
, a
, b
, res_hi
);
1260 * Widening mul, valid for 32x32 bit -> 64bit only.
1261 * Result is low 32bits, high bits returned in res_hi.
1263 * Emits generic code.
1266 lp_build_mul_32_lohi(struct lp_build_context
*bld
,
1269 LLVMValueRef
*res_hi
)
1271 struct gallivm_state
*gallivm
= bld
->gallivm
;
1272 LLVMBuilderRef builder
= gallivm
->builder
;
1273 LLVMValueRef tmp
, shift
, res_lo
;
1274 struct lp_type type_tmp
;
1275 LLVMTypeRef wide_type
, narrow_type
;
1277 type_tmp
= bld
->type
;
1278 narrow_type
= lp_build_vec_type(gallivm
, type_tmp
);
1279 type_tmp
.width
*= 2;
1280 wide_type
= lp_build_vec_type(gallivm
, type_tmp
);
1281 shift
= lp_build_const_vec(gallivm
, type_tmp
, 32);
1283 if (bld
->type
.sign
) {
1284 a
= LLVMBuildSExt(builder
, a
, wide_type
, "");
1285 b
= LLVMBuildSExt(builder
, b
, wide_type
, "");
1287 a
= LLVMBuildZExt(builder
, a
, wide_type
, "");
1288 b
= LLVMBuildZExt(builder
, b
, wide_type
, "");
1290 tmp
= LLVMBuildMul(builder
, a
, b
, "");
1292 res_lo
= LLVMBuildTrunc(builder
, tmp
, narrow_type
, "");
1294 /* Since we truncate anyway, LShr and AShr are equivalent. */
1295 tmp
= LLVMBuildLShr(builder
, tmp
, shift
, "");
1296 *res_hi
= LLVMBuildTrunc(builder
, tmp
, narrow_type
, "");
1304 lp_build_mad(struct lp_build_context
*bld
,
1309 const struct lp_type type
= bld
->type
;
1310 if (type
.floating
) {
1311 return lp_build_fmuladd(bld
->gallivm
->builder
, a
, b
, c
);
1313 return lp_build_add(bld
, lp_build_mul(bld
, a
, b
), c
);
1319 * Small vector x scale multiplication optimization.
1322 lp_build_mul_imm(struct lp_build_context
*bld
,
1326 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1327 LLVMValueRef factor
;
1329 assert(lp_check_value(bld
->type
, a
));
1338 return lp_build_negate(bld
, a
);
1340 if(b
== 2 && bld
->type
.floating
)
1341 return lp_build_add(bld
, a
, a
);
1343 if(util_is_power_of_two_or_zero(b
)) {
1344 unsigned shift
= ffs(b
) - 1;
1346 if(bld
->type
.floating
) {
1349 * Power of two multiplication by directly manipulating the exponent.
1351 * XXX: This might not be always faster, it will introduce a small error
1352 * for multiplication by zero, and it will produce wrong results
1355 unsigned mantissa
= lp_mantissa(bld
->type
);
1356 factor
= lp_build_const_int_vec(bld
->gallivm
, bld
->type
, (unsigned long long)shift
<< mantissa
);
1357 a
= LLVMBuildBitCast(builder
, a
, lp_build_int_vec_type(bld
->type
), "");
1358 a
= LLVMBuildAdd(builder
, a
, factor
, "");
1359 a
= LLVMBuildBitCast(builder
, a
, lp_build_vec_type(bld
->gallivm
, bld
->type
), "");
1364 factor
= lp_build_const_vec(bld
->gallivm
, bld
->type
, shift
);
1365 return LLVMBuildShl(builder
, a
, factor
, "");
1369 factor
= lp_build_const_vec(bld
->gallivm
, bld
->type
, (double)b
);
1370 return lp_build_mul(bld
, a
, factor
);
1378 lp_build_div(struct lp_build_context
*bld
,
1382 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1383 const struct lp_type type
= bld
->type
;
1385 assert(lp_check_value(type
, a
));
1386 assert(lp_check_value(type
, b
));
1390 if(a
== bld
->one
&& type
.floating
)
1391 return lp_build_rcp(bld
, b
);
1396 if(a
== bld
->undef
|| b
== bld
->undef
)
1399 if(LLVMIsConstant(a
) && LLVMIsConstant(b
)) {
1401 return LLVMConstFDiv(a
, b
);
1403 return LLVMConstSDiv(a
, b
);
1405 return LLVMConstUDiv(a
, b
);
1408 /* fast rcp is disabled (just uses div), so makes no sense to try that */
1410 ((util_cpu_caps
.has_sse
&& type
.width
== 32 && type
.length
== 4) ||
1411 (util_cpu_caps
.has_avx
&& type
.width
== 32 && type
.length
== 8)) &&
1413 return lp_build_mul(bld
, a
, lp_build_rcp(bld
, b
));
1416 return LLVMBuildFDiv(builder
, a
, b
, "");
1418 return LLVMBuildSDiv(builder
, a
, b
, "");
1420 return LLVMBuildUDiv(builder
, a
, b
, "");
1425 * Linear interpolation helper.
1427 * @param normalized whether we are interpolating normalized values,
1428 * encoded in normalized integers, twice as wide.
1430 * @sa http://www.stereopsis.com/doubleblend.html
1432 static inline LLVMValueRef
1433 lp_build_lerp_simple(struct lp_build_context
*bld
,
1439 unsigned half_width
= bld
->type
.width
/2;
1440 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1444 assert(lp_check_value(bld
->type
, x
));
1445 assert(lp_check_value(bld
->type
, v0
));
1446 assert(lp_check_value(bld
->type
, v1
));
1448 delta
= lp_build_sub(bld
, v1
, v0
);
1450 if (bld
->type
.floating
) {
1452 return lp_build_mad(bld
, x
, delta
, v0
);
1455 if (flags
& LP_BLD_LERP_WIDE_NORMALIZED
) {
1456 if (!bld
->type
.sign
) {
1457 if (!(flags
& LP_BLD_LERP_PRESCALED_WEIGHTS
)) {
1459 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1460 * most-significant-bit to the lowest-significant-bit, so that
1461 * later we can just divide by 2**n instead of 2**n - 1.
1464 x
= lp_build_add(bld
, x
, lp_build_shr_imm(bld
, x
, half_width
- 1));
1467 /* (x * delta) >> n */
1468 res
= lp_build_mul(bld
, x
, delta
);
1469 res
= lp_build_shr_imm(bld
, res
, half_width
);
1472 * The rescaling trick above doesn't work for signed numbers, so
1473 * use the 2**n - 1 divison approximation in lp_build_mul_norm
1476 assert(!(flags
& LP_BLD_LERP_PRESCALED_WEIGHTS
));
1477 res
= lp_build_mul_norm(bld
->gallivm
, bld
->type
, x
, delta
);
1480 assert(!(flags
& LP_BLD_LERP_PRESCALED_WEIGHTS
));
1481 res
= lp_build_mul(bld
, x
, delta
);
1484 if ((flags
& LP_BLD_LERP_WIDE_NORMALIZED
) && !bld
->type
.sign
) {
1486 * At this point both res and v0 only use the lower half of the bits,
1487 * the rest is zero. Instead of add / mask, do add with half wide type.
1489 struct lp_type narrow_type
;
1490 struct lp_build_context narrow_bld
;
1492 memset(&narrow_type
, 0, sizeof narrow_type
);
1493 narrow_type
.sign
= bld
->type
.sign
;
1494 narrow_type
.width
= bld
->type
.width
/2;
1495 narrow_type
.length
= bld
->type
.length
*2;
1497 lp_build_context_init(&narrow_bld
, bld
->gallivm
, narrow_type
);
1498 res
= LLVMBuildBitCast(builder
, res
, narrow_bld
.vec_type
, "");
1499 v0
= LLVMBuildBitCast(builder
, v0
, narrow_bld
.vec_type
, "");
1500 res
= lp_build_add(&narrow_bld
, v0
, res
);
1501 res
= LLVMBuildBitCast(builder
, res
, bld
->vec_type
, "");
1503 res
= lp_build_add(bld
, v0
, res
);
1505 if (bld
->type
.fixed
) {
1507 * We need to mask out the high order bits when lerping 8bit
1508 * normalized colors stored on 16bits
1510 /* XXX: This step is necessary for lerping 8bit colors stored on
1511 * 16bits, but it will be wrong for true fixed point use cases.
1512 * Basically we need a more powerful lp_type, capable of further
1513 * distinguishing the values interpretation from the value storage.
1515 LLVMValueRef low_bits
;
1516 low_bits
= lp_build_const_int_vec(bld
->gallivm
, bld
->type
, (1 << half_width
) - 1);
1517 res
= LLVMBuildAnd(builder
, res
, low_bits
, "");
1526 * Linear interpolation.
1529 lp_build_lerp(struct lp_build_context
*bld
,
1535 const struct lp_type type
= bld
->type
;
1538 assert(lp_check_value(type
, x
));
1539 assert(lp_check_value(type
, v0
));
1540 assert(lp_check_value(type
, v1
));
1542 assert(!(flags
& LP_BLD_LERP_WIDE_NORMALIZED
));
1545 struct lp_type wide_type
;
1546 struct lp_build_context wide_bld
;
1547 LLVMValueRef xl
, xh
, v0l
, v0h
, v1l
, v1h
, resl
, resh
;
1549 assert(type
.length
>= 2);
1552 * Create a wider integer type, enough to hold the
1553 * intermediate result of the multiplication.
1555 memset(&wide_type
, 0, sizeof wide_type
);
1556 wide_type
.sign
= type
.sign
;
1557 wide_type
.width
= type
.width
*2;
1558 wide_type
.length
= type
.length
/2;
1560 lp_build_context_init(&wide_bld
, bld
->gallivm
, wide_type
);
1562 lp_build_unpack2_native(bld
->gallivm
, type
, wide_type
, x
, &xl
, &xh
);
1563 lp_build_unpack2_native(bld
->gallivm
, type
, wide_type
, v0
, &v0l
, &v0h
);
1564 lp_build_unpack2_native(bld
->gallivm
, type
, wide_type
, v1
, &v1l
, &v1h
);
1570 flags
|= LP_BLD_LERP_WIDE_NORMALIZED
;
1572 resl
= lp_build_lerp_simple(&wide_bld
, xl
, v0l
, v1l
, flags
);
1573 resh
= lp_build_lerp_simple(&wide_bld
, xh
, v0h
, v1h
, flags
);
1575 res
= lp_build_pack2_native(bld
->gallivm
, wide_type
, type
, resl
, resh
);
1577 res
= lp_build_lerp_simple(bld
, x
, v0
, v1
, flags
);
1585 * Bilinear interpolation.
1587 * Values indices are in v_{yx}.
1590 lp_build_lerp_2d(struct lp_build_context
*bld
,
1599 LLVMValueRef v0
= lp_build_lerp(bld
, x
, v00
, v01
, flags
);
1600 LLVMValueRef v1
= lp_build_lerp(bld
, x
, v10
, v11
, flags
);
1601 return lp_build_lerp(bld
, y
, v0
, v1
, flags
);
1606 lp_build_lerp_3d(struct lp_build_context
*bld
,
1620 LLVMValueRef v0
= lp_build_lerp_2d(bld
, x
, y
, v000
, v001
, v010
, v011
, flags
);
1621 LLVMValueRef v1
= lp_build_lerp_2d(bld
, x
, y
, v100
, v101
, v110
, v111
, flags
);
1622 return lp_build_lerp(bld
, z
, v0
, v1
, flags
);
1627 * Generate min(a, b)
1628 * Do checks for special cases but not for nans.
1631 lp_build_min(struct lp_build_context
*bld
,
1635 assert(lp_check_value(bld
->type
, a
));
1636 assert(lp_check_value(bld
->type
, b
));
1638 if(a
== bld
->undef
|| b
== bld
->undef
)
1644 if (bld
->type
.norm
) {
1645 if (!bld
->type
.sign
) {
1646 if (a
== bld
->zero
|| b
== bld
->zero
) {
1656 return lp_build_min_simple(bld
, a
, b
, GALLIVM_NAN_BEHAVIOR_UNDEFINED
);
1661 * Generate min(a, b)
1662 * NaN's are handled according to the behavior specified by the
1663 * nan_behavior argument.
1666 lp_build_min_ext(struct lp_build_context
*bld
,
1669 enum gallivm_nan_behavior nan_behavior
)
1671 assert(lp_check_value(bld
->type
, a
));
1672 assert(lp_check_value(bld
->type
, b
));
1674 if(a
== bld
->undef
|| b
== bld
->undef
)
1680 if (bld
->type
.norm
) {
1681 if (!bld
->type
.sign
) {
1682 if (a
== bld
->zero
|| b
== bld
->zero
) {
1692 return lp_build_min_simple(bld
, a
, b
, nan_behavior
);
1696 * Generate max(a, b)
1697 * Do checks for special cases, but NaN behavior is undefined.
1700 lp_build_max(struct lp_build_context
*bld
,
1704 assert(lp_check_value(bld
->type
, a
));
1705 assert(lp_check_value(bld
->type
, b
));
1707 if(a
== bld
->undef
|| b
== bld
->undef
)
1713 if(bld
->type
.norm
) {
1714 if(a
== bld
->one
|| b
== bld
->one
)
1716 if (!bld
->type
.sign
) {
1717 if (a
== bld
->zero
) {
1720 if (b
== bld
->zero
) {
1726 return lp_build_max_simple(bld
, a
, b
, GALLIVM_NAN_BEHAVIOR_UNDEFINED
);
1731 * Generate max(a, b)
1732 * Checks for special cases.
1733 * NaN's are handled according to the behavior specified by the
1734 * nan_behavior argument.
1737 lp_build_max_ext(struct lp_build_context
*bld
,
1740 enum gallivm_nan_behavior nan_behavior
)
1742 assert(lp_check_value(bld
->type
, a
));
1743 assert(lp_check_value(bld
->type
, b
));
1745 if(a
== bld
->undef
|| b
== bld
->undef
)
1751 if(bld
->type
.norm
) {
1752 if(a
== bld
->one
|| b
== bld
->one
)
1754 if (!bld
->type
.sign
) {
1755 if (a
== bld
->zero
) {
1758 if (b
== bld
->zero
) {
1764 return lp_build_max_simple(bld
, a
, b
, nan_behavior
);
1768 * Generate clamp(a, min, max)
1769 * NaN behavior (for any of a, min, max) is undefined.
1770 * Do checks for special cases.
1773 lp_build_clamp(struct lp_build_context
*bld
,
1778 assert(lp_check_value(bld
->type
, a
));
1779 assert(lp_check_value(bld
->type
, min
));
1780 assert(lp_check_value(bld
->type
, max
));
1782 a
= lp_build_min(bld
, a
, max
);
1783 a
= lp_build_max(bld
, a
, min
);
1789 * Generate clamp(a, 0, 1)
1790 * A NaN will get converted to zero.
1793 lp_build_clamp_zero_one_nanzero(struct lp_build_context
*bld
,
1796 a
= lp_build_max_ext(bld
, a
, bld
->zero
, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN
);
1797 a
= lp_build_min(bld
, a
, bld
->one
);
1806 lp_build_abs(struct lp_build_context
*bld
,
1809 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1810 const struct lp_type type
= bld
->type
;
1811 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
1813 assert(lp_check_value(type
, a
));
1819 if (0x0306 <= HAVE_LLVM
&& HAVE_LLVM
< 0x0309) {
1820 /* Workaround llvm.org/PR27332 */
1821 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
1822 unsigned long long absMask
= ~(1ULL << (type
.width
- 1));
1823 LLVMValueRef mask
= lp_build_const_int_vec(bld
->gallivm
, type
, ((unsigned long long) absMask
));
1824 a
= LLVMBuildBitCast(builder
, a
, int_vec_type
, "");
1825 a
= LLVMBuildAnd(builder
, a
, mask
, "");
1826 a
= LLVMBuildBitCast(builder
, a
, vec_type
, "");
1830 lp_format_intrinsic(intrinsic
, sizeof intrinsic
, "llvm.fabs", vec_type
);
1831 return lp_build_intrinsic_unary(builder
, intrinsic
, vec_type
, a
);
1835 if(type
.width
*type
.length
== 128 && util_cpu_caps
.has_ssse3
&& HAVE_LLVM
< 0x0600) {
1836 switch(type
.width
) {
1838 return lp_build_intrinsic_unary(builder
, "llvm.x86.ssse3.pabs.b.128", vec_type
, a
);
1840 return lp_build_intrinsic_unary(builder
, "llvm.x86.ssse3.pabs.w.128", vec_type
, a
);
1842 return lp_build_intrinsic_unary(builder
, "llvm.x86.ssse3.pabs.d.128", vec_type
, a
);
1845 else if (type
.width
*type
.length
== 256 && util_cpu_caps
.has_avx2
&& HAVE_LLVM
< 0x0600) {
1846 switch(type
.width
) {
1848 return lp_build_intrinsic_unary(builder
, "llvm.x86.avx2.pabs.b", vec_type
, a
);
1850 return lp_build_intrinsic_unary(builder
, "llvm.x86.avx2.pabs.w", vec_type
, a
);
1852 return lp_build_intrinsic_unary(builder
, "llvm.x86.avx2.pabs.d", vec_type
, a
);
1856 return lp_build_select(bld
, lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, bld
->zero
),
1857 a
, LLVMBuildNeg(builder
, a
, ""));
1862 lp_build_negate(struct lp_build_context
*bld
,
1865 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1867 assert(lp_check_value(bld
->type
, a
));
1869 if (bld
->type
.floating
)
1870 a
= LLVMBuildFNeg(builder
, a
, "");
1872 a
= LLVMBuildNeg(builder
, a
, "");
1878 /** Return -1, 0 or +1 depending on the sign of a */
1880 lp_build_sgn(struct lp_build_context
*bld
,
1883 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1884 const struct lp_type type
= bld
->type
;
1888 assert(lp_check_value(type
, a
));
1890 /* Handle non-zero case */
1892 /* if not zero then sign must be positive */
1895 else if(type
.floating
) {
1896 LLVMTypeRef vec_type
;
1897 LLVMTypeRef int_type
;
1901 unsigned long long maskBit
= (unsigned long long)1 << (type
.width
- 1);
1903 int_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
1904 vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
1905 mask
= lp_build_const_int_vec(bld
->gallivm
, type
, maskBit
);
1907 /* Take the sign bit and add it to 1 constant */
1908 sign
= LLVMBuildBitCast(builder
, a
, int_type
, "");
1909 sign
= LLVMBuildAnd(builder
, sign
, mask
, "");
1910 one
= LLVMConstBitCast(bld
->one
, int_type
);
1911 res
= LLVMBuildOr(builder
, sign
, one
, "");
1912 res
= LLVMBuildBitCast(builder
, res
, vec_type
, "");
1916 /* signed int/norm/fixed point */
1917 /* could use psign with sse3 and appropriate vectors here */
1918 LLVMValueRef minus_one
= lp_build_const_vec(bld
->gallivm
, type
, -1.0);
1919 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, bld
->zero
);
1920 res
= lp_build_select(bld
, cond
, bld
->one
, minus_one
);
1924 cond
= lp_build_cmp(bld
, PIPE_FUNC_EQUAL
, a
, bld
->zero
);
1925 res
= lp_build_select(bld
, cond
, bld
->zero
, res
);
1932 * Set the sign of float vector 'a' according to 'sign'.
1933 * If sign==0, return abs(a).
1934 * If sign==1, return -abs(a);
1935 * Other values for sign produce undefined results.
1938 lp_build_set_sign(struct lp_build_context
*bld
,
1939 LLVMValueRef a
, LLVMValueRef sign
)
1941 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1942 const struct lp_type type
= bld
->type
;
1943 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
1944 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
1945 LLVMValueRef shift
= lp_build_const_int_vec(bld
->gallivm
, type
, type
.width
- 1);
1946 LLVMValueRef mask
= lp_build_const_int_vec(bld
->gallivm
, type
,
1947 ~((unsigned long long) 1 << (type
.width
- 1)));
1948 LLVMValueRef val
, res
;
1950 assert(type
.floating
);
1951 assert(lp_check_value(type
, a
));
1953 /* val = reinterpret_cast<int>(a) */
1954 val
= LLVMBuildBitCast(builder
, a
, int_vec_type
, "");
1955 /* val = val & mask */
1956 val
= LLVMBuildAnd(builder
, val
, mask
, "");
1957 /* sign = sign << shift */
1958 sign
= LLVMBuildShl(builder
, sign
, shift
, "");
1959 /* res = val | sign */
1960 res
= LLVMBuildOr(builder
, val
, sign
, "");
1961 /* res = reinterpret_cast<float>(res) */
1962 res
= LLVMBuildBitCast(builder
, res
, vec_type
, "");
1969 * Convert vector of (or scalar) int to vector of (or scalar) float.
1972 lp_build_int_to_float(struct lp_build_context
*bld
,
1975 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1976 const struct lp_type type
= bld
->type
;
1977 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
1979 assert(type
.floating
);
1981 return LLVMBuildSIToFP(builder
, a
, vec_type
, "");
1985 arch_rounding_available(const struct lp_type type
)
1987 if ((util_cpu_caps
.has_sse4_1
&&
1988 (type
.length
== 1 || type
.width
*type
.length
== 128)) ||
1989 (util_cpu_caps
.has_avx
&& type
.width
*type
.length
== 256) ||
1990 (util_cpu_caps
.has_avx512f
&& type
.width
*type
.length
== 512))
1992 else if ((util_cpu_caps
.has_altivec
&&
1993 (type
.width
== 32 && type
.length
== 4)))
1995 else if (util_cpu_caps
.has_neon
)
/** Rounding modes understood by the lp_build_round_*() helpers. */
enum lp_build_round_mode
{
   LP_BUILD_ROUND_NEAREST = 0,   /* round to nearest */
   LP_BUILD_ROUND_FLOOR,         /* 1 */
   LP_BUILD_ROUND_CEIL,          /* 2 */
   LP_BUILD_ROUND_TRUNCATE       /* 3 */
};
2009 static inline LLVMValueRef
2010 lp_build_iround_nearest_sse2(struct lp_build_context
*bld
,
2013 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2014 const struct lp_type type
= bld
->type
;
2015 LLVMTypeRef i32t
= LLVMInt32TypeInContext(bld
->gallivm
->context
);
2016 LLVMTypeRef ret_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
2017 const char *intrinsic
;
2020 assert(type
.floating
);
2021 /* using the double precision conversions is a bit more complicated */
2022 assert(type
.width
== 32);
2024 assert(lp_check_value(type
, a
));
2025 assert(util_cpu_caps
.has_sse2
);
2027 /* This is relying on MXCSR rounding mode, which should always be nearest. */
2028 if (type
.length
== 1) {
2029 LLVMTypeRef vec_type
;
2032 LLVMValueRef index0
= LLVMConstInt(i32t
, 0, 0);
2034 vec_type
= LLVMVectorType(bld
->elem_type
, 4);
2036 intrinsic
= "llvm.x86.sse.cvtss2si";
2038 undef
= LLVMGetUndef(vec_type
);
2040 arg
= LLVMBuildInsertElement(builder
, undef
, a
, index0
, "");
2042 res
= lp_build_intrinsic_unary(builder
, intrinsic
,
2046 if (type
.width
* type
.length
== 128) {
2047 intrinsic
= "llvm.x86.sse2.cvtps2dq";
2050 assert(type
.width
*type
.length
== 256);
2051 assert(util_cpu_caps
.has_avx
);
2053 intrinsic
= "llvm.x86.avx.cvt.ps2dq.256";
2055 res
= lp_build_intrinsic_unary(builder
, intrinsic
,
2065 static inline LLVMValueRef
2066 lp_build_round_altivec(struct lp_build_context
*bld
,
2068 enum lp_build_round_mode mode
)
2070 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2071 const struct lp_type type
= bld
->type
;
2072 const char *intrinsic
= NULL
;
2074 assert(type
.floating
);
2076 assert(lp_check_value(type
, a
));
2077 assert(util_cpu_caps
.has_altivec
);
2082 case LP_BUILD_ROUND_NEAREST
:
2083 intrinsic
= "llvm.ppc.altivec.vrfin";
2085 case LP_BUILD_ROUND_FLOOR
:
2086 intrinsic
= "llvm.ppc.altivec.vrfim";
2088 case LP_BUILD_ROUND_CEIL
:
2089 intrinsic
= "llvm.ppc.altivec.vrfip";
2091 case LP_BUILD_ROUND_TRUNCATE
:
2092 intrinsic
= "llvm.ppc.altivec.vrfiz";
2096 return lp_build_intrinsic_unary(builder
, intrinsic
, bld
->vec_type
, a
);
2099 static inline LLVMValueRef
2100 lp_build_round_arch(struct lp_build_context
*bld
,
2102 enum lp_build_round_mode mode
)
2104 if (util_cpu_caps
.has_sse4_1
|| util_cpu_caps
.has_neon
) {
2105 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2106 const struct lp_type type
= bld
->type
;
2107 const char *intrinsic_root
;
2110 assert(type
.floating
);
2111 assert(lp_check_value(type
, a
));
2115 case LP_BUILD_ROUND_NEAREST
:
2116 intrinsic_root
= "llvm.nearbyint";
2118 case LP_BUILD_ROUND_FLOOR
:
2119 intrinsic_root
= "llvm.floor";
2121 case LP_BUILD_ROUND_CEIL
:
2122 intrinsic_root
= "llvm.ceil";
2124 case LP_BUILD_ROUND_TRUNCATE
:
2125 intrinsic_root
= "llvm.trunc";
2129 lp_format_intrinsic(intrinsic
, sizeof intrinsic
, intrinsic_root
, bld
->vec_type
);
2130 return lp_build_intrinsic_unary(builder
, intrinsic
, bld
->vec_type
, a
);
2132 else /* (util_cpu_caps.has_altivec) */
2133 return lp_build_round_altivec(bld
, a
, mode
);
2137 * Return the integer part of a float (vector) value (== round toward zero).
2138 * The returned value is a float (vector).
2139 * Ex: trunc(-1.5) = -1.0
2142 lp_build_trunc(struct lp_build_context
*bld
,
2145 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2146 const struct lp_type type
= bld
->type
;
2148 assert(type
.floating
);
2149 assert(lp_check_value(type
, a
));
2151 if (arch_rounding_available(type
)) {
2152 return lp_build_round_arch(bld
, a
, LP_BUILD_ROUND_TRUNCATE
);
2155 const struct lp_type type
= bld
->type
;
2156 struct lp_type inttype
;
2157 struct lp_build_context intbld
;
2158 LLVMValueRef cmpval
= lp_build_const_vec(bld
->gallivm
, type
, 1<<24);
2159 LLVMValueRef trunc
, res
, anosign
, mask
;
2160 LLVMTypeRef int_vec_type
= bld
->int_vec_type
;
2161 LLVMTypeRef vec_type
= bld
->vec_type
;
2163 assert(type
.width
== 32); /* might want to handle doubles at some point */
2166 inttype
.floating
= 0;
2167 lp_build_context_init(&intbld
, bld
->gallivm
, inttype
);
2169 /* round by truncation */
2170 trunc
= LLVMBuildFPToSI(builder
, a
, int_vec_type
, "");
2171 res
= LLVMBuildSIToFP(builder
, trunc
, vec_type
, "floor.trunc");
2173 /* mask out sign bit */
2174 anosign
= lp_build_abs(bld
, a
);
2176 * mask out all values if anosign > 2^24
2177 * This should work both for large ints (all rounding is no-op for them
2178 * because such floats are always exact) as well as special cases like
2179 * NaNs, Infs (taking advantage of the fact they use max exponent).
2180 * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
2182 anosign
= LLVMBuildBitCast(builder
, anosign
, int_vec_type
, "");
2183 cmpval
= LLVMBuildBitCast(builder
, cmpval
, int_vec_type
, "");
2184 mask
= lp_build_cmp(&intbld
, PIPE_FUNC_GREATER
, anosign
, cmpval
);
2185 return lp_build_select(bld
, mask
, a
, res
);
2191 * Return float (vector) rounded to nearest integer (vector). The returned
2192 * value is a float (vector).
2193 * Ex: round(0.9) = 1.0
2194 * Ex: round(-1.5) = -2.0
2197 lp_build_round(struct lp_build_context
*bld
,
2200 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2201 const struct lp_type type
= bld
->type
;
2203 assert(type
.floating
);
2204 assert(lp_check_value(type
, a
));
2206 if (arch_rounding_available(type
)) {
2207 return lp_build_round_arch(bld
, a
, LP_BUILD_ROUND_NEAREST
);
2210 const struct lp_type type
= bld
->type
;
2211 struct lp_type inttype
;
2212 struct lp_build_context intbld
;
2213 LLVMValueRef cmpval
= lp_build_const_vec(bld
->gallivm
, type
, 1<<24);
2214 LLVMValueRef res
, anosign
, mask
;
2215 LLVMTypeRef int_vec_type
= bld
->int_vec_type
;
2216 LLVMTypeRef vec_type
= bld
->vec_type
;
2218 assert(type
.width
== 32); /* might want to handle doubles at some point */
2221 inttype
.floating
= 0;
2222 lp_build_context_init(&intbld
, bld
->gallivm
, inttype
);
2224 res
= lp_build_iround(bld
, a
);
2225 res
= LLVMBuildSIToFP(builder
, res
, vec_type
, "");
2227 /* mask out sign bit */
2228 anosign
= lp_build_abs(bld
, a
);
2230 * mask out all values if anosign > 2^24
2231 * This should work both for large ints (all rounding is no-op for them
2232 * because such floats are always exact) as well as special cases like
2233 * NaNs, Infs (taking advantage of the fact they use max exponent).
2234 * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
2236 anosign
= LLVMBuildBitCast(builder
, anosign
, int_vec_type
, "");
2237 cmpval
= LLVMBuildBitCast(builder
, cmpval
, int_vec_type
, "");
2238 mask
= lp_build_cmp(&intbld
, PIPE_FUNC_GREATER
, anosign
, cmpval
);
2239 return lp_build_select(bld
, mask
, a
, res
);
2245 * Return floor of float (vector), result is a float (vector)
2246 * Ex: floor(1.1) = 1.0
2247 * Ex: floor(-1.1) = -2.0
2250 lp_build_floor(struct lp_build_context
*bld
,
2253 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2254 const struct lp_type type
= bld
->type
;
2256 assert(type
.floating
);
2257 assert(lp_check_value(type
, a
));
2259 if (arch_rounding_available(type
)) {
2260 return lp_build_round_arch(bld
, a
, LP_BUILD_ROUND_FLOOR
);
2263 const struct lp_type type
= bld
->type
;
2264 struct lp_type inttype
;
2265 struct lp_build_context intbld
;
2266 LLVMValueRef cmpval
= lp_build_const_vec(bld
->gallivm
, type
, 1<<24);
2267 LLVMValueRef trunc
, res
, anosign
, mask
;
2268 LLVMTypeRef int_vec_type
= bld
->int_vec_type
;
2269 LLVMTypeRef vec_type
= bld
->vec_type
;
2271 if (type
.width
!= 32) {
2273 lp_format_intrinsic(intrinsic
, sizeof intrinsic
, "llvm.floor", vec_type
);
2274 return lp_build_intrinsic_unary(builder
, intrinsic
, vec_type
, a
);
2277 assert(type
.width
== 32); /* might want to handle doubles at some point */
2280 inttype
.floating
= 0;
2281 lp_build_context_init(&intbld
, bld
->gallivm
, inttype
);
2283 /* round by truncation */
2284 trunc
= LLVMBuildFPToSI(builder
, a
, int_vec_type
, "");
2285 res
= LLVMBuildSIToFP(builder
, trunc
, vec_type
, "floor.trunc");
2291 * fix values if rounding is wrong (for non-special cases)
2292 * - this is the case if trunc > a
2294 mask
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, res
, a
);
2295 /* tmp = trunc > a ? 1.0 : 0.0 */
2296 tmp
= LLVMBuildBitCast(builder
, bld
->one
, int_vec_type
, "");
2297 tmp
= lp_build_and(&intbld
, mask
, tmp
);
2298 tmp
= LLVMBuildBitCast(builder
, tmp
, vec_type
, "");
2299 res
= lp_build_sub(bld
, res
, tmp
);
2302 /* mask out sign bit */
2303 anosign
= lp_build_abs(bld
, a
);
2305 * mask out all values if anosign > 2^24
2306 * This should work both for large ints (all rounding is no-op for them
2307 * because such floats are always exact) as well as special cases like
2308 * NaNs, Infs (taking advantage of the fact they use max exponent).
2309 * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
2311 anosign
= LLVMBuildBitCast(builder
, anosign
, int_vec_type
, "");
2312 cmpval
= LLVMBuildBitCast(builder
, cmpval
, int_vec_type
, "");
2313 mask
= lp_build_cmp(&intbld
, PIPE_FUNC_GREATER
, anosign
, cmpval
);
2314 return lp_build_select(bld
, mask
, a
, res
);
2320 * Return ceiling of float (vector), returning float (vector).
2321 * Ex: ceil( 1.1) = 2.0
2322 * Ex: ceil(-1.1) = -1.0
2325 lp_build_ceil(struct lp_build_context
*bld
,
2328 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2329 const struct lp_type type
= bld
->type
;
2331 assert(type
.floating
);
2332 assert(lp_check_value(type
, a
));
2334 if (arch_rounding_available(type
)) {
2335 return lp_build_round_arch(bld
, a
, LP_BUILD_ROUND_CEIL
);
2338 const struct lp_type type
= bld
->type
;
2339 struct lp_type inttype
;
2340 struct lp_build_context intbld
;
2341 LLVMValueRef cmpval
= lp_build_const_vec(bld
->gallivm
, type
, 1<<24);
2342 LLVMValueRef trunc
, res
, anosign
, mask
, tmp
;
2343 LLVMTypeRef int_vec_type
= bld
->int_vec_type
;
2344 LLVMTypeRef vec_type
= bld
->vec_type
;
2346 if (type
.width
!= 32) {
2348 lp_format_intrinsic(intrinsic
, sizeof intrinsic
, "llvm.ceil", vec_type
);
2349 return lp_build_intrinsic_unary(builder
, intrinsic
, vec_type
, a
);
2352 assert(type
.width
== 32); /* might want to handle doubles at some point */
2355 inttype
.floating
= 0;
2356 lp_build_context_init(&intbld
, bld
->gallivm
, inttype
);
2358 /* round by truncation */
2359 trunc
= LLVMBuildFPToSI(builder
, a
, int_vec_type
, "");
2360 trunc
= LLVMBuildSIToFP(builder
, trunc
, vec_type
, "ceil.trunc");
2363 * fix values if rounding is wrong (for non-special cases)
2364 * - this is the case if trunc < a
2366 mask
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, trunc
, a
);
2367 /* tmp = trunc < a ? 1.0 : 0.0 */
2368 tmp
= LLVMBuildBitCast(builder
, bld
->one
, int_vec_type
, "");
2369 tmp
= lp_build_and(&intbld
, mask
, tmp
);
2370 tmp
= LLVMBuildBitCast(builder
, tmp
, vec_type
, "");
2371 res
= lp_build_add(bld
, trunc
, tmp
);
2373 /* mask out sign bit */
2374 anosign
= lp_build_abs(bld
, a
);
2376 * mask out all values if anosign > 2^24
2377 * This should work both for large ints (all rounding is no-op for them
2378 * because such floats are always exact) as well as special cases like
2379 * NaNs, Infs (taking advantage of the fact they use max exponent).
2380 * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
2382 anosign
= LLVMBuildBitCast(builder
, anosign
, int_vec_type
, "");
2383 cmpval
= LLVMBuildBitCast(builder
, cmpval
, int_vec_type
, "");
2384 mask
= lp_build_cmp(&intbld
, PIPE_FUNC_GREATER
, anosign
, cmpval
);
2385 return lp_build_select(bld
, mask
, a
, res
);
2391 * Return fractional part of 'a' computed as a - floor(a)
2392 * Typically used in texture coord arithmetic.
2395 lp_build_fract(struct lp_build_context
*bld
,
2398 assert(bld
->type
.floating
);
2399 return lp_build_sub(bld
, a
, lp_build_floor(bld
, a
));
2404 * Prevent returning 1.0 for very small negative values of 'a' by clamping
2405 * against 0.99999(9). (Will also return that value for NaNs.)
2407 static inline LLVMValueRef
2408 clamp_fract(struct lp_build_context
*bld
, LLVMValueRef fract
)
2412 /* this is the largest number smaller than 1.0 representable as float */
2413 max
= lp_build_const_vec(bld
->gallivm
, bld
->type
,
2414 1.0 - 1.0/(1LL << (lp_mantissa(bld
->type
) + 1)));
2415 return lp_build_min_ext(bld
, fract
, max
,
2416 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN
);
2421 * Same as lp_build_fract, but guarantees that the result is always smaller
2422 * than one. Will also return the smaller-than-one value for infs, NaNs.
2425 lp_build_fract_safe(struct lp_build_context
*bld
,
2428 return clamp_fract(bld
, lp_build_fract(bld
, a
));
2433 * Return the integer part of a float (vector) value (== round toward zero).
2434 * The returned value is an integer (vector).
2435 * Ex: itrunc(-1.5) = -1
2438 lp_build_itrunc(struct lp_build_context
*bld
,
2441 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2442 const struct lp_type type
= bld
->type
;
2443 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
2445 assert(type
.floating
);
2446 assert(lp_check_value(type
, a
));
2448 return LLVMBuildFPToSI(builder
, a
, int_vec_type
, "");
2453 * Return float (vector) rounded to nearest integer (vector). The returned
2454 * value is an integer (vector).
2455 * Ex: iround(0.9) = 1
2456 * Ex: iround(-1.5) = -2
2459 lp_build_iround(struct lp_build_context
*bld
,
2462 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2463 const struct lp_type type
= bld
->type
;
2464 LLVMTypeRef int_vec_type
= bld
->int_vec_type
;
2467 assert(type
.floating
);
2469 assert(lp_check_value(type
, a
));
2471 if ((util_cpu_caps
.has_sse2
&&
2472 ((type
.width
== 32) && (type
.length
== 1 || type
.length
== 4))) ||
2473 (util_cpu_caps
.has_avx
&& type
.width
== 32 && type
.length
== 8)) {
2474 return lp_build_iround_nearest_sse2(bld
, a
);
2476 if (arch_rounding_available(type
)) {
2477 res
= lp_build_round_arch(bld
, a
, LP_BUILD_ROUND_NEAREST
);
2482 half
= lp_build_const_vec(bld
->gallivm
, type
, nextafterf(0.5, 0.0));
2485 LLVMTypeRef vec_type
= bld
->vec_type
;
2486 LLVMValueRef mask
= lp_build_const_int_vec(bld
->gallivm
, type
,
2487 (unsigned long long)1 << (type
.width
- 1));
2491 sign
= LLVMBuildBitCast(builder
, a
, int_vec_type
, "");
2492 sign
= LLVMBuildAnd(builder
, sign
, mask
, "");
2495 half
= LLVMBuildBitCast(builder
, half
, int_vec_type
, "");
2496 half
= LLVMBuildOr(builder
, sign
, half
, "");
2497 half
= LLVMBuildBitCast(builder
, half
, vec_type
, "");
2500 res
= LLVMBuildFAdd(builder
, a
, half
, "");
2503 res
= LLVMBuildFPToSI(builder
, res
, int_vec_type
, "");
2510 * Return floor of float (vector), result is an int (vector)
2511 * Ex: ifloor(1.1) = 1.0
2512 * Ex: ifloor(-1.1) = -2.0
2515 lp_build_ifloor(struct lp_build_context
*bld
,
2518 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2519 const struct lp_type type
= bld
->type
;
2520 LLVMTypeRef int_vec_type
= bld
->int_vec_type
;
2523 assert(type
.floating
);
2524 assert(lp_check_value(type
, a
));
2528 if (arch_rounding_available(type
)) {
2529 res
= lp_build_round_arch(bld
, a
, LP_BUILD_ROUND_FLOOR
);
2532 struct lp_type inttype
;
2533 struct lp_build_context intbld
;
2534 LLVMValueRef trunc
, itrunc
, mask
;
2536 assert(type
.floating
);
2537 assert(lp_check_value(type
, a
));
2540 inttype
.floating
= 0;
2541 lp_build_context_init(&intbld
, bld
->gallivm
, inttype
);
2543 /* round by truncation */
2544 itrunc
= LLVMBuildFPToSI(builder
, a
, int_vec_type
, "");
2545 trunc
= LLVMBuildSIToFP(builder
, itrunc
, bld
->vec_type
, "ifloor.trunc");
2548 * fix values if rounding is wrong (for non-special cases)
2549 * - this is the case if trunc > a
2550 * The results of doing this with NaNs, very large values etc.
2551 * are undefined but this seems to be the case anyway.
2553 mask
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, trunc
, a
);
2554 /* cheapie minus one with mask since the mask is minus one / zero */
2555 return lp_build_add(&intbld
, itrunc
, mask
);
2559 /* round to nearest (toward zero) */
2560 res
= LLVMBuildFPToSI(builder
, res
, int_vec_type
, "ifloor.res");
2567 * Return ceiling of float (vector), returning int (vector).
2568 * Ex: iceil( 1.1) = 2
2569 * Ex: iceil(-1.1) = -1
2572 lp_build_iceil(struct lp_build_context
*bld
,
2575 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2576 const struct lp_type type
= bld
->type
;
2577 LLVMTypeRef int_vec_type
= bld
->int_vec_type
;
2580 assert(type
.floating
);
2581 assert(lp_check_value(type
, a
));
2583 if (arch_rounding_available(type
)) {
2584 res
= lp_build_round_arch(bld
, a
, LP_BUILD_ROUND_CEIL
);
2587 struct lp_type inttype
;
2588 struct lp_build_context intbld
;
2589 LLVMValueRef trunc
, itrunc
, mask
;
2591 assert(type
.floating
);
2592 assert(lp_check_value(type
, a
));
2595 inttype
.floating
= 0;
2596 lp_build_context_init(&intbld
, bld
->gallivm
, inttype
);
2598 /* round by truncation */
2599 itrunc
= LLVMBuildFPToSI(builder
, a
, int_vec_type
, "");
2600 trunc
= LLVMBuildSIToFP(builder
, itrunc
, bld
->vec_type
, "iceil.trunc");
2603 * fix values if rounding is wrong (for non-special cases)
2604 * - this is the case if trunc < a
2605 * The results of doing this with NaNs, very large values etc.
2606 * are undefined but this seems to be the case anyway.
2608 mask
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, trunc
, a
);
2609 /* cheapie plus one with mask since the mask is minus one / zero */
2610 return lp_build_sub(&intbld
, itrunc
, mask
);
2613 /* round to nearest (toward zero) */
2614 res
= LLVMBuildFPToSI(builder
, res
, int_vec_type
, "iceil.res");
2621 * Combined ifloor() & fract().
2623 * Preferred to calling the functions separately, as it will ensure that the
2624 * strategy (floor() vs ifloor()) that results in less redundant work is used.
2627 lp_build_ifloor_fract(struct lp_build_context
*bld
,
2629 LLVMValueRef
*out_ipart
,
2630 LLVMValueRef
*out_fpart
)
2632 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2633 const struct lp_type type
= bld
->type
;
2636 assert(type
.floating
);
2637 assert(lp_check_value(type
, a
));
2639 if (arch_rounding_available(type
)) {
2641 * floor() is easier.
2644 ipart
= lp_build_floor(bld
, a
);
2645 *out_fpart
= LLVMBuildFSub(builder
, a
, ipart
, "fpart");
2646 *out_ipart
= LLVMBuildFPToSI(builder
, ipart
, bld
->int_vec_type
, "ipart");
2650 * ifloor() is easier.
2653 *out_ipart
= lp_build_ifloor(bld
, a
);
2654 ipart
= LLVMBuildSIToFP(builder
, *out_ipart
, bld
->vec_type
, "ipart");
2655 *out_fpart
= LLVMBuildFSub(builder
, a
, ipart
, "fpart");
2661 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2662 * always smaller than one.
2665 lp_build_ifloor_fract_safe(struct lp_build_context
*bld
,
2667 LLVMValueRef
*out_ipart
,
2668 LLVMValueRef
*out_fpart
)
2670 lp_build_ifloor_fract(bld
, a
, out_ipart
, out_fpart
);
2671 *out_fpart
= clamp_fract(bld
, *out_fpart
);
2676 lp_build_sqrt(struct lp_build_context
*bld
,
2679 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2680 const struct lp_type type
= bld
->type
;
2681 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
2684 assert(lp_check_value(type
, a
));
2686 assert(type
.floating
);
2687 lp_format_intrinsic(intrinsic
, sizeof intrinsic
, "llvm.sqrt", vec_type
);
2689 return lp_build_intrinsic_unary(builder
, intrinsic
, vec_type
, a
);
2694 * Do one Newton-Raphson step to improve reciprocate precision:
2696 * x_{i+1} = x_i * (2 - a * x_i)
2698 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2699 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
2700 * such as Google Earth, which does RCP(RSQRT(0.0) when drawing the Earth's
2701 * halo. It would be necessary to clamp the argument to prevent this.
2704 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2705 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2707 static inline LLVMValueRef
2708 lp_build_rcp_refine(struct lp_build_context
*bld
,
2712 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2713 LLVMValueRef two
= lp_build_const_vec(bld
->gallivm
, bld
->type
, 2.0);
2716 res
= LLVMBuildFMul(builder
, a
, rcp_a
, "");
2717 res
= LLVMBuildFSub(builder
, two
, res
, "");
2718 res
= LLVMBuildFMul(builder
, rcp_a
, res
, "");
2725 lp_build_rcp(struct lp_build_context
*bld
,
2728 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2729 const struct lp_type type
= bld
->type
;
2731 assert(lp_check_value(type
, a
));
2740 assert(type
.floating
);
2742 if(LLVMIsConstant(a
))
2743 return LLVMConstFDiv(bld
->one
, a
);
2746 * We don't use RCPPS because:
2747 * - it only has 10bits of precision
2748 * - it doesn't even get the reciprocate of 1.0 exactly
2749 * - doing Newton-Rapshon steps yields wrong (NaN) values for 0.0 or Inf
2750 * - for recent processors the benefit over DIVPS is marginal, a case
2753 * We could still use it on certain processors if benchmarks show that the
2754 * RCPPS plus necessary workarounds are still preferrable to DIVPS; or for
2755 * particular uses that require less workarounds.
2758 if (FALSE
&& ((util_cpu_caps
.has_sse
&& type
.width
== 32 && type
.length
== 4) ||
2759 (util_cpu_caps
.has_avx
&& type
.width
== 32 && type
.length
== 8))){
2760 const unsigned num_iterations
= 0;
2763 const char *intrinsic
= NULL
;
2765 if (type
.length
== 4) {
2766 intrinsic
= "llvm.x86.sse.rcp.ps";
2769 intrinsic
= "llvm.x86.avx.rcp.ps.256";
2772 res
= lp_build_intrinsic_unary(builder
, intrinsic
, bld
->vec_type
, a
);
2774 for (i
= 0; i
< num_iterations
; ++i
) {
2775 res
= lp_build_rcp_refine(bld
, a
, res
);
2781 return LLVMBuildFDiv(builder
, bld
->one
, a
, "");
2786 * Do one Newton-Raphson step to improve rsqrt precision:
2788 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2790 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2792 static inline LLVMValueRef
2793 lp_build_rsqrt_refine(struct lp_build_context
*bld
,
2795 LLVMValueRef rsqrt_a
)
2797 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2798 LLVMValueRef half
= lp_build_const_vec(bld
->gallivm
, bld
->type
, 0.5);
2799 LLVMValueRef three
= lp_build_const_vec(bld
->gallivm
, bld
->type
, 3.0);
2802 res
= LLVMBuildFMul(builder
, rsqrt_a
, rsqrt_a
, "");
2803 res
= LLVMBuildFMul(builder
, a
, res
, "");
2804 res
= LLVMBuildFSub(builder
, three
, res
, "");
2805 res
= LLVMBuildFMul(builder
, rsqrt_a
, res
, "");
2806 res
= LLVMBuildFMul(builder
, half
, res
, "");
2813 * Generate 1/sqrt(a).
2814 * Result is undefined for values < 0, infinity for +0.
2817 lp_build_rsqrt(struct lp_build_context
*bld
,
2820 const struct lp_type type
= bld
->type
;
2822 assert(lp_check_value(type
, a
));
2824 assert(type
.floating
);
2827 * This should be faster but all denormals will end up as infinity.
2829 if (0 && lp_build_fast_rsqrt_available(type
)) {
2830 const unsigned num_iterations
= 1;
2834 /* rsqrt(1.0) != 1.0 here */
2835 res
= lp_build_fast_rsqrt(bld
, a
);
2837 if (num_iterations
) {
2839 * Newton-Raphson will result in NaN instead of infinity for zero,
2840 * and NaN instead of zero for infinity.
2841 * Also, need to ensure rsqrt(1.0) == 1.0.
2842 * All numbers smaller than FLT_MIN will result in +infinity
2843 * (rsqrtps treats all denormals as zero).
2846 LLVMValueRef flt_min
= lp_build_const_vec(bld
->gallivm
, type
, FLT_MIN
);
2847 LLVMValueRef inf
= lp_build_const_vec(bld
->gallivm
, type
, INFINITY
);
2849 for (i
= 0; i
< num_iterations
; ++i
) {
2850 res
= lp_build_rsqrt_refine(bld
, a
, res
);
2852 cmp
= lp_build_compare(bld
->gallivm
, type
, PIPE_FUNC_LESS
, a
, flt_min
);
2853 res
= lp_build_select(bld
, cmp
, inf
, res
);
2854 cmp
= lp_build_compare(bld
->gallivm
, type
, PIPE_FUNC_EQUAL
, a
, inf
);
2855 res
= lp_build_select(bld
, cmp
, bld
->zero
, res
);
2856 cmp
= lp_build_compare(bld
->gallivm
, type
, PIPE_FUNC_EQUAL
, a
, bld
->one
);
2857 res
= lp_build_select(bld
, cmp
, bld
->one
, res
);
2863 return lp_build_rcp(bld
, lp_build_sqrt(bld
, a
));
2867 * If there's a fast (inaccurate) rsqrt instruction available
2868 * (caller may want to avoid to call rsqrt_fast if it's not available,
2869 * i.e. for calculating x^0.5 it may do rsqrt_fast(x) * x but if
2870 * unavailable it would result in sqrt/div/mul so obviously
2871 * much better to just call sqrt, skipping both div and mul).
2874 lp_build_fast_rsqrt_available(struct lp_type type
)
2876 assert(type
.floating
);
2878 if ((util_cpu_caps
.has_sse
&& type
.width
== 32 && type
.length
== 4) ||
2879 (util_cpu_caps
.has_avx
&& type
.width
== 32 && type
.length
== 8)) {
2887 * Generate 1/sqrt(a).
2888 * Result is undefined for values < 0, infinity for +0.
2889 * Precision is limited, only ~10 bits guaranteed
2890 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2893 lp_build_fast_rsqrt(struct lp_build_context
*bld
,
2896 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2897 const struct lp_type type
= bld
->type
;
2899 assert(lp_check_value(type
, a
));
2901 if (lp_build_fast_rsqrt_available(type
)) {
2902 const char *intrinsic
= NULL
;
2904 if (type
.length
== 4) {
2905 intrinsic
= "llvm.x86.sse.rsqrt.ps";
2908 intrinsic
= "llvm.x86.avx.rsqrt.ps.256";
2910 return lp_build_intrinsic_unary(builder
, intrinsic
, bld
->vec_type
, a
);
2913 debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__
);
2915 return lp_build_rcp(bld
, lp_build_sqrt(bld
, a
));
2920 * Generate sin(a) or cos(a) using polynomial approximation.
2921 * TODO: it might be worth recognizing sin and cos using same source
2922 * (i.e. d3d10 sincos opcode). Obviously doing both at the same time
2923 * would be way cheaper than calculating (nearly) everything twice...
2924 * Not sure it's common enough to be worth bothering however, scs
2925 * opcode could also benefit from calculating both though.
2928 lp_build_sin_or_cos(struct lp_build_context
*bld
,
2932 struct gallivm_state
*gallivm
= bld
->gallivm
;
2933 LLVMBuilderRef b
= gallivm
->builder
;
2934 struct lp_type int_type
= lp_int_type(bld
->type
);
2937 * take the absolute value,
2938 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2941 LLVMValueRef inv_sig_mask
= lp_build_const_int_vec(gallivm
, bld
->type
, ~0x80000000);
2942 LLVMValueRef a_v4si
= LLVMBuildBitCast(b
, a
, bld
->int_vec_type
, "a_v4si");
2944 LLVMValueRef absi
= LLVMBuildAnd(b
, a_v4si
, inv_sig_mask
, "absi");
2945 LLVMValueRef x_abs
= LLVMBuildBitCast(b
, absi
, bld
->vec_type
, "x_abs");
2949 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2952 LLVMValueRef FOPi
= lp_build_const_vec(gallivm
, bld
->type
, 1.27323954473516);
2953 LLVMValueRef scale_y
= LLVMBuildFMul(b
, x_abs
, FOPi
, "scale_y");
2956 * store the integer part of y in mm0
2957 * emm2 = _mm_cvttps_epi32(y);
2960 LLVMValueRef emm2_i
= LLVMBuildFPToSI(b
, scale_y
, bld
->int_vec_type
, "emm2_i");
2963 * j=(j+1) & (~1) (see the cephes sources)
2964 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2967 LLVMValueRef all_one
= lp_build_const_int_vec(gallivm
, bld
->type
, 1);
2968 LLVMValueRef emm2_add
= LLVMBuildAdd(b
, emm2_i
, all_one
, "emm2_add");
2970 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2972 LLVMValueRef inv_one
= lp_build_const_int_vec(gallivm
, bld
->type
, ~1);
2973 LLVMValueRef emm2_and
= LLVMBuildAnd(b
, emm2_add
, inv_one
, "emm2_and");
2976 * y = _mm_cvtepi32_ps(emm2);
2978 LLVMValueRef y_2
= LLVMBuildSIToFP(b
, emm2_and
, bld
->vec_type
, "y_2");
2980 LLVMValueRef const_2
= lp_build_const_int_vec(gallivm
, bld
->type
, 2);
2981 LLVMValueRef const_4
= lp_build_const_int_vec(gallivm
, bld
->type
, 4);
2982 LLVMValueRef const_29
= lp_build_const_int_vec(gallivm
, bld
->type
, 29);
2983 LLVMValueRef sign_mask
= lp_build_const_int_vec(gallivm
, bld
->type
, 0x80000000);
2986 * Argument used for poly selection and sign bit determination
2987 * is different for sin vs. cos.
2989 LLVMValueRef emm2_2
= cos
? LLVMBuildSub(b
, emm2_and
, const_2
, "emm2_2") :
2992 LLVMValueRef sign_bit
= cos
? LLVMBuildShl(b
, LLVMBuildAnd(b
, const_4
,
2993 LLVMBuildNot(b
, emm2_2
, ""), ""),
2994 const_29
, "sign_bit") :
2995 LLVMBuildAnd(b
, LLVMBuildXor(b
, a_v4si
,
2996 LLVMBuildShl(b
, emm2_add
,
2998 sign_mask
, "sign_bit");
3001 * get the polynom selection mask
3002 * there is one polynom for 0 <= x <= Pi/4
3003 * and another one for Pi/4<x<=Pi/2
3004 * Both branches will be computed.
3006 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
3007 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
3010 LLVMValueRef emm2_3
= LLVMBuildAnd(b
, emm2_2
, const_2
, "emm2_3");
3011 LLVMValueRef poly_mask
= lp_build_compare(gallivm
,
3012 int_type
, PIPE_FUNC_EQUAL
,
3013 emm2_3
, lp_build_const_int_vec(gallivm
, bld
->type
, 0));
3016 * _PS_CONST(minus_cephes_DP1, -0.78515625);
3017 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
3018 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
3020 LLVMValueRef DP1
= lp_build_const_vec(gallivm
, bld
->type
, -0.78515625);
3021 LLVMValueRef DP2
= lp_build_const_vec(gallivm
, bld
->type
, -2.4187564849853515625e-4);
3022 LLVMValueRef DP3
= lp_build_const_vec(gallivm
, bld
->type
, -3.77489497744594108e-8);
3025 * The magic pass: "Extended precision modular arithmetic"
3026 * x = ((x - y * DP1) - y * DP2) - y * DP3;
3028 LLVMValueRef x_1
= lp_build_fmuladd(b
, y_2
, DP1
, x_abs
);
3029 LLVMValueRef x_2
= lp_build_fmuladd(b
, y_2
, DP2
, x_1
);
3030 LLVMValueRef x_3
= lp_build_fmuladd(b
, y_2
, DP3
, x_2
);
3033 * Evaluate the first polynom (0 <= x <= Pi/4)
3035 * z = _mm_mul_ps(x,x);
3037 LLVMValueRef z
= LLVMBuildFMul(b
, x_3
, x_3
, "z");
3040 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
3041 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
3042 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
3044 LLVMValueRef coscof_p0
= lp_build_const_vec(gallivm
, bld
->type
, 2.443315711809948E-005);
3045 LLVMValueRef coscof_p1
= lp_build_const_vec(gallivm
, bld
->type
, -1.388731625493765E-003);
3046 LLVMValueRef coscof_p2
= lp_build_const_vec(gallivm
, bld
->type
, 4.166664568298827E-002);
3049 * y = *(v4sf*)_ps_coscof_p0;
3050 * y = _mm_mul_ps(y, z);
3052 LLVMValueRef y_4
= lp_build_fmuladd(b
, z
, coscof_p0
, coscof_p1
);
3053 LLVMValueRef y_6
= lp_build_fmuladd(b
, y_4
, z
, coscof_p2
);
3054 LLVMValueRef y_7
= LLVMBuildFMul(b
, y_6
, z
, "y_7");
3055 LLVMValueRef y_8
= LLVMBuildFMul(b
, y_7
, z
, "y_8");
3059 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
3060 * y = _mm_sub_ps(y, tmp);
3061 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
3063 LLVMValueRef half
= lp_build_const_vec(gallivm
, bld
->type
, 0.5);
3064 LLVMValueRef tmp
= LLVMBuildFMul(b
, z
, half
, "tmp");
3065 LLVMValueRef y_9
= LLVMBuildFSub(b
, y_8
, tmp
, "y_8");
3066 LLVMValueRef one
= lp_build_const_vec(gallivm
, bld
->type
, 1.0);
3067 LLVMValueRef y_10
= LLVMBuildFAdd(b
, y_9
, one
, "y_9");
3070 * _PS_CONST(sincof_p0, -1.9515295891E-4);
3071 * _PS_CONST(sincof_p1, 8.3321608736E-3);
3072 * _PS_CONST(sincof_p2, -1.6666654611E-1);
3074 LLVMValueRef sincof_p0
= lp_build_const_vec(gallivm
, bld
->type
, -1.9515295891E-4);
3075 LLVMValueRef sincof_p1
= lp_build_const_vec(gallivm
, bld
->type
, 8.3321608736E-3);
3076 LLVMValueRef sincof_p2
= lp_build_const_vec(gallivm
, bld
->type
, -1.6666654611E-1);
3079 * Evaluate the second polynom (Pi/4 <= x <= 0)
3081 * y2 = *(v4sf*)_ps_sincof_p0;
3082 * y2 = _mm_mul_ps(y2, z);
3083 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
3084 * y2 = _mm_mul_ps(y2, z);
3085 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
3086 * y2 = _mm_mul_ps(y2, z);
3087 * y2 = _mm_mul_ps(y2, x);
3088 * y2 = _mm_add_ps(y2, x);
3091 LLVMValueRef y2_4
= lp_build_fmuladd(b
, z
, sincof_p0
, sincof_p1
);
3092 LLVMValueRef y2_6
= lp_build_fmuladd(b
, y2_4
, z
, sincof_p2
);
3093 LLVMValueRef y2_7
= LLVMBuildFMul(b
, y2_6
, z
, "y2_7");
3094 LLVMValueRef y2_9
= lp_build_fmuladd(b
, y2_7
, x_3
, x_3
);
3097 * select the correct result from the two polynoms
3099 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
3100 * y = _mm_andnot_ps(xmm3, y);
3101 * y = _mm_or_ps(y,y2);
3103 LLVMValueRef y2_i
= LLVMBuildBitCast(b
, y2_9
, bld
->int_vec_type
, "y2_i");
3104 LLVMValueRef y_i
= LLVMBuildBitCast(b
, y_10
, bld
->int_vec_type
, "y_i");
3105 LLVMValueRef y2_and
= LLVMBuildAnd(b
, y2_i
, poly_mask
, "y2_and");
3106 LLVMValueRef poly_mask_inv
= LLVMBuildNot(b
, poly_mask
, "poly_mask_inv");
3107 LLVMValueRef y_and
= LLVMBuildAnd(b
, y_i
, poly_mask_inv
, "y_and");
3108 LLVMValueRef y_combine
= LLVMBuildOr(b
, y_and
, y2_and
, "y_combine");
3112 * y = _mm_xor_ps(y, sign_bit);
3114 LLVMValueRef y_sign
= LLVMBuildXor(b
, y_combine
, sign_bit
, "y_sign");
3115 LLVMValueRef y_result
= LLVMBuildBitCast(b
, y_sign
, bld
->vec_type
, "y_result");
3117 LLVMValueRef isfinite
= lp_build_isfinite(bld
, a
);
3119 /* clamp output to be within [-1, 1] */
3120 y_result
= lp_build_clamp(bld
, y_result
,
3121 lp_build_const_vec(bld
->gallivm
, bld
->type
, -1.f
),
3122 lp_build_const_vec(bld
->gallivm
, bld
->type
, 1.f
));
3123 /* If a is -inf, inf or NaN then return NaN */
3124 y_result
= lp_build_select(bld
, isfinite
, y_result
,
3125 lp_build_const_vec(bld
->gallivm
, bld
->type
, NAN
));
3134 lp_build_sin(struct lp_build_context
*bld
,
3137 return lp_build_sin_or_cos(bld
, a
, FALSE
);
3145 lp_build_cos(struct lp_build_context
*bld
,
3148 return lp_build_sin_or_cos(bld
, a
, TRUE
);
3153 * Generate pow(x, y)
3156 lp_build_pow(struct lp_build_context
*bld
,
3160 /* TODO: optimize the constant case */
3161 if (gallivm_debug
& GALLIVM_DEBUG_PERF
&&
3162 LLVMIsConstant(x
) && LLVMIsConstant(y
)) {
3163 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3167 return lp_build_exp2(bld
, lp_build_mul(bld
, lp_build_log2(bld
, x
), y
));
3175 lp_build_exp(struct lp_build_context
*bld
,
3178 /* log2(e) = 1/log(2) */
3179 LLVMValueRef log2e
= lp_build_const_vec(bld
->gallivm
, bld
->type
,
3180 1.4426950408889634);
3182 assert(lp_check_value(bld
->type
, x
));
3184 return lp_build_exp2(bld
, lp_build_mul(bld
, log2e
, x
));
3190 * Behavior is undefined with infs, 0s and nans
3193 lp_build_log(struct lp_build_context
*bld
,
3197 LLVMValueRef log2
= lp_build_const_vec(bld
->gallivm
, bld
->type
,
3198 0.69314718055994529);
3200 assert(lp_check_value(bld
->type
, x
));
3202 return lp_build_mul(bld
, log2
, lp_build_log2(bld
, x
));
3206 * Generate log(x) that handles edge cases (infs, 0s and nans)
3209 lp_build_log_safe(struct lp_build_context
*bld
,
3213 LLVMValueRef log2
= lp_build_const_vec(bld
->gallivm
, bld
->type
,
3214 0.69314718055994529);
3216 assert(lp_check_value(bld
->type
, x
));
3218 return lp_build_mul(bld
, log2
, lp_build_log2_safe(bld
, x
));
3223 * Generate polynomial.
3224 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
3227 lp_build_polynomial(struct lp_build_context
*bld
,
3229 const double *coeffs
,
3230 unsigned num_coeffs
)
3232 const struct lp_type type
= bld
->type
;
3233 LLVMValueRef even
= NULL
, odd
= NULL
;
3237 assert(lp_check_value(bld
->type
, x
));
3239 /* TODO: optimize the constant case */
3240 if (gallivm_debug
& GALLIVM_DEBUG_PERF
&&
3241 LLVMIsConstant(x
)) {
3242 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3247 * Calculate odd and even terms seperately to decrease data dependency
3249 * c[0] + x^2 * c[2] + x^4 * c[4] ...
3250 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
3252 x2
= lp_build_mul(bld
, x
, x
);
3254 for (i
= num_coeffs
; i
--; ) {
3257 coeff
= lp_build_const_vec(bld
->gallivm
, type
, coeffs
[i
]);
3261 even
= lp_build_mad(bld
, x2
, even
, coeff
);
3266 odd
= lp_build_mad(bld
, x2
, odd
, coeff
);
3273 return lp_build_mad(bld
, odd
, x
, even
);
3282 * Minimax polynomial fit of 2**x, in range [0, 1[
3284 const double lp_build_exp2_polynomial
[] = {
3285 #if EXP_POLY_DEGREE == 5
3286 1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
3287 0.693153073200168932794,
3288 0.240153617044375388211,
3289 0.0558263180532956664775,
3290 0.00898934009049466391101,
3291 0.00187757667519147912699
3292 #elif EXP_POLY_DEGREE == 4
3293 1.00000259337069434683,
3294 0.693003834469974940458,
3295 0.24144275689150793076,
3296 0.0520114606103070150235,
3297 0.0135341679161270268764
3298 #elif EXP_POLY_DEGREE == 3
3299 0.999925218562710312959,
3300 0.695833540494823811697,
3301 0.226067155427249155588,
3302 0.0780245226406372992967
3303 #elif EXP_POLY_DEGREE == 2
3304 1.00172476321474503578,
3305 0.657636275736077639316,
3306 0.33718943461968720704
3314 lp_build_exp2(struct lp_build_context
*bld
,
3317 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
3318 const struct lp_type type
= bld
->type
;
3319 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
3320 LLVMValueRef ipart
= NULL
;
3321 LLVMValueRef fpart
= NULL
;
3322 LLVMValueRef expipart
= NULL
;
3323 LLVMValueRef expfpart
= NULL
;
3324 LLVMValueRef res
= NULL
;
3326 assert(lp_check_value(bld
->type
, x
));
3328 /* TODO: optimize the constant case */
3329 if (gallivm_debug
& GALLIVM_DEBUG_PERF
&&
3330 LLVMIsConstant(x
)) {
3331 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3335 assert(type
.floating
&& type
.width
== 32);
3337 /* We want to preserve NaN and make sure than for exp2 if x > 128,
3338 * the result is INF and if it's smaller than -126.9 the result is 0 */
3339 x
= lp_build_min_ext(bld
, lp_build_const_vec(bld
->gallivm
, type
, 128.0), x
,
3340 GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN
);
3341 x
= lp_build_max_ext(bld
, lp_build_const_vec(bld
->gallivm
, type
, -126.99999),
3342 x
, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN
);
3344 /* ipart = floor(x) */
3345 /* fpart = x - ipart */
3346 lp_build_ifloor_fract(bld
, x
, &ipart
, &fpart
);
3348 /* expipart = (float) (1 << ipart) */
3349 expipart
= LLVMBuildAdd(builder
, ipart
,
3350 lp_build_const_int_vec(bld
->gallivm
, type
, 127), "");
3351 expipart
= LLVMBuildShl(builder
, expipart
,
3352 lp_build_const_int_vec(bld
->gallivm
, type
, 23), "");
3353 expipart
= LLVMBuildBitCast(builder
, expipart
, vec_type
, "");
3355 expfpart
= lp_build_polynomial(bld
, fpart
, lp_build_exp2_polynomial
,
3356 ARRAY_SIZE(lp_build_exp2_polynomial
));
3358 res
= LLVMBuildFMul(builder
, expipart
, expfpart
, "");
3366 * Extract the exponent of a IEEE-754 floating point value.
3368 * Optionally apply an integer bias.
3370 * Result is an integer value with
3372 * ifloor(log2(x)) + bias
3375 lp_build_extract_exponent(struct lp_build_context
*bld
,
3379 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
3380 const struct lp_type type
= bld
->type
;
3381 unsigned mantissa
= lp_mantissa(type
);
3384 assert(type
.floating
);
3386 assert(lp_check_value(bld
->type
, x
));
3388 x
= LLVMBuildBitCast(builder
, x
, bld
->int_vec_type
, "");
3390 res
= LLVMBuildLShr(builder
, x
,
3391 lp_build_const_int_vec(bld
->gallivm
, type
, mantissa
), "");
3392 res
= LLVMBuildAnd(builder
, res
,
3393 lp_build_const_int_vec(bld
->gallivm
, type
, 255), "");
3394 res
= LLVMBuildSub(builder
, res
,
3395 lp_build_const_int_vec(bld
->gallivm
, type
, 127 - bias
), "");
3402 * Extract the mantissa of the a floating.
3404 * Result is a floating point value with
3406 * x / floor(log2(x))
3409 lp_build_extract_mantissa(struct lp_build_context
*bld
,
3412 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
3413 const struct lp_type type
= bld
->type
;
3414 unsigned mantissa
= lp_mantissa(type
);
3415 LLVMValueRef mantmask
= lp_build_const_int_vec(bld
->gallivm
, type
,
3416 (1ULL << mantissa
) - 1);
3417 LLVMValueRef one
= LLVMConstBitCast(bld
->one
, bld
->int_vec_type
);
3420 assert(lp_check_value(bld
->type
, x
));
3422 assert(type
.floating
);
3424 x
= LLVMBuildBitCast(builder
, x
, bld
->int_vec_type
, "");
3426 /* res = x / 2**ipart */
3427 res
= LLVMBuildAnd(builder
, x
, mantmask
, "");
3428 res
= LLVMBuildOr(builder
, res
, one
, "");
3429 res
= LLVMBuildBitCast(builder
, res
, bld
->vec_type
, "");
3437 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x) ,for x in range of [0, 1/9[
3438 * These coefficients can be generate with
3439 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3441 const double lp_build_log2_polynomial
[] = {
3442 #if LOG_POLY_DEGREE == 5
3443 2.88539008148777786488L,
3444 0.961796878841293367824L,
3445 0.577058946784739859012L,
3446 0.412914355135828735411L,
3447 0.308591899232910175289L,
3448 0.352376952300281371868L,
3449 #elif LOG_POLY_DEGREE == 4
3450 2.88539009343309178325L,
3451 0.961791550404184197881L,
3452 0.577440339438736392009L,
3453 0.403343858251329912514L,
3454 0.406718052498846252698L,
3455 #elif LOG_POLY_DEGREE == 3
3456 2.88538959748872753838L,
3457 0.961932915889597772928L,
3458 0.571118517972136195241L,
3459 0.493997535084709500285L,
3466 * See http://www.devmaster.net/forums/showthread.php?p=43580
3467 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3468 * http://www.nezumi.demon.co.uk/consult/logx.htm
3470 * If handle_edge_cases is true the function will perform computations
3471 * to match the required D3D10+ behavior for each of the edge cases.
3472 * That means that if input is:
3473 * - less than zero (to and including -inf) then NaN will be returned
3474 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3475 * - +infinity, then +infinity will be returned
3476 * - NaN, then NaN will be returned
3478 * Those checks are fairly expensive so if you don't need them make sure
3479 * handle_edge_cases is false.
3482 lp_build_log2_approx(struct lp_build_context
*bld
,
3484 LLVMValueRef
*p_exp
,
3485 LLVMValueRef
*p_floor_log2
,
3486 LLVMValueRef
*p_log2
,
3487 boolean handle_edge_cases
)
3489 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
3490 const struct lp_type type
= bld
->type
;
3491 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
3492 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
3494 LLVMValueRef expmask
= lp_build_const_int_vec(bld
->gallivm
, type
, 0x7f800000);
3495 LLVMValueRef mantmask
= lp_build_const_int_vec(bld
->gallivm
, type
, 0x007fffff);
3496 LLVMValueRef one
= LLVMConstBitCast(bld
->one
, int_vec_type
);
3498 LLVMValueRef i
= NULL
;
3499 LLVMValueRef y
= NULL
;
3500 LLVMValueRef z
= NULL
;
3501 LLVMValueRef exp
= NULL
;
3502 LLVMValueRef mant
= NULL
;
3503 LLVMValueRef logexp
= NULL
;
3504 LLVMValueRef p_z
= NULL
;
3505 LLVMValueRef res
= NULL
;
3507 assert(lp_check_value(bld
->type
, x
));
3509 if(p_exp
|| p_floor_log2
|| p_log2
) {
3510 /* TODO: optimize the constant case */
3511 if (gallivm_debug
& GALLIVM_DEBUG_PERF
&&
3512 LLVMIsConstant(x
)) {
3513 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3517 assert(type
.floating
&& type
.width
== 32);
3520 * We don't explicitly handle denormalized numbers. They will yield a
3521 * result in the neighbourhood of -127, which appears to be adequate
3525 i
= LLVMBuildBitCast(builder
, x
, int_vec_type
, "");
3527 /* exp = (float) exponent(x) */
3528 exp
= LLVMBuildAnd(builder
, i
, expmask
, "");
3531 if(p_floor_log2
|| p_log2
) {
3532 logexp
= LLVMBuildLShr(builder
, exp
, lp_build_const_int_vec(bld
->gallivm
, type
, 23), "");
3533 logexp
= LLVMBuildSub(builder
, logexp
, lp_build_const_int_vec(bld
->gallivm
, type
, 127), "");
3534 logexp
= LLVMBuildSIToFP(builder
, logexp
, vec_type
, "");
3538 /* mant = 1 + (float) mantissa(x) */
3539 mant
= LLVMBuildAnd(builder
, i
, mantmask
, "");
3540 mant
= LLVMBuildOr(builder
, mant
, one
, "");
3541 mant
= LLVMBuildBitCast(builder
, mant
, vec_type
, "");
3543 /* y = (mant - 1) / (mant + 1) */
3544 y
= lp_build_div(bld
,
3545 lp_build_sub(bld
, mant
, bld
->one
),
3546 lp_build_add(bld
, mant
, bld
->one
)
3550 z
= lp_build_mul(bld
, y
, y
);
3553 p_z
= lp_build_polynomial(bld
, z
, lp_build_log2_polynomial
,
3554 ARRAY_SIZE(lp_build_log2_polynomial
));
3556 /* y * P(z) + logexp */
3557 res
= lp_build_mad(bld
, y
, p_z
, logexp
);
3559 if (type
.floating
&& handle_edge_cases
) {
3560 LLVMValueRef negmask
, infmask
, zmask
;
3561 negmask
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, x
,
3562 lp_build_const_vec(bld
->gallivm
, type
, 0.0f
));
3563 zmask
= lp_build_cmp(bld
, PIPE_FUNC_EQUAL
, x
,
3564 lp_build_const_vec(bld
->gallivm
, type
, 0.0f
));
3565 infmask
= lp_build_cmp(bld
, PIPE_FUNC_GEQUAL
, x
,
3566 lp_build_const_vec(bld
->gallivm
, type
, INFINITY
));
3568 /* If x is qual to inf make sure we return inf */
3569 res
= lp_build_select(bld
, infmask
,
3570 lp_build_const_vec(bld
->gallivm
, type
, INFINITY
),
3572 /* If x is qual to 0, return -inf */
3573 res
= lp_build_select(bld
, zmask
,
3574 lp_build_const_vec(bld
->gallivm
, type
, -INFINITY
),
3576 /* If x is nan or less than 0, return nan */
3577 res
= lp_build_select(bld
, negmask
,
3578 lp_build_const_vec(bld
->gallivm
, type
, NAN
),
3584 exp
= LLVMBuildBitCast(builder
, exp
, vec_type
, "");
3589 *p_floor_log2
= logexp
;
3597 * log2 implementation which doesn't have special code to
3598 * handle edge cases (-inf, 0, inf, NaN). It's faster but
3599 * the results for those cases are undefined.
3602 lp_build_log2(struct lp_build_context
*bld
,
3606 lp_build_log2_approx(bld
, x
, NULL
, NULL
, &res
, FALSE
);
3611 * Version of log2 which handles all edge cases.
3612 * Look at documentation of lp_build_log2_approx for
3613 * description of the behavior for each of the edge cases.
3616 lp_build_log2_safe(struct lp_build_context
*bld
,
3620 lp_build_log2_approx(bld
, x
, NULL
, NULL
, &res
, TRUE
);
3626 * Faster (and less accurate) log2.
3628 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3630 * Piece-wise linear approximation, with exact results when x is a
3633 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3636 lp_build_fast_log2(struct lp_build_context
*bld
,
3639 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
3643 assert(lp_check_value(bld
->type
, x
));
3645 assert(bld
->type
.floating
);
3647 /* ipart = floor(log2(x)) - 1 */
3648 ipart
= lp_build_extract_exponent(bld
, x
, -1);
3649 ipart
= LLVMBuildSIToFP(builder
, ipart
, bld
->vec_type
, "");
3651 /* fpart = x / 2**ipart */
3652 fpart
= lp_build_extract_mantissa(bld
, x
);
3655 return LLVMBuildFAdd(builder
, ipart
, fpart
, "");
3660 * Fast implementation of iround(log2(x)).
3662 * Not an approximation -- it should give accurate results all the time.
3665 lp_build_ilog2(struct lp_build_context
*bld
,
3668 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
3669 LLVMValueRef sqrt2
= lp_build_const_vec(bld
->gallivm
, bld
->type
, M_SQRT2
);
3672 assert(bld
->type
.floating
);
3674 assert(lp_check_value(bld
->type
, x
));
3676 /* x * 2^(0.5) i.e., add 0.5 to the log2(x) */
3677 x
= LLVMBuildFMul(builder
, x
, sqrt2
, "");
3679 /* ipart = floor(log2(x) + 0.5) */
3680 ipart
= lp_build_extract_exponent(bld
, x
, 0);
3686 lp_build_mod(struct lp_build_context
*bld
,
3690 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
3692 const struct lp_type type
= bld
->type
;
3694 assert(lp_check_value(type
, x
));
3695 assert(lp_check_value(type
, y
));
3698 res
= LLVMBuildFRem(builder
, x
, y
, "");
3700 res
= LLVMBuildSRem(builder
, x
, y
, "");
3702 res
= LLVMBuildURem(builder
, x
, y
, "");
3708 * For floating inputs it creates and returns a mask
3709 * which is all 1's for channels which are NaN.
3710 * Channels inside x which are not NaN will be 0.
3713 lp_build_isnan(struct lp_build_context
*bld
,
3717 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(bld
->gallivm
, bld
->type
);
3719 assert(bld
->type
.floating
);
3720 assert(lp_check_value(bld
->type
, x
));
3722 mask
= LLVMBuildFCmp(bld
->gallivm
->builder
, LLVMRealOEQ
, x
, x
,
3724 mask
= LLVMBuildNot(bld
->gallivm
->builder
, mask
, "");
3725 mask
= LLVMBuildSExt(bld
->gallivm
->builder
, mask
, int_vec_type
, "isnan");
3729 /* Returns all 1's for floating point numbers that are
3730 * finite numbers and returns all zeros for -inf,
3733 lp_build_isfinite(struct lp_build_context
*bld
,
3736 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
3737 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(bld
->gallivm
, bld
->type
);
3738 struct lp_type int_type
= lp_int_type(bld
->type
);
3739 LLVMValueRef intx
= LLVMBuildBitCast(builder
, x
, int_vec_type
, "");
3740 LLVMValueRef infornan32
= lp_build_const_int_vec(bld
->gallivm
, bld
->type
,
3743 if (!bld
->type
.floating
) {
3744 return lp_build_const_int_vec(bld
->gallivm
, bld
->type
, 0);
3746 assert(bld
->type
.floating
);
3747 assert(lp_check_value(bld
->type
, x
));
3748 assert(bld
->type
.width
== 32);
3750 intx
= LLVMBuildAnd(builder
, intx
, infornan32
, "");
3751 return lp_build_compare(bld
->gallivm
, int_type
, PIPE_FUNC_NOTEQUAL
,
3756 * Returns true if the number is nan or inf and false otherwise.
3757 * The input has to be a floating point vector.
3760 lp_build_is_inf_or_nan(struct gallivm_state
*gallivm
,
3761 const struct lp_type type
,
3764 LLVMBuilderRef builder
= gallivm
->builder
;
3765 struct lp_type int_type
= lp_int_type(type
);
3766 LLVMValueRef const0
= lp_build_const_int_vec(gallivm
, int_type
,
3770 assert(type
.floating
);
3772 ret
= LLVMBuildBitCast(builder
, x
, lp_build_vec_type(gallivm
, int_type
), "");
3773 ret
= LLVMBuildAnd(builder
, ret
, const0
, "");
3774 ret
= lp_build_compare(gallivm
, int_type
, PIPE_FUNC_EQUAL
,
3782 lp_build_fpstate_get(struct gallivm_state
*gallivm
)
3784 if (util_cpu_caps
.has_sse
) {
3785 LLVMBuilderRef builder
= gallivm
->builder
;
3786 LLVMValueRef mxcsr_ptr
= lp_build_alloca(
3788 LLVMInt32TypeInContext(gallivm
->context
),
3790 LLVMValueRef mxcsr_ptr8
= LLVMBuildPointerCast(builder
, mxcsr_ptr
,
3791 LLVMPointerType(LLVMInt8TypeInContext(gallivm
->context
), 0), "");
3792 lp_build_intrinsic(builder
,
3793 "llvm.x86.sse.stmxcsr",
3794 LLVMVoidTypeInContext(gallivm
->context
),
3802 lp_build_fpstate_set_denorms_zero(struct gallivm_state
*gallivm
,
3805 if (util_cpu_caps
.has_sse
) {
3806 /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
3807 int daz_ftz
= _MM_FLUSH_ZERO_MASK
;
3809 LLVMBuilderRef builder
= gallivm
->builder
;
3810 LLVMValueRef mxcsr_ptr
= lp_build_fpstate_get(gallivm
);
3811 LLVMValueRef mxcsr
=
3812 LLVMBuildLoad(builder
, mxcsr_ptr
, "mxcsr");
3814 if (util_cpu_caps
.has_daz
) {
3815 /* Enable denormals are zero mode */
3816 daz_ftz
|= _MM_DENORMALS_ZERO_MASK
;
3819 mxcsr
= LLVMBuildOr(builder
, mxcsr
,
3820 LLVMConstInt(LLVMTypeOf(mxcsr
), daz_ftz
, 0), "");
3822 mxcsr
= LLVMBuildAnd(builder
, mxcsr
,
3823 LLVMConstInt(LLVMTypeOf(mxcsr
), ~daz_ftz
, 0), "");
3826 LLVMBuildStore(builder
, mxcsr
, mxcsr_ptr
);
3827 lp_build_fpstate_set(gallivm
, mxcsr_ptr
);
3832 lp_build_fpstate_set(struct gallivm_state
*gallivm
,
3833 LLVMValueRef mxcsr_ptr
)
3835 if (util_cpu_caps
.has_sse
) {
3836 LLVMBuilderRef builder
= gallivm
->builder
;
3837 mxcsr_ptr
= LLVMBuildPointerCast(builder
, mxcsr_ptr
,
3838 LLVMPointerType(LLVMInt8TypeInContext(gallivm
->context
), 0), "");
3839 lp_build_intrinsic(builder
,
3840 "llvm.x86.sse.ldmxcsr",
3841 LLVMVoidTypeInContext(gallivm
->context
),