/**************************************************************************
 *
 * Copyright 2009-2010 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

/**
 * @file
 * Helper
 *
 * LLVM IR doesn't support all basic arithmetic operations we care about (most
 * notably min/max and saturated operations), and it is often necessary to
 * resort to machine-specific intrinsics directly. The functions here hide all
 * these implementation details from the other modules.
 *
 * We also do simple expression simplification here. Reasons are:
 * - it is very easy given we have all necessary information readily available
 * - LLVM optimization passes fail to simplify several vector expressions
 * - We often know value constraints which the optimization passes have no way
 *   of knowing, such as when source arguments are known to be in [0, 1] range.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */

#include <llvm/Config/llvm-config.h>

#include "util/u_memory.h"
#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/u_cpu_detect.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_init.h"
#include "lp_bld_intr.h"
#include "lp_bld_logic.h"
#include "lp_bld_pack.h"
#include "lp_bld_debug.h"
#include "lp_bld_bitarit.h"
#include "lp_bld_arit.h"
#include "lp_bld_flow.h"

#if defined(PIPE_ARCH_SSE)
#include <xmmintrin.h>
#endif

#ifndef _MM_DENORMALS_ZERO_MASK
#define _MM_DENORMALS_ZERO_MASK 0x0040
#endif

#ifndef _MM_FLUSH_ZERO_MASK
#define _MM_FLUSH_ZERO_MASK 0x8000
#endif

#define EXP_POLY_DEGREE 5

#define LOG_POLY_DEGREE 4


/**
 * Generate min(a, b)
 * No checks for special case values of a or b = 1 or 0 are done.
 * NaN's are handled according to the behavior specified by the
 * nan_behavior argument.
 */
static LLVMValueRef
lp_build_min_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b,
                    enum gallivm_nan_behavior nan_behavior)
{
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   unsigned intr_size = 0;
   LLVMValueRef cond;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* TODO: optimize the constant case */

   if (type.floating && util_cpu_caps.has_sse) {
      if (type.width == 32) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse.min.ss";
            intr_size = 128;
         }
         else if (type.length <= 4 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse.min.ps";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.min.ps.256";
            intr_size = 256;
         }
      }
      if (type.width == 64 && util_cpu_caps.has_sse2) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse2.min.sd";
            intr_size = 128;
         }
         else if (type.length == 2 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse2.min.pd";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.min.pd.256";
            intr_size = 256;
         }
      }
   }
   else if (type.floating && util_cpu_caps.has_altivec) {
      if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
          nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
         debug_printf("%s: altivec doesn't support nan return nan behavior\n",
                      __FUNCTION__);
      }
      if (type.width == 32 && type.length == 4) {
         intrinsic = "llvm.ppc.altivec.vminfp";
         intr_size = 128;
      }
   } else if ((LLVM_VERSION_MAJOR < 3 || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR < 9)) &&
              util_cpu_caps.has_avx2 && type.length > 4) {
      intr_size = 256;
      switch (type.width) {
      case 8:
         intrinsic = type.sign ? "llvm.x86.avx2.pmins.b" : "llvm.x86.avx2.pminu.b";
         break;
      case 16:
         intrinsic = type.sign ? "llvm.x86.avx2.pmins.w" : "llvm.x86.avx2.pminu.w";
         break;
      case 32:
         intrinsic = type.sign ? "llvm.x86.avx2.pmins.d" : "llvm.x86.avx2.pminu.d";
         break;
      }
   } else if ((LLVM_VERSION_MAJOR < 3 || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR < 9)) &&
              util_cpu_caps.has_sse2 && type.length >= 2) {
      intr_size = 128;
      if ((type.width == 8 || type.width == 16) &&
          (type.width * type.length <= 64) &&
          (gallivm_debug & GALLIVM_DEBUG_PERF)) {
         debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
                      __FUNCTION__);
      }
      if (type.width == 8 && !type.sign) {
         intrinsic = "llvm.x86.sse2.pminu.b";
      }
      else if (type.width == 16 && type.sign) {
         intrinsic = "llvm.x86.sse2.pmins.w";
      }
      if (util_cpu_caps.has_sse4_1) {
         if (type.width == 8 && type.sign) {
            intrinsic = "llvm.x86.sse41.pminsb";
         }
         if (type.width == 16 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pminuw";
         }
         if (type.width == 32 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pminud";
         }
         if (type.width == 32 && type.sign) {
            intrinsic = "llvm.x86.sse41.pminsd";
         }
      }
   } else if (util_cpu_caps.has_altivec) {
      intr_size = 128;
      if (type.width == 8) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vminub";
         } else {
            intrinsic = "llvm.ppc.altivec.vminsb";
         }
      } else if (type.width == 16) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vminuh";
         } else {
            intrinsic = "llvm.ppc.altivec.vminsh";
         }
      } else if (type.width == 32) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vminuw";
         } else {
            intrinsic = "llvm.ppc.altivec.vminsw";
         }
      }
   }

   if (intrinsic) {
      /* We need to handle NaNs for floating point numbers. If one of the
       * inputs is NaN the other should be returned (required by both D3D10+
       * and OpenCL).
       * The SSE intrinsics return the second operand in case of NaN by
       * default, so we need special code to handle those.
       */
      if (util_cpu_caps.has_sse && type.floating &&
          nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
          nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
          nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
         LLVMValueRef isnan, min;
         min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                   type,
                                                   intr_size, a, b);
         if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
            isnan = lp_build_isnan(bld, b);
            return lp_build_select(bld, isnan, a, min);
         } else {
            assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
            isnan = lp_build_isnan(bld, a);
            return lp_build_select(bld, isnan, a, min);
         }
      } else {
         return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                    type,
                                                    intr_size, a, b);
      }
   }

   if (type.floating) {
      switch (nan_behavior) {
      case GALLIVM_NAN_RETURN_NAN: {
         LLVMValueRef isnan = lp_build_isnan(bld, b);
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
         cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
         return lp_build_select(bld, cond, a, b);
      }
         break;
      case GALLIVM_NAN_RETURN_OTHER: {
         LLVMValueRef isnan = lp_build_isnan(bld, a);
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
         cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
         return lp_build_select(bld, cond, a, b);
      }
         break;
      case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
         cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
         return lp_build_select(bld, cond, a, b);
      case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
         return lp_build_select(bld, cond, b, a);
      case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
         return lp_build_select(bld, cond, a, b);
      default:
         assert(0);
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
         return lp_build_select(bld, cond, a, b);
      }
   } else {
      cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
      return lp_build_select(bld, cond, a, b);
   }
}
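
/*
 * A quick illustration of the nan_behavior choices handled above: with
 * a = NaN and b = 3.0, GALLIVM_NAN_RETURN_OTHER yields 3.0 and
 * GALLIVM_NAN_RETURN_NAN yields NaN, while GALLIVM_NAN_BEHAVIOR_UNDEFINED
 * takes the plain cmp/select path and happens to return b, because the
 * ordered comparison "NaN < 3.0" is false.
 */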

LLVMValueRef
lp_build_fmuladd(LLVMBuilderRef builder,
                 LLVMValueRef a,
                 LLVMValueRef b,
                 LLVMValueRef c)
{
   LLVMTypeRef type = LLVMTypeOf(a);
   assert(type == LLVMTypeOf(b));
   assert(type == LLVMTypeOf(c));
   if (LLVM_VERSION_MAJOR < 3 || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR < 4)) {
      /* XXX: LLVM 3.3 does not break llvm.fmuladd down into mul+add when FMA
       * is not supported, and instead falls back to a C function.
       */
      return LLVMBuildFAdd(builder, LLVMBuildFMul(builder, a, b, ""), c, "");
   }
   char intrinsic[32];
   lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
   LLVMValueRef args[] = { a, b, c };
   return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0);
}


/**
 * Generate max(a, b)
 * No checks for special case values of a or b = 1 or 0 are done.
 * NaN's are handled according to the behavior specified by the
 * nan_behavior argument.
 */
static LLVMValueRef
lp_build_max_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b,
                    enum gallivm_nan_behavior nan_behavior)
{
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   unsigned intr_size = 0;
   LLVMValueRef cond;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* TODO: optimize the constant case */

   if (type.floating && util_cpu_caps.has_sse) {
      if (type.width == 32) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse.max.ss";
            intr_size = 128;
         }
         else if (type.length <= 4 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse.max.ps";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.max.ps.256";
            intr_size = 256;
         }
      }
      if (type.width == 64 && util_cpu_caps.has_sse2) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse2.max.sd";
            intr_size = 128;
         }
         else if (type.length == 2 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse2.max.pd";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.max.pd.256";
            intr_size = 256;
         }
      }
   }
   else if (type.floating && util_cpu_caps.has_altivec) {
      if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
          nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
         debug_printf("%s: altivec doesn't support nan return nan behavior\n",
                      __FUNCTION__);
      }
      if (type.width == 32 && type.length == 4) {
         intrinsic = "llvm.ppc.altivec.vmaxfp";
         intr_size = 128;
      }
   } else if ((LLVM_VERSION_MAJOR < 3 || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR < 9)) &&
              util_cpu_caps.has_avx2 && type.length > 4) {
      intr_size = 256;
      switch (type.width) {
      case 8:
         intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.b" : "llvm.x86.avx2.pmaxu.b";
         break;
      case 16:
         intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.w" : "llvm.x86.avx2.pmaxu.w";
         break;
      case 32:
         intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.d" : "llvm.x86.avx2.pmaxu.d";
         break;
      }
   } else if ((LLVM_VERSION_MAJOR < 3 || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR < 9)) &&
              util_cpu_caps.has_sse2 && type.length >= 2) {
      intr_size = 128;
      if ((type.width == 8 || type.width == 16) &&
          (type.width * type.length <= 64) &&
          (gallivm_debug & GALLIVM_DEBUG_PERF)) {
         debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
                      __FUNCTION__);
      }
      if (type.width == 8 && !type.sign) {
         intrinsic = "llvm.x86.sse2.pmaxu.b";
      }
      else if (type.width == 16 && type.sign) {
         intrinsic = "llvm.x86.sse2.pmaxs.w";
      }
      if (util_cpu_caps.has_sse4_1) {
         if (type.width == 8 && type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxsb";
         }
         if (type.width == 16 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxuw";
         }
         if (type.width == 32 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxud";
         }
         if (type.width == 32 && type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxsd";
         }
      }
   } else if (util_cpu_caps.has_altivec) {
      intr_size = 128;
      if (type.width == 8) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vmaxub";
         } else {
            intrinsic = "llvm.ppc.altivec.vmaxsb";
         }
      } else if (type.width == 16) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vmaxuh";
         } else {
            intrinsic = "llvm.ppc.altivec.vmaxsh";
         }
      } else if (type.width == 32) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vmaxuw";
         } else {
            intrinsic = "llvm.ppc.altivec.vmaxsw";
         }
      }
   }

   if (intrinsic) {
      if (util_cpu_caps.has_sse && type.floating &&
          nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
          nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
          nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
         LLVMValueRef isnan, max;
         max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                   type,
                                                   intr_size, a, b);
         if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
            isnan = lp_build_isnan(bld, b);
            return lp_build_select(bld, isnan, a, max);
         } else {
            assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
            isnan = lp_build_isnan(bld, a);
            return lp_build_select(bld, isnan, a, max);
         }
      } else {
         return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                    type,
                                                    intr_size, a, b);
      }
   }

   if (type.floating) {
      switch (nan_behavior) {
      case GALLIVM_NAN_RETURN_NAN: {
         LLVMValueRef isnan = lp_build_isnan(bld, b);
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
         cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
         return lp_build_select(bld, cond, a, b);
      }
         break;
      case GALLIVM_NAN_RETURN_OTHER: {
         LLVMValueRef isnan = lp_build_isnan(bld, a);
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
         cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
         return lp_build_select(bld, cond, a, b);
      }
         break;
      case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
         cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
         return lp_build_select(bld, cond, a, b);
      case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
         return lp_build_select(bld, cond, b, a);
      case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
         return lp_build_select(bld, cond, a, b);
      default:
         assert(0);
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
         return lp_build_select(bld, cond, a, b);
      }
   } else {
      cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
      return lp_build_select(bld, cond, a, b);
   }
}


/**
 * Generate 1 - a, or ~a depending on bld->type.
 */
LLVMValueRef
lp_build_comp(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   if(a == bld->one)
      return bld->zero;
   if(a == bld->zero)
      return bld->one;

   if(type.norm && !type.floating && !type.fixed && !type.sign) {
      if(LLVMIsConstant(a))
         return LLVMConstNot(a);
      else
         return LLVMBuildNot(builder, a, "");
   }

   if(LLVMIsConstant(a))
      if (type.floating)
          return LLVMConstFSub(bld->one, a);
      else
          return LLVMConstSub(bld->one, a);
   else
      if (type.floating)
         return LLVMBuildFSub(builder, bld->one, a, "");
      else
         return LLVMBuildSub(builder, bld->one, a, "");
}


/**
 * Generate a + b
 */
LLVMValueRef
lp_build_add(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if (a == bld->zero)
      return b;
   if (b == bld->zero)
      return a;
   if (a == bld->undef || b == bld->undef)
      return bld->undef;

   if (type.norm) {
      const char *intrinsic = NULL;

      if (!type.sign && (a == bld->one || b == bld->one))
        return bld->one;

      if (!type.floating && !type.fixed) {
         if (LLVM_VERSION_MAJOR >= 9) {
            char intrin[32];
            intrinsic = type.sign ? "llvm.sadd.sat" : "llvm.uadd.sat";
            lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
            return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
         }
         if (type.width * type.length == 128) {
            if (util_cpu_caps.has_sse2) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.x86.sse2.padds.b" :
                              LLVM_VERSION_MAJOR < 8 ? "llvm.x86.sse2.paddus.b" : NULL;
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.x86.sse2.padds.w" :
                              LLVM_VERSION_MAJOR < 8 ? "llvm.x86.sse2.paddus.w" : NULL;
            } else if (util_cpu_caps.has_altivec) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
            }
         }
         if (type.width * type.length == 256) {
            if (util_cpu_caps.has_avx2) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.x86.avx2.padds.b" :
                              LLVM_VERSION_MAJOR < 8 ? "llvm.x86.avx2.paddus.b" : NULL;
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.x86.avx2.padds.w" :
                              LLVM_VERSION_MAJOR < 8 ? "llvm.x86.avx2.paddus.w" : NULL;
            }
         }
      }

      if (intrinsic)
         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
   }

   if(type.norm && !type.floating && !type.fixed) {
      if (type.sign) {
         uint64_t sign = (uint64_t)1 << (type.width - 1);
         LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
         LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
         /* a_clamp_max is the maximum a for positive b,
            a_clamp_min is the minimum a for negative b. */
         LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
         LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
         a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
      }
   }

   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      if (type.floating)
         res = LLVMConstFAdd(a, b);
      else
         res = LLVMConstAdd(a, b);
   else
      if (type.floating)
         res = LLVMBuildFAdd(builder, a, b, "");
      else
         res = LLVMBuildAdd(builder, a, b, "");

   /* clamp to ceiling of 1.0 */
   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);

   if (type.norm && !type.floating && !type.fixed) {
      if (!type.sign) {
         /*
          * newer llvm versions no longer support the intrinsics, but recognize
          * the pattern. Since auto-upgrade of intrinsics doesn't work for jit
          * code, it is important we match the pattern llvm uses (and pray llvm
          * doesn't change it - and hope they decide on the same pattern for
          * all backends supporting it...).
          * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
          * interfere with llvm's ability to recognize the pattern but seems
          * a bit more complicated.
          * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
          */
         LLVMValueRef overflowed = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, res);
         res = lp_build_select(bld, overflowed,
                               LLVMConstAllOnes(bld->int_vec_type), res);
      }
   }

   /* XXX clamp to floor of -1 or 0??? */

   return res;
}
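
/*
 * Worked example of the signed saturation logic above, for 8-bit norm
 * values a = 100 and b = 50: max_val = 127, so a gets clamped to
 * min(100, 127 - 50) = 77 and the final add produces 77 + 50 = 127, the
 * saturated result, without the intermediate sum ever overflowing.
 */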


/** Return the scalar sum of the elements of a.
 * Should avoid this operation whenever possible.
 */
LLVMValueRef
lp_build_horizontal_add(struct lp_build_context *bld,
                        LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef index, res;
   unsigned i, length;
   LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
   LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
   LLVMValueRef vecres, elem2;

   assert(lp_check_value(type, a));

   if (type.length == 1) {
      return a;
   }

   assert(!bld->type.norm);

   /*
    * for byte vectors can do much better with psadbw.
    * Using repeated shuffle/adds here. Note with multiple vectors
    * this can be done more efficiently as outlined in the intel
    * optimization manual.
    * Note: could cause data rearrangement if used with smaller element
    * sizes.
    */

   vecres = a;
   length = type.length / 2;
   while (length > 1) {
      LLVMValueRef vec1, vec2;
      for (i = 0; i < length; i++) {
         shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
         shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
      }
      vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
                                    LLVMConstVector(shuffles1, length), "");
      vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
                                    LLVMConstVector(shuffles2, length), "");
      if (type.floating) {
         vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
      }
      else {
         vecres = LLVMBuildAdd(builder, vec1, vec2, "");
      }
      length = length >> 1;
   }

   /* always have vector of size 2 here */
   assert(length == 1);

   index = lp_build_const_int32(bld->gallivm, 0);
   res = LLVMBuildExtractElement(builder, vecres, index, "");
   index = lp_build_const_int32(bld->gallivm, 1);
   elem2 = LLVMBuildExtractElement(builder, vecres, index, "");

   if (type.floating)
      res = LLVMBuildFAdd(builder, res, elem2, "");
   else
      res = LLVMBuildAdd(builder, res, elem2, "");

   return res;
}
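
/*
 * For a 4-wide vector {a, b, c, d} the reduction above first computes
 * {a, b} + {c, d} = {a+c, b+d}, then leaves the loop and adds the two
 * remaining elements, yielding (a+c) + (b+d).
 */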


/**
 * Return the horizontal sums of 4 float vectors as a float4 vector.
 * This uses the technique as outlined in Intel Optimization Manual.
 */
static LLVMValueRef
lp_build_horizontal_add4x4f(struct lp_build_context *bld,
                            LLVMValueRef src[4])
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef shuffles[4];
   LLVMValueRef tmp[4];
   LLVMValueRef sumtmp[2], shuftmp[2];

   /* lower half of regs */
   shuffles[0] = lp_build_const_int32(gallivm, 0);
   shuffles[1] = lp_build_const_int32(gallivm, 1);
   shuffles[2] = lp_build_const_int32(gallivm, 4);
   shuffles[3] = lp_build_const_int32(gallivm, 5);
   tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
                                   LLVMConstVector(shuffles, 4), "");
   tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
                                   LLVMConstVector(shuffles, 4), "");

   /* upper half of regs */
   shuffles[0] = lp_build_const_int32(gallivm, 2);
   shuffles[1] = lp_build_const_int32(gallivm, 3);
   shuffles[2] = lp_build_const_int32(gallivm, 6);
   shuffles[3] = lp_build_const_int32(gallivm, 7);
   tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
                                   LLVMConstVector(shuffles, 4), "");
   tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
                                   LLVMConstVector(shuffles, 4), "");

   sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
   sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");

   shuffles[0] = lp_build_const_int32(gallivm, 0);
   shuffles[1] = lp_build_const_int32(gallivm, 2);
   shuffles[2] = lp_build_const_int32(gallivm, 4);
   shuffles[3] = lp_build_const_int32(gallivm, 6);
   shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
                                       LLVMConstVector(shuffles, 4), "");

   shuffles[0] = lp_build_const_int32(gallivm, 1);
   shuffles[1] = lp_build_const_int32(gallivm, 3);
   shuffles[2] = lp_build_const_int32(gallivm, 5);
   shuffles[3] = lp_build_const_int32(gallivm, 7);
   shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
                                       LLVMConstVector(shuffles, 4), "");

   return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
}
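
/*
 * Data flow sketch of the shuffles above for inputs x, y, z, w (each 4-wide):
 *   tmp[0] = {x0, x1, y0, y1}     tmp[1] = {x2, x3, y2, y3}
 *   tmp[2] = {z0, z1, w0, w1}     tmp[3] = {z2, z3, w2, w3}
 *   sumtmp[0] = {x0+x2, x1+x3, y0+y2, y1+y3}
 *   sumtmp[1] = {z0+z2, z1+z3, w0+w2, w1+w3}
 * The final even/odd shuffles plus add then give
 *   {sum(x), sum(y), sum(z), sum(w)}.
 */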


/*
 * partially horizontally add 2-4 float vectors with length nx4,
 * i.e. only four adjacent values in each vector will be added,
 * assuming values are really grouped in 4 which also determines
 * output order.
 *
 * Return a vector of the same length as the initial vectors,
 * with the excess elements (if any) being undefined.
 * The element order is independent of number of input vectors.
 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
 * the output order thus will be
 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
 */
LLVMValueRef
lp_build_hadd_partial4(struct lp_build_context *bld,
                       LLVMValueRef vectors[],
                       unsigned num_vecs)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef ret_vec;
   LLVMValueRef tmp[4];
   const char *intrinsic = NULL;

   assert(num_vecs >= 2 && num_vecs <= 4);
   assert(bld->type.floating);

   /* only use this with at least 2 vectors, as it is sort of expensive
    * (depending on cpu) and we always need two horizontal adds anyway,
    * so a shuffle/add approach might be better.
    */

   tmp[0] = vectors[0];
   tmp[1] = vectors[1];

   tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
   tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];

   if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
       bld->type.length == 4) {
      intrinsic = "llvm.x86.sse3.hadd.ps";
   }
   else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
            bld->type.length == 8) {
      intrinsic = "llvm.x86.avx.hadd.ps.256";
   }

   if (intrinsic) {
      tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
                                         lp_build_vec_type(gallivm, bld->type),
                                         tmp[0], tmp[1]);
      if (num_vecs > 2) {
         tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
                                            lp_build_vec_type(gallivm, bld->type),
                                            tmp[2], tmp[3]);
      }
      else {
         tmp[1] = tmp[0];
      }
      return lp_build_intrinsic_binary(builder, intrinsic,
                                       lp_build_vec_type(gallivm, bld->type),
                                       tmp[0], tmp[1]);
   }

   if (bld->type.length == 4) {
      ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
   }
   else {
      LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
      unsigned j;
      unsigned num_iter = bld->type.length / 4;
      struct lp_type parttype = bld->type;
      parttype.length = 4;
      for (j = 0; j < num_iter; j++) {
         LLVMValueRef partsrc[4];
         unsigned i;
         for (i = 0; i < 4; i++) {
            partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
         }
         partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
      }
      ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
   }
   return ret_vec;
}


/**
 * Generate a - b
 */
LLVMValueRef
lp_build_sub(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if (b == bld->zero)
      return a;
   if (a == bld->undef || b == bld->undef)
      return bld->undef;
   if (a == b)
      return bld->zero;

   if (type.norm) {
      const char *intrinsic = NULL;

      if (!type.sign && b == bld->one)
        return bld->zero;

      if (!type.floating && !type.fixed) {
         if (LLVM_VERSION_MAJOR >= 9) {
            char intrin[32];
            intrinsic = type.sign ? "llvm.ssub.sat" : "llvm.usub.sat";
            lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
            return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
         }
         if (type.width * type.length == 128) {
            if (util_cpu_caps.has_sse2) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" :
                              LLVM_VERSION_MAJOR < 8 ? "llvm.x86.sse2.psubus.b" : NULL;
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" :
                              LLVM_VERSION_MAJOR < 8 ? "llvm.x86.sse2.psubus.w" : NULL;
            } else if (util_cpu_caps.has_altivec) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
            }
         }
         if (type.width * type.length == 256) {
            if (util_cpu_caps.has_avx2) {
               if (type.width == 8)
                  intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" :
                              LLVM_VERSION_MAJOR < 8 ? "llvm.x86.avx2.psubus.b" : NULL;
               if (type.width == 16)
                  intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" :
                              LLVM_VERSION_MAJOR < 8 ? "llvm.x86.avx2.psubus.w" : NULL;
            }
         }
      }

      if (intrinsic)
         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
   }

   if(type.norm && !type.floating && !type.fixed) {
      if (type.sign) {
         uint64_t sign = (uint64_t)1 << (type.width - 1);
         LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
         LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
         /* a_clamp_max is the maximum a for negative b,
            a_clamp_min is the minimum a for positive b. */
         LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
         LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
         a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
      } else {
         /*
          * This must match llvm pattern for saturated unsigned sub.
          * (lp_build_max_simple actually does the job with its current
          * definition but do it explicitly here.)
          * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
          * interfere with llvm's ability to recognize the pattern but seems
          * a bit more complicated.
          * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
          */
         LLVMValueRef no_ov = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
         a = lp_build_select(bld, no_ov, a, b);
      }
   }

   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      if (type.floating)
         res = LLVMConstFSub(a, b);
      else
         res = LLVMConstSub(a, b);
   else
      if (type.floating)
         res = LLVMBuildFSub(builder, a, b, "");
      else
         res = LLVMBuildSub(builder, a, b, "");

   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);

   return res;
}


/**
 * Normalized multiplication.
 *
 * There are several approaches for (using 8-bit normalized multiplication as
 * an example):
 *
 * - alpha plus one
 *
 *     makes the following approximation to the division (Sree)
 *
 *       a*b/255 ~= (a*(b + 1)) >> 8
 *
 *     which is the fastest method that satisfies the following OpenGL
 *     criteria of
 *
 *       0*0 = 0 and 255*255 = 255
 *
 * - geometric series
 *
 *     takes the geometric series approximation to the division
 *
 *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
 *
 *     in this case just the first two terms to fit in 16bit arithmetic
 *
 *       t/255 ~= (t + (t >> 8)) >> 8
 *
 *     note that just by itself it doesn't satisfy the OpenGL criteria, as
 *     255*255 = 254, so the special case b = 255 must be accounted for, or
 *     roundoff must be used.
 *
 * - geometric series plus rounding
 *
 *     when using a geometric series division instead of truncating the result
 *     use roundoff in the approximation (Jim Blinn)
 *
 *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
 *
 *     achieving exact results.
 *
 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
 *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
 * @sa Michael Herf, The "double blend trick", May 2000,
 *     http://www.stereopsis.com/doubleblend.html
 */
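
/*
 * A quick check of the rounded geometric series at the 8-bit endpoints:
 * for a = b = 255, t = 65025, t >> 8 = 254, and (65025 + 254 + 0x80) >> 8 =
 * 65407 >> 8 = 255; for a = 255, b = 0, t = 0 and (0 + 0 + 0x80) >> 8 = 0.
 * Both OpenGL criteria therefore hold exactly.
 */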
LLVMValueRef
lp_build_mul_norm(struct gallivm_state *gallivm,
                  struct lp_type wide_type,
                  LLVMValueRef a, LLVMValueRef b)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_build_context bld;
   unsigned n;
   LLVMValueRef half;
   LLVMValueRef ab;

   assert(!wide_type.floating);
   assert(lp_check_value(wide_type, a));
   assert(lp_check_value(wide_type, b));

   lp_build_context_init(&bld, gallivm, wide_type);

   n = wide_type.width / 2;
   if (wide_type.sign) {
      --n;
   }

   /*
    * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
    * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
    */

   /*
    * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
    */

   ab = LLVMBuildMul(builder, a, b, "");
   ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");

   /*
    * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
    */

   half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
   if (wide_type.sign) {
      LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
      LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
      half = lp_build_select(&bld, sign, minus_half, half);
   }
   ab = LLVMBuildAdd(builder, ab, half, "");

   /* Final division */
   ab = lp_build_shr_imm(&bld, ab, n);

   return ab;
}


/**
 * Generate a * b
 */
LLVMValueRef
lp_build_mul(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef shift;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(a == bld->zero)
      return bld->zero;
   if(a == bld->one)
      return b;
   if(b == bld->zero)
      return bld->zero;
   if(b == bld->one)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if (!type.floating && !type.fixed && type.norm) {
      struct lp_type wide_type = lp_wider_type(type);
      LLVMValueRef al, ah, bl, bh, abl, abh, ab;

      lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah);
      lp_build_unpack2_native(bld->gallivm, type, wide_type, b, &bl, &bh);

      /* PMULLW, PSRLW, PADDW */
      abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
      abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);

      ab = lp_build_pack2_native(bld->gallivm, wide_type, type, abl, abh);

      return ab;
   }

   if(type.fixed)
      shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
   else
      shift = NULL;

   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
      if (type.floating)
         res = LLVMConstFMul(a, b);
      else
         res = LLVMConstMul(a, b);
      if(shift) {
         if(type.sign)
            res = LLVMConstAShr(res, shift);
         else
            res = LLVMConstLShr(res, shift);
      }
   }
   else {
      if (type.floating)
         res = LLVMBuildFMul(builder, a, b, "");
      else
         res = LLVMBuildMul(builder, a, b, "");
      if(shift) {
         if(type.sign)
            res = LLVMBuildAShr(builder, res, shift, "");
         else
            res = LLVMBuildLShr(builder, res, shift, "");
      }
   }

   return res;
}


/*
 * Widening mul, valid for 32x32 bit -> 64bit only.
 * Result is low 32bits, high bits returned in res_hi.
 *
 * Emits code that is meant to be compiled for the host CPU.
 */
LLVMValueRef
lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
                         LLVMValueRef a,
                         LLVMValueRef b,
                         LLVMValueRef *res_hi)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;

   assert(bld->type.width == 32);
   assert(bld->type.floating == 0);
   assert(bld->type.fixed == 0);
   assert(bld->type.norm == 0);

   /*
    * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces
    * for x86 simd is atrocious (even if the high bits weren't required),
    * trying to handle real 64bit inputs (which of course can't happen due
    * to using 64bit umul with 32bit numbers zero-extended to 64bit, but
    * apparently llvm does not recognize this widening mul). This includes 6
    * (instead of 2) pmuludq plus extra adds and shifts
    * The same story applies to signed mul, albeit fixing this requires sse41.
    * https://llvm.org/bugs/show_bug.cgi?id=30845
    * So, whip up our own code, albeit only for length 4 and 8 (which
    * should be good enough)...
    * FIXME: For llvm >= 7.0 we should match the autoupgrade pattern
    * (bitcast/and/mul/shuffle for unsigned, bitcast/shl/ashr/mul/shuffle
    * for signed), which the fallback code does not, without this llvm
    * will likely still produce atrocious code.
    */
   if (LLVM_VERSION_MAJOR < 7 &&
       (bld->type.length == 4 || bld->type.length == 8) &&
       ((util_cpu_caps.has_sse2 && (bld->type.sign == 0)) ||
        util_cpu_caps.has_sse4_1)) {
      const char *intrinsic = NULL;
      LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd;
      LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec;
      struct lp_type type_wide = lp_wider_type(bld->type);
      LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide);
      unsigned i;

      for (i = 0; i < bld->type.length; i += 2) {
         shuf[i] = lp_build_const_int32(gallivm, i+1);
         shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
      }
      shuf_vec = LLVMConstVector(shuf, bld->type.length);
      aeven = a;
      beven = b;
      aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, "");
      bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, "");

      if (util_cpu_caps.has_avx2 && bld->type.length == 8) {
         if (bld->type.sign) {
            intrinsic = "llvm.x86.avx2.pmul.dq";
         } else {
            intrinsic = "llvm.x86.avx2.pmulu.dq";
         }
         muleven = lp_build_intrinsic_binary(builder, intrinsic,
                                             wider_type, aeven, beven);
         mulodd = lp_build_intrinsic_binary(builder, intrinsic,
                                            wider_type, aodd, bodd);
      }
      else {
         /* for consistent naming look elsewhere... */
         if (bld->type.sign) {
            intrinsic = "llvm.x86.sse41.pmuldq";
         } else {
            intrinsic = "llvm.x86.sse2.pmulu.dq";
         }
         /*
          * XXX If we only have AVX but not AVX2 this is a pain.
          * lp_build_intrinsic_binary_anylength() can't handle it
          * (due to src and dst type not being identical).
          */
         if (bld->type.length == 8) {
            LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi;
            LLVMValueRef aoddlo, aoddhi, boddlo, boddhi;
            LLVMValueRef muleven2[2], mulodd2[2];
            struct lp_type type_wide_half = type_wide;
            LLVMTypeRef wtype_half;
            type_wide_half.length = 2;
            wtype_half = lp_build_vec_type(gallivm, type_wide_half);
            aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4);
            aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4);
            bevenlo = lp_build_extract_range(gallivm, beven, 0, 4);
            bevenhi = lp_build_extract_range(gallivm, beven, 4, 4);
            aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4);
            aoddhi = lp_build_extract_range(gallivm, aodd, 4, 4);
            boddlo = lp_build_extract_range(gallivm, bodd, 0, 4);
            boddhi = lp_build_extract_range(gallivm, bodd, 4, 4);
            muleven2[0] = lp_build_intrinsic_binary(builder, intrinsic,
                                                    wtype_half, aevenlo, bevenlo);
            mulodd2[0] = lp_build_intrinsic_binary(builder, intrinsic,
                                                   wtype_half, aoddlo, boddlo);
            muleven2[1] = lp_build_intrinsic_binary(builder, intrinsic,
                                                    wtype_half, aevenhi, bevenhi);
            mulodd2[1] = lp_build_intrinsic_binary(builder, intrinsic,
                                                   wtype_half, aoddhi, boddhi);
            muleven = lp_build_concat(gallivm, muleven2, type_wide_half, 2);
            mulodd = lp_build_concat(gallivm, mulodd2, type_wide_half, 2);
         }
         else {
            muleven = lp_build_intrinsic_binary(builder, intrinsic,
                                                wider_type, aeven, beven);
            mulodd = lp_build_intrinsic_binary(builder, intrinsic,
                                               wider_type, aodd, bodd);
         }
      }
      muleven = LLVMBuildBitCast(builder, muleven, bld->vec_type, "");
      mulodd = LLVMBuildBitCast(builder, mulodd, bld->vec_type, "");

      for (i = 0; i < bld->type.length; i += 2) {
         shuf[i] = lp_build_const_int32(gallivm, i + 1);
         shuf[i+1] = lp_build_const_int32(gallivm, i + 1 + bld->type.length);
      }
      shuf_vec = LLVMConstVector(shuf, bld->type.length);
      *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");

      for (i = 0; i < bld->type.length; i += 2) {
         shuf[i] = lp_build_const_int32(gallivm, i);
         shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length);
      }
      shuf_vec = LLVMConstVector(shuf, bld->type.length);
      return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
   }
   else {
      return lp_build_mul_32_lohi(bld, a, b, res_hi);
   }
}
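
/*
 * The even/odd split above mirrors how pmuludq/pmuldq operate: they only
 * read the 32-bit values sitting in the even lanes of each operand and
 * produce full 64-bit products, so the odd lanes are shuffled down into
 * even positions and multiplied in a second step before the results are
 * re-interleaved.
 */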


/*
 * Widening mul, valid for 32x32 bit -> 64bit only.
 * Result is low 32bits, high bits returned in res_hi.
 *
 * Emits generic code.
 */
LLVMValueRef
lp_build_mul_32_lohi(struct lp_build_context *bld,
                     LLVMValueRef a,
                     LLVMValueRef b,
                     LLVMValueRef *res_hi)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef tmp, shift, res_lo;
   struct lp_type type_tmp;
   LLVMTypeRef wide_type, narrow_type;

   type_tmp = bld->type;
   narrow_type = lp_build_vec_type(gallivm, type_tmp);
   type_tmp.width *= 2;
   wide_type = lp_build_vec_type(gallivm, type_tmp);
   shift = lp_build_const_vec(gallivm, type_tmp, 32);

   if (bld->type.sign) {
      a = LLVMBuildSExt(builder, a, wide_type, "");
      b = LLVMBuildSExt(builder, b, wide_type, "");
   } else {
      a = LLVMBuildZExt(builder, a, wide_type, "");
      b = LLVMBuildZExt(builder, b, wide_type, "");
   }
   tmp = LLVMBuildMul(builder, a, b, "");

   res_lo = LLVMBuildTrunc(builder, tmp, narrow_type, "");

   /* Since we truncate anyway, LShr and AShr are equivalent. */
   tmp = LLVMBuildLShr(builder, tmp, shift, "");
   *res_hi = LLVMBuildTrunc(builder, tmp, narrow_type, "");

   return res_lo;
}
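
/*
 * Scalar model of the generic path above, for unsigned 32-bit inputs
 * a = 0x80000000 and b = 4: both are zero-extended to 64 bits,
 * tmp = 0x200000000, the truncated low result is 0, and tmp >> 32 leaves
 * 2 in *res_hi.
 */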

LLVMValueRef
lp_build_mad(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b,
             LLVMValueRef c)
{
   const struct lp_type type = bld->type;
   if (type.floating) {
      return lp_build_fmuladd(bld->gallivm->builder, a, b, c);
   } else {
      return lp_build_add(bld, lp_build_mul(bld, a, b), c);
   }
}


/**
 * Small vector x scale multiplication optimization.
 */
LLVMValueRef
lp_build_mul_imm(struct lp_build_context *bld,
                 LLVMValueRef a,
                 int b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef factor;

   assert(lp_check_value(bld->type, a));

   if(b == 0)
      return bld->zero;

   if(b == 1)
      return a;

   if(b == -1)
      return lp_build_negate(bld, a);

   if(b == 2 && bld->type.floating)
      return lp_build_add(bld, a, a);

   if(util_is_power_of_two_or_zero(b)) {
      unsigned shift = ffs(b) - 1;

      if(bld->type.floating) {
#if 0
         /*
          * Power of two multiplication by directly manipulating the exponent.
          *
          * XXX: This might not be always faster, it will introduce a small error
          * for multiplication by zero, and it will produce wrong results
          * for Inf and NaN.
          */
         unsigned mantissa = lp_mantissa(bld->type);
         factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
         a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
         a = LLVMBuildAdd(builder, a, factor, "");
         a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
         return a;
#endif
      }
      else {
         factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
         return LLVMBuildShl(builder, a, factor, "");
      }
   }

   factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
   return lp_build_mul(bld, a, factor);
}
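
/*
 * The disabled floating-point branch above works because adding
 * shift << 23 to the bits of an IEEE single bumps the biased exponent by
 * 'shift': for b == 8 (shift == 3), 1.0f (0x3f800000) becomes
 * 0x41000000 == 8.0f. The caveats noted in the comment (zero, Inf, NaN)
 * are why it remains compiled out.
 */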


/**
 * Generate a / b
 */
LLVMValueRef
lp_build_div(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(a == bld->zero)
      return bld->zero;
   if(a == bld->one && type.floating)
      return lp_build_rcp(bld, b);
   if(b == bld->zero)
      return bld->undef;
   if(b == bld->one)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
      if (type.floating)
         return LLVMConstFDiv(a, b);
      else if (type.sign)
         return LLVMConstSDiv(a, b);
      else
         return LLVMConstUDiv(a, b);
   }

   /* fast rcp is disabled (just uses div), so makes no sense to try that */
   if(FALSE &&
      ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
       (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
      type.floating)
      return lp_build_mul(bld, a, lp_build_rcp(bld, b));

   if (type.floating)
      return LLVMBuildFDiv(builder, a, b, "");
   else if (type.sign)
      return LLVMBuildSDiv(builder, a, b, "");
   else
      return LLVMBuildUDiv(builder, a, b, "");
}


/**
 * Linear interpolation helper.
 *
 * @param normalized whether we are interpolating normalized values,
 *        encoded in normalized integers, twice as wide.
 *
 * @sa http://www.stereopsis.com/doubleblend.html
 */
static inline LLVMValueRef
lp_build_lerp_simple(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef v0,
                     LLVMValueRef v1,
                     unsigned flags)
{
   unsigned half_width = bld->type.width/2;
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef delta;
   LLVMValueRef res;

   assert(lp_check_value(bld->type, x));
   assert(lp_check_value(bld->type, v0));
   assert(lp_check_value(bld->type, v1));

   delta = lp_build_sub(bld, v1, v0);

   if (bld->type.floating) {
      assert(flags == 0);
      return lp_build_mad(bld, x, delta, v0);
   }

   if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
      if (!bld->type.sign) {
         if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
            /*
             * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
             * most-significant-bit to the least-significant-bit, so that
             * later we can just divide by 2**n instead of 2**n - 1.
             */

            x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
         }

         /* (x * delta) >> n */
         res = lp_build_mul(bld, x, delta);
         res = lp_build_shr_imm(bld, res, half_width);
      } else {
         /*
          * The rescaling trick above doesn't work for signed numbers, so
          * use the 2**n - 1 division approximation in lp_build_mul_norm
          * instead.
          */
         assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
         res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
      }
   } else {
      assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
      res = lp_build_mul(bld, x, delta);
   }

   if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
      /*
       * At this point both res and v0 only use the lower half of the bits,
       * the rest is zero. Instead of add / mask, do add with half wide type.
       */
      struct lp_type narrow_type;
      struct lp_build_context narrow_bld;

      memset(&narrow_type, 0, sizeof narrow_type);
      narrow_type.sign   = bld->type.sign;
      narrow_type.width  = bld->type.width/2;
      narrow_type.length = bld->type.length*2;

      lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
      res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
      v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
      res = lp_build_add(&narrow_bld, v0, res);
      res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
   } else {
      res = lp_build_add(bld, v0, res);

      if (bld->type.fixed) {
         /*
          * We need to mask out the high order bits when lerping 8bit
          * normalized colors stored on 16bits
          */
         /* XXX: This step is necessary for lerping 8bit colors stored on
          * 16bits, but it will be wrong for true fixed point use cases.
          * Basically we need a more powerful lp_type, capable of further
          * distinguishing the values interpretation from the value storage.
          */
         LLVMValueRef low_bits;
         low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
         res = LLVMBuildAnd(builder, res, low_bits, "");
      }
   }

   return res;
}
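
/*
 * Example of the unsigned rescaling trick above with 8-bit weights held in
 * 16-bit lanes (half_width == 8): x == 255 becomes 255 + (255 >> 7) == 256,
 * so a full weight selects exactly v1 after the shift by 8, while x == 0
 * stays 0 and selects v0.
 */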


/**
 * Linear interpolation.
 */
LLVMValueRef
lp_build_lerp(struct lp_build_context *bld,
              LLVMValueRef x,
              LLVMValueRef v0,
              LLVMValueRef v1,
              unsigned flags)
{
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, x));
   assert(lp_check_value(type, v0));
   assert(lp_check_value(type, v1));

   assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));

   if (type.norm) {
      struct lp_type wide_type;
      struct lp_build_context wide_bld;
      LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;

      assert(type.length >= 2);

      /*
       * Create a wider integer type, enough to hold the
       * intermediate result of the multiplication.
       */
      memset(&wide_type, 0, sizeof wide_type);
      wide_type.sign   = type.sign;
      wide_type.width  = type.width*2;
      wide_type.length = type.length/2;

      lp_build_context_init(&wide_bld, bld->gallivm, wide_type);

      lp_build_unpack2_native(bld->gallivm, type, wide_type, x,  &xl,  &xh);
      lp_build_unpack2_native(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
      lp_build_unpack2_native(bld->gallivm, type, wide_type, v1, &v1l, &v1h);

      /*
       * Lerp both halves.
       */

      flags |= LP_BLD_LERP_WIDE_NORMALIZED;

      resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
      resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);

      res = lp_build_pack2_native(bld->gallivm, wide_type, type, resl, resh);
   }
   else {
      res = lp_build_lerp_simple(bld, x, v0, v1, flags);
   }

   return res;
}


/**
 * Bilinear interpolation.
 *
 * Values indices are in v_{yx}.
 */
LLVMValueRef
lp_build_lerp_2d(struct lp_build_context *bld,
                 LLVMValueRef x,
                 LLVMValueRef y,
                 LLVMValueRef v00,
                 LLVMValueRef v01,
                 LLVMValueRef v10,
                 LLVMValueRef v11,
                 unsigned flags)
{
   LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
   LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
   return lp_build_lerp(bld, y, v0, v1, flags);
}


LLVMValueRef
lp_build_lerp_3d(struct lp_build_context *bld,
                 LLVMValueRef x,
                 LLVMValueRef y,
                 LLVMValueRef z,
                 LLVMValueRef v000,
                 LLVMValueRef v001,
                 LLVMValueRef v010,
                 LLVMValueRef v011,
                 LLVMValueRef v100,
                 LLVMValueRef v101,
                 LLVMValueRef v110,
                 LLVMValueRef v111,
                 unsigned flags)
{
   LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
   LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
   return lp_build_lerp(bld, z, v0, v1, flags);
}


/**
 * Generate min(a, b)
 * Do checks for special cases but not for NaNs.
 */
LLVMValueRef
lp_build_min(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   if (bld->type.norm) {
      if (!bld->type.sign) {
         if (a == bld->zero || b == bld->zero) {
            return bld->zero;
         }
      }
      if(a == bld->one)
         return b;
      if(b == bld->one)
         return a;
   }

   return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
}


/**
 * Generate min(a, b)
 * NaN's are handled according to the behavior specified by the
 * nan_behavior argument.
 */
LLVMValueRef
lp_build_min_ext(struct lp_build_context *bld,
                 LLVMValueRef a,
                 LLVMValueRef b,
                 enum gallivm_nan_behavior nan_behavior)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   if (bld->type.norm) {
      if (!bld->type.sign) {
         if (a == bld->zero || b == bld->zero) {
            return bld->zero;
         }
      }
      if(a == bld->one)
         return b;
      if(b == bld->one)
         return a;
   }

   return lp_build_min_simple(bld, a, b, nan_behavior);
}


/**
 * Generate max(a, b)
 * Do checks for special cases, but NaN behavior is undefined.
 */
LLVMValueRef
lp_build_max(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   if(bld->type.norm) {
      if(a == bld->one || b == bld->one)
         return bld->one;
      if (!bld->type.sign) {
         if (a == bld->zero) {
            return b;
         }
         if (b == bld->zero) {
            return a;
         }
      }
   }

   return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
}


/**
 * Generate max(a, b)
 * Checks for special cases.
 * NaN's are handled according to the behavior specified by the
 * nan_behavior argument.
 */
LLVMValueRef
lp_build_max_ext(struct lp_build_context *bld,
                 LLVMValueRef a,
                 LLVMValueRef b,
                 enum gallivm_nan_behavior nan_behavior)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   if(bld->type.norm) {
      if(a == bld->one || b == bld->one)
         return bld->one;
      if (!bld->type.sign) {
         if (a == bld->zero) {
            return b;
         }
         if (b == bld->zero) {
            return a;
         }
      }
   }

   return lp_build_max_simple(bld, a, b, nan_behavior);
}


/**
 * Generate clamp(a, min, max)
 * NaN behavior (for any of a, min, max) is undefined.
 * Do checks for special cases.
 */
LLVMValueRef
lp_build_clamp(struct lp_build_context *bld,
               LLVMValueRef a,
               LLVMValueRef min,
               LLVMValueRef max)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, min));
   assert(lp_check_value(bld->type, max));

   a = lp_build_min(bld, a, max);
   a = lp_build_max(bld, a, min);
   return a;
}


/**
 * Generate clamp(a, 0, 1)
 * A NaN will get converted to zero.
 */
LLVMValueRef
lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
                                LLVMValueRef a)
{
   a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
   a = lp_build_min(bld, a, bld->one);
   return a;
}


/**
 * Generate abs(a)
 */
LLVMValueRef
lp_build_abs(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);

   assert(lp_check_value(type, a));

   if(!type.sign)
      return a;

   if(type.floating) {
      if ((LLVM_VERSION_MAJOR > 3 || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR > 6)) &&
          (LLVM_VERSION_MAJOR < 3 || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR < 9))) {
         /* Workaround llvm.org/PR27332 */
         LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
         unsigned long long absMask = ~(1ULL << (type.width - 1));
         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
         a = LLVMBuildBitCast(builder, a, int_vec_type, "");
         a = LLVMBuildAnd(builder, a, mask, "");
         a = LLVMBuildBitCast(builder, a, vec_type, "");
         return a;
      } else {
         char intrinsic[32];
         lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
         return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
      }
   }

   if(type.width*type.length == 128 && util_cpu_caps.has_ssse3 && LLVM_VERSION_MAJOR < 6) {
      switch(type.width) {
      case 8:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
      case 16:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
      case 32:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
      }
   }
   else if (type.width*type.length == 256 && util_cpu_caps.has_avx2 && LLVM_VERSION_MAJOR < 6) {
      switch(type.width) {
      case 8:
         return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a);
      case 16:
         return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.w", vec_type, a);
      case 32:
         return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.d", vec_type, a);
      }
   }

   return lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero),
                          a, LLVMBuildNeg(builder, a, ""));
}
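
/*
 * The float mask path above relies on the IEEE sign-magnitude layout:
 * clearing bit 31 of -2.0f (0xc0000000 & 0x7fffffff == 0x40000000)
 * yields 2.0f.
 */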

LLVMValueRef
lp_build_negate(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;

   assert(lp_check_value(bld->type, a));

   if (bld->type.floating)
      a = LLVMBuildFNeg(builder, a, "");
   else
      a = LLVMBuildNeg(builder, a, "");

   return a;
}


/** Return -1, 0 or +1 depending on the sign of a */
LLVMValueRef
lp_build_sgn(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef cond;
   LLVMValueRef res;

   assert(lp_check_value(type, a));

   /* Handle non-zero case */
   if(!type.sign) {
      /* if not zero then sign must be positive */
      res = bld->one;
   }
   else if(type.floating) {
      LLVMTypeRef vec_type;
      LLVMTypeRef int_type;
      LLVMValueRef mask;
      LLVMValueRef sign;
      LLVMValueRef one;
      unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);

      int_type = lp_build_int_vec_type(bld->gallivm, type);
      vec_type = lp_build_vec_type(bld->gallivm, type);
      mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);

      /* Take the sign bit and add it to 1 constant */
      sign = LLVMBuildBitCast(builder, a, int_type, "");
      sign = LLVMBuildAnd(builder, sign, mask, "");
      one = LLVMConstBitCast(bld->one, int_type);
      res = LLVMBuildOr(builder, sign, one, "");
      res = LLVMBuildBitCast(builder, res, vec_type, "");
   }
   else
   {
      /* signed int/norm/fixed point */
      /* could use psign with sse3 and appropriate vectors here */
      LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
      cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
      res = lp_build_select(bld, cond, bld->one, minus_one);
   }

   /* Handle zero */
   cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
   res = lp_build_select(bld, cond, bld->zero, res);

   return res;
}
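
/*
 * Float path example: for a = -4.0f the masked sign bit is 0x80000000, and
 * OR'ing it into 1.0f (0x3f800000) gives 0xbf800000 == -1.0f; the final
 * cmp/select then still maps a == 0.0 to zero.
 */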


/**
 * Set the sign of float vector 'a' according to 'sign'.
 * If sign==0, return abs(a).
 * If sign==1, return -abs(a);
 * Other values for sign produce undefined results.
 */
LLVMValueRef
lp_build_set_sign(struct lp_build_context *bld,
                  LLVMValueRef a, LLVMValueRef sign)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
   LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
                             ~((unsigned long long) 1 << (type.width - 1)));
   LLVMValueRef val, res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   /* val = reinterpret_cast<int>(a) */
   val = LLVMBuildBitCast(builder, a, int_vec_type, "");
   /* val = val & mask */
   val = LLVMBuildAnd(builder, val, mask, "");
   /* sign = sign << shift */
   sign = LLVMBuildShl(builder, sign, shift, "");
   /* res = val | sign */
   res = LLVMBuildOr(builder, val, sign, "");
   /* res = reinterpret_cast<float>(res) */
   res = LLVMBuildBitCast(builder, res, vec_type, "");

   return res;
}


/**
 * Convert vector of (or scalar) int to vector of (or scalar) float.
 */
LLVMValueRef
lp_build_int_to_float(struct lp_build_context *bld,
                      LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);

   assert(type.floating);

   return LLVMBuildSIToFP(builder, a, vec_type, "");
}


static boolean
arch_rounding_available(const struct lp_type type)
{
   if ((util_cpu_caps.has_sse4_1 &&
       (type.length == 1 || type.width*type.length == 128)) ||
       (util_cpu_caps.has_avx && type.width*type.length == 256) ||
       (util_cpu_caps.has_avx512f && type.width*type.length == 512))
      return TRUE;
   else if ((util_cpu_caps.has_altivec &&
            (type.width == 32 && type.length == 4)))
      return TRUE;
   else if (util_cpu_caps.has_neon)
      return TRUE;

   return FALSE;
}

enum lp_build_round_mode
{
   LP_BUILD_ROUND_NEAREST = 0,
   LP_BUILD_ROUND_FLOOR = 1,
   LP_BUILD_ROUND_CEIL = 2,
   LP_BUILD_ROUND_TRUNCATE = 3
};

static inline LLVMValueRef
lp_build_iround_nearest_sse2(struct lp_build_context *bld,
                             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
   LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
   const char *intrinsic;
   LLVMValueRef res;

   assert(type.floating);
   /* using the double precision conversions is a bit more complicated */
   assert(type.width == 32);

   assert(lp_check_value(type, a));
   assert(util_cpu_caps.has_sse2);

   /* This is relying on MXCSR rounding mode, which should always be nearest. */
   if (type.length == 1) {
      LLVMTypeRef vec_type;
      LLVMValueRef undef;
      LLVMValueRef arg;
      LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);

      vec_type = LLVMVectorType(bld->elem_type, 4);

      intrinsic = "llvm.x86.sse.cvtss2si";

      undef = LLVMGetUndef(vec_type);

      arg = LLVMBuildInsertElement(builder, undef, a, index0, "");

      res = lp_build_intrinsic_unary(builder, intrinsic,
                                     ret_type, arg);
   }
   else {
      if (type.width * type.length == 128) {
         intrinsic = "llvm.x86.sse2.cvtps2dq";
      }
      else {
         assert(type.width * type.length == 256);
         assert(util_cpu_caps.has_avx);

         intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
      }
      res = lp_build_intrinsic_unary(builder, intrinsic,
                                     ret_type, a);
   }

   return res;
}


static inline LLVMValueRef
lp_build_round_altivec(struct lp_build_context *bld,
                       LLVMValueRef a,
                       enum lp_build_round_mode mode)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;

   assert(type.floating);

   assert(lp_check_value(type, a));
   assert(util_cpu_caps.has_altivec);

   (void)type;

   switch (mode) {
   case LP_BUILD_ROUND_NEAREST:
      intrinsic = "llvm.ppc.altivec.vrfin";
      break;
   case LP_BUILD_ROUND_FLOOR:
      intrinsic = "llvm.ppc.altivec.vrfim";
      break;
   case LP_BUILD_ROUND_CEIL:
      intrinsic = "llvm.ppc.altivec.vrfip";
      break;
   case LP_BUILD_ROUND_TRUNCATE:
      intrinsic = "llvm.ppc.altivec.vrfiz";
      break;
   }

   return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
}

static inline LLVMValueRef
lp_build_round_arch(struct lp_build_context *bld,
                    LLVMValueRef a,
                    enum lp_build_round_mode mode)
{
   if (util_cpu_caps.has_sse4_1 || util_cpu_caps.has_neon) {
      LLVMBuilderRef builder = bld->gallivm->builder;
      const struct lp_type type = bld->type;
      const char *intrinsic_root;
      char intrinsic[32];

      assert(type.floating);
      assert(lp_check_value(type, a));
      (void)type;

      switch (mode) {
      case LP_BUILD_ROUND_NEAREST:
         intrinsic_root = "llvm.nearbyint";
         break;
      case LP_BUILD_ROUND_FLOOR:
         intrinsic_root = "llvm.floor";
         break;
      case LP_BUILD_ROUND_CEIL:
         intrinsic_root = "llvm.ceil";
         break;
      case LP_BUILD_ROUND_TRUNCATE:
         intrinsic_root = "llvm.trunc";
         break;
      }

      lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type);
      return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
   }
   else /* (util_cpu_caps.has_altivec) */
      return lp_build_round_altivec(bld, a, mode);
}
/**
 * Return the integer part of a float (vector) value (== round toward zero).
 * The returned value is a float (vector).
 * Ex: trunc(-1.5) = -1.0
 */
LLVMValueRef
lp_build_trunc(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
   }
   else {
      const struct lp_type type = bld->type;
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
      LLVMValueRef trunc, res, anosign, mask;
      LLVMTypeRef int_vec_type = bld->int_vec_type;
      LLVMTypeRef vec_type = bld->vec_type;

      assert(type.width == 32); /* might want to handle doubles at some point */

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      /* round by truncation */
      trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");

      /* mask out sign bit */
      anosign = lp_build_abs(bld, a);
      /*
       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is no-op for them
       * because such floats are always exact) as well as special cases like
       * NaNs, Infs (taking advantage of the fact they use max exponent).
       * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
       */
      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
      return lp_build_select(bld, mask, a, res);
   }
}
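/*
 * Illustrative note (not from the original source): the fallback above is
 * the classic float->int->float round trip, guarded by the 2^24 magnitude
 * test.  A scalar sketch of the same idea, assuming 32-bit floats:
 *
 *    static float trunc_ref(float a)
 *    {
 *       if (!(fabsf(a) <= 0x1p24f))    // large, Inf or NaN: already integral
 *          return a;
 *       return (float)(int32_t)a;      // round toward zero
 *    }
 */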
/**
 * Return float (vector) rounded to nearest integer (vector).  The returned
 * value is a float (vector).
 * Ex: round(0.9) = 1.0
 * Ex: round(-1.5) = -2.0
 */
LLVMValueRef
lp_build_round(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
   }
   else {
      const struct lp_type type = bld->type;
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
      LLVMValueRef res, anosign, mask;
      LLVMTypeRef int_vec_type = bld->int_vec_type;
      LLVMTypeRef vec_type = bld->vec_type;

      assert(type.width == 32); /* might want to handle doubles at some point */

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      res = lp_build_iround(bld, a);
      res = LLVMBuildSIToFP(builder, res, vec_type, "");

      /* mask out sign bit */
      anosign = lp_build_abs(bld, a);
      /*
       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is no-op for them
       * because such floats are always exact) as well as special cases like
       * NaNs, Infs (taking advantage of the fact they use max exponent).
       * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
       */
      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
      return lp_build_select(bld, mask, a, res);
   }
}
/**
 * Return floor of float (vector), result is a float (vector)
 * Ex: floor(1.1) = 1.0
 * Ex: floor(-1.1) = -2.0
 */
LLVMValueRef
lp_build_floor(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
   }
   else {
      const struct lp_type type = bld->type;
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
      LLVMValueRef trunc, res, anosign, mask;
      LLVMTypeRef int_vec_type = bld->int_vec_type;
      LLVMTypeRef vec_type = bld->vec_type;

      if (type.width != 32) {
         char intrinsic[32];
         lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type);
         return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
      }

      assert(type.width == 32); /* might want to handle doubles at some point */

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      /* round by truncation */
      trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");

      if (type.sign) {
         LLVMValueRef tmp;

         /*
          * fix values if rounding is wrong (for non-special cases)
          * - this is the case if trunc > a
          */
         mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
         /* tmp = trunc > a ? 1.0 : 0.0 */
         tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
         tmp = lp_build_and(&intbld, mask, tmp);
         tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
         res = lp_build_sub(bld, res, tmp);
      }

      /* mask out sign bit */
      anosign = lp_build_abs(bld, a);
      /*
       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is no-op for them
       * because such floats are always exact) as well as special cases like
       * NaNs, Infs (taking advantage of the fact they use max exponent).
       * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
       */
      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
      return lp_build_select(bld, mask, a, res);
   }
}
/**
 * Return ceiling of float (vector), returning float (vector).
 * Ex: ceil( 1.1) = 2.0
 * Ex: ceil(-1.1) = -1.0
 */
LLVMValueRef
lp_build_ceil(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
   }
   else {
      const struct lp_type type = bld->type;
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
      LLVMValueRef trunc, res, anosign, mask, tmp;
      LLVMTypeRef int_vec_type = bld->int_vec_type;
      LLVMTypeRef vec_type = bld->vec_type;

      if (type.width != 32) {
         char intrinsic[32];
         lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type);
         return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
      }

      assert(type.width == 32); /* might want to handle doubles at some point */

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      /* round by truncation */
      trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");

      /*
       * fix values if rounding is wrong (for non-special cases)
       * - this is the case if trunc < a
       */
      mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
      /* tmp = trunc < a ? 1.0 : 0.0 */
      tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
      tmp = lp_build_and(&intbld, mask, tmp);
      tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
      res = lp_build_add(bld, trunc, tmp);

      /* mask out sign bit */
      anosign = lp_build_abs(bld, a);
      /*
       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is no-op for them
       * because such floats are always exact) as well as special cases like
       * NaNs, Infs (taking advantage of the fact they use max exponent).
       * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
       */
      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
      return lp_build_select(bld, mask, a, res);
   }
}
/**
 * Return fractional part of 'a' computed as a - floor(a).
 * Typically used in texture coord arithmetic.
 */
LLVMValueRef
lp_build_fract(struct lp_build_context *bld,
               LLVMValueRef a)
{
   assert(bld->type.floating);
   return lp_build_sub(bld, a, lp_build_floor(bld, a));
}
/**
 * Prevent returning 1.0 for very small negative values of 'a' by clamping
 * against 0.99999(9). (Will also return that value for NaNs.)
 */
static inline LLVMValueRef
clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
{
   LLVMValueRef max;

   /* this is the largest number smaller than 1.0 representable as float */
   max = lp_build_const_vec(bld->gallivm, bld->type,
                            1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
   return lp_build_min_ext(bld, fract, max,
                           GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
}
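/*
 * Illustrative note (not from the original source): for 32-bit floats
 * lp_mantissa() is 23, so the clamp value is 1.0 - 2^-24 = 0.99999994...
 * (bit pattern 0x3F7FFFFF), which is indeed the largest float strictly
 * below 1.0; floats just below 1.0 are spaced 2^-24 apart.
 */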
/**
 * Same as lp_build_fract, but guarantees that the result is always smaller
 * than one. Will also return the smaller-than-one value for infs, NaNs.
 */
LLVMValueRef
lp_build_fract_safe(struct lp_build_context *bld,
                    LLVMValueRef a)
{
   return clamp_fract(bld, lp_build_fract(bld, a));
}
/**
 * Return the integer part of a float (vector) value (== round toward zero).
 * The returned value is an integer (vector).
 * Ex: itrunc(-1.5) = -1
 */
LLVMValueRef
lp_build_itrunc(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);

   assert(type.floating);
   assert(lp_check_value(type, a));

   return LLVMBuildFPToSI(builder, a, int_vec_type, "");
}
/**
 * Return float (vector) rounded to nearest integer (vector).  The returned
 * value is an integer (vector).
 * Ex: iround(0.9) = 1
 * Ex: iround(-1.5) = -2
 */
LLVMValueRef
lp_build_iround(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);

   assert(lp_check_value(type, a));

   if ((util_cpu_caps.has_sse2 &&
        ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
       (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
      return lp_build_iround_nearest_sse2(bld, a);
   }
   if (arch_rounding_available(type)) {
      res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
   }
   else {
      LLVMValueRef half;

      half = lp_build_const_vec(bld->gallivm, type, nextafterf(0.5, 0.0));

      if (type.sign) {
         LLVMTypeRef vec_type = bld->vec_type;
         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
                                    (unsigned long long)1 << (type.width - 1));
         LLVMValueRef sign;

         /* get sign bit */
         sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
         sign = LLVMBuildAnd(builder, sign, mask, "");

         /* sign * 0.5 */
         half = LLVMBuildBitCast(builder, half, int_vec_type, "");
         half = LLVMBuildOr(builder, sign, half, "");
         half = LLVMBuildBitCast(builder, half, vec_type, "");
      }

      res = LLVMBuildFAdd(builder, a, half, "");
   }

   res = LLVMBuildFPToSI(builder, res, int_vec_type, "");

   return res;
}
/**
 * Return floor of float (vector), result is an int (vector).
 * Ex: ifloor(1.1) = 1
 * Ex: ifloor(-1.1) = -2
 */
LLVMValueRef
lp_build_ifloor(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   res = a;
   if (type.sign) {
      if (arch_rounding_available(type)) {
         res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
      }
      else {
         struct lp_type inttype;
         struct lp_build_context intbld;
         LLVMValueRef trunc, itrunc, mask;

         assert(type.floating);
         assert(lp_check_value(type, a));

         inttype = type;
         inttype.floating = 0;
         lp_build_context_init(&intbld, bld->gallivm, inttype);

         /* round by truncation */
         itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
         trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");

         /*
          * fix values if rounding is wrong (for non-special cases)
          * - this is the case if trunc > a
          * The results of doing this with NaNs, very large values etc.
          * are undefined but this seems to be the case anyway.
          */
         mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
         /* cheapie minus one with mask since the mask is minus one / zero */
         return lp_build_add(&intbld, itrunc, mask);
      }
   }

   /* round to nearest (toward zero) */
   res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");

   return res;
}
/**
 * Return ceiling of float (vector), returning int (vector).
 * Ex: iceil( 1.1) = 2
 * Ex: iceil(-1.1) = -1
 */
LLVMValueRef
lp_build_iceil(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
   }
   else {
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef trunc, itrunc, mask;

      assert(type.floating);
      assert(lp_check_value(type, a));

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      /* round by truncation */
      itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");

      /*
       * fix values if rounding is wrong (for non-special cases)
       * - this is the case if trunc < a
       * The results of doing this with NaNs, very large values etc.
       * are undefined but this seems to be the case anyway.
       */
      mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
      /* cheapie plus one with mask since the mask is minus one / zero */
      return lp_build_sub(&intbld, itrunc, mask);
   }

   /* round to nearest (toward zero) */
   res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");

   return res;
}
/**
 * Combined ifloor() & fract().
 *
 * Preferred to calling the functions separately, as it will ensure that the
 * strategy (floor() vs ifloor()) that results in less redundant work is used.
 */
void
lp_build_ifloor_fract(struct lp_build_context *bld,
                      LLVMValueRef a,
                      LLVMValueRef *out_ipart,
                      LLVMValueRef *out_fpart)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef ipart;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      /*
       * floor() is easier.
       */

      ipart = lp_build_floor(bld, a);
      *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
      *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
   }
   else {
      /*
       * ifloor() is easier.
       */

      *out_ipart = lp_build_ifloor(bld, a);
      ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
      *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
   }
}
/**
 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
 * always smaller than one.
 */
void
lp_build_ifloor_fract_safe(struct lp_build_context *bld,
                           LLVMValueRef a,
                           LLVMValueRef *out_ipart,
                           LLVMValueRef *out_fpart)
{
   lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
   *out_fpart = clamp_fract(bld, *out_fpart);
}
LLVMValueRef
lp_build_sqrt(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   char intrinsic[32];

   assert(lp_check_value(type, a));

   assert(type.floating);
   lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);

   return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
}
/**
 * Do one Newton-Raphson step to improve reciprocal precision:
 *
 *   x_{i+1} = x_i + x_i * (1 - a * x_i)
 *
 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
 * +/-Inf, giving NaN instead.  Certain applications rely on this behavior,
 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
 * halo. It would be necessary to clamp the argument to prevent this.
 *
 * See also:
 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
 */
static inline LLVMValueRef
lp_build_rcp_refine(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef rcp_a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef neg_a;
   LLVMValueRef res;

   neg_a = LLVMBuildFNeg(builder, a, "");
   res = lp_build_fmuladd(builder, neg_a, rcp_a, bld->one);
   res = lp_build_fmuladd(builder, res, rcp_a, rcp_a);

   return res;
}
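/*
 * Illustrative note (not from the original source): one refinement step
 * roughly doubles the number of correct bits.  E.g. for a = 3 with the
 * crude guess x_0 = 0.3:
 *
 *    x_1 = 0.3 + 0.3 * (1 - 3 * 0.3)   = 0.33
 *    x_2 = 0.33 + 0.33 * (1 - 0.99)    = 0.3333
 *
 * converging quadratically towards 1/3.
 */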
LLVMValueRef
lp_build_rcp(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   if (a == bld->zero)
      return bld->undef;
   if (a == bld->one)
      return bld->one;
   if (a == bld->undef)
      return bld->undef;

   assert(type.floating);

   if (LLVMIsConstant(a))
      return LLVMConstFDiv(bld->one, a);

   /*
    * We don't use RCPPS because:
    * - it only has 10bits of precision
    * - it doesn't even get the reciprocal of 1.0 exactly
    * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
    * - for recent processors the benefit over DIVPS is marginal, a case
    *   dependent
    *
    * We could still use it on certain processors if benchmarks show that the
    * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
    * particular uses that require less workarounds.
    */

   if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
                 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))) {
      const unsigned num_iterations = 0;
      LLVMValueRef res;
      unsigned i;
      const char *intrinsic = NULL;

      if (type.length == 4) {
         intrinsic = "llvm.x86.sse.rcp.ps";
      }
      else {
         intrinsic = "llvm.x86.avx.rcp.ps.256";
      }

      res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);

      for (i = 0; i < num_iterations; ++i) {
         res = lp_build_rcp_refine(bld, a, res);
      }

      return res;
   }

   return LLVMBuildFDiv(builder, bld->one, a, "");
}
/**
 * Do one Newton-Raphson step to improve rsqrt precision:
 *
 *   x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
 *
 * See also Intel 64 and IA-32 Architectures Optimization Manual.
 */
static inline LLVMValueRef
lp_build_rsqrt_refine(struct lp_build_context *bld,
                      LLVMValueRef a,
                      LLVMValueRef rsqrt_a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
   LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
   LLVMValueRef res;

   res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
   res = LLVMBuildFMul(builder, a, res, "");
   res = LLVMBuildFSub(builder, three, res, "");
   res = LLVMBuildFMul(builder, rsqrt_a, res, "");
   res = LLVMBuildFMul(builder, half, res, "");

   return res;
}
/**
 * Generate 1/sqrt(a).
 * Result is undefined for values < 0, infinity for +0.
 */
LLVMValueRef
lp_build_rsqrt(struct lp_build_context *bld,
               LLVMValueRef a)
{
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   assert(type.floating);

   /*
    * This should be faster but all denormals will end up as infinity.
    */
   if (0 && lp_build_fast_rsqrt_available(type)) {
      const unsigned num_iterations = 1;
      LLVMValueRef res;
      unsigned i;

      /* rsqrt(1.0) != 1.0 here */
      res = lp_build_fast_rsqrt(bld, a);

      if (num_iterations) {
         /*
          * Newton-Raphson will result in NaN instead of infinity for zero,
          * and NaN instead of zero for infinity.
          * Also, need to ensure rsqrt(1.0) == 1.0.
          * All numbers smaller than FLT_MIN will result in +infinity
          * (rsqrtps treats all denormals as zero).
          */
         LLVMValueRef cmp;
         LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
         LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);

         for (i = 0; i < num_iterations; ++i) {
            res = lp_build_rsqrt_refine(bld, a, res);
         }
         cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
         res = lp_build_select(bld, cmp, inf, res);
         cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
         res = lp_build_select(bld, cmp, bld->zero, res);
         cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
         res = lp_build_select(bld, cmp, bld->one, res);
      }

      return res;
   }

   return lp_build_rcp(bld, lp_build_sqrt(bld, a));
}
/**
 * If there's a fast (inaccurate) rsqrt instruction available
 * (caller may want to avoid to call rsqrt_fast if it's not available,
 * i.e. for calculating x^0.5 it may do rsqrt_fast(x) * x but if
 * unavailable it would result in sqrt/div/mul so obviously
 * much better to just call sqrt, skipping both div and mul).
 */
boolean
lp_build_fast_rsqrt_available(struct lp_type type)
{
   assert(type.floating);

   if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
       (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
      return TRUE;
   }
   return FALSE;
}
/**
 * Generate 1/sqrt(a).
 * Result is undefined for values < 0, infinity for +0.
 * Precision is limited, only ~10 bits guaranteed
 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
 */
LLVMValueRef
lp_build_fast_rsqrt(struct lp_build_context *bld,
                    LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   if (lp_build_fast_rsqrt_available(type)) {
      const char *intrinsic = NULL;

      if (type.length == 4) {
         intrinsic = "llvm.x86.sse.rsqrt.ps";
      }
      else {
         intrinsic = "llvm.x86.avx.rsqrt.ps.256";
      }
      return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
   }
   else {
      debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
   }
   return lp_build_rcp(bld, lp_build_sqrt(bld, a));
}
/**
 * Generate sin(a) or cos(a) using polynomial approximation.
 * TODO: it might be worth recognizing sin and cos using the same source
 * (i.e. the d3d10 sincos opcode). Obviously doing both at the same time
 * would be way cheaper than calculating (nearly) everything twice...
 * Not sure it's common enough to be worth bothering however; the scs
 * opcode could also benefit from calculating both though.
 */
static LLVMValueRef
lp_build_sin_or_cos(struct lp_build_context *bld,
                    LLVMValueRef a,
                    boolean cos)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef b = gallivm->builder;
   struct lp_type int_type = lp_int_type(bld->type);

   /*
    * take the absolute value,
    * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
    */

   LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");

   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");

   /*
    * scale by 4/Pi
    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
    */

   LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");

   /*
    * store the integer part of y in mm0
    * emm2 = _mm_cvttps_epi32(y);
    */

   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");

   /*
    * j=(j+1) & (~1) (see the cephes sources)
    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
    */

   LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
   LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");

   /*
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
    */
   LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
   LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");

   /*
    * y = _mm_cvtepi32_ps(emm2);
    */
   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");

   LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
   LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
   LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
   LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);

   /*
    * Argument used for polynomial selection and sign bit determination
    * is different for sin vs. cos.
    */
   LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
                               emm2_and;

   LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
                                              LLVMBuildNot(b, emm2_2, ""), ""),
                                              const_29, "sign_bit") :
                                 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
                                              LLVMBuildShl(b, emm2_add,
                                                           const_29, ""), ""),
                                              sign_mask, "sign_bit");

   /*
    * get the polynomial selection mask
    * there is one polynomial for 0 <= x <= Pi/4
    * and another one for Pi/4 < x <= Pi/2
    * Both branches will be computed.
    *
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
    */

   LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
   LLVMValueRef poly_mask = lp_build_compare(gallivm,
                                             int_type, PIPE_FUNC_EQUAL,
                                             emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));

   /*
    * _PS_CONST(minus_cephes_DP1, -0.78515625);
    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
    */
   LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
   LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
   LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);

   /*
    * The magic pass: "Extended precision modular arithmetic"
    * x = ((x - y * DP1) - y * DP2) - y * DP3;
    */
   LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
   LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
   LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);

   /*
    * Evaluate the first polynomial (0 <= x <= Pi/4)
    *
    * z = _mm_mul_ps(x,x);
    */
   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");

   /*
    * _PS_CONST(coscof_p0,  2.443315711809948E-005);
    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
    * _PS_CONST(coscof_p2,  4.166664568298827E-002);
    */
   LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
   LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
   LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);

   /*
    * y = *(v4sf*)_ps_coscof_p0;
    * y = _mm_mul_ps(y, z);
    */
   LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
   LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");

   /*
    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
    * y = _mm_sub_ps(y, tmp);
    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
    */
   LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
   LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");

   /*
    * _PS_CONST(sincof_p0, -1.9515295891E-4);
    * _PS_CONST(sincof_p1,  8.3321608736E-3);
    * _PS_CONST(sincof_p2, -1.6666654611E-1);
    */
   LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
   LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
   LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);

   /*
    * Evaluate the second polynomial (Pi/4 < x <= Pi/2)
    *
    * y2 = *(v4sf*)_ps_sincof_p0;
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_mul_ps(y2, x);
    * y2 = _mm_add_ps(y2, x);
    */

   LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
   LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
   LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);

   /*
    * select the correct result from the two polynomials
    *
    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
    * y = _mm_andnot_ps(xmm3, y);
    * y = _mm_or_ps(y,y2);
    */
   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
   LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
   LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");

   /*
    * update the sign
    * y = _mm_xor_ps(y, sign_bit);
    */
   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");

   LLVMValueRef isfinite = lp_build_isfinite(bld, a);

   /* clamp output to be within [-1, 1] */
   y_result = lp_build_clamp(bld, y_result,
                             lp_build_const_vec(bld->gallivm, bld->type, -1.f),
                             lp_build_const_vec(bld->gallivm, bld->type, 1.f));
   /* If a is -inf, inf or NaN then return NaN */
   y_result = lp_build_select(bld, isfinite, y_result,
                              lp_build_const_vec(bld->gallivm, bld->type, NAN));
   return y_result;
}
/**
 * Generate sin(a)
 */
LLVMValueRef
lp_build_sin(struct lp_build_context *bld,
             LLVMValueRef a)
{
   return lp_build_sin_or_cos(bld, a, FALSE);
}
/**
 * Generate cos(a)
 */
LLVMValueRef
lp_build_cos(struct lp_build_context *bld,
             LLVMValueRef a)
{
   return lp_build_sin_or_cos(bld, a, TRUE);
}
/**
 * Generate pow(x, y)
 */
LLVMValueRef
lp_build_pow(struct lp_build_context *bld,
             LLVMValueRef x,
             LLVMValueRef y)
{
   /* TODO: optimize the constant case */
   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
       LLVMIsConstant(x) && LLVMIsConstant(y)) {
      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                   __FUNCTION__);
   }

   return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
}
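/*
 * Illustrative note (not from the original source): the identity used here
 * is pow(x, y) = exp2(y * log2(x)), which only holds for x > 0.  E.g.
 * pow(2, 10): log2(2) = 1, 1 * 10 = 10, exp2(10) = 1024.
 */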
/**
 * Generate exp(x)
 */
LLVMValueRef
lp_build_exp(struct lp_build_context *bld,
             LLVMValueRef x)
{
   /* log2(e) = 1/log(2) */
   LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
                                           1.4426950408889634);

   assert(lp_check_value(bld->type, x));

   return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
}
/**
 * Generate log(x)
 * Behavior is undefined with infs, 0s and nans
 */
LLVMValueRef
lp_build_log(struct lp_build_context *bld,
             LLVMValueRef x)
{
   /* log(2) */
   LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
                                          0.69314718055994529);

   assert(lp_check_value(bld->type, x));

   return lp_build_mul(bld, log2, lp_build_log2(bld, x));
}
/**
 * Generate log(x) that handles edge cases (infs, 0s and nans)
 */
LLVMValueRef
lp_build_log_safe(struct lp_build_context *bld,
                  LLVMValueRef x)
{
   /* log(2) */
   LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
                                          0.69314718055994529);

   assert(lp_check_value(bld->type, x));

   return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
}
/**
 * Generate polynomial.
 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
 */
static LLVMValueRef
lp_build_polynomial(struct lp_build_context *bld,
                    LLVMValueRef x,
                    const double *coeffs,
                    unsigned num_coeffs)
{
   const struct lp_type type = bld->type;
   LLVMValueRef even = NULL, odd = NULL;
   LLVMValueRef x2;
   unsigned i;

   assert(lp_check_value(bld->type, x));

   /* TODO: optimize the constant case */
   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
       LLVMIsConstant(x)) {
      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                   __FUNCTION__);
   }

   /*
    * Calculate odd and even terms separately to decrease data dependency
    * Ex:
    *     c[0] + x^2 * c[2] + x^4 * c[4] ...
    *     + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
    */
   x2 = lp_build_mul(bld, x, x);

   for (i = num_coeffs; i--; ) {
      LLVMValueRef coeff;

      coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);

      if (i % 2 == 0) {
         if (even)
            even = lp_build_mad(bld, x2, even, coeff);
         else
            even = coeff;
      }
      else {
         if (odd)
            odd = lp_build_mad(bld, x2, odd, coeff);
         else
            odd = coeff;
      }
   }

   if (odd)
      return lp_build_mad(bld, odd, x, even);
   else if (even)
      return even;
   else
      return bld->zero;
}
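/*
 * Illustrative sketch (not from the original source): the even/odd split
 * above evaluates two interleaved Horner chains in x^2 and joins them with
 * one multiply-add.  Scalar equivalent:
 *
 *    static double poly_ref(double x, const double *c, unsigned n)
 *    {
 *       double x2 = x * x, even = 0.0, odd = 0.0;
 *       unsigned i;
 *       for (i = n; i--; ) {
 *          if (i % 2 == 0) even = even * x2 + c[i];
 *          else            odd  = odd  * x2 + c[i];
 *       }
 *       return odd * x + even;
 *    }
 *
 * The two chains are independent, so the hardware can overlap them,
 * roughly halving the dependent multiply-add depth.
 */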
/**
 * Minimax polynomial fit of 2**x, in range [0, 1[
 */
const double lp_build_exp2_polynomial[] = {
#if EXP_POLY_DEGREE == 5
   1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
   0.693153073200168932794,
   0.240153617044375388211,
   0.0558263180532956664775,
   0.00898934009049466391101,
   0.00187757667519147912699
#elif EXP_POLY_DEGREE == 4
   1.00000259337069434683,
   0.693003834469974940458,
   0.24144275689150793076,
   0.0520114606103070150235,
   0.0135341679161270268764
#elif EXP_POLY_DEGREE == 3
   0.999925218562710312959,
   0.695833540494823811697,
   0.226067155427249155588,
   0.0780245226406372992967
#elif EXP_POLY_DEGREE == 2
   1.00172476321474503578,
   0.657636275736077639316,
   0.33718943461968720704
#else
#error
#endif
};
LLVMValueRef
lp_build_exp2(struct lp_build_context *bld,
              LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMValueRef ipart = NULL;
   LLVMValueRef fpart = NULL;
   LLVMValueRef expipart = NULL;
   LLVMValueRef expfpart = NULL;
   LLVMValueRef res = NULL;

   assert(lp_check_value(bld->type, x));

   /* TODO: optimize the constant case */
   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
       LLVMIsConstant(x)) {
      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                   __FUNCTION__);
   }

   assert(type.floating && type.width == 32);

   /* We want to preserve NaN and make sure that for exp2 if x > 128,
    * the result is INF and if it's smaller than -126.9 the result is 0 */
   x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
                        GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
   x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
                        x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);

   /* ipart = floor(x) */
   /* fpart = x - ipart */
   lp_build_ifloor_fract(bld, x, &ipart, &fpart);

   /* expipart = (float) (1 << ipart) */
   expipart = LLVMBuildAdd(builder, ipart,
                           lp_build_const_int_vec(bld->gallivm, type, 127), "");
   expipart = LLVMBuildShl(builder, expipart,
                           lp_build_const_int_vec(bld->gallivm, type, 23), "");
   expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");

   expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
                                  ARRAY_SIZE(lp_build_exp2_polynomial));

   res = LLVMBuildFMul(builder, expipart, expfpart, "");

   return res;
}
/**
 * Extract the exponent of an IEEE-754 floating point value.
 *
 * Optionally apply an integer bias.
 *
 * Result is an integer value with
 *
 *   ifloor(log2(x)) + bias
 */
LLVMValueRef
lp_build_extract_exponent(struct lp_build_context *bld,
                          LLVMValueRef x,
                          int bias)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   unsigned mantissa = lp_mantissa(type);
   LLVMValueRef res;

   assert(type.floating);

   assert(lp_check_value(bld->type, x));

   x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");

   res = LLVMBuildLShr(builder, x,
                       lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
   res = LLVMBuildAnd(builder, res,
                      lp_build_const_int_vec(bld->gallivm, type, 255), "");
   res = LLVMBuildSub(builder, res,
                      lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");

   return res;
}
/**
 * Extract the mantissa of a floating point value.
 *
 * Result is a floating point value with
 *
 *   x / 2**floor(log2(x))
 */
LLVMValueRef
lp_build_extract_mantissa(struct lp_build_context *bld,
                          LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   unsigned mantissa = lp_mantissa(type);
   LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
                                                  (1ULL << mantissa) - 1);
   LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
   LLVMValueRef res;

   assert(lp_check_value(bld->type, x));

   assert(type.floating);

   x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");

   /* res = x / 2**ipart */
   res = LLVMBuildAnd(builder, x, mantmask, "");
   res = LLVMBuildOr(builder, res, one, "");
   res = LLVMBuildBitCast(builder, res, bld->vec_type, "");

   return res;
}
/**
 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in range of [0, 1/9[
 * These coefficients can be generated with
 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
 */
const double lp_build_log2_polynomial[] = {
#if LOG_POLY_DEGREE == 5
   2.88539008148777786488L,
   0.961796878841293367824L,
   0.577058946784739859012L,
   0.412914355135828735411L,
   0.308591899232910175289L,
   0.352376952300281371868L,
#elif LOG_POLY_DEGREE == 4
   2.88539009343309178325L,
   0.961791550404184197881L,
   0.577440339438736392009L,
   0.403343858251329912514L,
   0.406718052498846252698L,
#elif LOG_POLY_DEGREE == 3
   2.88538959748872753838L,
   0.961932915889597772928L,
   0.571118517972136195241L,
   0.493997535084709500285L,
#else
#error
#endif
};
/**
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 * http://en.wikipedia.org/wiki/Logarithm#Calculation
 * http://www.nezumi.demon.co.uk/consult/logx.htm
 *
 * If handle_edge_cases is true the function will perform computations
 * to match the required D3D10+ behavior for each of the edge cases.
 * That means that if input is:
 * - less than zero (to and including -inf) then NaN will be returned
 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
 * - +infinity, then +infinity will be returned
 * - NaN, then NaN will be returned
 *
 * Those checks are fairly expensive so if you don't need them make sure
 * handle_edge_cases is false.
 */
void
lp_build_log2_approx(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef *p_exp,
                     LLVMValueRef *p_floor_log2,
                     LLVMValueRef *p_log2,
                     boolean handle_edge_cases)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);

   LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
   LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
   LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);

   LLVMValueRef i = NULL;
   LLVMValueRef y = NULL;
   LLVMValueRef z = NULL;
   LLVMValueRef exp = NULL;
   LLVMValueRef mant = NULL;
   LLVMValueRef logexp = NULL;
   LLVMValueRef p_z = NULL;
   LLVMValueRef res = NULL;

   assert(lp_check_value(bld->type, x));

   if (p_exp || p_floor_log2 || p_log2) {
      /* TODO: optimize the constant case */
      if (gallivm_debug & GALLIVM_DEBUG_PERF &&
          LLVMIsConstant(x)) {
         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                      __FUNCTION__);
      }

      assert(type.floating && type.width == 32);

      /*
       * We don't explicitly handle denormalized numbers. They will yield a
       * result in the neighbourhood of -127, which appears to be adequate
       * enough.
       */

      i = LLVMBuildBitCast(builder, x, int_vec_type, "");

      /* exp = (float) exponent(x) */
      exp = LLVMBuildAnd(builder, i, expmask, "");
   }

   if (p_floor_log2 || p_log2) {
      logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
      logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
      logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
   }

   if (p_log2) {
      /* mant = 1 + (float) mantissa(x) */
      mant = LLVMBuildAnd(builder, i, mantmask, "");
      mant = LLVMBuildOr(builder, mant, one, "");
      mant = LLVMBuildBitCast(builder, mant, vec_type, "");

      /* y = (mant - 1) / (mant + 1) */
      y = lp_build_div(bld,
                       lp_build_sub(bld, mant, bld->one),
                       lp_build_add(bld, mant, bld->one));

      /* z = y^2 */
      z = lp_build_mul(bld, y, y);

      /* compute P(z) */
      p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
                                ARRAY_SIZE(lp_build_log2_polynomial));

      /* y * P(z) + logexp */
      res = lp_build_mad(bld, y, p_z, logexp);

      if (type.floating && handle_edge_cases) {
         LLVMValueRef negmask, infmask, zmask;

         negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
                                lp_build_const_vec(bld->gallivm, type, 0.0f));
         zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
                              lp_build_const_vec(bld->gallivm, type, 0.0f));
         infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
                                lp_build_const_vec(bld->gallivm, type, INFINITY));

         /* If x is equal to inf make sure we return inf */
         res = lp_build_select(bld, infmask,
                               lp_build_const_vec(bld->gallivm, type, INFINITY),
                               res);
         /* If x is equal to 0, return -inf */
         res = lp_build_select(bld, zmask,
                               lp_build_const_vec(bld->gallivm, type, -INFINITY),
                               res);
         /* If x is NaN or less than 0, return NaN */
         res = lp_build_select(bld, negmask,
                               lp_build_const_vec(bld->gallivm, type, NAN),
                               res);
      }
   }

   if (p_exp) {
      exp = LLVMBuildBitCast(builder, exp, vec_type, "");
      *p_exp = exp;
   }

   if (p_floor_log2)
      *p_floor_log2 = logexp;

   if (p_log2)
      *p_log2 = res;
}
/*
 * log2 implementation which doesn't have special code to
 * handle edge cases (-inf, 0, inf, NaN). It's faster but
 * the results for those cases are undefined.
 */
LLVMValueRef
lp_build_log2(struct lp_build_context *bld,
              LLVMValueRef x)
{
   LLVMValueRef res;
   lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
   return res;
}
/*
 * Version of log2 which handles all edge cases.
 * Look at documentation of lp_build_log2_approx for
 * description of the behavior for each of the edge cases.
 */
LLVMValueRef
lp_build_log2_safe(struct lp_build_context *bld,
                   LLVMValueRef x)
{
   LLVMValueRef res;
   lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
   return res;
}
/**
 * Faster (and less accurate) log2.
 *
 *    log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
 *
 * Piece-wise linear approximation, with exact results when x is a power of
 * two.
 *
 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
 */
LLVMValueRef
lp_build_fast_log2(struct lp_build_context *bld,
                   LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef ipart;
   LLVMValueRef fpart;

   assert(lp_check_value(bld->type, x));

   assert(bld->type.floating);

   /* ipart = floor(log2(x)) - 1 */
   ipart = lp_build_extract_exponent(bld, x, -1);
   ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");

   /* fpart = x / 2**ipart */
   fpart = lp_build_extract_mantissa(bld, x);

   /* ipart + fpart */
   return LLVMBuildFAdd(builder, ipart, fpart, "");
}
/**
 * Fast implementation of iround(log2(x)).
 *
 * Not an approximation -- it should give accurate results all the time.
 */
LLVMValueRef
lp_build_ilog2(struct lp_build_context *bld,
               LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
   LLVMValueRef ipart;

   assert(bld->type.floating);

   assert(lp_check_value(bld->type, x));

   /* x * 2^(0.5)   i.e., add 0.5 to the log2(x) */
   x = LLVMBuildFMul(builder, x, sqrt2, "");

   /* ipart = floor(log2(x) + 0.5) */
   ipart = lp_build_extract_exponent(bld, x, 0);

   return ipart;
}
LLVMValueRef
lp_build_mod(struct lp_build_context *bld,
             LLVMValueRef x,
             LLVMValueRef y)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef res;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, x));
   assert(lp_check_value(type, y));

   if (type.floating)
      res = LLVMBuildFRem(builder, x, y, "");
   else if (type.sign)
      res = LLVMBuildSRem(builder, x, y, "");
   else
      res = LLVMBuildURem(builder, x, y, "");

   return res;
}
/*
 * For floating inputs it creates and returns a mask
 * which is all 1's for channels which are NaN.
 * Channels inside x which are not NaN will be 0.
 */
LLVMValueRef
lp_build_isnan(struct lp_build_context *bld,
               LLVMValueRef x)
{
   LLVMValueRef mask;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);

   assert(bld->type.floating);
   assert(lp_check_value(bld->type, x));

   mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
                        "isnotnan");
   mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
   mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");

   return mask;
}
/* Returns all 1's for floating point numbers that are
 * finite numbers and returns all zeros for -inf,
 * inf and nan's */
LLVMValueRef
lp_build_isfinite(struct lp_build_context *bld,
                  LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
   struct lp_type int_type = lp_int_type(bld->type);
   LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
   LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
                                                    0x7f800000);

   if (!bld->type.floating) {
      return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
   }
   assert(bld->type.floating);
   assert(lp_check_value(bld->type, x));
   assert(bld->type.width == 32);

   intx = LLVMBuildAnd(builder, intx, infornan32, "");
   return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
                           intx, infornan32);
}
/*
 * Returns true if the number is nan or inf and false otherwise.
 * The input has to be a floating point vector.
 */
LLVMValueRef
lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
                       const struct lp_type type,
                       LLVMValueRef x)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type int_type = lp_int_type(type);
   LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
                                                0x7f800000);
   LLVMValueRef ret;

   assert(type.floating);

   ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
   ret = LLVMBuildAnd(builder, ret, const0, "");
   ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
                          ret, const0);

   return ret;
}
LLVMValueRef
lp_build_fpstate_get(struct gallivm_state *gallivm)
{
   if (util_cpu_caps.has_sse) {
      LLVMBuilderRef builder = gallivm->builder;
      LLVMValueRef mxcsr_ptr = lp_build_alloca(
         gallivm,
         LLVMInt32TypeInContext(gallivm->context),
         "mxcsr_ptr");
      LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
          LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
      lp_build_intrinsic(builder,
                         "llvm.x86.sse.stmxcsr",
                         LLVMVoidTypeInContext(gallivm->context),
                         &mxcsr_ptr8, 1, 0);
      return mxcsr_ptr;
   }
   return 0;
}
void
lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
                                  boolean zero)
{
   if (util_cpu_caps.has_sse) {
      /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
      int daz_ftz = _MM_FLUSH_ZERO_MASK;

      LLVMBuilderRef builder = gallivm->builder;
      LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
      LLVMValueRef mxcsr =
         LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");

      if (util_cpu_caps.has_daz) {
         /* Enable denormals are zero mode */
         daz_ftz |= _MM_DENORMALS_ZERO_MASK;
      }
      if (zero) {
         mxcsr = LLVMBuildOr(builder, mxcsr,
                             LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
      }
      else {
         mxcsr = LLVMBuildAnd(builder, mxcsr,
                              LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
      }

      LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
      lp_build_fpstate_set(gallivm, mxcsr_ptr);
   }
}
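/*
 * Illustrative note (not from the original source): FTZ is MXCSR bit 15
 * (0x8000) and DAZ is bit 6 (0x0040), matching the _MM_FLUSH_ZERO_MASK /
 * _MM_DENORMALS_ZERO_MASK fallback defines near the top of this file.
 * The host-side analogue of what the generated code does at runtime is:
 *
 *    unsigned mxcsr = _mm_getcsr();
 *    _mm_setcsr(zero ? (mxcsr | 0x8040) : (mxcsr & ~0x8040u));
 *
 * with the DAZ bit only set on CPUs that report support for it.
 */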
void
lp_build_fpstate_set(struct gallivm_state *gallivm,
                     LLVMValueRef mxcsr_ptr)
{
   if (util_cpu_caps.has_sse) {
      LLVMBuilderRef builder = gallivm->builder;
      mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
                     LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
      lp_build_intrinsic(builder,
                         "llvm.x86.sse.ldmxcsr",
                         LLVMVoidTypeInContext(gallivm->context),
                         &mxcsr_ptr, 1, 0);
   }
}