1 /**************************************************************************
3 * Copyright 2009-2010 VMware, Inc.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **************************************************************************/
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
38 * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
44 * @author Jose Fonseca <jfonseca@vmware.com>
50 #include "util/u_memory.h"
51 #include "util/u_debug.h"
52 #include "util/u_math.h"
53 #include "util/u_cpu_detect.h"
55 #include "lp_bld_type.h"
56 #include "lp_bld_const.h"
57 #include "lp_bld_init.h"
58 #include "lp_bld_intr.h"
59 #include "lp_bld_logic.h"
60 #include "lp_bld_pack.h"
61 #include "lp_bld_debug.h"
62 #include "lp_bld_bitarit.h"
63 #include "lp_bld_arit.h"
64 #include "lp_bld_flow.h"
66 #if defined(PIPE_ARCH_SSE)
67 #include <xmmintrin.h>
70 #ifndef _MM_DENORMALS_ZERO_MASK
71 #define _MM_DENORMALS_ZERO_MASK 0x0040
74 #ifndef _MM_FLUSH_ZERO_MASK
75 #define _MM_FLUSH_ZERO_MASK 0x8000
78 #define EXP_POLY_DEGREE 5
80 #define LOG_POLY_DEGREE 4
85 * No checks for special case values of a or b = 1 or 0 are done.
86 * NaN's are handled according to the behavior specified by the
87 * nan_behavior argument.
90 lp_build_min_simple(struct lp_build_context
*bld
,
93 enum gallivm_nan_behavior nan_behavior
)
95 const struct lp_type type
= bld
->type
;
96 const char *intrinsic
= NULL
;
97 unsigned intr_size
= 0;
100 assert(lp_check_value(type
, a
));
101 assert(lp_check_value(type
, b
));
103 /* TODO: optimize the constant case */
105 if (type
.floating
&& util_cpu_caps
.has_sse
) {
106 if (type
.width
== 32) {
107 if (type
.length
== 1) {
108 intrinsic
= "llvm.x86.sse.min.ss";
111 else if (type
.length
<= 4 || !util_cpu_caps
.has_avx
) {
112 intrinsic
= "llvm.x86.sse.min.ps";
116 intrinsic
= "llvm.x86.avx.min.ps.256";
120 if (type
.width
== 64 && util_cpu_caps
.has_sse2
) {
121 if (type
.length
== 1) {
122 intrinsic
= "llvm.x86.sse2.min.sd";
125 else if (type
.length
== 2 || !util_cpu_caps
.has_avx
) {
126 intrinsic
= "llvm.x86.sse2.min.pd";
130 intrinsic
= "llvm.x86.avx.min.pd.256";
135 else if (type
.floating
&& util_cpu_caps
.has_altivec
) {
136 if (nan_behavior
== GALLIVM_NAN_RETURN_NAN
||
137 nan_behavior
== GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN
) {
138 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
141 if (type
.width
== 32 && type
.length
== 4) {
142 intrinsic
= "llvm.ppc.altivec.vminfp";
145 } else if (util_cpu_caps
.has_sse2
&& type
.length
>= 2) {
147 if ((type
.width
== 8 || type
.width
== 16) &&
148 (type
.width
* type
.length
<= 64) &&
149 (gallivm_debug
& GALLIVM_DEBUG_PERF
)) {
150 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
153 if (type
.width
== 8 && !type
.sign
) {
154 intrinsic
= "llvm.x86.sse2.pminu.b";
156 else if (type
.width
== 16 && type
.sign
) {
157 intrinsic
= "llvm.x86.sse2.pmins.w";
159 if (util_cpu_caps
.has_sse4_1
) {
160 if (type
.width
== 8 && type
.sign
) {
161 intrinsic
= "llvm.x86.sse41.pminsb";
163 if (type
.width
== 16 && !type
.sign
) {
164 intrinsic
= "llvm.x86.sse41.pminuw";
166 if (type
.width
== 32 && !type
.sign
) {
167 intrinsic
= "llvm.x86.sse41.pminud";
169 if (type
.width
== 32 && type
.sign
) {
170 intrinsic
= "llvm.x86.sse41.pminsd";
173 } else if (util_cpu_caps
.has_altivec
) {
175 if (type
.width
== 8) {
177 intrinsic
= "llvm.ppc.altivec.vminub";
179 intrinsic
= "llvm.ppc.altivec.vminsb";
181 } else if (type
.width
== 16) {
183 intrinsic
= "llvm.ppc.altivec.vminuh";
185 intrinsic
= "llvm.ppc.altivec.vminsh";
187 } else if (type
.width
== 32) {
189 intrinsic
= "llvm.ppc.altivec.vminuw";
191 intrinsic
= "llvm.ppc.altivec.vminsw";
197 /* We need to handle nan's for floating point numbers. If one of the
198 * inputs is nan the other should be returned (required by both D3D10+
200 * The sse intrinsics return the second operand in case of nan by
201 * default so we need special code to handle those.
203 if (util_cpu_caps
.has_sse
&& type
.floating
&&
204 nan_behavior
!= GALLIVM_NAN_BEHAVIOR_UNDEFINED
&&
205 nan_behavior
!= GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN
&&
206 nan_behavior
!= GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN
) {
207 LLVMValueRef isnan
, min
;
208 min
= lp_build_intrinsic_binary_anylength(bld
->gallivm
, intrinsic
,
211 if (nan_behavior
== GALLIVM_NAN_RETURN_OTHER
) {
212 isnan
= lp_build_isnan(bld
, b
);
213 return lp_build_select(bld
, isnan
, a
, min
);
215 assert(nan_behavior
== GALLIVM_NAN_RETURN_NAN
);
216 isnan
= lp_build_isnan(bld
, a
);
217 return lp_build_select(bld
, isnan
, a
, min
);
220 return lp_build_intrinsic_binary_anylength(bld
->gallivm
, intrinsic
,
227 switch (nan_behavior
) {
228 case GALLIVM_NAN_RETURN_NAN
: {
229 LLVMValueRef isnan
= lp_build_isnan(bld
, b
);
230 cond
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, a
, b
);
231 cond
= LLVMBuildXor(bld
->gallivm
->builder
, cond
, isnan
, "");
232 return lp_build_select(bld
, cond
, a
, b
);
235 case GALLIVM_NAN_RETURN_OTHER
: {
236 LLVMValueRef isnan
= lp_build_isnan(bld
, a
);
237 cond
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, a
, b
);
238 cond
= LLVMBuildXor(bld
->gallivm
->builder
, cond
, isnan
, "");
239 return lp_build_select(bld
, cond
, a
, b
);
242 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN
:
243 cond
= lp_build_cmp_ordered(bld
, PIPE_FUNC_LESS
, a
, b
);
244 return lp_build_select(bld
, cond
, a
, b
);
245 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN
:
246 cond
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, b
, a
);
247 return lp_build_select(bld
, cond
, b
, a
);
248 case GALLIVM_NAN_BEHAVIOR_UNDEFINED
:
249 cond
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, a
, b
);
250 return lp_build_select(bld
, cond
, a
, b
);
254 cond
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, a
, b
);
255 return lp_build_select(bld
, cond
, a
, b
);
258 cond
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, a
, b
);
259 return lp_build_select(bld
, cond
, a
, b
);
265 lp_build_fmuladd(LLVMBuilderRef builder
,
270 LLVMTypeRef type
= LLVMTypeOf(a
);
271 assert(type
== LLVMTypeOf(b
));
272 assert(type
== LLVMTypeOf(c
));
274 lp_format_intrinsic(intrinsic
, sizeof intrinsic
, "llvm.fmuladd", type
);
275 LLVMValueRef args
[] = { a
, b
, c
};
276 return lp_build_intrinsic(builder
, intrinsic
, type
, args
, 3, 0);
282 * No checks for special case values of a or b = 1 or 0 are done.
283 * NaN's are handled according to the behavior specified by the
284 * nan_behavior argument.
287 lp_build_max_simple(struct lp_build_context
*bld
,
290 enum gallivm_nan_behavior nan_behavior
)
292 const struct lp_type type
= bld
->type
;
293 const char *intrinsic
= NULL
;
294 unsigned intr_size
= 0;
297 assert(lp_check_value(type
, a
));
298 assert(lp_check_value(type
, b
));
300 /* TODO: optimize the constant case */
302 if (type
.floating
&& util_cpu_caps
.has_sse
) {
303 if (type
.width
== 32) {
304 if (type
.length
== 1) {
305 intrinsic
= "llvm.x86.sse.max.ss";
308 else if (type
.length
<= 4 || !util_cpu_caps
.has_avx
) {
309 intrinsic
= "llvm.x86.sse.max.ps";
313 intrinsic
= "llvm.x86.avx.max.ps.256";
317 if (type
.width
== 64 && util_cpu_caps
.has_sse2
) {
318 if (type
.length
== 1) {
319 intrinsic
= "llvm.x86.sse2.max.sd";
322 else if (type
.length
== 2 || !util_cpu_caps
.has_avx
) {
323 intrinsic
= "llvm.x86.sse2.max.pd";
327 intrinsic
= "llvm.x86.avx.max.pd.256";
332 else if (type
.floating
&& util_cpu_caps
.has_altivec
) {
333 if (nan_behavior
== GALLIVM_NAN_RETURN_NAN
||
334 nan_behavior
== GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN
) {
335 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
338 if (type
.width
== 32 || type
.length
== 4) {
339 intrinsic
= "llvm.ppc.altivec.vmaxfp";
342 } else if (util_cpu_caps
.has_sse2
&& type
.length
>= 2) {
344 if ((type
.width
== 8 || type
.width
== 16) &&
345 (type
.width
* type
.length
<= 64) &&
346 (gallivm_debug
& GALLIVM_DEBUG_PERF
)) {
347 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
350 if (type
.width
== 8 && !type
.sign
) {
351 intrinsic
= "llvm.x86.sse2.pmaxu.b";
354 else if (type
.width
== 16 && type
.sign
) {
355 intrinsic
= "llvm.x86.sse2.pmaxs.w";
357 if (util_cpu_caps
.has_sse4_1
) {
358 if (type
.width
== 8 && type
.sign
) {
359 intrinsic
= "llvm.x86.sse41.pmaxsb";
361 if (type
.width
== 16 && !type
.sign
) {
362 intrinsic
= "llvm.x86.sse41.pmaxuw";
364 if (type
.width
== 32 && !type
.sign
) {
365 intrinsic
= "llvm.x86.sse41.pmaxud";
367 if (type
.width
== 32 && type
.sign
) {
368 intrinsic
= "llvm.x86.sse41.pmaxsd";
371 } else if (util_cpu_caps
.has_altivec
) {
373 if (type
.width
== 8) {
375 intrinsic
= "llvm.ppc.altivec.vmaxub";
377 intrinsic
= "llvm.ppc.altivec.vmaxsb";
379 } else if (type
.width
== 16) {
381 intrinsic
= "llvm.ppc.altivec.vmaxuh";
383 intrinsic
= "llvm.ppc.altivec.vmaxsh";
385 } else if (type
.width
== 32) {
387 intrinsic
= "llvm.ppc.altivec.vmaxuw";
389 intrinsic
= "llvm.ppc.altivec.vmaxsw";
395 if (util_cpu_caps
.has_sse
&& type
.floating
&&
396 nan_behavior
!= GALLIVM_NAN_BEHAVIOR_UNDEFINED
&&
397 nan_behavior
!= GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN
&&
398 nan_behavior
!= GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN
) {
399 LLVMValueRef isnan
, max
;
400 max
= lp_build_intrinsic_binary_anylength(bld
->gallivm
, intrinsic
,
403 if (nan_behavior
== GALLIVM_NAN_RETURN_OTHER
) {
404 isnan
= lp_build_isnan(bld
, b
);
405 return lp_build_select(bld
, isnan
, a
, max
);
407 assert(nan_behavior
== GALLIVM_NAN_RETURN_NAN
);
408 isnan
= lp_build_isnan(bld
, a
);
409 return lp_build_select(bld
, isnan
, a
, max
);
412 return lp_build_intrinsic_binary_anylength(bld
->gallivm
, intrinsic
,
419 switch (nan_behavior
) {
420 case GALLIVM_NAN_RETURN_NAN
: {
421 LLVMValueRef isnan
= lp_build_isnan(bld
, b
);
422 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, b
);
423 cond
= LLVMBuildXor(bld
->gallivm
->builder
, cond
, isnan
, "");
424 return lp_build_select(bld
, cond
, a
, b
);
427 case GALLIVM_NAN_RETURN_OTHER
: {
428 LLVMValueRef isnan
= lp_build_isnan(bld
, a
);
429 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, b
);
430 cond
= LLVMBuildXor(bld
->gallivm
->builder
, cond
, isnan
, "");
431 return lp_build_select(bld
, cond
, a
, b
);
434 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN
:
435 cond
= lp_build_cmp_ordered(bld
, PIPE_FUNC_GREATER
, a
, b
);
436 return lp_build_select(bld
, cond
, a
, b
);
437 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN
:
438 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, b
, a
);
439 return lp_build_select(bld
, cond
, b
, a
);
440 case GALLIVM_NAN_BEHAVIOR_UNDEFINED
:
441 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, b
);
442 return lp_build_select(bld
, cond
, a
, b
);
446 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, b
);
447 return lp_build_select(bld
, cond
, a
, b
);
450 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, b
);
451 return lp_build_select(bld
, cond
, a
, b
);
457 * Generate 1 - a, or ~a depending on bld->type.
460 lp_build_comp(struct lp_build_context
*bld
,
463 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
464 const struct lp_type type
= bld
->type
;
466 assert(lp_check_value(type
, a
));
473 if(type
.norm
&& !type
.floating
&& !type
.fixed
&& !type
.sign
) {
474 if(LLVMIsConstant(a
))
475 return LLVMConstNot(a
);
477 return LLVMBuildNot(builder
, a
, "");
480 if(LLVMIsConstant(a
))
482 return LLVMConstFSub(bld
->one
, a
);
484 return LLVMConstSub(bld
->one
, a
);
487 return LLVMBuildFSub(builder
, bld
->one
, a
, "");
489 return LLVMBuildSub(builder
, bld
->one
, a
, "");
497 lp_build_add(struct lp_build_context
*bld
,
501 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
502 const struct lp_type type
= bld
->type
;
505 assert(lp_check_value(type
, a
));
506 assert(lp_check_value(type
, b
));
512 if(a
== bld
->undef
|| b
== bld
->undef
)
516 const char *intrinsic
= NULL
;
518 if(a
== bld
->one
|| b
== bld
->one
)
521 if (type
.width
* type
.length
== 128 &&
522 !type
.floating
&& !type
.fixed
) {
523 if(util_cpu_caps
.has_sse2
) {
525 intrinsic
= type
.sign
? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
527 intrinsic
= type
.sign
? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
528 } else if (util_cpu_caps
.has_altivec
) {
530 intrinsic
= type
.sign
? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
532 intrinsic
= type
.sign
? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
537 return lp_build_intrinsic_binary(builder
, intrinsic
, lp_build_vec_type(bld
->gallivm
, bld
->type
), a
, b
);
540 if(type
.norm
&& !type
.floating
&& !type
.fixed
) {
542 uint64_t sign
= (uint64_t)1 << (type
.width
- 1);
543 LLVMValueRef max_val
= lp_build_const_int_vec(bld
->gallivm
, type
, sign
- 1);
544 LLVMValueRef min_val
= lp_build_const_int_vec(bld
->gallivm
, type
, sign
);
545 /* a_clamp_max is the maximum a for positive b,
546 a_clamp_min is the minimum a for negative b. */
547 LLVMValueRef a_clamp_max
= lp_build_min_simple(bld
, a
, LLVMBuildSub(builder
, max_val
, b
, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED
);
548 LLVMValueRef a_clamp_min
= lp_build_max_simple(bld
, a
, LLVMBuildSub(builder
, min_val
, b
, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED
);
549 a
= lp_build_select(bld
, lp_build_cmp(bld
, PIPE_FUNC_GREATER
, b
, bld
->zero
), a_clamp_max
, a_clamp_min
);
551 a
= lp_build_min_simple(bld
, a
, lp_build_comp(bld
, b
), GALLIVM_NAN_BEHAVIOR_UNDEFINED
);
555 if(LLVMIsConstant(a
) && LLVMIsConstant(b
))
557 res
= LLVMConstFAdd(a
, b
);
559 res
= LLVMConstAdd(a
, b
);
562 res
= LLVMBuildFAdd(builder
, a
, b
, "");
564 res
= LLVMBuildAdd(builder
, a
, b
, "");
566 /* clamp to ceiling of 1.0 */
567 if(bld
->type
.norm
&& (bld
->type
.floating
|| bld
->type
.fixed
))
568 res
= lp_build_min_simple(bld
, res
, bld
->one
, GALLIVM_NAN_BEHAVIOR_UNDEFINED
);
570 /* XXX clamp to floor of -1 or 0??? */
576 /** Return the scalar sum of the elements of a.
577 * Should avoid this operation whenever possible.
580 lp_build_horizontal_add(struct lp_build_context
*bld
,
583 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
584 const struct lp_type type
= bld
->type
;
585 LLVMValueRef index
, res
;
587 LLVMValueRef shuffles1
[LP_MAX_VECTOR_LENGTH
/ 2];
588 LLVMValueRef shuffles2
[LP_MAX_VECTOR_LENGTH
/ 2];
589 LLVMValueRef vecres
, elem2
;
591 assert(lp_check_value(type
, a
));
593 if (type
.length
== 1) {
597 assert(!bld
->type
.norm
);
600 * for byte vectors can do much better with psadbw.
601 * Using repeated shuffle/adds here. Note with multiple vectors
602 * this can be done more efficiently as outlined in the intel
603 * optimization manual.
604 * Note: could cause data rearrangement if used with smaller element
609 length
= type
.length
/ 2;
611 LLVMValueRef vec1
, vec2
;
612 for (i
= 0; i
< length
; i
++) {
613 shuffles1
[i
] = lp_build_const_int32(bld
->gallivm
, i
);
614 shuffles2
[i
] = lp_build_const_int32(bld
->gallivm
, i
+ length
);
616 vec1
= LLVMBuildShuffleVector(builder
, vecres
, vecres
,
617 LLVMConstVector(shuffles1
, length
), "");
618 vec2
= LLVMBuildShuffleVector(builder
, vecres
, vecres
,
619 LLVMConstVector(shuffles2
, length
), "");
621 vecres
= LLVMBuildFAdd(builder
, vec1
, vec2
, "");
624 vecres
= LLVMBuildAdd(builder
, vec1
, vec2
, "");
626 length
= length
>> 1;
629 /* always have vector of size 2 here */
632 index
= lp_build_const_int32(bld
->gallivm
, 0);
633 res
= LLVMBuildExtractElement(builder
, vecres
, index
, "");
634 index
= lp_build_const_int32(bld
->gallivm
, 1);
635 elem2
= LLVMBuildExtractElement(builder
, vecres
, index
, "");
638 res
= LLVMBuildFAdd(builder
, res
, elem2
, "");
640 res
= LLVMBuildAdd(builder
, res
, elem2
, "");
646 * Return the horizontal sums of 4 float vectors as a float4 vector.
647 * This uses the technique as outlined in Intel Optimization Manual.
650 lp_build_horizontal_add4x4f(struct lp_build_context
*bld
,
653 struct gallivm_state
*gallivm
= bld
->gallivm
;
654 LLVMBuilderRef builder
= gallivm
->builder
;
655 LLVMValueRef shuffles
[4];
657 LLVMValueRef sumtmp
[2], shuftmp
[2];
659 /* lower half of regs */
660 shuffles
[0] = lp_build_const_int32(gallivm
, 0);
661 shuffles
[1] = lp_build_const_int32(gallivm
, 1);
662 shuffles
[2] = lp_build_const_int32(gallivm
, 4);
663 shuffles
[3] = lp_build_const_int32(gallivm
, 5);
664 tmp
[0] = LLVMBuildShuffleVector(builder
, src
[0], src
[1],
665 LLVMConstVector(shuffles
, 4), "");
666 tmp
[2] = LLVMBuildShuffleVector(builder
, src
[2], src
[3],
667 LLVMConstVector(shuffles
, 4), "");
669 /* upper half of regs */
670 shuffles
[0] = lp_build_const_int32(gallivm
, 2);
671 shuffles
[1] = lp_build_const_int32(gallivm
, 3);
672 shuffles
[2] = lp_build_const_int32(gallivm
, 6);
673 shuffles
[3] = lp_build_const_int32(gallivm
, 7);
674 tmp
[1] = LLVMBuildShuffleVector(builder
, src
[0], src
[1],
675 LLVMConstVector(shuffles
, 4), "");
676 tmp
[3] = LLVMBuildShuffleVector(builder
, src
[2], src
[3],
677 LLVMConstVector(shuffles
, 4), "");
679 sumtmp
[0] = LLVMBuildFAdd(builder
, tmp
[0], tmp
[1], "");
680 sumtmp
[1] = LLVMBuildFAdd(builder
, tmp
[2], tmp
[3], "");
682 shuffles
[0] = lp_build_const_int32(gallivm
, 0);
683 shuffles
[1] = lp_build_const_int32(gallivm
, 2);
684 shuffles
[2] = lp_build_const_int32(gallivm
, 4);
685 shuffles
[3] = lp_build_const_int32(gallivm
, 6);
686 shuftmp
[0] = LLVMBuildShuffleVector(builder
, sumtmp
[0], sumtmp
[1],
687 LLVMConstVector(shuffles
, 4), "");
689 shuffles
[0] = lp_build_const_int32(gallivm
, 1);
690 shuffles
[1] = lp_build_const_int32(gallivm
, 3);
691 shuffles
[2] = lp_build_const_int32(gallivm
, 5);
692 shuffles
[3] = lp_build_const_int32(gallivm
, 7);
693 shuftmp
[1] = LLVMBuildShuffleVector(builder
, sumtmp
[0], sumtmp
[1],
694 LLVMConstVector(shuffles
, 4), "");
696 return LLVMBuildFAdd(builder
, shuftmp
[0], shuftmp
[1], "");
701 * partially horizontally add 2-4 float vectors with length nx4,
702 * i.e. only four adjacent values in each vector will be added,
703 * assuming values are really grouped in 4 which also determines
706 * Return a vector of the same length as the initial vectors,
707 * with the excess elements (if any) being undefined.
708 * The element order is independent of number of input vectors.
709 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
710 * the output order thus will be
711 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
714 lp_build_hadd_partial4(struct lp_build_context
*bld
,
715 LLVMValueRef vectors
[],
718 struct gallivm_state
*gallivm
= bld
->gallivm
;
719 LLVMBuilderRef builder
= gallivm
->builder
;
720 LLVMValueRef ret_vec
;
722 const char *intrinsic
= NULL
;
724 assert(num_vecs
>= 2 && num_vecs
<= 4);
725 assert(bld
->type
.floating
);
727 /* only use this with at least 2 vectors, as it is sort of expensive
728 * (depending on cpu) and we always need two horizontal adds anyway,
729 * so a shuffle/add approach might be better.
735 tmp
[2] = num_vecs
> 2 ? vectors
[2] : vectors
[0];
736 tmp
[3] = num_vecs
> 3 ? vectors
[3] : vectors
[0];
738 if (util_cpu_caps
.has_sse3
&& bld
->type
.width
== 32 &&
739 bld
->type
.length
== 4) {
740 intrinsic
= "llvm.x86.sse3.hadd.ps";
742 else if (util_cpu_caps
.has_avx
&& bld
->type
.width
== 32 &&
743 bld
->type
.length
== 8) {
744 intrinsic
= "llvm.x86.avx.hadd.ps.256";
747 tmp
[0] = lp_build_intrinsic_binary(builder
, intrinsic
,
748 lp_build_vec_type(gallivm
, bld
->type
),
751 tmp
[1] = lp_build_intrinsic_binary(builder
, intrinsic
,
752 lp_build_vec_type(gallivm
, bld
->type
),
758 return lp_build_intrinsic_binary(builder
, intrinsic
,
759 lp_build_vec_type(gallivm
, bld
->type
),
763 if (bld
->type
.length
== 4) {
764 ret_vec
= lp_build_horizontal_add4x4f(bld
, tmp
);
767 LLVMValueRef partres
[LP_MAX_VECTOR_LENGTH
/4];
769 unsigned num_iter
= bld
->type
.length
/ 4;
770 struct lp_type parttype
= bld
->type
;
772 for (j
= 0; j
< num_iter
; j
++) {
773 LLVMValueRef partsrc
[4];
775 for (i
= 0; i
< 4; i
++) {
776 partsrc
[i
] = lp_build_extract_range(gallivm
, tmp
[i
], j
*4, 4);
778 partres
[j
] = lp_build_horizontal_add4x4f(bld
, partsrc
);
780 ret_vec
= lp_build_concat(gallivm
, partres
, parttype
, num_iter
);
789 lp_build_sub(struct lp_build_context
*bld
,
793 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
794 const struct lp_type type
= bld
->type
;
797 assert(lp_check_value(type
, a
));
798 assert(lp_check_value(type
, b
));
802 if(a
== bld
->undef
|| b
== bld
->undef
)
808 const char *intrinsic
= NULL
;
813 if (type
.width
* type
.length
== 128 &&
814 !type
.floating
&& !type
.fixed
) {
815 if (util_cpu_caps
.has_sse2
) {
817 intrinsic
= type
.sign
? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
819 intrinsic
= type
.sign
? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
820 } else if (util_cpu_caps
.has_altivec
) {
822 intrinsic
= type
.sign
? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
824 intrinsic
= type
.sign
? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
829 return lp_build_intrinsic_binary(builder
, intrinsic
, lp_build_vec_type(bld
->gallivm
, bld
->type
), a
, b
);
832 if(type
.norm
&& !type
.floating
&& !type
.fixed
) {
834 uint64_t sign
= (uint64_t)1 << (type
.width
- 1);
835 LLVMValueRef max_val
= lp_build_const_int_vec(bld
->gallivm
, type
, sign
- 1);
836 LLVMValueRef min_val
= lp_build_const_int_vec(bld
->gallivm
, type
, sign
);
837 /* a_clamp_max is the maximum a for negative b,
838 a_clamp_min is the minimum a for positive b. */
839 LLVMValueRef a_clamp_max
= lp_build_min_simple(bld
, a
, LLVMBuildAdd(builder
, max_val
, b
, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED
);
840 LLVMValueRef a_clamp_min
= lp_build_max_simple(bld
, a
, LLVMBuildAdd(builder
, min_val
, b
, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED
);
841 a
= lp_build_select(bld
, lp_build_cmp(bld
, PIPE_FUNC_GREATER
, b
, bld
->zero
), a_clamp_min
, a_clamp_max
);
843 a
= lp_build_max_simple(bld
, a
, b
, GALLIVM_NAN_BEHAVIOR_UNDEFINED
);
847 if(LLVMIsConstant(a
) && LLVMIsConstant(b
))
849 res
= LLVMConstFSub(a
, b
);
851 res
= LLVMConstSub(a
, b
);
854 res
= LLVMBuildFSub(builder
, a
, b
, "");
856 res
= LLVMBuildSub(builder
, a
, b
, "");
858 if(bld
->type
.norm
&& (bld
->type
.floating
|| bld
->type
.fixed
))
859 res
= lp_build_max_simple(bld
, res
, bld
->zero
, GALLIVM_NAN_BEHAVIOR_UNDEFINED
);
867 * Normalized multiplication.
869 * There are several approaches for (using 8-bit normalized multiplication as
874 * makes the following approximation to the division (Sree)
876 * a*b/255 ~= (a*(b + 1)) >> 8
878 * which is the fastest method that satisfies the following OpenGL criteria of
880 * 0*0 = 0 and 255*255 = 255
884 * takes the geometric series approximation to the division
886 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
888 * in this case just the first two terms to fit in 16bit arithmetic
890 * t/255 ~= (t + (t >> 8)) >> 8
892 * note that just by itself it doesn't satisfy the OpenGL criteria, as
893 * 255*255 = 254, so the special case b = 255 must be accounted or roundoff
896 * - geometric series plus rounding
898 * when using a geometric series division instead of truncating the result
899 * use roundoff in the approximation (Jim Blinn)
901 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
903 * achieving the exact results.
907 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
908 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
909 * @sa Michael Herf, The "double blend trick", May 2000,
910 * http://www.stereopsis.com/doubleblend.html
913 lp_build_mul_norm(struct gallivm_state
*gallivm
,
914 struct lp_type wide_type
,
915 LLVMValueRef a
, LLVMValueRef b
)
917 LLVMBuilderRef builder
= gallivm
->builder
;
918 struct lp_build_context bld
;
923 assert(!wide_type
.floating
);
924 assert(lp_check_value(wide_type
, a
));
925 assert(lp_check_value(wide_type
, b
));
927 lp_build_context_init(&bld
, gallivm
, wide_type
);
929 n
= wide_type
.width
/ 2;
930 if (wide_type
.sign
) {
935 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
936 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
940 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
943 ab
= LLVMBuildMul(builder
, a
, b
, "");
944 ab
= LLVMBuildAdd(builder
, ab
, lp_build_shr_imm(&bld
, ab
, n
), "");
947 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
950 half
= lp_build_const_int_vec(gallivm
, wide_type
, 1LL << (n
- 1));
951 if (wide_type
.sign
) {
952 LLVMValueRef minus_half
= LLVMBuildNeg(builder
, half
, "");
953 LLVMValueRef sign
= lp_build_shr_imm(&bld
, ab
, wide_type
.width
- 1);
954 half
= lp_build_select(&bld
, sign
, minus_half
, half
);
956 ab
= LLVMBuildAdd(builder
, ab
, half
, "");
959 ab
= lp_build_shr_imm(&bld
, ab
, n
);
968 lp_build_mul(struct lp_build_context
*bld
,
972 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
973 const struct lp_type type
= bld
->type
;
977 assert(lp_check_value(type
, a
));
978 assert(lp_check_value(type
, b
));
988 if(a
== bld
->undef
|| b
== bld
->undef
)
991 if (!type
.floating
&& !type
.fixed
&& type
.norm
) {
992 struct lp_type wide_type
= lp_wider_type(type
);
993 LLVMValueRef al
, ah
, bl
, bh
, abl
, abh
, ab
;
995 lp_build_unpack2(bld
->gallivm
, type
, wide_type
, a
, &al
, &ah
);
996 lp_build_unpack2(bld
->gallivm
, type
, wide_type
, b
, &bl
, &bh
);
998 /* PMULLW, PSRLW, PADDW */
999 abl
= lp_build_mul_norm(bld
->gallivm
, wide_type
, al
, bl
);
1000 abh
= lp_build_mul_norm(bld
->gallivm
, wide_type
, ah
, bh
);
1002 ab
= lp_build_pack2(bld
->gallivm
, wide_type
, type
, abl
, abh
);
1008 shift
= lp_build_const_int_vec(bld
->gallivm
, type
, type
.width
/2);
1012 if(LLVMIsConstant(a
) && LLVMIsConstant(b
)) {
1014 res
= LLVMConstFMul(a
, b
);
1016 res
= LLVMConstMul(a
, b
);
1019 res
= LLVMConstAShr(res
, shift
);
1021 res
= LLVMConstLShr(res
, shift
);
1026 res
= LLVMBuildFMul(builder
, a
, b
, "");
1028 res
= LLVMBuildMul(builder
, a
, b
, "");
1031 res
= LLVMBuildAShr(builder
, res
, shift
, "");
1033 res
= LLVMBuildLShr(builder
, res
, shift
, "");
1043 lp_build_mad(struct lp_build_context
*bld
,
1048 const struct lp_type type
= bld
->type
;
1049 if (type
.floating
) {
1050 return lp_build_fmuladd(bld
->gallivm
->builder
, a
, b
, c
);
1052 return lp_build_add(bld
, lp_build_mul(bld
, a
, b
), c
);
1058 * Small vector x scale multiplication optimization.
1061 lp_build_mul_imm(struct lp_build_context
*bld
,
1065 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1066 LLVMValueRef factor
;
1068 assert(lp_check_value(bld
->type
, a
));
1077 return lp_build_negate(bld
, a
);
1079 if(b
== 2 && bld
->type
.floating
)
1080 return lp_build_add(bld
, a
, a
);
1082 if(util_is_power_of_two(b
)) {
1083 unsigned shift
= ffs(b
) - 1;
1085 if(bld
->type
.floating
) {
1088 * Power of two multiplication by directly manipulating the exponent.
1090 * XXX: This might not be always faster, it will introduce a small error
1091 * for multiplication by zero, and it will produce wrong results
1094 unsigned mantissa
= lp_mantissa(bld
->type
);
1095 factor
= lp_build_const_int_vec(bld
->gallivm
, bld
->type
, (unsigned long long)shift
<< mantissa
);
1096 a
= LLVMBuildBitCast(builder
, a
, lp_build_int_vec_type(bld
->type
), "");
1097 a
= LLVMBuildAdd(builder
, a
, factor
, "");
1098 a
= LLVMBuildBitCast(builder
, a
, lp_build_vec_type(bld
->gallivm
, bld
->type
), "");
1103 factor
= lp_build_const_vec(bld
->gallivm
, bld
->type
, shift
);
1104 return LLVMBuildShl(builder
, a
, factor
, "");
1108 factor
= lp_build_const_vec(bld
->gallivm
, bld
->type
, (double)b
);
1109 return lp_build_mul(bld
, a
, factor
);
1117 lp_build_div(struct lp_build_context
*bld
,
1121 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1122 const struct lp_type type
= bld
->type
;
1124 assert(lp_check_value(type
, a
));
1125 assert(lp_check_value(type
, b
));
1129 if(a
== bld
->one
&& type
.floating
)
1130 return lp_build_rcp(bld
, b
);
1135 if(a
== bld
->undef
|| b
== bld
->undef
)
1138 if(LLVMIsConstant(a
) && LLVMIsConstant(b
)) {
1140 return LLVMConstFDiv(a
, b
);
1142 return LLVMConstSDiv(a
, b
);
1144 return LLVMConstUDiv(a
, b
);
1147 if(((util_cpu_caps
.has_sse
&& type
.width
== 32 && type
.length
== 4) ||
1148 (util_cpu_caps
.has_avx
&& type
.width
== 32 && type
.length
== 8)) &&
1150 return lp_build_mul(bld
, a
, lp_build_rcp(bld
, b
));
1153 return LLVMBuildFDiv(builder
, a
, b
, "");
1155 return LLVMBuildSDiv(builder
, a
, b
, "");
1157 return LLVMBuildUDiv(builder
, a
, b
, "");
1162 * Linear interpolation helper.
1164 * @param normalized whether we are interpolating normalized values,
1165 * encoded in normalized integers, twice as wide.
1167 * @sa http://www.stereopsis.com/doubleblend.html
1169 static inline LLVMValueRef
1170 lp_build_lerp_simple(struct lp_build_context
*bld
,
1176 unsigned half_width
= bld
->type
.width
/2;
1177 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1181 assert(lp_check_value(bld
->type
, x
));
1182 assert(lp_check_value(bld
->type
, v0
));
1183 assert(lp_check_value(bld
->type
, v1
));
1185 delta
= lp_build_sub(bld
, v1
, v0
);
1187 if (bld
->type
.floating
) {
1189 return lp_build_mad(bld
, x
, delta
, v0
);
1192 if (flags
& LP_BLD_LERP_WIDE_NORMALIZED
) {
1193 if (!bld
->type
.sign
) {
1194 if (!(flags
& LP_BLD_LERP_PRESCALED_WEIGHTS
)) {
1196 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1197 * most-significant bit to the least-significant bit, so that
1198 * later we can just divide by 2**n instead of 2**n - 1.
1201 x
= lp_build_add(bld
, x
, lp_build_shr_imm(bld
, x
, half_width
- 1));
1204 /* (x * delta) >> n */
1205 res
= lp_build_mul(bld
, x
, delta
);
1206 res
= lp_build_shr_imm(bld
, res
, half_width
);
1209 * The rescaling trick above doesn't work for signed numbers, so
1210 * use the 2**n - 1 division approximation in lp_build_mul_norm
1213 assert(!(flags
& LP_BLD_LERP_PRESCALED_WEIGHTS
));
1214 res
= lp_build_mul_norm(bld
->gallivm
, bld
->type
, x
, delta
);
1217 assert(!(flags
& LP_BLD_LERP_PRESCALED_WEIGHTS
));
1218 res
= lp_build_mul(bld
, x
, delta
);
1221 if ((flags
& LP_BLD_LERP_WIDE_NORMALIZED
) && !bld
->type
.sign
) {
1223 * At this point both res and v0 only use the lower half of the bits,
1224 * the rest is zero. Instead of add / mask, do add with half wide type.
1226 struct lp_type narrow_type
;
1227 struct lp_build_context narrow_bld
;
1229 memset(&narrow_type
, 0, sizeof narrow_type
);
1230 narrow_type
.sign
= bld
->type
.sign
;
1231 narrow_type
.width
= bld
->type
.width
/2;
1232 narrow_type
.length
= bld
->type
.length
*2;
1234 lp_build_context_init(&narrow_bld
, bld
->gallivm
, narrow_type
);
1235 res
= LLVMBuildBitCast(builder
, res
, narrow_bld
.vec_type
, "");
1236 v0
= LLVMBuildBitCast(builder
, v0
, narrow_bld
.vec_type
, "");
1237 res
= lp_build_add(&narrow_bld
, v0
, res
);
1238 res
= LLVMBuildBitCast(builder
, res
, bld
->vec_type
, "");
1240 res
= lp_build_add(bld
, v0
, res
);
1242 if (bld
->type
.fixed
) {
1244 * We need to mask out the high order bits when lerping 8bit
1245 * normalized colors stored on 16bits
1247 /* XXX: This step is necessary for lerping 8bit colors stored on
1248 * 16bits, but it will be wrong for true fixed point use cases.
1249 * Basically we need a more powerful lp_type, capable of further
1250 * distinguishing the values interpretation from the value storage.
1252 LLVMValueRef low_bits
;
1253 low_bits
= lp_build_const_int_vec(bld
->gallivm
, bld
->type
, (1 << half_width
) - 1);
1254 res
= LLVMBuildAnd(builder
, res
, low_bits
, "");
1263 * Linear interpolation.
1266 lp_build_lerp(struct lp_build_context
*bld
,
1272 const struct lp_type type
= bld
->type
;
1275 assert(lp_check_value(type
, x
));
1276 assert(lp_check_value(type
, v0
));
1277 assert(lp_check_value(type
, v1
));
1279 assert(!(flags
& LP_BLD_LERP_WIDE_NORMALIZED
));
1282 struct lp_type wide_type
;
1283 struct lp_build_context wide_bld
;
1284 LLVMValueRef xl
, xh
, v0l
, v0h
, v1l
, v1h
, resl
, resh
;
1286 assert(type
.length
>= 2);
1289 * Create a wider integer type, enough to hold the
1290 * intermediate result of the multiplication.
1292 memset(&wide_type
, 0, sizeof wide_type
);
1293 wide_type
.sign
= type
.sign
;
1294 wide_type
.width
= type
.width
*2;
1295 wide_type
.length
= type
.length
/2;
1297 lp_build_context_init(&wide_bld
, bld
->gallivm
, wide_type
);
1299 lp_build_unpack2(bld
->gallivm
, type
, wide_type
, x
, &xl
, &xh
);
1300 lp_build_unpack2(bld
->gallivm
, type
, wide_type
, v0
, &v0l
, &v0h
);
1301 lp_build_unpack2(bld
->gallivm
, type
, wide_type
, v1
, &v1l
, &v1h
);
1307 flags
|= LP_BLD_LERP_WIDE_NORMALIZED
;
1309 resl
= lp_build_lerp_simple(&wide_bld
, xl
, v0l
, v1l
, flags
);
1310 resh
= lp_build_lerp_simple(&wide_bld
, xh
, v0h
, v1h
, flags
);
1312 res
= lp_build_pack2(bld
->gallivm
, wide_type
, type
, resl
, resh
);
1314 res
= lp_build_lerp_simple(bld
, x
, v0
, v1
, flags
);
1322 * Bilinear interpolation.
1324 * Values indices are in v_{yx}.
1327 lp_build_lerp_2d(struct lp_build_context
*bld
,
1336 LLVMValueRef v0
= lp_build_lerp(bld
, x
, v00
, v01
, flags
);
1337 LLVMValueRef v1
= lp_build_lerp(bld
, x
, v10
, v11
, flags
);
1338 return lp_build_lerp(bld
, y
, v0
, v1
, flags
);
1343 lp_build_lerp_3d(struct lp_build_context
*bld
,
1357 LLVMValueRef v0
= lp_build_lerp_2d(bld
, x
, y
, v000
, v001
, v010
, v011
, flags
);
1358 LLVMValueRef v1
= lp_build_lerp_2d(bld
, x
, y
, v100
, v101
, v110
, v111
, flags
);
1359 return lp_build_lerp(bld
, z
, v0
, v1
, flags
);
1364 * Generate min(a, b)
1365 * Do checks for special cases but not for nans.
1368 lp_build_min(struct lp_build_context
*bld
,
1372 assert(lp_check_value(bld
->type
, a
));
1373 assert(lp_check_value(bld
->type
, b
));
1375 if(a
== bld
->undef
|| b
== bld
->undef
)
1381 if (bld
->type
.norm
) {
1382 if (!bld
->type
.sign
) {
1383 if (a
== bld
->zero
|| b
== bld
->zero
) {
1393 return lp_build_min_simple(bld
, a
, b
, GALLIVM_NAN_BEHAVIOR_UNDEFINED
);
1398 * Generate min(a, b)
1399 * NaN's are handled according to the behavior specified by the
1400 * nan_behavior argument.
1403 lp_build_min_ext(struct lp_build_context
*bld
,
1406 enum gallivm_nan_behavior nan_behavior
)
1408 assert(lp_check_value(bld
->type
, a
));
1409 assert(lp_check_value(bld
->type
, b
));
1411 if(a
== bld
->undef
|| b
== bld
->undef
)
1417 if (bld
->type
.norm
) {
1418 if (!bld
->type
.sign
) {
1419 if (a
== bld
->zero
|| b
== bld
->zero
) {
1429 return lp_build_min_simple(bld
, a
, b
, nan_behavior
);
1433 * Generate max(a, b)
1434 * Do checks for special cases, but NaN behavior is undefined.
1437 lp_build_max(struct lp_build_context
*bld
,
1441 assert(lp_check_value(bld
->type
, a
));
1442 assert(lp_check_value(bld
->type
, b
));
1444 if(a
== bld
->undef
|| b
== bld
->undef
)
1450 if(bld
->type
.norm
) {
1451 if(a
== bld
->one
|| b
== bld
->one
)
1453 if (!bld
->type
.sign
) {
1454 if (a
== bld
->zero
) {
1457 if (b
== bld
->zero
) {
1463 return lp_build_max_simple(bld
, a
, b
, GALLIVM_NAN_BEHAVIOR_UNDEFINED
);
1468 * Generate max(a, b)
1469 * Checks for special cases.
1470 * NaN's are handled according to the behavior specified by the
1471 * nan_behavior argument.
1474 lp_build_max_ext(struct lp_build_context
*bld
,
1477 enum gallivm_nan_behavior nan_behavior
)
1479 assert(lp_check_value(bld
->type
, a
));
1480 assert(lp_check_value(bld
->type
, b
));
1482 if(a
== bld
->undef
|| b
== bld
->undef
)
1488 if(bld
->type
.norm
) {
1489 if(a
== bld
->one
|| b
== bld
->one
)
1491 if (!bld
->type
.sign
) {
1492 if (a
== bld
->zero
) {
1495 if (b
== bld
->zero
) {
1501 return lp_build_max_simple(bld
, a
, b
, nan_behavior
);
1505 * Generate clamp(a, min, max)
1506 * NaN behavior (for any of a, min, max) is undefined.
1507 * Do checks for special cases.
1510 lp_build_clamp(struct lp_build_context
*bld
,
1515 assert(lp_check_value(bld
->type
, a
));
1516 assert(lp_check_value(bld
->type
, min
));
1517 assert(lp_check_value(bld
->type
, max
));
1519 a
= lp_build_min(bld
, a
, max
);
1520 a
= lp_build_max(bld
, a
, min
);
1526 * Generate clamp(a, 0, 1)
1527 * A NaN will get converted to zero.
1530 lp_build_clamp_zero_one_nanzero(struct lp_build_context
*bld
,
1533 a
= lp_build_max_ext(bld
, a
, bld
->zero
, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN
);
1534 a
= lp_build_min(bld
, a
, bld
->one
);
1543 lp_build_abs(struct lp_build_context
*bld
,
1546 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1547 const struct lp_type type
= bld
->type
;
1548 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
1550 assert(lp_check_value(type
, a
));
1556 if (0x0306 <= HAVE_LLVM
&& HAVE_LLVM
< 0x0309) {
1557 /* Workaround llvm.org/PR27332 */
1558 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
1559 unsigned long long absMask
= ~(1ULL << (type
.width
- 1));
1560 LLVMValueRef mask
= lp_build_const_int_vec(bld
->gallivm
, type
, ((unsigned long long) absMask
));
1561 a
= LLVMBuildBitCast(builder
, a
, int_vec_type
, "");
1562 a
= LLVMBuildAnd(builder
, a
, mask
, "");
1563 a
= LLVMBuildBitCast(builder
, a
, vec_type
, "");
1567 lp_format_intrinsic(intrinsic
, sizeof intrinsic
, "llvm.fabs", vec_type
);
1568 return lp_build_intrinsic_unary(builder
, intrinsic
, vec_type
, a
);
1572 if(type
.width
*type
.length
== 128 && util_cpu_caps
.has_ssse3
) {
1573 switch(type
.width
) {
1575 return lp_build_intrinsic_unary(builder
, "llvm.x86.ssse3.pabs.b.128", vec_type
, a
);
1577 return lp_build_intrinsic_unary(builder
, "llvm.x86.ssse3.pabs.w.128", vec_type
, a
);
1579 return lp_build_intrinsic_unary(builder
, "llvm.x86.ssse3.pabs.d.128", vec_type
, a
);
1582 else if (type
.width
*type
.length
== 256 && util_cpu_caps
.has_ssse3
&&
1583 (gallivm_debug
& GALLIVM_DEBUG_PERF
) &&
1584 (type
.width
== 8 || type
.width
== 16 || type
.width
== 32)) {
1585 debug_printf("%s: inefficient code, should split vectors manually\n",
1589 return lp_build_max(bld
, a
, LLVMBuildNeg(builder
, a
, ""));
1594 lp_build_negate(struct lp_build_context
*bld
,
1597 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1599 assert(lp_check_value(bld
->type
, a
));
1601 if (bld
->type
.floating
)
1602 a
= LLVMBuildFNeg(builder
, a
, "");
1604 a
= LLVMBuildNeg(builder
, a
, "");
1610 /** Return -1, 0 or +1 depending on the sign of a */
1612 lp_build_sgn(struct lp_build_context
*bld
,
1615 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1616 const struct lp_type type
= bld
->type
;
1620 assert(lp_check_value(type
, a
));
1622 /* Handle non-zero case */
1624 /* if not zero then sign must be positive */
1627 else if(type
.floating
) {
1628 LLVMTypeRef vec_type
;
1629 LLVMTypeRef int_type
;
1633 unsigned long long maskBit
= (unsigned long long)1 << (type
.width
- 1);
1635 int_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
1636 vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
1637 mask
= lp_build_const_int_vec(bld
->gallivm
, type
, maskBit
);
1639 /* Take the sign bit and add it to 1 constant */
1640 sign
= LLVMBuildBitCast(builder
, a
, int_type
, "");
1641 sign
= LLVMBuildAnd(builder
, sign
, mask
, "");
1642 one
= LLVMConstBitCast(bld
->one
, int_type
);
1643 res
= LLVMBuildOr(builder
, sign
, one
, "");
1644 res
= LLVMBuildBitCast(builder
, res
, vec_type
, "");
1648 /* signed int/norm/fixed point */
1649 /* could use psign with sse3 and appropriate vectors here */
1650 LLVMValueRef minus_one
= lp_build_const_vec(bld
->gallivm
, type
, -1.0);
1651 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, bld
->zero
);
1652 res
= lp_build_select(bld
, cond
, bld
->one
, minus_one
);
1656 cond
= lp_build_cmp(bld
, PIPE_FUNC_EQUAL
, a
, bld
->zero
);
1657 res
= lp_build_select(bld
, cond
, bld
->zero
, res
);
1664 * Set the sign of float vector 'a' according to 'sign'.
1665 * If sign==0, return abs(a).
1666 * If sign==1, return -abs(a);
1667 * Other values for sign produce undefined results.
1670 lp_build_set_sign(struct lp_build_context
*bld
,
1671 LLVMValueRef a
, LLVMValueRef sign
)
1673 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1674 const struct lp_type type
= bld
->type
;
1675 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
1676 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
1677 LLVMValueRef shift
= lp_build_const_int_vec(bld
->gallivm
, type
, type
.width
- 1);
1678 LLVMValueRef mask
= lp_build_const_int_vec(bld
->gallivm
, type
,
1679 ~((unsigned long long) 1 << (type
.width
- 1)));
1680 LLVMValueRef val
, res
;
1682 assert(type
.floating
);
1683 assert(lp_check_value(type
, a
));
1685 /* val = reinterpret_cast<int>(a) */
1686 val
= LLVMBuildBitCast(builder
, a
, int_vec_type
, "");
1687 /* val = val & mask */
1688 val
= LLVMBuildAnd(builder
, val
, mask
, "");
1689 /* sign = sign << shift */
1690 sign
= LLVMBuildShl(builder
, sign
, shift
, "");
1691 /* res = val | sign */
1692 res
= LLVMBuildOr(builder
, val
, sign
, "");
1693 /* res = reinterpret_cast<float>(res) */
1694 res
= LLVMBuildBitCast(builder
, res
, vec_type
, "");
1701 * Convert vector of (or scalar) int to vector of (or scalar) float.
1704 lp_build_int_to_float(struct lp_build_context
*bld
,
1707 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1708 const struct lp_type type
= bld
->type
;
1709 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
1711 assert(type
.floating
);
1713 return LLVMBuildSIToFP(builder
, a
, vec_type
, "");
1717 arch_rounding_available(const struct lp_type type
)
1719 if ((util_cpu_caps
.has_sse4_1
&&
1720 (type
.length
== 1 || type
.width
*type
.length
== 128)) ||
1721 (util_cpu_caps
.has_avx
&& type
.width
*type
.length
== 256))
1723 else if ((util_cpu_caps
.has_altivec
&&
1724 (type
.width
== 32 && type
.length
== 4)))
/** Rounding modes accepted by the lp_build_round_* helpers. */
enum lp_build_round_mode
{
   LP_BUILD_ROUND_NEAREST  = 0,   /* to nearest integer */
   LP_BUILD_ROUND_FLOOR    = 1,   /* toward negative infinity */
   LP_BUILD_ROUND_CEIL     = 2,   /* toward positive infinity */
   LP_BUILD_ROUND_TRUNCATE = 3    /* toward zero */
};
1738 static inline LLVMValueRef
1739 lp_build_iround_nearest_sse2(struct lp_build_context
*bld
,
1742 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1743 const struct lp_type type
= bld
->type
;
1744 LLVMTypeRef i32t
= LLVMInt32TypeInContext(bld
->gallivm
->context
);
1745 LLVMTypeRef ret_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
1746 const char *intrinsic
;
1749 assert(type
.floating
);
1750 /* using the double precision conversions is a bit more complicated */
1751 assert(type
.width
== 32);
1753 assert(lp_check_value(type
, a
));
1754 assert(util_cpu_caps
.has_sse2
);
1756 /* This is relying on MXCSR rounding mode, which should always be nearest. */
1757 if (type
.length
== 1) {
1758 LLVMTypeRef vec_type
;
1761 LLVMValueRef index0
= LLVMConstInt(i32t
, 0, 0);
1763 vec_type
= LLVMVectorType(bld
->elem_type
, 4);
1765 intrinsic
= "llvm.x86.sse.cvtss2si";
1767 undef
= LLVMGetUndef(vec_type
);
1769 arg
= LLVMBuildInsertElement(builder
, undef
, a
, index0
, "");
1771 res
= lp_build_intrinsic_unary(builder
, intrinsic
,
1775 if (type
.width
* type
.length
== 128) {
1776 intrinsic
= "llvm.x86.sse2.cvtps2dq";
1779 assert(type
.width
*type
.length
== 256);
1780 assert(util_cpu_caps
.has_avx
);
1782 intrinsic
= "llvm.x86.avx.cvt.ps2dq.256";
1784 res
= lp_build_intrinsic_unary(builder
, intrinsic
,
1794 static inline LLVMValueRef
1795 lp_build_round_altivec(struct lp_build_context
*bld
,
1797 enum lp_build_round_mode mode
)
1799 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1800 const struct lp_type type
= bld
->type
;
1801 const char *intrinsic
= NULL
;
1803 assert(type
.floating
);
1805 assert(lp_check_value(type
, a
));
1806 assert(util_cpu_caps
.has_altivec
);
1811 case LP_BUILD_ROUND_NEAREST
:
1812 intrinsic
= "llvm.ppc.altivec.vrfin";
1814 case LP_BUILD_ROUND_FLOOR
:
1815 intrinsic
= "llvm.ppc.altivec.vrfim";
1817 case LP_BUILD_ROUND_CEIL
:
1818 intrinsic
= "llvm.ppc.altivec.vrfip";
1820 case LP_BUILD_ROUND_TRUNCATE
:
1821 intrinsic
= "llvm.ppc.altivec.vrfiz";
1825 return lp_build_intrinsic_unary(builder
, intrinsic
, bld
->vec_type
, a
);
1828 static inline LLVMValueRef
1829 lp_build_round_arch(struct lp_build_context
*bld
,
1831 enum lp_build_round_mode mode
)
1833 if (util_cpu_caps
.has_sse4_1
) {
1834 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1835 const struct lp_type type
= bld
->type
;
1836 const char *intrinsic_root
;
1839 assert(type
.floating
);
1840 assert(lp_check_value(type
, a
));
1844 case LP_BUILD_ROUND_NEAREST
:
1845 intrinsic_root
= "llvm.nearbyint";
1847 case LP_BUILD_ROUND_FLOOR
:
1848 intrinsic_root
= "llvm.floor";
1850 case LP_BUILD_ROUND_CEIL
:
1851 intrinsic_root
= "llvm.ceil";
1853 case LP_BUILD_ROUND_TRUNCATE
:
1854 intrinsic_root
= "llvm.trunc";
1858 lp_format_intrinsic(intrinsic
, sizeof intrinsic
, intrinsic_root
, bld
->vec_type
);
1859 return lp_build_intrinsic_unary(builder
, intrinsic
, bld
->vec_type
, a
);
1861 else /* (util_cpu_caps.has_altivec) */
1862 return lp_build_round_altivec(bld
, a
, mode
);
1866 * Return the integer part of a float (vector) value (== round toward zero).
1867 * The returned value is a float (vector).
1868 * Ex: trunc(-1.5) = -1.0
1871 lp_build_trunc(struct lp_build_context
*bld
,
1874 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1875 const struct lp_type type
= bld
->type
;
1877 assert(type
.floating
);
1878 assert(lp_check_value(type
, a
));
1880 if (arch_rounding_available(type
)) {
1881 return lp_build_round_arch(bld
, a
, LP_BUILD_ROUND_TRUNCATE
);
1884 const struct lp_type type
= bld
->type
;
1885 struct lp_type inttype
;
1886 struct lp_build_context intbld
;
1887 LLVMValueRef cmpval
= lp_build_const_vec(bld
->gallivm
, type
, 1<<24);
1888 LLVMValueRef trunc
, res
, anosign
, mask
;
1889 LLVMTypeRef int_vec_type
= bld
->int_vec_type
;
1890 LLVMTypeRef vec_type
= bld
->vec_type
;
1892 assert(type
.width
== 32); /* might want to handle doubles at some point */
1895 inttype
.floating
= 0;
1896 lp_build_context_init(&intbld
, bld
->gallivm
, inttype
);
1898 /* round by truncation */
1899 trunc
= LLVMBuildFPToSI(builder
, a
, int_vec_type
, "");
1900 res
= LLVMBuildSIToFP(builder
, trunc
, vec_type
, "floor.trunc");
1902 /* mask out sign bit */
1903 anosign
= lp_build_abs(bld
, a
);
1905 * mask out all values if anosign > 2^24
1906 * This should work both for large ints (all rounding is no-op for them
1907 * because such floats are always exact) as well as special cases like
1908 * NaNs, Infs (taking advantage of the fact they use max exponent).
1909 * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
1911 anosign
= LLVMBuildBitCast(builder
, anosign
, int_vec_type
, "");
1912 cmpval
= LLVMBuildBitCast(builder
, cmpval
, int_vec_type
, "");
1913 mask
= lp_build_cmp(&intbld
, PIPE_FUNC_GREATER
, anosign
, cmpval
);
1914 return lp_build_select(bld
, mask
, a
, res
);
1920 * Return float (vector) rounded to nearest integer (vector). The returned
1921 * value is a float (vector).
1922 * Ex: round(0.9) = 1.0
1923 * Ex: round(-1.5) = -2.0
1926 lp_build_round(struct lp_build_context
*bld
,
1929 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1930 const struct lp_type type
= bld
->type
;
1932 assert(type
.floating
);
1933 assert(lp_check_value(type
, a
));
1935 if (arch_rounding_available(type
)) {
1936 return lp_build_round_arch(bld
, a
, LP_BUILD_ROUND_NEAREST
);
1939 const struct lp_type type
= bld
->type
;
1940 struct lp_type inttype
;
1941 struct lp_build_context intbld
;
1942 LLVMValueRef cmpval
= lp_build_const_vec(bld
->gallivm
, type
, 1<<24);
1943 LLVMValueRef res
, anosign
, mask
;
1944 LLVMTypeRef int_vec_type
= bld
->int_vec_type
;
1945 LLVMTypeRef vec_type
= bld
->vec_type
;
1947 assert(type
.width
== 32); /* might want to handle doubles at some point */
1950 inttype
.floating
= 0;
1951 lp_build_context_init(&intbld
, bld
->gallivm
, inttype
);
1953 res
= lp_build_iround(bld
, a
);
1954 res
= LLVMBuildSIToFP(builder
, res
, vec_type
, "");
1956 /* mask out sign bit */
1957 anosign
= lp_build_abs(bld
, a
);
1959 * mask out all values if anosign > 2^24
1960 * This should work both for large ints (all rounding is no-op for them
1961 * because such floats are always exact) as well as special cases like
1962 * NaNs, Infs (taking advantage of the fact they use max exponent).
1963 * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
1965 anosign
= LLVMBuildBitCast(builder
, anosign
, int_vec_type
, "");
1966 cmpval
= LLVMBuildBitCast(builder
, cmpval
, int_vec_type
, "");
1967 mask
= lp_build_cmp(&intbld
, PIPE_FUNC_GREATER
, anosign
, cmpval
);
1968 return lp_build_select(bld
, mask
, a
, res
);
1974 * Return floor of float (vector), result is a float (vector)
1975 * Ex: floor(1.1) = 1.0
1976 * Ex: floor(-1.1) = -2.0
1979 lp_build_floor(struct lp_build_context
*bld
,
1982 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1983 const struct lp_type type
= bld
->type
;
1985 assert(type
.floating
);
1986 assert(lp_check_value(type
, a
));
1988 if (arch_rounding_available(type
)) {
1989 return lp_build_round_arch(bld
, a
, LP_BUILD_ROUND_FLOOR
);
1992 const struct lp_type type
= bld
->type
;
1993 struct lp_type inttype
;
1994 struct lp_build_context intbld
;
1995 LLVMValueRef cmpval
= lp_build_const_vec(bld
->gallivm
, type
, 1<<24);
1996 LLVMValueRef trunc
, res
, anosign
, mask
;
1997 LLVMTypeRef int_vec_type
= bld
->int_vec_type
;
1998 LLVMTypeRef vec_type
= bld
->vec_type
;
2000 if (type
.width
!= 32) {
2002 lp_format_intrinsic(intrinsic
, sizeof intrinsic
, "llvm.floor", vec_type
);
2003 return lp_build_intrinsic_unary(builder
, intrinsic
, vec_type
, a
);
2006 assert(type
.width
== 32); /* might want to handle doubles at some point */
2009 inttype
.floating
= 0;
2010 lp_build_context_init(&intbld
, bld
->gallivm
, inttype
);
2012 /* round by truncation */
2013 trunc
= LLVMBuildFPToSI(builder
, a
, int_vec_type
, "");
2014 res
= LLVMBuildSIToFP(builder
, trunc
, vec_type
, "floor.trunc");
2020 * fix values if rounding is wrong (for non-special cases)
2021 * - this is the case if trunc > a
2023 mask
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, res
, a
);
2024 /* tmp = trunc > a ? 1.0 : 0.0 */
2025 tmp
= LLVMBuildBitCast(builder
, bld
->one
, int_vec_type
, "");
2026 tmp
= lp_build_and(&intbld
, mask
, tmp
);
2027 tmp
= LLVMBuildBitCast(builder
, tmp
, vec_type
, "");
2028 res
= lp_build_sub(bld
, res
, tmp
);
2031 /* mask out sign bit */
2032 anosign
= lp_build_abs(bld
, a
);
2034 * mask out all values if anosign > 2^24
2035 * This should work both for large ints (all rounding is no-op for them
2036 * because such floats are always exact) as well as special cases like
2037 * NaNs, Infs (taking advantage of the fact they use max exponent).
2038 * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
2040 anosign
= LLVMBuildBitCast(builder
, anosign
, int_vec_type
, "");
2041 cmpval
= LLVMBuildBitCast(builder
, cmpval
, int_vec_type
, "");
2042 mask
= lp_build_cmp(&intbld
, PIPE_FUNC_GREATER
, anosign
, cmpval
);
2043 return lp_build_select(bld
, mask
, a
, res
);
2049 * Return ceiling of float (vector), returning float (vector).
2050 * Ex: ceil( 1.1) = 2.0
2051 * Ex: ceil(-1.1) = -1.0
2054 lp_build_ceil(struct lp_build_context
*bld
,
2057 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2058 const struct lp_type type
= bld
->type
;
2060 assert(type
.floating
);
2061 assert(lp_check_value(type
, a
));
2063 if (arch_rounding_available(type
)) {
2064 return lp_build_round_arch(bld
, a
, LP_BUILD_ROUND_CEIL
);
2067 const struct lp_type type
= bld
->type
;
2068 struct lp_type inttype
;
2069 struct lp_build_context intbld
;
2070 LLVMValueRef cmpval
= lp_build_const_vec(bld
->gallivm
, type
, 1<<24);
2071 LLVMValueRef trunc
, res
, anosign
, mask
, tmp
;
2072 LLVMTypeRef int_vec_type
= bld
->int_vec_type
;
2073 LLVMTypeRef vec_type
= bld
->vec_type
;
2075 if (type
.width
!= 32) {
2077 lp_format_intrinsic(intrinsic
, sizeof intrinsic
, "llvm.ceil", vec_type
);
2078 return lp_build_intrinsic_unary(builder
, intrinsic
, vec_type
, a
);
2081 assert(type
.width
== 32); /* might want to handle doubles at some point */
2084 inttype
.floating
= 0;
2085 lp_build_context_init(&intbld
, bld
->gallivm
, inttype
);
2087 /* round by truncation */
2088 trunc
= LLVMBuildFPToSI(builder
, a
, int_vec_type
, "");
2089 trunc
= LLVMBuildSIToFP(builder
, trunc
, vec_type
, "ceil.trunc");
2092 * fix values if rounding is wrong (for non-special cases)
2093 * - this is the case if trunc < a
2095 mask
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, trunc
, a
);
2096 /* tmp = trunc < a ? 1.0 : 0.0 */
2097 tmp
= LLVMBuildBitCast(builder
, bld
->one
, int_vec_type
, "");
2098 tmp
= lp_build_and(&intbld
, mask
, tmp
);
2099 tmp
= LLVMBuildBitCast(builder
, tmp
, vec_type
, "");
2100 res
= lp_build_add(bld
, trunc
, tmp
);
2102 /* mask out sign bit */
2103 anosign
= lp_build_abs(bld
, a
);
2105 * mask out all values if anosign > 2^24
2106 * This should work both for large ints (all rounding is no-op for them
2107 * because such floats are always exact) as well as special cases like
2108 * NaNs, Infs (taking advantage of the fact they use max exponent).
2109 * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
2111 anosign
= LLVMBuildBitCast(builder
, anosign
, int_vec_type
, "");
2112 cmpval
= LLVMBuildBitCast(builder
, cmpval
, int_vec_type
, "");
2113 mask
= lp_build_cmp(&intbld
, PIPE_FUNC_GREATER
, anosign
, cmpval
);
2114 return lp_build_select(bld
, mask
, a
, res
);
2120 * Return fractional part of 'a' computed as a - floor(a)
2121 * Typically used in texture coord arithmetic.
2124 lp_build_fract(struct lp_build_context
*bld
,
2127 assert(bld
->type
.floating
);
2128 return lp_build_sub(bld
, a
, lp_build_floor(bld
, a
));
2133 * Prevent returning 1.0 for very small negative values of 'a' by clamping
2134 * against 0.99999(9). (Will also return that value for NaNs.)
2136 static inline LLVMValueRef
2137 clamp_fract(struct lp_build_context
*bld
, LLVMValueRef fract
)
2141 /* this is the largest number smaller than 1.0 representable as float */
2142 max
= lp_build_const_vec(bld
->gallivm
, bld
->type
,
2143 1.0 - 1.0/(1LL << (lp_mantissa(bld
->type
) + 1)));
2144 return lp_build_min_ext(bld
, fract
, max
,
2145 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN
);
2150 * Same as lp_build_fract, but guarantees that the result is always smaller
2151 * than one. Will also return the smaller-than-one value for infs, NaNs.
2154 lp_build_fract_safe(struct lp_build_context
*bld
,
2157 return clamp_fract(bld
, lp_build_fract(bld
, a
));
2162 * Return the integer part of a float (vector) value (== round toward zero).
2163 * The returned value is an integer (vector).
2164 * Ex: itrunc(-1.5) = -1
2167 lp_build_itrunc(struct lp_build_context
*bld
,
2170 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2171 const struct lp_type type
= bld
->type
;
2172 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
2174 assert(type
.floating
);
2175 assert(lp_check_value(type
, a
));
2177 return LLVMBuildFPToSI(builder
, a
, int_vec_type
, "");
2182 * Return float (vector) rounded to nearest integer (vector). The returned
2183 * value is an integer (vector).
2184 * Ex: iround(0.9) = 1
2185 * Ex: iround(-1.5) = -2
2188 lp_build_iround(struct lp_build_context
*bld
,
2191 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2192 const struct lp_type type
= bld
->type
;
2193 LLVMTypeRef int_vec_type
= bld
->int_vec_type
;
2196 assert(type
.floating
);
2198 assert(lp_check_value(type
, a
));
2200 if ((util_cpu_caps
.has_sse2
&&
2201 ((type
.width
== 32) && (type
.length
== 1 || type
.length
== 4))) ||
2202 (util_cpu_caps
.has_avx
&& type
.width
== 32 && type
.length
== 8)) {
2203 return lp_build_iround_nearest_sse2(bld
, a
);
2205 if (arch_rounding_available(type
)) {
2206 res
= lp_build_round_arch(bld
, a
, LP_BUILD_ROUND_NEAREST
);
2211 half
= lp_build_const_vec(bld
->gallivm
, type
, 0.5);
2214 LLVMTypeRef vec_type
= bld
->vec_type
;
2215 LLVMValueRef mask
= lp_build_const_int_vec(bld
->gallivm
, type
,
2216 (unsigned long long)1 << (type
.width
- 1));
2220 sign
= LLVMBuildBitCast(builder
, a
, int_vec_type
, "");
2221 sign
= LLVMBuildAnd(builder
, sign
, mask
, "");
2224 half
= LLVMBuildBitCast(builder
, half
, int_vec_type
, "");
2225 half
= LLVMBuildOr(builder
, sign
, half
, "");
2226 half
= LLVMBuildBitCast(builder
, half
, vec_type
, "");
2229 res
= LLVMBuildFAdd(builder
, a
, half
, "");
2232 res
= LLVMBuildFPToSI(builder
, res
, int_vec_type
, "");
2239 * Return floor of float (vector), result is an int (vector)
2240 * Ex: ifloor(1.1) = 1.0
2241 * Ex: ifloor(-1.1) = -2.0
2244 lp_build_ifloor(struct lp_build_context
*bld
,
2247 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2248 const struct lp_type type
= bld
->type
;
2249 LLVMTypeRef int_vec_type
= bld
->int_vec_type
;
2252 assert(type
.floating
);
2253 assert(lp_check_value(type
, a
));
2257 if (arch_rounding_available(type
)) {
2258 res
= lp_build_round_arch(bld
, a
, LP_BUILD_ROUND_FLOOR
);
2261 struct lp_type inttype
;
2262 struct lp_build_context intbld
;
2263 LLVMValueRef trunc
, itrunc
, mask
;
2265 assert(type
.floating
);
2266 assert(lp_check_value(type
, a
));
2269 inttype
.floating
= 0;
2270 lp_build_context_init(&intbld
, bld
->gallivm
, inttype
);
2272 /* round by truncation */
2273 itrunc
= LLVMBuildFPToSI(builder
, a
, int_vec_type
, "");
2274 trunc
= LLVMBuildSIToFP(builder
, itrunc
, bld
->vec_type
, "ifloor.trunc");
2277 * fix values if rounding is wrong (for non-special cases)
2278 * - this is the case if trunc > a
2279 * The results of doing this with NaNs, very large values etc.
2280 * are undefined but this seems to be the case anyway.
2282 mask
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, trunc
, a
);
2283 /* cheapie minus one with mask since the mask is minus one / zero */
2284 return lp_build_add(&intbld
, itrunc
, mask
);
2288 /* round to nearest (toward zero) */
2289 res
= LLVMBuildFPToSI(builder
, res
, int_vec_type
, "ifloor.res");
2296 * Return ceiling of float (vector), returning int (vector).
2297 * Ex: iceil( 1.1) = 2
2298 * Ex: iceil(-1.1) = -1
2301 lp_build_iceil(struct lp_build_context
*bld
,
2304 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2305 const struct lp_type type
= bld
->type
;
2306 LLVMTypeRef int_vec_type
= bld
->int_vec_type
;
2309 assert(type
.floating
);
2310 assert(lp_check_value(type
, a
));
2312 if (arch_rounding_available(type
)) {
2313 res
= lp_build_round_arch(bld
, a
, LP_BUILD_ROUND_CEIL
);
2316 struct lp_type inttype
;
2317 struct lp_build_context intbld
;
2318 LLVMValueRef trunc
, itrunc
, mask
;
2320 assert(type
.floating
);
2321 assert(lp_check_value(type
, a
));
2324 inttype
.floating
= 0;
2325 lp_build_context_init(&intbld
, bld
->gallivm
, inttype
);
2327 /* round by truncation */
2328 itrunc
= LLVMBuildFPToSI(builder
, a
, int_vec_type
, "");
2329 trunc
= LLVMBuildSIToFP(builder
, itrunc
, bld
->vec_type
, "iceil.trunc");
2332 * fix values if rounding is wrong (for non-special cases)
2333 * - this is the case if trunc < a
2334 * The results of doing this with NaNs, very large values etc.
2335 * are undefined but this seems to be the case anyway.
2337 mask
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, trunc
, a
);
2338 /* cheapie plus one with mask since the mask is minus one / zero */
2339 return lp_build_sub(&intbld
, itrunc
, mask
);
2342 /* round to nearest (toward zero) */
2343 res
= LLVMBuildFPToSI(builder
, res
, int_vec_type
, "iceil.res");
2350 * Combined ifloor() & fract().
2352 * Preferred to calling the functions separately, as it will ensure that the
2353 * strategy (floor() vs ifloor()) that results in less redundant work is used.
2356 lp_build_ifloor_fract(struct lp_build_context
*bld
,
2358 LLVMValueRef
*out_ipart
,
2359 LLVMValueRef
*out_fpart
)
2361 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2362 const struct lp_type type
= bld
->type
;
2365 assert(type
.floating
);
2366 assert(lp_check_value(type
, a
));
2368 if (arch_rounding_available(type
)) {
2370 * floor() is easier.
2373 ipart
= lp_build_floor(bld
, a
);
2374 *out_fpart
= LLVMBuildFSub(builder
, a
, ipart
, "fpart");
2375 *out_ipart
= LLVMBuildFPToSI(builder
, ipart
, bld
->int_vec_type
, "ipart");
2379 * ifloor() is easier.
2382 *out_ipart
= lp_build_ifloor(bld
, a
);
2383 ipart
= LLVMBuildSIToFP(builder
, *out_ipart
, bld
->vec_type
, "ipart");
2384 *out_fpart
= LLVMBuildFSub(builder
, a
, ipart
, "fpart");
2390 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2391 * always smaller than one.
2394 lp_build_ifloor_fract_safe(struct lp_build_context
*bld
,
2396 LLVMValueRef
*out_ipart
,
2397 LLVMValueRef
*out_fpart
)
2399 lp_build_ifloor_fract(bld
, a
, out_ipart
, out_fpart
);
2400 *out_fpart
= clamp_fract(bld
, *out_fpart
);
2405 lp_build_sqrt(struct lp_build_context
*bld
,
2408 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2409 const struct lp_type type
= bld
->type
;
2410 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
2413 assert(lp_check_value(type
, a
));
2415 assert(type
.floating
);
2416 lp_format_intrinsic(intrinsic
, sizeof intrinsic
, "llvm.sqrt", vec_type
);
2418 return lp_build_intrinsic_unary(builder
, intrinsic
, vec_type
, a
);
2423 * Do one Newton-Raphson step to improve reciprocate precision:
2425 * x_{i+1} = x_i * (2 - a * x_i)
2427 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2428 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
2429 * such as Google Earth, which does RCP(RSQRT(0.0) when drawing the Earth's
2430 * halo. It would be necessary to clamp the argument to prevent this.
2433 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2434 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2436 static inline LLVMValueRef
2437 lp_build_rcp_refine(struct lp_build_context
*bld
,
2441 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2442 LLVMValueRef two
= lp_build_const_vec(bld
->gallivm
, bld
->type
, 2.0);
2445 res
= LLVMBuildFMul(builder
, a
, rcp_a
, "");
2446 res
= LLVMBuildFSub(builder
, two
, res
, "");
2447 res
= LLVMBuildFMul(builder
, rcp_a
, res
, "");
/*
 * lp_build_rcp: reciprocal 1/a for floating point types.
 *
 * - A constant a is folded at build time with LLVMConstFDiv(bld->one, a).
 * - The hardware estimate path (llvm.x86.sse.rcp.ps for 4 x 32 bit with
 *   SSE, llvm.x86.avx.rcp.ps.256 for 8 x 32 bit with AVX) is kept in the
 *   source but switched off by the leading FALSE in its condition; with
 *   num_iterations set to 0 it would apply no lp_build_rcp_refine steps.
 * - The remaining path emits an FDiv of bld->one by a.
 */
2454 lp_build_rcp(struct lp_build_context
*bld
,
2457 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2458 const struct lp_type type
= bld
->type
;
2460 assert(lp_check_value(type
, a
));
2469 assert(type
.floating
);
2471 if(LLVMIsConstant(a
))
2472 return LLVMConstFDiv(bld
->one
, a
);
2475 * We don't use RCPPS because:
2476 * - it only has 10bits of precision
2477 * - it doesn't even get the reciprocate of 1.0 exactly
2478 * - doing Newton-Rapshon steps yields wrong (NaN) values for 0.0 or Inf
2479 * - for recent processors the benefit over DIVPS is marginal, a case
2482 * We could still use it on certain processors if benchmarks show that the
2483 * RCPPS plus necessary workarounds are still preferrable to DIVPS; or for
2484 * particular uses that require less workarounds.
2487 if (FALSE
&& ((util_cpu_caps
.has_sse
&& type
.width
== 32 && type
.length
== 4) ||
2488 (util_cpu_caps
.has_avx
&& type
.width
== 32 && type
.length
== 8))){
2489 const unsigned num_iterations
= 0;
2492 const char *intrinsic
= NULL
;
2494 if (type
.length
== 4) {
2495 intrinsic
= "llvm.x86.sse.rcp.ps";
2498 intrinsic
= "llvm.x86.avx.rcp.ps.256";
2501 res
= lp_build_intrinsic_unary(builder
, intrinsic
, bld
->vec_type
, a
);
2503 for (i
= 0; i
< num_iterations
; ++i
) {
2504 res
= lp_build_rcp_refine(bld
, a
, res
);
2510 return LLVMBuildFDiv(builder
, bld
->one
, a
, "");
2515 * Do one Newton-Raphson step to improve rsqrt precision:
2517 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2519 * See also Intel 64 and IA-32 Architectures Optimization Manual.
/*
 * lp_build_rsqrt_refine: one Newton-Raphson refinement of a reciprocal
 * square root estimate.
 *
 * Given a and an estimate rsqrt_a of 1/sqrt(a), builds
 *    0.5 * (rsqrt_a * (3.0 - a * (rsqrt_a * rsqrt_a)))
 * in exactly that evaluation order (FMul, FMul, FSub, FMul, FMul).
 */
2521 static inline LLVMValueRef
2522 lp_build_rsqrt_refine(struct lp_build_context
*bld
,
2524 LLVMValueRef rsqrt_a
)
2526 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2527 LLVMValueRef half
= lp_build_const_vec(bld
->gallivm
, bld
->type
, 0.5);
2528 LLVMValueRef three
= lp_build_const_vec(bld
->gallivm
, bld
->type
, 3.0);
2531 res
= LLVMBuildFMul(builder
, rsqrt_a
, rsqrt_a
, "");
2532 res
= LLVMBuildFMul(builder
, a
, res
, "");
2533 res
= LLVMBuildFSub(builder
, three
, res
, "");
2534 res
= LLVMBuildFMul(builder
, rsqrt_a
, res
, "");
2535 res
= LLVMBuildFMul(builder
, half
, res
, "");
2542 * Generate 1/sqrt(a).
2543 * Result is undefined for values < 0, infinity for +0.
/*
 * lp_build_rsqrt: 1/sqrt(a) for floating point types.
 *
 * The path built on lp_build_fast_rsqrt is disabled by the literal 0 in
 * its condition. As written it would run one lp_build_rsqrt_refine step
 * and then patch the special inputs with selects, applied in this order:
 *    a <  FLT_MIN   -> +infinity
 *    a == +infinity -> zero
 *    a == 1.0       -> 1.0
 * The live path returns lp_build_rcp(lp_build_sqrt(a)).
 */
2546 lp_build_rsqrt(struct lp_build_context
*bld
,
2549 const struct lp_type type
= bld
->type
;
2551 assert(lp_check_value(type
, a
));
2553 assert(type
.floating
);
2556 * This should be faster but all denormals will end up as infinity.
2558 if (0 && lp_build_fast_rsqrt_available(type
)) {
2559 const unsigned num_iterations
= 1;
2563 /* rsqrt(1.0) != 1.0 here */
2564 res
= lp_build_fast_rsqrt(bld
, a
);
2566 if (num_iterations
) {
2568 * Newton-Raphson will result in NaN instead of infinity for zero,
2569 * and NaN instead of zero for infinity.
2570 * Also, need to ensure rsqrt(1.0) == 1.0.
2571 * All numbers smaller than FLT_MIN will result in +infinity
2572 * (rsqrtps treats all denormals as zero).
2575 LLVMValueRef flt_min
= lp_build_const_vec(bld
->gallivm
, type
, FLT_MIN
);
2576 LLVMValueRef inf
= lp_build_const_vec(bld
->gallivm
, type
, INFINITY
);
2578 for (i
= 0; i
< num_iterations
; ++i
) {
2579 res
= lp_build_rsqrt_refine(bld
, a
, res
);
2581 cmp
= lp_build_compare(bld
->gallivm
, type
, PIPE_FUNC_LESS
, a
, flt_min
);
2582 res
= lp_build_select(bld
, cmp
, inf
, res
);
2583 cmp
= lp_build_compare(bld
->gallivm
, type
, PIPE_FUNC_EQUAL
, a
, inf
);
2584 res
= lp_build_select(bld
, cmp
, bld
->zero
, res
);
2585 cmp
= lp_build_compare(bld
->gallivm
, type
, PIPE_FUNC_EQUAL
, a
, bld
->one
);
2586 res
= lp_build_select(bld
, cmp
, bld
->one
, res
);
2592 return lp_build_rcp(bld
, lp_build_sqrt(bld
, a
));
2596 * If there's a fast (inaccurate) rsqrt instruction available
2597 * (caller may want to avoid calling rsqrt_fast if it's not available,
2598 * i.e. for calculating x^0.5 it may do rsqrt_fast(x) * x but if
2599 * unavailable it would result in sqrt/div/mul so obviously
2600 * much better to just call sqrt, skipping both div and mul).
/*
 * lp_build_fast_rsqrt_available: capability test for the hardware
 * reciprocal square root estimate.
 *
 * Asserts that type is floating, then tests for either
 *    SSE with 4 lanes of 32 bit, or
 *    AVX with 8 lanes of 32 bit.
 *
 * NOTE(review): the return statements are not visible in this listing;
 * presumably the function reports true when the condition holds and false
 * otherwise -- confirm against the full source.
 */
2603 lp_build_fast_rsqrt_available(struct lp_type type
)
2605 assert(type
.floating
);
2607 if ((util_cpu_caps
.has_sse
&& type
.width
== 32 && type
.length
== 4) ||
2608 (util_cpu_caps
.has_avx
&& type
.width
== 32 && type
.length
== 8)) {
2616 * Generate 1/sqrt(a).
2617 * Result is undefined for values < 0, infinity for +0.
2618 * Precision is limited, only ~10 bits guaranteed
2619 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
/*
 * lp_build_fast_rsqrt: low precision 1/sqrt(a).
 *
 * When lp_build_fast_rsqrt_available(type) holds, the x86 estimate
 * intrinsic is called directly: llvm.x86.sse.rsqrt.ps when the vector
 * length is 4, llvm.x86.avx.rsqrt.ps.256 on the other branch.
 * Otherwise a debug message is printed and the result is emulated as
 * lp_build_rcp(lp_build_sqrt(a)).
 */
2622 lp_build_fast_rsqrt(struct lp_build_context
*bld
,
2625 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2626 const struct lp_type type
= bld
->type
;
2628 assert(lp_check_value(type
, a
));
2630 if (lp_build_fast_rsqrt_available(type
)) {
2631 const char *intrinsic
= NULL
;
2633 if (type
.length
== 4) {
2634 intrinsic
= "llvm.x86.sse.rsqrt.ps";
2637 intrinsic
= "llvm.x86.avx.rsqrt.ps.256";
2639 return lp_build_intrinsic_unary(builder
, intrinsic
, bld
->vec_type
, a
);
2642 debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__
);
2644 return lp_build_rcp(bld
, lp_build_sqrt(bld
, a
));
2649 * Generate sin(a) or cos(a) using polynomial approximation.
2650 * TODO: it might be worth recognizing sin and cos using same source
2651 * (i.e. d3d10 sincos opcode). Obviously doing both at the same time
2652 * would be way cheaper than calculating (nearly) everything twice...
2653 * Not sure it's common enough to be worth bothering however, scs
2654 * opcode could also benefit from calculating both though.
/*
 * lp_build_sin_or_cos: shared polynomial implementation of sin(a) and
 * cos(a); the cos flag selects which one is produced.
 *
 * Steps, as visible below:
 *  1. x_abs = a with the sign bit cleared (integer AND with ~0x80000000).
 *  2. scale_y = x_abs * 1.27323954473516 (named FOPI in the quoted SSE
 *     code, i.e. 4/Pi), truncated to integer with FPToSI.
 *  3. j = (j + 1) & ~1, converted back to float as y_2.
 *  4. emm2_2: for cos, j - 2 (the sin alternative is on a line that is
 *     not visible here).
 *  5. sign_bit: for cos, (4 & ~emm2_2) shifted left by 29; for sin, an
 *     XOR of the raw bits of a with a shifted emm2_add, masked with
 *     0x80000000.
 *  6. poly_mask = ((emm2_2 & 2) == 0), computed as an integer compare.
 *  7. Range reduction x_3 = ((x_abs + y_2*DP1) + y_2*DP2) + y_2*DP3 with
 *     the three negative DP constants, assuming lp_build_fmuladd(b, p, q,
 *     r) computes p * q + r.
 *  8. z = x_3 * x_3; the coscof polynomial gives y_10 and the sincof
 *     polynomial gives y2_9. Both are always evaluated.
 *  9. Bitwise select (y2_9 & poly_mask) | (y_10 & ~poly_mask), then XOR
 *     with sign_bit and bitcast back to float.
 * 10. Clamp to [-1, 1]; if a is not finite the result is replaced by NaN.
 *
 * The integer constants used (0x80000000, shift by 29) are specific to
 * 32 bit floats.
 */
2657 lp_build_sin_or_cos(struct lp_build_context
*bld
,
2661 struct gallivm_state
*gallivm
= bld
->gallivm
;
2662 LLVMBuilderRef b
= gallivm
->builder
;
2663 struct lp_type int_type
= lp_int_type(bld
->type
);
2666 * take the absolute value,
2667 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2670 LLVMValueRef inv_sig_mask
= lp_build_const_int_vec(gallivm
, bld
->type
, ~0x80000000);
2671 LLVMValueRef a_v4si
= LLVMBuildBitCast(b
, a
, bld
->int_vec_type
, "a_v4si");
2673 LLVMValueRef absi
= LLVMBuildAnd(b
, a_v4si
, inv_sig_mask
, "absi");
2674 LLVMValueRef x_abs
= LLVMBuildBitCast(b
, absi
, bld
->vec_type
, "x_abs");
2678 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2681 LLVMValueRef FOPi
= lp_build_const_vec(gallivm
, bld
->type
, 1.27323954473516);
2682 LLVMValueRef scale_y
= LLVMBuildFMul(b
, x_abs
, FOPi
, "scale_y");
2685 * store the integer part of y in mm0
2686 * emm2 = _mm_cvttps_epi32(y);
2689 LLVMValueRef emm2_i
= LLVMBuildFPToSI(b
, scale_y
, bld
->int_vec_type
, "emm2_i");
2692 * j=(j+1) & (~1) (see the cephes sources)
2693 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2696 LLVMValueRef all_one
= lp_build_const_int_vec(gallivm
, bld
->type
, 1);
2697 LLVMValueRef emm2_add
= LLVMBuildAdd(b
, emm2_i
, all_one
, "emm2_add");
2699 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2701 LLVMValueRef inv_one
= lp_build_const_int_vec(gallivm
, bld
->type
, ~1);
2702 LLVMValueRef emm2_and
= LLVMBuildAnd(b
, emm2_add
, inv_one
, "emm2_and");
2705 * y = _mm_cvtepi32_ps(emm2);
2707 LLVMValueRef y_2
= LLVMBuildSIToFP(b
, emm2_and
, bld
->vec_type
, "y_2");
2709 LLVMValueRef const_2
= lp_build_const_int_vec(gallivm
, bld
->type
, 2);
2710 LLVMValueRef const_4
= lp_build_const_int_vec(gallivm
, bld
->type
, 4);
2711 LLVMValueRef const_29
= lp_build_const_int_vec(gallivm
, bld
->type
, 29);
2712 LLVMValueRef sign_mask
= lp_build_const_int_vec(gallivm
, bld
->type
, 0x80000000);
2715 * Argument used for poly selection and sign bit determination
2716 * is different for sin vs. cos.
2718 LLVMValueRef emm2_2
= cos
? LLVMBuildSub(b
, emm2_and
, const_2
, "emm2_2") :
2721 LLVMValueRef sign_bit
= cos
? LLVMBuildShl(b
, LLVMBuildAnd(b
, const_4
,
2722 LLVMBuildNot(b
, emm2_2
, ""), ""),
2723 const_29
, "sign_bit") :
2724 LLVMBuildAnd(b
, LLVMBuildXor(b
, a_v4si
,
2725 LLVMBuildShl(b
, emm2_add
,
2727 sign_mask
, "sign_bit");
2730 * get the polynom selection mask
2731 * there is one polynom for 0 <= x <= Pi/4
2732 * and another one for Pi/4<x<=Pi/2
2733 * Both branches will be computed.
2735 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2736 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2739 LLVMValueRef emm2_3
= LLVMBuildAnd(b
, emm2_2
, const_2
, "emm2_3");
2740 LLVMValueRef poly_mask
= lp_build_compare(gallivm
,
2741 int_type
, PIPE_FUNC_EQUAL
,
2742 emm2_3
, lp_build_const_int_vec(gallivm
, bld
->type
, 0));
2745 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2746 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2747 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2749 LLVMValueRef DP1
= lp_build_const_vec(gallivm
, bld
->type
, -0.78515625);
2750 LLVMValueRef DP2
= lp_build_const_vec(gallivm
, bld
->type
, -2.4187564849853515625e-4);
2751 LLVMValueRef DP3
= lp_build_const_vec(gallivm
, bld
->type
, -3.77489497744594108e-8);
2754 * The magic pass: "Extended precision modular arithmetic"
2755 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2757 LLVMValueRef x_1
= lp_build_fmuladd(b
, y_2
, DP1
, x_abs
);
2758 LLVMValueRef x_2
= lp_build_fmuladd(b
, y_2
, DP2
, x_1
);
2759 LLVMValueRef x_3
= lp_build_fmuladd(b
, y_2
, DP3
, x_2
);
2762 * Evaluate the first polynom (0 <= x <= Pi/4)
2764 * z = _mm_mul_ps(x,x);
2766 LLVMValueRef z
= LLVMBuildFMul(b
, x_3
, x_3
, "z");
2769 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2770 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2771 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2773 LLVMValueRef coscof_p0
= lp_build_const_vec(gallivm
, bld
->type
, 2.443315711809948E-005);
2774 LLVMValueRef coscof_p1
= lp_build_const_vec(gallivm
, bld
->type
, -1.388731625493765E-003);
2775 LLVMValueRef coscof_p2
= lp_build_const_vec(gallivm
, bld
->type
, 4.166664568298827E-002);
2778 * y = *(v4sf*)_ps_coscof_p0;
2779 * y = _mm_mul_ps(y, z);
2781 LLVMValueRef y_4
= lp_build_fmuladd(b
, z
, coscof_p0
, coscof_p1
);
2782 LLVMValueRef y_6
= lp_build_fmuladd(b
, y_4
, z
, coscof_p2
);
2783 LLVMValueRef y_7
= LLVMBuildFMul(b
, y_6
, z
, "y_7");
2784 LLVMValueRef y_8
= LLVMBuildFMul(b
, y_7
, z
, "y_8");
2788 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2789 * y = _mm_sub_ps(y, tmp);
2790 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2792 LLVMValueRef half
= lp_build_const_vec(gallivm
, bld
->type
, 0.5);
2793 LLVMValueRef tmp
= LLVMBuildFMul(b
, z
, half
, "tmp");
/* NOTE(review): the IR value names given to the next two instructions
 * (y_8 and y_9) lag one behind the C variables they are stored in
 * (y_9 and y_10). This only affects the names shown in IR dumps. */
2794 LLVMValueRef y_9
= LLVMBuildFSub(b
, y_8
, tmp
, "y_8");
2795 LLVMValueRef one
= lp_build_const_vec(gallivm
, bld
->type
, 1.0);
2796 LLVMValueRef y_10
= LLVMBuildFAdd(b
, y_9
, one
, "y_9");
2799 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2800 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2801 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2803 LLVMValueRef sincof_p0
= lp_build_const_vec(gallivm
, bld
->type
, -1.9515295891E-4);
2804 LLVMValueRef sincof_p1
= lp_build_const_vec(gallivm
, bld
->type
, 8.3321608736E-3);
2805 LLVMValueRef sincof_p2
= lp_build_const_vec(gallivm
, bld
->type
, -1.6666654611E-1);
/* NOTE(review): the range quoted on the next line (Pi/4 <= x <= 0) is
 * not a valid interval; compare the two ranges listed where the
 * polynomial selection mask is introduced -- confirm the intended one. */
2808 * Evaluate the second polynom (Pi/4 <= x <= 0)
2810 * y2 = *(v4sf*)_ps_sincof_p0;
2811 * y2 = _mm_mul_ps(y2, z);
2812 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2813 * y2 = _mm_mul_ps(y2, z);
2814 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2815 * y2 = _mm_mul_ps(y2, z);
2816 * y2 = _mm_mul_ps(y2, x);
2817 * y2 = _mm_add_ps(y2, x);
2820 LLVMValueRef y2_4
= lp_build_fmuladd(b
, z
, sincof_p0
, sincof_p1
);
2821 LLVMValueRef y2_6
= lp_build_fmuladd(b
, y2_4
, z
, sincof_p2
);
2822 LLVMValueRef y2_7
= LLVMBuildFMul(b
, y2_6
, z
, "y2_7");
2823 LLVMValueRef y2_9
= lp_build_fmuladd(b
, y2_7
, x_3
, x_3
);
2826 * select the correct result from the two polynoms
2828 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2829 * y = _mm_andnot_ps(xmm3, y);
2830 * y = _mm_or_ps(y,y2);
2832 LLVMValueRef y2_i
= LLVMBuildBitCast(b
, y2_9
, bld
->int_vec_type
, "y2_i");
2833 LLVMValueRef y_i
= LLVMBuildBitCast(b
, y_10
, bld
->int_vec_type
, "y_i");
2834 LLVMValueRef y2_and
= LLVMBuildAnd(b
, y2_i
, poly_mask
, "y2_and");
2835 LLVMValueRef poly_mask_inv
= LLVMBuildNot(b
, poly_mask
, "poly_mask_inv");
2836 LLVMValueRef y_and
= LLVMBuildAnd(b
, y_i
, poly_mask_inv
, "y_and");
2837 LLVMValueRef y_combine
= LLVMBuildOr(b
, y_and
, y2_and
, "y_combine");
2841 * y = _mm_xor_ps(y, sign_bit);
2843 LLVMValueRef y_sign
= LLVMBuildXor(b
, y_combine
, sign_bit
, "y_sign");
2844 LLVMValueRef y_result
= LLVMBuildBitCast(b
, y_sign
, bld
->vec_type
, "y_result");
2846 LLVMValueRef isfinite
= lp_build_isfinite(bld
, a
);
2848 /* clamp output to be within [-1, 1] */
2849 y_result
= lp_build_clamp(bld
, y_result
,
2850 lp_build_const_vec(bld
->gallivm
, bld
->type
, -1.f
),
2851 lp_build_const_vec(bld
->gallivm
, bld
->type
, 1.f
));
2852 /* If a is -inf, inf or NaN then return NaN */
2853 y_result
= lp_build_select(bld
, isfinite
, y_result
,
2854 lp_build_const_vec(bld
->gallivm
, bld
->type
, NAN
));
/*
 * lp_build_sin: sin(a). Thin wrapper that forwards to
 * lp_build_sin_or_cos with the cos selector set to FALSE.
 */
2863 lp_build_sin(struct lp_build_context
*bld
,
2866 return lp_build_sin_or_cos(bld
, a
, FALSE
);
/*
 * lp_build_cos: cos(a). Thin wrapper that forwards to
 * lp_build_sin_or_cos with the cos selector set to TRUE.
 */
2874 lp_build_cos(struct lp_build_context
*bld
,
2877 return lp_build_sin_or_cos(bld
, a
, TRUE
);
2882 * Generate pow(x, y)
/*
 * lp_build_pow: x raised to the power y, built as
 *    exp2(log2(x) * y)
 * using lp_build_log2, lp_build_mul and lp_build_exp2.
 *
 * When GALLIVM_DEBUG_PERF is set in gallivm_debug and both x and y are
 * constants, a debug message is printed because no constant folding is
 * done here.
 */
2885 lp_build_pow(struct lp_build_context
*bld
,
2889 /* TODO: optimize the constant case */
2890 if (gallivm_debug
& GALLIVM_DEBUG_PERF
&&
2891 LLVMIsConstant(x
) && LLVMIsConstant(y
)) {
2892 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2896 return lp_build_exp2(bld
, lp_build_mul(bld
, lp_build_log2(bld
, x
), y
));
/*
 * lp_build_exp: natural exponential of x, built as
 *    exp2(log2e * x)
 * where log2e = 1.4426950408889634 is materialised as a vector constant
 * of the context type.
 */
2904 lp_build_exp(struct lp_build_context
*bld
,
2907 /* log2(e) = 1/log(2) */
2908 LLVMValueRef log2e
= lp_build_const_vec(bld
->gallivm
, bld
->type
,
2909 1.4426950408889634);
2911 assert(lp_check_value(bld
->type
, x
));
2913 return lp_build_exp2(bld
, lp_build_mul(bld
, log2e
, x
));
2919 * Behavior is undefined with infs, 0s and nans
/*
 * lp_build_log: natural logarithm of x, built as
 *    0.69314718055994529 * lp_build_log2(x)
 * The local named log2 holds that constant (the natural log of 2), not a
 * function. Uses the lp_build_log2 variant, i.e. the one called with
 * edge case handling switched off.
 */
2922 lp_build_log(struct lp_build_context
*bld
,
2926 LLVMValueRef log2
= lp_build_const_vec(bld
->gallivm
, bld
->type
,
2927 0.69314718055994529);
2929 assert(lp_check_value(bld
->type
, x
));
2931 return lp_build_mul(bld
, log2
, lp_build_log2(bld
, x
));
2935 * Generate log(x) that handles edge cases (infs, 0s and nans)
/*
 * lp_build_log_safe: natural logarithm of x, built as
 *    0.69314718055994529 * lp_build_log2_safe(x)
 * Identical to lp_build_log except that it goes through
 * lp_build_log2_safe, the variant called with edge case handling
 * switched on.
 */
2938 lp_build_log_safe(struct lp_build_context
*bld
,
2942 LLVMValueRef log2
= lp_build_const_vec(bld
->gallivm
, bld
->type
,
2943 0.69314718055994529);
2945 assert(lp_check_value(bld
->type
, x
));
2947 return lp_build_mul(bld
, log2
, lp_build_log2_safe(bld
, x
));
2952 * Generate polynomial.
2953 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
/*
 * lp_build_polynomial: evaluate a polynomial in x with the given
 * coefficients, coeffs[0] being the constant term.
 *
 * bld         build context; its type is used for the constants
 * coeffs      array of num_coeffs doubles
 * num_coeffs  number of entries in coeffs
 *
 * The coefficients are walked from the highest index down to 0. Two
 * accumulators, even and odd, are each advanced with
 * lp_build_mad(bld, x2, acc, coeff) where x2 = x * x, and the result is
 * lp_build_mad(bld, odd, x, even).
 *
 * NOTE(review): the branches that route each coefficient to the even or
 * odd accumulator, and that seed an accumulator while it is still NULL,
 * are not visible in this listing.
 */
2956 lp_build_polynomial(struct lp_build_context
*bld
,
2958 const double *coeffs
,
2959 unsigned num_coeffs
)
2961 const struct lp_type type
= bld
->type
;
2962 LLVMValueRef even
= NULL
, odd
= NULL
;
2966 assert(lp_check_value(bld
->type
, x
));
2968 /* TODO: optimize the constant case */
2969 if (gallivm_debug
& GALLIVM_DEBUG_PERF
&&
2970 LLVMIsConstant(x
)) {
2971 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2976 * Calculate odd and even terms seperately to decrease data dependency
2978 * c[0] + x^2 * c[2] + x^4 * c[4] ...
2979 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
2981 x2
= lp_build_mul(bld
, x
, x
);
2983 for (i
= num_coeffs
; i
--; ) {
2986 coeff
= lp_build_const_vec(bld
->gallivm
, type
, coeffs
[i
]);
2990 even
= lp_build_mad(bld
, x2
, even
, coeff
);
2995 odd
= lp_build_mad(bld
, x2
, odd
, coeff
);
3002 return lp_build_mad(bld
, odd
, x
, even
);
3011 * Minimax polynomial fit of 2**x, in range [0, 1[
/*
 * lp_build_exp2_polynomial: coefficient table passed to
 * lp_build_polynomial by lp_build_exp2, constant term first.
 *
 * Exactly one set is compiled in, chosen by EXP_POLY_DEGREE (5, 4, 3 or
 * 2); each set has degree + 1 entries. In the degree 5 set the constant
 * term has been replaced by exactly 1.0 without refitting the other
 * coefficients, as the XXX remark on that entry records.
 */
3013 const double lp_build_exp2_polynomial
[] = {
3014 #if EXP_POLY_DEGREE == 5
3015 1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
3016 0.693153073200168932794,
3017 0.240153617044375388211,
3018 0.0558263180532956664775,
3019 0.00898934009049466391101,
3020 0.00187757667519147912699
3021 #elif EXP_POLY_DEGREE == 4
3022 1.00000259337069434683,
3023 0.693003834469974940458,
3024 0.24144275689150793076,
3025 0.0520114606103070150235,
3026 0.0135341679161270268764
3027 #elif EXP_POLY_DEGREE == 3
3028 0.999925218562710312959,
3029 0.695833540494823811697,
3030 0.226067155427249155588,
3031 0.0780245226406372992967
3032 #elif EXP_POLY_DEGREE == 2
3033 1.00172476321474503578,
3034 0.657636275736077639316,
3035 0.33718943461968720704
/*
 * lp_build_exp2: 2 raised to the power x, for 32 bit floating types
 * (asserted).
 *
 * 1. x is limited to [-126.99999, 128.0] with lp_build_min_ext and
 *    lp_build_max_ext, both called with
 *    GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN and with the constant as first
 *    operand.
 * 2. lp_build_ifloor_fract splits x into an integer part ipart and a
 *    fractional part fpart.
 * 3. expipart is formed by adding the bias 127 to ipart, shifting left
 *    by 23 and bitcasting to the float vector type, i.e. ipart is placed
 *    directly in the float exponent field.
 * 4. expfpart = lp_build_polynomial(fpart, lp_build_exp2_polynomial).
 * 5. res = expipart * expfpart.
 *
 * When GALLIVM_DEBUG_PERF is set and x is constant a debug message is
 * printed; no constant folding is done.
 */
3043 lp_build_exp2(struct lp_build_context
*bld
,
3046 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
3047 const struct lp_type type
= bld
->type
;
3048 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
3049 LLVMValueRef ipart
= NULL
;
3050 LLVMValueRef fpart
= NULL
;
3051 LLVMValueRef expipart
= NULL
;
3052 LLVMValueRef expfpart
= NULL
;
3053 LLVMValueRef res
= NULL
;
3055 assert(lp_check_value(bld
->type
, x
));
3057 /* TODO: optimize the constant case */
3058 if (gallivm_debug
& GALLIVM_DEBUG_PERF
&&
3059 LLVMIsConstant(x
)) {
3060 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3064 assert(type
.floating
&& type
.width
== 32);
3066 /* We want to preserve NaN and make sure than for exp2 if x > 128,
3067 * the result is INF and if it's smaller than -126.9 the result is 0 */
3068 x
= lp_build_min_ext(bld
, lp_build_const_vec(bld
->gallivm
, type
, 128.0), x
,
3069 GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN
);
3070 x
= lp_build_max_ext(bld
, lp_build_const_vec(bld
->gallivm
, type
, -126.99999),
3071 x
, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN
);
3073 /* ipart = floor(x) */
3074 /* fpart = x - ipart */
3075 lp_build_ifloor_fract(bld
, x
, &ipart
, &fpart
);
3077 /* expipart = (float) (1 << ipart) */
3078 expipart
= LLVMBuildAdd(builder
, ipart
,
3079 lp_build_const_int_vec(bld
->gallivm
, type
, 127), "");
3080 expipart
= LLVMBuildShl(builder
, expipart
,
3081 lp_build_const_int_vec(bld
->gallivm
, type
, 23), "");
3082 expipart
= LLVMBuildBitCast(builder
, expipart
, vec_type
, "");
3084 expfpart
= lp_build_polynomial(bld
, fpart
, lp_build_exp2_polynomial
,
3085 ARRAY_SIZE(lp_build_exp2_polynomial
));
3087 res
= LLVMBuildFMul(builder
, expipart
, expfpart
, "");
3095 * Extract the exponent of a IEEE-754 floating point value.
3097 * Optionally apply an integer bias.
3099 * Result is an integer value with
3101 * ifloor(log2(x)) + bias
/*
 * lp_build_extract_exponent: integer exponent of a floating point
 * vector, plus an integer bias.
 *
 * x is bitcast to the integer vector type, shifted right (logical) by
 * lp_mantissa(type) bits, masked with 255, and then (127 - bias) is
 * subtracted.
 *
 * NOTE(review): the shift amount is derived from the type, but the mask
 * 255 and the bias 127 are the 32 bit float values, so the function is
 * only meaningful for 32 bit floats as written.
 */
3104 lp_build_extract_exponent(struct lp_build_context
*bld
,
3108 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
3109 const struct lp_type type
= bld
->type
;
3110 unsigned mantissa
= lp_mantissa(type
);
3113 assert(type
.floating
);
3115 assert(lp_check_value(bld
->type
, x
));
3117 x
= LLVMBuildBitCast(builder
, x
, bld
->int_vec_type
, "");
3119 res
= LLVMBuildLShr(builder
, x
,
3120 lp_build_const_int_vec(bld
->gallivm
, type
, mantissa
), "");
3121 res
= LLVMBuildAnd(builder
, res
,
3122 lp_build_const_int_vec(bld
->gallivm
, type
, 255), "");
3123 res
= LLVMBuildSub(builder
, res
,
3124 lp_build_const_int_vec(bld
->gallivm
, type
, 127 - bias
), "");
3131 * Extract the mantissa of a floating point value.
3133 * Result is a floating point value with
3135 * x / 2**floor(log2(x))
/*
 * lp_build_extract_mantissa: keep the mantissa bits of x and replace the
 * sign and exponent bits with those of 1.0.
 *
 * x is bitcast to the integer vector type, ANDed with a mask of the low
 * lp_mantissa(type) bits, ORed with the bit pattern of bld->one, and
 * bitcast back to the float vector type. For a normal positive input
 * this is x / 2**floor(log2(x)).
 */
3138 lp_build_extract_mantissa(struct lp_build_context
*bld
,
3141 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
3142 const struct lp_type type
= bld
->type
;
3143 unsigned mantissa
= lp_mantissa(type
);
3144 LLVMValueRef mantmask
= lp_build_const_int_vec(bld
->gallivm
, type
,
3145 (1ULL << mantissa
) - 1);
3146 LLVMValueRef one
= LLVMConstBitCast(bld
->one
, bld
->int_vec_type
);
3149 assert(lp_check_value(bld
->type
, x
));
3151 assert(type
.floating
);
3153 x
= LLVMBuildBitCast(builder
, x
, bld
->int_vec_type
, "");
3155 /* res = x / 2**ipart */
3156 res
= LLVMBuildAnd(builder
, x
, mantmask
, "");
3157 res
= LLVMBuildOr(builder
, res
, one
, "");
3158 res
= LLVMBuildBitCast(builder
, res
, bld
->vec_type
, "");
3166 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x) ,for x in range of [0, 1/9[
3167 * These coefficients can be generated with
3168 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
/*
 * lp_build_log2_polynomial: coefficient table passed to
 * lp_build_polynomial by lp_build_log2_approx, constant term first.
 *
 * Exactly one set is compiled in, chosen by LOG_POLY_DEGREE (5, 4 or 3).
 *
 * NOTE(review): the literals carry an L (long double) suffix although the
 * array element type is double, so they are converted to double on
 * initialisation.
 */
3170 const double lp_build_log2_polynomial
[] = {
3171 #if LOG_POLY_DEGREE == 5
3172 2.88539008148777786488L,
3173 0.961796878841293367824L,
3174 0.577058946784739859012L,
3175 0.412914355135828735411L,
3176 0.308591899232910175289L,
3177 0.352376952300281371868L,
3178 #elif LOG_POLY_DEGREE == 4
3179 2.88539009343309178325L,
3180 0.961791550404184197881L,
3181 0.577440339438736392009L,
3182 0.403343858251329912514L,
3183 0.406718052498846252698L,
3184 #elif LOG_POLY_DEGREE == 3
3185 2.88538959748872753838L,
3186 0.961932915889597772928L,
3187 0.571118517972136195241L,
3188 0.493997535084709500285L,
3195 * See http://www.devmaster.net/forums/showthread.php?p=43580
3196 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3197 * http://www.nezumi.demon.co.uk/consult/logx.htm
3199 * If handle_edge_cases is true the function will perform computations
3200 * to match the required D3D10+ behavior for each of the edge cases.
3201 * That means that if input is:
3202 * - less than zero (to and including -inf) then NaN will be returned
3203 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3204 * - +infinity, then +infinity will be returned
3205 * - NaN, then NaN will be returned
3207 * Those checks are fairly expensive so if you don't need them make sure
3208 * handle_edge_cases is false.
/*
 * lp_build_log2_approx: base 2 logarithm of x for 32 bit floats, with
 * optional extra outputs.
 *
 * p_exp, p_floor_log2, p_log2  optional output pointers; all the work is
 *                    skipped when the three of them are NULL
 * handle_edge_cases  when true (and the type is floating) the result is
 *                    patched for special inputs, see below
 *
 * Visible computation:
 *  - i = bits of x; exp = i & 0x7f800000 (the exponent field).
 *  - When p_floor_log2 or p_log2 is requested:
 *    logexp = (float)((exp >> 23) - 127).
 *  - mant = (i & 0x007fffff) | bits of 1.0, reinterpreted as float.
 *  - y = (mant - 1) / (mant + 1), z = y * y,
 *    p_z = lp_build_polynomial(z, lp_build_log2_polynomial),
 *    res = y * p_z + logexp.
 *  - Edge cases, applied with selects in this order so that later ones
 *    take precedence:
 *       x >= +infinity -> +infinity
 *       x == 0         -> -infinity
 *       x <  0         -> NaN
 *  - exp is finally bitcast to the float vector type, and logexp is
 *    stored through p_floor_log2.
 *
 * NOTE(review): the guards around the mantissa and polynomial section and
 * the stores through p_exp and p_log2 are not visible in this listing.
 */
3211 lp_build_log2_approx(struct lp_build_context
*bld
,
3213 LLVMValueRef
*p_exp
,
3214 LLVMValueRef
*p_floor_log2
,
3215 LLVMValueRef
*p_log2
,
3216 boolean handle_edge_cases
)
3218 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
3219 const struct lp_type type
= bld
->type
;
3220 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
3221 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
3223 LLVMValueRef expmask
= lp_build_const_int_vec(bld
->gallivm
, type
, 0x7f800000);
3224 LLVMValueRef mantmask
= lp_build_const_int_vec(bld
->gallivm
, type
, 0x007fffff);
3225 LLVMValueRef one
= LLVMConstBitCast(bld
->one
, int_vec_type
);
3227 LLVMValueRef i
= NULL
;
3228 LLVMValueRef y
= NULL
;
3229 LLVMValueRef z
= NULL
;
3230 LLVMValueRef exp
= NULL
;
3231 LLVMValueRef mant
= NULL
;
3232 LLVMValueRef logexp
= NULL
;
3233 LLVMValueRef p_z
= NULL
;
3234 LLVMValueRef res
= NULL
;
3236 assert(lp_check_value(bld
->type
, x
));
3238 if(p_exp
|| p_floor_log2
|| p_log2
) {
3239 /* TODO: optimize the constant case */
3240 if (gallivm_debug
& GALLIVM_DEBUG_PERF
&&
3241 LLVMIsConstant(x
)) {
3242 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3246 assert(type
.floating
&& type
.width
== 32);
3249 * We don't explicitly handle denormalized numbers. They will yield a
3250 * result in the neighbourhood of -127, which appears to be adequate
3254 i
= LLVMBuildBitCast(builder
, x
, int_vec_type
, "");
3256 /* exp = (float) exponent(x) */
3257 exp
= LLVMBuildAnd(builder
, i
, expmask
, "");
3260 if(p_floor_log2
|| p_log2
) {
3261 logexp
= LLVMBuildLShr(builder
, exp
, lp_build_const_int_vec(bld
->gallivm
, type
, 23), "");
3262 logexp
= LLVMBuildSub(builder
, logexp
, lp_build_const_int_vec(bld
->gallivm
, type
, 127), "");
3263 logexp
= LLVMBuildSIToFP(builder
, logexp
, vec_type
, "");
3267 /* mant = 1 + (float) mantissa(x) */
3268 mant
= LLVMBuildAnd(builder
, i
, mantmask
, "");
3269 mant
= LLVMBuildOr(builder
, mant
, one
, "");
3270 mant
= LLVMBuildBitCast(builder
, mant
, vec_type
, "");
3272 /* y = (mant - 1) / (mant + 1) */
3273 y
= lp_build_div(bld
,
3274 lp_build_sub(bld
, mant
, bld
->one
),
3275 lp_build_add(bld
, mant
, bld
->one
)
3279 z
= lp_build_mul(bld
, y
, y
);
3282 p_z
= lp_build_polynomial(bld
, z
, lp_build_log2_polynomial
,
3283 ARRAY_SIZE(lp_build_log2_polynomial
));
3285 /* y * P(z) + logexp */
3286 res
= lp_build_mad(bld
, y
, p_z
, logexp
);
3288 if (type
.floating
&& handle_edge_cases
) {
3289 LLVMValueRef negmask
, infmask
, zmask
;
3290 negmask
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, x
,
3291 lp_build_const_vec(bld
->gallivm
, type
, 0.0f
));
3292 zmask
= lp_build_cmp(bld
, PIPE_FUNC_EQUAL
, x
,
3293 lp_build_const_vec(bld
->gallivm
, type
, 0.0f
));
3294 infmask
= lp_build_cmp(bld
, PIPE_FUNC_GEQUAL
, x
,
3295 lp_build_const_vec(bld
->gallivm
, type
, INFINITY
));
3297 /* If x is qual to inf make sure we return inf */
3298 res
= lp_build_select(bld
, infmask
,
3299 lp_build_const_vec(bld
->gallivm
, type
, INFINITY
),
3301 /* If x is qual to 0, return -inf */
3302 res
= lp_build_select(bld
, zmask
,
3303 lp_build_const_vec(bld
->gallivm
, type
, -INFINITY
),
3305 /* If x is nan or less than 0, return nan */
3306 res
= lp_build_select(bld
, negmask
,
3307 lp_build_const_vec(bld
->gallivm
, type
, NAN
),
3313 exp
= LLVMBuildBitCast(builder
, exp
, vec_type
, "");
3318 *p_floor_log2
= logexp
;
3326 * log2 implementation which doesn't have special code to
3327 * handle edge cases (-inf, 0, inf, NaN). It's faster but
3328 * the results for those cases are undefined.
/*
 * lp_build_log2: base 2 logarithm of x without edge case handling.
 * Calls lp_build_log2_approx requesting only the log2 output (written
 * into res) and with handle_edge_cases set to FALSE.
 */
3331 lp_build_log2(struct lp_build_context
*bld
,
3335 lp_build_log2_approx(bld
, x
, NULL
, NULL
, &res
, FALSE
);
3340 * Version of log2 which handles all edge cases.
3341 * Look at documentation of lp_build_log2_approx for
3342 * description of the behavior for each of the edge cases.
/*
 * lp_build_log2_safe: base 2 logarithm of x with edge case handling.
 * Calls lp_build_log2_approx requesting only the log2 output (written
 * into res) and with handle_edge_cases set to TRUE.
 */
3345 lp_build_log2_safe(struct lp_build_context
*bld
,
3349 lp_build_log2_approx(bld
, x
, NULL
, NULL
, &res
, TRUE
);
3355 * Faster (and less accurate) log2.
3357 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3359 * Piece-wise linear approximation, with exact results when x is a power of two.
3362 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
/*
 * lp_build_fast_log2: cheap approximation of log2(x) for floating types.
 *
 * ipart is lp_build_extract_exponent(x, -1) converted to float, i.e.
 * floor(log2(x)) - 1; fpart is lp_build_extract_mantissa(x). The result
 * is their sum, built with FAdd.
 */
3365 lp_build_fast_log2(struct lp_build_context
*bld
,
3368 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
3372 assert(lp_check_value(bld
->type
, x
));
3374 assert(bld
->type
.floating
);
3376 /* ipart = floor(log2(x)) - 1 */
3377 ipart
= lp_build_extract_exponent(bld
, x
, -1);
3378 ipart
= LLVMBuildSIToFP(builder
, ipart
, bld
->vec_type
, "");
3380 /* fpart = x / 2**ipart */
3381 fpart
= lp_build_extract_mantissa(bld
, x
);
3384 return LLVMBuildFAdd(builder
, ipart
, fpart
, "");
3389 * Fast implementation of iround(log2(x)).
3391 * Not an approximation -- it should give accurate results all the time.
/*
 * lp_build_ilog2: integer log2 of x rounded to nearest, for floating
 * types.
 *
 * x is first multiplied by M_SQRT2, which adds 0.5 to log2(x); the
 * exponent of the product is then taken with
 * lp_build_extract_exponent(x, 0), giving floor(log2(x) + 0.5).
 */
3394 lp_build_ilog2(struct lp_build_context
*bld
,
3397 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
3398 LLVMValueRef sqrt2
= lp_build_const_vec(bld
->gallivm
, bld
->type
, M_SQRT2
);
3401 assert(bld
->type
.floating
);
3403 assert(lp_check_value(bld
->type
, x
));
3405 /* x * 2^(0.5) i.e., add 0.5 to the log2(x) */
3406 x
= LLVMBuildFMul(builder
, x
, sqrt2
, "");
3408 /* ipart = floor(log2(x) + 0.5) */
3409 ipart
= lp_build_extract_exponent(bld
, x
, 0);
/*
 * lp_build_mod: remainder of x divided by y.
 *
 * Asserts lp_check_value for both operands and emits one of three LLVM
 * remainder instructions: FRem, SRem or URem.
 *
 * NOTE(review): the conditions selecting between the three are not
 * visible in this listing; presumably floating types use FRem, signed
 * integer types SRem and unsigned integer types URem -- confirm.
 */
3415 lp_build_mod(struct lp_build_context
*bld
,
3419 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
3421 const struct lp_type type
= bld
->type
;
3423 assert(lp_check_value(type
, x
));
3424 assert(lp_check_value(type
, y
));
3427 res
= LLVMBuildFRem(builder
, x
, y
, "");
3429 res
= LLVMBuildSRem(builder
, x
, y
, "");
3431 res
= LLVMBuildURem(builder
, x
, y
, "");
3437 * For floating inputs it creates and returns a mask
3438 * which is all 1's for channels which are NaN.
3439 * Channels inside x which are not NaN will be 0.
/*
 * lp_build_isnan: per channel NaN mask for a floating point vector.
 *
 * Compares x with itself using the ordered-equal predicate
 * (LLVMRealOEQ), which is false only for NaN channels, inverts the
 * result and sign extends it to the integer vector type of the context,
 * so NaN channels become all ones and the others zero.
 */
3442 lp_build_isnan(struct lp_build_context
*bld
,
3446 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(bld
->gallivm
, bld
->type
);
3448 assert(bld
->type
.floating
);
3449 assert(lp_check_value(bld
->type
, x
));
3451 mask
= LLVMBuildFCmp(bld
->gallivm
->builder
, LLVMRealOEQ
, x
, x
,
3453 mask
= LLVMBuildNot(bld
->gallivm
->builder
, mask
, "");
3454 mask
= LLVMBuildSExt(bld
->gallivm
->builder
, mask
, int_vec_type
, "isnan");
3458 /* Returns all 1's for floating point numbers that are
3459 * finite numbers and returns all zeros for -inf, inf and NaN. */
/*
 * lp_build_isfinite: per channel finiteness mask.
 *
 * For a non floating context type an all zero integer vector is
 * returned. Otherwise (32 bit floats, asserted) the bits of x are ANDed
 * with the infornan32 constant and the result is compared with
 * PIPE_FUNC_NOTEQUAL on the integer type.
 *
 * NOTE(review): the value of infornan32 and the right hand operand of
 * the comparison are not visible in this listing; presumably both are
 * the exponent mask 0x7f800000 -- confirm.
 * NOTE(review): the bitcast of x happens in the initialisers, before the
 * floating check and the asserts run.
 */
3462 lp_build_isfinite(struct lp_build_context
*bld
,
3465 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
3466 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(bld
->gallivm
, bld
->type
);
3467 struct lp_type int_type
= lp_int_type(bld
->type
);
3468 LLVMValueRef intx
= LLVMBuildBitCast(builder
, x
, int_vec_type
, "");
3469 LLVMValueRef infornan32
= lp_build_const_int_vec(bld
->gallivm
, bld
->type
,
3472 if (!bld
->type
.floating
) {
3473 return lp_build_const_int_vec(bld
->gallivm
, bld
->type
, 0);
3475 assert(bld
->type
.floating
);
3476 assert(lp_check_value(bld
->type
, x
));
3477 assert(bld
->type
.width
== 32);
3479 intx
= LLVMBuildAnd(builder
, intx
, infornan32
, "");
3480 return lp_build_compare(bld
->gallivm
, int_type
, PIPE_FUNC_NOTEQUAL
,
3485 * Returns true if the number is nan or inf and false otherwise.
3486 * The input has to be a floating point vector.
/*
 * lp_build_is_inf_or_nan: per channel mask that is set when x is
 * infinite or NaN.
 *
 * Unlike lp_build_isfinite this takes the gallivm state and the type
 * directly rather than a build context. The bits of x are ANDed with the
 * integer constant const0 and the result is compared with
 * PIPE_FUNC_EQUAL on the integer type.
 *
 * NOTE(review): the value of const0 and the right hand operand of the
 * comparison are not visible in this listing.
 */
3489 lp_build_is_inf_or_nan(struct gallivm_state
*gallivm
,
3490 const struct lp_type type
,
3493 LLVMBuilderRef builder
= gallivm
->builder
;
3494 struct lp_type int_type
= lp_int_type(type
);
3495 LLVMValueRef const0
= lp_build_const_int_vec(gallivm
, int_type
,
3499 assert(type
.floating
);
3501 ret
= LLVMBuildBitCast(builder
, x
, lp_build_vec_type(gallivm
, int_type
), "");
3502 ret
= LLVMBuildAnd(builder
, ret
, const0
, "");
3503 ret
= lp_build_compare(gallivm
, int_type
, PIPE_FUNC_EQUAL
,
/*
 * lp_build_fpstate_get: read the SSE control and status register
 * (MXCSR) into a stack slot.
 *
 * Only does anything when util_cpu_caps.has_sse is set: allocates a 32
 * bit integer with lp_build_alloca, casts the pointer to an 8 bit
 * integer pointer and calls the llvm.x86.sse.stmxcsr intrinsic.
 *
 * NOTE(review): the intrinsic arguments and the return statements are
 * not visible in this listing; lp_build_fpstate_set_denorms_zero loads
 * through the returned value, so presumably the alloca pointer is
 * returned -- confirm.
 */
3511 lp_build_fpstate_get(struct gallivm_state
*gallivm
)
3513 if (util_cpu_caps
.has_sse
) {
3514 LLVMBuilderRef builder
= gallivm
->builder
;
3515 LLVMValueRef mxcsr_ptr
= lp_build_alloca(
3517 LLVMInt32TypeInContext(gallivm
->context
),
3519 LLVMValueRef mxcsr_ptr8
= LLVMBuildPointerCast(builder
, mxcsr_ptr
,
3520 LLVMPointerType(LLVMInt8TypeInContext(gallivm
->context
), 0), "");
3521 lp_build_intrinsic(builder
,
3522 "llvm.x86.sse.stmxcsr",
3523 LLVMVoidTypeInContext(gallivm
->context
),
/*
 * lp_build_fpstate_set_denorms_zero: switch the flush-to-zero and, when
 * supported, denormals-are-zero bits of MXCSR.
 *
 * Only does anything when util_cpu_caps.has_sse is set. The bit mask
 * starts as _MM_FLUSH_ZERO_MASK and gains _MM_DENORMALS_ZERO_MASK when
 * util_cpu_caps.has_daz is set. The current MXCSR value is obtained via
 * lp_build_fpstate_get and a load, the mask is either ORed in or cleared
 * with AND of its complement, and the value is stored back and applied
 * with lp_build_fpstate_set.
 *
 * NOTE(review): the second parameter and the condition choosing between
 * the OR (enable) and the AND (disable) path are not visible in this
 * listing.
 */
3531 lp_build_fpstate_set_denorms_zero(struct gallivm_state
*gallivm
,
3534 if (util_cpu_caps
.has_sse
) {
3535 /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
3536 int daz_ftz
= _MM_FLUSH_ZERO_MASK
;
3538 LLVMBuilderRef builder
= gallivm
->builder
;
3539 LLVMValueRef mxcsr_ptr
= lp_build_fpstate_get(gallivm
);
3540 LLVMValueRef mxcsr
=
3541 LLVMBuildLoad(builder
, mxcsr_ptr
, "mxcsr");
3543 if (util_cpu_caps
.has_daz
) {
3544 /* Enable denormals are zero mode */
3545 daz_ftz
|= _MM_DENORMALS_ZERO_MASK
;
3548 mxcsr
= LLVMBuildOr(builder
, mxcsr
,
3549 LLVMConstInt(LLVMTypeOf(mxcsr
), daz_ftz
, 0), "");
3551 mxcsr
= LLVMBuildAnd(builder
, mxcsr
,
3552 LLVMConstInt(LLVMTypeOf(mxcsr
), ~daz_ftz
, 0), "");
3555 LLVMBuildStore(builder
, mxcsr
, mxcsr_ptr
);
3556 lp_build_fpstate_set(gallivm
, mxcsr_ptr
);
3561 lp_build_fpstate_set(struct gallivm_state
*gallivm
,
3562 LLVMValueRef mxcsr_ptr
)
3564 if (util_cpu_caps
.has_sse
) {
3565 LLVMBuilderRef builder
= gallivm
->builder
;
3566 mxcsr_ptr
= LLVMBuildPointerCast(builder
, mxcsr_ptr
,
3567 LLVMPointerType(LLVMInt8TypeInContext(gallivm
->context
), 0), "");
3568 lp_build_intrinsic(builder
,
3569 "llvm.x86.sse.ldmxcsr",
3570 LLVMVoidTypeInContext(gallivm
->context
),