1 /**************************************************************************
3 * Copyright 2009-2010 VMware, Inc.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **************************************************************************/
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
38 * We also do simple expression simplification here. The reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
44 * @author Jose Fonseca <jfonseca@vmware.com>
50 #include "util/u_memory.h"
51 #include "util/u_debug.h"
52 #include "util/u_math.h"
53 #include "util/u_string.h"
54 #include "util/u_cpu_detect.h"
56 #include "lp_bld_type.h"
57 #include "lp_bld_const.h"
58 #include "lp_bld_init.h"
59 #include "lp_bld_intr.h"
60 #include "lp_bld_logic.h"
61 #include "lp_bld_pack.h"
62 #include "lp_bld_debug.h"
63 #include "lp_bld_bitarit.h"
64 #include "lp_bld_arit.h"
65 #include "lp_bld_flow.h"
67 #if defined(PIPE_ARCH_SSE)
68 #include <xmmintrin.h>
71 #ifndef _MM_DENORMALS_ZERO_MASK
72 #define _MM_DENORMALS_ZERO_MASK 0x0040
75 #ifndef _MM_FLUSH_ZERO_MASK
76 #define _MM_FLUSH_ZERO_MASK 0x8000
79 #define EXP_POLY_DEGREE 5
81 #define LOG_POLY_DEGREE 4
86 * No checks for special case values of a or b = 1 or 0 are done.
87 * NaN's are handled according to the behavior specified by the
88 * nan_behavior argument.
91 lp_build_min_simple(struct lp_build_context
*bld
,
94 enum gallivm_nan_behavior nan_behavior
)
96 const struct lp_type type
= bld
->type
;
97 const char *intrinsic
= NULL
;
98 unsigned intr_size
= 0;
101 assert(lp_check_value(type
, a
));
102 assert(lp_check_value(type
, b
));
104 /* TODO: optimize the constant case */
106 if (type
.floating
&& util_cpu_caps
.has_sse
) {
107 if (type
.width
== 32) {
108 if (type
.length
== 1) {
109 intrinsic
= "llvm.x86.sse.min.ss";
112 else if (type
.length
<= 4 || !util_cpu_caps
.has_avx
) {
113 intrinsic
= "llvm.x86.sse.min.ps";
117 intrinsic
= "llvm.x86.avx.min.ps.256";
121 if (type
.width
== 64 && util_cpu_caps
.has_sse2
) {
122 if (type
.length
== 1) {
123 intrinsic
= "llvm.x86.sse2.min.sd";
126 else if (type
.length
== 2 || !util_cpu_caps
.has_avx
) {
127 intrinsic
= "llvm.x86.sse2.min.pd";
131 intrinsic
= "llvm.x86.avx.min.pd.256";
136 else if (type
.floating
&& util_cpu_caps
.has_altivec
) {
137 if (nan_behavior
== GALLIVM_NAN_RETURN_NAN
||
138 nan_behavior
== GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN
) {
139 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
142 if (type
.width
== 32 && type
.length
== 4) {
143 intrinsic
= "llvm.ppc.altivec.vminfp";
146 } else if (util_cpu_caps
.has_sse2
&& type
.length
>= 2) {
148 if ((type
.width
== 8 || type
.width
== 16) &&
149 (type
.width
* type
.length
<= 64) &&
150 (gallivm_debug
& GALLIVM_DEBUG_PERF
)) {
151 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
154 if (type
.width
== 8 && !type
.sign
) {
155 intrinsic
= "llvm.x86.sse2.pminu.b";
157 else if (type
.width
== 16 && type
.sign
) {
158 intrinsic
= "llvm.x86.sse2.pmins.w";
160 if (util_cpu_caps
.has_sse4_1
) {
161 if (type
.width
== 8 && type
.sign
) {
162 intrinsic
= "llvm.x86.sse41.pminsb";
164 if (type
.width
== 16 && !type
.sign
) {
165 intrinsic
= "llvm.x86.sse41.pminuw";
167 if (type
.width
== 32 && !type
.sign
) {
168 intrinsic
= "llvm.x86.sse41.pminud";
170 if (type
.width
== 32 && type
.sign
) {
171 intrinsic
= "llvm.x86.sse41.pminsd";
174 } else if (util_cpu_caps
.has_altivec
) {
176 if (type
.width
== 8) {
178 intrinsic
= "llvm.ppc.altivec.vminub";
180 intrinsic
= "llvm.ppc.altivec.vminsb";
182 } else if (type
.width
== 16) {
184 intrinsic
= "llvm.ppc.altivec.vminuh";
186 intrinsic
= "llvm.ppc.altivec.vminsh";
188 } else if (type
.width
== 32) {
190 intrinsic
= "llvm.ppc.altivec.vminuw";
192 intrinsic
= "llvm.ppc.altivec.vminsw";
198 /* We need to handle nan's for floating point numbers. If one of the
199 * inputs is nan the other should be returned (required by both D3D10+
201 * The sse intrinsics return the second operand in case of nan by
202 * default so we need special code to handle those.
204 if (util_cpu_caps
.has_sse
&& type
.floating
&&
205 nan_behavior
!= GALLIVM_NAN_BEHAVIOR_UNDEFINED
&&
206 nan_behavior
!= GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN
&&
207 nan_behavior
!= GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN
) {
208 LLVMValueRef isnan
, min
;
209 min
= lp_build_intrinsic_binary_anylength(bld
->gallivm
, intrinsic
,
212 if (nan_behavior
== GALLIVM_NAN_RETURN_OTHER
) {
213 isnan
= lp_build_isnan(bld
, b
);
214 return lp_build_select(bld
, isnan
, a
, min
);
216 assert(nan_behavior
== GALLIVM_NAN_RETURN_NAN
);
217 isnan
= lp_build_isnan(bld
, a
);
218 return lp_build_select(bld
, isnan
, a
, min
);
221 return lp_build_intrinsic_binary_anylength(bld
->gallivm
, intrinsic
,
228 switch (nan_behavior
) {
229 case GALLIVM_NAN_RETURN_NAN
: {
230 LLVMValueRef isnan
= lp_build_isnan(bld
, b
);
231 cond
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, a
, b
);
232 cond
= LLVMBuildXor(bld
->gallivm
->builder
, cond
, isnan
, "");
233 return lp_build_select(bld
, cond
, a
, b
);
236 case GALLIVM_NAN_RETURN_OTHER
: {
237 LLVMValueRef isnan
= lp_build_isnan(bld
, a
);
238 cond
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, a
, b
);
239 cond
= LLVMBuildXor(bld
->gallivm
->builder
, cond
, isnan
, "");
240 return lp_build_select(bld
, cond
, a
, b
);
243 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN
:
244 cond
= lp_build_cmp_ordered(bld
, PIPE_FUNC_LESS
, a
, b
);
245 return lp_build_select(bld
, cond
, a
, b
);
246 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN
:
247 cond
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, b
, a
);
248 return lp_build_select(bld
, cond
, b
, a
);
249 case GALLIVM_NAN_BEHAVIOR_UNDEFINED
:
250 cond
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, a
, b
);
251 return lp_build_select(bld
, cond
, a
, b
);
255 cond
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, a
, b
);
256 return lp_build_select(bld
, cond
, a
, b
);
259 cond
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, a
, b
);
260 return lp_build_select(bld
, cond
, a
, b
);
267 * No checks for special case values of a or b = 1 or 0 are done.
268 * NaN's are handled according to the behavior specified by the
269 * nan_behavior argument.
272 lp_build_max_simple(struct lp_build_context
*bld
,
275 enum gallivm_nan_behavior nan_behavior
)
277 const struct lp_type type
= bld
->type
;
278 const char *intrinsic
= NULL
;
279 unsigned intr_size
= 0;
282 assert(lp_check_value(type
, a
));
283 assert(lp_check_value(type
, b
));
285 /* TODO: optimize the constant case */
287 if (type
.floating
&& util_cpu_caps
.has_sse
) {
288 if (type
.width
== 32) {
289 if (type
.length
== 1) {
290 intrinsic
= "llvm.x86.sse.max.ss";
293 else if (type
.length
<= 4 || !util_cpu_caps
.has_avx
) {
294 intrinsic
= "llvm.x86.sse.max.ps";
298 intrinsic
= "llvm.x86.avx.max.ps.256";
302 if (type
.width
== 64 && util_cpu_caps
.has_sse2
) {
303 if (type
.length
== 1) {
304 intrinsic
= "llvm.x86.sse2.max.sd";
307 else if (type
.length
== 2 || !util_cpu_caps
.has_avx
) {
308 intrinsic
= "llvm.x86.sse2.max.pd";
312 intrinsic
= "llvm.x86.avx.max.pd.256";
317 else if (type
.floating
&& util_cpu_caps
.has_altivec
) {
318 if (nan_behavior
== GALLIVM_NAN_RETURN_NAN
||
319 nan_behavior
== GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN
) {
320 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
323 if (type
.width
== 32 || type
.length
== 4) {
324 intrinsic
= "llvm.ppc.altivec.vmaxfp";
327 } else if (util_cpu_caps
.has_sse2
&& type
.length
>= 2) {
329 if ((type
.width
== 8 || type
.width
== 16) &&
330 (type
.width
* type
.length
<= 64) &&
331 (gallivm_debug
& GALLIVM_DEBUG_PERF
)) {
332 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
335 if (type
.width
== 8 && !type
.sign
) {
336 intrinsic
= "llvm.x86.sse2.pmaxu.b";
339 else if (type
.width
== 16 && type
.sign
) {
340 intrinsic
= "llvm.x86.sse2.pmaxs.w";
342 if (util_cpu_caps
.has_sse4_1
) {
343 if (type
.width
== 8 && type
.sign
) {
344 intrinsic
= "llvm.x86.sse41.pmaxsb";
346 if (type
.width
== 16 && !type
.sign
) {
347 intrinsic
= "llvm.x86.sse41.pmaxuw";
349 if (type
.width
== 32 && !type
.sign
) {
350 intrinsic
= "llvm.x86.sse41.pmaxud";
352 if (type
.width
== 32 && type
.sign
) {
353 intrinsic
= "llvm.x86.sse41.pmaxsd";
356 } else if (util_cpu_caps
.has_altivec
) {
358 if (type
.width
== 8) {
360 intrinsic
= "llvm.ppc.altivec.vmaxub";
362 intrinsic
= "llvm.ppc.altivec.vmaxsb";
364 } else if (type
.width
== 16) {
366 intrinsic
= "llvm.ppc.altivec.vmaxuh";
368 intrinsic
= "llvm.ppc.altivec.vmaxsh";
370 } else if (type
.width
== 32) {
372 intrinsic
= "llvm.ppc.altivec.vmaxuw";
374 intrinsic
= "llvm.ppc.altivec.vmaxsw";
380 if (util_cpu_caps
.has_sse
&& type
.floating
&&
381 nan_behavior
!= GALLIVM_NAN_BEHAVIOR_UNDEFINED
&&
382 nan_behavior
!= GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN
&&
383 nan_behavior
!= GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN
) {
384 LLVMValueRef isnan
, max
;
385 max
= lp_build_intrinsic_binary_anylength(bld
->gallivm
, intrinsic
,
388 if (nan_behavior
== GALLIVM_NAN_RETURN_OTHER
) {
389 isnan
= lp_build_isnan(bld
, b
);
390 return lp_build_select(bld
, isnan
, a
, max
);
392 assert(nan_behavior
== GALLIVM_NAN_RETURN_NAN
);
393 isnan
= lp_build_isnan(bld
, a
);
394 return lp_build_select(bld
, isnan
, a
, max
);
397 return lp_build_intrinsic_binary_anylength(bld
->gallivm
, intrinsic
,
404 switch (nan_behavior
) {
405 case GALLIVM_NAN_RETURN_NAN
: {
406 LLVMValueRef isnan
= lp_build_isnan(bld
, b
);
407 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, b
);
408 cond
= LLVMBuildXor(bld
->gallivm
->builder
, cond
, isnan
, "");
409 return lp_build_select(bld
, cond
, a
, b
);
412 case GALLIVM_NAN_RETURN_OTHER
: {
413 LLVMValueRef isnan
= lp_build_isnan(bld
, a
);
414 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, b
);
415 cond
= LLVMBuildXor(bld
->gallivm
->builder
, cond
, isnan
, "");
416 return lp_build_select(bld
, cond
, a
, b
);
419 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN
:
420 cond
= lp_build_cmp_ordered(bld
, PIPE_FUNC_GREATER
, a
, b
);
421 return lp_build_select(bld
, cond
, a
, b
);
422 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN
:
423 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, b
, a
);
424 return lp_build_select(bld
, cond
, b
, a
);
425 case GALLIVM_NAN_BEHAVIOR_UNDEFINED
:
426 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, b
);
427 return lp_build_select(bld
, cond
, a
, b
);
431 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, b
);
432 return lp_build_select(bld
, cond
, a
, b
);
435 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, b
);
436 return lp_build_select(bld
, cond
, a
, b
);
442 * Generate 1 - a, or ~a depending on bld->type.
445 lp_build_comp(struct lp_build_context
*bld
,
448 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
449 const struct lp_type type
= bld
->type
;
451 assert(lp_check_value(type
, a
));
458 if(type
.norm
&& !type
.floating
&& !type
.fixed
&& !type
.sign
) {
459 if(LLVMIsConstant(a
))
460 return LLVMConstNot(a
);
462 return LLVMBuildNot(builder
, a
, "");
465 if(LLVMIsConstant(a
))
467 return LLVMConstFSub(bld
->one
, a
);
469 return LLVMConstSub(bld
->one
, a
);
472 return LLVMBuildFSub(builder
, bld
->one
, a
, "");
474 return LLVMBuildSub(builder
, bld
->one
, a
, "");
482 lp_build_add(struct lp_build_context
*bld
,
486 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
487 const struct lp_type type
= bld
->type
;
490 assert(lp_check_value(type
, a
));
491 assert(lp_check_value(type
, b
));
497 if(a
== bld
->undef
|| b
== bld
->undef
)
501 const char *intrinsic
= NULL
;
503 if(a
== bld
->one
|| b
== bld
->one
)
506 if (type
.width
* type
.length
== 128 &&
507 !type
.floating
&& !type
.fixed
) {
508 if(util_cpu_caps
.has_sse2
) {
510 intrinsic
= type
.sign
? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
512 intrinsic
= type
.sign
? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
513 } else if (util_cpu_caps
.has_altivec
) {
515 intrinsic
= type
.sign
? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
517 intrinsic
= type
.sign
? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
522 return lp_build_intrinsic_binary(builder
, intrinsic
, lp_build_vec_type(bld
->gallivm
, bld
->type
), a
, b
);
525 if(type
.norm
&& !type
.floating
&& !type
.fixed
) {
527 uint64_t sign
= (uint64_t)1 << (type
.width
- 1);
528 LLVMValueRef max_val
= lp_build_const_int_vec(bld
->gallivm
, type
, sign
- 1);
529 LLVMValueRef min_val
= lp_build_const_int_vec(bld
->gallivm
, type
, sign
);
530 /* a_clamp_max is the maximum a for positive b,
531 a_clamp_min is the minimum a for negative b. */
532 LLVMValueRef a_clamp_max
= lp_build_min_simple(bld
, a
, LLVMBuildSub(builder
, max_val
, b
, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED
);
533 LLVMValueRef a_clamp_min
= lp_build_max_simple(bld
, a
, LLVMBuildSub(builder
, min_val
, b
, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED
);
534 a
= lp_build_select(bld
, lp_build_cmp(bld
, PIPE_FUNC_GREATER
, b
, bld
->zero
), a_clamp_max
, a_clamp_min
);
536 a
= lp_build_min_simple(bld
, a
, lp_build_comp(bld
, b
), GALLIVM_NAN_BEHAVIOR_UNDEFINED
);
540 if(LLVMIsConstant(a
) && LLVMIsConstant(b
))
542 res
= LLVMConstFAdd(a
, b
);
544 res
= LLVMConstAdd(a
, b
);
547 res
= LLVMBuildFAdd(builder
, a
, b
, "");
549 res
= LLVMBuildAdd(builder
, a
, b
, "");
551 /* clamp to ceiling of 1.0 */
552 if(bld
->type
.norm
&& (bld
->type
.floating
|| bld
->type
.fixed
))
553 res
= lp_build_min_simple(bld
, res
, bld
->one
, GALLIVM_NAN_BEHAVIOR_UNDEFINED
);
555 /* XXX clamp to floor of -1 or 0??? */
561 /** Return the scalar sum of the elements of a.
562 * Should avoid this operation whenever possible.
565 lp_build_horizontal_add(struct lp_build_context
*bld
,
568 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
569 const struct lp_type type
= bld
->type
;
570 LLVMValueRef index
, res
;
572 LLVMValueRef shuffles1
[LP_MAX_VECTOR_LENGTH
/ 2];
573 LLVMValueRef shuffles2
[LP_MAX_VECTOR_LENGTH
/ 2];
574 LLVMValueRef vecres
, elem2
;
576 assert(lp_check_value(type
, a
));
578 if (type
.length
== 1) {
582 assert(!bld
->type
.norm
);
585 * for byte vectors can do much better with psadbw.
586 * Using repeated shuffle/adds here. Note with multiple vectors
587 * this can be done more efficiently as outlined in the intel
588 * optimization manual.
589 * Note: could cause data rearrangement if used with smaller element
594 length
= type
.length
/ 2;
596 LLVMValueRef vec1
, vec2
;
597 for (i
= 0; i
< length
; i
++) {
598 shuffles1
[i
] = lp_build_const_int32(bld
->gallivm
, i
);
599 shuffles2
[i
] = lp_build_const_int32(bld
->gallivm
, i
+ length
);
601 vec1
= LLVMBuildShuffleVector(builder
, vecres
, vecres
,
602 LLVMConstVector(shuffles1
, length
), "");
603 vec2
= LLVMBuildShuffleVector(builder
, vecres
, vecres
,
604 LLVMConstVector(shuffles2
, length
), "");
606 vecres
= LLVMBuildFAdd(builder
, vec1
, vec2
, "");
609 vecres
= LLVMBuildAdd(builder
, vec1
, vec2
, "");
611 length
= length
>> 1;
614 /* always have vector of size 2 here */
617 index
= lp_build_const_int32(bld
->gallivm
, 0);
618 res
= LLVMBuildExtractElement(builder
, vecres
, index
, "");
619 index
= lp_build_const_int32(bld
->gallivm
, 1);
620 elem2
= LLVMBuildExtractElement(builder
, vecres
, index
, "");
623 res
= LLVMBuildFAdd(builder
, res
, elem2
, "");
625 res
= LLVMBuildAdd(builder
, res
, elem2
, "");
631 * Return the horizontal sums of 4 float vectors as a float4 vector.
632 * This uses the technique as outlined in Intel Optimization Manual.
635 lp_build_horizontal_add4x4f(struct lp_build_context
*bld
,
638 struct gallivm_state
*gallivm
= bld
->gallivm
;
639 LLVMBuilderRef builder
= gallivm
->builder
;
640 LLVMValueRef shuffles
[4];
642 LLVMValueRef sumtmp
[2], shuftmp
[2];
644 /* lower half of regs */
645 shuffles
[0] = lp_build_const_int32(gallivm
, 0);
646 shuffles
[1] = lp_build_const_int32(gallivm
, 1);
647 shuffles
[2] = lp_build_const_int32(gallivm
, 4);
648 shuffles
[3] = lp_build_const_int32(gallivm
, 5);
649 tmp
[0] = LLVMBuildShuffleVector(builder
, src
[0], src
[1],
650 LLVMConstVector(shuffles
, 4), "");
651 tmp
[2] = LLVMBuildShuffleVector(builder
, src
[2], src
[3],
652 LLVMConstVector(shuffles
, 4), "");
654 /* upper half of regs */
655 shuffles
[0] = lp_build_const_int32(gallivm
, 2);
656 shuffles
[1] = lp_build_const_int32(gallivm
, 3);
657 shuffles
[2] = lp_build_const_int32(gallivm
, 6);
658 shuffles
[3] = lp_build_const_int32(gallivm
, 7);
659 tmp
[1] = LLVMBuildShuffleVector(builder
, src
[0], src
[1],
660 LLVMConstVector(shuffles
, 4), "");
661 tmp
[3] = LLVMBuildShuffleVector(builder
, src
[2], src
[3],
662 LLVMConstVector(shuffles
, 4), "");
664 sumtmp
[0] = LLVMBuildFAdd(builder
, tmp
[0], tmp
[1], "");
665 sumtmp
[1] = LLVMBuildFAdd(builder
, tmp
[2], tmp
[3], "");
667 shuffles
[0] = lp_build_const_int32(gallivm
, 0);
668 shuffles
[1] = lp_build_const_int32(gallivm
, 2);
669 shuffles
[2] = lp_build_const_int32(gallivm
, 4);
670 shuffles
[3] = lp_build_const_int32(gallivm
, 6);
671 shuftmp
[0] = LLVMBuildShuffleVector(builder
, sumtmp
[0], sumtmp
[1],
672 LLVMConstVector(shuffles
, 4), "");
674 shuffles
[0] = lp_build_const_int32(gallivm
, 1);
675 shuffles
[1] = lp_build_const_int32(gallivm
, 3);
676 shuffles
[2] = lp_build_const_int32(gallivm
, 5);
677 shuffles
[3] = lp_build_const_int32(gallivm
, 7);
678 shuftmp
[1] = LLVMBuildShuffleVector(builder
, sumtmp
[0], sumtmp
[1],
679 LLVMConstVector(shuffles
, 4), "");
681 return LLVMBuildFAdd(builder
, shuftmp
[0], shuftmp
[1], "");
686 * partially horizontally add 2-4 float vectors with length nx4,
687 * i.e. only four adjacent values in each vector will be added,
688 * assuming values are really grouped in 4 which also determines
691 * Return a vector of the same length as the initial vectors,
692 * with the excess elements (if any) being undefined.
693 * The element order is independent of number of input vectors.
694 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
695 * the output order thus will be
696 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
699 lp_build_hadd_partial4(struct lp_build_context
*bld
,
700 LLVMValueRef vectors
[],
703 struct gallivm_state
*gallivm
= bld
->gallivm
;
704 LLVMBuilderRef builder
= gallivm
->builder
;
705 LLVMValueRef ret_vec
;
707 const char *intrinsic
= NULL
;
709 assert(num_vecs
>= 2 && num_vecs
<= 4);
710 assert(bld
->type
.floating
);
712 /* only use this with at least 2 vectors, as it is sort of expensive
713 * (depending on cpu) and we always need two horizontal adds anyway,
714 * so a shuffle/add approach might be better.
720 tmp
[2] = num_vecs
> 2 ? vectors
[2] : vectors
[0];
721 tmp
[3] = num_vecs
> 3 ? vectors
[3] : vectors
[0];
723 if (util_cpu_caps
.has_sse3
&& bld
->type
.width
== 32 &&
724 bld
->type
.length
== 4) {
725 intrinsic
= "llvm.x86.sse3.hadd.ps";
727 else if (util_cpu_caps
.has_avx
&& bld
->type
.width
== 32 &&
728 bld
->type
.length
== 8) {
729 intrinsic
= "llvm.x86.avx.hadd.ps.256";
732 tmp
[0] = lp_build_intrinsic_binary(builder
, intrinsic
,
733 lp_build_vec_type(gallivm
, bld
->type
),
736 tmp
[1] = lp_build_intrinsic_binary(builder
, intrinsic
,
737 lp_build_vec_type(gallivm
, bld
->type
),
743 return lp_build_intrinsic_binary(builder
, intrinsic
,
744 lp_build_vec_type(gallivm
, bld
->type
),
748 if (bld
->type
.length
== 4) {
749 ret_vec
= lp_build_horizontal_add4x4f(bld
, tmp
);
752 LLVMValueRef partres
[LP_MAX_VECTOR_LENGTH
/4];
754 unsigned num_iter
= bld
->type
.length
/ 4;
755 struct lp_type parttype
= bld
->type
;
757 for (j
= 0; j
< num_iter
; j
++) {
758 LLVMValueRef partsrc
[4];
760 for (i
= 0; i
< 4; i
++) {
761 partsrc
[i
] = lp_build_extract_range(gallivm
, tmp
[i
], j
*4, 4);
763 partres
[j
] = lp_build_horizontal_add4x4f(bld
, partsrc
);
765 ret_vec
= lp_build_concat(gallivm
, partres
, parttype
, num_iter
);
774 lp_build_sub(struct lp_build_context
*bld
,
778 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
779 const struct lp_type type
= bld
->type
;
782 assert(lp_check_value(type
, a
));
783 assert(lp_check_value(type
, b
));
787 if(a
== bld
->undef
|| b
== bld
->undef
)
793 const char *intrinsic
= NULL
;
798 if (type
.width
* type
.length
== 128 &&
799 !type
.floating
&& !type
.fixed
) {
800 if (util_cpu_caps
.has_sse2
) {
802 intrinsic
= type
.sign
? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
804 intrinsic
= type
.sign
? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
805 } else if (util_cpu_caps
.has_altivec
) {
807 intrinsic
= type
.sign
? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
809 intrinsic
= type
.sign
? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
814 return lp_build_intrinsic_binary(builder
, intrinsic
, lp_build_vec_type(bld
->gallivm
, bld
->type
), a
, b
);
817 if(type
.norm
&& !type
.floating
&& !type
.fixed
) {
819 uint64_t sign
= (uint64_t)1 << (type
.width
- 1);
820 LLVMValueRef max_val
= lp_build_const_int_vec(bld
->gallivm
, type
, sign
- 1);
821 LLVMValueRef min_val
= lp_build_const_int_vec(bld
->gallivm
, type
, sign
);
822 /* a_clamp_max is the maximum a for negative b,
823 a_clamp_min is the minimum a for positive b. */
824 LLVMValueRef a_clamp_max
= lp_build_min_simple(bld
, a
, LLVMBuildAdd(builder
, max_val
, b
, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED
);
825 LLVMValueRef a_clamp_min
= lp_build_max_simple(bld
, a
, LLVMBuildAdd(builder
, min_val
, b
, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED
);
826 a
= lp_build_select(bld
, lp_build_cmp(bld
, PIPE_FUNC_GREATER
, b
, bld
->zero
), a_clamp_min
, a_clamp_max
);
828 a
= lp_build_max_simple(bld
, a
, b
, GALLIVM_NAN_BEHAVIOR_UNDEFINED
);
832 if(LLVMIsConstant(a
) && LLVMIsConstant(b
))
834 res
= LLVMConstFSub(a
, b
);
836 res
= LLVMConstSub(a
, b
);
839 res
= LLVMBuildFSub(builder
, a
, b
, "");
841 res
= LLVMBuildSub(builder
, a
, b
, "");
843 if(bld
->type
.norm
&& (bld
->type
.floating
|| bld
->type
.fixed
))
844 res
= lp_build_max_simple(bld
, res
, bld
->zero
, GALLIVM_NAN_BEHAVIOR_UNDEFINED
);
852 * Normalized multiplication.
854 * There are several approaches for (using 8-bit normalized multiplication as
859 * makes the following approximation to the division (Sree)
861 * a*b/255 ~= (a*(b + 1)) >> 8
863 * which is the fastest method that satisfies the following OpenGL criteria of
865 * 0*0 = 0 and 255*255 = 255
869 * takes the geometric series approximation to the division
871 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) + ...
873 * in this case just the first two terms to fit in 16bit arithmetic
875 * t/255 ~= (t + (t >> 8)) >> 8
877 * note that just by itself it doesn't satisfy the OpenGL criteria, as
878 * 255*255 = 254, so the special case b = 255 must be accounted for or roundoff
881 * - geometric series plus rounding
883 * when using a geometric series division instead of truncating the result
884 * use roundoff in the approximation (Jim Blinn)
886 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
888 * achieving the exact results.
892 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
893 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
894 * @sa Michael Herf, The "double blend trick", May 2000,
895 * http://www.stereopsis.com/doubleblend.html
898 lp_build_mul_norm(struct gallivm_state
*gallivm
,
899 struct lp_type wide_type
,
900 LLVMValueRef a
, LLVMValueRef b
)
902 LLVMBuilderRef builder
= gallivm
->builder
;
903 struct lp_build_context bld
;
908 assert(!wide_type
.floating
);
909 assert(lp_check_value(wide_type
, a
));
910 assert(lp_check_value(wide_type
, b
));
912 lp_build_context_init(&bld
, gallivm
, wide_type
);
914 n
= wide_type
.width
/ 2;
915 if (wide_type
.sign
) {
920 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
921 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
925 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
928 ab
= LLVMBuildMul(builder
, a
, b
, "");
929 ab
= LLVMBuildAdd(builder
, ab
, lp_build_shr_imm(&bld
, ab
, n
), "");
932 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
935 half
= lp_build_const_int_vec(gallivm
, wide_type
, 1LL << (n
- 1));
936 if (wide_type
.sign
) {
937 LLVMValueRef minus_half
= LLVMBuildNeg(builder
, half
, "");
938 LLVMValueRef sign
= lp_build_shr_imm(&bld
, ab
, wide_type
.width
- 1);
939 half
= lp_build_select(&bld
, sign
, minus_half
, half
);
941 ab
= LLVMBuildAdd(builder
, ab
, half
, "");
944 ab
= lp_build_shr_imm(&bld
, ab
, n
);
953 lp_build_mul(struct lp_build_context
*bld
,
957 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
958 const struct lp_type type
= bld
->type
;
962 assert(lp_check_value(type
, a
));
963 assert(lp_check_value(type
, b
));
973 if(a
== bld
->undef
|| b
== bld
->undef
)
976 if (!type
.floating
&& !type
.fixed
&& type
.norm
) {
977 struct lp_type wide_type
= lp_wider_type(type
);
978 LLVMValueRef al
, ah
, bl
, bh
, abl
, abh
, ab
;
980 lp_build_unpack2(bld
->gallivm
, type
, wide_type
, a
, &al
, &ah
);
981 lp_build_unpack2(bld
->gallivm
, type
, wide_type
, b
, &bl
, &bh
);
983 /* PMULLW, PSRLW, PADDW */
984 abl
= lp_build_mul_norm(bld
->gallivm
, wide_type
, al
, bl
);
985 abh
= lp_build_mul_norm(bld
->gallivm
, wide_type
, ah
, bh
);
987 ab
= lp_build_pack2(bld
->gallivm
, wide_type
, type
, abl
, abh
);
993 shift
= lp_build_const_int_vec(bld
->gallivm
, type
, type
.width
/2);
997 if(LLVMIsConstant(a
) && LLVMIsConstant(b
)) {
999 res
= LLVMConstFMul(a
, b
);
1001 res
= LLVMConstMul(a
, b
);
1004 res
= LLVMConstAShr(res
, shift
);
1006 res
= LLVMConstLShr(res
, shift
);
1011 res
= LLVMBuildFMul(builder
, a
, b
, "");
1013 res
= LLVMBuildMul(builder
, a
, b
, "");
1016 res
= LLVMBuildAShr(builder
, res
, shift
, "");
1018 res
= LLVMBuildLShr(builder
, res
, shift
, "");
1027 * Small vector x scale multiplication optimization.
1030 lp_build_mul_imm(struct lp_build_context
*bld
,
1034 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1035 LLVMValueRef factor
;
1037 assert(lp_check_value(bld
->type
, a
));
1046 return lp_build_negate(bld
, a
);
1048 if(b
== 2 && bld
->type
.floating
)
1049 return lp_build_add(bld
, a
, a
);
1051 if(util_is_power_of_two(b
)) {
1052 unsigned shift
= ffs(b
) - 1;
1054 if(bld
->type
.floating
) {
1057 * Power of two multiplication by directly manipulating the exponent.
1059 * XXX: This might not be always faster, it will introduce a small error
1060 * for multiplication by zero, and it will produce wrong results
1063 unsigned mantissa
= lp_mantissa(bld
->type
);
1064 factor
= lp_build_const_int_vec(bld
->gallivm
, bld
->type
, (unsigned long long)shift
<< mantissa
);
1065 a
= LLVMBuildBitCast(builder
, a
, lp_build_int_vec_type(bld
->type
), "");
1066 a
= LLVMBuildAdd(builder
, a
, factor
, "");
1067 a
= LLVMBuildBitCast(builder
, a
, lp_build_vec_type(bld
->gallivm
, bld
->type
), "");
1072 factor
= lp_build_const_vec(bld
->gallivm
, bld
->type
, shift
);
1073 return LLVMBuildShl(builder
, a
, factor
, "");
1077 factor
= lp_build_const_vec(bld
->gallivm
, bld
->type
, (double)b
);
1078 return lp_build_mul(bld
, a
, factor
);
1086 lp_build_div(struct lp_build_context
*bld
,
1090 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1091 const struct lp_type type
= bld
->type
;
1093 assert(lp_check_value(type
, a
));
1094 assert(lp_check_value(type
, b
));
1098 if(a
== bld
->one
&& type
.floating
)
1099 return lp_build_rcp(bld
, b
);
1104 if(a
== bld
->undef
|| b
== bld
->undef
)
1107 if(LLVMIsConstant(a
) && LLVMIsConstant(b
)) {
1109 return LLVMConstFDiv(a
, b
);
1111 return LLVMConstSDiv(a
, b
);
1113 return LLVMConstUDiv(a
, b
);
1116 if(((util_cpu_caps
.has_sse
&& type
.width
== 32 && type
.length
== 4) ||
1117 (util_cpu_caps
.has_avx
&& type
.width
== 32 && type
.length
== 8)) &&
1119 return lp_build_mul(bld
, a
, lp_build_rcp(bld
, b
));
1122 return LLVMBuildFDiv(builder
, a
, b
, "");
1124 return LLVMBuildSDiv(builder
, a
, b
, "");
1126 return LLVMBuildUDiv(builder
, a
, b
, "");
1131 * Linear interpolation helper.
1133 * @param normalized whether we are interpolating normalized values,
1134 * encoded in normalized integers, twice as wide.
1136 * @sa http://www.stereopsis.com/doubleblend.html
1138 static inline LLVMValueRef
1139 lp_build_lerp_simple(struct lp_build_context
*bld
,
1145 unsigned half_width
= bld
->type
.width
/2;
1146 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1150 assert(lp_check_value(bld
->type
, x
));
1151 assert(lp_check_value(bld
->type
, v0
));
1152 assert(lp_check_value(bld
->type
, v1
));
1154 delta
= lp_build_sub(bld
, v1
, v0
);
1156 if (flags
& LP_BLD_LERP_WIDE_NORMALIZED
) {
1157 if (!bld
->type
.sign
) {
1158 if (!(flags
& LP_BLD_LERP_PRESCALED_WEIGHTS
)) {
1160 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1161 * most-significant bit to the least-significant bit, so that
1162 * later we can just divide by 2**n instead of 2**n - 1.
1165 x
= lp_build_add(bld
, x
, lp_build_shr_imm(bld
, x
, half_width
- 1));
1168 /* (x * delta) >> n */
1169 res
= lp_build_mul(bld
, x
, delta
);
1170 res
= lp_build_shr_imm(bld
, res
, half_width
);
1173 * The rescaling trick above doesn't work for signed numbers, so
1174 * use the 2**n - 1 division approximation in lp_build_mul_norm
1177 assert(!(flags
& LP_BLD_LERP_PRESCALED_WEIGHTS
));
1178 res
= lp_build_mul_norm(bld
->gallivm
, bld
->type
, x
, delta
);
1181 assert(!(flags
& LP_BLD_LERP_PRESCALED_WEIGHTS
));
1182 res
= lp_build_mul(bld
, x
, delta
);
1185 res
= lp_build_add(bld
, v0
, res
);
1187 if (((flags
& LP_BLD_LERP_WIDE_NORMALIZED
) && !bld
->type
.sign
) ||
1189 /* We need to mask out the high order bits when lerping 8bit normalized colors stored on 16bits */
1190 /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
1191 * but it will be wrong for true fixed point use cases. Basically we need
1192 * a more powerful lp_type, capable of further distinguishing the values
1193 * interpretation from the value storage. */
1194 res
= LLVMBuildAnd(builder
, res
, lp_build_const_int_vec(bld
->gallivm
, bld
->type
, (1 << half_width
) - 1), "");
1202 * Linear interpolation.
1205 lp_build_lerp(struct lp_build_context
*bld
,
1211 const struct lp_type type
= bld
->type
;
1214 assert(lp_check_value(type
, x
));
1215 assert(lp_check_value(type
, v0
));
1216 assert(lp_check_value(type
, v1
));
1218 assert(!(flags
& LP_BLD_LERP_WIDE_NORMALIZED
));
1221 struct lp_type wide_type
;
1222 struct lp_build_context wide_bld
;
1223 LLVMValueRef xl
, xh
, v0l
, v0h
, v1l
, v1h
, resl
, resh
;
1225 assert(type
.length
>= 2);
1228 * Create a wider integer type, enough to hold the
1229 * intermediate result of the multiplication.
1231 memset(&wide_type
, 0, sizeof wide_type
);
1232 wide_type
.sign
= type
.sign
;
1233 wide_type
.width
= type
.width
*2;
1234 wide_type
.length
= type
.length
/2;
1236 lp_build_context_init(&wide_bld
, bld
->gallivm
, wide_type
);
1238 lp_build_unpack2(bld
->gallivm
, type
, wide_type
, x
, &xl
, &xh
);
1239 lp_build_unpack2(bld
->gallivm
, type
, wide_type
, v0
, &v0l
, &v0h
);
1240 lp_build_unpack2(bld
->gallivm
, type
, wide_type
, v1
, &v1l
, &v1h
);
1246 flags
|= LP_BLD_LERP_WIDE_NORMALIZED
;
1248 resl
= lp_build_lerp_simple(&wide_bld
, xl
, v0l
, v1l
, flags
);
1249 resh
= lp_build_lerp_simple(&wide_bld
, xh
, v0h
, v1h
, flags
);
1251 res
= lp_build_pack2(bld
->gallivm
, wide_type
, type
, resl
, resh
);
1253 res
= lp_build_lerp_simple(bld
, x
, v0
, v1
, flags
);
1261 * Bilinear interpolation.
1263 * Values indices are in v_{yx}.
1266 lp_build_lerp_2d(struct lp_build_context
*bld
,
1275 LLVMValueRef v0
= lp_build_lerp(bld
, x
, v00
, v01
, flags
);
1276 LLVMValueRef v1
= lp_build_lerp(bld
, x
, v10
, v11
, flags
);
1277 return lp_build_lerp(bld
, y
, v0
, v1
, flags
);
1282 lp_build_lerp_3d(struct lp_build_context
*bld
,
1296 LLVMValueRef v0
= lp_build_lerp_2d(bld
, x
, y
, v000
, v001
, v010
, v011
, flags
);
1297 LLVMValueRef v1
= lp_build_lerp_2d(bld
, x
, y
, v100
, v101
, v110
, v111
, flags
);
1298 return lp_build_lerp(bld
, z
, v0
, v1
, flags
);
1303 * Generate min(a, b)
1304 * Do checks for special cases but not for nans.
1307 lp_build_min(struct lp_build_context
*bld
,
1311 assert(lp_check_value(bld
->type
, a
));
1312 assert(lp_check_value(bld
->type
, b
));
1314 if(a
== bld
->undef
|| b
== bld
->undef
)
1320 if (bld
->type
.norm
) {
1321 if (!bld
->type
.sign
) {
1322 if (a
== bld
->zero
|| b
== bld
->zero
) {
1332 return lp_build_min_simple(bld
, a
, b
, GALLIVM_NAN_BEHAVIOR_UNDEFINED
);
1337 * Generate min(a, b)
1338 * NaN's are handled according to the behavior specified by the
1339 * nan_behavior argument.
1342 lp_build_min_ext(struct lp_build_context
*bld
,
1345 enum gallivm_nan_behavior nan_behavior
)
1347 assert(lp_check_value(bld
->type
, a
));
1348 assert(lp_check_value(bld
->type
, b
));
1350 if(a
== bld
->undef
|| b
== bld
->undef
)
1356 if (bld
->type
.norm
) {
1357 if (!bld
->type
.sign
) {
1358 if (a
== bld
->zero
|| b
== bld
->zero
) {
1368 return lp_build_min_simple(bld
, a
, b
, nan_behavior
);
1372 * Generate max(a, b)
1373 * Do checks for special cases, but NaN behavior is undefined.
1376 lp_build_max(struct lp_build_context
*bld
,
1380 assert(lp_check_value(bld
->type
, a
));
1381 assert(lp_check_value(bld
->type
, b
));
1383 if(a
== bld
->undef
|| b
== bld
->undef
)
1389 if(bld
->type
.norm
) {
1390 if(a
== bld
->one
|| b
== bld
->one
)
1392 if (!bld
->type
.sign
) {
1393 if (a
== bld
->zero
) {
1396 if (b
== bld
->zero
) {
1402 return lp_build_max_simple(bld
, a
, b
, GALLIVM_NAN_BEHAVIOR_UNDEFINED
);
1407 * Generate max(a, b)
1408 * Checks for special cases.
1409 * NaN's are handled according to the behavior specified by the
1410 * nan_behavior argument.
1413 lp_build_max_ext(struct lp_build_context
*bld
,
1416 enum gallivm_nan_behavior nan_behavior
)
1418 assert(lp_check_value(bld
->type
, a
));
1419 assert(lp_check_value(bld
->type
, b
));
1421 if(a
== bld
->undef
|| b
== bld
->undef
)
1427 if(bld
->type
.norm
) {
1428 if(a
== bld
->one
|| b
== bld
->one
)
1430 if (!bld
->type
.sign
) {
1431 if (a
== bld
->zero
) {
1434 if (b
== bld
->zero
) {
1440 return lp_build_max_simple(bld
, a
, b
, nan_behavior
);
1444 * Generate clamp(a, min, max)
1445 * NaN behavior (for any of a, min, max) is undefined.
1446 * Do checks for special cases.
1449 lp_build_clamp(struct lp_build_context
*bld
,
1454 assert(lp_check_value(bld
->type
, a
));
1455 assert(lp_check_value(bld
->type
, min
));
1456 assert(lp_check_value(bld
->type
, max
));
1458 a
= lp_build_min(bld
, a
, max
);
1459 a
= lp_build_max(bld
, a
, min
);
1465 * Generate clamp(a, 0, 1)
1466 * A NaN will get converted to zero.
1469 lp_build_clamp_zero_one_nanzero(struct lp_build_context
*bld
,
1472 a
= lp_build_max_ext(bld
, a
, bld
->zero
, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN
);
1473 a
= lp_build_min(bld
, a
, bld
->one
);
1482 lp_build_abs(struct lp_build_context
*bld
,
1485 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1486 const struct lp_type type
= bld
->type
;
1487 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
1489 assert(lp_check_value(type
, a
));
1495 if (0x0306 <= HAVE_LLVM
&& HAVE_LLVM
< 0x0309) {
1496 /* Workaround llvm.org/PR27332 */
1497 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
1498 unsigned long long absMask
= ~(1ULL << (type
.width
- 1));
1499 LLVMValueRef mask
= lp_build_const_int_vec(bld
->gallivm
, type
, ((unsigned long long) absMask
));
1500 a
= LLVMBuildBitCast(builder
, a
, int_vec_type
, "");
1501 a
= LLVMBuildAnd(builder
, a
, mask
, "");
1502 a
= LLVMBuildBitCast(builder
, a
, vec_type
, "");
1506 lp_format_intrinsic(intrinsic
, sizeof intrinsic
, "llvm.fabs", vec_type
);
1507 return lp_build_intrinsic_unary(builder
, intrinsic
, vec_type
, a
);
1511 if(type
.width
*type
.length
== 128 && util_cpu_caps
.has_ssse3
) {
1512 switch(type
.width
) {
1514 return lp_build_intrinsic_unary(builder
, "llvm.x86.ssse3.pabs.b.128", vec_type
, a
);
1516 return lp_build_intrinsic_unary(builder
, "llvm.x86.ssse3.pabs.w.128", vec_type
, a
);
1518 return lp_build_intrinsic_unary(builder
, "llvm.x86.ssse3.pabs.d.128", vec_type
, a
);
1521 else if (type
.width
*type
.length
== 256 && util_cpu_caps
.has_ssse3
&&
1522 (gallivm_debug
& GALLIVM_DEBUG_PERF
) &&
1523 (type
.width
== 8 || type
.width
== 16 || type
.width
== 32)) {
1524 debug_printf("%s: inefficient code, should split vectors manually\n",
1528 return lp_build_max(bld
, a
, LLVMBuildNeg(builder
, a
, ""));
1533 lp_build_negate(struct lp_build_context
*bld
,
1536 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1538 assert(lp_check_value(bld
->type
, a
));
1540 if (bld
->type
.floating
)
1541 a
= LLVMBuildFNeg(builder
, a
, "");
1543 a
= LLVMBuildNeg(builder
, a
, "");
1549 /** Return -1, 0 or +1 depending on the sign of a */
1551 lp_build_sgn(struct lp_build_context
*bld
,
1554 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1555 const struct lp_type type
= bld
->type
;
1559 assert(lp_check_value(type
, a
));
1561 /* Handle non-zero case */
1563 /* if not zero then sign must be positive */
1566 else if(type
.floating
) {
1567 LLVMTypeRef vec_type
;
1568 LLVMTypeRef int_type
;
1572 unsigned long long maskBit
= (unsigned long long)1 << (type
.width
- 1);
1574 int_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
1575 vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
1576 mask
= lp_build_const_int_vec(bld
->gallivm
, type
, maskBit
);
1578 /* Take the sign bit and add it to 1 constant */
1579 sign
= LLVMBuildBitCast(builder
, a
, int_type
, "");
1580 sign
= LLVMBuildAnd(builder
, sign
, mask
, "");
1581 one
= LLVMConstBitCast(bld
->one
, int_type
);
1582 res
= LLVMBuildOr(builder
, sign
, one
, "");
1583 res
= LLVMBuildBitCast(builder
, res
, vec_type
, "");
1587 /* signed int/norm/fixed point */
1588 /* could use psign with sse3 and appropriate vectors here */
1589 LLVMValueRef minus_one
= lp_build_const_vec(bld
->gallivm
, type
, -1.0);
1590 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, bld
->zero
);
1591 res
= lp_build_select(bld
, cond
, bld
->one
, minus_one
);
1595 cond
= lp_build_cmp(bld
, PIPE_FUNC_EQUAL
, a
, bld
->zero
);
1596 res
= lp_build_select(bld
, cond
, bld
->zero
, res
);
1603 * Set the sign of float vector 'a' according to 'sign'.
1604 * If sign==0, return abs(a).
1605 * If sign==1, return -abs(a);
1606 * Other values for sign produce undefined results.
1609 lp_build_set_sign(struct lp_build_context
*bld
,
1610 LLVMValueRef a
, LLVMValueRef sign
)
1612 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1613 const struct lp_type type
= bld
->type
;
1614 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
1615 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
1616 LLVMValueRef shift
= lp_build_const_int_vec(bld
->gallivm
, type
, type
.width
- 1);
1617 LLVMValueRef mask
= lp_build_const_int_vec(bld
->gallivm
, type
,
1618 ~((unsigned long long) 1 << (type
.width
- 1)));
1619 LLVMValueRef val
, res
;
1621 assert(type
.floating
);
1622 assert(lp_check_value(type
, a
));
1624 /* val = reinterpret_cast<int>(a) */
1625 val
= LLVMBuildBitCast(builder
, a
, int_vec_type
, "");
1626 /* val = val & mask */
1627 val
= LLVMBuildAnd(builder
, val
, mask
, "");
1628 /* sign = sign << shift */
1629 sign
= LLVMBuildShl(builder
, sign
, shift
, "");
1630 /* res = val | sign */
1631 res
= LLVMBuildOr(builder
, val
, sign
, "");
1632 /* res = reinterpret_cast<float>(res) */
1633 res
= LLVMBuildBitCast(builder
, res
, vec_type
, "");
1640 * Convert vector of (or scalar) int to vector of (or scalar) float.
1643 lp_build_int_to_float(struct lp_build_context
*bld
,
1646 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1647 const struct lp_type type
= bld
->type
;
1648 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
1650 assert(type
.floating
);
1652 return LLVMBuildSIToFP(builder
, a
, vec_type
, "");
1656 arch_rounding_available(const struct lp_type type
)
1658 if ((util_cpu_caps
.has_sse4_1
&&
1659 (type
.length
== 1 || type
.width
*type
.length
== 128)) ||
1660 (util_cpu_caps
.has_avx
&& type
.width
*type
.length
== 256))
1662 else if ((util_cpu_caps
.has_altivec
&&
1663 (type
.width
== 32 && type
.length
== 4)))
/**
 * Rounding modes understood by the lp_build_round_* helpers.
 */
enum lp_build_round_mode
{
   LP_BUILD_ROUND_NEAREST  = 0,  /* round to nearest integer */
   LP_BUILD_ROUND_FLOOR    = 1,  /* round toward negative infinity */
   LP_BUILD_ROUND_CEIL     = 2,  /* round toward positive infinity */
   LP_BUILD_ROUND_TRUNCATE = 3   /* round toward zero */
};
1677 static inline LLVMValueRef
1678 lp_build_iround_nearest_sse2(struct lp_build_context
*bld
,
1681 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1682 const struct lp_type type
= bld
->type
;
1683 LLVMTypeRef i32t
= LLVMInt32TypeInContext(bld
->gallivm
->context
);
1684 LLVMTypeRef ret_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
1685 const char *intrinsic
;
1688 assert(type
.floating
);
1689 /* using the double precision conversions is a bit more complicated */
1690 assert(type
.width
== 32);
1692 assert(lp_check_value(type
, a
));
1693 assert(util_cpu_caps
.has_sse2
);
1695 /* This is relying on MXCSR rounding mode, which should always be nearest. */
1696 if (type
.length
== 1) {
1697 LLVMTypeRef vec_type
;
1700 LLVMValueRef index0
= LLVMConstInt(i32t
, 0, 0);
1702 vec_type
= LLVMVectorType(bld
->elem_type
, 4);
1704 intrinsic
= "llvm.x86.sse.cvtss2si";
1706 undef
= LLVMGetUndef(vec_type
);
1708 arg
= LLVMBuildInsertElement(builder
, undef
, a
, index0
, "");
1710 res
= lp_build_intrinsic_unary(builder
, intrinsic
,
1714 if (type
.width
* type
.length
== 128) {
1715 intrinsic
= "llvm.x86.sse2.cvtps2dq";
1718 assert(type
.width
*type
.length
== 256);
1719 assert(util_cpu_caps
.has_avx
);
1721 intrinsic
= "llvm.x86.avx.cvt.ps2dq.256";
1723 res
= lp_build_intrinsic_unary(builder
, intrinsic
,
1733 static inline LLVMValueRef
1734 lp_build_round_altivec(struct lp_build_context
*bld
,
1736 enum lp_build_round_mode mode
)
1738 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1739 const struct lp_type type
= bld
->type
;
1740 const char *intrinsic
= NULL
;
1742 assert(type
.floating
);
1744 assert(lp_check_value(type
, a
));
1745 assert(util_cpu_caps
.has_altivec
);
1750 case LP_BUILD_ROUND_NEAREST
:
1751 intrinsic
= "llvm.ppc.altivec.vrfin";
1753 case LP_BUILD_ROUND_FLOOR
:
1754 intrinsic
= "llvm.ppc.altivec.vrfim";
1756 case LP_BUILD_ROUND_CEIL
:
1757 intrinsic
= "llvm.ppc.altivec.vrfip";
1759 case LP_BUILD_ROUND_TRUNCATE
:
1760 intrinsic
= "llvm.ppc.altivec.vrfiz";
1764 return lp_build_intrinsic_unary(builder
, intrinsic
, bld
->vec_type
, a
);
1767 static inline LLVMValueRef
1768 lp_build_round_arch(struct lp_build_context
*bld
,
1770 enum lp_build_round_mode mode
)
1772 if (util_cpu_caps
.has_sse4_1
) {
1773 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1774 const struct lp_type type
= bld
->type
;
1775 const char *intrinsic_root
;
1778 assert(type
.floating
);
1779 assert(lp_check_value(type
, a
));
1783 case LP_BUILD_ROUND_NEAREST
:
1784 intrinsic_root
= "llvm.nearbyint";
1786 case LP_BUILD_ROUND_FLOOR
:
1787 intrinsic_root
= "llvm.floor";
1789 case LP_BUILD_ROUND_CEIL
:
1790 intrinsic_root
= "llvm.ceil";
1792 case LP_BUILD_ROUND_TRUNCATE
:
1793 intrinsic_root
= "llvm.trunc";
1797 lp_format_intrinsic(intrinsic
, sizeof intrinsic
, intrinsic_root
, bld
->vec_type
);
1798 return lp_build_intrinsic_unary(builder
, intrinsic
, bld
->vec_type
, a
);
1800 else /* (util_cpu_caps.has_altivec) */
1801 return lp_build_round_altivec(bld
, a
, mode
);
1805 * Return the integer part of a float (vector) value (== round toward zero).
1806 * The returned value is a float (vector).
1807 * Ex: trunc(-1.5) = -1.0
1810 lp_build_trunc(struct lp_build_context
*bld
,
1813 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1814 const struct lp_type type
= bld
->type
;
1816 assert(type
.floating
);
1817 assert(lp_check_value(type
, a
));
1819 if (arch_rounding_available(type
)) {
1820 return lp_build_round_arch(bld
, a
, LP_BUILD_ROUND_TRUNCATE
);
1823 const struct lp_type type
= bld
->type
;
1824 struct lp_type inttype
;
1825 struct lp_build_context intbld
;
1826 LLVMValueRef cmpval
= lp_build_const_vec(bld
->gallivm
, type
, 1<<24);
1827 LLVMValueRef trunc
, res
, anosign
, mask
;
1828 LLVMTypeRef int_vec_type
= bld
->int_vec_type
;
1829 LLVMTypeRef vec_type
= bld
->vec_type
;
1831 assert(type
.width
== 32); /* might want to handle doubles at some point */
1834 inttype
.floating
= 0;
1835 lp_build_context_init(&intbld
, bld
->gallivm
, inttype
);
1837 /* round by truncation */
1838 trunc
= LLVMBuildFPToSI(builder
, a
, int_vec_type
, "");
1839 res
= LLVMBuildSIToFP(builder
, trunc
, vec_type
, "floor.trunc");
1841 /* mask out sign bit */
1842 anosign
= lp_build_abs(bld
, a
);
1844 * mask out all values if anosign > 2^24
1845 * This should work both for large ints (all rounding is no-op for them
1846 * because such floats are always exact) as well as special cases like
1847 * NaNs, Infs (taking advantage of the fact they use max exponent).
1848 * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
1850 anosign
= LLVMBuildBitCast(builder
, anosign
, int_vec_type
, "");
1851 cmpval
= LLVMBuildBitCast(builder
, cmpval
, int_vec_type
, "");
1852 mask
= lp_build_cmp(&intbld
, PIPE_FUNC_GREATER
, anosign
, cmpval
);
1853 return lp_build_select(bld
, mask
, a
, res
);
1859 * Return float (vector) rounded to nearest integer (vector). The returned
1860 * value is a float (vector).
1861 * Ex: round(0.9) = 1.0
1862 * Ex: round(-1.5) = -2.0
1865 lp_build_round(struct lp_build_context
*bld
,
1868 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1869 const struct lp_type type
= bld
->type
;
1871 assert(type
.floating
);
1872 assert(lp_check_value(type
, a
));
1874 if (arch_rounding_available(type
)) {
1875 return lp_build_round_arch(bld
, a
, LP_BUILD_ROUND_NEAREST
);
1878 const struct lp_type type
= bld
->type
;
1879 struct lp_type inttype
;
1880 struct lp_build_context intbld
;
1881 LLVMValueRef cmpval
= lp_build_const_vec(bld
->gallivm
, type
, 1<<24);
1882 LLVMValueRef res
, anosign
, mask
;
1883 LLVMTypeRef int_vec_type
= bld
->int_vec_type
;
1884 LLVMTypeRef vec_type
= bld
->vec_type
;
1886 assert(type
.width
== 32); /* might want to handle doubles at some point */
1889 inttype
.floating
= 0;
1890 lp_build_context_init(&intbld
, bld
->gallivm
, inttype
);
1892 res
= lp_build_iround(bld
, a
);
1893 res
= LLVMBuildSIToFP(builder
, res
, vec_type
, "");
1895 /* mask out sign bit */
1896 anosign
= lp_build_abs(bld
, a
);
1898 * mask out all values if anosign > 2^24
1899 * This should work both for large ints (all rounding is no-op for them
1900 * because such floats are always exact) as well as special cases like
1901 * NaNs, Infs (taking advantage of the fact they use max exponent).
1902 * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
1904 anosign
= LLVMBuildBitCast(builder
, anosign
, int_vec_type
, "");
1905 cmpval
= LLVMBuildBitCast(builder
, cmpval
, int_vec_type
, "");
1906 mask
= lp_build_cmp(&intbld
, PIPE_FUNC_GREATER
, anosign
, cmpval
);
1907 return lp_build_select(bld
, mask
, a
, res
);
1913 * Return floor of float (vector), result is a float (vector)
1914 * Ex: floor(1.1) = 1.0
1915 * Ex: floor(-1.1) = -2.0
1918 lp_build_floor(struct lp_build_context
*bld
,
1921 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1922 const struct lp_type type
= bld
->type
;
1924 assert(type
.floating
);
1925 assert(lp_check_value(type
, a
));
1927 if (arch_rounding_available(type
)) {
1928 return lp_build_round_arch(bld
, a
, LP_BUILD_ROUND_FLOOR
);
1931 const struct lp_type type
= bld
->type
;
1932 struct lp_type inttype
;
1933 struct lp_build_context intbld
;
1934 LLVMValueRef cmpval
= lp_build_const_vec(bld
->gallivm
, type
, 1<<24);
1935 LLVMValueRef trunc
, res
, anosign
, mask
;
1936 LLVMTypeRef int_vec_type
= bld
->int_vec_type
;
1937 LLVMTypeRef vec_type
= bld
->vec_type
;
1939 if (type
.width
!= 32) {
1941 lp_format_intrinsic(intrinsic
, sizeof intrinsic
, "llvm.floor", vec_type
);
1942 return lp_build_intrinsic_unary(builder
, intrinsic
, vec_type
, a
);
1945 assert(type
.width
== 32); /* might want to handle doubles at some point */
1948 inttype
.floating
= 0;
1949 lp_build_context_init(&intbld
, bld
->gallivm
, inttype
);
1951 /* round by truncation */
1952 trunc
= LLVMBuildFPToSI(builder
, a
, int_vec_type
, "");
1953 res
= LLVMBuildSIToFP(builder
, trunc
, vec_type
, "floor.trunc");
1959 * fix values if rounding is wrong (for non-special cases)
1960 * - this is the case if trunc > a
1962 mask
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, res
, a
);
1963 /* tmp = trunc > a ? 1.0 : 0.0 */
1964 tmp
= LLVMBuildBitCast(builder
, bld
->one
, int_vec_type
, "");
1965 tmp
= lp_build_and(&intbld
, mask
, tmp
);
1966 tmp
= LLVMBuildBitCast(builder
, tmp
, vec_type
, "");
1967 res
= lp_build_sub(bld
, res
, tmp
);
1970 /* mask out sign bit */
1971 anosign
= lp_build_abs(bld
, a
);
1973 * mask out all values if anosign > 2^24
1974 * This should work both for large ints (all rounding is no-op for them
1975 * because such floats are always exact) as well as special cases like
1976 * NaNs, Infs (taking advantage of the fact they use max exponent).
1977 * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
1979 anosign
= LLVMBuildBitCast(builder
, anosign
, int_vec_type
, "");
1980 cmpval
= LLVMBuildBitCast(builder
, cmpval
, int_vec_type
, "");
1981 mask
= lp_build_cmp(&intbld
, PIPE_FUNC_GREATER
, anosign
, cmpval
);
1982 return lp_build_select(bld
, mask
, a
, res
);
1988 * Return ceiling of float (vector), returning float (vector).
1989 * Ex: ceil( 1.1) = 2.0
1990 * Ex: ceil(-1.1) = -1.0
1993 lp_build_ceil(struct lp_build_context
*bld
,
1996 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1997 const struct lp_type type
= bld
->type
;
1999 assert(type
.floating
);
2000 assert(lp_check_value(type
, a
));
2002 if (arch_rounding_available(type
)) {
2003 return lp_build_round_arch(bld
, a
, LP_BUILD_ROUND_CEIL
);
2006 const struct lp_type type
= bld
->type
;
2007 struct lp_type inttype
;
2008 struct lp_build_context intbld
;
2009 LLVMValueRef cmpval
= lp_build_const_vec(bld
->gallivm
, type
, 1<<24);
2010 LLVMValueRef trunc
, res
, anosign
, mask
, tmp
;
2011 LLVMTypeRef int_vec_type
= bld
->int_vec_type
;
2012 LLVMTypeRef vec_type
= bld
->vec_type
;
2014 if (type
.width
!= 32) {
2016 lp_format_intrinsic(intrinsic
, sizeof intrinsic
, "llvm.ceil", vec_type
);
2017 return lp_build_intrinsic_unary(builder
, intrinsic
, vec_type
, a
);
2020 assert(type
.width
== 32); /* might want to handle doubles at some point */
2023 inttype
.floating
= 0;
2024 lp_build_context_init(&intbld
, bld
->gallivm
, inttype
);
2026 /* round by truncation */
2027 trunc
= LLVMBuildFPToSI(builder
, a
, int_vec_type
, "");
2028 trunc
= LLVMBuildSIToFP(builder
, trunc
, vec_type
, "ceil.trunc");
2031 * fix values if rounding is wrong (for non-special cases)
2032 * - this is the case if trunc < a
2034 mask
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, trunc
, a
);
2035 /* tmp = trunc < a ? 1.0 : 0.0 */
2036 tmp
= LLVMBuildBitCast(builder
, bld
->one
, int_vec_type
, "");
2037 tmp
= lp_build_and(&intbld
, mask
, tmp
);
2038 tmp
= LLVMBuildBitCast(builder
, tmp
, vec_type
, "");
2039 res
= lp_build_add(bld
, trunc
, tmp
);
2041 /* mask out sign bit */
2042 anosign
= lp_build_abs(bld
, a
);
2044 * mask out all values if anosign > 2^24
2045 * This should work both for large ints (all rounding is no-op for them
2046 * because such floats are always exact) as well as special cases like
2047 * NaNs, Infs (taking advantage of the fact they use max exponent).
2048 * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
2050 anosign
= LLVMBuildBitCast(builder
, anosign
, int_vec_type
, "");
2051 cmpval
= LLVMBuildBitCast(builder
, cmpval
, int_vec_type
, "");
2052 mask
= lp_build_cmp(&intbld
, PIPE_FUNC_GREATER
, anosign
, cmpval
);
2053 return lp_build_select(bld
, mask
, a
, res
);
2059 * Return fractional part of 'a' computed as a - floor(a)
2060 * Typically used in texture coord arithmetic.
2063 lp_build_fract(struct lp_build_context
*bld
,
2066 assert(bld
->type
.floating
);
2067 return lp_build_sub(bld
, a
, lp_build_floor(bld
, a
));
2072 * Prevent returning a fractional part of 1.0 for very small negative values of
2073 * 'a' by clamping against 0.99999(9).
2075 static inline LLVMValueRef
2076 clamp_fract(struct lp_build_context
*bld
, LLVMValueRef fract
)
2080 /* this is the largest number smaller than 1.0 representable as float */
2081 max
= lp_build_const_vec(bld
->gallivm
, bld
->type
,
2082 1.0 - 1.0/(1LL << (lp_mantissa(bld
->type
) + 1)));
2083 return lp_build_min(bld
, fract
, max
);
2088 * Same as lp_build_fract, but guarantees that the result is always smaller
2092 lp_build_fract_safe(struct lp_build_context
*bld
,
2095 return clamp_fract(bld
, lp_build_fract(bld
, a
));
2100 * Return the integer part of a float (vector) value (== round toward zero).
2101 * The returned value is an integer (vector).
2102 * Ex: itrunc(-1.5) = -1
2105 lp_build_itrunc(struct lp_build_context
*bld
,
2108 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2109 const struct lp_type type
= bld
->type
;
2110 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
2112 assert(type
.floating
);
2113 assert(lp_check_value(type
, a
));
2115 return LLVMBuildFPToSI(builder
, a
, int_vec_type
, "");
2120 * Return float (vector) rounded to nearest integer (vector). The returned
2121 * value is an integer (vector).
2122 * Ex: iround(0.9) = 1
2123 * Ex: iround(-1.5) = -2
2126 lp_build_iround(struct lp_build_context
*bld
,
2129 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2130 const struct lp_type type
= bld
->type
;
2131 LLVMTypeRef int_vec_type
= bld
->int_vec_type
;
2134 assert(type
.floating
);
2136 assert(lp_check_value(type
, a
));
2138 if ((util_cpu_caps
.has_sse2
&&
2139 ((type
.width
== 32) && (type
.length
== 1 || type
.length
== 4))) ||
2140 (util_cpu_caps
.has_avx
&& type
.width
== 32 && type
.length
== 8)) {
2141 return lp_build_iround_nearest_sse2(bld
, a
);
2143 if (arch_rounding_available(type
)) {
2144 res
= lp_build_round_arch(bld
, a
, LP_BUILD_ROUND_NEAREST
);
2149 half
= lp_build_const_vec(bld
->gallivm
, type
, 0.5);
2152 LLVMTypeRef vec_type
= bld
->vec_type
;
2153 LLVMValueRef mask
= lp_build_const_int_vec(bld
->gallivm
, type
,
2154 (unsigned long long)1 << (type
.width
- 1));
2158 sign
= LLVMBuildBitCast(builder
, a
, int_vec_type
, "");
2159 sign
= LLVMBuildAnd(builder
, sign
, mask
, "");
2162 half
= LLVMBuildBitCast(builder
, half
, int_vec_type
, "");
2163 half
= LLVMBuildOr(builder
, sign
, half
, "");
2164 half
= LLVMBuildBitCast(builder
, half
, vec_type
, "");
2167 res
= LLVMBuildFAdd(builder
, a
, half
, "");
2170 res
= LLVMBuildFPToSI(builder
, res
, int_vec_type
, "");
2177 * Return floor of float (vector), result is an int (vector)
2178 * Ex: ifloor(1.1) = 1.0
2179 * Ex: ifloor(-1.1) = -2.0
2182 lp_build_ifloor(struct lp_build_context
*bld
,
2185 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2186 const struct lp_type type
= bld
->type
;
2187 LLVMTypeRef int_vec_type
= bld
->int_vec_type
;
2190 assert(type
.floating
);
2191 assert(lp_check_value(type
, a
));
2195 if (arch_rounding_available(type
)) {
2196 res
= lp_build_round_arch(bld
, a
, LP_BUILD_ROUND_FLOOR
);
2199 struct lp_type inttype
;
2200 struct lp_build_context intbld
;
2201 LLVMValueRef trunc
, itrunc
, mask
;
2203 assert(type
.floating
);
2204 assert(lp_check_value(type
, a
));
2207 inttype
.floating
= 0;
2208 lp_build_context_init(&intbld
, bld
->gallivm
, inttype
);
2210 /* round by truncation */
2211 itrunc
= LLVMBuildFPToSI(builder
, a
, int_vec_type
, "");
2212 trunc
= LLVMBuildSIToFP(builder
, itrunc
, bld
->vec_type
, "ifloor.trunc");
2215 * fix values if rounding is wrong (for non-special cases)
2216 * - this is the case if trunc > a
2217 * The results of doing this with NaNs, very large values etc.
2218 * are undefined but this seems to be the case anyway.
2220 mask
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, trunc
, a
);
2221 /* cheapie minus one with mask since the mask is minus one / zero */
2222 return lp_build_add(&intbld
, itrunc
, mask
);
2226 /* round to nearest (toward zero) */
2227 res
= LLVMBuildFPToSI(builder
, res
, int_vec_type
, "ifloor.res");
2234 * Return ceiling of float (vector), returning int (vector).
2235 * Ex: iceil( 1.1) = 2
2236 * Ex: iceil(-1.1) = -1
2239 lp_build_iceil(struct lp_build_context
*bld
,
2242 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2243 const struct lp_type type
= bld
->type
;
2244 LLVMTypeRef int_vec_type
= bld
->int_vec_type
;
2247 assert(type
.floating
);
2248 assert(lp_check_value(type
, a
));
2250 if (arch_rounding_available(type
)) {
2251 res
= lp_build_round_arch(bld
, a
, LP_BUILD_ROUND_CEIL
);
2254 struct lp_type inttype
;
2255 struct lp_build_context intbld
;
2256 LLVMValueRef trunc
, itrunc
, mask
;
2258 assert(type
.floating
);
2259 assert(lp_check_value(type
, a
));
2262 inttype
.floating
= 0;
2263 lp_build_context_init(&intbld
, bld
->gallivm
, inttype
);
2265 /* round by truncation */
2266 itrunc
= LLVMBuildFPToSI(builder
, a
, int_vec_type
, "");
2267 trunc
= LLVMBuildSIToFP(builder
, itrunc
, bld
->vec_type
, "iceil.trunc");
2270 * fix values if rounding is wrong (for non-special cases)
2271 * - this is the case if trunc < a
2272 * The results of doing this with NaNs, very large values etc.
2273 * are undefined but this seems to be the case anyway.
2275 mask
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, trunc
, a
);
2276 /* cheapie plus one with mask since the mask is minus one / zero */
2277 return lp_build_sub(&intbld
, itrunc
, mask
);
2280 /* round to nearest (toward zero) */
2281 res
= LLVMBuildFPToSI(builder
, res
, int_vec_type
, "iceil.res");
2288 * Combined ifloor() & fract().
2290 * Preferred to calling the functions separately, as it will ensure that the
2291 * strategy (floor() vs ifloor()) that results in less redundant work is used.
2294 lp_build_ifloor_fract(struct lp_build_context
*bld
,
2296 LLVMValueRef
*out_ipart
,
2297 LLVMValueRef
*out_fpart
)
2299 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2300 const struct lp_type type
= bld
->type
;
2303 assert(type
.floating
);
2304 assert(lp_check_value(type
, a
));
2306 if (arch_rounding_available(type
)) {
2308 * floor() is easier.
2311 ipart
= lp_build_floor(bld
, a
);
2312 *out_fpart
= LLVMBuildFSub(builder
, a
, ipart
, "fpart");
2313 *out_ipart
= LLVMBuildFPToSI(builder
, ipart
, bld
->int_vec_type
, "ipart");
2317 * ifloor() is easier.
2320 *out_ipart
= lp_build_ifloor(bld
, a
);
2321 ipart
= LLVMBuildSIToFP(builder
, *out_ipart
, bld
->vec_type
, "ipart");
2322 *out_fpart
= LLVMBuildFSub(builder
, a
, ipart
, "fpart");
2328 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2329 * always smaller than one.
2332 lp_build_ifloor_fract_safe(struct lp_build_context
*bld
,
2334 LLVMValueRef
*out_ipart
,
2335 LLVMValueRef
*out_fpart
)
2337 lp_build_ifloor_fract(bld
, a
, out_ipart
, out_fpart
);
2338 *out_fpart
= clamp_fract(bld
, *out_fpart
);
2343 lp_build_sqrt(struct lp_build_context
*bld
,
2346 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2347 const struct lp_type type
= bld
->type
;
2348 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
2351 assert(lp_check_value(type
, a
));
2353 assert(type
.floating
);
2354 lp_format_intrinsic(intrinsic
, sizeof intrinsic
, "llvm.sqrt", vec_type
);
2356 return lp_build_intrinsic_unary(builder
, intrinsic
, vec_type
, a
);
2361 * Do one Newton-Raphson step to improve reciprocate precision:
2363 * x_{i+1} = x_i * (2 - a * x_i)
2365 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2366 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
2367 * such as Google Earth, which does RCP(RSQRT(0.0) when drawing the Earth's
2368 * halo. It would be necessary to clamp the argument to prevent this.
2371 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2372 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2374 static inline LLVMValueRef
2375 lp_build_rcp_refine(struct lp_build_context
*bld
,
2379 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2380 LLVMValueRef two
= lp_build_const_vec(bld
->gallivm
, bld
->type
, 2.0);
2383 res
= LLVMBuildFMul(builder
, a
, rcp_a
, "");
2384 res
= LLVMBuildFSub(builder
, two
, res
, "");
2385 res
= LLVMBuildFMul(builder
, rcp_a
, res
, "");
2392 lp_build_rcp(struct lp_build_context
*bld
,
2395 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2396 const struct lp_type type
= bld
->type
;
2398 assert(lp_check_value(type
, a
));
2407 assert(type
.floating
);
2409 if(LLVMIsConstant(a
))
2410 return LLVMConstFDiv(bld
->one
, a
);
2413 * We don't use RCPPS because:
2414 * - it only has 10bits of precision
2415 * - it doesn't even get the reciprocate of 1.0 exactly
2416 * - doing Newton-Rapshon steps yields wrong (NaN) values for 0.0 or Inf
2417 * - for recent processors the benefit over DIVPS is marginal, a case
2420 * We could still use it on certain processors if benchmarks show that the
2421 * RCPPS plus necessary workarounds are still preferrable to DIVPS; or for
2422 * particular uses that require less workarounds.
2425 if (FALSE
&& ((util_cpu_caps
.has_sse
&& type
.width
== 32 && type
.length
== 4) ||
2426 (util_cpu_caps
.has_avx
&& type
.width
== 32 && type
.length
== 8))){
2427 const unsigned num_iterations
= 0;
2430 const char *intrinsic
= NULL
;
2432 if (type
.length
== 4) {
2433 intrinsic
= "llvm.x86.sse.rcp.ps";
2436 intrinsic
= "llvm.x86.avx.rcp.ps.256";
2439 res
= lp_build_intrinsic_unary(builder
, intrinsic
, bld
->vec_type
, a
);
2441 for (i
= 0; i
< num_iterations
; ++i
) {
2442 res
= lp_build_rcp_refine(bld
, a
, res
);
2448 return LLVMBuildFDiv(builder
, bld
->one
, a
, "");
2453 * Do one Newton-Raphson step to improve rsqrt precision:
2455 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2457 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2459 static inline LLVMValueRef
2460 lp_build_rsqrt_refine(struct lp_build_context
*bld
,
2462 LLVMValueRef rsqrt_a
)
2464 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2465 LLVMValueRef half
= lp_build_const_vec(bld
->gallivm
, bld
->type
, 0.5);
2466 LLVMValueRef three
= lp_build_const_vec(bld
->gallivm
, bld
->type
, 3.0);
2469 res
= LLVMBuildFMul(builder
, rsqrt_a
, rsqrt_a
, "");
2470 res
= LLVMBuildFMul(builder
, a
, res
, "");
2471 res
= LLVMBuildFSub(builder
, three
, res
, "");
2472 res
= LLVMBuildFMul(builder
, rsqrt_a
, res
, "");
2473 res
= LLVMBuildFMul(builder
, half
, res
, "");
/**
 * Generate 1/sqrt(a).
 * Result is undefined for values < 0, infinity for +0.
 *
 * The value is built as lp_build_rcp(lp_build_sqrt(a)).  A path based on
 * lp_build_fast_rsqrt() plus Newton-Raphson refinement is present but
 * compiled out by the constant 0 in its condition.
 *
 * @param bld  build context; bld->type must be a floating point type
 * @param a    input value
 * @return     1/sqrt(a)
 */
LLVMValueRef
lp_build_rsqrt(struct lp_build_context *bld,
               LLVMValueRef a)
{
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   assert(type.floating);

   /*
    * This should be faster but all denormals will end up as infinity.
    * (Disabled: the leading 0 makes this branch unreachable.)
    */
   if (0 && lp_build_fast_rsqrt_available(type)) {
      const unsigned num_iterations = 1;
      LLVMValueRef res;
      unsigned i;

      /* rsqrt(1.0) != 1.0 here */
      res = lp_build_fast_rsqrt(bld, a);

      if (num_iterations) {
         /*
          * Newton-Raphson will result in NaN instead of infinity for zero,
          * and NaN instead of zero for infinity.
          * Also, need to ensure rsqrt(1.0) == 1.0.
          * All numbers smaller than FLT_MIN will result in +infinity
          * (rsqrtps treats all denormals as zero).
          */
         LLVMValueRef cmp;
         LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
         LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);

         for (i = 0; i < num_iterations; ++i) {
            res = lp_build_rsqrt_refine(bld, a, res);
         }
         /* a < FLT_MIN -> +inf */
         cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
         res = lp_build_select(bld, cmp, inf, res);
         /* a == +inf -> 0 */
         cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
         res = lp_build_select(bld, cmp, bld->zero, res);
         /* a == 1.0 -> exactly 1.0 */
         cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
         res = lp_build_select(bld, cmp, bld->one, res);
      }

      return res;
   }

   return lp_build_rcp(bld, lp_build_sqrt(bld, a));
}
2534 * If there's a fast (inaccurate) rsqrt instruction available
2535 * (caller may want to avoid to call rsqrt_fast if it's not available,
2536 * i.e. for calculating x^0.5 it may do rsqrt_fast(x) * x but if
2537 * unavailable it would result in sqrt/div/mul so obviously
2538 * much better to just call sqrt, skipping both div and mul).
2541 lp_build_fast_rsqrt_available(struct lp_type type
)
2543 assert(type
.floating
);
2545 if ((util_cpu_caps
.has_sse
&& type
.width
== 32 && type
.length
== 4) ||
2546 (util_cpu_caps
.has_avx
&& type
.width
== 32 && type
.length
== 8)) {
2554 * Generate 1/sqrt(a).
2555 * Result is undefined for values < 0, infinity for +0.
2556 * Precision is limited, only ~10 bits guaranteed
2557 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2560 lp_build_fast_rsqrt(struct lp_build_context
*bld
,
2563 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2564 const struct lp_type type
= bld
->type
;
2566 assert(lp_check_value(type
, a
));
2568 if (lp_build_fast_rsqrt_available(type
)) {
2569 const char *intrinsic
= NULL
;
2571 if (type
.length
== 4) {
2572 intrinsic
= "llvm.x86.sse.rsqrt.ps";
2575 intrinsic
= "llvm.x86.avx.rsqrt.ps.256";
2577 return lp_build_intrinsic_unary(builder
, intrinsic
, bld
->vec_type
, a
);
2580 debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__
);
2582 return lp_build_rcp(bld
, lp_build_sqrt(bld
, a
));
/**
 * Generate sin(a) or cos(a) using polynomial approximation.
 * TODO: it might be worth recognizing sin and cos using same source
 * (i.e. d3d10 sincos opcode). Obviously doing both at the same time
 * would be way cheaper than calculating (nearly) everything twice...
 * Not sure it's common enough to be worth bothering however, scs
 * opcode could also benefit from calculating both though.
 *
 * The code mirrors an SSE intrinsics implementation; the matching
 * intrinsic sequence is quoted in the comment above each step.
 * NOTE(review): the integer constants used (0x80000000, shift by 29)
 * presume 32-bit elements -- confirm against callers.
 *
 * @param bld  build context
 * @param a    input value
 * @param cos  TRUE to generate cos(a), FALSE to generate sin(a)
 * @return     the approximation, clamped to [-1, 1]; NaN is substituted
 *             where lp_build_isfinite(a) does not flag the input as finite
 */
static LLVMValueRef
lp_build_sin_or_cos(struct lp_build_context *bld,
                    LLVMValueRef a,
                    boolean cos)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef b = gallivm->builder;
   struct lp_type int_type = lp_int_type(bld->type);

   /*
    *  take the absolute value,
    *  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
    */

   LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");

   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");

   /*
    * scale by 4/Pi
    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
    */

   LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");

   /*
    * store the integer part of y in mm0
    * emm2 = _mm_cvttps_epi32(y);
    */

   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");

   /*
    * j=(j+1) & (~1) (see the cephes sources)
    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
    */

   LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
   LLVMValueRef emm2_add =  LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
   /*
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
    */
   LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
   LLVMValueRef emm2_and =  LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");

   /*
    * y = _mm_cvtepi32_ps(emm2);
    */
   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");

   LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
   LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
   LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
   LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);

   /*
    * Argument used for poly selection and sign bit determination
    * is different for sin vs. cos.
    */
   LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
                               emm2_and;

   /*
    * cos: sign_bit = (4 & ~emm2_2) << 29
    * sin: sign_bit = (a ^ (emm2_add << 29)) & 0x80000000
    */
   LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
                                                              LLVMBuildNot(b, emm2_2, ""), ""),
                                              const_29, "sign_bit") :
                                 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
                                                              LLVMBuildShl(b, emm2_add,
                                                                           const_29, ""), ""),
                                              sign_mask, "sign_bit");

   /*
    * get the polynom selection mask
    * there is one polynom for 0 <= x <= Pi/4
    * and another one for Pi/4<x<=Pi/2
    * Both branches will be computed.
    *
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
    */

   LLVMValueRef emm2_3 =  LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
   LLVMValueRef poly_mask = lp_build_compare(gallivm,
                                             int_type, PIPE_FUNC_EQUAL,
                                             emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));

   /*
    *   _PS_CONST(minus_cephes_DP1, -0.78515625);
    *   _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
    *   _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
    */
   LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
   LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
   LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);

   /*
    * The magic pass: "Extended precision modular arithmetic"
    * x = ((x - y * DP1) - y * DP2) - y * DP3;
    * xmm1 = _mm_mul_ps(y, xmm1);
    * xmm2 = _mm_mul_ps(y, xmm2);
    * xmm3 = _mm_mul_ps(y, xmm3);
    */
   LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
   LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
   LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");

   /*
    * x = _mm_add_ps(x, xmm1);
    * x = _mm_add_ps(x, xmm2);
    * x = _mm_add_ps(x, xmm3);
    */

   LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
   LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
   LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");

   /*
    * Evaluate the first polynom  (0 <= x <= Pi/4)
    *
    * z = _mm_mul_ps(x,x);
    */
   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");

   /*
    * _PS_CONST(coscof_p0,  2.443315711809948E-005);
    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
    * _PS_CONST(coscof_p2,  4.166664568298827E-002);
    */
   LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
   LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
   LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);

   /*
    * y = *(v4sf*)_ps_coscof_p0;
    * y = _mm_mul_ps(y, z);
    *
    * i.e. y_8 = ((coscof_p0 * z + coscof_p1) * z + coscof_p2) * z * z
    */
   LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
   LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
   LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
   LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");


   /*
    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
    * y = _mm_sub_ps(y, tmp);
    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
    *
    * Note the IR value names below ("y_8", "y_9") lag the C variable
    * names (y_9, y_10) by one.
    */
   LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
   LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");

   /*
    * _PS_CONST(sincof_p0, -1.9515295891E-4);
    * _PS_CONST(sincof_p1,  8.3321608736E-3);
    * _PS_CONST(sincof_p2, -1.6666654611E-1);
    */
   LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
   LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
   LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);

   /*
    * Evaluate the second polynom  (Pi/4 <= x <= 0)
    * NOTE(review): the range above is copied from the reference code and
    * cannot be right as written -- confirm the intended interval.
    *
    * y2 = *(v4sf*)_ps_sincof_p0;
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_mul_ps(y2, x);
    * y2 = _mm_add_ps(y2, x);
    */

   LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
   LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
   LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
   LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
   LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
   LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");

   /*
    * select the correct result from the two polynoms
    * xmm3 = poly_mask;
    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
    * y = _mm_andnot_ps(xmm3, y);
    * y = _mm_or_ps(y,y2);
    */
   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
   LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
   LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");

   /*
    * update the sign
    * y = _mm_xor_ps(y, sign_bit);
    */
   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");

   LLVMValueRef isfinite = lp_build_isfinite(bld, a);

   /* clamp output to be within [-1, 1] */
   y_result = lp_build_clamp(bld, y_result,
                             lp_build_const_vec(bld->gallivm, bld->type,  -1.f),
                             lp_build_const_vec(bld->gallivm, bld->type,  1.f));
   /* If a is -inf, inf or NaN then return NaN */
   y_result = lp_build_select(bld, isfinite, y_result,
                              lp_build_const_vec(bld->gallivm, bld->type,  NAN));
   return y_result;
}
2819 lp_build_sin(struct lp_build_context
*bld
,
2822 return lp_build_sin_or_cos(bld
, a
, FALSE
);
2830 lp_build_cos(struct lp_build_context
*bld
,
2833 return lp_build_sin_or_cos(bld
, a
, TRUE
);
2838 * Generate pow(x, y)
2841 lp_build_pow(struct lp_build_context
*bld
,
2845 /* TODO: optimize the constant case */
2846 if (gallivm_debug
& GALLIVM_DEBUG_PERF
&&
2847 LLVMIsConstant(x
) && LLVMIsConstant(y
)) {
2848 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2852 return lp_build_exp2(bld
, lp_build_mul(bld
, lp_build_log2(bld
, x
), y
));
2860 lp_build_exp(struct lp_build_context
*bld
,
2863 /* log2(e) = 1/log(2) */
2864 LLVMValueRef log2e
= lp_build_const_vec(bld
->gallivm
, bld
->type
,
2865 1.4426950408889634);
2867 assert(lp_check_value(bld
->type
, x
));
2869 return lp_build_exp2(bld
, lp_build_mul(bld
, log2e
, x
));
2875 * Behavior is undefined with infs, 0s and nans
2878 lp_build_log(struct lp_build_context
*bld
,
2882 LLVMValueRef log2
= lp_build_const_vec(bld
->gallivm
, bld
->type
,
2883 0.69314718055994529);
2885 assert(lp_check_value(bld
->type
, x
));
2887 return lp_build_mul(bld
, log2
, lp_build_log2(bld
, x
));
2891 * Generate log(x) that handles edge cases (infs, 0s and nans)
2894 lp_build_log_safe(struct lp_build_context
*bld
,
2898 LLVMValueRef log2
= lp_build_const_vec(bld
->gallivm
, bld
->type
,
2899 0.69314718055994529);
2901 assert(lp_check_value(bld
->type
, x
));
2903 return lp_build_mul(bld
, log2
, lp_build_log2_safe(bld
, x
));
2908 * Generate polynomial.
2909 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
2912 lp_build_polynomial(struct lp_build_context
*bld
,
2914 const double *coeffs
,
2915 unsigned num_coeffs
)
2917 const struct lp_type type
= bld
->type
;
2918 LLVMValueRef even
= NULL
, odd
= NULL
;
2922 assert(lp_check_value(bld
->type
, x
));
2924 /* TODO: optimize the constant case */
2925 if (gallivm_debug
& GALLIVM_DEBUG_PERF
&&
2926 LLVMIsConstant(x
)) {
2927 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2932 * Calculate odd and even terms seperately to decrease data dependency
2934 * c[0] + x^2 * c[2] + x^4 * c[4] ...
2935 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
2937 x2
= lp_build_mul(bld
, x
, x
);
2939 for (i
= num_coeffs
; i
--; ) {
2942 coeff
= lp_build_const_vec(bld
->gallivm
, type
, coeffs
[i
]);
2946 even
= lp_build_add(bld
, coeff
, lp_build_mul(bld
, x2
, even
));
2951 odd
= lp_build_add(bld
, coeff
, lp_build_mul(bld
, x2
, odd
));
2958 return lp_build_add(bld
, lp_build_mul(bld
, odd
, x
), even
);
/**
 * Minimax polynomial fit of 2**x, in range [0, 1[
 *
 * One coefficient set is compiled in, chosen by EXP_POLY_DEGREE; any
 * other value of that macro is a compile error.  lp_build_exp2() hands
 * this table to lp_build_polynomial(), so the first entry is the
 * constant term.
 */
const double lp_build_exp2_polynomial[] = {
#if EXP_POLY_DEGREE == 5
   1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
   0.693153073200168932794,
   0.240153617044375388211,
   0.0558263180532956664775,
   0.00898934009049466391101,
   0.00187757667519147912699
#elif EXP_POLY_DEGREE == 4
   1.00000259337069434683,
   0.693003834469974940458,
   0.24144275689150793076,
   0.0520114606103070150235,
   0.0135341679161270268764
#elif EXP_POLY_DEGREE == 3
   0.999925218562710312959,
   0.695833540494823811697,
   0.226067155427249155588,
   0.0780245226406372992967
#elif EXP_POLY_DEGREE == 2
   1.00172476321474503578,
   0.657636275736077639316,
   0.33718943461968720704
#else
#error
#endif
};
2999 lp_build_exp2(struct lp_build_context
*bld
,
3002 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
3003 const struct lp_type type
= bld
->type
;
3004 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
3005 LLVMValueRef ipart
= NULL
;
3006 LLVMValueRef fpart
= NULL
;
3007 LLVMValueRef expipart
= NULL
;
3008 LLVMValueRef expfpart
= NULL
;
3009 LLVMValueRef res
= NULL
;
3011 assert(lp_check_value(bld
->type
, x
));
3013 /* TODO: optimize the constant case */
3014 if (gallivm_debug
& GALLIVM_DEBUG_PERF
&&
3015 LLVMIsConstant(x
)) {
3016 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3020 assert(type
.floating
&& type
.width
== 32);
3022 /* We want to preserve NaN and make sure than for exp2 if x > 128,
3023 * the result is INF and if it's smaller than -126.9 the result is 0 */
3024 x
= lp_build_min_ext(bld
, lp_build_const_vec(bld
->gallivm
, type
, 128.0), x
,
3025 GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN
);
3026 x
= lp_build_max_ext(bld
, lp_build_const_vec(bld
->gallivm
, type
, -126.99999),
3027 x
, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN
);
3029 /* ipart = floor(x) */
3030 /* fpart = x - ipart */
3031 lp_build_ifloor_fract(bld
, x
, &ipart
, &fpart
);
3033 /* expipart = (float) (1 << ipart) */
3034 expipart
= LLVMBuildAdd(builder
, ipart
,
3035 lp_build_const_int_vec(bld
->gallivm
, type
, 127), "");
3036 expipart
= LLVMBuildShl(builder
, expipart
,
3037 lp_build_const_int_vec(bld
->gallivm
, type
, 23), "");
3038 expipart
= LLVMBuildBitCast(builder
, expipart
, vec_type
, "");
3040 expfpart
= lp_build_polynomial(bld
, fpart
, lp_build_exp2_polynomial
,
3041 Elements(lp_build_exp2_polynomial
));
3043 res
= LLVMBuildFMul(builder
, expipart
, expfpart
, "");
3051 * Extract the exponent of a IEEE-754 floating point value.
3053 * Optionally apply an integer bias.
3055 * Result is an integer value with
3057 * ifloor(log2(x)) + bias
3060 lp_build_extract_exponent(struct lp_build_context
*bld
,
3064 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
3065 const struct lp_type type
= bld
->type
;
3066 unsigned mantissa
= lp_mantissa(type
);
3069 assert(type
.floating
);
3071 assert(lp_check_value(bld
->type
, x
));
3073 x
= LLVMBuildBitCast(builder
, x
, bld
->int_vec_type
, "");
3075 res
= LLVMBuildLShr(builder
, x
,
3076 lp_build_const_int_vec(bld
->gallivm
, type
, mantissa
), "");
3077 res
= LLVMBuildAnd(builder
, res
,
3078 lp_build_const_int_vec(bld
->gallivm
, type
, 255), "");
3079 res
= LLVMBuildSub(builder
, res
,
3080 lp_build_const_int_vec(bld
->gallivm
, type
, 127 - bias
), "");
3087 * Extract the mantissa of the a floating.
3089 * Result is a floating point value with
3091 * x / floor(log2(x))
3094 lp_build_extract_mantissa(struct lp_build_context
*bld
,
3097 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
3098 const struct lp_type type
= bld
->type
;
3099 unsigned mantissa
= lp_mantissa(type
);
3100 LLVMValueRef mantmask
= lp_build_const_int_vec(bld
->gallivm
, type
,
3101 (1ULL << mantissa
) - 1);
3102 LLVMValueRef one
= LLVMConstBitCast(bld
->one
, bld
->int_vec_type
);
3105 assert(lp_check_value(bld
->type
, x
));
3107 assert(type
.floating
);
3109 x
= LLVMBuildBitCast(builder
, x
, bld
->int_vec_type
, "");
3111 /* res = x / 2**ipart */
3112 res
= LLVMBuildAnd(builder
, x
, mantmask
, "");
3113 res
= LLVMBuildOr(builder
, res
, one
, "");
3114 res
= LLVMBuildBitCast(builder
, res
, bld
->vec_type
, "");
/**
 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x) ,for x in range of [0, 1/9[
 * These coefficients can be generate with
 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
 *
 * One coefficient set is compiled in, chosen by LOG_POLY_DEGREE; any
 * other value of that macro is a compile error.  lp_build_log2_approx()
 * hands this table to lp_build_polynomial(), so the first entry is the
 * constant term.
 */
const double lp_build_log2_polynomial[] = {
#if LOG_POLY_DEGREE == 5
   2.88539008148777786488L,
   0.961796878841293367824L,
   0.577058946784739859012L,
   0.412914355135828735411L,
   0.308591899232910175289L,
   0.352376952300281371868L,
#elif LOG_POLY_DEGREE == 4
   2.88539009343309178325L,
   0.961791550404184197881L,
   0.577440339438736392009L,
   0.403343858251329912514L,
   0.406718052498846252698L,
#elif LOG_POLY_DEGREE == 3
   2.88538959748872753838L,
   0.961932915889597772928L,
   0.571118517972136195241L,
   0.493997535084709500285L,
#else
#error
#endif
};
/**
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 * http://en.wikipedia.org/wiki/Logarithm#Calculation
 * http://www.nezumi.demon.co.uk/consult/logx.htm
 *
 * If handle_edge_cases is true the function will perform computations
 * to match the required D3D10+ behavior for each of the edge cases.
 * That means that if input is:
 * - less than zero (to and including -inf) then NaN will be returned
 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
 * - +infinity, then +infinity will be returned
 * - NaN, then NaN will be returned
 *
 * Those checks are fairly expensive so if you don't need them make sure
 * handle_edge_cases is false.
 *
 * Each output pointer may be NULL; only the work needed for the non-NULL
 * outputs is generated.
 *
 * @param bld                build context; bld->type must be 32-bit float
 *                           when any output is requested
 * @param x                  input value
 * @param p_exp              if non-NULL, receives the exponent bits of x
 *                           (x AND 0x7f800000) reinterpreted as float
 * @param p_floor_log2       if non-NULL, receives the unbiased exponent of
 *                           x converted to float
 * @param p_log2             if non-NULL, receives the log2(x) approximation
 * @param handle_edge_cases  apply the special-case selects described above
 */
void
lp_build_log2_approx(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef *p_exp,
                     LLVMValueRef *p_floor_log2,
                     LLVMValueRef *p_log2,
                     boolean handle_edge_cases)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);

   /* bit masks of the 32-bit float exponent and mantissa fields */
   LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
   LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
   /* bit pattern of 1.0, used to rebuild a float from bare mantissa bits */
   LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);

   LLVMValueRef i = NULL;
   LLVMValueRef y = NULL;
   LLVMValueRef z = NULL;
   LLVMValueRef exp = NULL;
   LLVMValueRef mant = NULL;
   LLVMValueRef logexp = NULL;
   LLVMValueRef logmant = NULL;
   LLVMValueRef res = NULL;

   assert(lp_check_value(bld->type, x));

   if(p_exp || p_floor_log2 || p_log2) {
      /* TODO: optimize the constant case */
      if (gallivm_debug & GALLIVM_DEBUG_PERF &&
          LLVMIsConstant(x)) {
         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                      __FUNCTION__);
      }

      assert(type.floating && type.width == 32);

      /* 
       * We don't explicitly handle denormalized numbers. They will yield a
       * result in the neighbourhood of -127, which appears to be adequate
       * enough.
       */

      i = LLVMBuildBitCast(builder, x, int_vec_type, "");

      /* exp = (float) exponent(x) */
      exp = LLVMBuildAnd(builder, i, expmask, "");
   }

   if(p_floor_log2 || p_log2) {
      /* logexp = (float) ((exp >> 23) - 127) */
      logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
      logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
      logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
   }

   if (p_log2) {
      /* mant = 1 + (float) mantissa(x) */
      mant = LLVMBuildAnd(builder, i, mantmask, "");
      mant = LLVMBuildOr(builder, mant, one, "");
      mant = LLVMBuildBitCast(builder, mant, vec_type, "");

      /* y = (mant - 1) / (mant + 1) */
      y = lp_build_div(bld,
         lp_build_sub(bld, mant, bld->one),
         lp_build_add(bld, mant, bld->one)
      );

      /* z = y^2 */
      z = lp_build_mul(bld, y, y);

      /* compute P(z) */
      logmant = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
                                    Elements(lp_build_log2_polynomial));

      /* logmant = y * P(z) */
      logmant = lp_build_mul(bld, y, logmant);

      res = lp_build_add(bld, logmant, logexp);

      if (type.floating && handle_edge_cases) {
         LLVMValueRef negmask, infmask,  zmask;
         negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
                                lp_build_const_vec(bld->gallivm, type,  0.0f));
         zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
                              lp_build_const_vec(bld->gallivm, type,  0.0f));
         infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
                                lp_build_const_vec(bld->gallivm, type,  INFINITY));

         /* If x is equal to inf make sure we return inf */
         res = lp_build_select(bld, infmask,
                               lp_build_const_vec(bld->gallivm, type,  INFINITY),
                               res);
         /* If x is equal to 0, return -inf */
         res = lp_build_select(bld, zmask,
                               lp_build_const_vec(bld->gallivm, type,  -INFINITY),
                               res);
         /* If x is nan or less than 0, return nan */
         res = lp_build_select(bld, negmask,
                               lp_build_const_vec(bld->gallivm, type,  NAN),
                               res);
      }
   }

   if (p_exp) {
      exp = LLVMBuildBitCast(builder, exp, vec_type, "");
      *p_exp = exp;
   }

   if (p_floor_log2)
      *p_floor_log2 = logexp;

   if (p_log2)
      *p_log2 = res;
}
3284 * log2 implementation which doesn't have special code to
3285 * handle edge cases (-inf, 0, inf, NaN). It's faster but
3286 * the results for those cases are undefined.
3289 lp_build_log2(struct lp_build_context
*bld
,
3293 lp_build_log2_approx(bld
, x
, NULL
, NULL
, &res
, FALSE
);
3298 * Version of log2 which handles all edge cases.
3299 * Look at documentation of lp_build_log2_approx for
3300 * description of the behavior for each of the edge cases.
3303 lp_build_log2_safe(struct lp_build_context
*bld
,
3307 lp_build_log2_approx(bld
, x
, NULL
, NULL
, &res
, TRUE
);
3313 * Faster (and less accurate) log2.
3315 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3317 * Piece-wise linear approximation, with exact results when x is a
3320 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3323 lp_build_fast_log2(struct lp_build_context
*bld
,
3326 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
3330 assert(lp_check_value(bld
->type
, x
));
3332 assert(bld
->type
.floating
);
3334 /* ipart = floor(log2(x)) - 1 */
3335 ipart
= lp_build_extract_exponent(bld
, x
, -1);
3336 ipart
= LLVMBuildSIToFP(builder
, ipart
, bld
->vec_type
, "");
3338 /* fpart = x / 2**ipart */
3339 fpart
= lp_build_extract_mantissa(bld
, x
);
3342 return LLVMBuildFAdd(builder
, ipart
, fpart
, "");
3347 * Fast implementation of iround(log2(x)).
3349 * Not an approximation -- it should give accurate results all the time.
3352 lp_build_ilog2(struct lp_build_context
*bld
,
3355 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
3356 LLVMValueRef sqrt2
= lp_build_const_vec(bld
->gallivm
, bld
->type
, M_SQRT2
);
3359 assert(bld
->type
.floating
);
3361 assert(lp_check_value(bld
->type
, x
));
3363 /* x * 2^(0.5) i.e., add 0.5 to the log2(x) */
3364 x
= LLVMBuildFMul(builder
, x
, sqrt2
, "");
3366 /* ipart = floor(log2(x) + 0.5) */
3367 ipart
= lp_build_extract_exponent(bld
, x
, 0);
3373 lp_build_mod(struct lp_build_context
*bld
,
3377 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
3379 const struct lp_type type
= bld
->type
;
3381 assert(lp_check_value(type
, x
));
3382 assert(lp_check_value(type
, y
));
3385 res
= LLVMBuildFRem(builder
, x
, y
, "");
3387 res
= LLVMBuildSRem(builder
, x
, y
, "");
3389 res
= LLVMBuildURem(builder
, x
, y
, "");
3395 * For floating inputs it creates and returns a mask
3396 * which is all 1's for channels which are NaN.
3397 * Channels inside x which are not NaN will be 0.
3400 lp_build_isnan(struct lp_build_context
*bld
,
3404 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(bld
->gallivm
, bld
->type
);
3406 assert(bld
->type
.floating
);
3407 assert(lp_check_value(bld
->type
, x
));
3409 mask
= LLVMBuildFCmp(bld
->gallivm
->builder
, LLVMRealOEQ
, x
, x
,
3411 mask
= LLVMBuildNot(bld
->gallivm
->builder
, mask
, "");
3412 mask
= LLVMBuildSExt(bld
->gallivm
->builder
, mask
, int_vec_type
, "isnan");
3416 /* Returns all 1's for floating point numbers that are
3417 * finite numbers and returns all zeros for -inf,
3420 lp_build_isfinite(struct lp_build_context
*bld
,
3423 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
3424 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(bld
->gallivm
, bld
->type
);
3425 struct lp_type int_type
= lp_int_type(bld
->type
);
3426 LLVMValueRef intx
= LLVMBuildBitCast(builder
, x
, int_vec_type
, "");
3427 LLVMValueRef infornan32
= lp_build_const_int_vec(bld
->gallivm
, bld
->type
,
3430 if (!bld
->type
.floating
) {
3431 return lp_build_const_int_vec(bld
->gallivm
, bld
->type
, 0);
3433 assert(bld
->type
.floating
);
3434 assert(lp_check_value(bld
->type
, x
));
3435 assert(bld
->type
.width
== 32);
3437 intx
= LLVMBuildAnd(builder
, intx
, infornan32
, "");
3438 return lp_build_compare(bld
->gallivm
, int_type
, PIPE_FUNC_NOTEQUAL
,
3443 * Returns true if the number is nan or inf and false otherwise.
3444 * The input has to be a floating point vector.
3447 lp_build_is_inf_or_nan(struct gallivm_state
*gallivm
,
3448 const struct lp_type type
,
3451 LLVMBuilderRef builder
= gallivm
->builder
;
3452 struct lp_type int_type
= lp_int_type(type
);
3453 LLVMValueRef const0
= lp_build_const_int_vec(gallivm
, int_type
,
3457 assert(type
.floating
);
3459 ret
= LLVMBuildBitCast(builder
, x
, lp_build_vec_type(gallivm
, int_type
), "");
3460 ret
= LLVMBuildAnd(builder
, ret
, const0
, "");
3461 ret
= lp_build_compare(gallivm
, int_type
, PIPE_FUNC_EQUAL
,
3469 lp_build_fpstate_get(struct gallivm_state
*gallivm
)
3471 if (util_cpu_caps
.has_sse
) {
3472 LLVMBuilderRef builder
= gallivm
->builder
;
3473 LLVMValueRef mxcsr_ptr
= lp_build_alloca(
3475 LLVMInt32TypeInContext(gallivm
->context
),
3477 LLVMValueRef mxcsr_ptr8
= LLVMBuildPointerCast(builder
, mxcsr_ptr
,
3478 LLVMPointerType(LLVMInt8TypeInContext(gallivm
->context
), 0), "");
3479 lp_build_intrinsic(builder
,
3480 "llvm.x86.sse.stmxcsr",
3481 LLVMVoidTypeInContext(gallivm
->context
),
3489 lp_build_fpstate_set_denorms_zero(struct gallivm_state
*gallivm
,
3492 if (util_cpu_caps
.has_sse
) {
3493 /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
3494 int daz_ftz
= _MM_FLUSH_ZERO_MASK
;
3496 LLVMBuilderRef builder
= gallivm
->builder
;
3497 LLVMValueRef mxcsr_ptr
= lp_build_fpstate_get(gallivm
);
3498 LLVMValueRef mxcsr
=
3499 LLVMBuildLoad(builder
, mxcsr_ptr
, "mxcsr");
3501 if (util_cpu_caps
.has_daz
) {
3502 /* Enable denormals are zero mode */
3503 daz_ftz
|= _MM_DENORMALS_ZERO_MASK
;
3506 mxcsr
= LLVMBuildOr(builder
, mxcsr
,
3507 LLVMConstInt(LLVMTypeOf(mxcsr
), daz_ftz
, 0), "");
3509 mxcsr
= LLVMBuildAnd(builder
, mxcsr
,
3510 LLVMConstInt(LLVMTypeOf(mxcsr
), ~daz_ftz
, 0), "");
3513 LLVMBuildStore(builder
, mxcsr
, mxcsr_ptr
);
3514 lp_build_fpstate_set(gallivm
, mxcsr_ptr
);
3519 lp_build_fpstate_set(struct gallivm_state
*gallivm
,
3520 LLVMValueRef mxcsr_ptr
)
3522 if (util_cpu_caps
.has_sse
) {
3523 LLVMBuilderRef builder
= gallivm
->builder
;
3524 mxcsr_ptr
= LLVMBuildPointerCast(builder
, mxcsr_ptr
,
3525 LLVMPointerType(LLVMInt8TypeInContext(gallivm
->context
), 0), "");
3526 lp_build_intrinsic(builder
,
3527 "llvm.x86.sse.ldmxcsr",
3528 LLVMVoidTypeInContext(gallivm
->context
),