1 /**************************************************************************
3 * Copyright 2009-2010 VMware, Inc.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **************************************************************************/
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
38 * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
44 * @author Jose Fonseca <jfonseca@vmware.com>
50 #include "util/u_memory.h"
51 #include "util/u_debug.h"
52 #include "util/u_math.h"
53 #include "util/u_string.h"
54 #include "util/u_cpu_detect.h"
56 #include "lp_bld_type.h"
57 #include "lp_bld_const.h"
58 #include "lp_bld_init.h"
59 #include "lp_bld_intr.h"
60 #include "lp_bld_logic.h"
61 #include "lp_bld_pack.h"
62 #include "lp_bld_debug.h"
63 #include "lp_bld_bitarit.h"
64 #include "lp_bld_arit.h"
65 #include "lp_bld_flow.h"
67 #if defined(PIPE_ARCH_SSE)
68 #include <xmmintrin.h>
71 #ifndef _MM_DENORMALS_ZERO_MASK
72 #define _MM_DENORMALS_ZERO_MASK 0x0040
75 #ifndef _MM_FLUSH_ZERO_MASK
76 #define _MM_FLUSH_ZERO_MASK 0x8000
79 #define EXP_POLY_DEGREE 5
81 #define LOG_POLY_DEGREE 4
86 * No checks for special case values of a or b = 1 or 0 are done.
87 * NaN's are handled according to the behavior specified by the
88 * nan_behavior argument.
91 lp_build_min_simple(struct lp_build_context
*bld
,
94 enum gallivm_nan_behavior nan_behavior
)
96 const struct lp_type type
= bld
->type
;
97 const char *intrinsic
= NULL
;
98 unsigned intr_size
= 0;
101 assert(lp_check_value(type
, a
));
102 assert(lp_check_value(type
, b
));
104 /* TODO: optimize the constant case */
106 if (type
.floating
&& util_cpu_caps
.has_sse
) {
107 if (type
.width
== 32) {
108 if (type
.length
== 1) {
109 intrinsic
= "llvm.x86.sse.min.ss";
112 else if (type
.length
<= 4 || !util_cpu_caps
.has_avx
) {
113 intrinsic
= "llvm.x86.sse.min.ps";
117 intrinsic
= "llvm.x86.avx.min.ps.256";
121 if (type
.width
== 64 && util_cpu_caps
.has_sse2
) {
122 if (type
.length
== 1) {
123 intrinsic
= "llvm.x86.sse2.min.sd";
126 else if (type
.length
== 2 || !util_cpu_caps
.has_avx
) {
127 intrinsic
= "llvm.x86.sse2.min.pd";
131 intrinsic
= "llvm.x86.avx.min.pd.256";
136 else if (type
.floating
&& util_cpu_caps
.has_altivec
) {
137 if (nan_behavior
== GALLIVM_NAN_RETURN_NAN
) {
138 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
141 if (type
.width
== 32 && type
.length
== 4) {
142 intrinsic
= "llvm.ppc.altivec.vminfp";
145 } else if (util_cpu_caps
.has_sse2
&& type
.length
>= 2) {
147 if ((type
.width
== 8 || type
.width
== 16) &&
148 (type
.width
* type
.length
<= 64) &&
149 (gallivm_debug
& GALLIVM_DEBUG_PERF
)) {
150 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
153 if (type
.width
== 8 && !type
.sign
) {
154 intrinsic
= "llvm.x86.sse2.pminu.b";
156 else if (type
.width
== 16 && type
.sign
) {
157 intrinsic
= "llvm.x86.sse2.pmins.w";
159 if (util_cpu_caps
.has_sse4_1
) {
160 if (type
.width
== 8 && type
.sign
) {
161 intrinsic
= "llvm.x86.sse41.pminsb";
163 if (type
.width
== 16 && !type
.sign
) {
164 intrinsic
= "llvm.x86.sse41.pminuw";
166 if (type
.width
== 32 && !type
.sign
) {
167 intrinsic
= "llvm.x86.sse41.pminud";
169 if (type
.width
== 32 && type
.sign
) {
170 intrinsic
= "llvm.x86.sse41.pminsd";
173 } else if (util_cpu_caps
.has_altivec
) {
175 if (type
.width
== 8) {
177 intrinsic
= "llvm.ppc.altivec.vminub";
179 intrinsic
= "llvm.ppc.altivec.vminsb";
181 } else if (type
.width
== 16) {
183 intrinsic
= "llvm.ppc.altivec.vminuh";
185 intrinsic
= "llvm.ppc.altivec.vminsh";
187 } else if (type
.width
== 32) {
189 intrinsic
= "llvm.ppc.altivec.vminuw";
191 intrinsic
= "llvm.ppc.altivec.vminsw";
197 /* We need to handle nan's for floating point numbers. If one of the
198 * inputs is nan the other should be returned (required by both D3D10+
200 * The sse intrinsics return the second operator in case of nan by
201 * default so we need to special code to handle those.
203 if (util_cpu_caps
.has_sse
&& type
.floating
&&
204 nan_behavior
!= GALLIVM_NAN_BEHAVIOR_UNDEFINED
&&
205 nan_behavior
!= GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN
) {
206 LLVMValueRef isnan
, max
;
207 max
= lp_build_intrinsic_binary_anylength(bld
->gallivm
, intrinsic
,
210 if (nan_behavior
== GALLIVM_NAN_RETURN_OTHER
) {
211 isnan
= lp_build_isnan(bld
, b
);
212 return lp_build_select(bld
, isnan
, a
, max
);
214 assert(nan_behavior
== GALLIVM_NAN_RETURN_NAN
);
215 isnan
= lp_build_isnan(bld
, a
);
216 return lp_build_select(bld
, isnan
, a
, max
);
219 return lp_build_intrinsic_binary_anylength(bld
->gallivm
, intrinsic
,
226 switch (nan_behavior
) {
227 case GALLIVM_NAN_RETURN_NAN
: {
228 LLVMValueRef isnan
= lp_build_isnan(bld
, b
);
229 cond
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, a
, b
);
230 cond
= LLVMBuildXor(bld
->gallivm
->builder
, cond
, isnan
, "");
231 return lp_build_select(bld
, cond
, a
, b
);
234 case GALLIVM_NAN_RETURN_OTHER
: {
235 LLVMValueRef isnan
= lp_build_isnan(bld
, a
);
236 cond
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, a
, b
);
237 cond
= LLVMBuildXor(bld
->gallivm
->builder
, cond
, isnan
, "");
238 return lp_build_select(bld
, cond
, a
, b
);
241 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN
:
242 cond
= lp_build_cmp_ordered(bld
, PIPE_FUNC_LESS
, a
, b
);
243 return lp_build_select(bld
, cond
, a
, b
);
244 case GALLIVM_NAN_BEHAVIOR_UNDEFINED
:
245 cond
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, a
, b
);
246 return lp_build_select(bld
, cond
, a
, b
);
250 cond
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, a
, b
);
251 return lp_build_select(bld
, cond
, a
, b
);
254 cond
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, a
, b
);
255 return lp_build_select(bld
, cond
, a
, b
);
262 * No checks for special case values of a or b = 1 or 0 are done.
263 * NaN's are handled according to the behavior specified by the
264 * nan_behavior argument.
267 lp_build_max_simple(struct lp_build_context
*bld
,
270 enum gallivm_nan_behavior nan_behavior
)
272 const struct lp_type type
= bld
->type
;
273 const char *intrinsic
= NULL
;
274 unsigned intr_size
= 0;
277 assert(lp_check_value(type
, a
));
278 assert(lp_check_value(type
, b
));
280 /* TODO: optimize the constant case */
282 if (type
.floating
&& util_cpu_caps
.has_sse
) {
283 if (type
.width
== 32) {
284 if (type
.length
== 1) {
285 intrinsic
= "llvm.x86.sse.max.ss";
288 else if (type
.length
<= 4 || !util_cpu_caps
.has_avx
) {
289 intrinsic
= "llvm.x86.sse.max.ps";
293 intrinsic
= "llvm.x86.avx.max.ps.256";
297 if (type
.width
== 64 && util_cpu_caps
.has_sse2
) {
298 if (type
.length
== 1) {
299 intrinsic
= "llvm.x86.sse2.max.sd";
302 else if (type
.length
== 2 || !util_cpu_caps
.has_avx
) {
303 intrinsic
= "llvm.x86.sse2.max.pd";
307 intrinsic
= "llvm.x86.avx.max.pd.256";
312 else if (type
.floating
&& util_cpu_caps
.has_altivec
) {
313 if (nan_behavior
== GALLIVM_NAN_RETURN_NAN
) {
314 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
317 if (type
.width
== 32 || type
.length
== 4) {
318 intrinsic
= "llvm.ppc.altivec.vmaxfp";
321 } else if (util_cpu_caps
.has_sse2
&& type
.length
>= 2) {
323 if ((type
.width
== 8 || type
.width
== 16) &&
324 (type
.width
* type
.length
<= 64) &&
325 (gallivm_debug
& GALLIVM_DEBUG_PERF
)) {
326 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
329 if (type
.width
== 8 && !type
.sign
) {
330 intrinsic
= "llvm.x86.sse2.pmaxu.b";
333 else if (type
.width
== 16 && type
.sign
) {
334 intrinsic
= "llvm.x86.sse2.pmaxs.w";
336 if (util_cpu_caps
.has_sse4_1
) {
337 if (type
.width
== 8 && type
.sign
) {
338 intrinsic
= "llvm.x86.sse41.pmaxsb";
340 if (type
.width
== 16 && !type
.sign
) {
341 intrinsic
= "llvm.x86.sse41.pmaxuw";
343 if (type
.width
== 32 && !type
.sign
) {
344 intrinsic
= "llvm.x86.sse41.pmaxud";
346 if (type
.width
== 32 && type
.sign
) {
347 intrinsic
= "llvm.x86.sse41.pmaxsd";
350 } else if (util_cpu_caps
.has_altivec
) {
352 if (type
.width
== 8) {
354 intrinsic
= "llvm.ppc.altivec.vmaxub";
356 intrinsic
= "llvm.ppc.altivec.vmaxsb";
358 } else if (type
.width
== 16) {
360 intrinsic
= "llvm.ppc.altivec.vmaxuh";
362 intrinsic
= "llvm.ppc.altivec.vmaxsh";
364 } else if (type
.width
== 32) {
366 intrinsic
= "llvm.ppc.altivec.vmaxuw";
368 intrinsic
= "llvm.ppc.altivec.vmaxsw";
374 if (util_cpu_caps
.has_sse
&& type
.floating
&&
375 nan_behavior
!= GALLIVM_NAN_BEHAVIOR_UNDEFINED
&&
376 nan_behavior
!= GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN
) {
377 LLVMValueRef isnan
, min
;
378 min
= lp_build_intrinsic_binary_anylength(bld
->gallivm
, intrinsic
,
381 if (nan_behavior
== GALLIVM_NAN_RETURN_OTHER
) {
382 isnan
= lp_build_isnan(bld
, b
);
383 return lp_build_select(bld
, isnan
, a
, min
);
385 assert(nan_behavior
== GALLIVM_NAN_RETURN_NAN
);
386 isnan
= lp_build_isnan(bld
, a
);
387 return lp_build_select(bld
, isnan
, a
, min
);
390 return lp_build_intrinsic_binary_anylength(bld
->gallivm
, intrinsic
,
397 switch (nan_behavior
) {
398 case GALLIVM_NAN_RETURN_NAN
: {
399 LLVMValueRef isnan
= lp_build_isnan(bld
, b
);
400 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, b
);
401 cond
= LLVMBuildXor(bld
->gallivm
->builder
, cond
, isnan
, "");
402 return lp_build_select(bld
, cond
, a
, b
);
405 case GALLIVM_NAN_RETURN_OTHER
: {
406 LLVMValueRef isnan
= lp_build_isnan(bld
, a
);
407 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, b
);
408 cond
= LLVMBuildXor(bld
->gallivm
->builder
, cond
, isnan
, "");
409 return lp_build_select(bld
, cond
, a
, b
);
412 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN
:
413 cond
= lp_build_cmp_ordered(bld
, PIPE_FUNC_GREATER
, a
, b
);
414 return lp_build_select(bld
, cond
, a
, b
);
415 case GALLIVM_NAN_BEHAVIOR_UNDEFINED
:
416 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, b
);
417 return lp_build_select(bld
, cond
, a
, b
);
421 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, b
);
422 return lp_build_select(bld
, cond
, a
, b
);
425 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, b
);
426 return lp_build_select(bld
, cond
, a
, b
);
432 * Generate 1 - a, or ~a depending on bld->type.
435 lp_build_comp(struct lp_build_context
*bld
,
438 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
439 const struct lp_type type
= bld
->type
;
441 assert(lp_check_value(type
, a
));
448 if(type
.norm
&& !type
.floating
&& !type
.fixed
&& !type
.sign
) {
449 if(LLVMIsConstant(a
))
450 return LLVMConstNot(a
);
452 return LLVMBuildNot(builder
, a
, "");
455 if(LLVMIsConstant(a
))
457 return LLVMConstFSub(bld
->one
, a
);
459 return LLVMConstSub(bld
->one
, a
);
462 return LLVMBuildFSub(builder
, bld
->one
, a
, "");
464 return LLVMBuildSub(builder
, bld
->one
, a
, "");
472 lp_build_add(struct lp_build_context
*bld
,
476 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
477 const struct lp_type type
= bld
->type
;
480 assert(lp_check_value(type
, a
));
481 assert(lp_check_value(type
, b
));
487 if(a
== bld
->undef
|| b
== bld
->undef
)
491 const char *intrinsic
= NULL
;
493 if(a
== bld
->one
|| b
== bld
->one
)
496 if (type
.width
* type
.length
== 128 &&
497 !type
.floating
&& !type
.fixed
) {
498 if(util_cpu_caps
.has_sse2
) {
500 intrinsic
= type
.sign
? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
502 intrinsic
= type
.sign
? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
503 } else if (util_cpu_caps
.has_altivec
) {
505 intrinsic
= type
.sign
? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
507 intrinsic
= type
.sign
? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
512 return lp_build_intrinsic_binary(builder
, intrinsic
, lp_build_vec_type(bld
->gallivm
, bld
->type
), a
, b
);
515 /* TODO: handle signed case */
516 if(type
.norm
&& !type
.floating
&& !type
.fixed
&& !type
.sign
)
517 a
= lp_build_min_simple(bld
, a
, lp_build_comp(bld
, b
), GALLIVM_NAN_BEHAVIOR_UNDEFINED
);
519 if(LLVMIsConstant(a
) && LLVMIsConstant(b
))
521 res
= LLVMConstFAdd(a
, b
);
523 res
= LLVMConstAdd(a
, b
);
526 res
= LLVMBuildFAdd(builder
, a
, b
, "");
528 res
= LLVMBuildAdd(builder
, a
, b
, "");
530 /* clamp to ceiling of 1.0 */
531 if(bld
->type
.norm
&& (bld
->type
.floating
|| bld
->type
.fixed
))
532 res
= lp_build_min_simple(bld
, res
, bld
->one
, GALLIVM_NAN_BEHAVIOR_UNDEFINED
);
534 /* XXX clamp to floor of -1 or 0??? */
540 /** Return the scalar sum of the elements of a.
541 * Should avoid this operation whenever possible.
544 lp_build_horizontal_add(struct lp_build_context
*bld
,
547 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
548 const struct lp_type type
= bld
->type
;
549 LLVMValueRef index
, res
;
551 LLVMValueRef shuffles1
[LP_MAX_VECTOR_LENGTH
/ 2];
552 LLVMValueRef shuffles2
[LP_MAX_VECTOR_LENGTH
/ 2];
553 LLVMValueRef vecres
, elem2
;
555 assert(lp_check_value(type
, a
));
557 if (type
.length
== 1) {
561 assert(!bld
->type
.norm
);
564 * for byte vectors can do much better with psadbw.
565 * Using repeated shuffle/adds here. Note with multiple vectors
566 * this can be done more efficiently as outlined in the intel
567 * optimization manual.
568 * Note: could cause data rearrangement if used with smaller element
573 length
= type
.length
/ 2;
575 LLVMValueRef vec1
, vec2
;
576 for (i
= 0; i
< length
; i
++) {
577 shuffles1
[i
] = lp_build_const_int32(bld
->gallivm
, i
);
578 shuffles2
[i
] = lp_build_const_int32(bld
->gallivm
, i
+ length
);
580 vec1
= LLVMBuildShuffleVector(builder
, vecres
, vecres
,
581 LLVMConstVector(shuffles1
, length
), "");
582 vec2
= LLVMBuildShuffleVector(builder
, vecres
, vecres
,
583 LLVMConstVector(shuffles2
, length
), "");
585 vecres
= LLVMBuildFAdd(builder
, vec1
, vec2
, "");
588 vecres
= LLVMBuildAdd(builder
, vec1
, vec2
, "");
590 length
= length
>> 1;
593 /* always have vector of size 2 here */
596 index
= lp_build_const_int32(bld
->gallivm
, 0);
597 res
= LLVMBuildExtractElement(builder
, vecres
, index
, "");
598 index
= lp_build_const_int32(bld
->gallivm
, 1);
599 elem2
= LLVMBuildExtractElement(builder
, vecres
, index
, "");
602 res
= LLVMBuildFAdd(builder
, res
, elem2
, "");
604 res
= LLVMBuildAdd(builder
, res
, elem2
, "");
610 * Return the horizontal sums of 4 float vectors as a float4 vector.
611 * This uses the technique as outlined in Intel Optimization Manual.
614 lp_build_horizontal_add4x4f(struct lp_build_context
*bld
,
617 struct gallivm_state
*gallivm
= bld
->gallivm
;
618 LLVMBuilderRef builder
= gallivm
->builder
;
619 LLVMValueRef shuffles
[4];
621 LLVMValueRef sumtmp
[2], shuftmp
[2];
623 /* lower half of regs */
624 shuffles
[0] = lp_build_const_int32(gallivm
, 0);
625 shuffles
[1] = lp_build_const_int32(gallivm
, 1);
626 shuffles
[2] = lp_build_const_int32(gallivm
, 4);
627 shuffles
[3] = lp_build_const_int32(gallivm
, 5);
628 tmp
[0] = LLVMBuildShuffleVector(builder
, src
[0], src
[1],
629 LLVMConstVector(shuffles
, 4), "");
630 tmp
[2] = LLVMBuildShuffleVector(builder
, src
[2], src
[3],
631 LLVMConstVector(shuffles
, 4), "");
633 /* upper half of regs */
634 shuffles
[0] = lp_build_const_int32(gallivm
, 2);
635 shuffles
[1] = lp_build_const_int32(gallivm
, 3);
636 shuffles
[2] = lp_build_const_int32(gallivm
, 6);
637 shuffles
[3] = lp_build_const_int32(gallivm
, 7);
638 tmp
[1] = LLVMBuildShuffleVector(builder
, src
[0], src
[1],
639 LLVMConstVector(shuffles
, 4), "");
640 tmp
[3] = LLVMBuildShuffleVector(builder
, src
[2], src
[3],
641 LLVMConstVector(shuffles
, 4), "");
643 sumtmp
[0] = LLVMBuildFAdd(builder
, tmp
[0], tmp
[1], "");
644 sumtmp
[1] = LLVMBuildFAdd(builder
, tmp
[2], tmp
[3], "");
646 shuffles
[0] = lp_build_const_int32(gallivm
, 0);
647 shuffles
[1] = lp_build_const_int32(gallivm
, 2);
648 shuffles
[2] = lp_build_const_int32(gallivm
, 4);
649 shuffles
[3] = lp_build_const_int32(gallivm
, 6);
650 shuftmp
[0] = LLVMBuildShuffleVector(builder
, sumtmp
[0], sumtmp
[1],
651 LLVMConstVector(shuffles
, 4), "");
653 shuffles
[0] = lp_build_const_int32(gallivm
, 1);
654 shuffles
[1] = lp_build_const_int32(gallivm
, 3);
655 shuffles
[2] = lp_build_const_int32(gallivm
, 5);
656 shuffles
[3] = lp_build_const_int32(gallivm
, 7);
657 shuftmp
[1] = LLVMBuildShuffleVector(builder
, sumtmp
[0], sumtmp
[1],
658 LLVMConstVector(shuffles
, 4), "");
660 return LLVMBuildFAdd(builder
, shuftmp
[0], shuftmp
[1], "");
665 * partially horizontally add 2-4 float vectors with length nx4,
666 * i.e. only four adjacent values in each vector will be added,
667 * assuming values are really grouped in 4 which also determines
670 * Return a vector of the same length as the initial vectors,
671 * with the excess elements (if any) being undefined.
672 * The element order is independent of number of input vectors.
673 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
674 * the output order thus will be
675 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4z7,undef
678 lp_build_hadd_partial4(struct lp_build_context
*bld
,
679 LLVMValueRef vectors
[],
682 struct gallivm_state
*gallivm
= bld
->gallivm
;
683 LLVMBuilderRef builder
= gallivm
->builder
;
684 LLVMValueRef ret_vec
;
686 const char *intrinsic
= NULL
;
688 assert(num_vecs
>= 2 && num_vecs
<= 4);
689 assert(bld
->type
.floating
);
691 /* only use this with at least 2 vectors, as it is sort of expensive
692 * (depending on cpu) and we always need two horizontal adds anyway,
693 * so a shuffle/add approach might be better.
699 tmp
[2] = num_vecs
> 2 ? vectors
[2] : vectors
[0];
700 tmp
[3] = num_vecs
> 3 ? vectors
[3] : vectors
[0];
702 if (util_cpu_caps
.has_sse3
&& bld
->type
.width
== 32 &&
703 bld
->type
.length
== 4) {
704 intrinsic
= "llvm.x86.sse3.hadd.ps";
706 else if (util_cpu_caps
.has_avx
&& bld
->type
.width
== 32 &&
707 bld
->type
.length
== 8) {
708 intrinsic
= "llvm.x86.avx.hadd.ps.256";
711 tmp
[0] = lp_build_intrinsic_binary(builder
, intrinsic
,
712 lp_build_vec_type(gallivm
, bld
->type
),
715 tmp
[1] = lp_build_intrinsic_binary(builder
, intrinsic
,
716 lp_build_vec_type(gallivm
, bld
->type
),
722 return lp_build_intrinsic_binary(builder
, intrinsic
,
723 lp_build_vec_type(gallivm
, bld
->type
),
727 if (bld
->type
.length
== 4) {
728 ret_vec
= lp_build_horizontal_add4x4f(bld
, tmp
);
731 LLVMValueRef partres
[LP_MAX_VECTOR_LENGTH
/4];
733 unsigned num_iter
= bld
->type
.length
/ 4;
734 struct lp_type parttype
= bld
->type
;
736 for (j
= 0; j
< num_iter
; j
++) {
737 LLVMValueRef partsrc
[4];
739 for (i
= 0; i
< 4; i
++) {
740 partsrc
[i
] = lp_build_extract_range(gallivm
, tmp
[i
], j
*4, 4);
742 partres
[j
] = lp_build_horizontal_add4x4f(bld
, partsrc
);
744 ret_vec
= lp_build_concat(gallivm
, partres
, parttype
, num_iter
);
753 lp_build_sub(struct lp_build_context
*bld
,
757 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
758 const struct lp_type type
= bld
->type
;
761 assert(lp_check_value(type
, a
));
762 assert(lp_check_value(type
, b
));
766 if(a
== bld
->undef
|| b
== bld
->undef
)
772 const char *intrinsic
= NULL
;
777 if (type
.width
* type
.length
== 128 &&
778 !type
.floating
&& !type
.fixed
) {
779 if (util_cpu_caps
.has_sse2
) {
781 intrinsic
= type
.sign
? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
783 intrinsic
= type
.sign
? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
784 } else if (util_cpu_caps
.has_altivec
) {
786 intrinsic
= type
.sign
? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
788 intrinsic
= type
.sign
? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
793 return lp_build_intrinsic_binary(builder
, intrinsic
, lp_build_vec_type(bld
->gallivm
, bld
->type
), a
, b
);
796 /* TODO: handle signed case */
797 if(type
.norm
&& !type
.floating
&& !type
.fixed
&& !type
.sign
)
798 a
= lp_build_max_simple(bld
, a
, b
, GALLIVM_NAN_BEHAVIOR_UNDEFINED
);
800 if(LLVMIsConstant(a
) && LLVMIsConstant(b
))
802 res
= LLVMConstFSub(a
, b
);
804 res
= LLVMConstSub(a
, b
);
807 res
= LLVMBuildFSub(builder
, a
, b
, "");
809 res
= LLVMBuildSub(builder
, a
, b
, "");
811 if(bld
->type
.norm
&& (bld
->type
.floating
|| bld
->type
.fixed
))
812 res
= lp_build_max_simple(bld
, res
, bld
->zero
, GALLIVM_NAN_BEHAVIOR_UNDEFINED
);
820 * Normalized multiplication.
822 * There are several approaches for (using 8-bit normalized multiplication as
827 * makes the following approximation to the division (Sree)
829 * a*b/255 ~= (a*(b + 1)) >> 8
831 * which is the fastest method that satisfies the following OpenGL criteria of
833 * 0*0 = 0 and 255*255 = 255
837 * takes the geometric series approximation to the division
839 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
841 * in this case just the first two terms to fit in 16bit arithmetic
843 * t/255 ~= (t + (t >> 8)) >> 8
845 * note that just by itself it doesn't satisfy the OpenGL criteria, as
846 * 255*255 = 254, so the special case b = 255 must be accounted for or roundoff
849 * - geometric series plus rounding
851 * when using a geometric series division instead of truncating the result
852 * use roundoff in the approximation (Jim Blinn)
854 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
856 * achieving the exact results.
860 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
861 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
862 * @sa Michael Herf, The "double blend trick", May 2000,
863 * http://www.stereopsis.com/doubleblend.html
866 lp_build_mul_norm(struct gallivm_state
*gallivm
,
867 struct lp_type wide_type
,
868 LLVMValueRef a
, LLVMValueRef b
)
870 LLVMBuilderRef builder
= gallivm
->builder
;
871 struct lp_build_context bld
;
876 assert(!wide_type
.floating
);
877 assert(lp_check_value(wide_type
, a
));
878 assert(lp_check_value(wide_type
, b
));
880 lp_build_context_init(&bld
, gallivm
, wide_type
);
882 n
= wide_type
.width
/ 2;
883 if (wide_type
.sign
) {
888 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
889 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
893 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
896 ab
= LLVMBuildMul(builder
, a
, b
, "");
897 ab
= LLVMBuildAdd(builder
, ab
, lp_build_shr_imm(&bld
, ab
, n
), "");
900 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
903 half
= lp_build_const_int_vec(gallivm
, wide_type
, 1 << (n
- 1));
904 if (wide_type
.sign
) {
905 LLVMValueRef minus_half
= LLVMBuildNeg(builder
, half
, "");
906 LLVMValueRef sign
= lp_build_shr_imm(&bld
, ab
, wide_type
.width
- 1);
907 half
= lp_build_select(&bld
, sign
, minus_half
, half
);
909 ab
= LLVMBuildAdd(builder
, ab
, half
, "");
912 ab
= lp_build_shr_imm(&bld
, ab
, n
);
921 lp_build_mul(struct lp_build_context
*bld
,
925 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
926 const struct lp_type type
= bld
->type
;
930 assert(lp_check_value(type
, a
));
931 assert(lp_check_value(type
, b
));
941 if(a
== bld
->undef
|| b
== bld
->undef
)
944 if (!type
.floating
&& !type
.fixed
&& type
.norm
) {
945 struct lp_type wide_type
= lp_wider_type(type
);
946 LLVMValueRef al
, ah
, bl
, bh
, abl
, abh
, ab
;
948 lp_build_unpack2(bld
->gallivm
, type
, wide_type
, a
, &al
, &ah
);
949 lp_build_unpack2(bld
->gallivm
, type
, wide_type
, b
, &bl
, &bh
);
951 /* PMULLW, PSRLW, PADDW */
952 abl
= lp_build_mul_norm(bld
->gallivm
, wide_type
, al
, bl
);
953 abh
= lp_build_mul_norm(bld
->gallivm
, wide_type
, ah
, bh
);
955 ab
= lp_build_pack2(bld
->gallivm
, wide_type
, type
, abl
, abh
);
961 shift
= lp_build_const_int_vec(bld
->gallivm
, type
, type
.width
/2);
965 if(LLVMIsConstant(a
) && LLVMIsConstant(b
)) {
967 res
= LLVMConstFMul(a
, b
);
969 res
= LLVMConstMul(a
, b
);
972 res
= LLVMConstAShr(res
, shift
);
974 res
= LLVMConstLShr(res
, shift
);
979 res
= LLVMBuildFMul(builder
, a
, b
, "");
981 res
= LLVMBuildMul(builder
, a
, b
, "");
984 res
= LLVMBuildAShr(builder
, res
, shift
, "");
986 res
= LLVMBuildLShr(builder
, res
, shift
, "");
995 * Small vector x scale multiplication optimization.
998 lp_build_mul_imm(struct lp_build_context
*bld
,
1002 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1003 LLVMValueRef factor
;
1005 assert(lp_check_value(bld
->type
, a
));
1014 return lp_build_negate(bld
, a
);
1016 if(b
== 2 && bld
->type
.floating
)
1017 return lp_build_add(bld
, a
, a
);
1019 if(util_is_power_of_two(b
)) {
1020 unsigned shift
= ffs(b
) - 1;
1022 if(bld
->type
.floating
) {
1025 * Power of two multiplication by directly manipulating the exponent.
1027 * XXX: This might not be always faster, it will introduce a small error
1028 * for multiplication by zero, and it will produce wrong results
1031 unsigned mantissa
= lp_mantissa(bld
->type
);
1032 factor
= lp_build_const_int_vec(bld
->gallivm
, bld
->type
, (unsigned long long)shift
<< mantissa
);
1033 a
= LLVMBuildBitCast(builder
, a
, lp_build_int_vec_type(bld
->type
), "");
1034 a
= LLVMBuildAdd(builder
, a
, factor
, "");
1035 a
= LLVMBuildBitCast(builder
, a
, lp_build_vec_type(bld
->gallivm
, bld
->type
), "");
1040 factor
= lp_build_const_vec(bld
->gallivm
, bld
->type
, shift
);
1041 return LLVMBuildShl(builder
, a
, factor
, "");
1045 factor
= lp_build_const_vec(bld
->gallivm
, bld
->type
, (double)b
);
1046 return lp_build_mul(bld
, a
, factor
);
1054 lp_build_div(struct lp_build_context
*bld
,
1058 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1059 const struct lp_type type
= bld
->type
;
1061 assert(lp_check_value(type
, a
));
1062 assert(lp_check_value(type
, b
));
1067 return lp_build_rcp(bld
, b
);
1072 if(a
== bld
->undef
|| b
== bld
->undef
)
1075 if(LLVMIsConstant(a
) && LLVMIsConstant(b
)) {
1077 return LLVMConstFDiv(a
, b
);
1079 return LLVMConstSDiv(a
, b
);
1081 return LLVMConstUDiv(a
, b
);
1084 if(((util_cpu_caps
.has_sse
&& type
.width
== 32 && type
.length
== 4) ||
1085 (util_cpu_caps
.has_avx
&& type
.width
== 32 && type
.length
== 8)) &&
1087 return lp_build_mul(bld
, a
, lp_build_rcp(bld
, b
));
1090 return LLVMBuildFDiv(builder
, a
, b
, "");
1092 return LLVMBuildSDiv(builder
, a
, b
, "");
1094 return LLVMBuildUDiv(builder
, a
, b
, "");
1099 * Linear interpolation helper.
1101 * @param normalized whether we are interpolating normalized values,
1102 * encoded in normalized integers, twice as wide.
1104 * @sa http://www.stereopsis.com/doubleblend.html
1106 static INLINE LLVMValueRef
1107 lp_build_lerp_simple(struct lp_build_context
*bld
,
1113 unsigned half_width
= bld
->type
.width
/2;
1114 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1118 assert(lp_check_value(bld
->type
, x
));
1119 assert(lp_check_value(bld
->type
, v0
));
1120 assert(lp_check_value(bld
->type
, v1
));
1122 delta
= lp_build_sub(bld
, v1
, v0
);
1124 if (flags
& LP_BLD_LERP_WIDE_NORMALIZED
) {
1125 if (!bld
->type
.sign
) {
1126 if (!(flags
& LP_BLD_LERP_PRESCALED_WEIGHTS
)) {
1128 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1129 * most-significant-bit to the lowest-significant-bit, so that
1130 * later we can just divide by 2**n instead of 2**n - 1.
1133 x
= lp_build_add(bld
, x
, lp_build_shr_imm(bld
, x
, half_width
- 1));
1136 /* (x * delta) >> n */
1137 res
= lp_build_mul(bld
, x
, delta
);
1138 res
= lp_build_shr_imm(bld
, res
, half_width
);
1141 * The rescaling trick above doesn't work for signed numbers, so
1142 * use the 2**n - 1 divison approximation in lp_build_mul_norm
1145 assert(!(flags
& LP_BLD_LERP_PRESCALED_WEIGHTS
));
1146 res
= lp_build_mul_norm(bld
->gallivm
, bld
->type
, x
, delta
);
1149 assert(!(flags
& LP_BLD_LERP_PRESCALED_WEIGHTS
));
1150 res
= lp_build_mul(bld
, x
, delta
);
1153 res
= lp_build_add(bld
, v0
, res
);
1155 if (((flags
& LP_BLD_LERP_WIDE_NORMALIZED
) && !bld
->type
.sign
) ||
1157 /* We need to mask out the high order bits when lerping 8bit normalized colors stored on 16bits */
1158 /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
1159 * but it will be wrong for true fixed point use cases. Basically we need
1160 * a more powerful lp_type, capable of further distinguishing the values
1161 * interpretation from the value storage. */
1162 res
= LLVMBuildAnd(builder
, res
, lp_build_const_int_vec(bld
->gallivm
, bld
->type
, (1 << half_width
) - 1), "");
1170 * Linear interpolation.
1173 lp_build_lerp(struct lp_build_context
*bld
,
1179 const struct lp_type type
= bld
->type
;
1182 assert(lp_check_value(type
, x
));
1183 assert(lp_check_value(type
, v0
));
1184 assert(lp_check_value(type
, v1
));
1186 assert(!(flags
& LP_BLD_LERP_WIDE_NORMALIZED
));
1189 struct lp_type wide_type
;
1190 struct lp_build_context wide_bld
;
1191 LLVMValueRef xl
, xh
, v0l
, v0h
, v1l
, v1h
, resl
, resh
;
1193 assert(type
.length
>= 2);
1196 * Create a wider integer type, enough to hold the
1197 * intermediate result of the multiplication.
1199 memset(&wide_type
, 0, sizeof wide_type
);
1200 wide_type
.sign
= type
.sign
;
1201 wide_type
.width
= type
.width
*2;
1202 wide_type
.length
= type
.length
/2;
1204 lp_build_context_init(&wide_bld
, bld
->gallivm
, wide_type
);
1206 lp_build_unpack2(bld
->gallivm
, type
, wide_type
, x
, &xl
, &xh
);
1207 lp_build_unpack2(bld
->gallivm
, type
, wide_type
, v0
, &v0l
, &v0h
);
1208 lp_build_unpack2(bld
->gallivm
, type
, wide_type
, v1
, &v1l
, &v1h
);
1214 flags
|= LP_BLD_LERP_WIDE_NORMALIZED
;
1216 resl
= lp_build_lerp_simple(&wide_bld
, xl
, v0l
, v1l
, flags
);
1217 resh
= lp_build_lerp_simple(&wide_bld
, xh
, v0h
, v1h
, flags
);
1219 res
= lp_build_pack2(bld
->gallivm
, wide_type
, type
, resl
, resh
);
1221 res
= lp_build_lerp_simple(bld
, x
, v0
, v1
, flags
);
1229 * Bilinear interpolation.
1231 * Values indices are in v_{yx}.
1234 lp_build_lerp_2d(struct lp_build_context
*bld
,
1243 LLVMValueRef v0
= lp_build_lerp(bld
, x
, v00
, v01
, flags
);
1244 LLVMValueRef v1
= lp_build_lerp(bld
, x
, v10
, v11
, flags
);
1245 return lp_build_lerp(bld
, y
, v0
, v1
, flags
);
1250 lp_build_lerp_3d(struct lp_build_context
*bld
,
1264 LLVMValueRef v0
= lp_build_lerp_2d(bld
, x
, y
, v000
, v001
, v010
, v011
, flags
);
1265 LLVMValueRef v1
= lp_build_lerp_2d(bld
, x
, y
, v100
, v101
, v110
, v111
, flags
);
1266 return lp_build_lerp(bld
, z
, v0
, v1
, flags
);
1271 * Generate min(a, b)
1272 * Do checks for special cases but not for nans.
1275 lp_build_min(struct lp_build_context
*bld
,
1279 assert(lp_check_value(bld
->type
, a
));
1280 assert(lp_check_value(bld
->type
, b
));
1282 if(a
== bld
->undef
|| b
== bld
->undef
)
1288 if (bld
->type
.norm
) {
1289 if (!bld
->type
.sign
) {
1290 if (a
== bld
->zero
|| b
== bld
->zero
) {
1300 return lp_build_min_simple(bld
, a
, b
, GALLIVM_NAN_BEHAVIOR_UNDEFINED
);
1305 * Generate min(a, b)
1306 * NaN's are handled according to the behavior specified by the
1307 * nan_behavior argument.
1310 lp_build_min_ext(struct lp_build_context
*bld
,
1313 enum gallivm_nan_behavior nan_behavior
)
1315 assert(lp_check_value(bld
->type
, a
));
1316 assert(lp_check_value(bld
->type
, b
));
1318 if(a
== bld
->undef
|| b
== bld
->undef
)
1324 if (bld
->type
.norm
) {
1325 if (!bld
->type
.sign
) {
1326 if (a
== bld
->zero
|| b
== bld
->zero
) {
1336 return lp_build_min_simple(bld
, a
, b
, nan_behavior
);
1340 * Generate max(a, b)
1341 * Do checks for special cases, but NaN behavior is undefined.
1344 lp_build_max(struct lp_build_context
*bld
,
1348 assert(lp_check_value(bld
->type
, a
));
1349 assert(lp_check_value(bld
->type
, b
));
1351 if(a
== bld
->undef
|| b
== bld
->undef
)
1357 if(bld
->type
.norm
) {
1358 if(a
== bld
->one
|| b
== bld
->one
)
1360 if (!bld
->type
.sign
) {
1361 if (a
== bld
->zero
) {
1364 if (b
== bld
->zero
) {
1370 return lp_build_max_simple(bld
, a
, b
, GALLIVM_NAN_BEHAVIOR_UNDEFINED
);
1375 * Generate max(a, b)
1376 * Checks for special cases.
1377 * NaN's are handled according to the behavior specified by the
1378 * nan_behavior argument.
1381 lp_build_max_ext(struct lp_build_context
*bld
,
1384 enum gallivm_nan_behavior nan_behavior
)
1386 assert(lp_check_value(bld
->type
, a
));
1387 assert(lp_check_value(bld
->type
, b
));
1389 if(a
== bld
->undef
|| b
== bld
->undef
)
1395 if(bld
->type
.norm
) {
1396 if(a
== bld
->one
|| b
== bld
->one
)
1398 if (!bld
->type
.sign
) {
1399 if (a
== bld
->zero
) {
1402 if (b
== bld
->zero
) {
1408 return lp_build_max_simple(bld
, a
, b
, nan_behavior
);
1412 * Generate clamp(a, min, max)
1413 * NaN behavior (for any of a, min, max) is undefined.
1414 * Do checks for special cases.
1417 lp_build_clamp(struct lp_build_context
*bld
,
1422 assert(lp_check_value(bld
->type
, a
));
1423 assert(lp_check_value(bld
->type
, min
));
1424 assert(lp_check_value(bld
->type
, max
));
1426 a
= lp_build_min(bld
, a
, max
);
1427 a
= lp_build_max(bld
, a
, min
);
1433 * Generate clamp(a, 0, 1)
1434 * A NaN will get converted to zero.
1437 lp_build_clamp_zero_one_nanzero(struct lp_build_context
*bld
,
1440 a
= lp_build_max_ext(bld
, a
, bld
->zero
, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN
);
1441 a
= lp_build_min(bld
, a
, bld
->one
);
1450 lp_build_abs(struct lp_build_context
*bld
,
1453 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1454 const struct lp_type type
= bld
->type
;
1455 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
1457 assert(lp_check_value(type
, a
));
1463 /* Mask out the sign bit */
1464 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
1465 unsigned long long absMask
= ~(1ULL << (type
.width
- 1));
1466 LLVMValueRef mask
= lp_build_const_int_vec(bld
->gallivm
, type
, ((unsigned long long) absMask
));
1467 a
= LLVMBuildBitCast(builder
, a
, int_vec_type
, "");
1468 a
= LLVMBuildAnd(builder
, a
, mask
, "");
1469 a
= LLVMBuildBitCast(builder
, a
, vec_type
, "");
1473 if(type
.width
*type
.length
== 128 && util_cpu_caps
.has_ssse3
) {
1474 switch(type
.width
) {
1476 return lp_build_intrinsic_unary(builder
, "llvm.x86.ssse3.pabs.b.128", vec_type
, a
);
1478 return lp_build_intrinsic_unary(builder
, "llvm.x86.ssse3.pabs.w.128", vec_type
, a
);
1480 return lp_build_intrinsic_unary(builder
, "llvm.x86.ssse3.pabs.d.128", vec_type
, a
);
1483 else if (type
.width
*type
.length
== 256 && util_cpu_caps
.has_ssse3
&&
1484 (gallivm_debug
& GALLIVM_DEBUG_PERF
) &&
1485 (type
.width
== 8 || type
.width
== 16 || type
.width
== 32)) {
1486 debug_printf("%s: inefficient code, should split vectors manually\n",
1490 return lp_build_max(bld
, a
, LLVMBuildNeg(builder
, a
, ""));
1495 lp_build_negate(struct lp_build_context
*bld
,
1498 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1500 assert(lp_check_value(bld
->type
, a
));
1502 #if HAVE_LLVM >= 0x0207
1503 if (bld
->type
.floating
)
1504 a
= LLVMBuildFNeg(builder
, a
, "");
1507 a
= LLVMBuildNeg(builder
, a
, "");
1513 /** Return -1, 0 or +1 depending on the sign of a */
1515 lp_build_sgn(struct lp_build_context
*bld
,
1518 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1519 const struct lp_type type
= bld
->type
;
1523 assert(lp_check_value(type
, a
));
1525 /* Handle non-zero case */
1527 /* if not zero then sign must be positive */
1530 else if(type
.floating
) {
1531 LLVMTypeRef vec_type
;
1532 LLVMTypeRef int_type
;
1536 unsigned long long maskBit
= (unsigned long long)1 << (type
.width
- 1);
1538 int_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
1539 vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
1540 mask
= lp_build_const_int_vec(bld
->gallivm
, type
, maskBit
);
1542 /* Take the sign bit and add it to 1 constant */
1543 sign
= LLVMBuildBitCast(builder
, a
, int_type
, "");
1544 sign
= LLVMBuildAnd(builder
, sign
, mask
, "");
1545 one
= LLVMConstBitCast(bld
->one
, int_type
);
1546 res
= LLVMBuildOr(builder
, sign
, one
, "");
1547 res
= LLVMBuildBitCast(builder
, res
, vec_type
, "");
1551 /* signed int/norm/fixed point */
1552 /* could use psign with sse3 and appropriate vectors here */
1553 LLVMValueRef minus_one
= lp_build_const_vec(bld
->gallivm
, type
, -1.0);
1554 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, bld
->zero
);
1555 res
= lp_build_select(bld
, cond
, bld
->one
, minus_one
);
1559 cond
= lp_build_cmp(bld
, PIPE_FUNC_EQUAL
, a
, bld
->zero
);
1560 res
= lp_build_select(bld
, cond
, bld
->zero
, res
);
1567 * Set the sign of float vector 'a' according to 'sign'.
1568 * If sign==0, return abs(a).
1569 * If sign==1, return -abs(a);
1570 * Other values for sign produce undefined results.
1573 lp_build_set_sign(struct lp_build_context
*bld
,
1574 LLVMValueRef a
, LLVMValueRef sign
)
1576 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1577 const struct lp_type type
= bld
->type
;
1578 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
1579 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
1580 LLVMValueRef shift
= lp_build_const_int_vec(bld
->gallivm
, type
, type
.width
- 1);
1581 LLVMValueRef mask
= lp_build_const_int_vec(bld
->gallivm
, type
,
1582 ~((unsigned long long) 1 << (type
.width
- 1)));
1583 LLVMValueRef val
, res
;
1585 assert(type
.floating
);
1586 assert(lp_check_value(type
, a
));
1588 /* val = reinterpret_cast<int>(a) */
1589 val
= LLVMBuildBitCast(builder
, a
, int_vec_type
, "");
1590 /* val = val & mask */
1591 val
= LLVMBuildAnd(builder
, val
, mask
, "");
1592 /* sign = sign << shift */
1593 sign
= LLVMBuildShl(builder
, sign
, shift
, "");
1594 /* res = val | sign */
1595 res
= LLVMBuildOr(builder
, val
, sign
, "");
1596 /* res = reinterpret_cast<float>(res) */
1597 res
= LLVMBuildBitCast(builder
, res
, vec_type
, "");
1604 * Convert vector of (or scalar) int to vector of (or scalar) float.
1607 lp_build_int_to_float(struct lp_build_context
*bld
,
1610 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1611 const struct lp_type type
= bld
->type
;
1612 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
1614 assert(type
.floating
);
1616 return LLVMBuildSIToFP(builder
, a
, vec_type
, "");
1620 arch_rounding_available(const struct lp_type type
)
1622 if ((util_cpu_caps
.has_sse4_1
&&
1623 (type
.length
== 1 || type
.width
*type
.length
== 128)) ||
1624 (util_cpu_caps
.has_avx
&& type
.width
*type
.length
== 256))
1626 else if ((util_cpu_caps
.has_altivec
&&
1627 (type
.width
== 32 && type
.length
== 4)))
1633 enum lp_build_round_mode
1635 LP_BUILD_ROUND_NEAREST
= 0,
1636 LP_BUILD_ROUND_FLOOR
= 1,
1637 LP_BUILD_ROUND_CEIL
= 2,
1638 LP_BUILD_ROUND_TRUNCATE
= 3
1642 * Helper for SSE4.1's ROUNDxx instructions.
1644 * NOTE: In the SSE4.1's nearest mode, if two values are equally close, the
1645 * result is the even value. That is, rounding 2.5 will be 2.0, and not 3.0.
1647 static INLINE LLVMValueRef
1648 lp_build_round_sse41(struct lp_build_context
*bld
,
1650 enum lp_build_round_mode mode
)
1652 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1653 const struct lp_type type
= bld
->type
;
1654 LLVMTypeRef i32t
= LLVMInt32TypeInContext(bld
->gallivm
->context
);
1655 const char *intrinsic
;
1658 assert(type
.floating
);
1660 assert(lp_check_value(type
, a
));
1661 assert(util_cpu_caps
.has_sse4_1
);
1663 if (type
.length
== 1) {
1664 LLVMTypeRef vec_type
;
1666 LLVMValueRef args
[3];
1667 LLVMValueRef index0
= LLVMConstInt(i32t
, 0, 0);
1669 switch(type
.width
) {
1671 intrinsic
= "llvm.x86.sse41.round.ss";
1674 intrinsic
= "llvm.x86.sse41.round.sd";
1681 vec_type
= LLVMVectorType(bld
->elem_type
, 4);
1683 undef
= LLVMGetUndef(vec_type
);
1686 args
[1] = LLVMBuildInsertElement(builder
, undef
, a
, index0
, "");
1687 args
[2] = LLVMConstInt(i32t
, mode
, 0);
1689 res
= lp_build_intrinsic(builder
, intrinsic
,
1690 vec_type
, args
, Elements(args
));
1692 res
= LLVMBuildExtractElement(builder
, res
, index0
, "");
1695 if (type
.width
* type
.length
== 128) {
1696 switch(type
.width
) {
1698 intrinsic
= "llvm.x86.sse41.round.ps";
1701 intrinsic
= "llvm.x86.sse41.round.pd";
1709 assert(type
.width
* type
.length
== 256);
1710 assert(util_cpu_caps
.has_avx
);
1712 switch(type
.width
) {
1714 intrinsic
= "llvm.x86.avx.round.ps.256";
1717 intrinsic
= "llvm.x86.avx.round.pd.256";
1725 res
= lp_build_intrinsic_binary(builder
, intrinsic
,
1727 LLVMConstInt(i32t
, mode
, 0));
1734 static INLINE LLVMValueRef
1735 lp_build_iround_nearest_sse2(struct lp_build_context
*bld
,
1738 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1739 const struct lp_type type
= bld
->type
;
1740 LLVMTypeRef i32t
= LLVMInt32TypeInContext(bld
->gallivm
->context
);
1741 LLVMTypeRef ret_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
1742 const char *intrinsic
;
1745 assert(type
.floating
);
1746 /* using the double precision conversions is a bit more complicated */
1747 assert(type
.width
== 32);
1749 assert(lp_check_value(type
, a
));
1750 assert(util_cpu_caps
.has_sse2
);
1752 /* This is relying on MXCSR rounding mode, which should always be nearest. */
1753 if (type
.length
== 1) {
1754 LLVMTypeRef vec_type
;
1757 LLVMValueRef index0
= LLVMConstInt(i32t
, 0, 0);
1759 vec_type
= LLVMVectorType(bld
->elem_type
, 4);
1761 intrinsic
= "llvm.x86.sse.cvtss2si";
1763 undef
= LLVMGetUndef(vec_type
);
1765 arg
= LLVMBuildInsertElement(builder
, undef
, a
, index0
, "");
1767 res
= lp_build_intrinsic_unary(builder
, intrinsic
,
1771 if (type
.width
* type
.length
== 128) {
1772 intrinsic
= "llvm.x86.sse2.cvtps2dq";
1775 assert(type
.width
*type
.length
== 256);
1776 assert(util_cpu_caps
.has_avx
);
1778 intrinsic
= "llvm.x86.avx.cvt.ps2dq.256";
1780 res
= lp_build_intrinsic_unary(builder
, intrinsic
,
1790 static INLINE LLVMValueRef
1791 lp_build_round_altivec(struct lp_build_context
*bld
,
1793 enum lp_build_round_mode mode
)
1795 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1796 const struct lp_type type
= bld
->type
;
1797 const char *intrinsic
= NULL
;
1799 assert(type
.floating
);
1801 assert(lp_check_value(type
, a
));
1802 assert(util_cpu_caps
.has_altivec
);
1805 case LP_BUILD_ROUND_NEAREST
:
1806 intrinsic
= "llvm.ppc.altivec.vrfin";
1808 case LP_BUILD_ROUND_FLOOR
:
1809 intrinsic
= "llvm.ppc.altivec.vrfim";
1811 case LP_BUILD_ROUND_CEIL
:
1812 intrinsic
= "llvm.ppc.altivec.vrfip";
1814 case LP_BUILD_ROUND_TRUNCATE
:
1815 intrinsic
= "llvm.ppc.altivec.vrfiz";
1819 return lp_build_intrinsic_unary(builder
, intrinsic
, bld
->vec_type
, a
);
1822 static INLINE LLVMValueRef
1823 lp_build_round_arch(struct lp_build_context
*bld
,
1825 enum lp_build_round_mode mode
)
1827 if (util_cpu_caps
.has_sse4_1
)
1828 return lp_build_round_sse41(bld
, a
, mode
);
1829 else /* (util_cpu_caps.has_altivec) */
1830 return lp_build_round_altivec(bld
, a
, mode
);
1834 * Return the integer part of a float (vector) value (== round toward zero).
1835 * The returned value is a float (vector).
1836 * Ex: trunc(-1.5) = -1.0
1839 lp_build_trunc(struct lp_build_context
*bld
,
1842 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1843 const struct lp_type type
= bld
->type
;
1845 assert(type
.floating
);
1846 assert(lp_check_value(type
, a
));
1848 if (arch_rounding_available(type
)) {
1849 return lp_build_round_arch(bld
, a
, LP_BUILD_ROUND_TRUNCATE
);
1852 const struct lp_type type
= bld
->type
;
1853 struct lp_type inttype
;
1854 struct lp_build_context intbld
;
1855 LLVMValueRef cmpval
= lp_build_const_vec(bld
->gallivm
, type
, 2^24);
1856 LLVMValueRef trunc
, res
, anosign
, mask
;
1857 LLVMTypeRef int_vec_type
= bld
->int_vec_type
;
1858 LLVMTypeRef vec_type
= bld
->vec_type
;
1860 assert(type
.width
== 32); /* might want to handle doubles at some point */
1863 inttype
.floating
= 0;
1864 lp_build_context_init(&intbld
, bld
->gallivm
, inttype
);
1866 /* round by truncation */
1867 trunc
= LLVMBuildFPToSI(builder
, a
, int_vec_type
, "");
1868 res
= LLVMBuildSIToFP(builder
, trunc
, vec_type
, "floor.trunc");
1870 /* mask out sign bit */
1871 anosign
= lp_build_abs(bld
, a
);
1873 * mask out all values if anosign > 2^24
1874 * This should work both for large ints (all rounding is no-op for them
1875 * because such floats are always exact) as well as special cases like
1876 * NaNs, Infs (taking advantage of the fact they use max exponent).
1877 * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
1879 anosign
= LLVMBuildBitCast(builder
, anosign
, int_vec_type
, "");
1880 cmpval
= LLVMBuildBitCast(builder
, cmpval
, int_vec_type
, "");
1881 mask
= lp_build_cmp(&intbld
, PIPE_FUNC_GREATER
, anosign
, cmpval
);
1882 return lp_build_select(bld
, mask
, a
, res
);
1888 * Return float (vector) rounded to nearest integer (vector). The returned
1889 * value is a float (vector).
1890 * Ex: round(0.9) = 1.0
1891 * Ex: round(-1.5) = -2.0
1894 lp_build_round(struct lp_build_context
*bld
,
1897 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1898 const struct lp_type type
= bld
->type
;
1900 assert(type
.floating
);
1901 assert(lp_check_value(type
, a
));
1903 if (arch_rounding_available(type
)) {
1904 return lp_build_round_arch(bld
, a
, LP_BUILD_ROUND_NEAREST
);
1907 const struct lp_type type
= bld
->type
;
1908 struct lp_type inttype
;
1909 struct lp_build_context intbld
;
1910 LLVMValueRef cmpval
= lp_build_const_vec(bld
->gallivm
, type
, 2^24);
1911 LLVMValueRef res
, anosign
, mask
;
1912 LLVMTypeRef int_vec_type
= bld
->int_vec_type
;
1913 LLVMTypeRef vec_type
= bld
->vec_type
;
1915 assert(type
.width
== 32); /* might want to handle doubles at some point */
1918 inttype
.floating
= 0;
1919 lp_build_context_init(&intbld
, bld
->gallivm
, inttype
);
1921 res
= lp_build_iround(bld
, a
);
1922 res
= LLVMBuildSIToFP(builder
, res
, vec_type
, "");
1924 /* mask out sign bit */
1925 anosign
= lp_build_abs(bld
, a
);
1927 * mask out all values if anosign > 2^24
1928 * This should work both for large ints (all rounding is no-op for them
1929 * because such floats are always exact) as well as special cases like
1930 * NaNs, Infs (taking advantage of the fact they use max exponent).
1931 * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
1933 anosign
= LLVMBuildBitCast(builder
, anosign
, int_vec_type
, "");
1934 cmpval
= LLVMBuildBitCast(builder
, cmpval
, int_vec_type
, "");
1935 mask
= lp_build_cmp(&intbld
, PIPE_FUNC_GREATER
, anosign
, cmpval
);
1936 return lp_build_select(bld
, mask
, a
, res
);
1942 * Return floor of float (vector), result is a float (vector)
1943 * Ex: floor(1.1) = 1.0
1944 * Ex: floor(-1.1) = -2.0
1947 lp_build_floor(struct lp_build_context
*bld
,
1950 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1951 const struct lp_type type
= bld
->type
;
1953 assert(type
.floating
);
1954 assert(lp_check_value(type
, a
));
1956 if (arch_rounding_available(type
)) {
1957 return lp_build_round_arch(bld
, a
, LP_BUILD_ROUND_FLOOR
);
1960 const struct lp_type type
= bld
->type
;
1961 struct lp_type inttype
;
1962 struct lp_build_context intbld
;
1963 LLVMValueRef cmpval
= lp_build_const_vec(bld
->gallivm
, type
, 2^24);
1964 LLVMValueRef trunc
, res
, anosign
, mask
;
1965 LLVMTypeRef int_vec_type
= bld
->int_vec_type
;
1966 LLVMTypeRef vec_type
= bld
->vec_type
;
1968 assert(type
.width
== 32); /* might want to handle doubles at some point */
1971 inttype
.floating
= 0;
1972 lp_build_context_init(&intbld
, bld
->gallivm
, inttype
);
1974 /* round by truncation */
1975 trunc
= LLVMBuildFPToSI(builder
, a
, int_vec_type
, "");
1976 res
= LLVMBuildSIToFP(builder
, trunc
, vec_type
, "floor.trunc");
1982 * fix values if rounding is wrong (for non-special cases)
1983 * - this is the case if trunc > a
1985 mask
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, res
, a
);
1986 /* tmp = trunc > a ? 1.0 : 0.0 */
1987 tmp
= LLVMBuildBitCast(builder
, bld
->one
, int_vec_type
, "");
1988 tmp
= lp_build_and(&intbld
, mask
, tmp
);
1989 tmp
= LLVMBuildBitCast(builder
, tmp
, vec_type
, "");
1990 res
= lp_build_sub(bld
, res
, tmp
);
1993 /* mask out sign bit */
1994 anosign
= lp_build_abs(bld
, a
);
1996 * mask out all values if anosign > 2^24
1997 * This should work both for large ints (all rounding is no-op for them
1998 * because such floats are always exact) as well as special cases like
1999 * NaNs, Infs (taking advantage of the fact they use max exponent).
2000 * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
2002 anosign
= LLVMBuildBitCast(builder
, anosign
, int_vec_type
, "");
2003 cmpval
= LLVMBuildBitCast(builder
, cmpval
, int_vec_type
, "");
2004 mask
= lp_build_cmp(&intbld
, PIPE_FUNC_GREATER
, anosign
, cmpval
);
2005 return lp_build_select(bld
, mask
, a
, res
);
2011 * Return ceiling of float (vector), returning float (vector).
2012 * Ex: ceil( 1.1) = 2.0
2013 * Ex: ceil(-1.1) = -1.0
2016 lp_build_ceil(struct lp_build_context
*bld
,
2019 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2020 const struct lp_type type
= bld
->type
;
2022 assert(type
.floating
);
2023 assert(lp_check_value(type
, a
));
2025 if (arch_rounding_available(type
)) {
2026 return lp_build_round_arch(bld
, a
, LP_BUILD_ROUND_CEIL
);
2029 const struct lp_type type
= bld
->type
;
2030 struct lp_type inttype
;
2031 struct lp_build_context intbld
;
2032 LLVMValueRef cmpval
= lp_build_const_vec(bld
->gallivm
, type
, 2^24);
2033 LLVMValueRef trunc
, res
, anosign
, mask
, tmp
;
2034 LLVMTypeRef int_vec_type
= bld
->int_vec_type
;
2035 LLVMTypeRef vec_type
= bld
->vec_type
;
2037 assert(type
.width
== 32); /* might want to handle doubles at some point */
2040 inttype
.floating
= 0;
2041 lp_build_context_init(&intbld
, bld
->gallivm
, inttype
);
2043 /* round by truncation */
2044 trunc
= LLVMBuildFPToSI(builder
, a
, int_vec_type
, "");
2045 trunc
= LLVMBuildSIToFP(builder
, trunc
, vec_type
, "ceil.trunc");
2048 * fix values if rounding is wrong (for non-special cases)
2049 * - this is the case if trunc < a
2051 mask
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, trunc
, a
);
2052 /* tmp = trunc < a ? 1.0 : 0.0 */
2053 tmp
= LLVMBuildBitCast(builder
, bld
->one
, int_vec_type
, "");
2054 tmp
= lp_build_and(&intbld
, mask
, tmp
);
2055 tmp
= LLVMBuildBitCast(builder
, tmp
, vec_type
, "");
2056 res
= lp_build_add(bld
, trunc
, tmp
);
2058 /* mask out sign bit */
2059 anosign
= lp_build_abs(bld
, a
);
2061 * mask out all values if anosign > 2^24
2062 * This should work both for large ints (all rounding is no-op for them
2063 * because such floats are always exact) as well as special cases like
2064 * NaNs, Infs (taking advantage of the fact they use max exponent).
2065 * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
2067 anosign
= LLVMBuildBitCast(builder
, anosign
, int_vec_type
, "");
2068 cmpval
= LLVMBuildBitCast(builder
, cmpval
, int_vec_type
, "");
2069 mask
= lp_build_cmp(&intbld
, PIPE_FUNC_GREATER
, anosign
, cmpval
);
2070 return lp_build_select(bld
, mask
, a
, res
);
2076 * Return fractional part of 'a' computed as a - floor(a)
2077 * Typically used in texture coord arithmetic.
2080 lp_build_fract(struct lp_build_context
*bld
,
2083 assert(bld
->type
.floating
);
2084 return lp_build_sub(bld
, a
, lp_build_floor(bld
, a
));
2089 * Prevent returning a fractional part of 1.0 for very small negative values of
2090 * 'a' by clamping against 0.99999(9).
2092 static inline LLVMValueRef
2093 clamp_fract(struct lp_build_context
*bld
, LLVMValueRef fract
)
2097 /* this is the largest number smaller than 1.0 representable as float */
2098 max
= lp_build_const_vec(bld
->gallivm
, bld
->type
,
2099 1.0 - 1.0/(1LL << (lp_mantissa(bld
->type
) + 1)));
2100 return lp_build_min(bld
, fract
, max
);
2105 * Same as lp_build_fract, but guarantees that the result is always smaller
2109 lp_build_fract_safe(struct lp_build_context
*bld
,
2112 return clamp_fract(bld
, lp_build_fract(bld
, a
));
2117 * Return the integer part of a float (vector) value (== round toward zero).
2118 * The returned value is an integer (vector).
2119 * Ex: itrunc(-1.5) = -1
2122 lp_build_itrunc(struct lp_build_context
*bld
,
2125 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2126 const struct lp_type type
= bld
->type
;
2127 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
2129 assert(type
.floating
);
2130 assert(lp_check_value(type
, a
));
2132 return LLVMBuildFPToSI(builder
, a
, int_vec_type
, "");
2137 * Return float (vector) rounded to nearest integer (vector). The returned
2138 * value is an integer (vector).
2139 * Ex: iround(0.9) = 1
2140 * Ex: iround(-1.5) = -2
2143 lp_build_iround(struct lp_build_context
*bld
,
2146 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2147 const struct lp_type type
= bld
->type
;
2148 LLVMTypeRef int_vec_type
= bld
->int_vec_type
;
2151 assert(type
.floating
);
2153 assert(lp_check_value(type
, a
));
2155 if ((util_cpu_caps
.has_sse2
&&
2156 ((type
.width
== 32) && (type
.length
== 1 || type
.length
== 4))) ||
2157 (util_cpu_caps
.has_avx
&& type
.width
== 32 && type
.length
== 8)) {
2158 return lp_build_iround_nearest_sse2(bld
, a
);
2160 if (arch_rounding_available(type
)) {
2161 res
= lp_build_round_arch(bld
, a
, LP_BUILD_ROUND_NEAREST
);
2166 half
= lp_build_const_vec(bld
->gallivm
, type
, 0.5);
2169 LLVMTypeRef vec_type
= bld
->vec_type
;
2170 LLVMValueRef mask
= lp_build_const_int_vec(bld
->gallivm
, type
,
2171 (unsigned long long)1 << (type
.width
- 1));
2175 sign
= LLVMBuildBitCast(builder
, a
, int_vec_type
, "");
2176 sign
= LLVMBuildAnd(builder
, sign
, mask
, "");
2179 half
= LLVMBuildBitCast(builder
, half
, int_vec_type
, "");
2180 half
= LLVMBuildOr(builder
, sign
, half
, "");
2181 half
= LLVMBuildBitCast(builder
, half
, vec_type
, "");
2184 res
= LLVMBuildFAdd(builder
, a
, half
, "");
2187 res
= LLVMBuildFPToSI(builder
, res
, int_vec_type
, "");
2194 * Return floor of float (vector), result is an int (vector)
2195 * Ex: ifloor(1.1) = 1
2196 * Ex: ifloor(-1.1) = -2
2199 lp_build_ifloor(struct lp_build_context
*bld
,
2202 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2203 const struct lp_type type
= bld
->type
;
2204 LLVMTypeRef int_vec_type
= bld
->int_vec_type
;
2207 assert(type
.floating
);
2208 assert(lp_check_value(type
, a
));
2212 if (arch_rounding_available(type
)) {
2213 res
= lp_build_round_arch(bld
, a
, LP_BUILD_ROUND_FLOOR
);
2216 struct lp_type inttype
;
2217 struct lp_build_context intbld
;
2218 LLVMValueRef trunc
, itrunc
, mask
;
2220 assert(type
.floating
);
2221 assert(lp_check_value(type
, a
));
2224 inttype
.floating
= 0;
2225 lp_build_context_init(&intbld
, bld
->gallivm
, inttype
);
2227 /* round by truncation */
2228 itrunc
= LLVMBuildFPToSI(builder
, a
, int_vec_type
, "");
2229 trunc
= LLVMBuildSIToFP(builder
, itrunc
, bld
->vec_type
, "ifloor.trunc");
2232 * fix values if rounding is wrong (for non-special cases)
2233 * - this is the case if trunc > a
2234 * The results of doing this with NaNs, very large values etc.
2235 * are undefined but this seems to be the case anyway.
2237 mask
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, trunc
, a
);
2238 /* cheapie minus one with mask since the mask is minus one / zero */
2239 return lp_build_add(&intbld
, itrunc
, mask
);
2243 /* convert to int; FPToSI rounds toward zero */
2244 res
= LLVMBuildFPToSI(builder
, res
, int_vec_type
, "ifloor.res");
2251 * Return ceiling of float (vector), returning int (vector).
2252 * Ex: iceil( 1.1) = 2
2253 * Ex: iceil(-1.1) = -1
2256 lp_build_iceil(struct lp_build_context
*bld
,
2259 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2260 const struct lp_type type
= bld
->type
;
2261 LLVMTypeRef int_vec_type
= bld
->int_vec_type
;
2264 assert(type
.floating
);
2265 assert(lp_check_value(type
, a
));
2267 if (arch_rounding_available(type
)) {
2268 res
= lp_build_round_arch(bld
, a
, LP_BUILD_ROUND_CEIL
);
2271 struct lp_type inttype
;
2272 struct lp_build_context intbld
;
2273 LLVMValueRef trunc
, itrunc
, mask
;
2275 assert(type
.floating
);
2276 assert(lp_check_value(type
, a
));
2279 inttype
.floating
= 0;
2280 lp_build_context_init(&intbld
, bld
->gallivm
, inttype
);
2282 /* round by truncation */
2283 itrunc
= LLVMBuildFPToSI(builder
, a
, int_vec_type
, "");
2284 trunc
= LLVMBuildSIToFP(builder
, itrunc
, bld
->vec_type
, "iceil.trunc");
2287 * fix values if rounding is wrong (for non-special cases)
2288 * - this is the case if trunc < a
2289 * The results of doing this with NaNs, very large values etc.
2290 * are undefined but this seems to be the case anyway.
2292 mask
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, trunc
, a
);
2293 /* cheapie plus one with mask since the mask is minus one / zero */
2294 return lp_build_sub(&intbld
, itrunc
, mask
);
2297 /* convert to int; FPToSI rounds toward zero */
2298 res
= LLVMBuildFPToSI(builder
, res
, int_vec_type
, "iceil.res");
2305 * Combined ifloor() & fract().
2307 * Preferred to calling the functions separately, as it will ensure that the
2308 * strategy (floor() vs ifloor()) that results in less redundant work is used.
2311 lp_build_ifloor_fract(struct lp_build_context
*bld
,
2313 LLVMValueRef
*out_ipart
,
2314 LLVMValueRef
*out_fpart
)
2316 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2317 const struct lp_type type
= bld
->type
;
2320 assert(type
.floating
);
2321 assert(lp_check_value(type
, a
));
2323 if (arch_rounding_available(type
)) {
2325 * floor() is easier.
2328 ipart
= lp_build_floor(bld
, a
);
2329 *out_fpart
= LLVMBuildFSub(builder
, a
, ipart
, "fpart");
2330 *out_ipart
= LLVMBuildFPToSI(builder
, ipart
, bld
->int_vec_type
, "ipart");
2334 * ifloor() is easier.
2337 *out_ipart
= lp_build_ifloor(bld
, a
);
2338 ipart
= LLVMBuildSIToFP(builder
, *out_ipart
, bld
->vec_type
, "ipart");
2339 *out_fpart
= LLVMBuildFSub(builder
, a
, ipart
, "fpart");
2345 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2346 * always smaller than one.
2349 lp_build_ifloor_fract_safe(struct lp_build_context
*bld
,
2351 LLVMValueRef
*out_ipart
,
2352 LLVMValueRef
*out_fpart
)
2354 lp_build_ifloor_fract(bld
, a
, out_ipart
, out_fpart
);
2355 *out_fpart
= clamp_fract(bld
, *out_fpart
);
2360 lp_build_sqrt(struct lp_build_context
*bld
,
2363 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2364 const struct lp_type type
= bld
->type
;
2365 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
2368 assert(lp_check_value(type
, a
));
2370 /* TODO: optimize the constant case */
2372 assert(type
.floating
);
2373 if (type
.length
== 1) {
2374 util_snprintf(intrinsic
, sizeof intrinsic
, "llvm.sqrt.f%u", type
.width
);
2377 util_snprintf(intrinsic
, sizeof intrinsic
, "llvm.sqrt.v%uf%u", type
.length
, type
.width
);
2380 return lp_build_intrinsic_unary(builder
, intrinsic
, vec_type
, a
);
2385 * Do one Newton-Raphson step to improve reciprocal precision:
2387 * x_{i+1} = x_i * (2 - a * x_i)
2389 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2390 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
2391 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
2392 * halo. It would be necessary to clamp the argument to prevent this.
2395 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2396 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2398 static INLINE LLVMValueRef
2399 lp_build_rcp_refine(struct lp_build_context
*bld
,
2403 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2404 LLVMValueRef two
= lp_build_const_vec(bld
->gallivm
, bld
->type
, 2.0);
2407 res
= LLVMBuildFMul(builder
, a
, rcp_a
, "");
2408 res
= LLVMBuildFSub(builder
, two
, res
, "");
2409 res
= LLVMBuildFMul(builder
, rcp_a
, res
, "");
2416 lp_build_rcp(struct lp_build_context
*bld
,
2419 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2420 const struct lp_type type
= bld
->type
;
2422 assert(lp_check_value(type
, a
));
2431 assert(type
.floating
);
2433 if(LLVMIsConstant(a
))
2434 return LLVMConstFDiv(bld
->one
, a
);
2437 * We don't use RCPPS because:
2438 * - it only has 10bits of precision
2439 * - it doesn't even get the reciprocal of 1.0 exactly
2440 * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2441 * - for recent processors the benefit over DIVPS is marginal, a case
2444 * We could still use it on certain processors if benchmarks show that the
2445 * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
2446 * particular uses that require less workarounds.
2449 if (FALSE
&& ((util_cpu_caps
.has_sse
&& type
.width
== 32 && type
.length
== 4) ||
2450 (util_cpu_caps
.has_avx
&& type
.width
== 32 && type
.length
== 8))){
2451 const unsigned num_iterations
= 0;
2454 const char *intrinsic
= NULL
;
2456 if (type
.length
== 4) {
2457 intrinsic
= "llvm.x86.sse.rcp.ps";
2460 intrinsic
= "llvm.x86.avx.rcp.ps.256";
2463 res
= lp_build_intrinsic_unary(builder
, intrinsic
, bld
->vec_type
, a
);
2465 for (i
= 0; i
< num_iterations
; ++i
) {
2466 res
= lp_build_rcp_refine(bld
, a
, res
);
2472 return LLVMBuildFDiv(builder
, bld
->one
, a
, "");
2477 * Do one Newton-Raphson step to improve rsqrt precision:
2479 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2481 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2483 static INLINE LLVMValueRef
2484 lp_build_rsqrt_refine(struct lp_build_context
*bld
,
2486 LLVMValueRef rsqrt_a
)
2488 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2489 LLVMValueRef half
= lp_build_const_vec(bld
->gallivm
, bld
->type
, 0.5);
2490 LLVMValueRef three
= lp_build_const_vec(bld
->gallivm
, bld
->type
, 3.0);
2493 res
= LLVMBuildFMul(builder
, rsqrt_a
, rsqrt_a
, "");
2494 res
= LLVMBuildFMul(builder
, a
, res
, "");
2495 res
= LLVMBuildFSub(builder
, three
, res
, "");
2496 res
= LLVMBuildFMul(builder
, rsqrt_a
, res
, "");
2497 res
= LLVMBuildFMul(builder
, half
, res
, "");
2504 * Generate 1/sqrt(a).
2505 * Result is undefined for values < 0, infinity for +0.
2508 lp_build_rsqrt(struct lp_build_context
*bld
,
2511 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2512 const struct lp_type type
= bld
->type
;
2514 assert(lp_check_value(type
, a
));
2516 assert(type
.floating
);
2519 * This should be faster but all denormals will end up as infinity.
2521 if (0 && lp_build_fast_rsqrt_available(type
)) {
2522 const unsigned num_iterations
= 1;
2526 /* rsqrt(1.0) != 1.0 here */
2527 res
= lp_build_fast_rsqrt(bld
, a
);
2529 if (num_iterations
) {
2531 * Newton-Raphson will result in NaN instead of infinity for zero,
2532 * and NaN instead of zero for infinity.
2533 * Also, need to ensure rsqrt(1.0) == 1.0.
2534 * All numbers smaller than FLT_MIN will result in +infinity
2535 * (rsqrtps treats all denormals as zero).
2538 * Certain non-c99 compilers don't know INFINITY and might not support
2539 * hacks to evaluate it at compile time either.
2541 const unsigned posinf_int
= 0x7F800000;
2543 LLVMValueRef flt_min
= lp_build_const_vec(bld
->gallivm
, type
, FLT_MIN
);
2544 LLVMValueRef inf
= lp_build_const_int_vec(bld
->gallivm
, type
, posinf_int
);
2546 inf
= LLVMBuildBitCast(builder
, inf
, lp_build_vec_type(bld
->gallivm
, type
), "");
2548 for (i
= 0; i
< num_iterations
; ++i
) {
2549 res
= lp_build_rsqrt_refine(bld
, a
, res
);
2551 cmp
= lp_build_compare(bld
->gallivm
, type
, PIPE_FUNC_LESS
, a
, flt_min
);
2552 res
= lp_build_select(bld
, cmp
, inf
, res
);
2553 cmp
= lp_build_compare(bld
->gallivm
, type
, PIPE_FUNC_EQUAL
, a
, inf
);
2554 res
= lp_build_select(bld
, cmp
, bld
->zero
, res
);
2555 cmp
= lp_build_compare(bld
->gallivm
, type
, PIPE_FUNC_EQUAL
, a
, bld
->one
);
2556 res
= lp_build_select(bld
, cmp
, bld
->one
, res
);
2562 return lp_build_rcp(bld
, lp_build_sqrt(bld
, a
));
2566 * If there's a fast (inaccurate) rsqrt instruction available
2567 * (caller may want to avoid to call rsqrt_fast if it's not available,
2568 * i.e. for calculating x^0.5 it may do rsqrt_fast(x) * x but if
2569 * unavailable it would result in sqrt/div/mul so obviously
2570 * much better to just call sqrt, skipping both div and mul).
2573 lp_build_fast_rsqrt_available(struct lp_type type
)
2575 assert(type
.floating
);
2577 if ((util_cpu_caps
.has_sse
&& type
.width
== 32 && type
.length
== 4) ||
2578 (util_cpu_caps
.has_avx
&& type
.width
== 32 && type
.length
== 8)) {
2586 * Generate 1/sqrt(a).
2587 * Result is undefined for values < 0, infinity for +0.
2588 * Precision is limited, only ~10 bits guaranteed
2589 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2592 lp_build_fast_rsqrt(struct lp_build_context
*bld
,
2595 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2596 const struct lp_type type
= bld
->type
;
2598 assert(lp_check_value(type
, a
));
2600 if (lp_build_fast_rsqrt_available(type
)) {
2601 const char *intrinsic
= NULL
;
2603 if (type
.length
== 4) {
2604 intrinsic
= "llvm.x86.sse.rsqrt.ps";
2607 intrinsic
= "llvm.x86.avx.rsqrt.ps.256";
2609 return lp_build_intrinsic_unary(builder
, intrinsic
, bld
->vec_type
, a
);
2612 debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__
);
2614 return lp_build_rcp(bld
, lp_build_sqrt(bld
, a
));
2619 * Generate sin(a) or cos(a) using polynomial approximation.
2620 * TODO: it might be worth recognizing sin and cos using same source
2621 * (i.e. d3d10 sincos opcode). Obviously doing both at the same time
2622 * would be way cheaper than calculating (nearly) everything twice...
2623 * Not sure it's common enough to be worth bothering however, scs
2624 * opcode could also benefit from calculating both though.
2627 lp_build_sin_or_cos(struct lp_build_context
*bld
,
2631 struct gallivm_state
*gallivm
= bld
->gallivm
;
2632 LLVMBuilderRef b
= gallivm
->builder
;
2633 struct lp_type int_type
= lp_int_type(bld
->type
);
2636 * take the absolute value,
2637 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2640 LLVMValueRef inv_sig_mask
= lp_build_const_int_vec(gallivm
, bld
->type
, ~0x80000000);
2641 LLVMValueRef a_v4si
= LLVMBuildBitCast(b
, a
, bld
->int_vec_type
, "a_v4si");
2643 LLVMValueRef absi
= LLVMBuildAnd(b
, a_v4si
, inv_sig_mask
, "absi");
2644 LLVMValueRef x_abs
= LLVMBuildBitCast(b
, absi
, bld
->vec_type
, "x_abs");
2648 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2651 LLVMValueRef FOPi
= lp_build_const_vec(gallivm
, bld
->type
, 1.27323954473516);
2652 LLVMValueRef scale_y
= LLVMBuildFMul(b
, x_abs
, FOPi
, "scale_y");
2655 * store the integer part of y in mm0
2656 * emm2 = _mm_cvttps_epi32(y);
2659 LLVMValueRef emm2_i
= LLVMBuildFPToSI(b
, scale_y
, bld
->int_vec_type
, "emm2_i");
2662 * j=(j+1) & (~1) (see the cephes sources)
2663 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2666 LLVMValueRef all_one
= lp_build_const_int_vec(gallivm
, bld
->type
, 1);
2667 LLVMValueRef emm2_add
= LLVMBuildAdd(b
, emm2_i
, all_one
, "emm2_add");
2669 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2671 LLVMValueRef inv_one
= lp_build_const_int_vec(gallivm
, bld
->type
, ~1);
2672 LLVMValueRef emm2_and
= LLVMBuildAnd(b
, emm2_add
, inv_one
, "emm2_and");
2675 * y = _mm_cvtepi32_ps(emm2);
2677 LLVMValueRef y_2
= LLVMBuildSIToFP(b
, emm2_and
, bld
->vec_type
, "y_2");
2679 LLVMValueRef const_2
= lp_build_const_int_vec(gallivm
, bld
->type
, 2);
2680 LLVMValueRef const_4
= lp_build_const_int_vec(gallivm
, bld
->type
, 4);
2681 LLVMValueRef const_29
= lp_build_const_int_vec(gallivm
, bld
->type
, 29);
2682 LLVMValueRef sign_mask
= lp_build_const_int_vec(gallivm
, bld
->type
, 0x80000000);
2685 * Argument used for poly selection and sign bit determination
2686 * is different for sin vs. cos.
2688 LLVMValueRef emm2_2
= cos
? LLVMBuildSub(b
, emm2_and
, const_2
, "emm2_2") :
2691 LLVMValueRef sign_bit
= cos
? LLVMBuildShl(b
, LLVMBuildAnd(b
, const_4
,
2692 LLVMBuildNot(b
, emm2_2
, ""), ""),
2693 const_29
, "sign_bit") :
2694 LLVMBuildAnd(b
, LLVMBuildXor(b
, a_v4si
,
2695 LLVMBuildShl(b
, emm2_add
,
2697 sign_mask
, "sign_bit");
2700 * get the polynom selection mask
2701 * there is one polynom for 0 <= x <= Pi/4
2702 * and another one for Pi/4<x<=Pi/2
2703 * Both branches will be computed.
2705 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2706 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2709 LLVMValueRef emm2_3
= LLVMBuildAnd(b
, emm2_2
, const_2
, "emm2_3");
2710 LLVMValueRef poly_mask
= lp_build_compare(gallivm
,
2711 int_type
, PIPE_FUNC_EQUAL
,
2712 emm2_3
, lp_build_const_int_vec(gallivm
, bld
->type
, 0));
2715 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2716 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2717 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2719 LLVMValueRef DP1
= lp_build_const_vec(gallivm
, bld
->type
, -0.78515625);
2720 LLVMValueRef DP2
= lp_build_const_vec(gallivm
, bld
->type
, -2.4187564849853515625e-4);
2721 LLVMValueRef DP3
= lp_build_const_vec(gallivm
, bld
->type
, -3.77489497744594108e-8);
2724 * The magic pass: "Extended precision modular arithmetic"
2725 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2726 * xmm1 = _mm_mul_ps(y, xmm1);
2727 * xmm2 = _mm_mul_ps(y, xmm2);
2728 * xmm3 = _mm_mul_ps(y, xmm3);
2730 LLVMValueRef xmm1
= LLVMBuildFMul(b
, y_2
, DP1
, "xmm1");
2731 LLVMValueRef xmm2
= LLVMBuildFMul(b
, y_2
, DP2
, "xmm2");
2732 LLVMValueRef xmm3
= LLVMBuildFMul(b
, y_2
, DP3
, "xmm3");
2735 * x = _mm_add_ps(x, xmm1);
2736 * x = _mm_add_ps(x, xmm2);
2737 * x = _mm_add_ps(x, xmm3);
2740 LLVMValueRef x_1
= LLVMBuildFAdd(b
, x_abs
, xmm1
, "x_1");
2741 LLVMValueRef x_2
= LLVMBuildFAdd(b
, x_1
, xmm2
, "x_2");
2742 LLVMValueRef x_3
= LLVMBuildFAdd(b
, x_2
, xmm3
, "x_3");
2745 * Evaluate the first polynom (0 <= x <= Pi/4)
2747 * z = _mm_mul_ps(x,x);
2749 LLVMValueRef z
= LLVMBuildFMul(b
, x_3
, x_3
, "z");
2752 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2753 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2754 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2756 LLVMValueRef coscof_p0
= lp_build_const_vec(gallivm
, bld
->type
, 2.443315711809948E-005);
2757 LLVMValueRef coscof_p1
= lp_build_const_vec(gallivm
, bld
->type
, -1.388731625493765E-003);
2758 LLVMValueRef coscof_p2
= lp_build_const_vec(gallivm
, bld
->type
, 4.166664568298827E-002);
2761 * y = *(v4sf*)_ps_coscof_p0;
2762 * y = _mm_mul_ps(y, z);
2764 LLVMValueRef y_3
= LLVMBuildFMul(b
, z
, coscof_p0
, "y_3");
2765 LLVMValueRef y_4
= LLVMBuildFAdd(b
, y_3
, coscof_p1
, "y_4");
2766 LLVMValueRef y_5
= LLVMBuildFMul(b
, y_4
, z
, "y_5");
2767 LLVMValueRef y_6
= LLVMBuildFAdd(b
, y_5
, coscof_p2
, "y_6");
2768 LLVMValueRef y_7
= LLVMBuildFMul(b
, y_6
, z
, "y_7");
2769 LLVMValueRef y_8
= LLVMBuildFMul(b
, y_7
, z
, "y_8");
2773 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2774 * y = _mm_sub_ps(y, tmp);
2775 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2777 LLVMValueRef half
= lp_build_const_vec(gallivm
, bld
->type
, 0.5);
2778 LLVMValueRef tmp
= LLVMBuildFMul(b
, z
, half
, "tmp");
2779 LLVMValueRef y_9
= LLVMBuildFSub(b
, y_8
, tmp
, "y_8");
2780 LLVMValueRef one
= lp_build_const_vec(gallivm
, bld
->type
, 1.0);
2781 LLVMValueRef y_10
= LLVMBuildFAdd(b
, y_9
, one
, "y_9");
2784 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2785 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2786 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2788 LLVMValueRef sincof_p0
= lp_build_const_vec(gallivm
, bld
->type
, -1.9515295891E-4);
2789 LLVMValueRef sincof_p1
= lp_build_const_vec(gallivm
, bld
->type
, 8.3321608736E-3);
2790 LLVMValueRef sincof_p2
= lp_build_const_vec(gallivm
, bld
->type
, -1.6666654611E-1);
2793 * Evaluate the second polynom (Pi/4 <= x <= 0)
2795 * y2 = *(v4sf*)_ps_sincof_p0;
2796 * y2 = _mm_mul_ps(y2, z);
2797 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2798 * y2 = _mm_mul_ps(y2, z);
2799 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2800 * y2 = _mm_mul_ps(y2, z);
2801 * y2 = _mm_mul_ps(y2, x);
2802 * y2 = _mm_add_ps(y2, x);
2805 LLVMValueRef y2_3
= LLVMBuildFMul(b
, z
, sincof_p0
, "y2_3");
2806 LLVMValueRef y2_4
= LLVMBuildFAdd(b
, y2_3
, sincof_p1
, "y2_4");
2807 LLVMValueRef y2_5
= LLVMBuildFMul(b
, y2_4
, z
, "y2_5");
2808 LLVMValueRef y2_6
= LLVMBuildFAdd(b
, y2_5
, sincof_p2
, "y2_6");
2809 LLVMValueRef y2_7
= LLVMBuildFMul(b
, y2_6
, z
, "y2_7");
2810 LLVMValueRef y2_8
= LLVMBuildFMul(b
, y2_7
, x_3
, "y2_8");
2811 LLVMValueRef y2_9
= LLVMBuildFAdd(b
, y2_8
, x_3
, "y2_9");
2814 * select the correct result from the two polynoms
2816 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2817 * y = _mm_andnot_ps(xmm3, y);
2818 * y = _mm_or_ps(y,y2);
2820 LLVMValueRef y2_i
= LLVMBuildBitCast(b
, y2_9
, bld
->int_vec_type
, "y2_i");
2821 LLVMValueRef y_i
= LLVMBuildBitCast(b
, y_10
, bld
->int_vec_type
, "y_i");
2822 LLVMValueRef y2_and
= LLVMBuildAnd(b
, y2_i
, poly_mask
, "y2_and");
2823 LLVMValueRef poly_mask_inv
= LLVMBuildNot(b
, poly_mask
, "poly_mask_inv");
2824 LLVMValueRef y_and
= LLVMBuildAnd(b
, y_i
, poly_mask_inv
, "y_and");
2825 LLVMValueRef y_combine
= LLVMBuildOr(b
, y_and
, y2_and
, "y_combine");
2829 * y = _mm_xor_ps(y, sign_bit);
2831 LLVMValueRef y_sign
= LLVMBuildXor(b
, y_combine
, sign_bit
, "y_sign");
2832 LLVMValueRef y_result
= LLVMBuildBitCast(b
, y_sign
, bld
->vec_type
, "y_result");
2834 LLVMValueRef isfinite
= lp_build_isfinite(bld
, a
);
2836 /* clamp output to be within [-1, 1] */
2837 y_result
= lp_build_clamp(bld
, y_result
,
2838 lp_build_const_vec(bld
->gallivm
, bld
->type
, -1.f
),
2839 lp_build_const_vec(bld
->gallivm
, bld
->type
, 1.f
));
2840 /* If a is -inf, inf or NaN then return NaN */
2841 y_result
= lp_build_select(bld
, isfinite
, y_result
,
2842 lp_build_const_vec(bld
->gallivm
, bld
->type
, NAN
));
2851 lp_build_sin(struct lp_build_context
*bld
,
2854 return lp_build_sin_or_cos(bld
, a
, FALSE
);
2862 lp_build_cos(struct lp_build_context
*bld
,
2865 return lp_build_sin_or_cos(bld
, a
, TRUE
);
2870 * Generate pow(x, y)
2873 lp_build_pow(struct lp_build_context
*bld
,
2877 /* TODO: optimize the constant case */
2878 if (gallivm_debug
& GALLIVM_DEBUG_PERF
&&
2879 LLVMIsConstant(x
) && LLVMIsConstant(y
)) {
2880 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2884 return lp_build_exp2(bld
, lp_build_mul(bld
, lp_build_log2(bld
, x
), y
));
2892 lp_build_exp(struct lp_build_context
*bld
,
2895 /* log2(e) = 1/log(2) */
2896 LLVMValueRef log2e
= lp_build_const_vec(bld
->gallivm
, bld
->type
,
2897 1.4426950408889634);
2899 assert(lp_check_value(bld
->type
, x
));
2901 return lp_build_exp2(bld
, lp_build_mul(bld
, log2e
, x
));
2907 * Behavior is undefined with infs, 0s and nans
2910 lp_build_log(struct lp_build_context
*bld
,
2914 LLVMValueRef log2
= lp_build_const_vec(bld
->gallivm
, bld
->type
,
2915 0.69314718055994529);
2917 assert(lp_check_value(bld
->type
, x
));
2919 return lp_build_mul(bld
, log2
, lp_build_log2(bld
, x
));
2923 * Generate log(x) that handles edge cases (infs, 0s and nans)
2926 lp_build_log_safe(struct lp_build_context
*bld
,
2930 LLVMValueRef log2
= lp_build_const_vec(bld
->gallivm
, bld
->type
,
2931 0.69314718055994529);
2933 assert(lp_check_value(bld
->type
, x
));
2935 return lp_build_mul(bld
, log2
, lp_build_log2_safe(bld
, x
));
2940 * Generate polynomial.
2941 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
2944 lp_build_polynomial(struct lp_build_context
*bld
,
2946 const double *coeffs
,
2947 unsigned num_coeffs
)
2949 const struct lp_type type
= bld
->type
;
2950 LLVMValueRef even
= NULL
, odd
= NULL
;
2954 assert(lp_check_value(bld
->type
, x
));
2956 /* TODO: optimize the constant case */
2957 if (gallivm_debug
& GALLIVM_DEBUG_PERF
&&
2958 LLVMIsConstant(x
)) {
2959 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2964 * Calculate odd and even terms seperately to decrease data dependency
2966 * c[0] + x^2 * c[2] + x^4 * c[4] ...
2967 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
2969 x2
= lp_build_mul(bld
, x
, x
);
2971 for (i
= num_coeffs
; i
--; ) {
2974 coeff
= lp_build_const_vec(bld
->gallivm
, type
, coeffs
[i
]);
2978 even
= lp_build_add(bld
, coeff
, lp_build_mul(bld
, x2
, even
));
2983 odd
= lp_build_add(bld
, coeff
, lp_build_mul(bld
, x2
, odd
));
2990 return lp_build_add(bld
, lp_build_mul(bld
, odd
, x
), even
);
2999 * Minimax polynomial fit of 2**x, in range [0, 1[
3001 const double lp_build_exp2_polynomial
[] = {
3002 #if EXP_POLY_DEGREE == 5
3003 1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
3004 0.693153073200168932794,
3005 0.240153617044375388211,
3006 0.0558263180532956664775,
3007 0.00898934009049466391101,
3008 0.00187757667519147912699
3009 #elif EXP_POLY_DEGREE == 4
3010 1.00000259337069434683,
3011 0.693003834469974940458,
3012 0.24144275689150793076,
3013 0.0520114606103070150235,
3014 0.0135341679161270268764
3015 #elif EXP_POLY_DEGREE == 3
3016 0.999925218562710312959,
3017 0.695833540494823811697,
3018 0.226067155427249155588,
3019 0.0780245226406372992967
3020 #elif EXP_POLY_DEGREE == 2
3021 1.00172476321474503578,
3022 0.657636275736077639316,
3023 0.33718943461968720704
3031 lp_build_exp2(struct lp_build_context
*bld
,
3034 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
3035 const struct lp_type type
= bld
->type
;
3036 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
3037 LLVMValueRef ipart
= NULL
;
3038 LLVMValueRef fpart
= NULL
;
3039 LLVMValueRef expipart
= NULL
;
3040 LLVMValueRef expfpart
= NULL
;
3041 LLVMValueRef res
= NULL
;
3043 assert(lp_check_value(bld
->type
, x
));
3046 /* TODO: optimize the constant case */
3047 if (gallivm_debug
& GALLIVM_DEBUG_PERF
&&
3048 LLVMIsConstant(x
)) {
3049 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3053 assert(type
.floating
&& type
.width
== 32);
3055 /* We want to preserve NaN and make sure than for exp2 if x > 128,
3056 * the result is INF and if it's smaller than -126.9 the result is 0 */
3057 x
= lp_build_min_ext(bld
, lp_build_const_vec(bld
->gallivm
, type
, 128.0), x
,
3058 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN
);
3059 x
= lp_build_max(bld
, lp_build_const_vec(bld
->gallivm
, type
, -126.99999), x
);
3061 /* ipart = floor(x) */
3062 /* fpart = x - ipart */
3063 lp_build_ifloor_fract(bld
, x
, &ipart
, &fpart
);
3067 /* expipart = (float) (1 << ipart) */
3068 expipart
= LLVMBuildAdd(builder
, ipart
,
3069 lp_build_const_int_vec(bld
->gallivm
, type
, 127), "");
3070 expipart
= LLVMBuildShl(builder
, expipart
,
3071 lp_build_const_int_vec(bld
->gallivm
, type
, 23), "");
3072 expipart
= LLVMBuildBitCast(builder
, expipart
, vec_type
, "");
3075 expfpart
= lp_build_polynomial(bld
, fpart
, lp_build_exp2_polynomial
,
3076 Elements(lp_build_exp2_polynomial
));
3078 res
= LLVMBuildFMul(builder
, expipart
, expfpart
, "");
3087 * Extract the exponent of a IEEE-754 floating point value.
3089 * Optionally apply an integer bias.
3091 * Result is an integer value with
3093 * ifloor(log2(x)) + bias
3096 lp_build_extract_exponent(struct lp_build_context
*bld
,
3100 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
3101 const struct lp_type type
= bld
->type
;
3102 unsigned mantissa
= lp_mantissa(type
);
3105 assert(type
.floating
);
3107 assert(lp_check_value(bld
->type
, x
));
3109 x
= LLVMBuildBitCast(builder
, x
, bld
->int_vec_type
, "");
3111 res
= LLVMBuildLShr(builder
, x
,
3112 lp_build_const_int_vec(bld
->gallivm
, type
, mantissa
), "");
3113 res
= LLVMBuildAnd(builder
, res
,
3114 lp_build_const_int_vec(bld
->gallivm
, type
, 255), "");
3115 res
= LLVMBuildSub(builder
, res
,
3116 lp_build_const_int_vec(bld
->gallivm
, type
, 127 - bias
), "");
3123 * Extract the mantissa of the a floating.
3125 * Result is a floating point value with
3127 * x / floor(log2(x))
3130 lp_build_extract_mantissa(struct lp_build_context
*bld
,
3133 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
3134 const struct lp_type type
= bld
->type
;
3135 unsigned mantissa
= lp_mantissa(type
);
3136 LLVMValueRef mantmask
= lp_build_const_int_vec(bld
->gallivm
, type
,
3137 (1ULL << mantissa
) - 1);
3138 LLVMValueRef one
= LLVMConstBitCast(bld
->one
, bld
->int_vec_type
);
3141 assert(lp_check_value(bld
->type
, x
));
3143 assert(type
.floating
);
3145 x
= LLVMBuildBitCast(builder
, x
, bld
->int_vec_type
, "");
3147 /* res = x / 2**ipart */
3148 res
= LLVMBuildAnd(builder
, x
, mantmask
, "");
3149 res
= LLVMBuildOr(builder
, res
, one
, "");
3150 res
= LLVMBuildBitCast(builder
, res
, bld
->vec_type
, "");
3158 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x) ,for x in range of [0, 1/9[
3159 * These coefficients can be generate with
3160 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3162 const double lp_build_log2_polynomial
[] = {
3163 #if LOG_POLY_DEGREE == 5
3164 2.88539008148777786488L,
3165 0.961796878841293367824L,
3166 0.577058946784739859012L,
3167 0.412914355135828735411L,
3168 0.308591899232910175289L,
3169 0.352376952300281371868L,
3170 #elif LOG_POLY_DEGREE == 4
3171 2.88539009343309178325L,
3172 0.961791550404184197881L,
3173 0.577440339438736392009L,
3174 0.403343858251329912514L,
3175 0.406718052498846252698L,
3176 #elif LOG_POLY_DEGREE == 3
3177 2.88538959748872753838L,
3178 0.961932915889597772928L,
3179 0.571118517972136195241L,
3180 0.493997535084709500285L,
3187 * See http://www.devmaster.net/forums/showthread.php?p=43580
3188 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3189 * http://www.nezumi.demon.co.uk/consult/logx.htm
3191 * If handle_edge_cases is true the function will perform computations
3192 * to match the required D3D10+ behavior for each of the edge cases.
3193 * That means that if input is:
3194 * - less than zero (to and including -inf) then NaN will be returned
3195 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3196 * - +infinity, then +infinity will be returned
3197 * - NaN, then NaN will be returned
3199 * Those checks are fairly expensive so if you don't need them make sure
3200 * handle_edge_cases is false.
3203 lp_build_log2_approx(struct lp_build_context
*bld
,
3205 LLVMValueRef
*p_exp
,
3206 LLVMValueRef
*p_floor_log2
,
3207 LLVMValueRef
*p_log2
,
3208 boolean handle_edge_cases
)
3210 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
3211 const struct lp_type type
= bld
->type
;
3212 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
3213 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
3215 LLVMValueRef expmask
= lp_build_const_int_vec(bld
->gallivm
, type
, 0x7f800000);
3216 LLVMValueRef mantmask
= lp_build_const_int_vec(bld
->gallivm
, type
, 0x007fffff);
3217 LLVMValueRef one
= LLVMConstBitCast(bld
->one
, int_vec_type
);
3219 LLVMValueRef i
= NULL
;
3220 LLVMValueRef y
= NULL
;
3221 LLVMValueRef z
= NULL
;
3222 LLVMValueRef exp
= NULL
;
3223 LLVMValueRef mant
= NULL
;
3224 LLVMValueRef logexp
= NULL
;
3225 LLVMValueRef logmant
= NULL
;
3226 LLVMValueRef res
= NULL
;
3228 assert(lp_check_value(bld
->type
, x
));
3230 if(p_exp
|| p_floor_log2
|| p_log2
) {
3231 /* TODO: optimize the constant case */
3232 if (gallivm_debug
& GALLIVM_DEBUG_PERF
&&
3233 LLVMIsConstant(x
)) {
3234 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3238 assert(type
.floating
&& type
.width
== 32);
3241 * We don't explicitly handle denormalized numbers. They will yield a
3242 * result in the neighbourhood of -127, which appears to be adequate
3246 i
= LLVMBuildBitCast(builder
, x
, int_vec_type
, "");
3248 /* exp = (float) exponent(x) */
3249 exp
= LLVMBuildAnd(builder
, i
, expmask
, "");
3252 if(p_floor_log2
|| p_log2
) {
3253 logexp
= LLVMBuildLShr(builder
, exp
, lp_build_const_int_vec(bld
->gallivm
, type
, 23), "");
3254 logexp
= LLVMBuildSub(builder
, logexp
, lp_build_const_int_vec(bld
->gallivm
, type
, 127), "");
3255 logexp
= LLVMBuildSIToFP(builder
, logexp
, vec_type
, "");
3259 /* mant = 1 + (float) mantissa(x) */
3260 mant
= LLVMBuildAnd(builder
, i
, mantmask
, "");
3261 mant
= LLVMBuildOr(builder
, mant
, one
, "");
3262 mant
= LLVMBuildBitCast(builder
, mant
, vec_type
, "");
3264 /* y = (mant - 1) / (mant + 1) */
3265 y
= lp_build_div(bld
,
3266 lp_build_sub(bld
, mant
, bld
->one
),
3267 lp_build_add(bld
, mant
, bld
->one
)
3271 z
= lp_build_mul(bld
, y
, y
);
3274 logmant
= lp_build_polynomial(bld
, z
, lp_build_log2_polynomial
,
3275 Elements(lp_build_log2_polynomial
));
3277 /* logmant = y * P(z) */
3278 logmant
= lp_build_mul(bld
, y
, logmant
);
3280 res
= lp_build_add(bld
, logmant
, logexp
);
3282 if (type
.floating
&& handle_edge_cases
) {
3283 LLVMValueRef negmask
, infmask
, zmask
;
3284 negmask
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, x
,
3285 lp_build_const_vec(bld
->gallivm
, type
, 0.0f
));
3286 zmask
= lp_build_cmp(bld
, PIPE_FUNC_EQUAL
, x
,
3287 lp_build_const_vec(bld
->gallivm
, type
, 0.0f
));
3288 infmask
= lp_build_cmp(bld
, PIPE_FUNC_GEQUAL
, x
,
3289 lp_build_const_vec(bld
->gallivm
, type
, INFINITY
));
3291 /* If x is qual to inf make sure we return inf */
3292 res
= lp_build_select(bld
, infmask
,
3293 lp_build_const_vec(bld
->gallivm
, type
, INFINITY
),
3295 /* If x is qual to 0, return -inf */
3296 res
= lp_build_select(bld
, zmask
,
3297 lp_build_const_vec(bld
->gallivm
, type
, -INFINITY
),
3299 /* If x is nan or less than 0, return nan */
3300 res
= lp_build_select(bld
, negmask
,
3301 lp_build_const_vec(bld
->gallivm
, type
, NAN
),
3307 exp
= LLVMBuildBitCast(builder
, exp
, vec_type
, "");
3312 *p_floor_log2
= logexp
;
3320 * log2 implementation which doesn't have special code to
3321 * handle edge cases (-inf, 0, inf, NaN). It's faster but
3322 * the results for those cases are undefined.
3325 lp_build_log2(struct lp_build_context
*bld
,
3329 lp_build_log2_approx(bld
, x
, NULL
, NULL
, &res
, FALSE
);
3334 * Version of log2 which handles all edge cases.
3335 * Look at documentation of lp_build_log2_approx for
3336 * description of the behavior for each of the edge cases.
3339 lp_build_log2_safe(struct lp_build_context
*bld
,
3343 lp_build_log2_approx(bld
, x
, NULL
, NULL
, &res
, TRUE
);
3349 * Faster (and less accurate) log2.
3351 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3353 * Piece-wise linear approximation, with exact results when x is a
3356 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3359 lp_build_fast_log2(struct lp_build_context
*bld
,
3362 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
3366 assert(lp_check_value(bld
->type
, x
));
3368 assert(bld
->type
.floating
);
3370 /* ipart = floor(log2(x)) - 1 */
3371 ipart
= lp_build_extract_exponent(bld
, x
, -1);
3372 ipart
= LLVMBuildSIToFP(builder
, ipart
, bld
->vec_type
, "");
3374 /* fpart = x / 2**ipart */
3375 fpart
= lp_build_extract_mantissa(bld
, x
);
3378 return LLVMBuildFAdd(builder
, ipart
, fpart
, "");
3383 * Fast implementation of iround(log2(x)).
3385 * Not an approximation -- it should give accurate results all the time.
3388 lp_build_ilog2(struct lp_build_context
*bld
,
3391 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
3392 LLVMValueRef sqrt2
= lp_build_const_vec(bld
->gallivm
, bld
->type
, M_SQRT2
);
3395 assert(bld
->type
.floating
);
3397 assert(lp_check_value(bld
->type
, x
));
3399 /* x * 2^(0.5) i.e., add 0.5 to the log2(x) */
3400 x
= LLVMBuildFMul(builder
, x
, sqrt2
, "");
3402 /* ipart = floor(log2(x) + 0.5) */
3403 ipart
= lp_build_extract_exponent(bld
, x
, 0);
3409 lp_build_mod(struct lp_build_context
*bld
,
3413 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
3415 const struct lp_type type
= bld
->type
;
3417 assert(lp_check_value(type
, x
));
3418 assert(lp_check_value(type
, y
));
3421 res
= LLVMBuildFRem(builder
, x
, y
, "");
3423 res
= LLVMBuildSRem(builder
, x
, y
, "");
3425 res
= LLVMBuildURem(builder
, x
, y
, "");
3431 * For floating inputs it creates and returns a mask
3432 * which is all 1's for channels which are NaN.
3433 * Channels inside x which are not NaN will be 0.
3436 lp_build_isnan(struct lp_build_context
*bld
,
3440 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(bld
->gallivm
, bld
->type
);
3442 assert(bld
->type
.floating
);
3443 assert(lp_check_value(bld
->type
, x
));
3445 mask
= LLVMBuildFCmp(bld
->gallivm
->builder
, LLVMRealOEQ
, x
, x
,
3447 mask
= LLVMBuildNot(bld
->gallivm
->builder
, mask
, "");
3448 mask
= LLVMBuildSExt(bld
->gallivm
->builder
, mask
, int_vec_type
, "isnan");
3452 /* Returns all 1's for floating point numbers that are
3453 * finite numbers and returns all zeros for -inf,
3456 lp_build_isfinite(struct lp_build_context
*bld
,
3459 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
3460 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(bld
->gallivm
, bld
->type
);
3461 struct lp_type int_type
= lp_int_type(bld
->type
);
3462 LLVMValueRef intx
= LLVMBuildBitCast(builder
, x
, int_vec_type
, "");
3463 LLVMValueRef infornan32
= lp_build_const_int_vec(bld
->gallivm
, bld
->type
,
3466 if (!bld
->type
.floating
) {
3467 return lp_build_const_int_vec(bld
->gallivm
, bld
->type
, 0);
3469 assert(bld
->type
.floating
);
3470 assert(lp_check_value(bld
->type
, x
));
3471 assert(bld
->type
.width
== 32);
3473 intx
= LLVMBuildAnd(builder
, intx
, infornan32
, "");
3474 return lp_build_compare(bld
->gallivm
, int_type
, PIPE_FUNC_NOTEQUAL
,
3479 * Returns true if the number is nan or inf and false otherwise.
3480 * The input has to be a floating point vector.
3483 lp_build_is_inf_or_nan(struct gallivm_state
*gallivm
,
3484 const struct lp_type type
,
3487 LLVMBuilderRef builder
= gallivm
->builder
;
3488 struct lp_type int_type
= lp_int_type(type
);
3489 LLVMValueRef const0
= lp_build_const_int_vec(gallivm
, int_type
,
3493 assert(type
.floating
);
3495 ret
= LLVMBuildBitCast(builder
, x
, lp_build_vec_type(gallivm
, int_type
), "");
3496 ret
= LLVMBuildAnd(builder
, ret
, const0
, "");
3497 ret
= lp_build_compare(gallivm
, int_type
, PIPE_FUNC_EQUAL
,
3505 lp_build_fpstate_get(struct gallivm_state
*gallivm
)
3507 if (util_cpu_caps
.has_sse
) {
3508 LLVMBuilderRef builder
= gallivm
->builder
;
3509 LLVMValueRef mxcsr_ptr
= lp_build_alloca(
3511 LLVMInt32TypeInContext(gallivm
->context
),
3513 LLVMValueRef mxcsr_ptr8
= LLVMBuildPointerCast(builder
, mxcsr_ptr
,
3514 LLVMPointerType(LLVMInt8TypeInContext(gallivm
->context
), 0), "");
3515 lp_build_intrinsic(builder
,
3516 "llvm.x86.sse.stmxcsr",
3517 LLVMVoidTypeInContext(gallivm
->context
),
3525 lp_build_fpstate_set_denorms_zero(struct gallivm_state
*gallivm
,
3528 if (util_cpu_caps
.has_sse
) {
3529 /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
3530 int daz_ftz
= _MM_FLUSH_ZERO_MASK
;
3532 LLVMBuilderRef builder
= gallivm
->builder
;
3533 LLVMValueRef mxcsr_ptr
= lp_build_fpstate_get(gallivm
);
3534 LLVMValueRef mxcsr
=
3535 LLVMBuildLoad(builder
, mxcsr_ptr
, "mxcsr");
3537 if (util_cpu_caps
.has_daz
) {
3538 /* Enable denormals are zero mode */
3539 daz_ftz
|= _MM_DENORMALS_ZERO_MASK
;
3542 mxcsr
= LLVMBuildOr(builder
, mxcsr
,
3543 LLVMConstInt(LLVMTypeOf(mxcsr
), daz_ftz
, 0), "");
3545 mxcsr
= LLVMBuildAnd(builder
, mxcsr
,
3546 LLVMConstInt(LLVMTypeOf(mxcsr
), ~daz_ftz
, 0), "");
3549 LLVMBuildStore(builder
, mxcsr
, mxcsr_ptr
);
3550 lp_build_fpstate_set(gallivm
, mxcsr_ptr
);
3555 lp_build_fpstate_set(struct gallivm_state
*gallivm
,
3556 LLVMValueRef mxcsr_ptr
)
3558 if (util_cpu_caps
.has_sse
) {
3559 LLVMBuilderRef builder
= gallivm
->builder
;
3560 mxcsr_ptr
= LLVMBuildPointerCast(builder
, mxcsr_ptr
,
3561 LLVMPointerType(LLVMInt8TypeInContext(gallivm
->context
), 0), "");
3562 lp_build_intrinsic(builder
,
3563 "llvm.x86.sse.ldmxcsr",
3564 LLVMVoidTypeInContext(gallivm
->context
),