/**************************************************************************
 *
 * Copyright 2009-2010 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/
/**
 * @file
 * Helper arithmetic functions.
 *
 * LLVM IR doesn't support all basic arithmetic operations we care about (most
 * notably min/max and saturated operations), and it is often necessary to
 * resort to machine-specific intrinsics directly.  The functions here hide all
 * these implementation details from the other modules.
 *
 * We also do simple expression simplification here.  The reasons are:
 * - it is very easy given we have all the necessary information readily available
 * - LLVM optimization passes fail to simplify several vector expressions
 * - we often know value constraints which the optimization passes have no way
 *   of knowing, such as when source arguments are known to be in [0, 1] range.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */
#include "util/u_memory.h"
#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/u_string.h"
#include "util/u_cpu_detect.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_init.h"
#include "lp_bld_intr.h"
#include "lp_bld_logic.h"
#include "lp_bld_pack.h"
#include "lp_bld_debug.h"
#include "lp_bld_bitarit.h"
#include "lp_bld_arit.h"
#include "lp_bld_flow.h"
#define EXP_POLY_DEGREE 5

#define LOG_POLY_DEGREE 4
/**
 * Generate min(a, b)
 * No checks for special case values of a or b = 1 or 0 are done.
 * NaNs are handled according to the behavior specified by the
 * nan_behavior argument.
 */
static LLVMValueRef
lp_build_min_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b,
                    enum gallivm_nan_behavior nan_behavior)
{
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   unsigned intr_size = 0;
   LLVMValueRef cond;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* TODO: optimize the constant case */

   if (type.floating && util_cpu_caps.has_sse) {
      if (type.width == 32) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse.min.ss";
            intr_size = 128;
         }
         else if (type.length <= 4 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse.min.ps";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.min.ps.256";
            intr_size = 256;
         }
      }
      if (type.width == 64 && util_cpu_caps.has_sse2) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse2.min.sd";
            intr_size = 128;
         }
         else if (type.length == 2 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse2.min.pd";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.min.pd.256";
            intr_size = 256;
         }
      }
   }
   else if (type.floating && util_cpu_caps.has_altivec) {
      debug_printf("%s: altivec doesn't support nan behavior modes\n",
                   __FUNCTION__);
      if (type.width == 32 && type.length == 4) {
         intrinsic = "llvm.ppc.altivec.vminfp";
         intr_size = 128;
      }
   } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
      intr_size = 128;
      if ((type.width == 8 || type.width == 16) &&
          (type.width * type.length <= 64) &&
          (gallivm_debug & GALLIVM_DEBUG_PERF)) {
         debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
                      __FUNCTION__);
      }

      if (type.width == 8 && !type.sign) {
         intrinsic = "llvm.x86.sse2.pminu.b";
      }
      else if (type.width == 16 && type.sign) {
         intrinsic = "llvm.x86.sse2.pmins.w";
      }
      if (util_cpu_caps.has_sse4_1) {
         if (type.width == 8 && type.sign) {
            intrinsic = "llvm.x86.sse41.pminsb";
         }
         if (type.width == 16 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pminuw";
         }
         if (type.width == 32 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pminud";
         }
         if (type.width == 32 && type.sign) {
            intrinsic = "llvm.x86.sse41.pminsd";
         }
      }
   } else if (util_cpu_caps.has_altivec) {
      intr_size = 128;
      debug_printf("%s: altivec doesn't support nan behavior modes\n",
                   __FUNCTION__);
      if (type.width == 8) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vminub";
         } else {
            intrinsic = "llvm.ppc.altivec.vminsb";
         }
      } else if (type.width == 16) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vminuh";
         } else {
            intrinsic = "llvm.ppc.altivec.vminsh";
         }
      } else if (type.width == 32) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vminuw";
         } else {
            intrinsic = "llvm.ppc.altivec.vminsw";
         }
      }
   }

   if (intrinsic) {
      /* We need to handle NaNs for floating point numbers.  If one of the
       * inputs is NaN the other should be returned (required by both D3D10+
       * and OpenCL).
       * The SSE intrinsics return the second operand in case of NaN by
       * default, so we need special code to handle those cases.
       */
      if (util_cpu_caps.has_sse && type.floating &&
          nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
          nan_behavior != GALLIVM_NAN_RETURN_SECOND) {
         LLVMValueRef isnan, max;
         max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                   type,
                                                   intr_size, a, b);
         if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
            isnan = lp_build_isnan(bld, b);
            return lp_build_select(bld, isnan, a, max);
         } else {
            assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
            isnan = lp_build_isnan(bld, a);
            return lp_build_select(bld, isnan, a, max);
         }
      } else {
         return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                    type,
                                                    intr_size, a, b);
      }
   }

   if (type.floating) {
      switch (nan_behavior) {
      case GALLIVM_NAN_RETURN_NAN: {
         LLVMValueRef isnan = lp_build_isnan(bld, b);
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
         cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
         return lp_build_select(bld, cond, a, b);
      }
      case GALLIVM_NAN_RETURN_OTHER: {
         LLVMValueRef isnan = lp_build_isnan(bld, a);
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
         cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
         return lp_build_select(bld, cond, a, b);
      }
      case GALLIVM_NAN_RETURN_SECOND:
         cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
         return lp_build_select(bld, cond, a, b);
      case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
         return lp_build_select(bld, cond, a, b);
      default:
         assert(0);
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
         return lp_build_select(bld, cond, a, b);
      }
   } else {
      cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
      return lp_build_select(bld, cond, a, b);
   }
}
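
/*
 * Editor's note: a minimal scalar sketch of the cmp/select fallback built
 * above, compiled out so it does not affect the build.  The helper name is
 * hypothetical, not part of gallivm.  It also shows why the explicit
 * nan_behavior handling is needed: with IEEE unordered comparisons the
 * naive form silently returns `b` whenever `a` is NaN.
 */
#if 0
static inline float
example_min_f32(float a, float b)
{
   /* a < b is false when either operand is NaN (unordered compare), */
   /* so a NaN in `a` falls through to returning `b`. */
   return a < b ? a : b;
}
#endif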
/**
 * Generate max(a, b)
 * No checks for special case values of a or b = 1 or 0 are done.
 * NaNs are handled according to the behavior specified by the
 * nan_behavior argument.
 */
static LLVMValueRef
lp_build_max_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b,
                    enum gallivm_nan_behavior nan_behavior)
{
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   unsigned intr_size = 0;
   LLVMValueRef cond;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* TODO: optimize the constant case */

   if (type.floating && util_cpu_caps.has_sse) {
      if (type.width == 32) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse.max.ss";
            intr_size = 128;
         }
         else if (type.length <= 4 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse.max.ps";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.max.ps.256";
            intr_size = 256;
         }
      }
      if (type.width == 64 && util_cpu_caps.has_sse2) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse2.max.sd";
            intr_size = 128;
         }
         else if (type.length == 2 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse2.max.pd";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.max.pd.256";
            intr_size = 256;
         }
      }
   }
   else if (type.floating && util_cpu_caps.has_altivec) {
      debug_printf("%s: altivec doesn't support nan behavior modes\n",
                   __FUNCTION__);
      if (type.width == 32 && type.length == 4) {
         intrinsic = "llvm.ppc.altivec.vmaxfp";
         intr_size = 128;
      }
   } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
      intr_size = 128;
      if ((type.width == 8 || type.width == 16) &&
          (type.width * type.length <= 64) &&
          (gallivm_debug & GALLIVM_DEBUG_PERF)) {
         debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
                      __FUNCTION__);
      }

      if (type.width == 8 && !type.sign) {
         intrinsic = "llvm.x86.sse2.pmaxu.b";
      }
      else if (type.width == 16 && type.sign) {
         intrinsic = "llvm.x86.sse2.pmaxs.w";
      }
      if (util_cpu_caps.has_sse4_1) {
         if (type.width == 8 && type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxsb";
         }
         if (type.width == 16 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxuw";
         }
         if (type.width == 32 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxud";
         }
         if (type.width == 32 && type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxsd";
         }
      }
   } else if (util_cpu_caps.has_altivec) {
      intr_size = 128;
      debug_printf("%s: altivec doesn't support nan behavior modes\n",
                   __FUNCTION__);
      if (type.width == 8) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vmaxub";
         } else {
            intrinsic = "llvm.ppc.altivec.vmaxsb";
         }
      } else if (type.width == 16) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vmaxuh";
         } else {
            intrinsic = "llvm.ppc.altivec.vmaxsh";
         }
      } else if (type.width == 32) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vmaxuw";
         } else {
            intrinsic = "llvm.ppc.altivec.vmaxsw";
         }
      }
   }

   if (intrinsic) {
      if (util_cpu_caps.has_sse && type.floating &&
          nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
          nan_behavior != GALLIVM_NAN_RETURN_SECOND) {
         LLVMValueRef isnan, max;
         max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                   type,
                                                   intr_size, a, b);
         if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
            isnan = lp_build_isnan(bld, b);
            return lp_build_select(bld, isnan, a, max);
         } else {
            assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
            isnan = lp_build_isnan(bld, a);
            return lp_build_select(bld, isnan, a, max);
         }
      } else {
         return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                    type,
                                                    intr_size, a, b);
      }
   }

   if (type.floating) {
      switch (nan_behavior) {
      case GALLIVM_NAN_RETURN_NAN: {
         LLVMValueRef isnan = lp_build_isnan(bld, b);
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
         cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
         return lp_build_select(bld, cond, a, b);
      }
      case GALLIVM_NAN_RETURN_OTHER: {
         LLVMValueRef isnan = lp_build_isnan(bld, a);
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
         cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
         return lp_build_select(bld, cond, a, b);
      }
      case GALLIVM_NAN_RETURN_SECOND:
         cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
         return lp_build_select(bld, cond, a, b);
      case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
         return lp_build_select(bld, cond, a, b);
      default:
         assert(0);
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
         return lp_build_select(bld, cond, a, b);
      }
   } else {
      cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
      return lp_build_select(bld, cond, a, b);
   }
}
/**
 * Generate 1 - a, or ~a depending on bld->type.
 */
LLVMValueRef
lp_build_comp(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   if(a == bld->one)
      return bld->zero;
   if(a == bld->zero)
      return bld->one;

   if(type.norm && !type.floating && !type.fixed && !type.sign) {
      if(LLVMIsConstant(a))
         return LLVMConstNot(a);
      else
         return LLVMBuildNot(builder, a, "");
   }

   if(LLVMIsConstant(a))
      if (type.floating)
         return LLVMConstFSub(bld->one, a);
      else
         return LLVMConstSub(bld->one, a);
   else
      if (type.floating)
         return LLVMBuildFSub(builder, bld->one, a, "");
      else
         return LLVMBuildSub(builder, bld->one, a, "");
}
/**
 * Generate a + b
 */
LLVMValueRef
lp_build_add(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(a == bld->zero)
      return b;
   if(b == bld->zero)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(bld->type.norm) {
      const char *intrinsic = NULL;

      if(a == bld->one || b == bld->one)
         return bld->one;

      if (type.width * type.length == 128 &&
          !type.floating && !type.fixed) {
         if(util_cpu_caps.has_sse2) {
            if(type.width == 8)
               intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
            if(type.width == 16)
               intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
         } else if (util_cpu_caps.has_altivec) {
            if(type.width == 8)
               intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
            if(type.width == 16)
               intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
         }
      }

      if (intrinsic)
         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
   }

   /* TODO: handle signed case */
   if(type.norm && !type.floating && !type.fixed && !type.sign)
      a = lp_build_min_simple(bld, a, lp_build_comp(bld, b), GALLIVM_NAN_BEHAVIOR_UNDEFINED);

   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      if (type.floating)
         res = LLVMConstFAdd(a, b);
      else
         res = LLVMConstAdd(a, b);
   else
      if (type.floating)
         res = LLVMBuildFAdd(builder, a, b, "");
      else
         res = LLVMBuildAdd(builder, a, b, "");

   /* clamp to ceiling of 1.0 */
   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);

   /* XXX clamp to floor of -1 or 0??? */

   return res;
}
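
/*
 * Editor's note: a compiled-out scalar sketch (hypothetical helper) of the
 * saturated unorm8 add fallback above: clamping `a` against the complement
 * of `b` before adding guarantees the sum cannot wrap past 255.
 */
#if 0
static inline unsigned char
example_add_unorm8_sat(unsigned char a, unsigned char b)
{
   unsigned char comp_b = (unsigned char)~b;   /* 255 - b */
   if (a > comp_b)
      a = comp_b;                              /* a = min(a, 255 - b) */
   return (unsigned char)(a + b);              /* now a + b <= 255 */
}
#endif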
/**
 * Return the scalar sum of the elements of a.
 * Should avoid this operation whenever possible.
 */
LLVMValueRef
lp_build_horizontal_add(struct lp_build_context *bld,
                        LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef index, res;
   unsigned i, length;
   LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
   LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
   LLVMValueRef vecres, elem2;

   assert(lp_check_value(type, a));

   if (type.length == 1) {
      return a;
   }

   assert(!bld->type.norm);

   /*
    * For byte vectors we could do much better with psadbw.
    * Using repeated shuffle/adds here.  Note with multiple vectors
    * this can be done more efficiently as outlined in the Intel
    * optimization manual.
    * Note: could cause data rearrangement if used with smaller element
    * sizes.
    */

   vecres = a;
   length = type.length / 2;
   while (length > 1) {
      LLVMValueRef vec1, vec2;
      for (i = 0; i < length; i++) {
         shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
         shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
      }
      vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
                                    LLVMConstVector(shuffles1, length), "");
      vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
                                    LLVMConstVector(shuffles2, length), "");
      if (type.floating) {
         vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
      }
      else {
         vecres = LLVMBuildAdd(builder, vec1, vec2, "");
      }
      length = length >> 1;
   }

   /* always have vector of size 2 here */
   assert(length == 1);

   index = lp_build_const_int32(bld->gallivm, 0);
   res = LLVMBuildExtractElement(builder, vecres, index, "");
   index = lp_build_const_int32(bld->gallivm, 1);
   elem2 = LLVMBuildExtractElement(builder, vecres, index, "");

   if (type.floating)
      res = LLVMBuildFAdd(builder, res, elem2, "");
   else
      res = LLVMBuildAdd(builder, res, elem2, "");

   return res;
}
/**
 * Return the horizontal sums of 4 float vectors as a float4 vector.
 * This uses the technique as outlined in the Intel Optimization Manual.
 */
static LLVMValueRef
lp_build_horizontal_add4x4f(struct lp_build_context *bld,
                            LLVMValueRef src[4])
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef shuffles[4];
   LLVMValueRef tmp[4];
   LLVMValueRef sumtmp[2], shuftmp[2];

   /* lower half of regs */
   shuffles[0] = lp_build_const_int32(gallivm, 0);
   shuffles[1] = lp_build_const_int32(gallivm, 1);
   shuffles[2] = lp_build_const_int32(gallivm, 4);
   shuffles[3] = lp_build_const_int32(gallivm, 5);
   tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
                                   LLVMConstVector(shuffles, 4), "");
   tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
                                   LLVMConstVector(shuffles, 4), "");

   /* upper half of regs */
   shuffles[0] = lp_build_const_int32(gallivm, 2);
   shuffles[1] = lp_build_const_int32(gallivm, 3);
   shuffles[2] = lp_build_const_int32(gallivm, 6);
   shuffles[3] = lp_build_const_int32(gallivm, 7);
   tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
                                   LLVMConstVector(shuffles, 4), "");
   tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
                                   LLVMConstVector(shuffles, 4), "");

   sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
   sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");

   shuffles[0] = lp_build_const_int32(gallivm, 0);
   shuffles[1] = lp_build_const_int32(gallivm, 2);
   shuffles[2] = lp_build_const_int32(gallivm, 4);
   shuffles[3] = lp_build_const_int32(gallivm, 6);
   shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
                                       LLVMConstVector(shuffles, 4), "");

   shuffles[0] = lp_build_const_int32(gallivm, 1);
   shuffles[1] = lp_build_const_int32(gallivm, 3);
   shuffles[2] = lp_build_const_int32(gallivm, 5);
   shuffles[3] = lp_build_const_int32(gallivm, 7);
   shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
                                       LLVMConstVector(shuffles, 4), "");

   return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
}
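
/*
 * Editor's note: a compiled-out scalar model (hypothetical helper) of what
 * the shuffles above compute: lane i of the result is the horizontal sum of
 * src[i], with the pair-sums formed by the low/high interleaves and the
 * final even/odd shuffle + add.
 */
#if 0
static inline void
example_horizontal_add4x4f(const float src[4][4], float out[4])
{
   int i;
   for (i = 0; i < 4; i++)
      out[i] = (src[i][0] + src[i][2]) + (src[i][1] + src[i][3]);
}
#endif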
/*
 * partially horizontally add 2-4 float vectors with length nx4,
 * i.e. only four adjacent values in each vector will be added,
 * assuming values are really grouped in 4 which also determines
 * output order.
 *
 * Return a vector of the same length as the initial vectors,
 * with the excess elements (if any) being undefined.
 * The element order is independent of number of input vectors.
 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
 * the output order thus will be
 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
 */
LLVMValueRef
lp_build_hadd_partial4(struct lp_build_context *bld,
                       LLVMValueRef vectors[],
                       unsigned num_vecs)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef ret_vec;
   LLVMValueRef tmp[4];
   const char *intrinsic = NULL;

   assert(num_vecs >= 2 && num_vecs <= 4);
   assert(bld->type.floating);

   /* only use this with at least 2 vectors, as it is sort of expensive
    * (depending on cpu) and we always need two horizontal adds anyway,
    * so a shuffle/add approach might be better.
    */

   tmp[0] = vectors[0];
   tmp[1] = vectors[1];

   tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
   tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];

   if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
       bld->type.length == 4) {
      intrinsic = "llvm.x86.sse3.hadd.ps";
   }
   else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
            bld->type.length == 8) {
      intrinsic = "llvm.x86.avx.hadd.ps.256";
   }

   if (intrinsic) {
      tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
                                         lp_build_vec_type(gallivm, bld->type),
                                         tmp[0], tmp[1]);
      if (num_vecs > 2) {
         tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
                                            lp_build_vec_type(gallivm, bld->type),
                                            tmp[2], tmp[3]);
      }
      else {
         tmp[1] = tmp[0];
      }
      return lp_build_intrinsic_binary(builder, intrinsic,
                                       lp_build_vec_type(gallivm, bld->type),
                                       tmp[0], tmp[1]);
   }

   if (bld->type.length == 4) {
      ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
   }
   else {
      LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
      unsigned num_iter = bld->type.length / 4;
      struct lp_type parttype = bld->type;
      unsigned i, j;
      parttype.length = 4;
      for (j = 0; j < num_iter; j++) {
         LLVMValueRef partsrc[4];
         for (i = 0; i < 4; i++) {
            partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
         }
         partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
      }
      ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
   }

   return ret_vec;
}
/**
 * Generate a - b
 */
LLVMValueRef
lp_build_sub(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(b == bld->zero)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;
   if(a == b)
      return bld->zero;

   if(bld->type.norm) {
      const char *intrinsic = NULL;

      if(b == bld->one)
         return bld->zero;

      if (type.width * type.length == 128 &&
          !type.floating && !type.fixed) {
         if (util_cpu_caps.has_sse2) {
            if(type.width == 8)
               intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
            if(type.width == 16)
               intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
         } else if (util_cpu_caps.has_altivec) {
            if(type.width == 8)
               intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
            if(type.width == 16)
               intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
         }
      }

      if (intrinsic)
         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
   }

   /* TODO: handle signed case */
   if(type.norm && !type.floating && !type.fixed && !type.sign)
      a = lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);

   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      if (type.floating)
         res = LLVMConstFSub(a, b);
      else
         res = LLVMConstSub(a, b);
   else
      if (type.floating)
         res = LLVMBuildFSub(builder, a, b, "");
      else
         res = LLVMBuildSub(builder, a, b, "");

   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);

   return res;
}
/**
 * Normalized multiplication.
 *
 * There are several approaches (using 8-bit normalized multiplication as
 * an example):
 *
 * - alpha plus one
 *
 *     makes the following approximation to the division (Sree)
 *
 *       a*b/255 ~= (a*(b + 1)) >> 8
 *
 *     which is the fastest method that satisfies the following OpenGL
 *     criteria of
 *
 *       0*0 = 0 and 255*255 = 255
 *
 * - geometric series
 *
 *     takes the geometric series approximation to the division
 *
 *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
 *
 *     in this case just the first two terms to fit in 16bit arithmetic
 *
 *       t/255 ~= (t + (t >> 8)) >> 8
 *
 *     note that just by itself it doesn't satisfy the OpenGL criteria, as
 *     255*255 = 254, so the special case b = 255 must be accounted for, or
 *     roundoff must be used.
 *
 * - geometric series plus rounding
 *
 *     when using a geometric series division instead of truncating the result
 *     use roundoff in the approximation (Jim Blinn)
 *
 *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
 *
 *     achieving exact results.
 *
 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
 *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
 * @sa Michael Herf, The "double blend trick", May 2000,
 *     http://www.stereopsis.com/doubleblend.html
 */
LLVMValueRef
lp_build_mul_norm(struct gallivm_state *gallivm,
                  struct lp_type wide_type,
                  LLVMValueRef a, LLVMValueRef b)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_build_context bld;
   unsigned n;
   LLVMValueRef half;
   LLVMValueRef ab;

   assert(!wide_type.floating);
   assert(lp_check_value(wide_type, a));
   assert(lp_check_value(wide_type, b));

   lp_build_context_init(&bld, gallivm, wide_type);

   n = wide_type.width / 2;
   if (wide_type.sign) {
      --n;
   }

   /*
    * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
    * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
    */

   /*
    * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
    */

   ab = LLVMBuildMul(builder, a, b, "");
   ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");

   /*
    * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
    */

   half = lp_build_const_int_vec(gallivm, wide_type, 1 << (n - 1));
   if (wide_type.sign) {
      LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
      LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
      half = lp_build_select(&bld, sign, minus_half, half);
   }
   ab = LLVMBuildAdd(builder, ab, half, "");

   /* Final division */
   ab = lp_build_shr_imm(&bld, ab, n);

   return ab;
}
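
/*
 * Editor's note: a compiled-out scalar reference (hypothetical helper) for
 * the "geometric series plus rounding" method implemented above, in the
 * unsigned 8-bit case.  This form is exact: 0*0 = 0 and 255*255 = 255.
 */
#if 0
static inline unsigned char
example_mul_unorm8(unsigned char a, unsigned char b)
{
   unsigned t = (unsigned)a * b;                        /* a*b */
   return (unsigned char)((t + (t >> 8) + 0x80) >> 8);  /* ~= t/255, rounded */
}
#endif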
/**
 * Generate a * b
 */
LLVMValueRef
lp_build_mul(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef shift;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(a == bld->zero)
      return bld->zero;
   if(a == bld->one)
      return b;
   if(b == bld->zero)
      return bld->zero;
   if(b == bld->one)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if (!type.floating && !type.fixed && type.norm) {
      struct lp_type wide_type = lp_wider_type(type);
      LLVMValueRef al, ah, bl, bh, abl, abh, ab;

      lp_build_unpack2(bld->gallivm, type, wide_type, a, &al, &ah);
      lp_build_unpack2(bld->gallivm, type, wide_type, b, &bl, &bh);

      /* PMULLW, PSRLW, PADDW */
      abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
      abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);

      ab = lp_build_pack2(bld->gallivm, wide_type, type, abl, abh);

      return ab;
   }

   if(type.fixed)
      shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
   else
      shift = NULL;

   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
      if (type.floating)
         res = LLVMConstFMul(a, b);
      else
         res = LLVMConstMul(a, b);
      if(shift) {
         if(type.sign)
            res = LLVMConstAShr(res, shift);
         else
            res = LLVMConstLShr(res, shift);
      }
   }
   else {
      if (type.floating)
         res = LLVMBuildFMul(builder, a, b, "");
      else
         res = LLVMBuildMul(builder, a, b, "");
      if(shift) {
         if(type.sign)
            res = LLVMBuildAShr(builder, res, shift, "");
         else
            res = LLVMBuildLShr(builder, res, shift, "");
      }
   }

   return res;
}
/**
 * Small vector x scale multiplication optimization.
 */
LLVMValueRef
lp_build_mul_imm(struct lp_build_context *bld,
                 LLVMValueRef a,
                 int b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef factor;

   assert(lp_check_value(bld->type, a));

   if(b == 0)
      return bld->zero;

   if(b == 1)
      return a;

   if(b == -1)
      return lp_build_negate(bld, a);

   if(b == 2 && bld->type.floating)
      return lp_build_add(bld, a, a);

   if(util_is_power_of_two(b)) {
      unsigned shift = ffs(b) - 1;

      if(bld->type.floating) {
#if 0
         /*
          * Power of two multiplication by directly manipulating the exponent.
          *
          * XXX: This might not always be faster, it will introduce a small
          * error for multiplication by zero, and it will produce wrong results
          * for Inf and NaN.
          */
         unsigned mantissa = lp_mantissa(bld->type);
         factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
         a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->gallivm, bld->type), "");
         a = LLVMBuildAdd(builder, a, factor, "");
         a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
         return a;
#endif
      }
      else {
         factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
         return LLVMBuildShl(builder, a, factor, "");
      }
   }

   factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
   return lp_build_mul(bld, a, factor);
}
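
/*
 * Editor's note: a compiled-out scalar sketch (hypothetical helper) of the
 * disabled exponent trick above: adding shift << 23 to the bits of an IEEE
 * single-precision float multiplies it by 2^shift, except for zero,
 * denormals, Inf and NaN, which is exactly the caveat the XXX comment
 * warns about.
 */
#if 0
static inline float
example_mul_pot_f32(float x, unsigned shift)
{
   union { float f; unsigned u; } v;
   v.f = x;
   v.u += shift << 23;   /* 23 = number of mantissa bits */
   return v.f;
}
#endif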
/**
 * Generate a / b
 */
LLVMValueRef
lp_build_div(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(a == bld->zero)
      return bld->zero;
   if(a == bld->one)
      return lp_build_rcp(bld, b);
   if(b == bld->zero)
      return bld->undef;
   if(b == bld->one)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
      if (type.floating)
         return LLVMConstFDiv(a, b);
      else if (type.sign)
         return LLVMConstSDiv(a, b);
      else
         return LLVMConstUDiv(a, b);
   }

   if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
       (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
      type.floating)
      return lp_build_mul(bld, a, lp_build_rcp(bld, b));

   if (type.floating)
      return LLVMBuildFDiv(builder, a, b, "");
   else if (type.sign)
      return LLVMBuildSDiv(builder, a, b, "");
   else
      return LLVMBuildUDiv(builder, a, b, "");
}
/**
 * Linear interpolation helper.
 *
 * @param normalized whether we are interpolating normalized values,
 *        encoded in normalized integers, twice as wide.
 *
 * @sa http://www.stereopsis.com/doubleblend.html
 */
static INLINE LLVMValueRef
lp_build_lerp_simple(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef v0,
                     LLVMValueRef v1,
                     unsigned flags)
{
   unsigned half_width = bld->type.width/2;
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef delta;
   LLVMValueRef res;

   assert(lp_check_value(bld->type, x));
   assert(lp_check_value(bld->type, v0));
   assert(lp_check_value(bld->type, v1));

   delta = lp_build_sub(bld, v1, v0);

   if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
      if (!bld->type.sign) {
         if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
            /*
             * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
             * most significant bit to the least significant bit, so that
             * later we can just divide by 2**n instead of 2**n - 1.
             */
            x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
         }

         /* (x * delta) >> n */
         res = lp_build_mul(bld, x, delta);
         res = lp_build_shr_imm(bld, res, half_width);
      } else {
         /*
          * The rescaling trick above doesn't work for signed numbers, so
          * use the 2**n - 1 division approximation in lp_build_mul_norm
          * instead.
          */
         assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
         res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
      }
   } else {
      assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
      res = lp_build_mul(bld, x, delta);
   }

   res = lp_build_add(bld, v0, res);

   if (((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) ||
       bld->type.fixed) {
      /* We need to mask out the high order bits when lerping 8bit normalized colors stored on 16bits */
      /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
       * but it will be wrong for true fixed point use cases.  Basically we need
       * a more powerful lp_type, capable of further distinguishing the values
       * interpretation from the value storage. */
      res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1), "");
   }

   return res;
}
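
/*
 * Editor's note: a compiled-out scalar model (hypothetical helper) of the
 * unsigned wide-normalized path above for 8-bit values widened to 16 bits
 * (half_width == 8).  Note how modulo-2^16 arithmetic plus the final mask
 * gives the right answer even when v1 < v0 makes delta "negative".
 */
#if 0
static inline unsigned char
example_lerp_unorm8(unsigned char x, unsigned char v0, unsigned char v1)
{
   unsigned short delta = (unsigned short)(v1 - v0);  /* wraps mod 2^16 */
   unsigned short w = (unsigned short)(x + (x >> 7)); /* [0,255] -> [0,256] */
   unsigned short res = (unsigned short)(v0 + ((unsigned short)(w * delta) >> 8));
   return (unsigned char)(res & 0xff);                /* mask out high bits */
}
#endif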
/**
 * Linear interpolation.
 */
LLVMValueRef
lp_build_lerp(struct lp_build_context *bld,
              LLVMValueRef x,
              LLVMValueRef v0,
              LLVMValueRef v1,
              unsigned flags)
{
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, x));
   assert(lp_check_value(type, v0));
   assert(lp_check_value(type, v1));

   assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));

   if (type.norm) {
      struct lp_type wide_type;
      struct lp_build_context wide_bld;
      LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;

      assert(type.length >= 2);

      /*
       * Create a wider integer type, enough to hold the
       * intermediate result of the multiplication.
       */
      memset(&wide_type, 0, sizeof wide_type);
      wide_type.sign   = type.sign;
      wide_type.width  = type.width*2;
      wide_type.length = type.length/2;

      lp_build_context_init(&wide_bld, bld->gallivm, wide_type);

      lp_build_unpack2(bld->gallivm, type, wide_type, x,  &xl,  &xh);
      lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
      lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);

      /*
       * Lerp both halves.
       */

      flags |= LP_BLD_LERP_WIDE_NORMALIZED;

      resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
      resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);

      res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
   } else {
      res = lp_build_lerp_simple(bld, x, v0, v1, flags);
   }

   return res;
}
/**
 * Bilinear interpolation.
 *
 * Value indices are in v_{yx} order.
 */
LLVMValueRef
lp_build_lerp_2d(struct lp_build_context *bld,
                 LLVMValueRef x,
                 LLVMValueRef y,
                 LLVMValueRef v00,
                 LLVMValueRef v01,
                 LLVMValueRef v10,
                 LLVMValueRef v11,
                 unsigned flags)
{
   LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
   LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
   return lp_build_lerp(bld, y, v0, v1, flags);
}
LLVMValueRef
lp_build_lerp_3d(struct lp_build_context *bld,
                 LLVMValueRef x,
                 LLVMValueRef y,
                 LLVMValueRef z,
                 LLVMValueRef v000,
                 LLVMValueRef v001,
                 LLVMValueRef v010,
                 LLVMValueRef v011,
                 LLVMValueRef v100,
                 LLVMValueRef v101,
                 LLVMValueRef v110,
                 LLVMValueRef v111,
                 unsigned flags)
{
   LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
   LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
   return lp_build_lerp(bld, z, v0, v1, flags);
}
/**
 * Generate min(a, b)
 * Do checks for special cases but not for NaNs.
 */
LLVMValueRef
lp_build_min(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   if (bld->type.norm) {
      if (!bld->type.sign) {
         if (a == bld->zero || b == bld->zero) {
            return bld->zero;
         }
      }
      if(a == bld->one)
         return b;
      if(b == bld->one)
         return a;
   }

   return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
}
/**
 * Generate min(a, b)
 * NaNs are handled according to the behavior specified by the
 * nan_behavior argument.
 */
LLVMValueRef
lp_build_min_ext(struct lp_build_context *bld,
                 LLVMValueRef a,
                 LLVMValueRef b,
                 enum gallivm_nan_behavior nan_behavior)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   if (bld->type.norm) {
      if (!bld->type.sign) {
         if (a == bld->zero || b == bld->zero) {
            return bld->zero;
         }
      }
      if(a == bld->one)
         return b;
      if(b == bld->one)
         return a;
   }

   return lp_build_min_simple(bld, a, b, nan_behavior);
}
/**
 * Generate max(a, b)
 * Do checks for special cases, but NaN behavior is undefined.
 */
LLVMValueRef
lp_build_max(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   if(bld->type.norm) {
      if(a == bld->one || b == bld->one)
         return bld->one;
      if (!bld->type.sign) {
         if (a == bld->zero) {
            return b;
         }
         if (b == bld->zero) {
            return a;
         }
      }
   }

   return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
}
/**
 * Generate max(a, b)
 * Checks for special cases.
 * NaNs are handled according to the behavior specified by the
 * nan_behavior argument.
 */
LLVMValueRef
lp_build_max_ext(struct lp_build_context *bld,
                 LLVMValueRef a,
                 LLVMValueRef b,
                 enum gallivm_nan_behavior nan_behavior)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   if(bld->type.norm) {
      if(a == bld->one || b == bld->one)
         return bld->one;
      if (!bld->type.sign) {
         if (a == bld->zero) {
            return b;
         }
         if (b == bld->zero) {
            return a;
         }
      }
   }

   return lp_build_max_simple(bld, a, b, nan_behavior);
}
/**
 * Generate clamp(a, min, max)
 * Do checks for special cases.
 */
LLVMValueRef
lp_build_clamp(struct lp_build_context *bld,
               LLVMValueRef a,
               LLVMValueRef min,
               LLVMValueRef max)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, min));
   assert(lp_check_value(bld->type, max));

   /*
    * XXX dark magic warning: the order of min/max here matters (!).
    * A typical use case is clamp(a, 0.0, 1.0) (e.g. for float->unorm
    * conversion).  Doing max first, as below, gives 0.0 for NaN on x86
    * sse2, whereas doing min first would give 1.0 for NaN, which makes
    * d3d10 angry...
    * This is very much not guaranteed behavior though; it just happens
    * to work on x86 sse2 (and up), and it obviously won't help for other
    * non-zero clamps (say -1.0/1.0 in a SNORM conversion) either, so this
    * needs to be fixed for real...
    */
   a = lp_build_max(bld, a, min);
   a = lp_build_min(bld, a, max);
   return a;
}
/**
 * Generate abs(a)
 */
LLVMValueRef
lp_build_abs(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);

   assert(lp_check_value(type, a));

   if(!type.sign)
      return a;

   if(type.floating) {
      /* Mask out the sign bit */
      LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
      unsigned long long absMask = ~(1ULL << (type.width - 1));
      LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
      a = LLVMBuildBitCast(builder, a, int_vec_type, "");
      a = LLVMBuildAnd(builder, a, mask, "");
      a = LLVMBuildBitCast(builder, a, vec_type, "");
      return a;
   }

   if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
      switch(type.width) {
      case 8:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
      case 16:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
      case 32:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
      }
   }
   else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
            (gallivm_debug & GALLIVM_DEBUG_PERF) &&
            (type.width == 8 || type.width == 16 || type.width == 32)) {
      debug_printf("%s: inefficient code, should split vectors manually\n",
                   __FUNCTION__);
   }

   return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
}
LLVMValueRef
lp_build_negate(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;

   assert(lp_check_value(bld->type, a));

#if HAVE_LLVM >= 0x0207
   if (bld->type.floating)
      a = LLVMBuildFNeg(builder, a, "");
   else
#endif
      a = LLVMBuildNeg(builder, a, "");

   return a;
}
/** Return -1, 0 or +1 depending on the sign of a */
LLVMValueRef
lp_build_sgn(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef cond;
   LLVMValueRef res;

   assert(lp_check_value(type, a));

   /* Handle non-zero case */
   if(!type.sign) {
      /* if not zero then sign must be positive */
      res = bld->one;
   }
   else if(type.floating) {
      LLVMTypeRef vec_type;
      LLVMTypeRef int_type;
      LLVMValueRef mask;
      LLVMValueRef sign;
      LLVMValueRef one;
      unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);

      int_type = lp_build_int_vec_type(bld->gallivm, type);
      vec_type = lp_build_vec_type(bld->gallivm, type);
      mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);

      /* Take the sign bit and add it to 1 constant */
      sign = LLVMBuildBitCast(builder, a, int_type, "");
      sign = LLVMBuildAnd(builder, sign, mask, "");
      one = LLVMConstBitCast(bld->one, int_type);
      res = LLVMBuildOr(builder, sign, one, "");
      res = LLVMBuildBitCast(builder, res, vec_type, "");
   }
   else
   {
      /* signed int/norm/fixed point */
      /* could use psign with sse3 and appropriate vectors here */
      LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
      cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
      res = lp_build_select(bld, cond, bld->one, minus_one);
   }

   /* Handle zero */
   cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
   res = lp_build_select(bld, cond, bld->zero, res);

   return res;
}
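
/*
 * Editor's note: a compiled-out scalar sketch (hypothetical helper) of the
 * floating-point branch above: OR the sign bit of `a` onto the constant 1.0
 * to get +/-1.0, then fix up the zero case afterwards.
 */
#if 0
static inline float
example_sgn_f32(float a)
{
   union { float f; unsigned u; } one, v;
   if (a == 0.0f)
      return 0.0f;                  /* the final select above handles this */
   one.f = 1.0f;
   v.f = a;
   one.u |= v.u & 0x80000000u;      /* transfer the sign bit onto 1.0 */
   return one.f;
}
#endif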
/**
 * Set the sign of float vector 'a' according to 'sign'.
 * If sign==0, return abs(a).
 * If sign==1, return -abs(a);
 * Other values for sign produce undefined results.
 */
LLVMValueRef
lp_build_set_sign(struct lp_build_context *bld,
                  LLVMValueRef a, LLVMValueRef sign)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
   LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
                             ~((unsigned long long) 1 << (type.width - 1)));
   LLVMValueRef val, res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   /* val = reinterpret_cast<int>(a) */
   val = LLVMBuildBitCast(builder, a, int_vec_type, "");
   /* val = val & mask */
   val = LLVMBuildAnd(builder, val, mask, "");
   /* sign = sign << shift */
   sign = LLVMBuildShl(builder, sign, shift, "");
   /* res = val | sign */
   res = LLVMBuildOr(builder, val, sign, "");
   /* res = reinterpret_cast<float>(res) */
   res = LLVMBuildBitCast(builder, res, vec_type, "");

   return res;
}
/**
 * Convert vector of (or scalar) int to vector of (or scalar) float.
 */
LLVMValueRef
lp_build_int_to_float(struct lp_build_context *bld,
                      LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);

   assert(type.floating);

   return LLVMBuildSIToFP(builder, a, vec_type, "");
}
static boolean
arch_rounding_available(const struct lp_type type)
{
   if ((util_cpu_caps.has_sse4_1 &&
       (type.length == 1 || type.width*type.length == 128)) ||
       (util_cpu_caps.has_avx && type.width*type.length == 256))
      return TRUE;
   else if ((util_cpu_caps.has_altivec &&
            (type.width == 32 && type.length == 4)))
      return TRUE;

   return FALSE;
}
enum lp_build_round_mode
{
   LP_BUILD_ROUND_NEAREST = 0,
   LP_BUILD_ROUND_FLOOR = 1,
   LP_BUILD_ROUND_CEIL = 2,
   LP_BUILD_ROUND_TRUNCATE = 3
};
/**
 * Helper for SSE4.1's ROUNDxx instructions.
 *
 * NOTE: In SSE4.1's nearest mode, if two values are equally close, the
 * result is the even value.  That is, rounding 2.5 will be 2.0, and not 3.0.
 */
static INLINE LLVMValueRef
lp_build_round_sse41(struct lp_build_context *bld,
                     LLVMValueRef a,
                     enum lp_build_round_mode mode)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
   const char *intrinsic;
   LLVMValueRef res;

   assert(type.floating);

   assert(lp_check_value(type, a));
   assert(util_cpu_caps.has_sse4_1);

   if (type.length == 1) {
      LLVMTypeRef vec_type;
      LLVMValueRef undef;
      LLVMValueRef args[3];
      LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);

      switch(type.width) {
      case 32:
         intrinsic = "llvm.x86.sse41.round.ss";
         break;
      case 64:
         intrinsic = "llvm.x86.sse41.round.sd";
         break;
      default:
         assert(0);
         return bld->undef;
      }

      vec_type = LLVMVectorType(bld->elem_type, 4);

      undef = LLVMGetUndef(vec_type);

      args[0] = undef;
      args[1] = LLVMBuildInsertElement(builder, undef, a, index0, "");
      args[2] = LLVMConstInt(i32t, mode, 0);

      res = lp_build_intrinsic(builder, intrinsic,
                               vec_type, args, Elements(args));

      res = LLVMBuildExtractElement(builder, res, index0, "");
   }
   else {
      if (type.width * type.length == 128) {
         switch(type.width) {
         case 32:
            intrinsic = "llvm.x86.sse41.round.ps";
            break;
         case 64:
            intrinsic = "llvm.x86.sse41.round.pd";
            break;
         default:
            assert(0);
            return bld->undef;
         }
      }
      else {
         assert(type.width * type.length == 256);
         assert(util_cpu_caps.has_avx);

         switch(type.width) {
         case 32:
            intrinsic = "llvm.x86.avx.round.ps.256";
            break;
         case 64:
            intrinsic = "llvm.x86.avx.round.pd.256";
            break;
         default:
            assert(0);
            return bld->undef;
         }
      }

      res = lp_build_intrinsic_binary(builder, intrinsic,
                                      bld->vec_type, a,
                                      LLVMConstInt(i32t, mode, 0));
   }

   return res;
}
static INLINE LLVMValueRef
lp_build_iround_nearest_sse2(struct lp_build_context *bld,
                             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
   LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
   const char *intrinsic;
   LLVMValueRef res;

   assert(type.floating);
   /* using the double precision conversions is a bit more complicated */
   assert(type.width == 32);

   assert(lp_check_value(type, a));
   assert(util_cpu_caps.has_sse2);

   /* This is relying on MXCSR rounding mode, which should always be nearest. */
   if (type.length == 1) {
      LLVMTypeRef vec_type;
      LLVMValueRef undef;
      LLVMValueRef arg;
      LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);

      vec_type = LLVMVectorType(bld->elem_type, 4);

      intrinsic = "llvm.x86.sse.cvtss2si";

      undef = LLVMGetUndef(vec_type);

      arg = LLVMBuildInsertElement(builder, undef, a, index0, "");

      res = lp_build_intrinsic_unary(builder, intrinsic,
                                     ret_type, arg);
   }
   else {
      if (type.width * type.length == 128) {
         intrinsic = "llvm.x86.sse2.cvtps2dq";
      }
      else {
         assert(type.width * type.length == 256);
         assert(util_cpu_caps.has_avx);

         intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
      }
      res = lp_build_intrinsic_unary(builder, intrinsic,
                                     ret_type, a);
   }

   return res;
}
static INLINE LLVMValueRef
lp_build_round_altivec(struct lp_build_context *bld,
                       LLVMValueRef a,
                       enum lp_build_round_mode mode)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;

   assert(type.floating);

   assert(lp_check_value(type, a));
   assert(util_cpu_caps.has_altivec);

   switch (mode) {
   case LP_BUILD_ROUND_NEAREST:
      intrinsic = "llvm.ppc.altivec.vrfin";
      break;
   case LP_BUILD_ROUND_FLOOR:
      intrinsic = "llvm.ppc.altivec.vrfim";
      break;
   case LP_BUILD_ROUND_CEIL:
      intrinsic = "llvm.ppc.altivec.vrfip";
      break;
   case LP_BUILD_ROUND_TRUNCATE:
      intrinsic = "llvm.ppc.altivec.vrfiz";
      break;
   }

   return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
}
static INLINE LLVMValueRef
lp_build_round_arch(struct lp_build_context *bld,
                    LLVMValueRef a,
                    enum lp_build_round_mode mode)
{
   if (util_cpu_caps.has_sse4_1)
      return lp_build_round_sse41(bld, a, mode);
   else /* (util_cpu_caps.has_altivec) */
      return lp_build_round_altivec(bld, a, mode);
}
/**
 * Return the integer part of a float (vector) value (== round toward zero).
 * The returned value is a float (vector).
 * Ex: trunc(-1.5) = -1.0
 */
LLVMValueRef
lp_build_trunc(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
   }
   else {
      const struct lp_type type = bld->type;
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
      LLVMValueRef trunc, res, anosign, mask;
      LLVMTypeRef int_vec_type = bld->int_vec_type;
      LLVMTypeRef vec_type = bld->vec_type;

      assert(type.width == 32); /* might want to handle doubles at some point */

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      /* round by truncation */
      trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");

      /* mask out sign bit */
      anosign = lp_build_abs(bld, a);
      /*
       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is no-op for them
       * because such floats are always exact) as well as special cases like
       * NaNs, Infs (taking advantage of the fact they use max exponent).
       * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
       */
      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
      return lp_build_select(bld, mask, a, res);
   }
}
/**
 * Return float (vector) rounded to nearest integer (vector).  The returned
 * value is a float (vector).
 * Ex: round(0.9) = 1.0
 * Ex: round(-1.5) = -2.0
 */
LLVMValueRef
lp_build_round(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
   }
   else {
      const struct lp_type type = bld->type;
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
      LLVMValueRef res, anosign, mask;
      LLVMTypeRef int_vec_type = bld->int_vec_type;
      LLVMTypeRef vec_type = bld->vec_type;

      assert(type.width == 32); /* might want to handle doubles at some point */

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      res = lp_build_iround(bld, a);
      res = LLVMBuildSIToFP(builder, res, vec_type, "");

      /* mask out sign bit */
      anosign = lp_build_abs(bld, a);
      /*
       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is no-op for them
       * because such floats are always exact) as well as special cases like
       * NaNs, Infs (taking advantage of the fact they use max exponent).
       * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
       */
      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
      return lp_build_select(bld, mask, a, res);
   }
}
/**
 * Return floor of float (vector), result is a float (vector).
 * Ex: floor(1.1) = 1.0
 * Ex: floor(-1.1) = -2.0
 */
LLVMValueRef
lp_build_floor(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
   }
   else {
      const struct lp_type type = bld->type;
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
      LLVMValueRef trunc, res, anosign, mask;
      LLVMTypeRef int_vec_type = bld->int_vec_type;
      LLVMTypeRef vec_type = bld->vec_type;

      assert(type.width == 32); /* might want to handle doubles at some point */

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      /* round by truncation */
      trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");

      if (type.sign) {
         LLVMValueRef tmp;

         /*
          * fix values if rounding is wrong (for non-special cases)
          * - this is the case if trunc > a
          */
         mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
         /* tmp = trunc > a ? 1.0 : 0.0 */
         tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
         tmp = lp_build_and(&intbld, mask, tmp);
         tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
         res = lp_build_sub(bld, res, tmp);
      }

      /* mask out sign bit */
      anosign = lp_build_abs(bld, a);
      /*
       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is no-op for them
       * because such floats are always exact) as well as special cases like
       * NaNs, Infs (taking advantage of the fact they use max exponent).
       * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
       */
      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
      return lp_build_select(bld, mask, a, res);
   }
}
/**
 * Return ceiling of float (vector), returning float (vector).
 * Ex: ceil( 1.1) = 2.0
 * Ex: ceil(-1.1) = -1.0
 */
LLVMValueRef
lp_build_ceil(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
   }
   else {
      const struct lp_type type = bld->type;
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
      LLVMValueRef trunc, res, anosign, mask, tmp;
      LLVMTypeRef int_vec_type = bld->int_vec_type;
      LLVMTypeRef vec_type = bld->vec_type;

      assert(type.width == 32); /* might want to handle doubles at some point */

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      /* round by truncation */
      trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");

      /*
       * fix values if rounding is wrong (for non-special cases)
       * - this is the case if trunc < a
       */
      mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
      /* tmp = trunc < a ? 1.0 : 0.0 */
      tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
      tmp = lp_build_and(&intbld, mask, tmp);
      tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
      res = lp_build_add(bld, trunc, tmp);

      /* mask out sign bit */
      anosign = lp_build_abs(bld, a);
      /*
       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is no-op for them
       * because such floats are always exact) as well as special cases like
       * NaNs, Infs (taking advantage of the fact they use max exponent).
       * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
       */
      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
      return lp_build_select(bld, mask, a, res);
   }
}
/**
 * Return fractional part of 'a' computed as a - floor(a).
 * Typically used in texture coord arithmetic.
 */
LLVMValueRef
lp_build_fract(struct lp_build_context *bld,
               LLVMValueRef a)
{
   assert(bld->type.floating);
   return lp_build_sub(bld, a, lp_build_floor(bld, a));
}
/**
 * Prevent returning a fractional part of 1.0 for very small negative values
 * of 'a' by clamping against 0.99999(9).
 */
static inline LLVMValueRef
clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
{
   LLVMValueRef max;

   /* this is the largest number smaller than 1.0 representable as float */
   max = lp_build_const_vec(bld->gallivm, bld->type,
                            1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
   return lp_build_min(bld, fract, max);
}
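
/*
 * Editor's note: for 32-bit floats lp_mantissa() is 23, so the constant
 * above works out to 1.0 - 1.0/(1 << 24) = 0.99999994, the largest float
 * strictly below 1.0 (bit pattern 0x3f7fffff).
 */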
/**
 * Same as lp_build_fract, but guarantees that the result is always smaller
 * than one.
 */
LLVMValueRef
lp_build_fract_safe(struct lp_build_context *bld,
                    LLVMValueRef a)
{
   return clamp_fract(bld, lp_build_fract(bld, a));
}
/**
 * Return the integer part of a float (vector) value (== round toward zero).
 * The returned value is an integer (vector).
 * Ex: itrunc(-1.5) = -1
 */
LLVMValueRef
lp_build_itrunc(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);

   assert(type.floating);
   assert(lp_check_value(type, a));

   return LLVMBuildFPToSI(builder, a, int_vec_type, "");
}
/**
 * Return float (vector) rounded to nearest integer (vector).  The returned
 * value is an integer (vector).
 * Ex: iround(0.9) = 1
 * Ex: iround(-1.5) = -2
 */
LLVMValueRef
lp_build_iround(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);

   assert(lp_check_value(type, a));

   if ((util_cpu_caps.has_sse2 &&
       ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
       (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
      return lp_build_iround_nearest_sse2(bld, a);
   }
   if (arch_rounding_available(type)) {
      res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
   }
   else {
      LLVMValueRef half;

      half = lp_build_const_vec(bld->gallivm, type, 0.5);

      if (type.sign) {
         LLVMTypeRef vec_type = bld->vec_type;
         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
                                    (unsigned long long)1 << (type.width - 1));
         LLVMValueRef sign;

         /* get sign bit */
         sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
         sign = LLVMBuildAnd(builder, sign, mask, "");

         /* sign * 0.5 */
         half = LLVMBuildBitCast(builder, half, int_vec_type, "");
         half = LLVMBuildOr(builder, sign, half, "");
         half = LLVMBuildBitCast(builder, half, vec_type, "");
      }

      res = LLVMBuildFAdd(builder, a, half, "");
   }

   res = LLVMBuildFPToSI(builder, res, int_vec_type, "");

   return res;
}
/**
 * Return floor of float (vector), result is an int (vector).
 * Ex: ifloor(1.1) = 1
 * Ex: ifloor(-1.1) = -2
 */
LLVMValueRef
lp_build_ifloor(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   res = a;
   if (type.sign) {
      if (arch_rounding_available(type)) {
         res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
      }
      else {
         struct lp_type inttype;
         struct lp_build_context intbld;
         LLVMValueRef trunc, itrunc, mask;

         assert(type.floating);
         assert(lp_check_value(type, a));

         inttype = type;
         inttype.floating = 0;
         lp_build_context_init(&intbld, bld->gallivm, inttype);

         /* round by truncation */
         itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
         trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");

         /*
          * fix values if rounding is wrong (for non-special cases)
          * - this is the case if trunc > a
          * The results of doing this with NaNs, very large values etc.
          * are undefined but this seems to be the case anyway.
          */
         mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
         /* cheapie minus one with mask since the mask is minus one / zero */
         return lp_build_add(&intbld, itrunc, mask);
      }
   }

   /* round to nearest (toward zero) */
   res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");

   return res;
}
/**
 * Return ceiling of float (vector), returning int (vector).
 * Ex: iceil( 1.1) = 2
 * Ex: iceil(-1.1) = -1
 */
LLVMValueRef
lp_build_iceil(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
   }
   else {
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef trunc, itrunc, mask;

      assert(type.floating);
      assert(lp_check_value(type, a));

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      /* round by truncation */
      itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");

      /*
       * fix values if rounding is wrong (for non-special cases)
       * - this is the case if trunc < a
       * The results of doing this with NaNs, very large values etc.
       * are undefined but this seems to be the case anyway.
       */
      mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
      /* cheapie plus one with mask since the mask is minus one / zero */
      return lp_build_sub(&intbld, itrunc, mask);
   }

   /* round to nearest (toward zero) */
   res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");

   return res;
}
/**
 * Combined ifloor() & fract().
 *
 * Preferred to calling the functions separately, as it will ensure that the
 * strategy (floor() vs ifloor()) that results in less redundant work is used.
 */
void
lp_build_ifloor_fract(struct lp_build_context *bld,
                      LLVMValueRef a,
                      LLVMValueRef *out_ipart,
                      LLVMValueRef *out_fpart)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef ipart;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      /*
       * floor() is easier.
       */

      ipart = lp_build_floor(bld, a);
      *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
      *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
   }
   else {
      /*
       * ifloor() is easier.
       */

      *out_ipart = lp_build_ifloor(bld, a);
      ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
      *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
   }
}
/**
 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
 * always smaller than one.
 */
void
lp_build_ifloor_fract_safe(struct lp_build_context *bld,
                           LLVMValueRef a,
                           LLVMValueRef *out_ipart,
                           LLVMValueRef *out_fpart)
{
   lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
   *out_fpart = clamp_fract(bld, *out_fpart);
}
LLVMValueRef
lp_build_sqrt(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   char intrinsic[32];

   assert(lp_check_value(type, a));

   /* TODO: optimize the constant case */

   assert(type.floating);
   if (type.length == 1) {
      util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.f%u", type.width);
   }
   else {
      util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
   }

   return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
}
/**
 * Do one Newton-Raphson step to improve reciprocal precision:
 *
 *   x_{i+1} = x_i * (2 - a * x_i)
 *
 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
 * halo. It would be necessary to clamp the argument to prevent this.
 *
 * See also:
 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
 */
static INLINE LLVMValueRef
lp_build_rcp_refine(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef rcp_a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
   LLVMValueRef res;

   res = LLVMBuildFMul(builder, a, rcp_a, "");
   res = LLVMBuildFSub(builder, two, res, "");
   res = LLVMBuildFMul(builder, rcp_a, res, "");

   return res;
}
LLVMValueRef
lp_build_rcp(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   if(a == bld->zero)
      return bld->undef;
   if(a == bld->one)
      return bld->one;
   if(a == bld->undef)
      return bld->undef;

   assert(type.floating);

   if(LLVMIsConstant(a))
      return LLVMConstFDiv(bld->one, a);

   /*
    * We don't use RCPPS because:
    * - it only has 10bits of precision
    * - it doesn't even get the reciprocal of 1.0 exactly
    * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
    * - for recent processors the benefit over DIVPS is marginal, and case
    *   dependent
    *
    * We could still use it on certain processors if benchmarks show that the
    * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
    * particular uses that require fewer workarounds.
    */

   if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
                 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
      const unsigned num_iterations = 0;
      LLVMValueRef res;
      unsigned i;
      const char *intrinsic = NULL;

      if (type.length == 4) {
         intrinsic = "llvm.x86.sse.rcp.ps";
      }
      else {
         intrinsic = "llvm.x86.avx.rcp.ps.256";
      }

      res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);

      for (i = 0; i < num_iterations; ++i) {
         res = lp_build_rcp_refine(bld, a, res);
      }

      return res;
   }

   return LLVMBuildFDiv(builder, bld->one, a, "");
}
/**
 * Do one Newton-Raphson step to improve rsqrt precision:
 *
 *   x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
 *
 * See also Intel 64 and IA-32 Architectures Optimization Manual.
 */
static INLINE LLVMValueRef
lp_build_rsqrt_refine(struct lp_build_context *bld,
                      LLVMValueRef a,
                      LLVMValueRef rsqrt_a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
   LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
   LLVMValueRef res;

   res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
   res = LLVMBuildFMul(builder, a, res, "");
   res = LLVMBuildFSub(builder, three, res, "");
   res = LLVMBuildFMul(builder, rsqrt_a, res, "");
   res = LLVMBuildFMul(builder, half, res, "");

   return res;
}
/**
 * Generate 1/sqrt(a).
 * Result is undefined for values < 0, infinity for +0.
 */
LLVMValueRef
lp_build_rsqrt(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   assert(type.floating);

   /*
    * This should be faster but all denormals will end up as infinity.
    */
   if (0 && lp_build_fast_rsqrt_available(type)) {
      const unsigned num_iterations = 1;
      LLVMValueRef res;
      unsigned i;

      /* rsqrt(1.0) != 1.0 here */
      res = lp_build_fast_rsqrt(bld, a);

      if (num_iterations) {
         /*
          * Newton-Raphson will result in NaN instead of infinity for zero,
          * and NaN instead of zero for infinity.
          * Also, need to ensure rsqrt(1.0) == 1.0.
          * All numbers smaller than FLT_MIN will result in +infinity
          * (rsqrtps treats all denormals as zero).
          */
         /*
          * Certain non-c99 compilers don't know INFINITY and might not support
          * hacks to evaluate it at compile time either.
          */
         const unsigned posinf_int = 0x7F800000;
         LLVMValueRef cmp;
         LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
         LLVMValueRef inf = lp_build_const_int_vec(bld->gallivm, type, posinf_int);

         inf = LLVMBuildBitCast(builder, inf, lp_build_vec_type(bld->gallivm, type), "");

         for (i = 0; i < num_iterations; ++i) {
            res = lp_build_rsqrt_refine(bld, a, res);
         }
         cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
         res = lp_build_select(bld, cmp, inf, res);
         cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
         res = lp_build_select(bld, cmp, bld->zero, res);
         cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
         res = lp_build_select(bld, cmp, bld->one, res);
      }

      return res;
   }

   return lp_build_rcp(bld, lp_build_sqrt(bld, a));
}
/**
 * Check whether a fast (but inaccurate) rsqrt instruction is available.
 * Callers may want to avoid lp_build_fast_rsqrt() when it isn't: e.g. for
 * x^0.5 one can do rsqrt_fast(x) * x, but if that falls back to the
 * emulation it would result in sqrt/div/mul, so it's obviously much better
 * to just call sqrt directly, skipping both the div and the mul.
 */
boolean
lp_build_fast_rsqrt_available(struct lp_type type)
{
   assert(type.floating);

   if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
       (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
      return TRUE;
   }
   return FALSE;
}
/**
 * Generate 1/sqrt(a).
 * Result is undefined for values < 0, infinity for +0.
 * Precision is limited, only ~10 bits guaranteed
 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
 */
LLVMValueRef
lp_build_fast_rsqrt(struct lp_build_context *bld,
                    LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   if (lp_build_fast_rsqrt_available(type)) {
      const char *intrinsic = NULL;

      if (type.length == 4) {
         intrinsic = "llvm.x86.sse.rsqrt.ps";
      }
      else {
         intrinsic = "llvm.x86.avx.rsqrt.ps.256";
      }
      return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
   }
   else {
      debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
   }
   return lp_build_rcp(bld, lp_build_sqrt(bld, a));
}
/**
 * Generate sin(a) using SSE2
 */
LLVMValueRef
lp_build_sin(struct lp_build_context *bld,
             LLVMValueRef a)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type int_type = lp_int_type(bld->type);
   LLVMBuilderRef b = builder;

   /*
    * take the absolute value,
    * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
    */

   LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");

   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");

   /*
    * extract the sign bit (upper one)
    * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
    */
   LLVMValueRef sig_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
   LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i");

   /*
    * scale by 4/Pi
    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
    */

   LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");

   /*
    * store the integer part of y in mm0
    * emm2 = _mm_cvttps_epi32(y);
    */

   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");

   /*
    * j=(j+1) & (~1) (see the cephes sources)
    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
    */

   LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
   LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
   /*
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
    */
   LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
   LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");

   /*
    * y = _mm_cvtepi32_ps(emm2);
    */
   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");

   /* get the swap sign flag
    * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
    */
   LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
   LLVMValueRef emm0_and = LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and");
   /*
    * emm2 = _mm_slli_epi32(emm0, 29);
    */
   LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
   LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit");

   /*
    * get the polynomial selection mask
    * there is one polynomial for 0 <= x <= Pi/4
    * and another one for Pi/4 < x <= Pi/2
    * Both branches will be computed.
    *
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
    */

   LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
   LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3");
   LLVMValueRef poly_mask = lp_build_compare(gallivm,
                                             int_type, PIPE_FUNC_EQUAL,
                                             emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
   /*
    * sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
    */
   LLVMValueRef sign_bit_1 = LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit");

   /*
    * _PS_CONST(minus_cephes_DP1, -0.78515625);
    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
    */
   LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
   LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
   LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);

   /*
    * The magic pass: "Extended precision modular arithmetic"
    * x = ((x - y * DP1) - y * DP2) - y * DP3;
    * xmm1 = _mm_mul_ps(y, xmm1);
    * xmm2 = _mm_mul_ps(y, xmm2);
    * xmm3 = _mm_mul_ps(y, xmm3);
    */
   LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
   LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
   LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");

   /*
    * x = _mm_add_ps(x, xmm1);
    * x = _mm_add_ps(x, xmm2);
    * x = _mm_add_ps(x, xmm3);
    */

   LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
   LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
   LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");

   /*
    * Evaluate the first polynomial (0 <= x <= Pi/4)
    *
    * z = _mm_mul_ps(x,x);
    */
   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");

   /*
    * _PS_CONST(coscof_p0, 2.443315711809948E-005);
    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
    * _PS_CONST(coscof_p2, 4.166664568298827E-002);
    */
   LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
   LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
   LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);

   /*
    * y = *(v4sf*)_ps_coscof_p0;
    * y = _mm_mul_ps(y, z);
    */
   LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
   LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
   LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
   LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");

   /*
    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
    * y = _mm_sub_ps(y, tmp);
    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
    */
   LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
   LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");

   /*
    * _PS_CONST(sincof_p0, -1.9515295891E-4);
    * _PS_CONST(sincof_p1,  8.3321608736E-3);
    * _PS_CONST(sincof_p2, -1.6666654611E-1);
    */
   LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
   LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
   LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);

   /*
    * Evaluate the second polynomial (Pi/4 < x <= Pi/2)
    *
    * y2 = *(v4sf*)_ps_sincof_p0;
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_mul_ps(y2, x);
    * y2 = _mm_add_ps(y2, x);
    */

   LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
   LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
   LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
   LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
   LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
   LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");

   /*
    * select the correct result from the two polynomials
    *
    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
    * y = _mm_andnot_ps(xmm3, y);
    * y = _mm_or_ps(y,y2);
    */
   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
   LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
   LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");

   /*
    * update the sign
    * y = _mm_xor_ps(y, sign_bit);
    */
   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin");
   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");

   LLVMValueRef isfinite = lp_build_isfinite(bld, a);

   /* clamp output to be within [-1, 1] */
   y_result = lp_build_clamp(bld, y_result,
                             lp_build_const_vec(bld->gallivm, bld->type, -1.f),
                             lp_build_const_vec(bld->gallivm, bld->type, 1.f));
   /* If a is -inf, inf or NaN then return NaN */
   y_result = lp_build_select(bld, isfinite, y_result,
                              lp_build_const_vec(bld->gallivm, bld->type, NAN));
   return y_result;
}
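
/*
 * Illustrative scalar model of the cephes-style argument reduction used
 * above (hypothetical helper, floats only): pick the octant j, reduce x
 * modulo pi/4 in extended precision, and let the caller derive the
 * polynomial selector and sign flip from j, as the vector code does with
 * masks.
 */
static INLINE float
example_sin_reduce_scalar(float x, int *j)
{
   float y;
   *j = (int)(x * 1.27323954473516f);  /* x * 4/pi */
   *j = (*j + 1) & ~1;                 /* round j up to an even value */
   y = (float)*j;
   /* x - y * pi/4, split into three terms for extra precision */
   return ((x - y * 0.78515625f)
             - y * 2.4187564849853515625e-4f)
             - y * 3.77489497744594108e-8f;
}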
/**
 * Generate cos(a) using SSE2
 */
LLVMValueRef
lp_build_cos(struct lp_build_context *bld,
             LLVMValueRef a)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type int_type = lp_int_type(bld->type);
   LLVMBuilderRef b = builder;

   /*
    * take the absolute value,
    * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
    */

   LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");

   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");

   /*
    * scale by 4/Pi
    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
    */

   LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");

   /*
    * store the integer part of y in mm0
    * emm2 = _mm_cvttps_epi32(y);
    */

   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");

   /*
    * j=(j+1) & (~1) (see the cephes sources)
    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
    */

   LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
   LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
   /*
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
    */
   LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
   LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");

   /*
    * y = _mm_cvtepi32_ps(emm2);
    */
   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");

   /*
    * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
    */
   LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
   LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2");

   /* get the swap sign flag
    * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
    */
   LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0);
   LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not");
   LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
   LLVMValueRef emm0_and = LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and");
   /*
    * emm2 = _mm_slli_epi32(emm0, 29);
    */
   LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
   LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit");

   /*
    * get the polynomial selection mask
    * there is one polynomial for 0 <= x <= Pi/4
    * and another one for Pi/4 < x <= Pi/2
    * Both branches will be computed.
    *
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
    */

   LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
   LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3");
   LLVMValueRef poly_mask = lp_build_compare(gallivm,
                                             int_type, PIPE_FUNC_EQUAL,
                                             emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));

   /*
    * _PS_CONST(minus_cephes_DP1, -0.78515625);
    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
    */
   LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
   LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
   LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);

   /*
    * The magic pass: "Extended precision modular arithmetic"
    * x = ((x - y * DP1) - y * DP2) - y * DP3;
    * xmm1 = _mm_mul_ps(y, xmm1);
    * xmm2 = _mm_mul_ps(y, xmm2);
    * xmm3 = _mm_mul_ps(y, xmm3);
    */
   LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
   LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
   LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");

   /*
    * x = _mm_add_ps(x, xmm1);
    * x = _mm_add_ps(x, xmm2);
    * x = _mm_add_ps(x, xmm3);
    */

   LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
   LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
   LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");

   /*
    * Evaluate the first polynomial (0 <= x <= Pi/4)
    *
    * z = _mm_mul_ps(x,x);
    */
   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");

   /*
    * _PS_CONST(coscof_p0, 2.443315711809948E-005);
    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
    * _PS_CONST(coscof_p2, 4.166664568298827E-002);
    */
   LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
   LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
   LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);

   /*
    * y = *(v4sf*)_ps_coscof_p0;
    * y = _mm_mul_ps(y, z);
    */
   LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
   LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
   LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
   LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");

   /*
    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
    * y = _mm_sub_ps(y, tmp);
    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
    */
   LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
   LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");

   /*
    * _PS_CONST(sincof_p0, -1.9515295891E-4);
    * _PS_CONST(sincof_p1,  8.3321608736E-3);
    * _PS_CONST(sincof_p2, -1.6666654611E-1);
    */
   LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
   LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
   LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);

   /*
    * Evaluate the second polynomial (Pi/4 < x <= Pi/2)
    *
    * y2 = *(v4sf*)_ps_sincof_p0;
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_mul_ps(y2, x);
    * y2 = _mm_add_ps(y2, x);
    */

   LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
   LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
   LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
   LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
   LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
   LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");

   /*
    * select the correct result from the two polynomials
    *
    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
    * y = _mm_andnot_ps(xmm3, y);
    * y = _mm_or_ps(y,y2);
    */
   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
   LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
   LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");

   /*
    * update the sign
    * y = _mm_xor_ps(y, sign_bit);
    */
   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sin");
   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");

   LLVMValueRef isfinite = lp_build_isfinite(bld, a);

   /* clamp output to be within [-1, 1] */
   y_result = lp_build_clamp(bld, y_result,
                             lp_build_const_vec(bld->gallivm, bld->type, -1.f),
                             lp_build_const_vec(bld->gallivm, bld->type, 1.f));
   /* If a is -inf, inf or NaN then return NaN */
   y_result = lp_build_select(bld, isfinite, y_result,
                              lp_build_const_vec(bld->gallivm, bld->type, NAN));
   return y_result;
}
/**
 * Generate pow(x, y)
 */
LLVMValueRef
lp_build_pow(struct lp_build_context *bld,
             LLVMValueRef x,
             LLVMValueRef y)
{
   /* TODO: optimize the constant case */
   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
       LLVMIsConstant(x) && LLVMIsConstant(y)) {
      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                   __FUNCTION__);
   }

   return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
}
/**
 * Generate exp(x)
 */
LLVMValueRef
lp_build_exp(struct lp_build_context *bld,
             LLVMValueRef x)
{
   /* log2(e) = 1/log(2) */
   LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
                                           1.4426950408889634);

   assert(lp_check_value(bld->type, x));

   return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
}
/**
 * Generate log(x)
 * Behavior is undefined with infs, 0s and nans
 */
LLVMValueRef
lp_build_log(struct lp_build_context *bld,
             LLVMValueRef x)
{
   /* log(2) */
   LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
                                          0.69314718055994529);

   assert(lp_check_value(bld->type, x));

   return lp_build_mul(bld, log2, lp_build_log2(bld, x));
}
/**
 * Generate log(x) that handles edge cases (infs, 0s and nans)
 */
LLVMValueRef
lp_build_log_safe(struct lp_build_context *bld,
                  LLVMValueRef x)
{
   /* log(2) */
   LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
                                          0.69314718055994529);

   assert(lp_check_value(bld->type, x));

   return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
}
/**
 * Generate polynomial.
 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
 */
LLVMValueRef
lp_build_polynomial(struct lp_build_context *bld,
                    LLVMValueRef x,
                    const double *coeffs,
                    unsigned num_coeffs)
{
   const struct lp_type type = bld->type;
   LLVMValueRef even = NULL, odd = NULL;
   LLVMValueRef x2;
   unsigned i;

   assert(lp_check_value(bld->type, x));

   /* TODO: optimize the constant case */
   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
       LLVMIsConstant(x)) {
      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                   __FUNCTION__);
   }

   /*
    * Calculate odd and even terms separately to decrease data dependency
    * Ex:
    *     c[0] + x^2 * c[2] + x^4 * c[4] ...
    *     + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
    */
   x2 = lp_build_mul(bld, x, x);

   for (i = num_coeffs; i--; ) {
      LLVMValueRef coeff;

      coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);

      if (i % 2 == 0) {
         if (even)
            even = lp_build_add(bld, coeff, lp_build_mul(bld, x2, even));
         else
            even = coeff;
      }
      else {
         if (odd)
            odd = lp_build_add(bld, coeff, lp_build_mul(bld, x2, odd));
         else
            odd = coeff;
      }
   }

   if (odd)
      return lp_build_add(bld, lp_build_mul(bld, odd, x), even);
   else if (even)
      return even;
   else
      return bld->undef;
}
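
/*
 * Illustrative scalar model of the evaluation above (hypothetical helper):
 * two interleaved Horner chains over x^2, combined at the end, so the even
 * and odd partial sums can be computed independently.
 */
static INLINE double
example_polynomial_scalar(double x, const double *coeffs, unsigned num_coeffs)
{
   double x2 = x * x;
   double even = 0.0, odd = 0.0;
   int i;

   for (i = (int)num_coeffs - 1; i >= 0; i--) {
      if (i % 2 == 0)
         even = coeffs[i] + x2 * even;
      else
         odd = coeffs[i] + x2 * odd;
   }
   return even + x * odd;
}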
/**
 * Minimax polynomial fit of 2**x, in range [0, 1[
 */
const double lp_build_exp2_polynomial[] = {
#if EXP_POLY_DEGREE == 5
   1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
   0.693153073200168932794,
   0.240153617044375388211,
   0.0558263180532956664775,
   0.00898934009049466391101,
   0.00187757667519147912699
#elif EXP_POLY_DEGREE == 4
   1.00000259337069434683,
   0.693003834469974940458,
   0.24144275689150793076,
   0.0520114606103070150235,
   0.0135341679161270268764
#elif EXP_POLY_DEGREE == 3
   0.999925218562710312959,
   0.695833540494823811697,
   0.226067155427249155588,
   0.0780245226406372992967
#elif EXP_POLY_DEGREE == 2
   1.00172476321474503578,
   0.657636275736077639316,
   0.33718943461968720704
#else
#error
#endif
};
void
lp_build_exp2_approx(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef *p_exp2_int_part,
                     LLVMValueRef *p_frac_part,
                     LLVMValueRef *p_exp2)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMValueRef ipart = NULL;
   LLVMValueRef fpart = NULL;
   LLVMValueRef expipart = NULL;
   LLVMValueRef expfpart = NULL;
   LLVMValueRef res = NULL;

   assert(lp_check_value(bld->type, x));

   if(p_exp2_int_part || p_frac_part || p_exp2) {
      /* TODO: optimize the constant case */
      if (gallivm_debug & GALLIVM_DEBUG_PERF &&
          LLVMIsConstant(x)) {
         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                      __FUNCTION__);
      }

      assert(type.floating && type.width == 32);

      /* We want to preserve NaN and make sure that for exp2 if x > 128,
       * the result is INF and if it's smaller than -126.9 the result is 0 */
      x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
                           GALLIVM_NAN_RETURN_SECOND);
      x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999), x,
                           GALLIVM_NAN_RETURN_SECOND);

      /* ipart = floor(x) */
      /* fpart = x - ipart */
      lp_build_ifloor_fract(bld, x, &ipart, &fpart);
   }

   if(p_exp2_int_part || p_exp2) {
      /* expipart = (float) (1 << ipart) */
      expipart = LLVMBuildAdd(builder, ipart,
                              lp_build_const_int_vec(bld->gallivm, type, 127), "");
      expipart = LLVMBuildShl(builder, expipart,
                              lp_build_const_int_vec(bld->gallivm, type, 23), "");
      expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
   }

   if(p_exp2) {
      expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
                                     Elements(lp_build_exp2_polynomial));

      res = LLVMBuildFMul(builder, expipart, expfpart, "");
   }

   if(p_exp2_int_part)
      *p_exp2_int_part = expipart;

   if(p_frac_part)
      *p_frac_part = fpart;

   if(p_exp2)
      *p_exp2 = res;
}
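
/*
 * Illustrative scalar model of the integer-part trick above (hypothetical
 * helper): 2^ipart is formed directly in the exponent field of an IEEE-754
 * single, i.e. (ipart + 127) << 23 reinterpreted as a float. Valid for
 * ipart in [-126, 127], which the clamping above guarantees.
 */
static INLINE float
example_exp2_int_part_scalar(int ipart)
{
   union { int i; float f; } v;
   v.i = (ipart + 127) << 23;   /* biased exponent, zero mantissa */
   return v.f;                  /* == 2.0 raised to ipart */
}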
LLVMValueRef
lp_build_exp2(struct lp_build_context *bld,
              LLVMValueRef x)
{
   LLVMValueRef res;
   lp_build_exp2_approx(bld, x, NULL, NULL, &res);
   return res;
}
/**
 * Extract the exponent of an IEEE-754 floating point value.
 *
 * Optionally apply an integer bias.
 *
 * Result is an integer value with
 *
 *   ifloor(log2(x)) + bias
 */
LLVMValueRef
lp_build_extract_exponent(struct lp_build_context *bld,
                          LLVMValueRef x,
                          int bias)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   unsigned mantissa = lp_mantissa(type);
   LLVMValueRef res;

   assert(type.floating);

   assert(lp_check_value(bld->type, x));

   x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");

   res = LLVMBuildLShr(builder, x,
                       lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
   res = LLVMBuildAnd(builder, res,
                      lp_build_const_int_vec(bld->gallivm, type, 255), "");
   res = LLVMBuildSub(builder, res,
                      lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");

   return res;
}
/**
 * Extract the mantissa of a floating point value.
 *
 * Result is a floating point value in [1, 2), i.e.
 *
 *   x / 2**floor(log2(x))
 */
LLVMValueRef
lp_build_extract_mantissa(struct lp_build_context *bld,
                          LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   unsigned mantissa = lp_mantissa(type);
   LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
                                                  (1ULL << mantissa) - 1);
   LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
   LLVMValueRef res;

   assert(lp_check_value(bld->type, x));

   assert(type.floating);

   x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");

   /* res = x / 2**ipart */
   res = LLVMBuildAnd(builder, x, mantmask, "");
   res = LLVMBuildOr(builder, res, one, "");
   res = LLVMBuildBitCast(builder, res, bld->vec_type, "");

   return res;
}
/**
 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in range [0, 1/9[
 * These coefficients can be generated with
 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
 */
const double lp_build_log2_polynomial[] = {
#if LOG_POLY_DEGREE == 5
   2.88539008148777786488L,
   0.961796878841293367824L,
   0.577058946784739859012L,
   0.412914355135828735411L,
   0.308591899232910175289L,
   0.352376952300281371868L,
#elif LOG_POLY_DEGREE == 4
   2.88539009343309178325L,
   0.961791550404184197881L,
   0.577440339438736392009L,
   0.403343858251329912514L,
   0.406718052498846252698L,
#elif LOG_POLY_DEGREE == 3
   2.88538959748872753838L,
   0.961932915889597772928L,
   0.571118517972136195241L,
   0.493997535084709500285L,
#else
#error
#endif
};
/**
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 * http://en.wikipedia.org/wiki/Logarithm#Calculation
 * http://www.nezumi.demon.co.uk/consult/logx.htm
 *
 * If handle_edge_cases is true the function will perform computations
 * to match the required D3D10+ behavior for each of the edge cases.
 * That means that if input is:
 * - less than zero (to and including -inf) then NaN will be returned
 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
 * - +infinity, then +infinity will be returned
 * - NaN, then NaN will be returned
 *
 * Those checks are fairly expensive so if you don't need them make sure
 * handle_edge_cases is false.
 */
void
lp_build_log2_approx(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef *p_exp,
                     LLVMValueRef *p_floor_log2,
                     LLVMValueRef *p_log2,
                     boolean handle_edge_cases)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);

   LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
   LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
   LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);

   LLVMValueRef i = NULL;
   LLVMValueRef y = NULL;
   LLVMValueRef z = NULL;
   LLVMValueRef exp = NULL;
   LLVMValueRef mant = NULL;
   LLVMValueRef logexp = NULL;
   LLVMValueRef logmant = NULL;
   LLVMValueRef res = NULL;

   assert(lp_check_value(bld->type, x));

   if(p_exp || p_floor_log2 || p_log2) {
      /* TODO: optimize the constant case */
      if (gallivm_debug & GALLIVM_DEBUG_PERF &&
          LLVMIsConstant(x)) {
         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                      __FUNCTION__);
      }

      assert(type.floating && type.width == 32);

      /*
       * We don't explicitly handle denormalized numbers. They will yield a
       * result in the neighbourhood of -127, which appears to be adequate.
       */

      i = LLVMBuildBitCast(builder, x, int_vec_type, "");

      /* exp = (float) exponent(x) */
      exp = LLVMBuildAnd(builder, i, expmask, "");
   }

   if(p_floor_log2 || p_log2) {
      logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
      logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
      logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
   }

   if(p_log2) {
      /* mant = 1 + (float) mantissa(x) */
      mant = LLVMBuildAnd(builder, i, mantmask, "");
      mant = LLVMBuildOr(builder, mant, one, "");
      mant = LLVMBuildBitCast(builder, mant, vec_type, "");

      /* y = (mant - 1) / (mant + 1) */
      y = lp_build_div(bld,
                       lp_build_sub(bld, mant, bld->one),
                       lp_build_add(bld, mant, bld->one));

      /* z = y**2 */
      z = lp_build_mul(bld, y, y);

      /* compute P(z) */
      logmant = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
                                    Elements(lp_build_log2_polynomial));

      /* logmant = y * P(z) */
      logmant = lp_build_mul(bld, y, logmant);

      res = lp_build_add(bld, logmant, logexp);

      if (type.floating && handle_edge_cases) {
         LLVMValueRef negmask, infmask, zmask;
         negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
                                lp_build_const_vec(bld->gallivm, type, 0.0f));
         zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
                              lp_build_const_vec(bld->gallivm, type, 0.0f));
         infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
                                lp_build_const_vec(bld->gallivm, type, INFINITY));

         /* If x is equal to inf make sure we return inf */
         res = lp_build_select(bld, infmask,
                               lp_build_const_vec(bld->gallivm, type, INFINITY),
                               res);
         /* If x is equal to 0, return -inf */
         res = lp_build_select(bld, zmask,
                               lp_build_const_vec(bld->gallivm, type, -INFINITY),
                               res);
         /* If x is nan or less than 0, return nan */
         res = lp_build_select(bld, negmask,
                               lp_build_const_vec(bld->gallivm, type, NAN),
                               res);
      }
   }

   if(p_exp) {
      exp = LLVMBuildBitCast(builder, exp, vec_type, "");
      *p_exp = exp;
   }

   if(p_floor_log2)
      *p_floor_log2 = logexp;

   if(p_log2)
      *p_log2 = res;
}
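
/*
 * Illustrative scalar model of the mantissa part above (hypothetical
 * helper): with m in [1, 2), substituting y = (m - 1) / (m + 1) gives
 * log2(m) = y * P(y^2), with P the minimax polynomial from the table,
 * since (1 + y) / (1 - y) == m. The full result is then exponent +
 * log2(mantissa).
 */
static INLINE double
example_log2_mantissa_scalar(double m)
{
   double y = (m - 1.0) / (m + 1.0);
   double z = y * y;
   double p = 0.0;
   int i;

   /* plain Horner evaluation of P(z) */
   for (i = Elements(lp_build_log2_polynomial) - 1; i >= 0; i--)
      p = lp_build_log2_polynomial[i] + z * p;
   return y * p;
}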
/*
 * log2 implementation which doesn't have special code to
 * handle edge cases (-inf, 0, inf, NaN). It's faster but
 * the results for those cases are undefined.
 */
LLVMValueRef
lp_build_log2(struct lp_build_context *bld,
              LLVMValueRef x)
{
   LLVMValueRef res;
   lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
   return res;
}
/*
 * Version of log2 which handles all edge cases.
 * Look at documentation of lp_build_log2_approx for
 * description of the behavior for each of the edge cases.
 */
LLVMValueRef
lp_build_log2_safe(struct lp_build_context *bld,
                   LLVMValueRef x)
{
   LLVMValueRef res;
   lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
   return res;
}
/**
 * Faster (and less accurate) log2.
 *
 *    log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
 *
 * Piece-wise linear approximation, with exact results when x is a power of
 * two.
 *
 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
 */
LLVMValueRef
lp_build_fast_log2(struct lp_build_context *bld,
                   LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef ipart;
   LLVMValueRef fpart;

   assert(lp_check_value(bld->type, x));

   assert(bld->type.floating);

   /* ipart = floor(log2(x)) - 1 */
   ipart = lp_build_extract_exponent(bld, x, -1);
   ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");

   /* fpart = x / 2**ipart */
   fpart = lp_build_extract_mantissa(bld, x);

   /* ipart + fpart */
   return LLVMBuildFAdd(builder, ipart, fpart, "");
}
/**
 * Fast implementation of iround(log2(x)).
 *
 * Not an approximation -- it should give accurate results all the time.
 */
LLVMValueRef
lp_build_ilog2(struct lp_build_context *bld,
               LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
   LLVMValueRef ipart;

   assert(bld->type.floating);

   assert(lp_check_value(bld->type, x));

   /* x * 2^(0.5)   i.e., add 0.5 to the log2(x) */
   x = LLVMBuildFMul(builder, x, sqrt2, "");

   /* ipart = floor(log2(x) + 0.5)  */
   ipart = lp_build_extract_exponent(bld, x, 0);

   return ipart;
}
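
/*
 * Worked example of the trick above: for x = 12, log2(12) ~= 3.585, so
 * iround should give 4. Multiplying by sqrt(2) first gives 16.97, whose
 * exponent field yields floor(log2(16.97)) = 4, i.e. floor(log2(x) + 0.5).
 */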
LLVMValueRef
lp_build_mod(struct lp_build_context *bld,
             LLVMValueRef x,
             LLVMValueRef y)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef res;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, x));
   assert(lp_check_value(type, y));

   if (type.floating)
      res = LLVMBuildFRem(builder, x, y, "");
   else if (type.sign)
      res = LLVMBuildSRem(builder, x, y, "");
   else
      res = LLVMBuildURem(builder, x, y, "");
   return res;
}
/*
 * For floating inputs it creates and returns a mask
 * which is all 1's for channels which are NaN.
 * Channels inside x which are not NaN will be 0.
 */
LLVMValueRef
lp_build_isnan(struct lp_build_context *bld,
               LLVMValueRef x)
{
   LLVMValueRef mask;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);

   assert(bld->type.floating);
   assert(lp_check_value(bld->type, x));

   mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
                        "isnotnan");
   mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
   mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
   return mask;
}
/* Returns all 1's for floating point numbers that are
 * finite numbers and returns all zeros for -inf,
 * inf and nan's */
LLVMValueRef
lp_build_isfinite(struct lp_build_context *bld,
                  LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
   struct lp_type int_type = lp_int_type(bld->type);
   LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
   LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
                                                    0x7f800000);

   if (!bld->type.floating) {
      return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
   }
   assert(bld->type.floating);
   assert(lp_check_value(bld->type, x));
   assert(bld->type.width == 32);

   intx = LLVMBuildAnd(builder, intx, infornan32, "");
   return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
                           intx, infornan32);
}