/**************************************************************************
 *
 * Copyright 2009-2010 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/
/**
 * @file
 *
 * LLVM IR doesn't support all basic arithmetic operations we care about (most
 * notably min/max and saturated operations), and it is often necessary to
 * resort to machine-specific intrinsics directly. The functions here hide all
 * these implementation details from the other modules.
 *
 * We also do simple expression simplification here. Reasons are:
 * - it is very easy given we have all necessary information readily available
 * - LLVM optimization passes fail to simplify several vector expressions
 * - We often know value constraints which the optimization passes have no way
 *   of knowing, such as when source arguments are known to be in [0, 1] range.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */
#include <float.h>

#include "util/u_memory.h"
#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/u_string.h"
#include "util/u_cpu_detect.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_init.h"
#include "lp_bld_intr.h"
#include "lp_bld_logic.h"
#include "lp_bld_pack.h"
#include "lp_bld_debug.h"
#include "lp_bld_bitarit.h"
#include "lp_bld_arit.h"
#include "lp_bld_flow.h"


#define EXP_POLY_DEGREE 5

#define LOG_POLY_DEGREE 4
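/*
 * EXP_POLY_DEGREE/LOG_POLY_DEGREE above set how many terms the polynomial
 * approximations in the exp2/log2 helpers further down this file get:
 * a higher degree buys precision at the cost of a few extra multiply-adds
 * per call.
 */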
/**
 * Generate min(a, b)
 * No checks for special case values of a or b = 1 or 0 are done.
 */
static LLVMValueRef
lp_build_min_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b)
{
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   unsigned intr_size = 0;
   LLVMValueRef cond;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* TODO: optimize the constant case */

   if (type.floating && util_cpu_caps.has_sse) {
      if (type.width == 32) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse.min.ss";
            intr_size = 128;
         }
         else if (type.length <= 4 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse.min.ps";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.min.ps.256";
            intr_size = 256;
         }
      }
      if (type.width == 64 && util_cpu_caps.has_sse2) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse2.min.sd";
            intr_size = 128;
         }
         else if (type.length == 2 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse2.min.pd";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.min.pd.256";
            intr_size = 256;
         }
      }
   }
   else if (type.floating && util_cpu_caps.has_altivec) {
      if (type.width == 32 && type.length == 4) {
         intrinsic = "llvm.ppc.altivec.vminfp";
         intr_size = 128;
      }
   } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
      intr_size = 128;
      if ((type.width == 8 || type.width == 16) &&
          (type.width * type.length <= 64) &&
          (gallivm_debug & GALLIVM_DEBUG_PERF)) {
         debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
                      __FUNCTION__);
      }
      if (type.width == 8 && !type.sign) {
         intrinsic = "llvm.x86.sse2.pminu.b";
      }
      else if (type.width == 16 && type.sign) {
         intrinsic = "llvm.x86.sse2.pmins.w";
      }
      if (util_cpu_caps.has_sse4_1) {
         if (type.width == 8 && type.sign) {
            intrinsic = "llvm.x86.sse41.pminsb";
         }
         if (type.width == 16 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pminuw";
         }
         if (type.width == 32 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pminud";
         }
         if (type.width == 32 && type.sign) {
            intrinsic = "llvm.x86.sse41.pminsd";
         }
      }
   } else if (util_cpu_caps.has_altivec) {
      intr_size = 128;
      if (type.width == 8) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vminub";
         } else {
            intrinsic = "llvm.ppc.altivec.vminsb";
         }
      } else if (type.width == 16) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vminuh";
         } else {
            intrinsic = "llvm.ppc.altivec.vminsh";
         }
      } else if (type.width == 32) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vminuw";
         } else {
            intrinsic = "llvm.ppc.altivec.vminsw";
         }
      }
   }

   if (intrinsic) {
      return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                 type,
                                                 intr_size, a, b);
   }

   cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
   return lp_build_select(bld, cond, a, b);
}
/**
 * Generate max(a, b)
 * No checks for special case values of a or b = 1 or 0 are done.
 */
static LLVMValueRef
lp_build_max_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b)
{
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   unsigned intr_size = 0;
   LLVMValueRef cond;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* TODO: optimize the constant case */

   if (type.floating && util_cpu_caps.has_sse) {
      if (type.width == 32) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse.max.ss";
            intr_size = 128;
         }
         else if (type.length <= 4 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse.max.ps";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.max.ps.256";
            intr_size = 256;
         }
      }
      if (type.width == 64 && util_cpu_caps.has_sse2) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse2.max.sd";
            intr_size = 128;
         }
         else if (type.length == 2 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse2.max.pd";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.max.pd.256";
            intr_size = 256;
         }
      }
   }
   else if (type.floating && util_cpu_caps.has_altivec) {
      if (type.width == 32 && type.length == 4) {
         intrinsic = "llvm.ppc.altivec.vmaxfp";
         intr_size = 128;
      }
   } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
      intr_size = 128;
      if ((type.width == 8 || type.width == 16) &&
          (type.width * type.length <= 64) &&
          (gallivm_debug & GALLIVM_DEBUG_PERF)) {
         debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
                      __FUNCTION__);
      }
      if (type.width == 8 && !type.sign) {
         intrinsic = "llvm.x86.sse2.pmaxu.b";
      }
      else if (type.width == 16 && type.sign) {
         intrinsic = "llvm.x86.sse2.pmaxs.w";
      }
      if (util_cpu_caps.has_sse4_1) {
         if (type.width == 8 && type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxsb";
         }
         if (type.width == 16 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxuw";
         }
         if (type.width == 32 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxud";
         }
         if (type.width == 32 && type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxsd";
         }
      }
   } else if (util_cpu_caps.has_altivec) {
      intr_size = 128;
      if (type.width == 8) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vmaxub";
         } else {
            intrinsic = "llvm.ppc.altivec.vmaxsb";
         }
      } else if (type.width == 16) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vmaxuh";
         } else {
            intrinsic = "llvm.ppc.altivec.vmaxsh";
         }
      } else if (type.width == 32) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vmaxuw";
         } else {
            intrinsic = "llvm.ppc.altivec.vmaxsw";
         }
      }
   }

   if (intrinsic) {
      return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                 type,
                                                 intr_size, a, b);
   }

   cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
   return lp_build_select(bld, cond, a, b);
}
/**
 * Generate 1 - a, or ~a depending on bld->type.
 */
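/*
 * Note on the unsigned-norm fast path below: for an unsigned normalized type
 * 1.0 is the all-ones bit pattern, so 1.0 - a and ~a compute the same value
 * and the bitwise not is cheaper.
 */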
LLVMValueRef
lp_build_comp(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   if (a == bld->one)
      return bld->zero;
   if (a == bld->zero)
      return bld->one;

   if (type.norm && !type.floating && !type.fixed && !type.sign) {
      if (LLVMIsConstant(a))
         return LLVMConstNot(a);
      else
         return LLVMBuildNot(builder, a, "");
   }

   if (LLVMIsConstant(a))
      if (type.floating)
         return LLVMConstFSub(bld->one, a);
      else
         return LLVMConstSub(bld->one, a);
   else
      if (type.floating)
         return LLVMBuildFSub(builder, bld->one, a, "");
      else
         return LLVMBuildSub(builder, bld->one, a, "");
}
/**
 * Generate a + b
 */
LLVMValueRef
lp_build_add(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if (a == bld->zero)
      return b;
   if (b == bld->zero)
      return a;
   if (a == bld->undef || b == bld->undef)
      return bld->undef;

   if (bld->type.norm) {
      const char *intrinsic = NULL;

      if (a == bld->one || b == bld->one)
        return bld->one;

      if (type.width * type.length == 128 &&
          !type.floating && !type.fixed) {
         if (util_cpu_caps.has_sse2) {
            if (type.width == 8)
               intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
            if (type.width == 16)
               intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
         } else if (util_cpu_caps.has_altivec) {
            if (type.width == 8)
               intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
            if (type.width == 16)
               intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
         }
      }

      if (intrinsic)
         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
   }

   /* TODO: handle signed case */
   if (type.norm && !type.floating && !type.fixed && !type.sign)
      a = lp_build_min_simple(bld, a, lp_build_comp(bld, b));

   if (LLVMIsConstant(a) && LLVMIsConstant(b))
      if (type.floating)
         res = LLVMConstFAdd(a, b);
      else
         res = LLVMConstAdd(a, b);
   else
      if (type.floating)
         res = LLVMBuildFAdd(builder, a, b, "");
      else
         res = LLVMBuildAdd(builder, a, b, "");

   /* clamp to ceiling of 1.0 */
   if (bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_min_simple(bld, res, bld->one);

   /* XXX clamp to floor of -1 or 0??? */

   return res;
}
/** Return the scalar sum of the elements of a.
 * Should avoid this operation whenever possible.
 */
LLVMValueRef
lp_build_horizontal_add(struct lp_build_context *bld,
                        LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef index, res;
   unsigned i, length;
   LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
   LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
   LLVMValueRef vecres, elem2;

   assert(lp_check_value(type, a));

   if (type.length == 1) {
      return a;
   }

   assert(!bld->type.norm);

   /*
    * XXX: for byte vectors can do much better with psadbw.
    * Using repeated shuffle/adds here. Note with multiple vectors
    * this can be done more efficiently as outlined in the intel
    * optimization manual.
    * Note: could cause data rearrangement if used with smaller element
    * sizes.
    */

   vecres = a;
   length = type.length / 2;

   while (length > 1) {
      LLVMValueRef vec1, vec2;
      for (i = 0; i < length; i++) {
         shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
         shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
      }
      vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
                                    LLVMConstVector(shuffles1, length), "");
      vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
                                    LLVMConstVector(shuffles2, length), "");
      if (type.floating)
         vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
      else
         vecres = LLVMBuildAdd(builder, vec1, vec2, "");
      length = length >> 1;
   }

   /* always have vector of size 2 here */
   assert(length == 1);

   index = lp_build_const_int32(bld->gallivm, 0);
   res = LLVMBuildExtractElement(builder, vecres, index, "");
   index = lp_build_const_int32(bld->gallivm, 1);
   elem2 = LLVMBuildExtractElement(builder, vecres, index, "");

   if (type.floating)
      res = LLVMBuildFAdd(builder, res, elem2, "");
   else
      res = LLVMBuildAdd(builder, res, elem2, "");

   return res;
}
/**
 * Return the horizontal sums of 4 float vectors as a float4 vector.
 * This uses the technique as outlined in Intel Optimization Manual.
 */
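/*
 * Sketch of the data movement below, writing src[0] = x0 x1 x2 x3,
 * src[1] = y0 y1 y2 y3, src[2] = z0 z1 z2 z3, src[3] = w0 w1 w2 w3:
 *
 *   tmp[0] = x0 x1 y0 y1        tmp[1] = x2 x3 y2 y3
 *   tmp[2] = z0 z1 w0 w1        tmp[3] = z2 z3 w2 w3
 *   sumtmp[0] = x0+x2 x1+x3 y0+y2 y1+y3
 *   sumtmp[1] = z0+z2 z1+z3 w0+w2 w1+w3
 *
 * The final pair of shuffles picks the even/odd lanes of sumtmp so the
 * last add produces (sum x, sum y, sum z, sum w).
 */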
static LLVMValueRef
lp_build_horizontal_add4x4f(struct lp_build_context *bld,
                            LLVMValueRef src[4])
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef shuffles[4];
   LLVMValueRef tmp[4];
   LLVMValueRef sumtmp[2], shuftmp[2];

   /* lower half of regs */
   shuffles[0] = lp_build_const_int32(gallivm, 0);
   shuffles[1] = lp_build_const_int32(gallivm, 1);
   shuffles[2] = lp_build_const_int32(gallivm, 4);
   shuffles[3] = lp_build_const_int32(gallivm, 5);
   tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
                                   LLVMConstVector(shuffles, 4), "");
   tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
                                   LLVMConstVector(shuffles, 4), "");

   /* upper half of regs */
   shuffles[0] = lp_build_const_int32(gallivm, 2);
   shuffles[1] = lp_build_const_int32(gallivm, 3);
   shuffles[2] = lp_build_const_int32(gallivm, 6);
   shuffles[3] = lp_build_const_int32(gallivm, 7);
   tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
                                   LLVMConstVector(shuffles, 4), "");
   tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
                                   LLVMConstVector(shuffles, 4), "");

   sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
   sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");

   shuffles[0] = lp_build_const_int32(gallivm, 0);
   shuffles[1] = lp_build_const_int32(gallivm, 2);
   shuffles[2] = lp_build_const_int32(gallivm, 4);
   shuffles[3] = lp_build_const_int32(gallivm, 6);
   shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
                                       LLVMConstVector(shuffles, 4), "");

   shuffles[0] = lp_build_const_int32(gallivm, 1);
   shuffles[1] = lp_build_const_int32(gallivm, 3);
   shuffles[2] = lp_build_const_int32(gallivm, 5);
   shuffles[3] = lp_build_const_int32(gallivm, 7);
   shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
                                       LLVMConstVector(shuffles, 4), "");

   return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
}
/*
 * partially horizontally add 2-4 float vectors with length nx4,
 * i.e. only four adjacent values in each vector will be added,
 * assuming values are really grouped in 4 which also determines
 * the number of output values.
 *
 * Return a vector of the same length as the initial vectors,
 * with the excess elements (if any) being undefined.
 * The element order is independent of number of input vectors.
 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
 * the output order thus will be
 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
 */
LLVMValueRef
lp_build_hadd_partial4(struct lp_build_context *bld,
                       LLVMValueRef vectors[],
                       unsigned num_vecs)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef ret_vec;
   LLVMValueRef tmp[4];
   const char *intrinsic = NULL;

   assert(num_vecs >= 2 && num_vecs <= 4);
   assert(bld->type.floating);

   /* only use this with at least 2 vectors, as it is sort of expensive
    * (depending on cpu) and we always need two horizontal adds anyway,
    * so a shuffle/add approach might be better.
    */

   tmp[0] = vectors[0];
   tmp[1] = vectors[1];
   tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
   tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];

   if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
       bld->type.length == 4) {
      intrinsic = "llvm.x86.sse3.hadd.ps";
   }
   else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
            bld->type.length == 8) {
      intrinsic = "llvm.x86.avx.hadd.ps.256";
   }
   if (intrinsic) {
      tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
                                         lp_build_vec_type(gallivm, bld->type),
                                         tmp[0], tmp[1]);
      if (num_vecs > 2) {
         tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
                                            lp_build_vec_type(gallivm, bld->type),
                                            tmp[2], tmp[3]);
      }
      else {
         tmp[1] = tmp[0];
      }
      return lp_build_intrinsic_binary(builder, intrinsic,
                                       lp_build_vec_type(gallivm, bld->type),
                                       tmp[0], tmp[1]);
   }

   if (bld->type.length == 4) {
      ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
   }
   else {
      LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
      unsigned j;
      unsigned num_iter = bld->type.length / 4;
      struct lp_type parttype = bld->type;
      parttype.length = 4;
      for (j = 0; j < num_iter; j++) {
         LLVMValueRef partsrc[4];
         unsigned i;
         for (i = 0; i < 4; i++) {
            partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
         }
         partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
      }
      ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
   }
   return ret_vec;
}
/**
 * Generate a - b
 */
LLVMValueRef
lp_build_sub(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if (b == bld->zero)
      return a;
   if (a == bld->undef || b == bld->undef)
      return bld->undef;
   if (a == b)
      return bld->zero;

   if (bld->type.norm) {
      const char *intrinsic = NULL;

      if (b == bld->one)
        return bld->zero;

      if (type.width * type.length == 128 &&
          !type.floating && !type.fixed) {
         if (util_cpu_caps.has_sse2) {
            if (type.width == 8)
               intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
            if (type.width == 16)
               intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
         } else if (util_cpu_caps.has_altivec) {
            if (type.width == 8)
               intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
            if (type.width == 16)
               intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
         }
      }

      if (intrinsic)
         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
   }

   /* TODO: handle signed case */
   if (type.norm && !type.floating && !type.fixed && !type.sign)
      a = lp_build_max_simple(bld, a, b);

   if (LLVMIsConstant(a) && LLVMIsConstant(b))
      if (type.floating)
         res = LLVMConstFSub(a, b);
      else
         res = LLVMConstSub(a, b);
   else
      if (type.floating)
         res = LLVMBuildFSub(builder, a, b, "");
      else
         res = LLVMBuildSub(builder, a, b, "");

   /* clamp to floor of 0 */
   if (bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_max_simple(bld, res, bld->zero);

   return res;
}
/**
 * Normalized multiplication.
 *
 * There are several approaches for (using 8-bit normalized multiplication as
 * an example):
 *
 * - alpha plus one
 *
 *     makes the following approximation to the division (Sree)
 *
 *       a*b/255 ~= (a*(b + 1)) >> 8
 *
 *     which is the fastest method that satisfies the following OpenGL
 *     criteria of
 *
 *       0*0 = 0 and 255*255 = 255
 *
 * - geometric series
 *
 *     takes the geometric series approximation to the division
 *
 *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
 *
 *     in this case just the first two terms to fit in 16bit arithmetic
 *
 *       t/255 ~= (t + (t >> 8)) >> 8
 *
 *     note that just by itself it doesn't satisfy the OpenGL criteria, as
 *     255*255 = 254, so the special case b = 255 must be accounted for or
 *     roundoff must be used
 *
 * - geometric series plus rounding
 *
 *     when using a geometric series division instead of truncating the result
 *     use roundoff in the approximation (Jim Blinn)
 *
 *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
 *
 *     achieving the exact results.
 *
 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
 *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
 * @sa Michael Herf, The "double blend trick", May 2000,
 *     http://www.stereopsis.com/doubleblend.html
 */
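/*
 * Worked example of the variants above for a = b = 255 (t = a*b = 65025):
 *
 *   alpha plus one:      (255*(255 + 1)) >> 8               = 65280 >> 8 = 255
 *   geometric series:    (65025 + (65025 >> 8)) >> 8        = 65279 >> 8 = 254
 *   series + rounding:   (65025 + (65025 >> 8) + 0x80) >> 8 = 65407 >> 8 = 255
 *
 * which is why the rounding variant (the one implemented below) can be exact
 * while the plain series needs the b = 255 special case.
 */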
LLVMValueRef
lp_build_mul_norm(struct gallivm_state *gallivm,
                  struct lp_type wide_type,
                  LLVMValueRef a, LLVMValueRef b)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_build_context bld;
   unsigned n;
   LLVMValueRef half;
   LLVMValueRef ab;

   assert(!wide_type.floating);
   assert(lp_check_value(wide_type, a));
   assert(lp_check_value(wide_type, b));

   lp_build_context_init(&bld, gallivm, wide_type);

   n = wide_type.width / 2;
   if (wide_type.sign) {
      --n;
   }

   /*
    * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
    * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
    */

   /*
    * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
    */

   ab = LLVMBuildMul(builder, a, b, "");
   ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");

   /*
    * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
    */

   half = lp_build_const_int_vec(gallivm, wide_type, 1 << (n - 1));
   if (wide_type.sign) {
      LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
      LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
      half = lp_build_select(&bld, sign, minus_half, half);
   }
   ab = LLVMBuildAdd(builder, ab, half, "");

   /* Final division */
   ab = lp_build_shr_imm(&bld, ab, n);

   return ab;
}
/**
 * Generate a * b
 */
LLVMValueRef
lp_build_mul(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef shift;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if (a == bld->zero)
      return bld->zero;
   if (a == bld->one)
      return b;
   if (b == bld->zero)
      return bld->zero;
   if (b == bld->one)
      return a;
   if (a == bld->undef || b == bld->undef)
      return bld->undef;

   if (!type.floating && !type.fixed && type.norm) {
      struct lp_type wide_type = lp_wider_type(type);
      LLVMValueRef al, ah, bl, bh, abl, abh, ab;

      lp_build_unpack2(bld->gallivm, type, wide_type, a, &al, &ah);
      lp_build_unpack2(bld->gallivm, type, wide_type, b, &bl, &bh);

      /* PMULLW, PSRLW, PADDW */
      abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
      abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);

      ab = lp_build_pack2(bld->gallivm, wide_type, type, abl, abh);

      return ab;
   }

   if (type.fixed)
      shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
   else
      shift = NULL;

   if (LLVMIsConstant(a) && LLVMIsConstant(b)) {
      if (type.floating)
         res = LLVMConstFMul(a, b);
      else
         res = LLVMConstMul(a, b);
      if (shift) {
         if (type.sign)
            res = LLVMConstAShr(res, shift);
         else
            res = LLVMConstLShr(res, shift);
      }
   }
   else {
      if (type.floating)
         res = LLVMBuildFMul(builder, a, b, "");
      else
         res = LLVMBuildMul(builder, a, b, "");
      if (shift) {
         if (type.sign)
            res = LLVMBuildAShr(builder, res, shift, "");
         else
            res = LLVMBuildLShr(builder, res, shift, "");
      }
   }

   return res;
}
/**
 * Small vector x scale multiplication optimization.
 */
LLVMValueRef
lp_build_mul_imm(struct lp_build_context *bld,
                 LLVMValueRef a,
                 int b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef factor;

   assert(lp_check_value(bld->type, a));

   if (b == 0)
      return bld->zero;

   if (b == 1)
      return a;

   if (b == -1)
      return lp_build_negate(bld, a);

   if (b == 2 && bld->type.floating)
      return lp_build_add(bld, a, a);

   if (util_is_power_of_two(b)) {
      unsigned shift = ffs(b) - 1;

      if (bld->type.floating) {
#if 0
         /*
          * Power of two multiplication by directly manipulating the exponent.
          *
          * XXX: This might not be always faster, it will introduce a small error
          * for multiplication by zero, and it will produce wrong results
          * for Inf and NaN.
          */
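         /*
          * Sketch of why this works: an IEEE float is (sign | exp | mantissa)
          * with value m * 2^(exp - bias), so adding (shift << mantissa_bits)
          * to the bit pattern bumps the exponent field by 'shift', i.e.
          * multiplies the value by 2^shift -- provided the input is a normal,
          * nonzero, finite number.
          */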
         unsigned mantissa = lp_mantissa(bld->type);
         factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
         a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->gallivm, bld->type), "");
         a = LLVMBuildAdd(builder, a, factor, "");
         a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
         return a;
#endif
      }
      else {
         factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
         return LLVMBuildShl(builder, a, factor, "");
      }
   }

   factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
   return lp_build_mul(bld, a, factor);
}
/**
 * Generate a / b
 */
LLVMValueRef
lp_build_div(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if (a == bld->zero)
      return bld->zero;
   if (a == bld->one)
      return lp_build_rcp(bld, b);
   if (b == bld->zero)
      return bld->undef;
   if (b == bld->one)
      return a;
   if (a == bld->undef || b == bld->undef)
      return bld->undef;

   if (LLVMIsConstant(a) && LLVMIsConstant(b)) {
      if (type.floating)
         return LLVMConstFDiv(a, b);
      else if (type.sign)
         return LLVMConstSDiv(a, b);
      else
         return LLVMConstUDiv(a, b);
   }

   if (((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
        (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
       type.floating)
      return lp_build_mul(bld, a, lp_build_rcp(bld, b));

   if (type.floating)
      return LLVMBuildFDiv(builder, a, b, "");
   else if (type.sign)
      return LLVMBuildSDiv(builder, a, b, "");
   else
      return LLVMBuildUDiv(builder, a, b, "");
}
/**
 * Linear interpolation helper.
 *
 * @param normalized whether we are interpolating normalized values,
 *        encoded in normalized integers, twice as wide.
 *
 * @sa http://www.stereopsis.com/doubleblend.html
 */
static INLINE LLVMValueRef
lp_build_lerp_simple(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef v0,
                     LLVMValueRef v1,
                     unsigned flags)
{
   unsigned half_width = bld->type.width/2;
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef delta;
   LLVMValueRef res;

   assert(lp_check_value(bld->type, x));
   assert(lp_check_value(bld->type, v0));
   assert(lp_check_value(bld->type, v1));

   delta = lp_build_sub(bld, v1, v0);

   if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
      if (!bld->type.sign) {
         if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
            /*
             * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
             * most-significant-bit to the lowest-significant-bit, so that
             * later we can just divide by 2**n instead of 2**n - 1.
             */
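            /*
             * Numeric sketch with n = 8 (8-bit weights widened to 16 bits):
             * x = 255 becomes 255 + (255 >> 7) = 256, so the shift by n below
             * yields delta exactly, while x = 0 stays 0; intermediate weights
             * stay within roughly one LSB of the exact x/255 scale.
             */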
            x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
         }

         /* (x * delta) >> n */
         res = lp_build_mul(bld, x, delta);
         res = lp_build_shr_imm(bld, res, half_width);
      } else {
         /*
          * The rescaling trick above doesn't work for signed numbers, so
          * use the 2**n - 1 division approximation in lp_build_mul_norm
          * instead.
          */
         assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
         res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
      }
   } else {
      assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
      res = lp_build_mul(bld, x, delta);
   }

   res = lp_build_add(bld, v0, res);

   if (((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) ||
       bld->type.fixed) {
      /* We need to mask out the high order bits when lerping 8bit normalized colors stored on 16bits */
      /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
       * but it will be wrong for true fixed point use cases. Basically we need
       * a more powerful lp_type, capable of further distinguishing the values
       * interpretation from the value storage. */
      res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1), "");
   }

   return res;
}
/**
 * Linear interpolation.
 */
LLVMValueRef
lp_build_lerp(struct lp_build_context *bld,
              LLVMValueRef x,
              LLVMValueRef v0,
              LLVMValueRef v1,
              unsigned flags)
{
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, x));
   assert(lp_check_value(type, v0));
   assert(lp_check_value(type, v1));

   assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));

   if (type.norm) {
      struct lp_type wide_type;
      struct lp_build_context wide_bld;
      LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;

      assert(type.length >= 2);

      /*
       * Create a wider integer type, enough to hold the
       * intermediate result of the multiplication.
       */
      memset(&wide_type, 0, sizeof wide_type);
      wide_type.sign   = type.sign;
      wide_type.width  = type.width*2;
      wide_type.length = type.length/2;

      lp_build_context_init(&wide_bld, bld->gallivm, wide_type);

      lp_build_unpack2(bld->gallivm, type, wide_type, x,  &xl,  &xh);
      lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
      lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);

      /*
       * Lerp both halves.
       */

      flags |= LP_BLD_LERP_WIDE_NORMALIZED;

      resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
      resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);

      res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
   }
   else {
      res = lp_build_lerp_simple(bld, x, v0, v1, flags);
   }

   return res;
}
/**
 * Bilinear interpolation.
 *
 * Value indices are in v_{yx}.
 */
LLVMValueRef
lp_build_lerp_2d(struct lp_build_context *bld,
                 LLVMValueRef x,
                 LLVMValueRef y,
                 LLVMValueRef v00,
                 LLVMValueRef v01,
                 LLVMValueRef v10,
                 LLVMValueRef v11,
                 unsigned flags)
{
   LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
   LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
   return lp_build_lerp(bld, y, v0, v1, flags);
}


LLVMValueRef
lp_build_lerp_3d(struct lp_build_context *bld,
                 LLVMValueRef x,
                 LLVMValueRef y,
                 LLVMValueRef z,
                 LLVMValueRef v000,
                 LLVMValueRef v001,
                 LLVMValueRef v010,
                 LLVMValueRef v011,
                 LLVMValueRef v100,
                 LLVMValueRef v101,
                 LLVMValueRef v110,
                 LLVMValueRef v111,
                 unsigned flags)
{
   LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
   LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
   return lp_build_lerp(bld, z, v0, v1, flags);
}
/**
 * Generate min(a, b)
 * Do checks for special cases.
 */
LLVMValueRef
lp_build_min(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if (a == bld->undef || b == bld->undef)
      return bld->undef;

   if (a == b)
      return a;

   if (bld->type.norm) {
      if (!bld->type.sign) {
         if (a == bld->zero || b == bld->zero) {
            return bld->zero;
         }
      }
      if (a == bld->one)
         return b;
      if (b == bld->one)
         return a;
   }

   return lp_build_min_simple(bld, a, b);
}
/**
 * Generate max(a, b)
 * Do checks for special cases.
 */
LLVMValueRef
lp_build_max(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if (a == bld->undef || b == bld->undef)
      return bld->undef;

   if (a == b)
      return a;

   if (bld->type.norm) {
      if (a == bld->one || b == bld->one)
         return bld->one;
      if (!bld->type.sign) {
         if (a == bld->zero) {
            return b;
         }
         if (b == bld->zero) {
            return a;
         }
      }
   }

   return lp_build_max_simple(bld, a, b);
}
/**
 * Generate clamp(a, min, max)
 * Do checks for special cases.
 */
LLVMValueRef
lp_build_clamp(struct lp_build_context *bld,
               LLVMValueRef a,
               LLVMValueRef min,
               LLVMValueRef max)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, min));
   assert(lp_check_value(bld->type, max));

   a = lp_build_min(bld, a, max);
   a = lp_build_max(bld, a, min);
   return a;
}
/**
 * Generate abs(a)
 */
LLVMValueRef
lp_build_abs(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);

   assert(lp_check_value(type, a));

   if (!type.sign)
      return a;

   if (type.floating) {
      /* Mask out the sign bit */
      LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
      unsigned long long absMask = ~(1ULL << (type.width - 1));
      LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
      a = LLVMBuildBitCast(builder, a, int_vec_type, "");
      a = LLVMBuildAnd(builder, a, mask, "");
      a = LLVMBuildBitCast(builder, a, vec_type, "");
      return a;
   }

   if (type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
      switch(type.width) {
      case 8:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
      case 16:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
      case 32:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
      }
   }
   else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
            (gallivm_debug & GALLIVM_DEBUG_PERF) &&
            (type.width == 8 || type.width == 16 || type.width == 32)) {
      debug_printf("%s: inefficient code, should split vectors manually\n",
                   __FUNCTION__);
   }

   return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
}
LLVMValueRef
lp_build_negate(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;

   assert(lp_check_value(bld->type, a));

#if HAVE_LLVM >= 0x0207
   if (bld->type.floating)
      a = LLVMBuildFNeg(builder, a, "");
   else
#endif
      a = LLVMBuildNeg(builder, a, "");

   return a;
}
/** Return -1, 0 or +1 depending on the sign of a */
LLVMValueRef
lp_build_sgn(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef cond;
   LLVMValueRef res;

   assert(lp_check_value(type, a));

   /* Handle non-zero case */
   if (!type.sign) {
      /* if not zero then sign must be positive */
      res = bld->one;
   }
   else if (type.floating) {
      LLVMTypeRef vec_type;
      LLVMTypeRef int_type;
      LLVMValueRef mask;
      LLVMValueRef sign;
      LLVMValueRef one;
      unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);

      int_type = lp_build_int_vec_type(bld->gallivm, type);
      vec_type = lp_build_vec_type(bld->gallivm, type);
      mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);

      /* Take the sign bit and add it to 1 constant */
      sign = LLVMBuildBitCast(builder, a, int_type, "");
      sign = LLVMBuildAnd(builder, sign, mask, "");
      one = LLVMConstBitCast(bld->one, int_type);
      res = LLVMBuildOr(builder, sign, one, "");
      res = LLVMBuildBitCast(builder, res, vec_type, "");
   }
   else
   {
      /* signed int/norm/fixed point */
      /* could use psign with sse3 and appropriate vectors here */
      LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
      cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
      res = lp_build_select(bld, cond, bld->one, minus_one);
   }

   /* Handle zero */
   cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
   res = lp_build_select(bld, cond, bld->zero, res);

   return res;
}
/**
 * Set the sign of float vector 'a' according to 'sign'.
 * If sign==0, return abs(a).
 * If sign==1, return -abs(a).
 * Other values for sign produce undefined results.
 */
LLVMValueRef
lp_build_set_sign(struct lp_build_context *bld,
                  LLVMValueRef a, LLVMValueRef sign)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
   LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
                             ~((unsigned long long) 1 << (type.width - 1)));
   LLVMValueRef val, res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   /* val = reinterpret_cast<int>(a) */
   val = LLVMBuildBitCast(builder, a, int_vec_type, "");
   /* val = val & mask */
   val = LLVMBuildAnd(builder, val, mask, "");
   /* sign = sign << shift */
   sign = LLVMBuildShl(builder, sign, shift, "");
   /* res = val | sign */
   res = LLVMBuildOr(builder, val, sign, "");
   /* res = reinterpret_cast<float>(res) */
   res = LLVMBuildBitCast(builder, res, vec_type, "");

   return res;
}
/**
 * Convert vector of (or scalar) int to vector of (or scalar) float.
 */
LLVMValueRef
lp_build_int_to_float(struct lp_build_context *bld,
                      LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);

   assert(type.floating);

   return LLVMBuildSIToFP(builder, a, vec_type, "");
}
static boolean
arch_rounding_available(const struct lp_type type)
{
   if ((util_cpu_caps.has_sse4_1 &&
       (type.length == 1 || type.width*type.length == 128)) ||
       (util_cpu_caps.has_avx && type.width*type.length == 256))
      return TRUE;
   else if ((util_cpu_caps.has_altivec &&
            (type.width == 32 && type.length == 4)))
      return TRUE;

   return FALSE;
}
enum lp_build_round_mode
{
   LP_BUILD_ROUND_NEAREST = 0,
   LP_BUILD_ROUND_FLOOR = 1,
   LP_BUILD_ROUND_CEIL = 2,
   LP_BUILD_ROUND_TRUNCATE = 3
};
/**
 * Helper for SSE4.1's ROUNDxx instructions.
 *
 * NOTE: In the SSE4.1's nearest mode, if two values are equally close, the
 * result is the even value.  That is, rounding 2.5 will be 2.0, and not 3.0.
 */
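/*
 * Examples of nearest-even vs. round-half-away-from-zero (the latter is what
 * the non-SSE4.1 iround fallback further below effectively computes by adding
 * a sign-copied 0.5 and truncating):
 *   0.5 -> 0.0 vs 1.0;  1.5 -> 2.0 vs 2.0;  2.5 -> 2.0 vs 3.0.
 */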
static INLINE LLVMValueRef
lp_build_round_sse41(struct lp_build_context *bld,
                     LLVMValueRef a,
                     enum lp_build_round_mode mode)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
   const char *intrinsic;
   LLVMValueRef res;

   assert(type.floating);

   assert(lp_check_value(type, a));
   assert(util_cpu_caps.has_sse4_1);

   if (type.length == 1) {
      LLVMTypeRef vec_type;
      LLVMValueRef undef;
      LLVMValueRef args[3];
      LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);

      switch(type.width) {
      case 32:
         intrinsic = "llvm.x86.sse41.round.ss";
         break;
      case 64:
         intrinsic = "llvm.x86.sse41.round.sd";
         break;
      default:
         assert(0);
         return bld->undef;
      }

      vec_type = LLVMVectorType(bld->elem_type, 4);

      undef = LLVMGetUndef(vec_type);

      args[0] = undef;
      args[1] = LLVMBuildInsertElement(builder, undef, a, index0, "");
      args[2] = LLVMConstInt(i32t, mode, 0);

      res = lp_build_intrinsic(builder, intrinsic,
                               vec_type, args, Elements(args));

      res = LLVMBuildExtractElement(builder, res, index0, "");
   }
   else {
      if (type.width * type.length == 128) {
         switch(type.width) {
         case 32:
            intrinsic = "llvm.x86.sse41.round.ps";
            break;
         case 64:
            intrinsic = "llvm.x86.sse41.round.pd";
            break;
         default:
            assert(0);
            return bld->undef;
         }
      }
      else {
         assert(type.width * type.length == 256);
         assert(util_cpu_caps.has_avx);

         switch(type.width) {
         case 32:
            intrinsic = "llvm.x86.avx.round.ps.256";
            break;
         case 64:
            intrinsic = "llvm.x86.avx.round.pd.256";
            break;
         default:
            assert(0);
            return bld->undef;
         }
      }

      res = lp_build_intrinsic_binary(builder, intrinsic,
                                      bld->vec_type, a,
                                      LLVMConstInt(i32t, mode, 0));
   }

   return res;
}
static INLINE LLVMValueRef
lp_build_iround_nearest_sse2(struct lp_build_context *bld,
                             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
   LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
   const char *intrinsic;
   LLVMValueRef res;

   assert(type.floating);
   /* using the double precision conversions is a bit more complicated */
   assert(type.width == 32);

   assert(lp_check_value(type, a));
   assert(util_cpu_caps.has_sse2);

   /* This is relying on MXCSR rounding mode, which should always be nearest. */
   if (type.length == 1) {
      LLVMTypeRef vec_type;
      LLVMValueRef undef;
      LLVMValueRef arg;
      LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);

      vec_type = LLVMVectorType(bld->elem_type, 4);

      intrinsic = "llvm.x86.sse.cvtss2si";

      undef = LLVMGetUndef(vec_type);

      arg = LLVMBuildInsertElement(builder, undef, a, index0, "");

      res = lp_build_intrinsic_unary(builder, intrinsic,
                                     ret_type, arg);
   }
   else {
      if (type.width * type.length == 128) {
         intrinsic = "llvm.x86.sse2.cvtps2dq";
      }
      else {
         assert(type.width*type.length == 256);
         assert(util_cpu_caps.has_avx);

         intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
      }
      res = lp_build_intrinsic_unary(builder, intrinsic,
                                     ret_type, a);
   }

   return res;
}
static INLINE LLVMValueRef
lp_build_round_altivec(struct lp_build_context *bld,
                       LLVMValueRef a,
                       enum lp_build_round_mode mode)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;

   assert(type.floating);

   assert(lp_check_value(type, a));
   assert(util_cpu_caps.has_altivec);

   switch (mode) {
   case LP_BUILD_ROUND_NEAREST:
      intrinsic = "llvm.ppc.altivec.vrfin";
      break;
   case LP_BUILD_ROUND_FLOOR:
      intrinsic = "llvm.ppc.altivec.vrfim";
      break;
   case LP_BUILD_ROUND_CEIL:
      intrinsic = "llvm.ppc.altivec.vrfip";
      break;
   case LP_BUILD_ROUND_TRUNCATE:
      intrinsic = "llvm.ppc.altivec.vrfiz";
      break;
   }

   return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
}
static INLINE LLVMValueRef
lp_build_round_arch(struct lp_build_context *bld,
                    LLVMValueRef a,
                    enum lp_build_round_mode mode)
{
   if (util_cpu_caps.has_sse4_1)
      return lp_build_round_sse41(bld, a, mode);
   else /* (util_cpu_caps.has_altivec) */
      return lp_build_round_altivec(bld, a, mode);
}
/**
 * Return the integer part of a float (vector) value (== round toward zero).
 * The returned value is a float (vector).
 * Ex: trunc(-1.5) = -1.0
 */
LLVMValueRef
lp_build_trunc(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
   }
   else {
      const struct lp_type type = bld->type;
      struct lp_type inttype;
      struct lp_build_context intbld;
      /* Note: must be 1 << 24 (2^24), not the xor expression 2^24 == 26. */
      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
      LLVMValueRef trunc, res, anosign, mask;
      LLVMTypeRef int_vec_type = bld->int_vec_type;
      LLVMTypeRef vec_type = bld->vec_type;

      assert(type.width == 32); /* might want to handle doubles at some point */

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      /* round by truncation */
      trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");

      /* mask out sign bit */
      anosign = lp_build_abs(bld, a);
      /*
       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is no-op for them
       * because such floats are always exact) as well as special cases like
       * NaNs, Infs (taking advantage of the fact they use max exponent).
       * (2^24 is arbitrary, anything between 2^24 and 2^31 should work.)
       */
      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
      return lp_build_select(bld, mask, a, res);
   }
}
/**
 * Return float (vector) rounded to nearest integer (vector).  The returned
 * value is a float (vector).
 * Ex: round(0.9) = 1.0
 * Ex: round(-1.5) = -2.0
 */
LLVMValueRef
lp_build_round(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
   }
   else {
      const struct lp_type type = bld->type;
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
      LLVMValueRef res, anosign, mask;
      LLVMTypeRef int_vec_type = bld->int_vec_type;
      LLVMTypeRef vec_type = bld->vec_type;

      assert(type.width == 32); /* might want to handle doubles at some point */

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      res = lp_build_iround(bld, a);
      res = LLVMBuildSIToFP(builder, res, vec_type, "");

      /* mask out sign bit */
      anosign = lp_build_abs(bld, a);
      /*
       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is no-op for them
       * because such floats are always exact) as well as special cases like
       * NaNs, Infs (taking advantage of the fact they use max exponent).
       * (2^24 is arbitrary, anything between 2^24 and 2^31 should work.)
       */
      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
      return lp_build_select(bld, mask, a, res);
   }
}
/**
 * Return floor of float (vector), result is a float (vector).
 * Ex: floor(1.1) = 1.0
 * Ex: floor(-1.1) = -2.0
 */
LLVMValueRef
lp_build_floor(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
   }
   else {
      const struct lp_type type = bld->type;
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
      LLVMValueRef trunc, res, anosign, mask;
      LLVMTypeRef int_vec_type = bld->int_vec_type;
      LLVMTypeRef vec_type = bld->vec_type;

      assert(type.width == 32); /* might want to handle doubles at some point */

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      /* round by truncation */
      trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");

      if (type.sign) {
         LLVMValueRef tmp;

         /*
          * fix values if rounding is wrong (for non-special cases)
          * - this is the case if trunc > a
          */
         mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
         /* tmp = trunc > a ? 1.0 : 0.0 */
         tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
         tmp = lp_build_and(&intbld, mask, tmp);
         tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
         res = lp_build_sub(bld, res, tmp);
      }

      /* mask out sign bit */
      anosign = lp_build_abs(bld, a);
      /*
       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is no-op for them
       * because such floats are always exact) as well as special cases like
       * NaNs, Infs (taking advantage of the fact they use max exponent).
       * (2^24 is arbitrary, anything between 2^24 and 2^31 should work.)
       */
      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
      return lp_build_select(bld, mask, a, res);
   }
}
/**
 * Return ceiling of float (vector), returning float (vector).
 * Ex: ceil( 1.1) = 2.0
 * Ex: ceil(-1.1) = -1.0
 */
LLVMValueRef
lp_build_ceil(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
   }
   else {
      const struct lp_type type = bld->type;
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
      LLVMValueRef trunc, res, anosign, mask, tmp;
      LLVMTypeRef int_vec_type = bld->int_vec_type;
      LLVMTypeRef vec_type = bld->vec_type;

      assert(type.width == 32); /* might want to handle doubles at some point */

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      /* round by truncation */
      trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");

      /*
       * fix values if rounding is wrong (for non-special cases)
       * - this is the case if trunc < a
       */
      mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
      /* tmp = trunc < a ? 1.0 : 0.0 */
      tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
      tmp = lp_build_and(&intbld, mask, tmp);
      tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
      res = lp_build_add(bld, trunc, tmp);

      /* mask out sign bit */
      anosign = lp_build_abs(bld, a);
      /*
       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is no-op for them
       * because such floats are always exact) as well as special cases like
       * NaNs, Infs (taking advantage of the fact they use max exponent).
       * (2^24 is arbitrary, anything between 2^24 and 2^31 should work.)
       */
      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
      return lp_build_select(bld, mask, a, res);
   }
}
/**
 * Return fractional part of 'a' computed as a - floor(a).
 * Typically used in texture coord arithmetic.
 */
LLVMValueRef
lp_build_fract(struct lp_build_context *bld,
               LLVMValueRef a)
{
   assert(bld->type.floating);
   return lp_build_sub(bld, a, lp_build_floor(bld, a));
}
/**
 * Prevent returning a fractional part of 1.0 for very small negative values of
 * 'a' by clamping against 0.99999(9).
 */
static inline LLVMValueRef
clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
{
   LLVMValueRef max;

   /* this is the largest number smaller than 1.0 representable as float */
   max = lp_build_const_vec(bld->gallivm, bld->type,
                            1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
   return lp_build_min(bld, fract, max);
}
/**
 * Same as lp_build_fract, but guarantees that the result is always smaller
 * than one.
 */
LLVMValueRef
lp_build_fract_safe(struct lp_build_context *bld,
                    LLVMValueRef a)
{
   return clamp_fract(bld, lp_build_fract(bld, a));
}
/**
 * Return the integer part of a float (vector) value (== round toward zero).
 * The returned value is an integer (vector).
 * Ex: itrunc(-1.5) = -1
 */
LLVMValueRef
lp_build_itrunc(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);

   assert(type.floating);
   assert(lp_check_value(type, a));

   return LLVMBuildFPToSI(builder, a, int_vec_type, "");
}
/**
 * Return float (vector) rounded to nearest integer (vector).  The returned
 * value is an integer (vector).
 * Ex: iround(0.9) = 1
 * Ex: iround(-1.5) = -2
 */
LLVMValueRef
lp_build_iround(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);

   assert(lp_check_value(type, a));

   if ((util_cpu_caps.has_sse2 &&
       ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
       (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
      return lp_build_iround_nearest_sse2(bld, a);
   }
   if (arch_rounding_available(type)) {
      res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
   }
   else {
      LLVMValueRef half;

      half = lp_build_const_vec(bld->gallivm, type, 0.5);

      if (type.sign) {
         LLVMTypeRef vec_type = bld->vec_type;
         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
                                    (unsigned long long)1 << (type.width - 1));
         LLVMValueRef sign;

         /* get sign bit */
         sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
         sign = LLVMBuildAnd(builder, sign, mask, "");

         /* sign * 0.5 */
         half = LLVMBuildBitCast(builder, half, int_vec_type, "");
         half = LLVMBuildOr(builder, sign, half, "");
         half = LLVMBuildBitCast(builder, half, vec_type, "");
      }

      res = LLVMBuildFAdd(builder, a, half, "");
   }

   res = LLVMBuildFPToSI(builder, res, int_vec_type, "");

   return res;
}
/**
 * Return floor of float (vector), result is an int (vector).
 * Ex: ifloor(1.1) = 1
 * Ex: ifloor(-1.1) = -2
 */
LLVMValueRef
lp_build_ifloor(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   res = a;
   if (type.sign) {
      if (arch_rounding_available(type)) {
         res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
      }
      else {
         struct lp_type inttype;
         struct lp_build_context intbld;
         LLVMValueRef trunc, itrunc, mask;

         assert(type.floating);
         assert(lp_check_value(type, a));

         inttype = type;
         inttype.floating = 0;
         lp_build_context_init(&intbld, bld->gallivm, inttype);

         /* round by truncation */
         itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
         trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");

         /*
          * fix values if rounding is wrong (for non-special cases)
          * - this is the case if trunc > a
          * The results of doing this with NaNs, very large values etc.
          * are undefined but this seems to be the case anyway.
          */
         mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
         /* cheapie minus one with mask since the mask is minus one / zero */
         return lp_build_add(&intbld, itrunc, mask);
      }
   }

   /* round to nearest (toward zero) */
   res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");

   return res;
}
/**
 * Return ceiling of float (vector), returning int (vector).
 * Ex: iceil( 1.1) = 2
 * Ex: iceil(-1.1) = -1
 */
LLVMValueRef
lp_build_iceil(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
   }
   else {
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef trunc, itrunc, mask;

      assert(type.floating);
      assert(lp_check_value(type, a));

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      /* round by truncation */
      itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");

      /*
       * fix values if rounding is wrong (for non-special cases)
       * - this is the case if trunc < a
       * The results of doing this with NaNs, very large values etc.
       * are undefined but this seems to be the case anyway.
       */
      mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
      /* cheapie plus one with mask since the mask is minus one / zero */
      return lp_build_sub(&intbld, itrunc, mask);
   }

   /* round to nearest (toward zero) */
   res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");

   return res;
}
/**
 * Combined ifloor() & fract().
 *
 * Preferred to calling the functions separately, as it will ensure that the
 * strategy (floor() vs ifloor()) that results in less redundant work is used.
 */
void
lp_build_ifloor_fract(struct lp_build_context *bld,
                      LLVMValueRef a,
                      LLVMValueRef *out_ipart,
                      LLVMValueRef *out_fpart)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef ipart;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      /*
       * floor() is easier.
       */

      ipart = lp_build_floor(bld, a);
      *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
      *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
   }
   else {
      /*
       * ifloor() is easier.
       */

      *out_ipart = lp_build_ifloor(bld, a);
      ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
      *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
   }
}


/**
 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
 * always smaller than one.
 */
void
lp_build_ifloor_fract_safe(struct lp_build_context *bld,
                           LLVMValueRef a,
                           LLVMValueRef *out_ipart,
                           LLVMValueRef *out_fpart)
{
   lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
   *out_fpart = clamp_fract(bld, *out_fpart);
}
LLVMValueRef
lp_build_sqrt(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   char intrinsic[32];

   assert(lp_check_value(type, a));

   /* TODO: optimize the constant case */

   assert(type.floating);
   if (type.length == 1) {
      util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.f%u", type.width);
   }
   else {
      util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
   }

   return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
}
/**
 * Do one Newton-Raphson step to improve reciprocate precision:
 *
 *   x_{i+1} = x_i * (2 - a * x_i)
 *
 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
 * +/-Inf, giving NaN instead.  Certain applications rely on this behavior,
 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
 * halo.  It would be necessary to clamp the argument to prevent this.
 *
 * See also:
 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
 */
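/*
 * Convergence is quadratic: if x_i = (1 + e) / a for relative error e, then
 * x_{i+1} = x_i * (2 - a * x_i) = (1 + e)(1 - e) / a = (1 - e^2) / a,
 * so each step roughly doubles the number of correct mantissa bits.
 */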
static INLINE LLVMValueRef
lp_build_rcp_refine(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef rcp_a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
   LLVMValueRef res;

   res = LLVMBuildFMul(builder, a, rcp_a, "");
   res = LLVMBuildFSub(builder, two, res, "");
   res = LLVMBuildFMul(builder, rcp_a, res, "");

   return res;
}
LLVMValueRef
lp_build_rcp(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   if (a == bld->zero)
      return bld->undef;
   if (a == bld->one)
      return bld->one;
   if (a == bld->undef)
      return bld->undef;

   assert(type.floating);

   if (LLVMIsConstant(a))
      return LLVMConstFDiv(bld->one, a);

   /*
    * We don't use RCPPS because:
    * - it only has 10bits of precision
    * - it doesn't even get the reciprocate of 1.0 exactly
    * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
    * - for recent processors the benefit over DIVPS is marginal, and case
    *   dependent
    *
    * We could still use it on certain processors if benchmarks show that the
    * RCPPS plus necessary workarounds are still preferrable to DIVPS; or for
    * particular uses that require less workarounds.
    */

   if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
                 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))) {
      const unsigned num_iterations = 0;
      LLVMValueRef res;
      unsigned i;
      const char *intrinsic = NULL;

      if (type.length == 4) {
         intrinsic = "llvm.x86.sse.rcp.ps";
      }
      else {
         intrinsic = "llvm.x86.avx.rcp.ps.256";
      }

      res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);

      for (i = 0; i < num_iterations; ++i) {
         res = lp_build_rcp_refine(bld, a, res);
      }

      return res;
   }

   return LLVMBuildFDiv(builder, bld->one, a, "");
}
/**
 * Do one Newton-Raphson step to improve rsqrt precision:
 *
 *   x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
 *
 * See also Intel 64 and IA-32 Architectures Optimization Manual.
 */
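/*
 * As with rcp above, convergence is quadratic: for x_i = (1 + e) / sqrt(a)
 * this step gives x_{i+1} = (1 - 1.5*e^2 - 0.5*e^3) / sqrt(a), i.e. the
 * relative error is roughly squared (times 1.5) each iteration.
 */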
static INLINE LLVMValueRef
lp_build_rsqrt_refine(struct lp_build_context *bld,
                      LLVMValueRef a,
                      LLVMValueRef rsqrt_a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
   LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
   LLVMValueRef res;

   res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
   res = LLVMBuildFMul(builder, a, res, "");
   res = LLVMBuildFSub(builder, three, res, "");
   res = LLVMBuildFMul(builder, rsqrt_a, res, "");
   res = LLVMBuildFMul(builder, half, res, "");

   return res;
}
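
/*
 * Scalar sketch of the step above (illustrative only; the helper name is
 * hypothetical):
 *
 *    static float rsqrt_refine_scalar(float a, float rsqrt_a)
 *    {
 *       return 0.5f * rsqrt_a * (3.0f - a * rsqrt_a * rsqrt_a);
 *    }
 */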


/**
 * Generate 1/sqrt(a).
 * Result is undefined for values < 0, infinity for +0.
 */
LLVMValueRef
lp_build_rsqrt(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   assert(type.floating);

   /*
    * This should be faster but all denormals will end up as infinity.
    */
   if (0 && lp_build_fast_rsqrt_available(type)) {
      const unsigned num_iterations = 1;
      LLVMValueRef res;
      unsigned i;

      /* rsqrt(1.0) != 1.0 here */
      res = lp_build_fast_rsqrt(bld, a);

      if (num_iterations) {
         /*
          * Newton-Raphson will result in NaN instead of infinity for zero,
          * and NaN instead of zero for infinity.
          * Also, need to ensure rsqrt(1.0) == 1.0.
          * All numbers smaller than FLT_MIN will result in +infinity
          * (rsqrtps treats all denormals as zero).
          */
         /*
          * Certain non-c99 compilers don't know INFINITY and might not
          * support hacks to evaluate it at compile time either.
          */
         const unsigned posinf_int = 0x7F800000;
         LLVMValueRef cmp;
         LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
         LLVMValueRef inf = lp_build_const_int_vec(bld->gallivm, type, posinf_int);

         inf = LLVMBuildBitCast(builder, inf, lp_build_vec_type(bld->gallivm, type), "");

         for (i = 0; i < num_iterations; ++i) {
            res = lp_build_rsqrt_refine(bld, a, res);
         }
         cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
         res = lp_build_select(bld, cmp, inf, res);
         cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
         res = lp_build_select(bld, cmp, bld->zero, res);
         cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
         res = lp_build_select(bld, cmp, bld->one, res);
      }

      return res;
   }

   return lp_build_rcp(bld, lp_build_sqrt(bld, a));
}


/**
 * If there's a fast (inaccurate) rsqrt instruction available
 * (caller may want to avoid calling rsqrt_fast if it's not available,
 * i.e. for calculating x^0.5 it may do rsqrt_fast(x) * x, but if rsqrt
 * is unavailable that would result in sqrt/div/mul, so it is obviously
 * much better to just call sqrt, skipping both div and mul).
 */
boolean
lp_build_fast_rsqrt_available(struct lp_type type)
{
   assert(type.floating);

   if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
       (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
      return TRUE;
   }
   return FALSE;
}
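
/*
 * Illustrative caller-side sketch of the x^0.5 strategy described above
 * (variable names are hypothetical):
 *
 *    LLVMValueRef sqrt_x;
 *    if (lp_build_fast_rsqrt_available(bld->type))
 *       sqrt_x = lp_build_mul(bld, x, lp_build_fast_rsqrt(bld, x));
 *    else
 *       sqrt_x = lp_build_sqrt(bld, x);
 *
 * since x * rsqrt(x) == sqrt(x) for x > 0.
 */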


/**
 * Generate 1/sqrt(a).
 * Result is undefined for values < 0, infinity for +0.
 * Precision is limited, only ~10 bits guaranteed
 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
 */
LLVMValueRef
lp_build_fast_rsqrt(struct lp_build_context *bld,
                    LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   if (lp_build_fast_rsqrt_available(type)) {
      const char *intrinsic = NULL;

      if (type.length == 4) {
         intrinsic = "llvm.x86.sse.rsqrt.ps";
      }
      else {
         intrinsic = "llvm.x86.avx.rsqrt.ps.256";
      }
      return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
   }
   else {
      debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
   }
   return lp_build_rcp(bld, lp_build_sqrt(bld, a));
}


/**
 * Generate sin(a) using SSE2
 */
LLVMValueRef
lp_build_sin(struct lp_build_context *bld,
             LLVMValueRef a)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type int_type = lp_int_type(bld->type);
   LLVMBuilderRef b = builder;

   /*
    * take the absolute value,
    * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
    */

   LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");

   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");

   /*
    * extract the sign bit (upper one)
    * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
    */
   LLVMValueRef sig_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
   LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i");

   /*
    * scale by 4/Pi
    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
    */

   LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");

   /*
    * store the integer part of y in mm0
    * emm2 = _mm_cvttps_epi32(y);
    */

   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");

   /*
    * j=(j+1) & (~1) (see the cephes sources)
    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
    */

   LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
   LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
   /*
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
    */
   LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
   LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");

   /*
    * y = _mm_cvtepi32_ps(emm2);
    */
   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");

   /* get the swap sign flag
    * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
    */
   LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
   LLVMValueRef emm0_and = LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and");

   /*
    * emm2 = _mm_slli_epi32(emm0, 29);
    */
   LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
   LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit");

   /*
    * get the polynomial selection mask
    * there is one polynomial for 0 <= x <= Pi/4
    * and another one for Pi/4 < x <= Pi/2
    * Both branches will be computed.
    *
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
    */

   LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
   LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3");
   LLVMValueRef poly_mask = lp_build_compare(gallivm,
                                             int_type, PIPE_FUNC_EQUAL,
                                             emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
   /*
    * sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
    */
   LLVMValueRef sign_bit_1 = LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit");

   /*
    * _PS_CONST(minus_cephes_DP1, -0.78515625);
    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
    */
   LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
   LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
   LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);

   /*
    * The magic pass: "Extended precision modular arithmetic"
    * x = ((x - y * DP1) - y * DP2) - y * DP3;
    * xmm1 = _mm_mul_ps(y, xmm1);
    * xmm2 = _mm_mul_ps(y, xmm2);
    * xmm3 = _mm_mul_ps(y, xmm3);
    */
   LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
   LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
   LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");

   /*
    * x = _mm_add_ps(x, xmm1);
    * x = _mm_add_ps(x, xmm2);
    * x = _mm_add_ps(x, xmm3);
    */

   LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
   LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
   LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");

   /*
    * Evaluate the first polynomial (0 <= x <= Pi/4)
    *
    * z = _mm_mul_ps(x,x);
    */
   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");

   /*
    * _PS_CONST(coscof_p0,  2.443315711809948E-005);
    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
    * _PS_CONST(coscof_p2,  4.166664568298827E-002);
    */
   LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
   LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
   LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);

   /*
    * y = *(v4sf*)_ps_coscof_p0;
    * y = _mm_mul_ps(y, z);
    */
   LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
   LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
   LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
   LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");

   /*
    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
    * y = _mm_sub_ps(y, tmp);
    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
    */
   LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
   LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");

   /*
    * _PS_CONST(sincof_p0, -1.9515295891E-4);
    * _PS_CONST(sincof_p1,  8.3321608736E-3);
    * _PS_CONST(sincof_p2, -1.6666654611E-1);
    */
   LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
   LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
   LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);

   /*
    * Evaluate the second polynomial (0 <= x <= Pi/4)
    *
    * y2 = *(v4sf*)_ps_sincof_p0;
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_mul_ps(y2, x);
    * y2 = _mm_add_ps(y2, x);
    */

   LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
   LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
   LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
   LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
   LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
   LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");

   /*
    * select the correct result from the two polynomials
    *
    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
    * y = _mm_andnot_ps(xmm3, y);
    * y = _mm_add_ps(y,y2);
    */
   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
   LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0);
   LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
   LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");

   /*
    * update the sign
    * y = _mm_xor_ps(y, sign_bit);
    */
   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin");
   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");

   return y_result;
}
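
/*
 * A rough scalar picture of the algorithm above (illustrative only; s0-s2
 * and c0-c2 stand for the sincof/coscof coefficients):
 *
 *    int   j = ((int)(fabsf(a) * 4.0f / M_PI) + 1) & ~1;    octant index
 *    float y = (float)j;
 *    float x = ((fabsf(a) - y * 0.78515625f)
 *                         - y * 2.4187564849853515625e-4f)
 *                         - y * 3.77489497744594108e-8f;
 *    float z = x * x;
 *    float sinp = x + x * z * (s2 + z * (s1 + z * s0));
 *    float cosp = 1.0f - 0.5f * z + z * z * (c2 + z * (c1 + z * c0));
 *    float r = (j & 2) ? cosp : sinp;
 *    return (((j & 4) != 0) ^ (a < 0.0f)) ? -r : r;
 *
 * The code above computes both polynomials and selects with poly_mask
 * instead of branching, and applies the sign with an integer xor.
 */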


/**
 * Generate cos(a) using SSE2
 */
LLVMValueRef
lp_build_cos(struct lp_build_context *bld,
             LLVMValueRef a)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type int_type = lp_int_type(bld->type);
   LLVMBuilderRef b = builder;

   /*
    * take the absolute value,
    * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
    */

   LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");

   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");

   /*
    * scale by 4/Pi
    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
    */

   LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");

   /*
    * store the integer part of y in mm0
    * emm2 = _mm_cvttps_epi32(y);
    */

   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");

   /*
    * j=(j+1) & (~1) (see the cephes sources)
    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
    */

   LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
   LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
   /*
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
    */
   LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
   LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");

   /*
    * y = _mm_cvtepi32_ps(emm2);
    */
   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");

   /*
    * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
    */
   LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
   LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2");

   /* get the swap sign flag
    * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
    */
   LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0);
   LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not");
   LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
   LLVMValueRef emm0_and = LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and");

   /*
    * emm2 = _mm_slli_epi32(emm0, 29);
    */
   LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
   LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit");

   /*
    * get the polynomial selection mask
    * there is one polynomial for 0 <= x <= Pi/4
    * and another one for Pi/4 < x <= Pi/2
    * Both branches will be computed.
    *
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
    */

   LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
   LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3");
   LLVMValueRef poly_mask = lp_build_compare(gallivm,
                                             int_type, PIPE_FUNC_EQUAL,
                                             emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));

   /*
    * _PS_CONST(minus_cephes_DP1, -0.78515625);
    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
    */
   LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
   LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
   LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);

   /*
    * The magic pass: "Extended precision modular arithmetic"
    * x = ((x - y * DP1) - y * DP2) - y * DP3;
    * xmm1 = _mm_mul_ps(y, xmm1);
    * xmm2 = _mm_mul_ps(y, xmm2);
    * xmm3 = _mm_mul_ps(y, xmm3);
    */
   LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
   LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
   LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");

   /*
    * x = _mm_add_ps(x, xmm1);
    * x = _mm_add_ps(x, xmm2);
    * x = _mm_add_ps(x, xmm3);
    */

   LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
   LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
   LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");

   /*
    * Evaluate the first polynomial (0 <= x <= Pi/4)
    *
    * z = _mm_mul_ps(x,x);
    */
   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");

   /*
    * _PS_CONST(coscof_p0,  2.443315711809948E-005);
    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
    * _PS_CONST(coscof_p2,  4.166664568298827E-002);
    */
   LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
   LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
   LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);

   /*
    * y = *(v4sf*)_ps_coscof_p0;
    * y = _mm_mul_ps(y, z);
    */
   LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
   LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
   LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
   LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");

   /*
    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
    * y = _mm_sub_ps(y, tmp);
    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
    */
   LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
   LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");

   /*
    * _PS_CONST(sincof_p0, -1.9515295891E-4);
    * _PS_CONST(sincof_p1,  8.3321608736E-3);
    * _PS_CONST(sincof_p2, -1.6666654611E-1);
    */
   LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
   LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
   LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);

   /*
    * Evaluate the second polynomial (0 <= x <= Pi/4)
    *
    * y2 = *(v4sf*)_ps_sincof_p0;
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_mul_ps(y2, x);
    * y2 = _mm_add_ps(y2, x);
    */

   LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
   LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
   LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
   LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
   LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
   LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");

   /*
    * select the correct result from the two polynomials
    *
    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
    * y = _mm_andnot_ps(xmm3, y);
    * y = _mm_add_ps(y,y2);
    */
   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
   LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
   LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");

   /*
    * update the sign
    * y = _mm_xor_ps(y, sign_bit);
    */
   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sin");
   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");

   return y_result;
}


/**
 * Generate pow(x, y)
 */
LLVMValueRef
lp_build_pow(struct lp_build_context *bld,
             LLVMValueRef x,
             LLVMValueRef y)
{
   /* TODO: optimize the constant case */
   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
       LLVMIsConstant(x) && LLVMIsConstant(y)) {
      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                   __FUNCTION__);
   }

   return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
}
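
/*
 * I.e. this relies on the identity x^y == 2^(y * log2(x)), valid for
 * x > 0.  A scalar C illustration (sketch only):
 *
 *    float pow_sketch(float x, float y)
 *    {
 *       return exp2f(y * log2f(x));
 *    }
 */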

/**
 * Generate exp(x)
 */
LLVMValueRef
lp_build_exp(struct lp_build_context *bld,
             LLVMValueRef x)
{
   /* log2(e) = 1/log(2) */
   LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
                                           1.4426950408889634);

   assert(lp_check_value(bld->type, x));

   return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
}


/**
 * Generate log(x)
 */
LLVMValueRef
lp_build_log(struct lp_build_context *bld,
             LLVMValueRef x)
{
   /* log(2) */
   LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
                                          0.69314718055994529);

   assert(lp_check_value(bld->type, x));

   return lp_build_mul(bld, log2, lp_build_log2(bld, x));
}


/**
 * Generate polynomial.
 * Ex:  coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
 */
static LLVMValueRef
lp_build_polynomial(struct lp_build_context *bld,
                    LLVMValueRef x,
                    const double *coeffs,
                    unsigned num_coeffs)
{
   const struct lp_type type = bld->type;
   LLVMValueRef even = NULL, odd = NULL;
   LLVMValueRef x2;
   unsigned i;

   assert(lp_check_value(bld->type, x));

   /* TODO: optimize the constant case */
   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
       LLVMIsConstant(x)) {
      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                   __FUNCTION__);
   }

   /*
    * Calculate odd and even terms separately to decrease data dependency
    * Ex:
    *     c[0] + x^2 * c[2] + x^4 * c[4] ...
    *     + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
    */
   x2 = lp_build_mul(bld, x, x);

   for (i = num_coeffs; i--; ) {
      LLVMValueRef coeff;

      coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);

      if (i % 2 == 0) {
         if (even)
            even = lp_build_add(bld, coeff, lp_build_mul(bld, x2, even));
         else
            even = coeff;
      }
      else {
         if (odd)
            odd = lp_build_add(bld, coeff, lp_build_mul(bld, x2, odd));
         else
            odd = coeff;
      }
   }

   if (odd)
      return lp_build_add(bld, lp_build_mul(bld, odd, x), even);
   else if (even)
      return even;
   else
      return bld->undef;
}
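
/*
 * Concretely, for four coefficients the split evaluates (illustrative
 * scalar form):
 *
 *    float x2   = x * x;
 *    float even = c[0] + x2 * c[2];
 *    float odd  = c[1] + x2 * c[3];
 *    return even + x * odd;
 *
 * which equals Horner evaluation of c[0] + c[1]*x + c[2]*x^2 + c[3]*x^3,
 * but with a dependency chain roughly half as long.
 */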


/**
 * Minimax polynomial fit of 2**x, in range [0, 1[
 */
const double lp_build_exp2_polynomial[] = {
#if EXP_POLY_DEGREE == 5
   0.999999925063526176901,
   0.693153073200168932794,
   0.240153617044375388211,
   0.0558263180532956664775,
   0.00898934009049466391101,
   0.00187757667519147912699
#elif EXP_POLY_DEGREE == 4
   1.00000259337069434683,
   0.693003834469974940458,
   0.24144275689150793076,
   0.0520114606103070150235,
   0.0135341679161270268764
#elif EXP_POLY_DEGREE == 3
   0.999925218562710312959,
   0.695833540494823811697,
   0.226067155427249155588,
   0.0780245226406372992967
#elif EXP_POLY_DEGREE == 2
   1.00172476321474503578,
   0.657636275736077639316,
   0.33718943461968720704
#else
#error
#endif
};


void
lp_build_exp2_approx(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef *p_exp2_int_part,
                     LLVMValueRef *p_frac_part,
                     LLVMValueRef *p_exp2)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMValueRef ipart = NULL;
   LLVMValueRef fpart = NULL;
   LLVMValueRef expipart = NULL;
   LLVMValueRef expfpart = NULL;
   LLVMValueRef res = NULL;

   assert(lp_check_value(bld->type, x));

   if(p_exp2_int_part || p_frac_part || p_exp2) {
      /* TODO: optimize the constant case */
      if (gallivm_debug & GALLIVM_DEBUG_PERF &&
          LLVMIsConstant(x)) {
         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                      __FUNCTION__);
      }

      assert(type.floating && type.width == 32);

      x = lp_build_min(bld, x, lp_build_const_vec(bld->gallivm, type,  129.0));
      x = lp_build_max(bld, x, lp_build_const_vec(bld->gallivm, type, -126.99999));

      /* ipart = floor(x) */
      /* fpart = x - ipart */
      lp_build_ifloor_fract(bld, x, &ipart, &fpart);
   }

   if(p_exp2_int_part || p_exp2) {
      /* expipart = (float) (1 << ipart) */
      expipart = LLVMBuildAdd(builder, ipart,
                              lp_build_const_int_vec(bld->gallivm, type, 127), "");
      expipart = LLVMBuildShl(builder, expipart,
                              lp_build_const_int_vec(bld->gallivm, type, 23), "");
      expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
   }

   if(p_exp2) {
      expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
                                     Elements(lp_build_exp2_polynomial));

      res = LLVMBuildFMul(builder, expipart, expfpart, "");
   }

   if(p_exp2_int_part)
      *p_exp2_int_part = expipart;

   if(p_frac_part)
      *p_frac_part = fpart;

   if(p_exp2)
      *p_exp2 = res;
}
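
/*
 * The expipart construction above is the standard IEEE-754 trick for
 * 2^ipart: for integer ipart in [-126, 127], placing (ipart + 127) in the
 * exponent bits yields the float 2^ipart.  Scalar sketch (illustrative
 * only):
 *
 *    union { int32_t i; float f; } v;
 *    v.i = (ipart + 127) << 23;
 *    float two_ipart = v.f;    now two_ipart == 2^ipart
 */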


LLVMValueRef
lp_build_exp2(struct lp_build_context *bld,
              LLVMValueRef x)
{
   LLVMValueRef res;
   lp_build_exp2_approx(bld, x, NULL, NULL, &res);
   return res;
}


/**
 * Extract the exponent of an IEEE-754 floating point value.
 *
 * Optionally apply an integer bias.
 *
 * Result is an integer value with
 *
 *   ifloor(log2(x)) + bias
 */
LLVMValueRef
lp_build_extract_exponent(struct lp_build_context *bld,
                          LLVMValueRef x,
                          int bias)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   unsigned mantissa = lp_mantissa(type);
   LLVMValueRef res;

   assert(type.floating);

   assert(lp_check_value(bld->type, x));

   x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");

   res = LLVMBuildLShr(builder, x,
                       lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
   res = LLVMBuildAnd(builder, res,
                      lp_build_const_int_vec(bld->gallivm, type, 255), "");
   res = LLVMBuildSub(builder, res,
                      lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");

   return res;
}
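
/*
 * Scalar equivalent for 32-bit floats (illustrative only):
 *
 *    union { float f; uint32_t i; } v;
 *    v.f = x;
 *    int e = (int)((v.i >> 23) & 0xff) - (127 - bias);
 *
 * e.g. for x = 8.0 and bias = 0 the stored exponent field is 130, so the
 * result is 130 - 127 = 3 == floor(log2(8.0)).
 */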


/**
 * Extract the mantissa of a floating point value.
 *
 * Result is a floating point value with
 *
 *   x / 2**floor(log2(x))
 */
LLVMValueRef
lp_build_extract_mantissa(struct lp_build_context *bld,
                          LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   unsigned mantissa = lp_mantissa(type);
   LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
                                                  (1ULL << mantissa) - 1);
   LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
   LLVMValueRef res;

   assert(lp_check_value(bld->type, x));

   assert(type.floating);

   x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");

   /* res = x / 2**ipart */
   res = LLVMBuildAnd(builder, x, mantmask, "");
   res = LLVMBuildOr(builder, res, one, "");
   res = LLVMBuildBitCast(builder, res, bld->vec_type, "");

   return res;
}


/**
 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x),
 * for x in range of [0, 1/9[
 * These coefficients can be generated with
 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
 */
const double lp_build_log2_polynomial[] = {
#if LOG_POLY_DEGREE == 5
   2.88539008148777786488L,
   0.961796878841293367824L,
   0.577058946784739859012L,
   0.412914355135828735411L,
   0.308591899232910175289L,
   0.352376952300281371868L,
#elif LOG_POLY_DEGREE == 4
   2.88539009343309178325L,
   0.961791550404184197881L,
   0.577440339438736392009L,
   0.403343858251329912514L,
   0.406718052498846252698L,
#elif LOG_POLY_DEGREE == 3
   2.88538959748872753838L,
   0.961932915889597772928L,
   0.571118517972136195241L,
   0.493997535084709500285L,
#else
#error
#endif
};


/**
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 * http://en.wikipedia.org/wiki/Logarithm#Calculation
 * http://www.nezumi.demon.co.uk/consult/logx.htm
 */
void
lp_build_log2_approx(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef *p_exp,
                     LLVMValueRef *p_floor_log2,
                     LLVMValueRef *p_log2)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);

   LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
   LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
   LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);

   LLVMValueRef i = NULL;
   LLVMValueRef y = NULL;
   LLVMValueRef z = NULL;
   LLVMValueRef exp = NULL;
   LLVMValueRef mant = NULL;
   LLVMValueRef logexp = NULL;
   LLVMValueRef logmant = NULL;
   LLVMValueRef res = NULL;

   assert(lp_check_value(bld->type, x));

   if(p_exp || p_floor_log2 || p_log2) {
      /* TODO: optimize the constant case */
      if (gallivm_debug & GALLIVM_DEBUG_PERF &&
          LLVMIsConstant(x)) {
         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                      __FUNCTION__);
      }

      assert(type.floating && type.width == 32);

      /*
       * We don't explicitly handle denormalized numbers. They will yield a
       * result in the neighbourhood of -127, which appears to be adequate.
       */

      i = LLVMBuildBitCast(builder, x, int_vec_type, "");

      /* exp = (float) exponent(x) */
      exp = LLVMBuildAnd(builder, i, expmask, "");
   }

   if(p_floor_log2 || p_log2) {
      logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
      logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
      logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
   }

   if(p_log2) {
      /* mant = 1 + (float) mantissa(x) */
      mant = LLVMBuildAnd(builder, i, mantmask, "");
      mant = LLVMBuildOr(builder, mant, one, "");
      mant = LLVMBuildBitCast(builder, mant, vec_type, "");

      /* y = (mant - 1) / (mant + 1) */
      y = lp_build_div(bld,
                       lp_build_sub(bld, mant, bld->one),
                       lp_build_add(bld, mant, bld->one));

      /* z = y**2 */
      z = lp_build_mul(bld, y, y);

      logmant = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
                                    Elements(lp_build_log2_polynomial));

      /* logmant = y * P(z) */
      logmant = lp_build_mul(bld, y, logmant);

      res = lp_build_add(bld, logmant, logexp);
   }

   if(p_exp) {
      exp = LLVMBuildBitCast(builder, exp, vec_type, "");
      *p_exp = exp;
   }

   if(p_floor_log2)
      *p_floor_log2 = logexp;

   if(p_log2)
      *p_log2 = res;
}
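
/*
 * In other words, with mant in [1, 2) the code computes (illustrative
 * scalar form):
 *
 *    log2(x)    = exponent(x) + log2(mant)
 *    log2(mant) = y * P(y*y),  where y = (mant - 1) / (mant + 1)
 *
 * which is the atanh-based series log2(m) = (2/ln 2) * (y + y^3/3 + ...),
 * here approximated by the minimax polynomial P above.
 */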


LLVMValueRef
lp_build_log2(struct lp_build_context *bld,
              LLVMValueRef x)
{
   LLVMValueRef res;
   lp_build_log2_approx(bld, x, NULL, NULL, &res);
   return res;
}


/**
 * Faster (and less accurate) log2.
 *
 *    log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
 *
 * Piece-wise linear approximation, with exact results when x is a power of
 * two.
 *
 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
 */
LLVMValueRef
lp_build_fast_log2(struct lp_build_context *bld,
                   LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef ipart;
   LLVMValueRef fpart;

   assert(lp_check_value(bld->type, x));

   assert(bld->type.floating);

   /* ipart = floor(log2(x)) - 1 */
   ipart = lp_build_extract_exponent(bld, x, -1);
   ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");

   /* fpart = x / 2**ipart */
   fpart = lp_build_extract_mantissa(bld, x);

   /* ipart + fpart */
   return LLVMBuildFAdd(builder, ipart, fpart, "");
}
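
/*
 * Scalar sketch (illustrative): with m = mantissa(x) in [1, 2),
 *
 *    fast_log2(x) = (floor(log2(x)) - 1) + m
 *
 * e.g. fast_log2(8.0) = (3 - 1) + 1.0 = 3.0 exactly, while values between
 * powers of two are linearly interpolated.
 */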


/**
 * Fast implementation of iround(log2(x)).
 *
 * Not an approximation -- it should give accurate results all the time.
 */
LLVMValueRef
lp_build_ilog2(struct lp_build_context *bld,
               LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
   LLVMValueRef ipart;

   assert(bld->type.floating);

   assert(lp_check_value(bld->type, x));

   /* x * 2^(0.5)   i.e., add 0.5 to the log2(x) */
   x = LLVMBuildFMul(builder, x, sqrt2, "");

   /* ipart = floor(log2(x) + 0.5)  */
   ipart = lp_build_extract_exponent(bld, x, 0);

   return ipart;
}
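
/*
 * Why this works (illustrative): multiplying by sqrt(2) adds exactly 0.5
 * to log2(x), so reading the exponent field of x * sqrt(2) computes
 * floor(log2(x) + 0.5) == iround(log2(x)).  E.g. for x = 5.0,
 * log2(5) ~= 2.32; x * sqrt(2) ~= 7.07, whose exponent is 2, so the
 * result is 2.
 */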


LLVMValueRef
lp_build_mod(struct lp_build_context *bld,
             LLVMValueRef x,
             LLVMValueRef y)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef res;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, x));
   assert(lp_check_value(type, y));

   if (type.floating)
      res = LLVMBuildFRem(builder, x, y, "");
   else if (type.sign)
      res = LLVMBuildSRem(builder, x, y, "");
   else
      res = LLVMBuildURem(builder, x, y, "");
   return res;
}