1 /**************************************************************************
3 * Copyright 2009-2010 VMware, Inc.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **************************************************************************/
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
38 * We also do simple expressions simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
44 * @author Jose Fonseca <jfonseca@vmware.com>
48 #include "util/u_memory.h"
49 #include "util/u_debug.h"
50 #include "util/u_math.h"
51 #include "util/u_string.h"
52 #include "util/u_cpu_detect.h"
54 #include "lp_bld_type.h"
55 #include "lp_bld_const.h"
56 #include "lp_bld_init.h"
57 #include "lp_bld_intr.h"
58 #include "lp_bld_logic.h"
59 #include "lp_bld_pack.h"
60 #include "lp_bld_debug.h"
61 #include "lp_bld_arit.h"
64 #define EXP_POLY_DEGREE 5
66 #define LOG_POLY_DEGREE 5
71 * No checks for special case values of a or b = 1 or 0 are done.
74 lp_build_min_simple(struct lp_build_context
*bld
,
78 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
79 const struct lp_type type
= bld
->type
;
80 const char *intrinsic
= NULL
;
83 assert(lp_check_value(type
, a
));
84 assert(lp_check_value(type
, b
));
86 /* TODO: optimize the constant case */
88 if(type
.width
* type
.length
== 128) {
90 if(type
.width
== 32 && util_cpu_caps
.has_sse
)
91 intrinsic
= "llvm.x86.sse.min.ps";
92 if(type
.width
== 64 && util_cpu_caps
.has_sse2
)
93 intrinsic
= "llvm.x86.sse2.min.pd";
96 if(type
.width
== 8 && !type
.sign
&& util_cpu_caps
.has_sse2
)
97 intrinsic
= "llvm.x86.sse2.pminu.b";
98 if(type
.width
== 8 && type
.sign
&& util_cpu_caps
.has_sse4_1
)
99 intrinsic
= "llvm.x86.sse41.pminsb";
100 if(type
.width
== 16 && !type
.sign
&& util_cpu_caps
.has_sse4_1
)
101 intrinsic
= "llvm.x86.sse41.pminuw";
102 if(type
.width
== 16 && type
.sign
&& util_cpu_caps
.has_sse2
)
103 intrinsic
= "llvm.x86.sse2.pmins.w";
104 if(type
.width
== 32 && !type
.sign
&& util_cpu_caps
.has_sse4_1
)
105 intrinsic
= "llvm.x86.sse41.pminud";
106 if(type
.width
== 32 && type
.sign
&& util_cpu_caps
.has_sse4_1
)
107 intrinsic
= "llvm.x86.sse41.pminsd";
112 return lp_build_intrinsic_binary(builder
, intrinsic
, lp_build_vec_type(bld
->gallivm
, bld
->type
), a
, b
);
114 cond
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, a
, b
);
115 return lp_build_select(bld
, cond
, a
, b
);
121 * No checks for special case values of a or b = 1 or 0 are done.
124 lp_build_max_simple(struct lp_build_context
*bld
,
128 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
129 const struct lp_type type
= bld
->type
;
130 const char *intrinsic
= NULL
;
133 assert(lp_check_value(type
, a
));
134 assert(lp_check_value(type
, b
));
136 /* TODO: optimize the constant case */
138 if(type
.width
* type
.length
== 128) {
140 if(type
.width
== 32 && util_cpu_caps
.has_sse
)
141 intrinsic
= "llvm.x86.sse.max.ps";
142 if(type
.width
== 64 && util_cpu_caps
.has_sse2
)
143 intrinsic
= "llvm.x86.sse2.max.pd";
146 if(type
.width
== 8 && !type
.sign
&& util_cpu_caps
.has_sse2
)
147 intrinsic
= "llvm.x86.sse2.pmaxu.b";
148 if(type
.width
== 8 && type
.sign
&& util_cpu_caps
.has_sse4_1
)
149 intrinsic
= "llvm.x86.sse41.pmaxsb";
150 if(type
.width
== 16 && !type
.sign
&& util_cpu_caps
.has_sse4_1
)
151 intrinsic
= "llvm.x86.sse41.pmaxuw";
152 if(type
.width
== 16 && type
.sign
&& util_cpu_caps
.has_sse2
)
153 intrinsic
= "llvm.x86.sse2.pmaxs.w";
154 if(type
.width
== 32 && !type
.sign
&& util_cpu_caps
.has_sse4_1
)
155 intrinsic
= "llvm.x86.sse41.pmaxud";
156 if(type
.width
== 32 && type
.sign
&& util_cpu_caps
.has_sse4_1
)
157 intrinsic
= "llvm.x86.sse41.pmaxsd";
162 return lp_build_intrinsic_binary(builder
, intrinsic
, lp_build_vec_type(bld
->gallivm
, bld
->type
), a
, b
);
164 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, b
);
165 return lp_build_select(bld
, cond
, a
, b
);
170 * Generate 1 - a, or ~a depending on bld->type.
173 lp_build_comp(struct lp_build_context
*bld
,
176 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
177 const struct lp_type type
= bld
->type
;
179 assert(lp_check_value(type
, a
));
186 if(type
.norm
&& !type
.floating
&& !type
.fixed
&& !type
.sign
) {
187 if(LLVMIsConstant(a
))
188 return LLVMConstNot(a
);
190 return LLVMBuildNot(builder
, a
, "");
193 if(LLVMIsConstant(a
))
195 return LLVMConstFSub(bld
->one
, a
);
197 return LLVMConstSub(bld
->one
, a
);
200 return LLVMBuildFSub(builder
, bld
->one
, a
, "");
202 return LLVMBuildSub(builder
, bld
->one
, a
, "");
210 lp_build_add(struct lp_build_context
*bld
,
214 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
215 const struct lp_type type
= bld
->type
;
218 assert(lp_check_value(type
, a
));
219 assert(lp_check_value(type
, b
));
225 if(a
== bld
->undef
|| b
== bld
->undef
)
229 const char *intrinsic
= NULL
;
231 if(a
== bld
->one
|| b
== bld
->one
)
234 if(util_cpu_caps
.has_sse2
&&
235 type
.width
* type
.length
== 128 &&
236 !type
.floating
&& !type
.fixed
) {
238 intrinsic
= type
.sign
? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
240 intrinsic
= type
.sign
? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
244 return lp_build_intrinsic_binary(builder
, intrinsic
, lp_build_vec_type(bld
->gallivm
, bld
->type
), a
, b
);
247 if(LLVMIsConstant(a
) && LLVMIsConstant(b
))
249 res
= LLVMConstFAdd(a
, b
);
251 res
= LLVMConstAdd(a
, b
);
254 res
= LLVMBuildFAdd(builder
, a
, b
, "");
256 res
= LLVMBuildAdd(builder
, a
, b
, "");
258 /* clamp to ceiling of 1.0 */
259 if(bld
->type
.norm
&& (bld
->type
.floating
|| bld
->type
.fixed
))
260 res
= lp_build_min_simple(bld
, res
, bld
->one
);
262 /* XXX clamp to floor of -1 or 0??? */
268 /** Return the scalar sum of the elements of a */
270 lp_build_sum_vector(struct lp_build_context
*bld
,
273 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
274 const struct lp_type type
= bld
->type
;
275 LLVMValueRef index
, res
;
278 assert(lp_check_value(type
, a
));
280 if (type
.length
== 1) {
284 assert(!bld
->type
.norm
);
286 index
= lp_build_const_int32(bld
->gallivm
, 0);
287 res
= LLVMBuildExtractElement(builder
, a
, index
, "");
289 for (i
= 1; i
< type
.length
; i
++) {
290 index
= lp_build_const_int32(bld
->gallivm
, i
);
292 res
= LLVMBuildFAdd(builder
, res
,
293 LLVMBuildExtractElement(builder
,
297 res
= LLVMBuildAdd(builder
, res
,
298 LLVMBuildExtractElement(builder
,
311 lp_build_sub(struct lp_build_context
*bld
,
315 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
316 const struct lp_type type
= bld
->type
;
319 assert(lp_check_value(type
, a
));
320 assert(lp_check_value(type
, b
));
324 if(a
== bld
->undef
|| b
== bld
->undef
)
330 const char *intrinsic
= NULL
;
335 if(util_cpu_caps
.has_sse2
&&
336 type
.width
* type
.length
== 128 &&
337 !type
.floating
&& !type
.fixed
) {
339 intrinsic
= type
.sign
? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
341 intrinsic
= type
.sign
? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
345 return lp_build_intrinsic_binary(builder
, intrinsic
, lp_build_vec_type(bld
->gallivm
, bld
->type
), a
, b
);
348 if(LLVMIsConstant(a
) && LLVMIsConstant(b
))
350 res
= LLVMConstFSub(a
, b
);
352 res
= LLVMConstSub(a
, b
);
355 res
= LLVMBuildFSub(builder
, a
, b
, "");
357 res
= LLVMBuildSub(builder
, a
, b
, "");
359 if(bld
->type
.norm
&& (bld
->type
.floating
|| bld
->type
.fixed
))
360 res
= lp_build_max_simple(bld
, res
, bld
->zero
);
367 * Normalized 8bit multiplication.
371 * makes the following approximation to the division (Sree)
373 * a*b/255 ~= (a*(b + 1)) >> 256
375 * which is the fastest method that satisfies the following OpenGL criteria
377 * 0*0 = 0 and 255*255 = 255
381 * takes the geometric series approximation to the division
383 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
385 * in this case just the first two terms to fit in 16bit arithmetic
387 * t/255 ~= (t + (t >> 8)) >> 8
389 * note that just by itself it doesn't satisfies the OpenGL criteria, as
390 * 255*255 = 254, so the special case b = 255 must be accounted or roundoff
393 * - geometric series plus rounding
395 * when using a geometric series division instead of truncating the result
396 * use roundoff in the approximation (Jim Blinn)
398 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
400 * achieving the exact results
402 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
403 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
404 * @sa Michael Herf, The "double blend trick", May 2000,
405 * http://www.stereopsis.com/doubleblend.html
408 lp_build_mul_u8n(struct gallivm_state
*gallivm
,
409 struct lp_type i16_type
,
410 LLVMValueRef a
, LLVMValueRef b
)
412 LLVMBuilderRef builder
= gallivm
->builder
;
416 assert(!i16_type
.floating
);
417 assert(lp_check_value(i16_type
, a
));
418 assert(lp_check_value(i16_type
, b
));
420 c8
= lp_build_const_int_vec(gallivm
, i16_type
, 8);
424 /* a*b/255 ~= (a*(b + 1)) >> 256 */
425 b
= LLVMBuildAdd(builder
, b
, lp_build_const_int_vec(gallium
, i16_type
, 1), "");
426 ab
= LLVMBuildMul(builder
, a
, b
, "");
430 /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */
431 ab
= LLVMBuildMul(builder
, a
, b
, "");
432 ab
= LLVMBuildAdd(builder
, ab
, LLVMBuildLShr(builder
, ab
, c8
, ""), "");
433 ab
= LLVMBuildAdd(builder
, ab
, lp_build_const_int_vec(gallivm
, i16_type
, 0x80), "");
437 ab
= LLVMBuildLShr(builder
, ab
, c8
, "");
447 lp_build_mul(struct lp_build_context
*bld
,
451 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
452 const struct lp_type type
= bld
->type
;
456 assert(lp_check_value(type
, a
));
457 assert(lp_check_value(type
, b
));
467 if(a
== bld
->undef
|| b
== bld
->undef
)
470 if(!type
.floating
&& !type
.fixed
&& type
.norm
) {
471 if(type
.width
== 8) {
472 struct lp_type i16_type
= lp_wider_type(type
);
473 LLVMValueRef al
, ah
, bl
, bh
, abl
, abh
, ab
;
475 lp_build_unpack2(bld
->gallivm
, type
, i16_type
, a
, &al
, &ah
);
476 lp_build_unpack2(bld
->gallivm
, type
, i16_type
, b
, &bl
, &bh
);
478 /* PMULLW, PSRLW, PADDW */
479 abl
= lp_build_mul_u8n(bld
->gallivm
, i16_type
, al
, bl
);
480 abh
= lp_build_mul_u8n(bld
->gallivm
, i16_type
, ah
, bh
);
482 ab
= lp_build_pack2(bld
->gallivm
, i16_type
, type
, abl
, abh
);
492 shift
= lp_build_const_int_vec(bld
->gallivm
, type
, type
.width
/2);
496 if(LLVMIsConstant(a
) && LLVMIsConstant(b
)) {
498 res
= LLVMConstFMul(a
, b
);
500 res
= LLVMConstMul(a
, b
);
503 res
= LLVMConstAShr(res
, shift
);
505 res
= LLVMConstLShr(res
, shift
);
510 res
= LLVMBuildFMul(builder
, a
, b
, "");
512 res
= LLVMBuildMul(builder
, a
, b
, "");
515 res
= LLVMBuildAShr(builder
, res
, shift
, "");
517 res
= LLVMBuildLShr(builder
, res
, shift
, "");
526 * Small vector x scale multiplication optimization.
529 lp_build_mul_imm(struct lp_build_context
*bld
,
533 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
536 assert(lp_check_value(bld
->type
, a
));
545 return lp_build_negate(bld
, a
);
547 if(b
== 2 && bld
->type
.floating
)
548 return lp_build_add(bld
, a
, a
);
550 if(util_is_power_of_two(b
)) {
551 unsigned shift
= ffs(b
) - 1;
553 if(bld
->type
.floating
) {
556 * Power of two multiplication by directly manipulating the mantissa.
558 * XXX: This might not be always faster, it will introduce a small error
559 * for multiplication by zero, and it will produce wrong results
562 unsigned mantissa
= lp_mantissa(bld
->type
);
563 factor
= lp_build_const_int_vec(bld
->gallivm
, bld
->type
, (unsigned long long)shift
<< mantissa
);
564 a
= LLVMBuildBitCast(builder
, a
, lp_build_int_vec_type(bld
->type
), "");
565 a
= LLVMBuildAdd(builder
, a
, factor
, "");
566 a
= LLVMBuildBitCast(builder
, a
, lp_build_vec_type(bld
->gallivm
, bld
->type
), "");
571 factor
= lp_build_const_vec(bld
->gallivm
, bld
->type
, shift
);
572 return LLVMBuildShl(builder
, a
, factor
, "");
576 factor
= lp_build_const_vec(bld
->gallivm
, bld
->type
, (double)b
);
577 return lp_build_mul(bld
, a
, factor
);
585 lp_build_div(struct lp_build_context
*bld
,
589 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
590 const struct lp_type type
= bld
->type
;
592 assert(lp_check_value(type
, a
));
593 assert(lp_check_value(type
, b
));
598 return lp_build_rcp(bld
, b
);
603 if(a
== bld
->undef
|| b
== bld
->undef
)
606 if(LLVMIsConstant(a
) && LLVMIsConstant(b
)) {
608 return LLVMConstFDiv(a
, b
);
610 return LLVMConstSDiv(a
, b
);
612 return LLVMConstUDiv(a
, b
);
615 if(util_cpu_caps
.has_sse
&& type
.width
== 32 && type
.length
== 4 &&
617 return lp_build_mul(bld
, a
, lp_build_rcp(bld
, b
));
620 return LLVMBuildFDiv(builder
, a
, b
, "");
622 return LLVMBuildSDiv(builder
, a
, b
, "");
624 return LLVMBuildUDiv(builder
, a
, b
, "");
629 * Linear interpolation -- without any checks.
631 * @sa http://www.stereopsis.com/doubleblend.html
633 static INLINE LLVMValueRef
634 lp_build_lerp_simple(struct lp_build_context
*bld
,
639 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
643 assert(lp_check_value(bld
->type
, x
));
644 assert(lp_check_value(bld
->type
, v0
));
645 assert(lp_check_value(bld
->type
, v1
));
647 delta
= lp_build_sub(bld
, v1
, v0
);
649 res
= lp_build_mul(bld
, x
, delta
);
651 res
= lp_build_add(bld
, v0
, res
);
653 if (bld
->type
.fixed
) {
654 /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
655 * but it will be wrong for other uses. Basically we need a more
656 * powerful lp_type, capable of further distinguishing the values
657 * interpretation from the value storage. */
658 res
= LLVMBuildAnd(builder
, res
, lp_build_const_int_vec(bld
->gallivm
, bld
->type
, (1 << bld
->type
.width
/2) - 1), "");
666 * Linear interpolation.
669 lp_build_lerp(struct lp_build_context
*bld
,
674 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
675 const struct lp_type type
= bld
->type
;
678 assert(lp_check_value(type
, x
));
679 assert(lp_check_value(type
, v0
));
680 assert(lp_check_value(type
, v1
));
683 struct lp_type wide_type
;
684 struct lp_build_context wide_bld
;
685 LLVMValueRef xl
, xh
, v0l
, v0h
, v1l
, v1h
, resl
, resh
;
688 assert(type
.length
>= 2);
692 * Create a wider type, enough to hold the intermediate result of the
695 memset(&wide_type
, 0, sizeof wide_type
);
696 wide_type
.fixed
= TRUE
;
697 wide_type
.width
= type
.width
*2;
698 wide_type
.length
= type
.length
/2;
700 lp_build_context_init(&wide_bld
, bld
->gallivm
, wide_type
);
702 lp_build_unpack2(bld
->gallivm
, type
, wide_type
, x
, &xl
, &xh
);
703 lp_build_unpack2(bld
->gallivm
, type
, wide_type
, v0
, &v0l
, &v0h
);
704 lp_build_unpack2(bld
->gallivm
, type
, wide_type
, v1
, &v1l
, &v1h
);
707 * Scale x from [0, 255] to [0, 256]
710 shift
= lp_build_const_int_vec(bld
->gallivm
, wide_type
, type
.width
- 1);
712 xl
= lp_build_add(&wide_bld
, xl
,
713 LLVMBuildAShr(builder
, xl
, shift
, ""));
714 xh
= lp_build_add(&wide_bld
, xh
,
715 LLVMBuildAShr(builder
, xh
, shift
, ""));
721 resl
= lp_build_lerp_simple(&wide_bld
, xl
, v0l
, v1l
);
722 resh
= lp_build_lerp_simple(&wide_bld
, xh
, v0h
, v1h
);
724 res
= lp_build_pack2(bld
->gallivm
, wide_type
, type
, resl
, resh
);
726 res
= lp_build_lerp_simple(bld
, x
, v0
, v1
);
734 lp_build_lerp_2d(struct lp_build_context
*bld
,
742 LLVMValueRef v0
= lp_build_lerp(bld
, x
, v00
, v01
);
743 LLVMValueRef v1
= lp_build_lerp(bld
, x
, v10
, v11
);
744 return lp_build_lerp(bld
, y
, v0
, v1
);
750 * Do checks for special cases.
753 lp_build_min(struct lp_build_context
*bld
,
757 assert(lp_check_value(bld
->type
, a
));
758 assert(lp_check_value(bld
->type
, b
));
760 if(a
== bld
->undef
|| b
== bld
->undef
)
767 if(a
== bld
->zero
|| b
== bld
->zero
)
775 return lp_build_min_simple(bld
, a
, b
);
781 * Do checks for special cases.
784 lp_build_max(struct lp_build_context
*bld
,
788 assert(lp_check_value(bld
->type
, a
));
789 assert(lp_check_value(bld
->type
, b
));
791 if(a
== bld
->undef
|| b
== bld
->undef
)
798 if(a
== bld
->one
|| b
== bld
->one
)
806 return lp_build_max_simple(bld
, a
, b
);
811 * Generate clamp(a, min, max)
812 * Do checks for special cases.
815 lp_build_clamp(struct lp_build_context
*bld
,
820 assert(lp_check_value(bld
->type
, a
));
821 assert(lp_check_value(bld
->type
, min
));
822 assert(lp_check_value(bld
->type
, max
));
824 a
= lp_build_min(bld
, a
, max
);
825 a
= lp_build_max(bld
, a
, min
);
834 lp_build_abs(struct lp_build_context
*bld
,
837 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
838 const struct lp_type type
= bld
->type
;
839 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
841 assert(lp_check_value(type
, a
));
847 /* Mask out the sign bit */
848 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
849 unsigned long long absMask
= ~(1ULL << (type
.width
- 1));
850 LLVMValueRef mask
= lp_build_const_int_vec(bld
->gallivm
, type
, ((unsigned long long) absMask
));
851 a
= LLVMBuildBitCast(builder
, a
, int_vec_type
, "");
852 a
= LLVMBuildAnd(builder
, a
, mask
, "");
853 a
= LLVMBuildBitCast(builder
, a
, vec_type
, "");
857 if(type
.width
*type
.length
== 128 && util_cpu_caps
.has_ssse3
) {
860 return lp_build_intrinsic_unary(builder
, "llvm.x86.ssse3.pabs.b.128", vec_type
, a
);
862 return lp_build_intrinsic_unary(builder
, "llvm.x86.ssse3.pabs.w.128", vec_type
, a
);
864 return lp_build_intrinsic_unary(builder
, "llvm.x86.ssse3.pabs.d.128", vec_type
, a
);
868 return lp_build_max(bld
, a
, LLVMBuildNeg(builder
, a
, ""));
873 lp_build_negate(struct lp_build_context
*bld
,
876 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
878 assert(lp_check_value(bld
->type
, a
));
880 #if HAVE_LLVM >= 0x0207
881 if (bld
->type
.floating
)
882 a
= LLVMBuildFNeg(builder
, a
, "");
885 a
= LLVMBuildNeg(builder
, a
, "");
891 /** Return -1, 0 or +1 depending on the sign of a */
893 lp_build_sgn(struct lp_build_context
*bld
,
896 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
897 const struct lp_type type
= bld
->type
;
901 assert(lp_check_value(type
, a
));
903 /* Handle non-zero case */
905 /* if not zero then sign must be positive */
908 else if(type
.floating
) {
909 LLVMTypeRef vec_type
;
910 LLVMTypeRef int_type
;
914 unsigned long long maskBit
= (unsigned long long)1 << (type
.width
- 1);
916 int_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
917 vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
918 mask
= lp_build_const_int_vec(bld
->gallivm
, type
, maskBit
);
920 /* Take the sign bit and add it to 1 constant */
921 sign
= LLVMBuildBitCast(builder
, a
, int_type
, "");
922 sign
= LLVMBuildAnd(builder
, sign
, mask
, "");
923 one
= LLVMConstBitCast(bld
->one
, int_type
);
924 res
= LLVMBuildOr(builder
, sign
, one
, "");
925 res
= LLVMBuildBitCast(builder
, res
, vec_type
, "");
929 LLVMValueRef minus_one
= lp_build_const_vec(bld
->gallivm
, type
, -1.0);
930 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, bld
->zero
);
931 res
= lp_build_select(bld
, cond
, bld
->one
, minus_one
);
935 cond
= lp_build_cmp(bld
, PIPE_FUNC_EQUAL
, a
, bld
->zero
);
936 res
= lp_build_select(bld
, cond
, bld
->zero
, res
);
943 * Set the sign of float vector 'a' according to 'sign'.
944 * If sign==0, return abs(a).
945 * If sign==1, return -abs(a);
946 * Other values for sign produce undefined results.
949 lp_build_set_sign(struct lp_build_context
*bld
,
950 LLVMValueRef a
, LLVMValueRef sign
)
952 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
953 const struct lp_type type
= bld
->type
;
954 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
955 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
956 LLVMValueRef shift
= lp_build_const_int_vec(bld
->gallivm
, type
, type
.width
- 1);
957 LLVMValueRef mask
= lp_build_const_int_vec(bld
->gallivm
, type
,
958 ~((unsigned long long) 1 << (type
.width
- 1)));
959 LLVMValueRef val
, res
;
961 assert(type
.floating
);
962 assert(lp_check_value(type
, a
));
964 /* val = reinterpret_cast<int>(a) */
965 val
= LLVMBuildBitCast(builder
, a
, int_vec_type
, "");
966 /* val = val & mask */
967 val
= LLVMBuildAnd(builder
, val
, mask
, "");
968 /* sign = sign << shift */
969 sign
= LLVMBuildShl(builder
, sign
, shift
, "");
970 /* res = val | sign */
971 res
= LLVMBuildOr(builder
, val
, sign
, "");
972 /* res = reinterpret_cast<float>(res) */
973 res
= LLVMBuildBitCast(builder
, res
, vec_type
, "");
980 * Convert vector of (or scalar) int to vector of (or scalar) float.
983 lp_build_int_to_float(struct lp_build_context
*bld
,
986 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
987 const struct lp_type type
= bld
->type
;
988 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
990 assert(type
.floating
);
992 return LLVMBuildSIToFP(builder
, a
, vec_type
, "");
997 enum lp_build_round_sse41_mode
999 LP_BUILD_ROUND_SSE41_NEAREST
= 0,
1000 LP_BUILD_ROUND_SSE41_FLOOR
= 1,
1001 LP_BUILD_ROUND_SSE41_CEIL
= 2,
1002 LP_BUILD_ROUND_SSE41_TRUNCATE
= 3
1007 * Helper for SSE4.1's ROUNDxx instructions.
1009 * NOTE: In the SSE4.1's nearest mode, if two values are equally close, the
1010 * result is the even value. That is, rounding 2.5 will be 2.0, and not 3.0.
1012 static INLINE LLVMValueRef
1013 lp_build_round_sse41(struct lp_build_context
*bld
,
1015 enum lp_build_round_sse41_mode mode
)
1017 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1018 const struct lp_type type
= bld
->type
;
1019 LLVMTypeRef i32t
= LLVMInt32TypeInContext(bld
->gallivm
->context
);
1020 const char *intrinsic
;
1023 assert(type
.floating
);
1025 assert(lp_check_value(type
, a
));
1026 assert(util_cpu_caps
.has_sse4_1
);
1028 if (type
.length
== 1) {
1029 LLVMTypeRef vec_type
;
1031 LLVMValueRef args
[3];
1032 LLVMValueRef index0
= LLVMConstInt(i32t
, 0, 0);
1034 switch(type
.width
) {
1036 intrinsic
= "llvm.x86.sse41.round.ss";
1039 intrinsic
= "llvm.x86.sse41.round.sd";
1046 vec_type
= LLVMVectorType(bld
->elem_type
, 4);
1048 undef
= LLVMGetUndef(vec_type
);
1051 args
[1] = LLVMBuildInsertElement(builder
, undef
, a
, index0
, "");
1052 args
[2] = LLVMConstInt(i32t
, mode
, 0);
1054 res
= lp_build_intrinsic(builder
, intrinsic
,
1055 vec_type
, args
, Elements(args
));
1057 res
= LLVMBuildExtractElement(builder
, res
, index0
, "");
1060 assert(type
.width
*type
.length
== 128);
1062 switch(type
.width
) {
1064 intrinsic
= "llvm.x86.sse41.round.ps";
1067 intrinsic
= "llvm.x86.sse41.round.pd";
1074 res
= lp_build_intrinsic_binary(builder
, intrinsic
,
1076 LLVMConstInt(i32t
, mode
, 0));
1083 static INLINE LLVMValueRef
1084 lp_build_iround_nearest_sse2(struct lp_build_context
*bld
,
1087 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1088 const struct lp_type type
= bld
->type
;
1089 LLVMTypeRef i32t
= LLVMInt32TypeInContext(bld
->gallivm
->context
);
1090 LLVMTypeRef ret_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
1091 const char *intrinsic
;
1094 assert(type
.floating
);
1095 /* using the double precision conversions is a bit more complicated */
1096 assert(type
.width
== 32);
1098 assert(lp_check_value(type
, a
));
1099 assert(util_cpu_caps
.has_sse2
);
1101 /* This is relying on MXCSR rounding mode, which should always be nearest. */
1102 if (type
.length
== 1) {
1103 LLVMTypeRef vec_type
;
1106 LLVMValueRef index0
= LLVMConstInt(i32t
, 0, 0);
1108 vec_type
= LLVMVectorType(bld
->elem_type
, 4);
1110 intrinsic
= "llvm.x86.sse.cvtss2si";
1112 undef
= LLVMGetUndef(vec_type
);
1114 arg
= LLVMBuildInsertElement(builder
, undef
, a
, index0
, "");
1116 res
= lp_build_intrinsic_unary(builder
, intrinsic
,
1120 assert(type
.width
*type
.length
== 128);
1122 intrinsic
= "llvm.x86.sse2.cvtps2dq";
1124 res
= lp_build_intrinsic_unary(builder
, intrinsic
,
1133 * Return the integer part of a float (vector) value (== round toward zero).
1134 * The returned value is a float (vector).
1135 * Ex: trunc(-1.5) = -1.0
1138 lp_build_trunc(struct lp_build_context
*bld
,
1141 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1142 const struct lp_type type
= bld
->type
;
1144 assert(type
.floating
);
1145 assert(lp_check_value(type
, a
));
1147 if (util_cpu_caps
.has_sse4_1
&&
1148 (type
.length
== 1 || type
.width
*type
.length
== 128)) {
1149 return lp_build_round_sse41(bld
, a
, LP_BUILD_ROUND_SSE41_TRUNCATE
);
1152 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
1153 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
1155 res
= LLVMBuildFPToSI(builder
, a
, int_vec_type
, "");
1156 res
= LLVMBuildSIToFP(builder
, res
, vec_type
, "");
1163 * Return float (vector) rounded to nearest integer (vector). The returned
1164 * value is a float (vector).
1165 * Ex: round(0.9) = 1.0
1166 * Ex: round(-1.5) = -2.0
1169 lp_build_round(struct lp_build_context
*bld
,
1172 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1173 const struct lp_type type
= bld
->type
;
1175 assert(type
.floating
);
1176 assert(lp_check_value(type
, a
));
1178 if (util_cpu_caps
.has_sse4_1
&&
1179 (type
.length
== 1 || type
.width
*type
.length
== 128)) {
1180 return lp_build_round_sse41(bld
, a
, LP_BUILD_ROUND_SSE41_NEAREST
);
1183 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
1185 res
= lp_build_iround(bld
, a
);
1186 res
= LLVMBuildSIToFP(builder
, res
, vec_type
, "");
1193 * Return floor of float (vector), result is a float (vector)
1194 * Ex: floor(1.1) = 1.0
1195 * Ex: floor(-1.1) = -2.0
1198 lp_build_floor(struct lp_build_context
*bld
,
1201 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1202 const struct lp_type type
= bld
->type
;
1204 assert(type
.floating
);
1205 assert(lp_check_value(type
, a
));
1207 if (util_cpu_caps
.has_sse4_1
&&
1208 (type
.length
== 1 || type
.width
*type
.length
== 128)) {
1209 return lp_build_round_sse41(bld
, a
, LP_BUILD_ROUND_SSE41_FLOOR
);
1212 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
1214 res
= lp_build_ifloor(bld
, a
);
1215 res
= LLVMBuildSIToFP(builder
, res
, vec_type
, "");
1222 * Return ceiling of float (vector), returning float (vector).
1223 * Ex: ceil( 1.1) = 2.0
1224 * Ex: ceil(-1.1) = -1.0
1227 lp_build_ceil(struct lp_build_context
*bld
,
1230 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1231 const struct lp_type type
= bld
->type
;
1233 assert(type
.floating
);
1234 assert(lp_check_value(type
, a
));
1236 if (util_cpu_caps
.has_sse4_1
&&
1237 (type
.length
== 1 || type
.width
*type
.length
== 128)) {
1238 return lp_build_round_sse41(bld
, a
, LP_BUILD_ROUND_SSE41_CEIL
);
1241 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
1243 res
= lp_build_iceil(bld
, a
);
1244 res
= LLVMBuildSIToFP(builder
, res
, vec_type
, "");
1251 * Return fractional part of 'a' computed as a - floor(a)
1252 * Typically used in texture coord arithmetic.
1255 lp_build_fract(struct lp_build_context
*bld
,
1258 assert(bld
->type
.floating
);
1259 return lp_build_sub(bld
, a
, lp_build_floor(bld
, a
));
1264 * Return the integer part of a float (vector) value (== round toward zero).
1265 * The returned value is an integer (vector).
1266 * Ex: itrunc(-1.5) = -1
1269 lp_build_itrunc(struct lp_build_context
*bld
,
1272 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1273 const struct lp_type type
= bld
->type
;
1274 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
1276 assert(type
.floating
);
1277 assert(lp_check_value(type
, a
));
1279 return LLVMBuildFPToSI(builder
, a
, int_vec_type
, "");
1284 * Return float (vector) rounded to nearest integer (vector). The returned
1285 * value is an integer (vector).
1286 * Ex: iround(0.9) = 1
1287 * Ex: iround(-1.5) = -2
1290 lp_build_iround(struct lp_build_context
*bld
,
1293 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1294 const struct lp_type type
= bld
->type
;
1295 LLVMTypeRef int_vec_type
= bld
->int_vec_type
;
1298 assert(type
.floating
);
1300 assert(lp_check_value(type
, a
));
1302 if (util_cpu_caps
.has_sse2
&&
1303 ((type
.width
== 32) && (type
.length
== 1 || type
.length
== 4))) {
1304 return lp_build_iround_nearest_sse2(bld
, a
);
1306 else if (util_cpu_caps
.has_sse4_1
&&
1307 (type
.length
== 1 || type
.width
*type
.length
== 128)) {
1308 res
= lp_build_round_sse41(bld
, a
, LP_BUILD_ROUND_SSE41_NEAREST
);
1313 half
= lp_build_const_vec(bld
->gallivm
, type
, 0.5);
1316 LLVMTypeRef vec_type
= bld
->vec_type
;
1317 LLVMValueRef mask
= lp_build_const_int_vec(bld
->gallivm
, type
,
1318 (unsigned long long)1 << (type
.width
- 1));
1322 sign
= LLVMBuildBitCast(builder
, a
, int_vec_type
, "");
1323 sign
= LLVMBuildAnd(builder
, sign
, mask
, "");
1326 half
= LLVMBuildBitCast(builder
, half
, int_vec_type
, "");
1327 half
= LLVMBuildOr(builder
, sign
, half
, "");
1328 half
= LLVMBuildBitCast(builder
, half
, vec_type
, "");
1331 res
= LLVMBuildFAdd(builder
, a
, half
, "");
1334 res
= LLVMBuildFPToSI(builder
, res
, int_vec_type
, "");
1341 * Return floor of float (vector), result is an int (vector)
1342 * Ex: ifloor(1.1) = 1.0
1343 * Ex: ifloor(-1.1) = -2.0
1346 lp_build_ifloor(struct lp_build_context
*bld
,
1349 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1350 const struct lp_type type
= bld
->type
;
1351 LLVMTypeRef int_vec_type
= bld
->int_vec_type
;
1354 assert(type
.floating
);
1355 assert(lp_check_value(type
, a
));
1357 if (util_cpu_caps
.has_sse4_1
&&
1358 (type
.length
== 1 || type
.width
*type
.length
== 128)) {
1359 res
= lp_build_round_sse41(bld
, a
, LP_BUILD_ROUND_SSE41_FLOOR
);
1365 /* Take the sign bit and add it to 1 constant */
1366 LLVMTypeRef vec_type
= bld
->vec_type
;
1367 unsigned mantissa
= lp_mantissa(type
);
1368 LLVMValueRef mask
= lp_build_const_int_vec(bld
->gallivm
, type
,
1369 (unsigned long long)1 << (type
.width
- 1));
1371 LLVMValueRef offset
;
1373 /* sign = a < 0 ? ~0 : 0 */
1374 sign
= LLVMBuildBitCast(builder
, a
, int_vec_type
, "");
1375 sign
= LLVMBuildAnd(builder
, sign
, mask
, "");
1376 sign
= LLVMBuildAShr(builder
, sign
,
1377 lp_build_const_int_vec(bld
->gallivm
, type
,
1381 /* offset = -0.99999(9)f */
1382 offset
= lp_build_const_vec(bld
->gallivm
, type
,
1383 -(double)(((unsigned long long)1 << mantissa
) - 10)/((unsigned long long)1 << mantissa
));
1384 offset
= LLVMConstBitCast(offset
, int_vec_type
);
1386 /* offset = a < 0 ? offset : 0.0f */
1387 offset
= LLVMBuildAnd(builder
, offset
, sign
, "");
1388 offset
= LLVMBuildBitCast(builder
, offset
, vec_type
, "ifloor.offset");
1390 res
= LLVMBuildFAdd(builder
, res
, offset
, "ifloor.res");
1394 /* round to nearest (toward zero) */
1395 res
= LLVMBuildFPToSI(builder
, res
, int_vec_type
, "ifloor.res");
1402 * Return ceiling of float (vector), returning int (vector).
1403 * Ex: iceil( 1.1) = 2
1404 * Ex: iceil(-1.1) = -1
1407 lp_build_iceil(struct lp_build_context
*bld
,
1410 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1411 const struct lp_type type
= bld
->type
;
1412 LLVMTypeRef int_vec_type
= bld
->int_vec_type
;
1415 assert(type
.floating
);
1416 assert(lp_check_value(type
, a
));
1418 if (util_cpu_caps
.has_sse4_1
&&
1419 (type
.length
== 1 || type
.width
*type
.length
== 128)) {
1420 res
= lp_build_round_sse41(bld
, a
, LP_BUILD_ROUND_SSE41_CEIL
);
1423 LLVMTypeRef vec_type
= bld
->vec_type
;
1424 unsigned mantissa
= lp_mantissa(type
);
1425 LLVMValueRef offset
;
1427 /* offset = 0.99999(9)f */
1428 offset
= lp_build_const_vec(bld
->gallivm
, type
,
1429 (double)(((unsigned long long)1 << mantissa
) - 10)/((unsigned long long)1 << mantissa
));
1432 LLVMValueRef mask
= lp_build_const_int_vec(bld
->gallivm
, type
,
1433 (unsigned long long)1 << (type
.width
- 1));
1436 /* sign = a < 0 ? 0 : ~0 */
1437 sign
= LLVMBuildBitCast(builder
, a
, int_vec_type
, "");
1438 sign
= LLVMBuildAnd(builder
, sign
, mask
, "");
1439 sign
= LLVMBuildAShr(builder
, sign
,
1440 lp_build_const_int_vec(bld
->gallivm
, type
,
1443 sign
= LLVMBuildNot(builder
, sign
, "iceil.not");
1445 /* offset = a < 0 ? 0.0 : offset */
1446 offset
= LLVMConstBitCast(offset
, int_vec_type
);
1447 offset
= LLVMBuildAnd(builder
, offset
, sign
, "");
1448 offset
= LLVMBuildBitCast(builder
, offset
, vec_type
, "iceil.offset");
1451 res
= LLVMBuildFAdd(builder
, a
, offset
, "iceil.res");
1454 /* round to nearest (toward zero) */
1455 res
= LLVMBuildFPToSI(builder
, res
, int_vec_type
, "iceil.res");
1462 * Combined ifloor() & fract().
1464 * Preferred to calling the functions separately, as it will ensure that the
1465 * stratergy (floor() vs ifloor()) that results in less redundant work is used.
1468 lp_build_ifloor_fract(struct lp_build_context
*bld
,
1470 LLVMValueRef
*out_ipart
,
1471 LLVMValueRef
*out_fpart
)
1473 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1474 const struct lp_type type
= bld
->type
;
1477 assert(type
.floating
);
1478 assert(lp_check_value(type
, a
));
1480 if (util_cpu_caps
.has_sse4_1
&&
1481 (type
.length
== 1 || type
.width
*type
.length
== 128)) {
1483 * floor() is easier.
1486 ipart
= lp_build_floor(bld
, a
);
1487 *out_fpart
= LLVMBuildFSub(builder
, a
, ipart
, "fpart");
1488 *out_ipart
= LLVMBuildFPToSI(builder
, ipart
, bld
->int_vec_type
, "ipart");
1492 * ifloor() is easier.
1495 *out_ipart
= lp_build_ifloor(bld
, a
);
1496 ipart
= LLVMBuildSIToFP(builder
, *out_ipart
, bld
->vec_type
, "ipart");
1497 *out_fpart
= LLVMBuildFSub(builder
, a
, ipart
, "fpart");
1503 lp_build_sqrt(struct lp_build_context
*bld
,
1506 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1507 const struct lp_type type
= bld
->type
;
1508 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
1511 assert(lp_check_value(type
, a
));
1513 /* TODO: optimize the constant case */
1514 /* TODO: optimize the constant case */
1516 assert(type
.floating
);
1517 util_snprintf(intrinsic
, sizeof intrinsic
, "llvm.sqrt.v%uf%u", type
.length
, type
.width
);
1519 return lp_build_intrinsic_unary(builder
, intrinsic
, vec_type
, a
);
1524 * Do one Newton-Raphson step to improve reciprocate precision:
1526 * x_{i+1} = x_i * (2 - a * x_i)
1528 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
1529 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
1530 * such as Google Earth, which does RCP(RSQRT(0.0) when drawing the Earth's
1531 * halo. It would be necessary to clamp the argument to prevent this.
1534 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
1535 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
1537 static INLINE LLVMValueRef
1538 lp_build_rcp_refine(struct lp_build_context
*bld
,
1542 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1543 LLVMValueRef two
= lp_build_const_vec(bld
->gallivm
, bld
->type
, 2.0);
1546 res
= LLVMBuildFMul(builder
, a
, rcp_a
, "");
1547 res
= LLVMBuildFSub(builder
, two
, res
, "");
1548 res
= LLVMBuildFMul(builder
, rcp_a
, res
, "");
1555 lp_build_rcp(struct lp_build_context
*bld
,
1558 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1559 const struct lp_type type
= bld
->type
;
1561 assert(lp_check_value(type
, a
));
1570 assert(type
.floating
);
1572 if(LLVMIsConstant(a
))
1573 return LLVMConstFDiv(bld
->one
, a
);
1576 * We don't use RCPPS because:
1577 * - it only has 10bits of precision
1578 * - it doesn't even get the reciprocate of 1.0 exactly
1579 * - doing Newton-Rapshon steps yields wrong (NaN) values for 0.0 or Inf
1580 * - for recent processors the benefit over DIVPS is marginal, a case
1583 * We could still use it on certain processors if benchmarks show that the
1584 * RCPPS plus necessary workarounds are still preferrable to DIVPS; or for
1585 * particular uses that require less workarounds.
1588 if (FALSE
&& util_cpu_caps
.has_sse
&& type
.width
== 32 && type
.length
== 4) {
1589 const unsigned num_iterations
= 0;
1593 res
= lp_build_intrinsic_unary(builder
, "llvm.x86.sse.rcp.ps", bld
->vec_type
, a
);
1595 for (i
= 0; i
< num_iterations
; ++i
) {
1596 res
= lp_build_rcp_refine(bld
, a
, res
);
1602 return LLVMBuildFDiv(builder
, bld
->one
, a
, "");
1607 * Do one Newton-Raphson step to improve rsqrt precision:
1609 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
1612 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
1614 static INLINE LLVMValueRef
1615 lp_build_rsqrt_refine(struct lp_build_context
*bld
,
1617 LLVMValueRef rsqrt_a
)
1619 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1620 LLVMValueRef half
= lp_build_const_vec(bld
->gallivm
, bld
->type
, 0.5);
1621 LLVMValueRef three
= lp_build_const_vec(bld
->gallivm
, bld
->type
, 3.0);
1624 res
= LLVMBuildFMul(builder
, rsqrt_a
, rsqrt_a
, "");
1625 res
= LLVMBuildFMul(builder
, a
, res
, "");
1626 res
= LLVMBuildFSub(builder
, three
, res
, "");
1627 res
= LLVMBuildFMul(builder
, rsqrt_a
, res
, "");
1628 res
= LLVMBuildFMul(builder
, half
, res
, "");
1635 * Generate 1/sqrt(a)
1638 lp_build_rsqrt(struct lp_build_context
*bld
,
1641 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1642 const struct lp_type type
= bld
->type
;
1644 assert(lp_check_value(type
, a
));
1646 assert(type
.floating
);
1648 if (util_cpu_caps
.has_sse
&& type
.width
== 32 && type
.length
== 4) {
1649 const unsigned num_iterations
= 1;
1653 res
= lp_build_intrinsic_unary(builder
, "llvm.x86.sse.rsqrt.ps", bld
->vec_type
, a
);
1655 for (i
= 0; i
< num_iterations
; ++i
) {
1656 res
= lp_build_rsqrt_refine(bld
, a
, res
);
1662 return lp_build_rcp(bld
, lp_build_sqrt(bld
, a
));
/**
 * Generate sin(a) using SSE2.
 *
 * Branch-free cephes-style range reduction + two minimax polynomials,
 * ported from Julien Pommier's sse_mathfun.h (the _mm_* lines quoted in
 * the comments are the original SSE intrinsic sequence being mirrored).
 */
LLVMValueRef
lp_build_sin(struct lp_build_context *bld,
             LLVMValueRef a)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type int_type = lp_int_type(bld->type);
   LLVMBuilderRef b = builder;

   /*
    * take the absolute value,
    * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
    */
   LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");

   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");

   /*
    * extract the sign bit (upper one)
    * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
    */
   LLVMValueRef sig_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
   LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i");

   /*
    * scale by 4/Pi
    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
    */
   LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");

   /*
    * store the integer part of y in mm0
    * emm2 = _mm_cvttps_epi32(y);
    */
   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");

   /*
    * j=(j+1) & (~1) (see the cephes sources)
    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
    */
   LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
   LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");

   /*
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
    */
   LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
   LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");

   /*
    * y = _mm_cvtepi32_ps(emm2);
    */
   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");

   /* get the swap sign flag
    * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
    * (bit 2 is unaffected by the &~1 above, so emm2_add is equivalent here)
    */
   LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
   LLVMValueRef emm0_and = LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and");

   /*
    * shift it into the float sign-bit position
    * emm2 = _mm_slli_epi32(emm0, 29);
    */
   LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
   LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit");

   /*
    * get the polynom selection mask
    * there is one polynom for 0 <= x <= Pi/4
    * and another one for Pi/4 < x <= Pi/2
    * Both branches will be computed.
    *
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
    */
   LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
   LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3");
   LLVMValueRef poly_mask = lp_build_compare(gallivm,
                                             int_type, PIPE_FUNC_EQUAL,
                                             emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));

   /*
    * sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
    */
   LLVMValueRef sign_bit_1 = LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit");

   /*
    * _PS_CONST(minus_cephes_DP1, -0.78515625);
    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
    */
   LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
   LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
   LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);

   /*
    * The magic pass: "Extended precision modular arithmetic"
    * x = ((x - y * DP1) - y * DP2) - y * DP3;
    * xmm1 = _mm_mul_ps(y, xmm1);
    * xmm2 = _mm_mul_ps(y, xmm2);
    * xmm3 = _mm_mul_ps(y, xmm3);
    */
   LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
   LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
   LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");

   /*
    * x = _mm_add_ps(x, xmm1);
    * x = _mm_add_ps(x, xmm2);
    * x = _mm_add_ps(x, xmm3);
    */
   LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
   LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
   LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");

   /*
    * Evaluate the first polynom (0 <= x <= Pi/4)
    *
    * z = _mm_mul_ps(x,x);
    */
   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");

   /*
    * _PS_CONST(coscof_p0,  2.443315711809948E-005);
    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
    * _PS_CONST(coscof_p2,  4.166664568298827E-002);
    */
   LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
   LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
   LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);

   /*
    * Horner evaluation of the cosine polynomial in z:
    * y = *(v4sf*)_ps_coscof_p0;
    * y = _mm_mul_ps(y, z);
    */
   LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
   LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
   LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
   LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");

   /*
    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
    * y = _mm_sub_ps(y, tmp);
    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
    */
   LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
   LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");

   /*
    * _PS_CONST(sincof_p0, -1.9515295891E-4);
    * _PS_CONST(sincof_p1,  8.3321608736E-3);
    * _PS_CONST(sincof_p2, -1.6666654611E-1);
    */
   LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
   LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
   LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);

   /*
    * Evaluate the second polynom (Pi/4 <= x <= Pi/2)
    *
    * y2 = *(v4sf*)_ps_sincof_p0;
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_mul_ps(y2, x);
    * y2 = _mm_add_ps(y2, x);
    */
   LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
   LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
   LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
   LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
   LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
   LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");

   /*
    * select the correct result from the two polynoms
    *
    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
    * y = _mm_andnot_ps(xmm3, y);
    * y = _mm_add_ps(y,y2);
    */
   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
   LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0);
   LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
   LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");

   /*
    * update the sign
    * y = _mm_xor_ps(y, sign_bit);
    */
   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin");
   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
   return y_result;
}
/**
 * Generate cos(a) using SSE2.
 *
 * Same cephes-style range reduction and polynomials as lp_build_sin(),
 * ported from Julien Pommier's sse_mathfun.h; cos differs only in the
 * quadrant bias (subtract 2) and sign computation.
 */
LLVMValueRef
lp_build_cos(struct lp_build_context *bld,
             LLVMValueRef a)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type int_type = lp_int_type(bld->type);
   LLVMBuilderRef b = builder;

   /*
    * take the absolute value,
    * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
    */
   LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");

   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");

   /*
    * scale by 4/Pi
    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
    */
   LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");

   /*
    * store the integer part of y in mm0
    * emm2 = _mm_cvttps_epi32(y);
    */
   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");

   /*
    * j=(j+1) & (~1) (see the cephes sources)
    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
    */
   LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
   LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");

   /*
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
    */
   LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
   LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");

   /*
    * y = _mm_cvtepi32_ps(emm2);
    */
   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");

   /*
    * cos-specific quadrant bias:
    * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
    */
   LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
   LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2");

   /* get the swap sign flag
    * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
    */
   LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0);
   LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not");
   LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
   LLVMValueRef emm0_and = LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and");

   /*
    * shift into the float sign-bit position
    * emm2 = _mm_slli_epi32(emm0, 29);
    */
   LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
   LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit");

   /*
    * get the polynom selection mask
    * there is one polynom for 0 <= x <= Pi/4
    * and another one for Pi/4 < x <= Pi/2
    * Both branches will be computed.
    *
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
    */
   LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
   LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3");
   LLVMValueRef poly_mask = lp_build_compare(gallivm,
                                             int_type, PIPE_FUNC_EQUAL,
                                             emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));

   /*
    * _PS_CONST(minus_cephes_DP1, -0.78515625);
    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
    */
   LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
   LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
   LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);

   /*
    * The magic pass: "Extended precision modular arithmetic"
    * x = ((x - y * DP1) - y * DP2) - y * DP3;
    * xmm1 = _mm_mul_ps(y, xmm1);
    * xmm2 = _mm_mul_ps(y, xmm2);
    * xmm3 = _mm_mul_ps(y, xmm3);
    */
   LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
   LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
   LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");

   /*
    * x = _mm_add_ps(x, xmm1);
    * x = _mm_add_ps(x, xmm2);
    * x = _mm_add_ps(x, xmm3);
    */
   LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
   LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
   LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");

   /*
    * Evaluate the first polynom (0 <= x <= Pi/4)
    *
    * z = _mm_mul_ps(x,x);
    */
   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");

   /*
    * _PS_CONST(coscof_p0,  2.443315711809948E-005);
    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
    * _PS_CONST(coscof_p2,  4.166664568298827E-002);
    */
   LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
   LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
   LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);

   /*
    * Horner evaluation of the cosine polynomial in z:
    * y = *(v4sf*)_ps_coscof_p0;
    * y = _mm_mul_ps(y, z);
    */
   LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
   LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
   LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
   LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");

   /*
    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
    * y = _mm_sub_ps(y, tmp);
    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
    */
   LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
   LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");

   /*
    * _PS_CONST(sincof_p0, -1.9515295891E-4);
    * _PS_CONST(sincof_p1,  8.3321608736E-3);
    * _PS_CONST(sincof_p2, -1.6666654611E-1);
    */
   LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
   LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
   LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);

   /*
    * Evaluate the second polynom (Pi/4 <= x <= Pi/2)
    *
    * y2 = *(v4sf*)_ps_sincof_p0;
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_mul_ps(y2, x);
    * y2 = _mm_add_ps(y2, x);
    */
   LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
   LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
   LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
   LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
   LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
   LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");

   /*
    * select the correct result from the two polynoms
    *
    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
    * y = _mm_andnot_ps(xmm3, y);
    * y = _mm_add_ps(y,y2);
    */
   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
   LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
   LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");

   /*
    * update the sign
    * y = _mm_xor_ps(y, sign_bit);
    */
   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sin");
   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
   return y_result;
}
2103 * Generate pow(x, y)
2106 lp_build_pow(struct lp_build_context
*bld
,
2110 /* TODO: optimize the constant case */
2111 if (gallivm_debug
& GALLIVM_DEBUG_PERF
&&
2112 LLVMIsConstant(x
) && LLVMIsConstant(y
)) {
2113 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2117 return lp_build_exp2(bld
, lp_build_mul(bld
, lp_build_log2(bld
, x
), y
));
2125 lp_build_exp(struct lp_build_context
*bld
,
2128 /* log2(e) = 1/log(2) */
2129 LLVMValueRef log2e
= lp_build_const_vec(bld
->gallivm
, bld
->type
,
2130 1.4426950408889634);
2132 assert(lp_check_value(bld
->type
, x
));
2134 return lp_build_exp2(bld
, lp_build_mul(bld
, log2e
, x
));
2142 lp_build_log(struct lp_build_context
*bld
,
2146 LLVMValueRef log2
= lp_build_const_vec(bld
->gallivm
, bld
->type
,
2147 0.69314718055994529);
2149 assert(lp_check_value(bld
->type
, x
));
2151 return lp_build_mul(bld
, log2
, lp_build_log2(bld
, x
));
/**
 * Generate polynomial.
 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
 */
static LLVMValueRef
lp_build_polynomial(struct lp_build_context *bld,
                    LLVMValueRef x,
                    const double *coeffs,
                    unsigned num_coeffs)
{
   const struct lp_type type = bld->type;
   LLVMValueRef even = NULL, odd = NULL;
   LLVMValueRef x2;
   unsigned i;

   assert(lp_check_value(bld->type, x));

   /* TODO: optimize the constant case */
   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
       LLVMIsConstant(x)) {
      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                   __FUNCTION__);
   }

   /*
    * Calculate odd and even terms separately to decrease data dependency
    * Ex:
    *     c[0] + x^2 * c[2] + x^4 * c[4] ...
    *     + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
    */
   x2 = lp_build_mul(bld, x, x);

   /* Horner evaluation, highest coefficient first. */
   for (i = num_coeffs; i--; ) {
      LLVMValueRef coeff;

      coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);

      if (i % 2 == 0) {
         if (even)
            even = lp_build_add(bld, coeff, lp_build_mul(bld, x2, even));
         else
            even = coeff;
      }
      else {
         if (odd)
            odd = lp_build_add(bld, coeff, lp_build_mul(bld, x2, odd));
         else
            odd = coeff;
      }
   }

   /* Recombine: even(x^2) + x * odd(x^2). */
   if (odd)
      return lp_build_add(bld, lp_build_mul(bld, odd, x), even);
   else if (even)
      return even;
   else
      return bld->undef;
}
/**
 * Minimax polynomial fit of 2**x, in range [0, 1[
 * Coefficients are ordered from the constant term upward; the degree used
 * is selected at compile time via EXP_POLY_DEGREE.
 */
const double lp_build_exp2_polynomial[] = {
#if EXP_POLY_DEGREE == 5
   0.999999925063526176901,
   0.693153073200168932794,
   0.240153617044375388211,
   0.0558263180532956664775,
   0.00898934009049466391101,
   0.00187757667519147912699
#elif EXP_POLY_DEGREE == 4
   1.00000259337069434683,
   0.693003834469974940458,
   0.24144275689150793076,
   0.0520114606103070150235,
   0.0135341679161270268764
#elif EXP_POLY_DEGREE == 3
   0.999925218562710312959,
   0.695833540494823811697,
   0.226067155427249155588,
   0.0780245226406372992967
#elif EXP_POLY_DEGREE == 2
   1.00172476321474503578,
   0.657636275736077639316,
   0.33718943461968720704
#else
#error
#endif
};
/**
 * Approximate exp2(x), optionally also returning the integer and
 * fractional decomposition.
 *
 * Any of the output pointers may be NULL; only the requested values are
 * computed.  Works on 32-bit floats only (asserted).
 */
void
lp_build_exp2_approx(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef *p_exp2_int_part,
                     LLVMValueRef *p_frac_part,
                     LLVMValueRef *p_exp2)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMValueRef ipart = NULL;
   LLVMValueRef fpart = NULL;
   LLVMValueRef expipart = NULL;
   LLVMValueRef expfpart = NULL;
   LLVMValueRef res = NULL;

   assert(lp_check_value(bld->type, x));

   if(p_exp2_int_part || p_frac_part || p_exp2) {
      /* TODO: optimize the constant case */
      if (gallivm_debug & GALLIVM_DEBUG_PERF &&
          LLVMIsConstant(x)) {
         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                      __FUNCTION__);
      }

      assert(type.floating && type.width == 32);

      /* Clamp to the exponent range representable in a 32-bit float. */
      x = lp_build_min(bld, x, lp_build_const_vec(bld->gallivm, type,  129.0));
      x = lp_build_max(bld, x, lp_build_const_vec(bld->gallivm, type, -126.99999));

      /* ipart = floor(x) */
      /* fpart = x - ipart */
      lp_build_ifloor_fract(bld, x, &ipart, &fpart);
   }

   if(p_exp2_int_part || p_exp2) {
      /* expipart = (float) (1 << ipart), built by placing ipart+127
       * directly into the float exponent field. */
      expipart = LLVMBuildAdd(builder, ipart,
                              lp_build_const_int_vec(bld->gallivm, type, 127), "");
      expipart = LLVMBuildShl(builder, expipart,
                              lp_build_const_int_vec(bld->gallivm, type, 23), "");
      expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
   }

   if(p_exp2) {
      /* expfpart = polynomial approximation of 2**fpart, fpart in [0,1[ */
      expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
                                     Elements(lp_build_exp2_polynomial));

      res = LLVMBuildFMul(builder, expipart, expfpart, "");
   }

   if(p_exp2_int_part)
      *p_exp2_int_part = expipart;

   if(p_frac_part)
      *p_frac_part = fpart;

   if(p_exp2)
      *p_exp2 = res;
}
2310 lp_build_exp2(struct lp_build_context
*bld
,
2314 lp_build_exp2_approx(bld
, x
, NULL
, NULL
, &res
);
2320 * Extract the exponent of a IEEE-754 floating point value.
2322 * Optionally apply an integer bias.
2324 * Result is an integer value with
2326 * ifloor(log2(x)) + bias
2329 lp_build_extract_exponent(struct lp_build_context
*bld
,
2333 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2334 const struct lp_type type
= bld
->type
;
2335 unsigned mantissa
= lp_mantissa(type
);
2338 assert(type
.floating
);
2340 assert(lp_check_value(bld
->type
, x
));
2342 x
= LLVMBuildBitCast(builder
, x
, bld
->int_vec_type
, "");
2344 res
= LLVMBuildLShr(builder
, x
,
2345 lp_build_const_int_vec(bld
->gallivm
, type
, mantissa
), "");
2346 res
= LLVMBuildAnd(builder
, res
,
2347 lp_build_const_int_vec(bld
->gallivm
, type
, 255), "");
2348 res
= LLVMBuildSub(builder
, res
,
2349 lp_build_const_int_vec(bld
->gallivm
, type
, 127 - bias
), "");
2356 * Extract the mantissa of the a floating.
2358 * Result is a floating point value with
2360 * x / floor(log2(x))
2363 lp_build_extract_mantissa(struct lp_build_context
*bld
,
2366 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2367 const struct lp_type type
= bld
->type
;
2368 unsigned mantissa
= lp_mantissa(type
);
2369 LLVMValueRef mantmask
= lp_build_const_int_vec(bld
->gallivm
, type
,
2370 (1ULL << mantissa
) - 1);
2371 LLVMValueRef one
= LLVMConstBitCast(bld
->one
, bld
->int_vec_type
);
2374 assert(lp_check_value(bld
->type
, x
));
2376 assert(type
.floating
);
2378 x
= LLVMBuildBitCast(builder
, x
, bld
->int_vec_type
, "");
2380 /* res = x / 2**ipart */
2381 res
= LLVMBuildAnd(builder
, x
, mantmask
, "");
2382 res
= LLVMBuildOr(builder
, res
, one
, "");
2383 res
= LLVMBuildBitCast(builder
, res
, bld
->vec_type
, "");
/**
 * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
 * These coefficients can be generated with
 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
 * The degree used is selected at compile time via LOG_POLY_DEGREE.
 */
const double lp_build_log2_polynomial[] = {
#if LOG_POLY_DEGREE == 6
   3.11578814719469302614,
   -3.32419399085241980044,
   2.59883907202499966007,
   -1.23152682416275988241,
   0.318212422185251071475,
   -0.0344359067839062357313
#elif LOG_POLY_DEGREE == 5
   2.8882704548164776201,
   -2.52074962577807006663,
   1.48116647521213171641,
   -0.465725644288844778798,
   0.0596515482674574969533
#elif LOG_POLY_DEGREE == 4
   2.61761038894603480148,
   -1.75647175389045657003,
   0.688243882994381274313,
   -0.107254423828329604454
#elif LOG_POLY_DEGREE == 3
   2.28330284476918490682,
   -1.04913055217340124191,
   0.204446009836232697516
#else
#error
#endif
};
/**
 * Approximate log2(x), optionally also returning the raw exponent field
 * and floor(log2(x)).
 *
 * Any of the output pointers may be NULL; only the requested values are
 * computed.  Works on 32-bit floats only (asserted).
 *
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 */
void
lp_build_log2_approx(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef *p_exp,
                     LLVMValueRef *p_floor_log2,
                     LLVMValueRef *p_log2)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);

   LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
   LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
   LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);

   LLVMValueRef i = NULL;
   LLVMValueRef exp = NULL;
   LLVMValueRef mant = NULL;
   LLVMValueRef logexp = NULL;
   LLVMValueRef logmant = NULL;
   LLVMValueRef res = NULL;

   assert(lp_check_value(bld->type, x));

   if(p_exp || p_floor_log2 || p_log2) {
      /* TODO: optimize the constant case */
      if (gallivm_debug & GALLIVM_DEBUG_PERF &&
          LLVMIsConstant(x)) {
         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                      __FUNCTION__);
      }

      assert(type.floating && type.width == 32);

      /*
       * We don't explicitly handle denormalized numbers. They will yield a
       * result in the neighbourhood of -127, which appears to be adequate
       * enough.
       */

      i = LLVMBuildBitCast(builder, x, int_vec_type, "");

      /* exp = (float) exponent(x) */
      exp = LLVMBuildAnd(builder, i, expmask, "");
   }

   if(p_floor_log2 || p_log2) {
      /* logexp = unbiased exponent as a float, i.e. floor(log2(x)) */
      logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
      logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
      logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
   }

   if(p_log2) {
      /* mant = (float) mantissa(x), in [1, 2[ */
      mant = LLVMBuildAnd(builder, i, mantmask, "");
      mant = LLVMBuildOr(builder, mant, one, "");
      mant = LLVMBuildBitCast(builder, mant, vec_type, "");

      logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
                                    Elements(lp_build_log2_polynomial));

      /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0 */
      logmant = LLVMBuildFMul(builder, logmant, LLVMBuildFSub(builder, mant, bld->one, ""), "");

      res = LLVMBuildFAdd(builder, logmant, logexp, "");
   }

   if(p_exp) {
      exp = LLVMBuildBitCast(builder, exp, vec_type, "");
      *p_exp = exp;
   }

   if(p_floor_log2)
      *p_floor_log2 = logexp;

   if(p_log2)
      *p_log2 = res;
}
2509 lp_build_log2(struct lp_build_context
*bld
,
2513 lp_build_log2_approx(bld
, x
, NULL
, NULL
, &res
);
2519 * Faster (and less accurate) log2.
2521 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
2523 * Piece-wise linear approximation, with exact results when x is a
2526 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
2529 lp_build_fast_log2(struct lp_build_context
*bld
,
2532 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2536 assert(lp_check_value(bld
->type
, x
));
2538 assert(bld
->type
.floating
);
2540 /* ipart = floor(log2(x)) - 1 */
2541 ipart
= lp_build_extract_exponent(bld
, x
, -1);
2542 ipart
= LLVMBuildSIToFP(builder
, ipart
, bld
->vec_type
, "");
2544 /* fpart = x / 2**ipart */
2545 fpart
= lp_build_extract_mantissa(bld
, x
);
2548 return LLVMBuildFAdd(builder
, ipart
, fpart
, "");
2553 * Fast implementation of iround(log2(x)).
2555 * Not an approximation -- it should give accurate results all the time.
2558 lp_build_ilog2(struct lp_build_context
*bld
,
2561 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2562 LLVMValueRef sqrt2
= lp_build_const_vec(bld
->gallivm
, bld
->type
, M_SQRT2
);
2565 assert(bld
->type
.floating
);
2567 assert(lp_check_value(bld
->type
, x
));
2569 /* x * 2^(0.5) i.e., add 0.5 to the log2(x) */
2570 x
= LLVMBuildFMul(builder
, x
, sqrt2
, "");
2572 /* ipart = floor(log2(x) + 0.5) */
2573 ipart
= lp_build_extract_exponent(bld
, x
, 0);
2579 lp_build_mod(struct lp_build_context
*bld
,
2583 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2585 const struct lp_type type
= bld
->type
;
2587 assert(lp_check_value(type
, x
));
2588 assert(lp_check_value(type
, y
));
2591 res
= LLVMBuildFRem(builder
, x
, y
, "");
2593 res
= LLVMBuildSRem(builder
, x
, y
, "");
2595 res
= LLVMBuildURem(builder
, x
, y
, "");