1 /**************************************************************************
3 * Copyright 2009-2010 VMware, Inc.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **************************************************************************/
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
38 * We also do simple expressions simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
44 * @author Jose Fonseca <jfonseca@vmware.com>
48 #include "util/u_memory.h"
49 #include "util/u_debug.h"
50 #include "util/u_math.h"
51 #include "util/u_string.h"
52 #include "util/u_cpu_detect.h"
54 #include "lp_bld_type.h"
55 #include "lp_bld_const.h"
56 #include "lp_bld_intr.h"
57 #include "lp_bld_logic.h"
58 #include "lp_bld_pack.h"
59 #include "lp_bld_arit.h"
62 #define EXP_POLY_DEGREE 3
64 #define LOG_POLY_DEGREE 5
69 * No checks for special case values of a or b = 1 or 0 are done.
72 lp_build_min_simple(struct lp_build_context
*bld
,
76 const struct lp_type type
= bld
->type
;
77 const char *intrinsic
= NULL
;
80 assert(lp_check_value(type
, a
));
81 assert(lp_check_value(type
, b
));
83 /* TODO: optimize the constant case */
85 if(type
.width
* type
.length
== 128) {
87 if(type
.width
== 32 && util_cpu_caps
.has_sse
)
88 intrinsic
= "llvm.x86.sse.min.ps";
89 if(type
.width
== 64 && util_cpu_caps
.has_sse2
)
90 intrinsic
= "llvm.x86.sse2.min.pd";
93 if(type
.width
== 8 && !type
.sign
&& util_cpu_caps
.has_sse2
)
94 intrinsic
= "llvm.x86.sse2.pminu.b";
95 if(type
.width
== 8 && type
.sign
&& util_cpu_caps
.has_sse4_1
)
96 intrinsic
= "llvm.x86.sse41.pminsb";
97 if(type
.width
== 16 && !type
.sign
&& util_cpu_caps
.has_sse4_1
)
98 intrinsic
= "llvm.x86.sse41.pminuw";
99 if(type
.width
== 16 && type
.sign
&& util_cpu_caps
.has_sse2
)
100 intrinsic
= "llvm.x86.sse2.pmins.w";
101 if(type
.width
== 32 && !type
.sign
&& util_cpu_caps
.has_sse4_1
)
102 intrinsic
= "llvm.x86.sse41.pminud";
103 if(type
.width
== 32 && type
.sign
&& util_cpu_caps
.has_sse4_1
)
104 intrinsic
= "llvm.x86.sse41.pminsd";
109 return lp_build_intrinsic_binary(bld
->builder
, intrinsic
, lp_build_vec_type(bld
->type
), a
, b
);
111 cond
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, a
, b
);
112 return lp_build_select(bld
, cond
, a
, b
);
118 * No checks for special case values of a or b = 1 or 0 are done.
121 lp_build_max_simple(struct lp_build_context
*bld
,
125 const struct lp_type type
= bld
->type
;
126 const char *intrinsic
= NULL
;
129 assert(lp_check_value(type
, a
));
130 assert(lp_check_value(type
, b
));
132 /* TODO: optimize the constant case */
134 if(type
.width
* type
.length
== 128) {
136 if(type
.width
== 32 && util_cpu_caps
.has_sse
)
137 intrinsic
= "llvm.x86.sse.max.ps";
138 if(type
.width
== 64 && util_cpu_caps
.has_sse2
)
139 intrinsic
= "llvm.x86.sse2.max.pd";
142 if(type
.width
== 8 && !type
.sign
&& util_cpu_caps
.has_sse2
)
143 intrinsic
= "llvm.x86.sse2.pmaxu.b";
144 if(type
.width
== 8 && type
.sign
&& util_cpu_caps
.has_sse4_1
)
145 intrinsic
= "llvm.x86.sse41.pmaxsb";
146 if(type
.width
== 16 && !type
.sign
&& util_cpu_caps
.has_sse4_1
)
147 intrinsic
= "llvm.x86.sse41.pmaxuw";
148 if(type
.width
== 16 && type
.sign
&& util_cpu_caps
.has_sse2
)
149 intrinsic
= "llvm.x86.sse2.pmaxs.w";
150 if(type
.width
== 32 && !type
.sign
&& util_cpu_caps
.has_sse4_1
)
151 intrinsic
= "llvm.x86.sse41.pmaxud";
152 if(type
.width
== 32 && type
.sign
&& util_cpu_caps
.has_sse4_1
)
153 intrinsic
= "llvm.x86.sse41.pmaxsd";
158 return lp_build_intrinsic_binary(bld
->builder
, intrinsic
, lp_build_vec_type(bld
->type
), a
, b
);
160 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, b
);
161 return lp_build_select(bld
, cond
, a
, b
);
166 * Generate 1 - a, or ~a depending on bld->type.
169 lp_build_comp(struct lp_build_context
*bld
,
172 const struct lp_type type
= bld
->type
;
174 assert(lp_check_value(type
, a
));
181 if(type
.norm
&& !type
.floating
&& !type
.fixed
&& !type
.sign
) {
182 if(LLVMIsConstant(a
))
183 return LLVMConstNot(a
);
185 return LLVMBuildNot(bld
->builder
, a
, "");
188 if(LLVMIsConstant(a
))
190 return LLVMConstFSub(bld
->one
, a
);
192 return LLVMConstSub(bld
->one
, a
);
195 return LLVMBuildFSub(bld
->builder
, bld
->one
, a
, "");
197 return LLVMBuildSub(bld
->builder
, bld
->one
, a
, "");
205 lp_build_add(struct lp_build_context
*bld
,
209 const struct lp_type type
= bld
->type
;
212 assert(lp_check_value(type
, a
));
213 assert(lp_check_value(type
, b
));
219 if(a
== bld
->undef
|| b
== bld
->undef
)
223 const char *intrinsic
= NULL
;
225 if(a
== bld
->one
|| b
== bld
->one
)
228 if(util_cpu_caps
.has_sse2
&&
229 type
.width
* type
.length
== 128 &&
230 !type
.floating
&& !type
.fixed
) {
232 intrinsic
= type
.sign
? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
234 intrinsic
= type
.sign
? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
238 return lp_build_intrinsic_binary(bld
->builder
, intrinsic
, lp_build_vec_type(bld
->type
), a
, b
);
241 if(LLVMIsConstant(a
) && LLVMIsConstant(b
))
243 res
= LLVMConstFAdd(a
, b
);
245 res
= LLVMConstAdd(a
, b
);
248 res
= LLVMBuildFAdd(bld
->builder
, a
, b
, "");
250 res
= LLVMBuildAdd(bld
->builder
, a
, b
, "");
252 /* clamp to ceiling of 1.0 */
253 if(bld
->type
.norm
&& (bld
->type
.floating
|| bld
->type
.fixed
))
254 res
= lp_build_min_simple(bld
, res
, bld
->one
);
256 /* XXX clamp to floor of -1 or 0??? */
262 /** Return the scalar sum of the elements of a */
264 lp_build_sum_vector(struct lp_build_context
*bld
,
267 const struct lp_type type
= bld
->type
;
268 LLVMValueRef index
, res
;
271 assert(lp_check_value(type
, a
));
273 if (type
.length
== 1) {
277 assert(!bld
->type
.norm
);
279 index
= LLVMConstInt(LLVMInt32Type(), 0, 0);
280 res
= LLVMBuildExtractElement(bld
->builder
, a
, index
, "");
282 for (i
= 1; i
< type
.length
; i
++) {
283 index
= LLVMConstInt(LLVMInt32Type(), i
, 0);
285 res
= LLVMBuildFAdd(bld
->builder
, res
,
286 LLVMBuildExtractElement(bld
->builder
,
290 res
= LLVMBuildAdd(bld
->builder
, res
,
291 LLVMBuildExtractElement(bld
->builder
,
304 lp_build_sub(struct lp_build_context
*bld
,
308 const struct lp_type type
= bld
->type
;
311 assert(lp_check_value(type
, a
));
312 assert(lp_check_value(type
, b
));
316 if(a
== bld
->undef
|| b
== bld
->undef
)
322 const char *intrinsic
= NULL
;
327 if(util_cpu_caps
.has_sse2
&&
328 type
.width
* type
.length
== 128 &&
329 !type
.floating
&& !type
.fixed
) {
331 intrinsic
= type
.sign
? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
333 intrinsic
= type
.sign
? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
337 return lp_build_intrinsic_binary(bld
->builder
, intrinsic
, lp_build_vec_type(bld
->type
), a
, b
);
340 if(LLVMIsConstant(a
) && LLVMIsConstant(b
))
342 res
= LLVMConstFSub(a
, b
);
344 res
= LLVMConstSub(a
, b
);
347 res
= LLVMBuildFSub(bld
->builder
, a
, b
, "");
349 res
= LLVMBuildSub(bld
->builder
, a
, b
, "");
351 if(bld
->type
.norm
&& (bld
->type
.floating
|| bld
->type
.fixed
))
352 res
= lp_build_max_simple(bld
, res
, bld
->zero
);
359 * Normalized 8bit multiplication.
363 * makes the following approximation to the division (Sree)
365 * a*b/255 ~= (a*(b + 1)) >> 256
367 * which is the fastest method that satisfies the following OpenGL criteria
369 * 0*0 = 0 and 255*255 = 255
373 * takes the geometric series approximation to the division
375 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
377 * in this case just the first two terms to fit in 16bit arithmetic
379 * t/255 ~= (t + (t >> 8)) >> 8
381 * note that just by itself it doesn't satisfies the OpenGL criteria, as
382 * 255*255 = 254, so the special case b = 255 must be accounted or roundoff
385 * - geometric series plus rounding
387 * when using a geometric series division instead of truncating the result
388 * use roundoff in the approximation (Jim Blinn)
390 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
392 * achieving the exact results
394 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
395 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
396 * @sa Michael Herf, The "double blend trick", May 2000,
397 * http://www.stereopsis.com/doubleblend.html
400 lp_build_mul_u8n(LLVMBuilderRef builder
,
401 struct lp_type i16_type
,
402 LLVMValueRef a
, LLVMValueRef b
)
407 assert(!i16_type
.floating
);
408 assert(lp_check_value(i16_type
, a
));
409 assert(lp_check_value(i16_type
, b
));
411 c8
= lp_build_const_int_vec(i16_type
, 8);
415 /* a*b/255 ~= (a*(b + 1)) >> 256 */
416 b
= LLVMBuildAdd(builder
, b
, lp_build_const_int_vec(i16_type
, 1), "");
417 ab
= LLVMBuildMul(builder
, a
, b
, "");
421 /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */
422 ab
= LLVMBuildMul(builder
, a
, b
, "");
423 ab
= LLVMBuildAdd(builder
, ab
, LLVMBuildLShr(builder
, ab
, c8
, ""), "");
424 ab
= LLVMBuildAdd(builder
, ab
, lp_build_const_int_vec(i16_type
, 0x80), "");
428 ab
= LLVMBuildLShr(builder
, ab
, c8
, "");
438 lp_build_mul(struct lp_build_context
*bld
,
442 const struct lp_type type
= bld
->type
;
446 assert(lp_check_value(type
, a
));
447 assert(lp_check_value(type
, b
));
457 if(a
== bld
->undef
|| b
== bld
->undef
)
460 if(!type
.floating
&& !type
.fixed
&& type
.norm
) {
461 if(type
.width
== 8) {
462 struct lp_type i16_type
= lp_wider_type(type
);
463 LLVMValueRef al
, ah
, bl
, bh
, abl
, abh
, ab
;
465 lp_build_unpack2(bld
->builder
, type
, i16_type
, a
, &al
, &ah
);
466 lp_build_unpack2(bld
->builder
, type
, i16_type
, b
, &bl
, &bh
);
468 /* PMULLW, PSRLW, PADDW */
469 abl
= lp_build_mul_u8n(bld
->builder
, i16_type
, al
, bl
);
470 abh
= lp_build_mul_u8n(bld
->builder
, i16_type
, ah
, bh
);
472 ab
= lp_build_pack2(bld
->builder
, i16_type
, type
, abl
, abh
);
482 shift
= lp_build_const_int_vec(type
, type
.width
/2);
486 if(LLVMIsConstant(a
) && LLVMIsConstant(b
)) {
488 res
= LLVMConstFMul(a
, b
);
490 res
= LLVMConstMul(a
, b
);
493 res
= LLVMConstAShr(res
, shift
);
495 res
= LLVMConstLShr(res
, shift
);
500 res
= LLVMBuildFMul(bld
->builder
, a
, b
, "");
502 res
= LLVMBuildMul(bld
->builder
, a
, b
, "");
505 res
= LLVMBuildAShr(bld
->builder
, res
, shift
, "");
507 res
= LLVMBuildLShr(bld
->builder
, res
, shift
, "");
516 * Small vector x scale multiplication optimization.
519 lp_build_mul_imm(struct lp_build_context
*bld
,
525 assert(lp_check_value(bld
->type
, a
));
534 return lp_build_negate(bld
, a
);
536 if(b
== 2 && bld
->type
.floating
)
537 return lp_build_add(bld
, a
, a
);
539 if(util_is_power_of_two(b
)) {
540 unsigned shift
= ffs(b
) - 1;
542 if(bld
->type
.floating
) {
545 * Power of two multiplication by directly manipulating the mantissa.
547 * XXX: This might not be always faster, it will introduce a small error
548 * for multiplication by zero, and it will produce wrong results
551 unsigned mantissa
= lp_mantissa(bld
->type
);
552 factor
= lp_build_const_int_vec(bld
->type
, (unsigned long long)shift
<< mantissa
);
553 a
= LLVMBuildBitCast(bld
->builder
, a
, lp_build_int_vec_type(bld
->type
), "");
554 a
= LLVMBuildAdd(bld
->builder
, a
, factor
, "");
555 a
= LLVMBuildBitCast(bld
->builder
, a
, lp_build_vec_type(bld
->type
), "");
560 factor
= lp_build_const_vec(bld
->type
, shift
);
561 return LLVMBuildShl(bld
->builder
, a
, factor
, "");
565 factor
= lp_build_const_vec(bld
->type
, (double)b
);
566 return lp_build_mul(bld
, a
, factor
);
574 lp_build_div(struct lp_build_context
*bld
,
578 const struct lp_type type
= bld
->type
;
580 assert(lp_check_value(type
, a
));
581 assert(lp_check_value(type
, b
));
586 return lp_build_rcp(bld
, b
);
591 if(a
== bld
->undef
|| b
== bld
->undef
)
594 if(LLVMIsConstant(a
) && LLVMIsConstant(b
)) {
596 return LLVMConstFDiv(a
, b
);
598 return LLVMConstSDiv(a
, b
);
600 return LLVMConstUDiv(a
, b
);
603 if(util_cpu_caps
.has_sse
&& type
.width
== 32 && type
.length
== 4)
604 return lp_build_mul(bld
, a
, lp_build_rcp(bld
, b
));
607 return LLVMBuildFDiv(bld
->builder
, a
, b
, "");
609 return LLVMBuildSDiv(bld
->builder
, a
, b
, "");
611 return LLVMBuildUDiv(bld
->builder
, a
, b
, "");
616 * Linear interpolation.
618 * This also works for integer values with a few caveats.
620 * @sa http://www.stereopsis.com/doubleblend.html
623 lp_build_lerp(struct lp_build_context
*bld
,
631 assert(lp_check_value(bld
->type
, x
));
632 assert(lp_check_value(bld
->type
, v0
));
633 assert(lp_check_value(bld
->type
, v1
));
635 delta
= lp_build_sub(bld
, v1
, v0
);
637 res
= lp_build_mul(bld
, x
, delta
);
639 res
= lp_build_add(bld
, v0
, res
);
642 /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
643 * but it will be wrong for other uses. Basically we need a more
644 * powerful lp_type, capable of further distinguishing the values
645 * interpretation from the value storage. */
646 res
= LLVMBuildAnd(bld
->builder
, res
, lp_build_const_int_vec(bld
->type
, (1 << bld
->type
.width
/2) - 1), "");
653 lp_build_lerp_2d(struct lp_build_context
*bld
,
661 LLVMValueRef v0
= lp_build_lerp(bld
, x
, v00
, v01
);
662 LLVMValueRef v1
= lp_build_lerp(bld
, x
, v10
, v11
);
663 return lp_build_lerp(bld
, y
, v0
, v1
);
669 * Do checks for special cases.
672 lp_build_min(struct lp_build_context
*bld
,
676 assert(lp_check_value(bld
->type
, a
));
677 assert(lp_check_value(bld
->type
, b
));
679 if(a
== bld
->undef
|| b
== bld
->undef
)
686 if(a
== bld
->zero
|| b
== bld
->zero
)
694 return lp_build_min_simple(bld
, a
, b
);
700 * Do checks for special cases.
703 lp_build_max(struct lp_build_context
*bld
,
707 assert(lp_check_value(bld
->type
, a
));
708 assert(lp_check_value(bld
->type
, b
));
710 if(a
== bld
->undef
|| b
== bld
->undef
)
717 if(a
== bld
->one
|| b
== bld
->one
)
725 return lp_build_max_simple(bld
, a
, b
);
730 * Generate clamp(a, min, max)
731 * Do checks for special cases.
734 lp_build_clamp(struct lp_build_context
*bld
,
739 assert(lp_check_value(bld
->type
, a
));
740 assert(lp_check_value(bld
->type
, min
));
741 assert(lp_check_value(bld
->type
, max
));
743 a
= lp_build_min(bld
, a
, max
);
744 a
= lp_build_max(bld
, a
, min
);
753 lp_build_abs(struct lp_build_context
*bld
,
756 const struct lp_type type
= bld
->type
;
757 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
759 assert(lp_check_value(type
, a
));
765 /* Mask out the sign bit */
766 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(type
);
767 unsigned long long absMask
= ~(1ULL << (type
.width
- 1));
768 LLVMValueRef mask
= lp_build_const_int_vec(type
, ((unsigned long long) absMask
));
769 a
= LLVMBuildBitCast(bld
->builder
, a
, int_vec_type
, "");
770 a
= LLVMBuildAnd(bld
->builder
, a
, mask
, "");
771 a
= LLVMBuildBitCast(bld
->builder
, a
, vec_type
, "");
775 if(type
.width
*type
.length
== 128 && util_cpu_caps
.has_ssse3
) {
778 return lp_build_intrinsic_unary(bld
->builder
, "llvm.x86.ssse3.pabs.b.128", vec_type
, a
);
780 return lp_build_intrinsic_unary(bld
->builder
, "llvm.x86.ssse3.pabs.w.128", vec_type
, a
);
782 return lp_build_intrinsic_unary(bld
->builder
, "llvm.x86.ssse3.pabs.d.128", vec_type
, a
);
786 return lp_build_max(bld
, a
, LLVMBuildNeg(bld
->builder
, a
, ""));
791 lp_build_negate(struct lp_build_context
*bld
,
794 assert(lp_check_value(bld
->type
, a
));
796 #if HAVE_LLVM >= 0x0207
797 if (bld
->type
.floating
)
798 a
= LLVMBuildFNeg(bld
->builder
, a
, "");
801 a
= LLVMBuildNeg(bld
->builder
, a
, "");
807 /** Return -1, 0 or +1 depending on the sign of a */
809 lp_build_sgn(struct lp_build_context
*bld
,
812 const struct lp_type type
= bld
->type
;
816 assert(lp_check_value(type
, a
));
818 /* Handle non-zero case */
820 /* if not zero then sign must be positive */
823 else if(type
.floating
) {
824 LLVMTypeRef vec_type
;
825 LLVMTypeRef int_type
;
829 unsigned long long maskBit
= (unsigned long long)1 << (type
.width
- 1);
831 int_type
= lp_build_int_vec_type(type
);
832 vec_type
= lp_build_vec_type(type
);
833 mask
= lp_build_const_int_vec(type
, maskBit
);
835 /* Take the sign bit and add it to 1 constant */
836 sign
= LLVMBuildBitCast(bld
->builder
, a
, int_type
, "");
837 sign
= LLVMBuildAnd(bld
->builder
, sign
, mask
, "");
838 one
= LLVMConstBitCast(bld
->one
, int_type
);
839 res
= LLVMBuildOr(bld
->builder
, sign
, one
, "");
840 res
= LLVMBuildBitCast(bld
->builder
, res
, vec_type
, "");
844 LLVMValueRef minus_one
= lp_build_const_vec(type
, -1.0);
845 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, bld
->zero
);
846 res
= lp_build_select(bld
, cond
, bld
->one
, minus_one
);
850 cond
= lp_build_cmp(bld
, PIPE_FUNC_EQUAL
, a
, bld
->zero
);
851 res
= lp_build_select(bld
, cond
, bld
->zero
, res
);
858 * Set the sign of float vector 'a' according to 'sign'.
859 * If sign==0, return abs(a).
860 * If sign==1, return -abs(a);
861 * Other values for sign produce undefined results.
864 lp_build_set_sign(struct lp_build_context
*bld
,
865 LLVMValueRef a
, LLVMValueRef sign
)
867 const struct lp_type type
= bld
->type
;
868 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(type
);
869 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
870 LLVMValueRef shift
= lp_build_const_int_vec(type
, type
.width
- 1);
871 LLVMValueRef mask
= lp_build_const_int_vec(type
,
872 ~((unsigned long long) 1 << (type
.width
- 1)));
873 LLVMValueRef val
, res
;
875 assert(type
.floating
);
876 assert(lp_check_value(type
, a
));
878 /* val = reinterpret_cast<int>(a) */
879 val
= LLVMBuildBitCast(bld
->builder
, a
, int_vec_type
, "");
880 /* val = val & mask */
881 val
= LLVMBuildAnd(bld
->builder
, val
, mask
, "");
882 /* sign = sign << shift */
883 sign
= LLVMBuildShl(bld
->builder
, sign
, shift
, "");
884 /* res = val | sign */
885 res
= LLVMBuildOr(bld
->builder
, val
, sign
, "");
886 /* res = reinterpret_cast<float>(res) */
887 res
= LLVMBuildBitCast(bld
->builder
, res
, vec_type
, "");
894 * Convert vector of (or scalar) int to vector of (or scalar) float.
897 lp_build_int_to_float(struct lp_build_context
*bld
,
900 const struct lp_type type
= bld
->type
;
901 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
903 assert(type
.floating
);
905 return LLVMBuildSIToFP(bld
->builder
, a
, vec_type
, "");
910 enum lp_build_round_sse41_mode
912 LP_BUILD_ROUND_SSE41_NEAREST
= 0,
913 LP_BUILD_ROUND_SSE41_FLOOR
= 1,
914 LP_BUILD_ROUND_SSE41_CEIL
= 2,
915 LP_BUILD_ROUND_SSE41_TRUNCATE
= 3
919 static INLINE LLVMValueRef
920 lp_build_round_sse41(struct lp_build_context
*bld
,
922 enum lp_build_round_sse41_mode mode
)
924 const struct lp_type type
= bld
->type
;
925 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
926 const char *intrinsic
;
928 assert(type
.floating
);
929 assert(type
.width
*type
.length
== 128);
930 assert(lp_check_value(type
, a
));
931 assert(util_cpu_caps
.has_sse4_1
);
935 intrinsic
= "llvm.x86.sse41.round.ps";
938 intrinsic
= "llvm.x86.sse41.round.pd";
945 return lp_build_intrinsic_binary(bld
->builder
, intrinsic
, vec_type
, a
,
946 LLVMConstInt(LLVMInt32Type(), mode
, 0));
951 * Return the integer part of a float (vector) value. The returned value is
953 * Ex: trunc(-1.5) = 1.0
956 lp_build_trunc(struct lp_build_context
*bld
,
959 const struct lp_type type
= bld
->type
;
961 assert(type
.floating
);
962 assert(lp_check_value(type
, a
));
964 if (util_cpu_caps
.has_sse4_1
&& type
.width
*type
.length
== 128)
965 return lp_build_round_sse41(bld
, a
, LP_BUILD_ROUND_SSE41_TRUNCATE
);
967 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
968 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(type
);
970 res
= LLVMBuildFPToSI(bld
->builder
, a
, int_vec_type
, "");
971 res
= LLVMBuildSIToFP(bld
->builder
, res
, vec_type
, "");
978 * Return float (vector) rounded to nearest integer (vector). The returned
979 * value is a float (vector).
980 * Ex: round(0.9) = 1.0
981 * Ex: round(-1.5) = -2.0
984 lp_build_round(struct lp_build_context
*bld
,
987 const struct lp_type type
= bld
->type
;
989 assert(type
.floating
);
990 assert(lp_check_value(type
, a
));
992 if (util_cpu_caps
.has_sse4_1
&& type
.width
*type
.length
== 128)
993 return lp_build_round_sse41(bld
, a
, LP_BUILD_ROUND_SSE41_NEAREST
);
995 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
997 res
= lp_build_iround(bld
, a
);
998 res
= LLVMBuildSIToFP(bld
->builder
, res
, vec_type
, "");
1005 * Return floor of float (vector), result is a float (vector)
1006 * Ex: floor(1.1) = 1.0
1007 * Ex: floor(-1.1) = -2.0
1010 lp_build_floor(struct lp_build_context
*bld
,
1013 const struct lp_type type
= bld
->type
;
1015 assert(type
.floating
);
1016 assert(lp_check_value(type
, a
));
1018 if (util_cpu_caps
.has_sse4_1
&& type
.width
*type
.length
== 128)
1019 return lp_build_round_sse41(bld
, a
, LP_BUILD_ROUND_SSE41_FLOOR
);
1021 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
1023 res
= lp_build_ifloor(bld
, a
);
1024 res
= LLVMBuildSIToFP(bld
->builder
, res
, vec_type
, "");
1031 * Return ceiling of float (vector), returning float (vector).
1032 * Ex: ceil( 1.1) = 2.0
1033 * Ex: ceil(-1.1) = -1.0
1036 lp_build_ceil(struct lp_build_context
*bld
,
1039 const struct lp_type type
= bld
->type
;
1041 assert(type
.floating
);
1042 assert(lp_check_value(type
, a
));
1044 if (util_cpu_caps
.has_sse4_1
&& type
.width
*type
.length
== 128)
1045 return lp_build_round_sse41(bld
, a
, LP_BUILD_ROUND_SSE41_CEIL
);
1047 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
1049 res
= lp_build_iceil(bld
, a
);
1050 res
= LLVMBuildSIToFP(bld
->builder
, res
, vec_type
, "");
1057 * Return fractional part of 'a' computed as a - floor(a)
1058 * Typically used in texture coord arithmetic.
1061 lp_build_fract(struct lp_build_context
*bld
,
1064 assert(bld
->type
.floating
);
1065 return lp_build_sub(bld
, a
, lp_build_floor(bld
, a
));
1070 * Return the integer part of a float (vector) value. The returned value is
1071 * an integer (vector).
1072 * Ex: itrunc(-1.5) = 1
1075 lp_build_itrunc(struct lp_build_context
*bld
,
1078 const struct lp_type type
= bld
->type
;
1079 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(type
);
1081 assert(type
.floating
);
1082 assert(lp_check_value(type
, a
));
1084 return LLVMBuildFPToSI(bld
->builder
, a
, int_vec_type
, "");
1089 * Return float (vector) rounded to nearest integer (vector). The returned
1090 * value is an integer (vector).
1091 * Ex: iround(0.9) = 1
1092 * Ex: iround(-1.5) = -2
1095 lp_build_iround(struct lp_build_context
*bld
,
1098 const struct lp_type type
= bld
->type
;
1099 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(type
);
1102 assert(type
.floating
);
1104 assert(lp_check_value(type
, a
));
1106 if (util_cpu_caps
.has_sse4_1
&& type
.width
*type
.length
== 128) {
1107 res
= lp_build_round_sse41(bld
, a
, LP_BUILD_ROUND_SSE41_NEAREST
);
1110 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
1111 LLVMValueRef mask
= lp_build_const_int_vec(type
, (unsigned long long)1 << (type
.width
- 1));
1116 sign
= LLVMBuildBitCast(bld
->builder
, a
, int_vec_type
, "");
1117 sign
= LLVMBuildAnd(bld
->builder
, sign
, mask
, "");
1120 half
= lp_build_const_vec(type
, 0.5);
1121 half
= LLVMBuildBitCast(bld
->builder
, half
, int_vec_type
, "");
1122 half
= LLVMBuildOr(bld
->builder
, sign
, half
, "");
1123 half
= LLVMBuildBitCast(bld
->builder
, half
, vec_type
, "");
1125 res
= LLVMBuildFAdd(bld
->builder
, a
, half
, "");
1128 res
= LLVMBuildFPToSI(bld
->builder
, res
, int_vec_type
, "");
1135 * Return floor of float (vector), result is an int (vector)
1136 * Ex: ifloor(1.1) = 1.0
1137 * Ex: ifloor(-1.1) = -2.0
1140 lp_build_ifloor(struct lp_build_context
*bld
,
1143 const struct lp_type type
= bld
->type
;
1144 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(type
);
1147 assert(type
.floating
);
1148 assert(lp_check_value(type
, a
));
1150 if (util_cpu_caps
.has_sse4_1
&& type
.width
*type
.length
== 128) {
1151 res
= lp_build_round_sse41(bld
, a
, LP_BUILD_ROUND_SSE41_FLOOR
);
1154 /* Take the sign bit and add it to 1 constant */
1155 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
1156 unsigned mantissa
= lp_mantissa(type
);
1157 LLVMValueRef mask
= lp_build_const_int_vec(type
, (unsigned long long)1 << (type
.width
- 1));
1159 LLVMValueRef offset
;
1161 /* sign = a < 0 ? ~0 : 0 */
1162 sign
= LLVMBuildBitCast(bld
->builder
, a
, int_vec_type
, "");
1163 sign
= LLVMBuildAnd(bld
->builder
, sign
, mask
, "");
1164 sign
= LLVMBuildAShr(bld
->builder
, sign
, lp_build_const_int_vec(type
, type
.width
- 1), "ifloor.sign");
1166 /* offset = -0.99999(9)f */
1167 offset
= lp_build_const_vec(type
, -(double)(((unsigned long long)1 << mantissa
) - 10)/((unsigned long long)1 << mantissa
));
1168 offset
= LLVMConstBitCast(offset
, int_vec_type
);
1170 /* offset = a < 0 ? offset : 0.0f */
1171 offset
= LLVMBuildAnd(bld
->builder
, offset
, sign
, "");
1172 offset
= LLVMBuildBitCast(bld
->builder
, offset
, vec_type
, "ifloor.offset");
1174 res
= LLVMBuildFAdd(bld
->builder
, a
, offset
, "ifloor.res");
1177 /* round to nearest (toward zero) */
1178 res
= LLVMBuildFPToSI(bld
->builder
, res
, int_vec_type
, "ifloor.res");
1185 * Return ceiling of float (vector), returning int (vector).
1186 * Ex: iceil( 1.1) = 2
1187 * Ex: iceil(-1.1) = -1
1190 lp_build_iceil(struct lp_build_context
*bld
,
1193 const struct lp_type type
= bld
->type
;
1194 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(type
);
1197 assert(type
.floating
);
1198 assert(lp_check_value(type
, a
));
1200 if (util_cpu_caps
.has_sse4_1
&& type
.width
*type
.length
== 128) {
1201 res
= lp_build_round_sse41(bld
, a
, LP_BUILD_ROUND_SSE41_CEIL
);
1204 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
1205 unsigned mantissa
= lp_mantissa(type
);
1206 LLVMValueRef mask
= lp_build_const_int_vec(type
, (unsigned long long)1 << (type
.width
- 1));
1208 LLVMValueRef offset
;
1210 /* sign = a < 0 ? 0 : ~0 */
1211 sign
= LLVMBuildBitCast(bld
->builder
, a
, int_vec_type
, "");
1212 sign
= LLVMBuildAnd(bld
->builder
, sign
, mask
, "");
1213 sign
= LLVMBuildAShr(bld
->builder
, sign
, lp_build_const_int_vec(type
, type
.width
- 1), "iceil.sign");
1214 sign
= LLVMBuildNot(bld
->builder
, sign
, "iceil.not");
1216 /* offset = 0.99999(9)f */
1217 offset
= lp_build_const_vec(type
, (double)(((unsigned long long)1 << mantissa
) - 10)/((unsigned long long)1 << mantissa
));
1218 offset
= LLVMConstBitCast(offset
, int_vec_type
);
1220 /* offset = a < 0 ? 0.0 : offset */
1221 offset
= LLVMBuildAnd(bld
->builder
, offset
, sign
, "");
1222 offset
= LLVMBuildBitCast(bld
->builder
, offset
, vec_type
, "iceil.offset");
1224 res
= LLVMBuildFAdd(bld
->builder
, a
, offset
, "iceil.res");
1227 /* round to nearest (toward zero) */
1228 res
= LLVMBuildFPToSI(bld
->builder
, res
, int_vec_type
, "iceil.res");
1235 lp_build_sqrt(struct lp_build_context
*bld
,
1238 const struct lp_type type
= bld
->type
;
1239 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
1242 assert(lp_check_value(type
, a
));
1244 /* TODO: optimize the constant case */
1245 /* TODO: optimize the constant case */
1247 assert(type
.floating
);
1248 util_snprintf(intrinsic
, sizeof intrinsic
, "llvm.sqrt.v%uf%u", type
.length
, type
.width
);
1250 return lp_build_intrinsic_unary(bld
->builder
, intrinsic
, vec_type
, a
);
1255 * Do one Newton-Raphson step to improve reciprocate precision:
1257 * x_{i+1} = x_i * (2 - a * x_i)
1259 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
1260 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
1261 * such as Google Earth, which does RCP(RSQRT(0.0) when drawing the Earth's
1262 * halo. It would be necessary to clamp the argument to prevent this.
1265 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
1266 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
1268 static INLINE LLVMValueRef
1269 lp_build_rcp_refine(struct lp_build_context
*bld
,
1273 LLVMValueRef two
= lp_build_const_vec(bld
->type
, 2.0);
1276 res
= LLVMBuildFMul(bld
->builder
, a
, rcp_a
, "");
1277 res
= LLVMBuildFSub(bld
->builder
, two
, res
, "");
1278 res
= LLVMBuildFMul(bld
->builder
, rcp_a
, res
, "");
1285 lp_build_rcp(struct lp_build_context
*bld
,
1288 const struct lp_type type
= bld
->type
;
1290 assert(lp_check_value(type
, a
));
1299 assert(type
.floating
);
1301 if(LLVMIsConstant(a
))
1302 return LLVMConstFDiv(bld
->one
, a
);
1305 * We don't use RCPPS because:
1306 * - it only has 10bits of precision
1307 * - it doesn't even get the reciprocate of 1.0 exactly
1308 * - doing Newton-Rapshon steps yields wrong (NaN) values for 0.0 or Inf
1309 * - for recent processors the benefit over DIVPS is marginal, a case
1312 * We could still use it on certain processors if benchmarks show that the
1313 * RCPPS plus necessary workarounds are still preferrable to DIVPS; or for
1314 * particular uses that require less workarounds.
1317 if (FALSE
&& util_cpu_caps
.has_sse
&& type
.width
== 32 && type
.length
== 4) {
1318 const unsigned num_iterations
= 0;
1322 res
= lp_build_intrinsic_unary(bld
->builder
, "llvm.x86.sse.rcp.ps", bld
->vec_type
, a
);
1324 for (i
= 0; i
< num_iterations
; ++i
) {
1325 res
= lp_build_rcp_refine(bld
, a
, res
);
1331 return LLVMBuildFDiv(bld
->builder
, bld
->one
, a
, "");
1336 * Do one Newton-Raphson step to improve rsqrt precision:
1338 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
1341 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
1343 static INLINE LLVMValueRef
1344 lp_build_rsqrt_refine(struct lp_build_context
*bld
,
1346 LLVMValueRef rsqrt_a
)
1348 LLVMValueRef half
= lp_build_const_vec(bld
->type
, 0.5);
1349 LLVMValueRef three
= lp_build_const_vec(bld
->type
, 3.0);
1352 res
= LLVMBuildFMul(bld
->builder
, rsqrt_a
, rsqrt_a
, "");
1353 res
= LLVMBuildFMul(bld
->builder
, a
, res
, "");
1354 res
= LLVMBuildFSub(bld
->builder
, three
, res
, "");
1355 res
= LLVMBuildFMul(bld
->builder
, rsqrt_a
, res
, "");
1356 res
= LLVMBuildFMul(bld
->builder
, half
, res
, "");
1363 * Generate 1/sqrt(a)
1366 lp_build_rsqrt(struct lp_build_context
*bld
,
1369 const struct lp_type type
= bld
->type
;
1371 assert(lp_check_value(type
, a
));
1373 assert(type
.floating
);
1375 if (util_cpu_caps
.has_sse
&& type
.width
== 32 && type
.length
== 4) {
1376 const unsigned num_iterations
= 0;
1380 res
= lp_build_intrinsic_unary(bld
->builder
, "llvm.x86.sse.rsqrt.ps", bld
->vec_type
, a
);
1382 for (i
= 0; i
< num_iterations
; ++i
) {
1383 res
= lp_build_rsqrt_refine(bld
, a
, res
);
1389 return lp_build_rcp(bld
, lp_build_sqrt(bld
, a
));
1393 static inline LLVMValueRef
1394 lp_build_const_v4si(unsigned long value
)
1396 LLVMValueRef element
= LLVMConstInt(LLVMInt32Type(), value
, 0);
1397 LLVMValueRef elements
[4] = { element
, element
, element
, element
};
1398 return LLVMConstVector(elements
, 4);
1401 static inline LLVMValueRef
1402 lp_build_const_v4sf(float value
)
1404 LLVMValueRef element
= LLVMConstReal(LLVMFloatType(), value
);
1405 LLVMValueRef elements
[4] = { element
, element
, element
, element
};
1406 return LLVMConstVector(elements
, 4);
/**
 * Generate sin(a) using SSE2.
 *
 * Cephes-style argument reduction plus two degree-limited polynomials,
 * evaluated branchlessly and merged with a selection mask (same scheme
 * as the classic sse_mathfun.h sin_ps).  The _mm_* lines in the comments
 * below show the intrinsic each IR statement mirrors.
 *
 * NOTE(review): hard-wired to 4 x float32 vectors (v4sf/v4si) — assumes
 * bld->type is exactly that; verify at the callers.
 */
LLVMValueRef
lp_build_sin(struct lp_build_context *bld,
             LLVMValueRef a)
{
   struct lp_type int_type = lp_int_type(bld->type);
   LLVMBuilderRef b = bld->builder;
   LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatType(), 4);
   LLVMTypeRef v4si = LLVMVectorType(LLVMInt32Type(), 4);

   /*
    * take the absolute value,
    * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
    */
   LLVMValueRef inv_sig_mask = lp_build_const_v4si(~0x80000000);
   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, v4si, "a_v4si");

   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, v4sf, "x_abs");

   /*
    * extract the sign bit (upper one)
    * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
    */
   LLVMValueRef sig_mask = lp_build_const_v4si(0x80000000);
   LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i");

   /*
    * scale by 4/Pi
    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
    */
   LLVMValueRef FOPi = lp_build_const_v4sf(1.27323954473516);
   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");

   /*
    * store the integer part of y in mm0
    * emm2 = _mm_cvttps_epi32(y);
    */
   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, v4si, "emm2_i");

   /*
    * j=(j+1) & (~1) (see the cephes sources)
    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
    */
   LLVMValueRef all_one = lp_build_const_v4si(1);
   LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
   /*
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
    */
   LLVMValueRef inv_one = lp_build_const_v4si(~1);
   LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");

   /*
    * y = _mm_cvtepi32_ps(emm2);
    */
   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, v4sf, "y_2");

   /* get the swap sign flag
    * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
    */
   LLVMValueRef pi32_4 = lp_build_const_v4si(4);
   LLVMValueRef emm0_and = LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and");

   /*
    * emm2 = _mm_slli_epi32(emm0, 29);
    */
   LLVMValueRef const_29 = lp_build_const_v4si(29);
   LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit");

   /*
    * get the polynom selection mask
    * there is one polynom for 0 <= x <= Pi/4
    * and another one for Pi/4<x<=Pi/2
    * Both branches will be computed.
    *
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
    */
   LLVMValueRef pi32_2 = lp_build_const_v4si(2);
   LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3");
   LLVMValueRef poly_mask = lp_build_compare(b, int_type, PIPE_FUNC_EQUAL,
                                             emm2_3, lp_build_const_v4si(0));

   /*
    * sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
    */
   LLVMValueRef sign_bit_1 = LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit");

   /*
    * _PS_CONST(minus_cephes_DP1, -0.78515625);
    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
    */
   LLVMValueRef DP1 = lp_build_const_v4sf(-0.78515625);
   LLVMValueRef DP2 = lp_build_const_v4sf(-2.4187564849853515625e-4);
   LLVMValueRef DP3 = lp_build_const_v4sf(-3.77489497744594108e-8);

   /*
    * The magic pass: "Extended precision modular arithmetic"
    * x = ((x - y * DP1) - y * DP2) - y * DP3;
    * xmm1 = _mm_mul_ps(y, xmm1);
    * xmm2 = _mm_mul_ps(y, xmm2);
    * xmm3 = _mm_mul_ps(y, xmm3);
    */
   LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
   LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
   LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");

   /*
    * x = _mm_add_ps(x, xmm1);
    * x = _mm_add_ps(x, xmm2);
    * x = _mm_add_ps(x, xmm3);
    */
   LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
   LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
   LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");

   /*
    * Evaluate the first polynom (0 <= x <= Pi/4)
    *
    * z = _mm_mul_ps(x,x);
    */
   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");

   /*
    * _PS_CONST(coscof_p0,  2.443315711809948E-005);
    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
    * _PS_CONST(coscof_p2,  4.166664568298827E-002);
    */
   LLVMValueRef coscof_p0 = lp_build_const_v4sf(2.443315711809948E-005);
   LLVMValueRef coscof_p1 = lp_build_const_v4sf(-1.388731625493765E-003);
   LLVMValueRef coscof_p2 = lp_build_const_v4sf(4.166664568298827E-002);

   /*
    * y = *(v4sf*)_ps_coscof_p0;
    * y = _mm_mul_ps(y, z);
    */
   LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
   LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
   LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
   LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");

   /*
    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
    * y = _mm_sub_ps(y, tmp);
    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
    *
    * NOTE(review): the IR value names "y_8"/"y_9" below intentionally lag
    * the C variable names by one (copied from the reference code); the
    * strings are cosmetic only.
    */
   LLVMValueRef half = lp_build_const_v4sf(0.5);
   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
   LLVMValueRef one = lp_build_const_v4sf(1.0);
   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");

   /*
    * _PS_CONST(sincof_p0, -1.9515295891E-4);
    * _PS_CONST(sincof_p1,  8.3321608736E-3);
    * _PS_CONST(sincof_p2, -1.6666654611E-1);
    */
   LLVMValueRef sincof_p0 = lp_build_const_v4sf(-1.9515295891E-4);
   LLVMValueRef sincof_p1 = lp_build_const_v4sf(8.3321608736E-3);
   LLVMValueRef sincof_p2 = lp_build_const_v4sf(-1.6666654611E-1);

   /*
    * Evaluate the second polynom (Pi/4 <= x <= 0)
    *
    * y2 = *(v4sf*)_ps_sincof_p0;
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_mul_ps(y2, x);
    * y2 = _mm_add_ps(y2, x);
    */
   LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
   LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
   LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
   LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
   LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
   LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");

   /*
    * select the correct result from the two polynoms
    *
    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
    * y = _mm_andnot_ps(xmm3, y);
    * y = _mm_add_ps(y,y2);
    */
   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, v4si, "y2_i");
   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, v4si, "y_i");
   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
   LLVMValueRef inv = lp_build_const_v4si(~0);
   LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
   LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");

   /*
    * update the sign
    * y = _mm_xor_ps(y, sign_bit);
    */
   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin");
   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, v4sf, "y_result");

   return y_result;
}
/**
 * Generate cos(a) using SSE2.
 *
 * Mirrors lp_build_sin above (sse_mathfun.h cos_ps): the only
 * differences are the quadrant adjustment (emm2 -= 2) and the way the
 * output sign is derived (and-not with 4 instead of and with 4).
 *
 * NOTE(review): hard-wired to 4 x float32 vectors (v4sf/v4si) — assumes
 * bld->type is exactly that; verify at the callers.
 */
LLVMValueRef
lp_build_cos(struct lp_build_context *bld,
             LLVMValueRef a)
{
   struct lp_type int_type = lp_int_type(bld->type);
   LLVMBuilderRef b = bld->builder;
   LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatType(), 4);
   LLVMTypeRef v4si = LLVMVectorType(LLVMInt32Type(), 4);

   /*
    * take the absolute value,
    * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
    */
   LLVMValueRef inv_sig_mask = lp_build_const_v4si(~0x80000000);
   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, v4si, "a_v4si");

   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, v4sf, "x_abs");

   /*
    * scale by 4/Pi
    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
    */
   LLVMValueRef FOPi = lp_build_const_v4sf(1.27323954473516);
   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");

   /*
    * store the integer part of y in mm0
    * emm2 = _mm_cvttps_epi32(y);
    */
   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, v4si, "emm2_i");

   /*
    * j=(j+1) & (~1) (see the cephes sources)
    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
    */
   LLVMValueRef all_one = lp_build_const_v4si(1);
   LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
   /*
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
    */
   LLVMValueRef inv_one = lp_build_const_v4si(~1);
   LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");

   /*
    * y = _mm_cvtepi32_ps(emm2);
    */
   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, v4sf, "y_2");

   /*
    * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
    */
   LLVMValueRef const_2 = lp_build_const_v4si(2);
   LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2");

   /* get the swap sign flag
    * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
    *
    * andnot has no direct IR equivalent; emulate with xor ~0 then and.
    */
   LLVMValueRef inv = lp_build_const_v4si(~0);
   LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not");
   LLVMValueRef pi32_4 = lp_build_const_v4si(4);
   LLVMValueRef emm0_and = LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and");

   /*
    * emm2 = _mm_slli_epi32(emm0, 29);
    */
   LLVMValueRef const_29 = lp_build_const_v4si(29);
   LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit");

   /*
    * get the polynom selection mask
    * there is one polynom for 0 <= x <= Pi/4
    * and another one for Pi/4<x<=Pi/2
    * Both branches will be computed.
    *
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
    */
   LLVMValueRef pi32_2 = lp_build_const_v4si(2);
   LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3");
   LLVMValueRef poly_mask = lp_build_compare(b, int_type, PIPE_FUNC_EQUAL,
                                             emm2_3, lp_build_const_v4si(0));

   /*
    * _PS_CONST(minus_cephes_DP1, -0.78515625);
    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
    */
   LLVMValueRef DP1 = lp_build_const_v4sf(-0.78515625);
   LLVMValueRef DP2 = lp_build_const_v4sf(-2.4187564849853515625e-4);
   LLVMValueRef DP3 = lp_build_const_v4sf(-3.77489497744594108e-8);

   /*
    * The magic pass: "Extended precision modular arithmetic"
    * x = ((x - y * DP1) - y * DP2) - y * DP3;
    * xmm1 = _mm_mul_ps(y, xmm1);
    * xmm2 = _mm_mul_ps(y, xmm2);
    * xmm3 = _mm_mul_ps(y, xmm3);
    */
   LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
   LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
   LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");

   /*
    * x = _mm_add_ps(x, xmm1);
    * x = _mm_add_ps(x, xmm2);
    * x = _mm_add_ps(x, xmm3);
    */
   LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
   LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
   LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");

   /*
    * Evaluate the first polynom (0 <= x <= Pi/4)
    *
    * z = _mm_mul_ps(x,x);
    */
   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");

   /*
    * _PS_CONST(coscof_p0,  2.443315711809948E-005);
    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
    * _PS_CONST(coscof_p2,  4.166664568298827E-002);
    */
   LLVMValueRef coscof_p0 = lp_build_const_v4sf(2.443315711809948E-005);
   LLVMValueRef coscof_p1 = lp_build_const_v4sf(-1.388731625493765E-003);
   LLVMValueRef coscof_p2 = lp_build_const_v4sf(4.166664568298827E-002);

   /*
    * y = *(v4sf*)_ps_coscof_p0;
    * y = _mm_mul_ps(y, z);
    */
   LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
   LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
   LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
   LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");

   /*
    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
    * y = _mm_sub_ps(y, tmp);
    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
    *
    * NOTE(review): the IR value names "y_8"/"y_9" below intentionally lag
    * the C variable names by one (copied from the reference code); the
    * strings are cosmetic only.
    */
   LLVMValueRef half = lp_build_const_v4sf(0.5);
   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
   LLVMValueRef one = lp_build_const_v4sf(1.0);
   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");

   /*
    * _PS_CONST(sincof_p0, -1.9515295891E-4);
    * _PS_CONST(sincof_p1,  8.3321608736E-3);
    * _PS_CONST(sincof_p2, -1.6666654611E-1);
    */
   LLVMValueRef sincof_p0 = lp_build_const_v4sf(-1.9515295891E-4);
   LLVMValueRef sincof_p1 = lp_build_const_v4sf(8.3321608736E-3);
   LLVMValueRef sincof_p2 = lp_build_const_v4sf(-1.6666654611E-1);

   /*
    * Evaluate the second polynom (Pi/4 <= x <= 0)
    *
    * y2 = *(v4sf*)_ps_sincof_p0;
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_mul_ps(y2, x);
    * y2 = _mm_add_ps(y2, x);
    */
   LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
   LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
   LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
   LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
   LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
   LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");

   /*
    * select the correct result from the two polynoms
    *
    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
    * y = _mm_andnot_ps(xmm3, y);
    * y = _mm_add_ps(y,y2);
    */
   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, v4si, "y2_i");
   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, v4si, "y_i");
   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
   /* 'inv' (all-ones) from the and-not emulation above is reused here. */
   LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
   LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");

   /*
    * update the sign
    * y = _mm_xor_ps(y, sign_bit);
    */
   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sin");
   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, v4sf, "y_result");

   return y_result;
}
1845 * Generate pow(x, y)
1848 lp_build_pow(struct lp_build_context
*bld
,
1852 /* TODO: optimize the constant case */
1853 if(LLVMIsConstant(x
) && LLVMIsConstant(y
))
1854 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1857 return lp_build_exp2(bld
, lp_build_mul(bld
, lp_build_log2(bld
, x
), y
));
1865 lp_build_exp(struct lp_build_context
*bld
,
1868 /* log2(e) = 1/log(2) */
1869 LLVMValueRef log2e
= lp_build_const_vec(bld
->type
, 1.4426950408889634);
1871 assert(lp_check_value(bld
->type
, x
));
1873 return lp_build_mul(bld
, log2e
, lp_build_exp2(bld
, x
));
1881 lp_build_log(struct lp_build_context
*bld
,
1885 LLVMValueRef log2
= lp_build_const_vec(bld
->type
, 0.69314718055994529);
1887 assert(lp_check_value(bld
->type
, x
));
1889 return lp_build_mul(bld
, log2
, lp_build_exp2(bld
, x
));
/**
 * Generate polynomial.
 * Ex:  coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
 *
 * @param bld         build context
 * @param x           vector variable of the polynomial
 * @param coeffs      coefficient array, coeffs[i] multiplies x^i
 * @param num_coeffs  number of entries in coeffs
 * @return            polynomial value; bld->undef when num_coeffs == 0
 */
static LLVMValueRef
lp_build_polynomial(struct lp_build_context *bld,
                    LLVMValueRef x,
                    const double *coeffs,
                    unsigned num_coeffs)
{
   const struct lp_type type = bld->type;
   LLVMValueRef res = NULL;
   unsigned i;

   assert(lp_check_value(bld->type, x));

   /* TODO: optimize the constant case */
   if(LLVMIsConstant(x))
      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                   __FUNCTION__);

   /* Horner's scheme: walk from the highest coefficient down,
    * accumulating res = coeff[i] + x * res. */
   for (i = num_coeffs; i--; ) {
      LLVMValueRef coeff;

      coeff = lp_build_const_vec(type, coeffs[i]);

      if(res)
         res = lp_build_add(bld, coeff, lp_build_mul(bld, x, res));
      else
         res = coeff;
   }

   if(res)
      return res;
   else
      return bld->undef;
}
/**
 * Minimax polynomial fit of 2**x, in range [0, 1[
 *
 * Coefficients are in ascending-power order, as consumed by
 * lp_build_polynomial; the table degree is selected at compile time
 * via EXP_POLY_DEGREE.
 */
const double lp_build_exp2_polynomial[] = {
#if EXP_POLY_DEGREE == 5
   0.999999999690134838155,
   0.583974334321735217258,
   0.164553105719676828492,
   0.0292811063701710962255,
   0.00354944426657875141846,
   0.000296253726543423377365
#elif EXP_POLY_DEGREE == 4
   1.00000001502262084505,
   0.563586057338685991394,
   0.150436017652442413623,
   0.0243220604213317927308,
   0.0025359088446580436489
#elif EXP_POLY_DEGREE == 3
   0.999925218562710312959,
   0.695833540494823811697,
   0.226067155427249155588,
   0.0780245226406372992967
#elif EXP_POLY_DEGREE == 2
   1.00172476321474503578,
   0.657636275736077639316,
   0.33718943461968720704
#else
#error
#endif
};
/**
 * Branchless 2^x approximation for vectors of 32-bit floats.
 *
 * x is split into integer and fractional parts.  The integer part is
 * exponentiated exactly by constructing the IEEE-754 exponent field
 * ((ipart + 127) << 23), the fractional part approximately via the
 * minimax polynomial lp_build_exp2_polynomial; the product of the two
 * is 2^x.
 *
 * @param bld              build context; type must be 32-bit floating point
 * @param x                argument vector
 * @param p_exp2_int_part  if non-NULL receives 2^floor(x)
 * @param p_frac_part      if non-NULL receives x - floor(x)
 * @param p_exp2           if non-NULL receives 2^x
 *
 * Only the work needed for the requested outputs is emitted.
 */
void
lp_build_exp2_approx(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef *p_exp2_int_part,
                     LLVMValueRef *p_frac_part,
                     LLVMValueRef *p_exp2)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
   LLVMValueRef ipart = NULL;
   LLVMValueRef fpart = NULL;
   LLVMValueRef expipart = NULL;
   LLVMValueRef expfpart = NULL;
   LLVMValueRef res = NULL;

   assert(lp_check_value(bld->type, x));

   if(p_exp2_int_part || p_frac_part || p_exp2) {
      /* TODO: optimize the constant case */
      if(LLVMIsConstant(x))
         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                      __FUNCTION__);

      assert(type.floating && type.width == 32);

      /* Clamp so the biased exponent (x + 127) stays representable. */
      x = lp_build_min(bld, x, lp_build_const_vec(type, 129.0));
      x = lp_build_max(bld, x, lp_build_const_vec(type, -126.99999));

      /* ipart = floor(x) */
      ipart = lp_build_floor(bld, x);

      /* fpart = x - ipart */
      fpart = LLVMBuildFSub(bld->builder, x, ipart, "");
   }

   if(p_exp2_int_part || p_exp2) {
      /* expipart = (float) (1 << ipart)
       * i.e. build the float whose exponent field is ipart + bias. */
      ipart = LLVMBuildFPToSI(bld->builder, ipart, int_vec_type, "");
      expipart = LLVMBuildAdd(bld->builder, ipart,
                              lp_build_const_int_vec(type, 127), "");
      expipart = LLVMBuildShl(bld->builder, expipart,
                              lp_build_const_int_vec(type, 23), "");
      expipart = LLVMBuildBitCast(bld->builder, expipart, vec_type, "");
   }

   if(p_exp2) {
      /* 2^fpart via the minimax polynomial, then combine. */
      expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
                                     Elements(lp_build_exp2_polynomial));

      res = LLVMBuildFMul(bld->builder, expipart, expfpart, "");
   }

   if(p_exp2_int_part)
      *p_exp2_int_part = expipart;

   if(p_frac_part)
      *p_frac_part = fpart;

   if(p_exp2)
      *p_exp2 = res;
}
/**
 * Generate 2^x (thin wrapper over lp_build_exp2_approx, requesting
 * only the final result).
 */
LLVMValueRef
lp_build_exp2(struct lp_build_context *bld,
              LLVMValueRef x)
{
   LLVMValueRef res;
   lp_build_exp2_approx(bld, x, NULL, NULL, &res);
   return res;
}
/**
 * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
 *
 * These coefficients can be generate with
 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
 *
 * Ascending-power order, consumed by lp_build_polynomial; degree
 * selected at compile time via LOG_POLY_DEGREE.
 */
const double lp_build_log2_polynomial[] = {
#if LOG_POLY_DEGREE == 6
   3.11578814719469302614,
   -3.32419399085241980044,
   2.59883907202499966007,
   -1.23152682416275988241,
   0.318212422185251071475,
   -0.0344359067839062357313
#elif LOG_POLY_DEGREE == 5
   2.8882704548164776201,
   -2.52074962577807006663,
   1.48116647521213171641,
   -0.465725644288844778798,
   0.0596515482674574969533
#elif LOG_POLY_DEGREE == 4
   2.61761038894603480148,
   -1.75647175389045657003,
   0.688243882994381274313,
   -0.107254423828329604454
#elif LOG_POLY_DEGREE == 3
   2.28330284476918490682,
   -1.04913055217340124191,
   0.204446009836232697516
#else
#error
#endif
};
/**
 * Branchless log2(x) approximation for vectors of 32-bit floats.
 *
 * Decomposes x into exponent and mantissa by bit manipulation of the
 * IEEE-754 representation: log2(x) = exponent + log2(mantissa), where
 * the mantissa term is approximated by lp_build_log2_polynomial scaled
 * by (mant - 1) so that log2(1) == 0 exactly.
 *
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 *
 * @param bld           build context; type must be 32-bit floating point
 * @param x             argument vector
 * @param p_exp         if non-NULL receives the raw (still biased,
 *                      in-place) exponent bits bitcast to float
 * @param p_floor_log2  if non-NULL receives floor(log2(x)) as float
 * @param p_log2        if non-NULL receives log2(x)
 *
 * Only the work needed for the requested outputs is emitted.
 * NOTE(review): behavior for x <= 0, denormals, inf/NaN follows the raw
 * bit decomposition (no special-casing) — confirm callers only pass
 * normalized positive floats.
 */
void
lp_build_log2_approx(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef *p_exp,
                     LLVMValueRef *p_floor_log2,
                     LLVMValueRef *p_log2)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);

   LLVMValueRef expmask = lp_build_const_int_vec(type, 0x7f800000);
   LLVMValueRef mantmask = lp_build_const_int_vec(type, 0x007fffff);
   LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);

   LLVMValueRef i = NULL;
   LLVMValueRef exp = NULL;
   LLVMValueRef mant = NULL;
   LLVMValueRef logexp = NULL;
   LLVMValueRef logmant = NULL;
   LLVMValueRef res = NULL;

   assert(lp_check_value(bld->type, x));

   if(p_exp || p_floor_log2 || p_log2) {
      /* TODO: optimize the constant case */
      if(LLVMIsConstant(x))
         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                      __FUNCTION__);

      assert(type.floating && type.width == 32);

      /* Reinterpret the float bits as integers. */
      i = LLVMBuildBitCast(bld->builder, x, int_vec_type, "");

      /* exp = (float) exponent(x) -- isolate the exponent field */
      exp = LLVMBuildAnd(bld->builder, i, expmask, "");
   }

   if(p_floor_log2 || p_log2) {
      /* Shift the field down and remove the bias to get the true
       * exponent, then convert to float. */
      logexp = LLVMBuildLShr(bld->builder, exp,
                             lp_build_const_int_vec(type, 23), "");
      logexp = LLVMBuildSub(bld->builder, logexp,
                            lp_build_const_int_vec(type, 127), "");
      logexp = LLVMBuildSIToFP(bld->builder, logexp, vec_type, "");
   }

   if(p_log2) {
      /* mant = (float) mantissa(x) -- mantissa bits with the exponent
       * of 1.0 or'ed in, i.e. a float in [1, 2[. */
      mant = LLVMBuildAnd(bld->builder, i, mantmask, "");
      mant = LLVMBuildOr(bld->builder, mant, one, "");
      mant = LLVMBuildBitCast(bld->builder, mant, vec_type, "");

      logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
                                    Elements(lp_build_log2_polynomial));

      /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
      logmant = LLVMBuildFMul(bld->builder, logmant,
                              LLVMBuildFSub(bld->builder, mant, bld->one, ""), "");

      res = LLVMBuildFAdd(bld->builder, logmant, logexp, "");
   }

   if(p_exp) {
      exp = LLVMBuildBitCast(bld->builder, exp, vec_type, "");
      *p_exp = exp;
   }

   if(p_floor_log2)
      *p_floor_log2 = logexp;

   if(p_log2)
      *p_log2 = res;
}
2146 lp_build_log2(struct lp_build_context
*bld
,
2150 lp_build_log2_approx(bld
, x
, NULL
, NULL
, &res
);