1 /**************************************************************************
3 * Copyright 2009 VMware, Inc.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **************************************************************************/
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
38 * We also do simple expressions simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
44 * @author Jose Fonseca <jfonseca@vmware.com>
48 #include "util/u_memory.h"
49 #include "util/u_debug.h"
50 #include "util/u_math.h"
51 #include "util/u_string.h"
52 #include "util/u_cpu_detect.h"
54 #include "lp_bld_type.h"
55 #include "lp_bld_const.h"
56 #include "lp_bld_intr.h"
57 #include "lp_bld_init.h" /* for lp_build_engine */
58 #include "lp_bld_logic.h"
59 #include "lp_bld_pack.h"
60 #include "lp_bld_debug.h"
61 #include "lp_bld_arit.h"
66 * No checks for special case values of a or b = 1 or 0 are done.
69 lp_build_min_simple(struct lp_build_context
*bld
,
73 const struct lp_type type
= bld
->type
;
74 const char *intrinsic
= NULL
;
77 /* TODO: optimize the constant case */
79 if(type
.width
* type
.length
== 128) {
81 if(type
.width
== 32 && util_cpu_caps
.has_sse
)
82 intrinsic
= "llvm.x86.sse.min.ps";
83 if(type
.width
== 64 && util_cpu_caps
.has_sse2
)
84 intrinsic
= "llvm.x86.sse2.min.pd";
87 if(type
.width
== 8 && !type
.sign
&& util_cpu_caps
.has_sse2
)
88 intrinsic
= "llvm.x86.sse2.pminu.b";
89 if(type
.width
== 8 && type
.sign
&& util_cpu_caps
.has_sse4_1
)
90 intrinsic
= "llvm.x86.sse41.pminsb";
91 if(type
.width
== 16 && !type
.sign
&& util_cpu_caps
.has_sse4_1
)
92 intrinsic
= "llvm.x86.sse41.pminuw";
93 if(type
.width
== 16 && type
.sign
&& util_cpu_caps
.has_sse2
)
94 intrinsic
= "llvm.x86.sse2.pmins.w";
95 if(type
.width
== 32 && !type
.sign
&& util_cpu_caps
.has_sse4_1
)
96 intrinsic
= "llvm.x86.sse41.pminud";
97 if(type
.width
== 32 && type
.sign
&& util_cpu_caps
.has_sse4_1
)
98 intrinsic
= "llvm.x86.sse41.pminsd";
103 return lp_build_intrinsic_binary(bld
->builder
, intrinsic
, lp_build_vec_type(bld
->type
), a
, b
);
105 cond
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, a
, b
);
106 return lp_build_select(bld
, cond
, a
, b
);
112 * No checks for special case values of a or b = 1 or 0 are done.
115 lp_build_max_simple(struct lp_build_context
*bld
,
119 const struct lp_type type
= bld
->type
;
120 const char *intrinsic
= NULL
;
123 /* TODO: optimize the constant case */
125 if(type
.width
* type
.length
== 128) {
127 if(type
.width
== 32 && util_cpu_caps
.has_sse
)
128 intrinsic
= "llvm.x86.sse.max.ps";
129 if(type
.width
== 64 && util_cpu_caps
.has_sse2
)
130 intrinsic
= "llvm.x86.sse2.max.pd";
133 if(type
.width
== 8 && !type
.sign
&& util_cpu_caps
.has_sse2
)
134 intrinsic
= "llvm.x86.sse2.pmaxu.b";
135 if(type
.width
== 8 && type
.sign
&& util_cpu_caps
.has_sse4_1
)
136 intrinsic
= "llvm.x86.sse41.pmaxsb";
137 if(type
.width
== 16 && !type
.sign
&& util_cpu_caps
.has_sse4_1
)
138 intrinsic
= "llvm.x86.sse41.pmaxuw";
139 if(type
.width
== 16 && type
.sign
&& util_cpu_caps
.has_sse2
)
140 intrinsic
= "llvm.x86.sse2.pmaxs.w";
141 if(type
.width
== 32 && !type
.sign
&& util_cpu_caps
.has_sse4_1
)
142 intrinsic
= "llvm.x86.sse41.pmaxud";
143 if(type
.width
== 32 && type
.sign
&& util_cpu_caps
.has_sse4_1
)
144 intrinsic
= "llvm.x86.sse41.pmaxsd";
149 return lp_build_intrinsic_binary(bld
->builder
, intrinsic
, lp_build_vec_type(bld
->type
), a
, b
);
151 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, b
);
152 return lp_build_select(bld
, cond
, a
, b
);
157 * Generate 1 - a, or ~a depending on bld->type.
160 lp_build_comp(struct lp_build_context
*bld
,
163 const struct lp_type type
= bld
->type
;
170 if(type
.norm
&& !type
.floating
&& !type
.fixed
&& !type
.sign
) {
171 if(LLVMIsConstant(a
))
172 return LLVMConstNot(a
);
174 return LLVMBuildNot(bld
->builder
, a
, "");
177 if(LLVMIsConstant(a
))
178 return LLVMConstSub(bld
->one
, a
);
180 return LLVMBuildSub(bld
->builder
, bld
->one
, a
, "");
188 lp_build_add(struct lp_build_context
*bld
,
192 const struct lp_type type
= bld
->type
;
199 if(a
== bld
->undef
|| b
== bld
->undef
)
203 const char *intrinsic
= NULL
;
205 if(a
== bld
->one
|| b
== bld
->one
)
208 if(util_cpu_caps
.has_sse2
&&
209 type
.width
* type
.length
== 128 &&
210 !type
.floating
&& !type
.fixed
) {
212 intrinsic
= type
.sign
? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
214 intrinsic
= type
.sign
? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
218 return lp_build_intrinsic_binary(bld
->builder
, intrinsic
, lp_build_vec_type(bld
->type
), a
, b
);
221 if(LLVMIsConstant(a
) && LLVMIsConstant(b
))
222 res
= LLVMConstAdd(a
, b
);
224 res
= LLVMBuildAdd(bld
->builder
, a
, b
, "");
226 /* clamp to ceiling of 1.0 */
227 if(bld
->type
.norm
&& (bld
->type
.floating
|| bld
->type
.fixed
))
228 res
= lp_build_min_simple(bld
, res
, bld
->one
);
230 /* XXX clamp to floor of -1 or 0??? */
236 /** Return the sum of the elements of a */
238 lp_build_sum_vector(struct lp_build_context
*bld
,
241 const struct lp_type type
= bld
->type
;
242 LLVMValueRef index
, res
;
249 assert(type
.length
> 1);
251 assert(!bld
->type
.norm
);
253 index
= LLVMConstInt(LLVMInt32Type(), 0, 0);
254 res
= LLVMBuildExtractElement(bld
->builder
, a
, index
, "");
256 for (i
= 1; i
< type
.length
; i
++) {
257 index
= LLVMConstInt(LLVMInt32Type(), i
, 0);
258 res
= LLVMBuildAdd(bld
->builder
, res
,
259 LLVMBuildExtractElement(bld
->builder
, a
, index
, ""),
271 lp_build_sub(struct lp_build_context
*bld
,
275 const struct lp_type type
= bld
->type
;
280 if(a
== bld
->undef
|| b
== bld
->undef
)
286 const char *intrinsic
= NULL
;
291 if(util_cpu_caps
.has_sse2
&&
292 type
.width
* type
.length
== 128 &&
293 !type
.floating
&& !type
.fixed
) {
295 intrinsic
= type
.sign
? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
297 intrinsic
= type
.sign
? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
301 return lp_build_intrinsic_binary(bld
->builder
, intrinsic
, lp_build_vec_type(bld
->type
), a
, b
);
304 if(LLVMIsConstant(a
) && LLVMIsConstant(b
))
305 res
= LLVMConstSub(a
, b
);
307 res
= LLVMBuildSub(bld
->builder
, a
, b
, "");
309 if(bld
->type
.norm
&& (bld
->type
.floating
|| bld
->type
.fixed
))
310 res
= lp_build_max_simple(bld
, res
, bld
->zero
);
317 * Normalized 8bit multiplication.
321 * makes the following approximation to the division (Sree)
323 * a*b/255 ~= (a*(b + 1)) >> 256
325 * which is the fastest method that satisfies the following OpenGL criteria
327 * 0*0 = 0 and 255*255 = 255
331 * takes the geometric series approximation to the division
333 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
335 * in this case just the first two terms to fit in 16bit arithmetic
337 * t/255 ~= (t + (t >> 8)) >> 8
339 * note that just by itself it doesn't satisfies the OpenGL criteria, as
340 * 255*255 = 254, so the special case b = 255 must be accounted or roundoff
343 * - geometric series plus rounding
345 * when using a geometric series division instead of truncating the result
346 * use roundoff in the approximation (Jim Blinn)
348 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
350 * achieving the exact results
352 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
353 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
354 * @sa Michael Herf, The "double blend trick", May 2000,
355 * http://www.stereopsis.com/doubleblend.html
358 lp_build_mul_u8n(LLVMBuilderRef builder
,
359 struct lp_type i16_type
,
360 LLVMValueRef a
, LLVMValueRef b
)
365 c8
= lp_build_const_int_vec(i16_type
, 8);
369 /* a*b/255 ~= (a*(b + 1)) >> 256 */
370 b
= LLVMBuildAdd(builder
, b
, lp_build_const_int_vec(i16_type
, 1), "");
371 ab
= LLVMBuildMul(builder
, a
, b
, "");
375 /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */
376 ab
= LLVMBuildMul(builder
, a
, b
, "");
377 ab
= LLVMBuildAdd(builder
, ab
, LLVMBuildLShr(builder
, ab
, c8
, ""), "");
378 ab
= LLVMBuildAdd(builder
, ab
, lp_build_const_int_vec(i16_type
, 0x80), "");
382 ab
= LLVMBuildLShr(builder
, ab
, c8
, "");
392 lp_build_mul(struct lp_build_context
*bld
,
396 const struct lp_type type
= bld
->type
;
408 if(a
== bld
->undef
|| b
== bld
->undef
)
411 if(!type
.floating
&& !type
.fixed
&& type
.norm
) {
412 if(type
.width
== 8) {
413 struct lp_type i16_type
= lp_wider_type(type
);
414 LLVMValueRef al
, ah
, bl
, bh
, abl
, abh
, ab
;
416 lp_build_unpack2(bld
->builder
, type
, i16_type
, a
, &al
, &ah
);
417 lp_build_unpack2(bld
->builder
, type
, i16_type
, b
, &bl
, &bh
);
419 /* PMULLW, PSRLW, PADDW */
420 abl
= lp_build_mul_u8n(bld
->builder
, i16_type
, al
, bl
);
421 abh
= lp_build_mul_u8n(bld
->builder
, i16_type
, ah
, bh
);
423 ab
= lp_build_pack2(bld
->builder
, i16_type
, type
, abl
, abh
);
433 shift
= lp_build_const_int_vec(type
, type
.width
/2);
437 if(LLVMIsConstant(a
) && LLVMIsConstant(b
)) {
438 res
= LLVMConstMul(a
, b
);
441 res
= LLVMConstAShr(res
, shift
);
443 res
= LLVMConstLShr(res
, shift
);
447 res
= LLVMBuildMul(bld
->builder
, a
, b
, "");
450 res
= LLVMBuildAShr(bld
->builder
, res
, shift
, "");
452 res
= LLVMBuildLShr(bld
->builder
, res
, shift
, "");
461 * Small vector x scale multiplication optimization.
464 lp_build_mul_imm(struct lp_build_context
*bld
,
477 return LLVMBuildNeg(bld
->builder
, a
, "");
479 if(b
== 2 && bld
->type
.floating
)
480 return lp_build_add(bld
, a
, a
);
483 unsigned shift
= ffs(b
) - 1;
485 if(bld
->type
.floating
) {
488 * Power of two multiplication by directly manipulating the mantissa.
490 * XXX: This might not be always faster, it will introduce a small error
491 * for multiplication by zero, and it will produce wrong results
494 unsigned mantissa
= lp_mantissa(bld
->type
);
495 factor
= lp_build_const_int_vec(bld
->type
, (unsigned long long)shift
<< mantissa
);
496 a
= LLVMBuildBitCast(bld
->builder
, a
, lp_build_int_vec_type(bld
->type
), "");
497 a
= LLVMBuildAdd(bld
->builder
, a
, factor
, "");
498 a
= LLVMBuildBitCast(bld
->builder
, a
, lp_build_vec_type(bld
->type
), "");
503 factor
= lp_build_const_vec(bld
->type
, shift
);
504 return LLVMBuildShl(bld
->builder
, a
, factor
, "");
508 factor
= lp_build_const_vec(bld
->type
, (double)b
);
509 return lp_build_mul(bld
, a
, factor
);
517 lp_build_div(struct lp_build_context
*bld
,
521 const struct lp_type type
= bld
->type
;
526 return lp_build_rcp(bld
, b
);
531 if(a
== bld
->undef
|| b
== bld
->undef
)
534 if(LLVMIsConstant(a
) && LLVMIsConstant(b
))
535 return LLVMConstFDiv(a
, b
);
537 if(util_cpu_caps
.has_sse
&& type
.width
== 32 && type
.length
== 4)
538 return lp_build_mul(bld
, a
, lp_build_rcp(bld
, b
));
540 return LLVMBuildFDiv(bld
->builder
, a
, b
, "");
545 * Linear interpolation.
547 * This also works for integer values with a few caveats.
549 * @sa http://www.stereopsis.com/doubleblend.html
552 lp_build_lerp(struct lp_build_context
*bld
,
560 delta
= lp_build_sub(bld
, v1
, v0
);
562 res
= lp_build_mul(bld
, x
, delta
);
564 res
= lp_build_add(bld
, v0
, res
);
567 /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
568 * but it will be wrong for other uses. Basically we need a more
569 * powerful lp_type, capable of further distinguishing the values
570 * interpretation from the value storage. */
571 res
= LLVMBuildAnd(bld
->builder
, res
, lp_build_const_int_vec(bld
->type
, (1 << bld
->type
.width
/2) - 1), "");
578 lp_build_lerp_2d(struct lp_build_context
*bld
,
586 LLVMValueRef v0
= lp_build_lerp(bld
, x
, v00
, v01
);
587 LLVMValueRef v1
= lp_build_lerp(bld
, x
, v10
, v11
);
588 return lp_build_lerp(bld
, y
, v0
, v1
);
594 * Do checks for special cases.
597 lp_build_min(struct lp_build_context
*bld
,
601 if(a
== bld
->undef
|| b
== bld
->undef
)
608 if(a
== bld
->zero
|| b
== bld
->zero
)
616 return lp_build_min_simple(bld
, a
, b
);
622 * Do checks for special cases.
625 lp_build_max(struct lp_build_context
*bld
,
629 if(a
== bld
->undef
|| b
== bld
->undef
)
636 if(a
== bld
->one
|| b
== bld
->one
)
644 return lp_build_max_simple(bld
, a
, b
);
649 * Generate clamp(a, min, max)
650 * Do checks for special cases.
653 lp_build_clamp(struct lp_build_context
*bld
,
658 a
= lp_build_min(bld
, a
, max
);
659 a
= lp_build_max(bld
, a
, min
);
668 lp_build_abs(struct lp_build_context
*bld
,
671 const struct lp_type type
= bld
->type
;
672 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
678 /* Mask out the sign bit */
679 if (type
.length
== 1) {
680 LLVMTypeRef int_type
= LLVMIntType(type
.width
);
681 LLVMTypeRef float_type
= LLVMFloatType();
682 unsigned long long absMask
= ~(1ULL << (type
.width
- 1));
683 LLVMValueRef mask
= LLVMConstInt(int_type
, absMask
, 0);
684 a
= LLVMBuildBitCast(bld
->builder
, a
, int_type
, "");
685 a
= LLVMBuildAnd(bld
->builder
, a
, mask
, "");
686 a
= LLVMBuildBitCast(bld
->builder
, a
, float_type
, "");
690 /* vector of floats */
691 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(type
);
692 unsigned long long absMask
= ~(1ULL << (type
.width
- 1));
693 LLVMValueRef mask
= lp_build_const_int_vec(type
, ((unsigned long long) absMask
));
694 a
= LLVMBuildBitCast(bld
->builder
, a
, int_vec_type
, "");
695 a
= LLVMBuildAnd(bld
->builder
, a
, mask
, "");
696 a
= LLVMBuildBitCast(bld
->builder
, a
, vec_type
, "");
701 if(type
.width
*type
.length
== 128 && util_cpu_caps
.has_ssse3
) {
704 return lp_build_intrinsic_unary(bld
->builder
, "llvm.x86.ssse3.pabs.b.128", vec_type
, a
);
706 return lp_build_intrinsic_unary(bld
->builder
, "llvm.x86.ssse3.pabs.w.128", vec_type
, a
);
708 return lp_build_intrinsic_unary(bld
->builder
, "llvm.x86.ssse3.pabs.d.128", vec_type
, a
);
712 return lp_build_max(bld
, a
, LLVMBuildNeg(bld
->builder
, a
, ""));
717 lp_build_negate(struct lp_build_context
*bld
,
720 return LLVMBuildNeg(bld
->builder
, a
, "");
724 /** Return -1, 0 or +1 depending on the sign of a */
726 lp_build_sgn(struct lp_build_context
*bld
,
729 const struct lp_type type
= bld
->type
;
733 /* Handle non-zero case */
735 /* if not zero then sign must be positive */
738 else if(type
.floating
) {
739 LLVMTypeRef vec_type
;
740 LLVMTypeRef int_type
;
744 unsigned long long maskBit
= (unsigned long long)1 << (type
.width
- 1);
746 if (type
.length
== 1) {
747 int_type
= lp_build_int_elem_type(type
);
748 vec_type
= lp_build_elem_type(type
);
749 mask
= LLVMConstInt(int_type
, maskBit
, 0);
753 int_type
= lp_build_int_vec_type(type
);
754 vec_type
= lp_build_vec_type(type
);
755 mask
= lp_build_const_int_vec(type
, maskBit
);
758 /* Take the sign bit and add it to 1 constant */
759 sign
= LLVMBuildBitCast(bld
->builder
, a
, int_type
, "");
760 sign
= LLVMBuildAnd(bld
->builder
, sign
, mask
, "");
761 one
= LLVMConstBitCast(bld
->one
, int_type
);
762 res
= LLVMBuildOr(bld
->builder
, sign
, one
, "");
763 res
= LLVMBuildBitCast(bld
->builder
, res
, vec_type
, "");
767 LLVMValueRef minus_one
= lp_build_const_vec(type
, -1.0);
768 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, bld
->zero
);
769 res
= lp_build_select(bld
, cond
, bld
->one
, minus_one
);
773 cond
= lp_build_cmp(bld
, PIPE_FUNC_EQUAL
, a
, bld
->zero
);
774 res
= lp_build_select(bld
, cond
, bld
->zero
, res
);
781 * Set the sign of float vector 'a' according to 'sign'.
782 * If sign==0, return abs(a).
783 * If sign==1, return -abs(a);
784 * Other values for sign produce undefined results.
787 lp_build_set_sign(struct lp_build_context
*bld
,
788 LLVMValueRef a
, LLVMValueRef sign
)
790 const struct lp_type type
= bld
->type
;
791 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(type
);
792 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
793 LLVMValueRef shift
= lp_build_const_int_vec(type
, type
.width
- 1);
794 LLVMValueRef mask
= lp_build_const_int_vec(type
,
795 ~((unsigned long long) 1 << (type
.width
- 1)));
796 LLVMValueRef val
, res
;
798 assert(type
.floating
);
800 /* val = reinterpret_cast<int>(a) */
801 val
= LLVMBuildBitCast(bld
->builder
, a
, int_vec_type
, "");
802 /* val = val & mask */
803 val
= LLVMBuildAnd(bld
->builder
, val
, mask
, "");
804 /* sign = sign << shift */
805 sign
= LLVMBuildShl(bld
->builder
, sign
, shift
, "");
806 /* res = val | sign */
807 res
= LLVMBuildOr(bld
->builder
, val
, sign
, "");
808 /* res = reinterpret_cast<float>(res) */
809 res
= LLVMBuildBitCast(bld
->builder
, res
, vec_type
, "");
816 * Convert vector of (or scalar) int to vector of (or scalar) float.
819 lp_build_int_to_float(struct lp_build_context
*bld
,
822 const struct lp_type type
= bld
->type
;
824 assert(type
.floating
);
825 /*assert(lp_check_value(type, a));*/
827 if (type
.length
== 1) {
828 LLVMTypeRef float_type
= LLVMFloatType();
829 return LLVMBuildSIToFP(bld
->builder
, a
, float_type
, "");
832 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
833 /*LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);*/
835 res
= LLVMBuildSIToFP(bld
->builder
, a
, vec_type
, "");
/**
 * Immediate rounding-mode operand values for the SSE4.1 ROUNDPS/ROUNDPD
 * intrinsics used by lp_build_round_sse41().
 */
enum lp_build_round_sse41_mode
{
   LP_BUILD_ROUND_SSE41_NEAREST = 0,
   LP_BUILD_ROUND_SSE41_FLOOR = 1,
   LP_BUILD_ROUND_SSE41_CEIL = 2,
   LP_BUILD_ROUND_SSE41_TRUNCATE = 3
};
851 static INLINE LLVMValueRef
852 lp_build_round_sse41(struct lp_build_context
*bld
,
854 enum lp_build_round_sse41_mode mode
)
856 const struct lp_type type
= bld
->type
;
857 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
858 const char *intrinsic
;
860 assert(type
.floating
);
861 assert(type
.width
*type
.length
== 128);
862 assert(lp_check_value(type
, a
));
863 assert(util_cpu_caps
.has_sse4_1
);
867 intrinsic
= "llvm.x86.sse41.round.ps";
870 intrinsic
= "llvm.x86.sse41.round.pd";
877 return lp_build_intrinsic_binary(bld
->builder
, intrinsic
, vec_type
, a
,
878 LLVMConstInt(LLVMInt32Type(), mode
, 0));
883 lp_build_trunc(struct lp_build_context
*bld
,
886 const struct lp_type type
= bld
->type
;
888 assert(type
.floating
);
889 assert(lp_check_value(type
, a
));
891 if(util_cpu_caps
.has_sse4_1
)
892 return lp_build_round_sse41(bld
, a
, LP_BUILD_ROUND_SSE41_TRUNCATE
);
894 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
895 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(type
);
897 res
= LLVMBuildFPToSI(bld
->builder
, a
, int_vec_type
, "");
898 res
= LLVMBuildSIToFP(bld
->builder
, res
, vec_type
, "");
905 lp_build_round(struct lp_build_context
*bld
,
908 const struct lp_type type
= bld
->type
;
910 assert(type
.floating
);
911 assert(lp_check_value(type
, a
));
913 if(util_cpu_caps
.has_sse4_1
)
914 return lp_build_round_sse41(bld
, a
, LP_BUILD_ROUND_SSE41_NEAREST
);
916 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
918 res
= lp_build_iround(bld
, a
);
919 res
= LLVMBuildSIToFP(bld
->builder
, res
, vec_type
, "");
926 lp_build_floor(struct lp_build_context
*bld
,
929 const struct lp_type type
= bld
->type
;
931 assert(type
.floating
);
933 if (type
.length
== 1) {
935 res
= lp_build_ifloor(bld
, a
);
936 res
= LLVMBuildSIToFP(bld
->builder
, res
, LLVMFloatType(), "");
940 if(util_cpu_caps
.has_sse4_1
)
941 return lp_build_round_sse41(bld
, a
, LP_BUILD_ROUND_SSE41_FLOOR
);
943 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
945 res
= lp_build_ifloor(bld
, a
);
946 res
= LLVMBuildSIToFP(bld
->builder
, res
, vec_type
, "");
953 lp_build_ceil(struct lp_build_context
*bld
,
956 const struct lp_type type
= bld
->type
;
958 assert(type
.floating
);
959 assert(lp_check_value(type
, a
));
961 if(util_cpu_caps
.has_sse4_1
)
962 return lp_build_round_sse41(bld
, a
, LP_BUILD_ROUND_SSE41_CEIL
);
964 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
966 res
= lp_build_iceil(bld
, a
);
967 res
= LLVMBuildSIToFP(bld
->builder
, res
, vec_type
, "");
974 * Return fractional part of 'a' computed as a - floor(f)
975 * Typically used in texture coord arithmetic.
978 lp_build_fract(struct lp_build_context
*bld
,
981 assert(bld
->type
.floating
);
982 return lp_build_sub(bld
, a
, lp_build_floor(bld
, a
));
987 * Convert to integer, through whichever rounding method that's fastest,
988 * typically truncating toward zero.
991 lp_build_itrunc(struct lp_build_context
*bld
,
994 const struct lp_type type
= bld
->type
;
996 assert(type
.floating
);
998 if (type
.length
== 1) {
999 LLVMTypeRef int_type
= LLVMIntType(type
.width
);
1000 return LLVMBuildFPToSI(bld
->builder
, a
, int_type
, "");
1003 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(type
);
1004 assert(lp_check_value(type
, a
));
1005 return LLVMBuildFPToSI(bld
->builder
, a
, int_vec_type
, "");
1011 * Convert float[] to int[] with round().
1014 lp_build_iround(struct lp_build_context
*bld
,
1017 const struct lp_type type
= bld
->type
;
1018 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(type
);
1021 assert(type
.floating
);
1023 if (type
.length
== 1) {
1024 /* scalar float to int */
1025 LLVMTypeRef int_type
= LLVMIntType(type
.width
);
1026 /* XXX we want rounding here! */
1027 res
= LLVMBuildFPToSI(bld
->builder
, a
, int_type
, "");
1031 assert(lp_check_value(type
, a
));
1033 if(util_cpu_caps
.has_sse4_1
) {
1034 res
= lp_build_round_sse41(bld
, a
, LP_BUILD_ROUND_SSE41_NEAREST
);
1037 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
1038 LLVMValueRef mask
= lp_build_const_int_vec(type
, (unsigned long long)1 << (type
.width
- 1));
1043 sign
= LLVMBuildBitCast(bld
->builder
, a
, int_vec_type
, "");
1044 sign
= LLVMBuildAnd(bld
->builder
, sign
, mask
, "");
1047 half
= lp_build_const_vec(type
, 0.5);
1048 half
= LLVMBuildBitCast(bld
->builder
, half
, int_vec_type
, "");
1049 half
= LLVMBuildOr(bld
->builder
, sign
, half
, "");
1050 half
= LLVMBuildBitCast(bld
->builder
, half
, vec_type
, "");
1052 res
= LLVMBuildAdd(bld
->builder
, a
, half
, "");
1055 res
= LLVMBuildFPToSI(bld
->builder
, res
, int_vec_type
, "");
1062 * Convert float[] to int[] with floor().
1065 lp_build_ifloor(struct lp_build_context
*bld
,
1068 const struct lp_type type
= bld
->type
;
1069 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(type
);
1072 assert(type
.floating
);
1074 if (type
.length
== 1) {
1075 /* scalar float to int */
1076 LLVMTypeRef int_type
= LLVMIntType(type
.width
);
1077 res
= LLVMBuildFPToSI(bld
->builder
, a
, int_type
, "");
1081 assert(lp_check_value(type
, a
));
1083 if(util_cpu_caps
.has_sse4_1
) {
1084 res
= lp_build_round_sse41(bld
, a
, LP_BUILD_ROUND_SSE41_FLOOR
);
1087 /* Take the sign bit and add it to 1 constant */
1088 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
1089 unsigned mantissa
= lp_mantissa(type
);
1090 LLVMValueRef mask
= lp_build_const_int_vec(type
, (unsigned long long)1 << (type
.width
- 1));
1092 LLVMValueRef offset
;
1094 /* sign = a < 0 ? ~0 : 0 */
1095 sign
= LLVMBuildBitCast(bld
->builder
, a
, int_vec_type
, "");
1096 sign
= LLVMBuildAnd(bld
->builder
, sign
, mask
, "");
1097 sign
= LLVMBuildAShr(bld
->builder
, sign
, lp_build_const_int_vec(type
, type
.width
- 1), "");
1098 lp_build_name(sign
, "floor.sign");
1100 /* offset = -0.99999(9)f */
1101 offset
= lp_build_const_vec(type
, -(double)(((unsigned long long)1 << mantissa
) - 1)/((unsigned long long)1 << mantissa
));
1102 offset
= LLVMConstBitCast(offset
, int_vec_type
);
1104 /* offset = a < 0 ? -0.99999(9)f : 0.0f */
1105 offset
= LLVMBuildAnd(bld
->builder
, offset
, sign
, "");
1106 offset
= LLVMBuildBitCast(bld
->builder
, offset
, vec_type
, "");
1107 lp_build_name(offset
, "floor.offset");
1109 res
= LLVMBuildAdd(bld
->builder
, a
, offset
, "");
1110 lp_build_name(res
, "floor.res");
1113 res
= LLVMBuildFPToSI(bld
->builder
, res
, int_vec_type
, "");
1114 lp_build_name(res
, "floor");
1121 lp_build_iceil(struct lp_build_context
*bld
,
1124 const struct lp_type type
= bld
->type
;
1125 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(type
);
1128 assert(type
.floating
);
1129 assert(lp_check_value(type
, a
));
1131 if(util_cpu_caps
.has_sse4_1
) {
1132 res
= lp_build_round_sse41(bld
, a
, LP_BUILD_ROUND_SSE41_CEIL
);
1139 res
= LLVMBuildFPToSI(bld
->builder
, res
, int_vec_type
, "");
1146 lp_build_sqrt(struct lp_build_context
*bld
,
1149 const struct lp_type type
= bld
->type
;
1150 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
1153 /* TODO: optimize the constant case */
1154 /* TODO: optimize the constant case */
1156 assert(type
.floating
);
1157 util_snprintf(intrinsic
, sizeof intrinsic
, "llvm.sqrt.v%uf%u", type
.length
, type
.width
);
1159 return lp_build_intrinsic_unary(bld
->builder
, intrinsic
, vec_type
, a
);
1164 lp_build_rcp(struct lp_build_context
*bld
,
1167 const struct lp_type type
= bld
->type
;
1176 assert(type
.floating
);
1178 if(LLVMIsConstant(a
))
1179 return LLVMConstFDiv(bld
->one
, a
);
1181 if(util_cpu_caps
.has_sse
&& type
.width
== 32 && type
.length
== 4) {
1183 * XXX: Added precision is not always necessary, so only enable this
1184 * when we have a better system in place to track minimum precision.
1189 * Do one Newton-Raphson step to improve precision:
1191 * x1 = (2 - a * rcp(a)) * rcp(a)
1194 LLVMValueRef two
= lp_build_const_vec(bld
->type
, 2.0);
1198 rcp_a
= lp_build_intrinsic_unary(bld
->builder
, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type
), a
);
1200 res
= LLVMBuildMul(bld
->builder
, a
, rcp_a
, "");
1201 res
= LLVMBuildSub(bld
->builder
, two
, res
, "");
1202 res
= LLVMBuildMul(bld
->builder
, res
, rcp_a
, "");
1206 return lp_build_intrinsic_unary(bld
->builder
, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type
), a
);
1210 return LLVMBuildFDiv(bld
->builder
, bld
->one
, a
, "");
1215 * Generate 1/sqrt(a)
1218 lp_build_rsqrt(struct lp_build_context
*bld
,
1221 const struct lp_type type
= bld
->type
;
1223 assert(type
.floating
);
1225 if(util_cpu_caps
.has_sse
&& type
.width
== 32 && type
.length
== 4)
1226 return lp_build_intrinsic_unary(bld
->builder
, "llvm.x86.sse.rsqrt.ps", lp_build_vec_type(type
), a
);
1228 return lp_build_rcp(bld
, lp_build_sqrt(bld
, a
));
1232 #ifdef PIPE_OS_WINDOWS
1235 * XXX: X86 backend translates llvm.cos.v4f32 to 4 calls to CRT's cosf()
1236 * which is neither efficient nor does the CRT linkage work on Windows
1237 * causing segmentation fault.
1239 * XXX: With LLVM 2.7 both schemes cause an assertion failure.
1242 lp_build_sincos(struct lp_build_context
*bld
,
1244 float (*func
)(float),
1247 LLVMModuleRef module
=
1248 LLVMGetGlobalParent(LLVMGetBasicBlockParent(LLVMGetInsertBlock(bld
->builder
)));
1249 LLVMValueRef function
;
1253 assert(bld
->type
.floating
);
1254 assert(bld
->type
.width
== 32);
1256 function
= LLVMGetNamedFunction(module
, name
);
1258 LLVMTypeRef ret_type
;
1259 LLVMTypeRef arg_types
[1];
1260 LLVMTypeRef function_type
;
1262 ret_type
= LLVMFloatType();
1263 arg_types
[0] = LLVMFloatType();
1264 function_type
= LLVMFunctionType(ret_type
, arg_types
, Elements(arg_types
), 0);
1265 function
= LLVMAddFunction(module
, name
, function_type
);
1267 LLVMSetFunctionCallConv(function
, LLVMCCallConv
);
1268 LLVMSetLinkage(function
, LLVMPrivateLinkage
);
1270 assert(LLVMIsDeclaration(function
));
1272 LLVMAddGlobalMapping(lp_build_engine
, function
, func
);
1277 for (i
= 0; i
< bld
->type
.length
; ++i
) {
1278 LLVMValueRef index
= LLVMConstInt(LLVMInt32Type(), i
, 0);
1279 LLVMValueRef args
[1];
1282 args
[0] = LLVMBuildExtractElement(bld
->builder
, a
, index
, "");
1284 tmp
= LLVMBuildCall(bld
->builder
, function
, args
, Elements(args
), "");
1286 res
= LLVMBuildInsertElement(bld
->builder
, res
, tmp
, index
, "");
1293 lp_build_cos(struct lp_build_context
*bld
,
1296 return lp_build_sincos(bld
, "cosf", &cosf
, a
);
1300 lp_build_sin(struct lp_build_context
*bld
,
1303 return lp_build_sincos(bld
, "sinf", &sinf
, a
);
1306 #else /* !PIPE_OS_WINDOWS */
1312 lp_build_cos(struct lp_build_context
*bld
,
1315 const struct lp_type type
= bld
->type
;
1316 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
1319 /* TODO: optimize the constant case */
1321 assert(type
.floating
);
1322 util_snprintf(intrinsic
, sizeof intrinsic
, "llvm.cos.v%uf%u", type
.length
, type
.width
);
1324 return lp_build_intrinsic_unary(bld
->builder
, intrinsic
, vec_type
, a
);
1332 lp_build_sin(struct lp_build_context
*bld
,
1335 const struct lp_type type
= bld
->type
;
1336 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
1339 /* TODO: optimize the constant case */
1341 assert(type
.floating
);
1342 util_snprintf(intrinsic
, sizeof intrinsic
, "llvm.sin.v%uf%u", type
.length
, type
.width
);
1344 return lp_build_intrinsic_unary(bld
->builder
, intrinsic
, vec_type
, a
);
1347 #endif /* !PIPE_OS_WINDOWS */
1351 * Generate pow(x, y)
1354 lp_build_pow(struct lp_build_context
*bld
,
1358 /* TODO: optimize the constant case */
1359 if(LLVMIsConstant(x
) && LLVMIsConstant(y
))
1360 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1363 return lp_build_exp2(bld
, lp_build_mul(bld
, lp_build_log2(bld
, x
), y
));
1371 lp_build_exp(struct lp_build_context
*bld
,
1374 /* log2(e) = 1/log(2) */
1375 LLVMValueRef log2e
= lp_build_const_vec(bld
->type
, 1.4426950408889634);
1377 return lp_build_mul(bld
, log2e
, lp_build_exp2(bld
, x
));
1385 lp_build_log(struct lp_build_context
*bld
,
1389 LLVMValueRef log2
= lp_build_const_vec(bld
->type
, 0.69314718055994529);
1391 return lp_build_mul(bld
, log2
, lp_build_exp2(bld
, x
));
1395 #define EXP_POLY_DEGREE 3
1396 #define LOG_POLY_DEGREE 5
1400 * Generate polynomial.
1401 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
1404 lp_build_polynomial(struct lp_build_context
*bld
,
1406 const double *coeffs
,
1407 unsigned num_coeffs
)
1409 const struct lp_type type
= bld
->type
;
1410 LLVMTypeRef float_type
= LLVMFloatType();
1411 LLVMValueRef res
= NULL
;
1414 /* TODO: optimize the constant case */
1415 if(LLVMIsConstant(x
))
1416 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1419 for (i
= num_coeffs
; i
--; ) {
1422 if (type
.length
== 1)
1423 coeff
= LLVMConstReal(float_type
, coeffs
[i
]);
1425 coeff
= lp_build_const_vec(type
, coeffs
[i
]);
1428 res
= lp_build_add(bld
, coeff
, lp_build_mul(bld
, x
, res
));
/**
 * Minimax polynomial fit of 2**x, in range [0, 1[
 *
 * Coefficients are in ascending-degree order, as consumed by
 * lp_build_polynomial().  EXP_POLY_DEGREE selects the accuracy/cost
 * trade-off at compile time.
 */
const double lp_build_exp2_polynomial[] = {
#if EXP_POLY_DEGREE == 5
   0.999999999690134838155,
   0.583974334321735217258,
   0.164553105719676828492,
   0.0292811063701710962255,
   0.00354944426657875141846,
   0.000296253726543423377365
#elif EXP_POLY_DEGREE == 4
   1.00000001502262084505,
   0.563586057338685991394,
   0.150436017652442413623,
   0.0243220604213317927308,
   0.0025359088446580436489
#elif EXP_POLY_DEGREE == 3
   0.999925218562710312959,
   0.695833540494823811697,
   0.226067155427249155588,
   0.0780245226406372992967
#elif EXP_POLY_DEGREE == 2
   1.00172476321474503578,
   0.657636275736077639316,
   0.33718943461968720704
#else
#error
#endif
};
/**
 * Approximate 2^x for 32-bit floats.
 *
 * Splits x into integer and fractional parts: the integer part is turned
 * into an exact power of two by constructing the IEEE-754 exponent field
 * directly, and the fractional part goes through a minimax polynomial.
 * Any output pointer may be NULL to skip computing that partial result.
 *
 * @param bld              build context; asserts type.floating && width == 32
 * @param x                exponent argument
 * @param p_exp2_int_part  receives 2^floor(x) as a float, or NULL
 * @param p_frac_part      receives x - floor(x), or NULL
 * @param p_exp2           receives the full 2^x approximation, or NULL
 */
void
lp_build_exp2_approx(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef *p_exp2_int_part,
                     LLVMValueRef *p_frac_part,
                     LLVMValueRef *p_exp2)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
   LLVMValueRef ipart = NULL;
   LLVMValueRef fpart = NULL;
   LLVMValueRef expipart = NULL;
   LLVMValueRef expfpart = NULL;
   LLVMValueRef res = NULL;

   if(p_exp2_int_part || p_frac_part || p_exp2) {
      /* TODO: optimize the constant case */
      if(LLVMIsConstant(x))
         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                      __FUNCTION__);

      assert(type.floating && type.width == 32);

      /* Clamp x to the exponent range representable in single precision. */
      x = lp_build_min(bld, x, lp_build_const_vec(type, 129.0));
      x = lp_build_max(bld, x, lp_build_const_vec(type, -126.99999));

      /* ipart = floor(x) */
      ipart = lp_build_floor(bld, x);

      /* fpart = x - ipart */
      fpart = LLVMBuildSub(bld->builder, x, ipart, "");
   }

   if(p_exp2_int_part || p_exp2) {
      /* expipart = (float) (1 << ipart) */
      /* Bias ipart by 127 and shift it into the IEEE-754 exponent field
       * (bits 23..30), then reinterpret the bits as a float. */
      ipart = LLVMBuildFPToSI(bld->builder, ipart, int_vec_type, "");
      expipart = LLVMBuildAdd(bld->builder, ipart, lp_build_const_int_vec(type, 127), "");
      expipart = LLVMBuildShl(bld->builder, expipart, lp_build_const_int_vec(type, 23), "");
      expipart = LLVMBuildBitCast(bld->builder, expipart, vec_type, "");
   }

   if(p_exp2) {
      /* 2^fpart via minimax polynomial, fpart in [0, 1[. */
      expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
                                     Elements(lp_build_exp2_polynomial));

      /* 2^x = 2^ipart * 2^fpart */
      res = LLVMBuildMul(bld->builder, expipart, expfpart, "");
   }

   if(p_exp2_int_part)
      *p_exp2_int_part = expipart;

   if(p_frac_part)
      *p_frac_part = fpart;

   if(p_exp2)
      *p_exp2 = res;
}
1533 lp_build_exp2(struct lp_build_context
*bld
,
1537 lp_build_exp2_approx(bld
, x
, NULL
, NULL
, &res
);
1543 * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
1544 * These coefficients can be generated with
1545 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
/**
 * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
 *
 * Ascending-degree order, as consumed by lp_build_polynomial(); the caller
 * multiplies the result by (x - 1) afterwards, which also guarantees
 * log2(1) == 0.  LOG_POLY_DEGREE selects the accuracy/cost trade-off.
 */
const double lp_build_log2_polynomial[] = {
#if LOG_POLY_DEGREE == 6
   3.11578814719469302614,
   -3.32419399085241980044,
   2.59883907202499966007,
   -1.23152682416275988241,
   0.318212422185251071475,
   -0.0344359067839062357313
#elif LOG_POLY_DEGREE == 5
   2.8882704548164776201,
   -2.52074962577807006663,
   1.48116647521213171641,
   -0.465725644288844778798,
   0.0596515482674574969533
#elif LOG_POLY_DEGREE == 4
   2.61761038894603480148,
   -1.75647175389045657003,
   0.688243882994381274313,
   -0.107254423828329604454
#elif LOG_POLY_DEGREE == 3
   2.28330284476918490682,
   -1.04913055217340124191,
   0.204446009836232697516
#else
#error
#endif
};
/**
 * Approximate log2(x) for 32-bit floats (vector version).
 *
 * Decomposes x into exponent and mantissa via bit manipulation of the
 * IEEE-754 representation: the unbiased exponent gives floor(log2(x)),
 * and a minimax polynomial over the mantissa in [1, 2[ supplies the
 * fractional correction.  Any output pointer may be NULL to skip that
 * partial result.
 *
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 *
 * @param bld           build context; asserts type.floating && width == 32
 * @param x             argument (expected positive and finite)
 * @param p_exp         receives the masked exponent bits of x reinterpreted
 *                      as a float, or NULL
 * @param p_floor_log2  receives (float) floor(log2(x)), or NULL
 * @param p_log2        receives the full log2(x) approximation, or NULL
 */
void
lp_build_log2_approx(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef *p_exp,
                     LLVMValueRef *p_floor_log2,
                     LLVMValueRef *p_log2)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);

   /* IEEE-754 single precision field masks. */
   LLVMValueRef expmask = lp_build_const_int_vec(type, 0x7f800000);
   LLVMValueRef mantmask = lp_build_const_int_vec(type, 0x007fffff);
   LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);

   LLVMValueRef i = NULL;
   LLVMValueRef exp = NULL;
   LLVMValueRef mant = NULL;
   LLVMValueRef logexp = NULL;
   LLVMValueRef logmant = NULL;
   LLVMValueRef res = NULL;

   if(p_exp || p_floor_log2 || p_log2) {
      /* TODO: optimize the constant case */
      if(LLVMIsConstant(x))
         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                      __FUNCTION__);

      assert(type.floating && type.width == 32);

      /* Reinterpret the float bits as integers for field extraction. */
      i = LLVMBuildBitCast(bld->builder, x, int_vec_type, "");

      /* exp = (float) exponent(x) */
      exp = LLVMBuildAnd(bld->builder, i, expmask, "");
   }

   if(p_floor_log2 || p_log2) {
      /* Unbias the exponent: shift out of bits 23..30, subtract 127. */
      logexp = LLVMBuildLShr(bld->builder, exp, lp_build_const_int_vec(type, 23), "");
      logexp = LLVMBuildSub(bld->builder, logexp, lp_build_const_int_vec(type, 127), "");
      logexp = LLVMBuildSIToFP(bld->builder, logexp, vec_type, "");
   }

   if(p_log2) {
      /* mant = (float) mantissa(x) -- forced into [1, 2[ by OR-ing in the
       * exponent bits of 1.0. */
      mant = LLVMBuildAnd(bld->builder, i, mantmask, "");
      mant = LLVMBuildOr(bld->builder, mant, one, "");
      mant = LLVMBuildBitCast(bld->builder, mant, vec_type, "");

      logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
                                    Elements(lp_build_log2_polynomial));

      /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
      logmant = LLVMBuildMul(bld->builder, logmant, LLVMBuildSub(bld->builder, mant, bld->one, ""), "");

      res = LLVMBuildAdd(bld->builder, logmant, logexp, "");
   }

   if(p_exp) {
      exp = LLVMBuildBitCast(bld->builder, exp, vec_type, "");
      *p_exp = exp;
   }

   if(p_floor_log2)
      *p_floor_log2 = logexp;

   if(p_log2)
      *p_log2 = res;
}
/** scalar version of above function
 *
 * Identical algorithm to lp_build_log2_approx(), but operating on scalar
 * float/int types instead of vectors, so constants come from LLVMConstInt
 * rather than the vector-splat helpers.
 *
 * @param bld           build context; asserts type.floating && width == 32
 * @param x             scalar float argument (expected positive and finite)
 * @param p_exp         receives the masked exponent bits of x reinterpreted
 *                      as a float, or NULL
 * @param p_floor_log2  receives (float) floor(log2(x)), or NULL
 * @param p_log2        receives the full log2(x) approximation, or NULL
 */
void
lp_build_float_log2_approx(struct lp_build_context *bld,
                           LLVMValueRef x,
                           LLVMValueRef *p_exp,
                           LLVMValueRef *p_floor_log2,
                           LLVMValueRef *p_log2)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef float_type = LLVMFloatType();
   LLVMTypeRef int_type = LLVMIntType(type.width);

   /* IEEE-754 single precision field masks. */
   LLVMValueRef expmask = LLVMConstInt(int_type, 0x7f800000, 0);
   LLVMValueRef mantmask = LLVMConstInt(int_type, 0x007fffff, 0);
   LLVMValueRef one = LLVMConstBitCast(bld->one, int_type);

   LLVMValueRef i = NULL;
   LLVMValueRef exp = NULL;
   LLVMValueRef mant = NULL;
   LLVMValueRef logexp = NULL;
   LLVMValueRef logmant = NULL;
   LLVMValueRef res = NULL;

   if(p_exp || p_floor_log2 || p_log2) {
      /* TODO: optimize the constant case */
      if(LLVMIsConstant(x))
         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                      __FUNCTION__);

      assert(type.floating && type.width == 32);

      /* Reinterpret the float bits as an integer for field extraction. */
      i = LLVMBuildBitCast(bld->builder, x, int_type, "");

      /* exp = (float) exponent(x) */
      exp = LLVMBuildAnd(bld->builder, i, expmask, "");
   }

   if(p_floor_log2 || p_log2) {
      /* Unbias the exponent: shift out of bits 23..30, subtract 127. */
      LLVMValueRef c23 = LLVMConstInt(int_type, 23, 0);
      LLVMValueRef c127 = LLVMConstInt(int_type, 127, 0);
      logexp = LLVMBuildLShr(bld->builder, exp, c23, "");
      logexp = LLVMBuildSub(bld->builder, logexp, c127, "");
      logexp = LLVMBuildSIToFP(bld->builder, logexp, float_type, "");
   }

   if(p_log2) {
      /* mant = (float) mantissa(x) -- forced into [1, 2[ by OR-ing in the
       * exponent bits of 1.0. */
      mant = LLVMBuildAnd(bld->builder, i, mantmask, "");
      mant = LLVMBuildOr(bld->builder, mant, one, "");
      mant = LLVMBuildBitCast(bld->builder, mant, float_type, "");

      logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
                                    Elements(lp_build_log2_polynomial));

      /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
      logmant = LLVMBuildMul(bld->builder, logmant, LLVMBuildSub(bld->builder, mant, bld->one, ""), "");

      res = LLVMBuildAdd(bld->builder, logmant, logexp, "");
   }

   if(p_exp) {
      exp = LLVMBuildBitCast(bld->builder, exp, float_type, "");
      *p_exp = exp;
   }

   if(p_floor_log2)
      *p_floor_log2 = logexp;

   if(p_log2)
      *p_log2 = res;
}
1723 lp_build_log2(struct lp_build_context
*bld
,
1727 if (bld
->type
.length
== 1) {
1728 lp_build_float_log2_approx(bld
, x
, NULL
, NULL
, &res
);
1731 lp_build_log2_approx(bld
, x
, NULL
, NULL
, &res
);