1 /**************************************************************************
3 * Copyright 2009 VMware, Inc.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **************************************************************************/
/**
 * @file
 * Helper arithmetic functions.
 *
 * LLVM IR doesn't support all basic arithmetic operations we care about (most
 * notably min/max and saturated operations), and it is often necessary to
 * resort to machine-specific intrinsics directly. The functions here hide all
 * these implementation details from the other modules.
 *
 * We also do simple expression simplification here. Reasons are:
 * - it is very easy given we have all necessary information readily available
 * - LLVM optimization passes fail to simplify several vector expressions
 * - We often know value constraints which the optimization passes have no way
 *   of knowing, such as when source arguments are known to be in [0, 1] range.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */
48 #include "util/u_memory.h"
49 #include "util/u_debug.h"
50 #include "util/u_string.h"
51 #include "util/u_cpu_detect.h"
53 #include "lp_bld_type.h"
54 #include "lp_bld_const.h"
55 #include "lp_bld_intr.h"
56 #include "lp_bld_logic.h"
57 #include "lp_bld_arit.h"
62 * No checks for special case values of a or b = 1 or 0 are done.
65 lp_build_min_simple(struct lp_build_context
*bld
,
69 const struct lp_type type
= bld
->type
;
70 const char *intrinsic
= NULL
;
73 /* TODO: optimize the constant case */
75 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
76 if(type
.width
* type
.length
== 128) {
79 intrinsic
= "llvm.x86.sse.min.ps";
81 intrinsic
= "llvm.x86.sse2.min.pd";
84 if(type
.width
== 8 && !type
.sign
)
85 intrinsic
= "llvm.x86.sse2.pminu.b";
86 if(type
.width
== 8 && type
.sign
)
87 intrinsic
= "llvm.x86.sse41.pminsb";
88 if(type
.width
== 16 && !type
.sign
)
89 intrinsic
= "llvm.x86.sse41.pminuw";
90 if(type
.width
== 16 && type
.sign
)
91 intrinsic
= "llvm.x86.sse2.pmins.w";
92 if(type
.width
== 32 && !type
.sign
)
93 intrinsic
= "llvm.x86.sse41.pminud";
94 if(type
.width
== 32 && type
.sign
)
95 intrinsic
= "llvm.x86.sse41.pminsd";
101 return lp_build_intrinsic_binary(bld
->builder
, intrinsic
, lp_build_vec_type(bld
->type
), a
, b
);
103 cond
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, a
, b
);
104 return lp_build_select(bld
, cond
, a
, b
);
110 * No checks for special case values of a or b = 1 or 0 are done.
113 lp_build_max_simple(struct lp_build_context
*bld
,
117 const struct lp_type type
= bld
->type
;
118 const char *intrinsic
= NULL
;
121 /* TODO: optimize the constant case */
123 if(type
.width
* type
.length
== 128) {
125 if(type
.width
== 32 && util_cpu_caps
.has_sse
)
126 intrinsic
= "llvm.x86.sse.max.ps";
127 if(type
.width
== 64 && util_cpu_caps
.has_sse2
)
128 intrinsic
= "llvm.x86.sse2.max.pd";
131 if(type
.width
== 8 && !type
.sign
&& util_cpu_caps
.has_sse2
)
132 intrinsic
= "llvm.x86.sse2.pmaxu.b";
133 if(type
.width
== 8 && type
.sign
&& util_cpu_caps
.has_sse4_1
)
134 intrinsic
= "llvm.x86.sse41.pmaxsb";
135 if(type
.width
== 16 && !type
.sign
&& util_cpu_caps
.has_sse4_1
)
136 intrinsic
= "llvm.x86.sse41.pmaxuw";
137 if(type
.width
== 16 && type
.sign
&& util_cpu_caps
.has_sse2
)
138 intrinsic
= "llvm.x86.sse2.pmaxs.w";
139 if(type
.width
== 32 && !type
.sign
&& util_cpu_caps
.has_sse4_1
)
140 intrinsic
= "llvm.x86.sse41.pmaxud";
141 if(type
.width
== 32 && type
.sign
&& util_cpu_caps
.has_sse4_1
)
142 intrinsic
= "llvm.x86.sse41.pmaxsd";
147 return lp_build_intrinsic_binary(bld
->builder
, intrinsic
, lp_build_vec_type(bld
->type
), a
, b
);
149 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, b
);
150 return lp_build_select(bld
, cond
, a
, b
);
155 * Generate 1 - a, or ~a depending on bld->type.
158 lp_build_comp(struct lp_build_context
*bld
,
161 const struct lp_type type
= bld
->type
;
168 if(type
.norm
&& !type
.floating
&& !type
.fixed
&& !type
.sign
) {
169 if(LLVMIsConstant(a
))
170 return LLVMConstNot(a
);
172 return LLVMBuildNot(bld
->builder
, a
, "");
175 if(LLVMIsConstant(a
))
176 return LLVMConstSub(bld
->one
, a
);
178 return LLVMBuildSub(bld
->builder
, bld
->one
, a
, "");
186 lp_build_add(struct lp_build_context
*bld
,
190 const struct lp_type type
= bld
->type
;
197 if(a
== bld
->undef
|| b
== bld
->undef
)
201 const char *intrinsic
= NULL
;
203 if(a
== bld
->one
|| b
== bld
->one
)
206 if(util_cpu_caps
.has_sse2
&&
207 type
.width
* type
.length
== 128 &&
208 !type
.floating
&& !type
.fixed
) {
210 intrinsic
= type
.sign
? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
212 intrinsic
= type
.sign
? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
216 return lp_build_intrinsic_binary(bld
->builder
, intrinsic
, lp_build_vec_type(bld
->type
), a
, b
);
219 if(LLVMIsConstant(a
) && LLVMIsConstant(b
))
220 res
= LLVMConstAdd(a
, b
);
222 res
= LLVMBuildAdd(bld
->builder
, a
, b
, "");
224 /* clamp to ceiling of 1.0 */
225 if(bld
->type
.norm
&& (bld
->type
.floating
|| bld
->type
.fixed
))
226 res
= lp_build_min_simple(bld
, res
, bld
->one
);
228 /* XXX clamp to floor of -1 or 0??? */
238 lp_build_sub(struct lp_build_context
*bld
,
242 const struct lp_type type
= bld
->type
;
247 if(a
== bld
->undef
|| b
== bld
->undef
)
253 const char *intrinsic
= NULL
;
258 if(util_cpu_caps
.has_sse2
&&
259 type
.width
* type
.length
== 128 &&
260 !type
.floating
&& !type
.fixed
) {
262 intrinsic
= type
.sign
? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
264 intrinsic
= type
.sign
? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
268 return lp_build_intrinsic_binary(bld
->builder
, intrinsic
, lp_build_vec_type(bld
->type
), a
, b
);
271 if(LLVMIsConstant(a
) && LLVMIsConstant(b
))
272 res
= LLVMConstSub(a
, b
);
274 res
= LLVMBuildSub(bld
->builder
, a
, b
, "");
276 if(bld
->type
.norm
&& (bld
->type
.floating
|| bld
->type
.fixed
))
277 res
= lp_build_max_simple(bld
, res
, bld
->zero
);
284 * Build shuffle vectors that match PUNPCKLxx and PUNPCKHxx instructions.
287 lp_build_unpack_shuffle(unsigned n
, unsigned lo_hi
)
289 LLVMValueRef elems
[LP_MAX_VECTOR_LENGTH
];
292 assert(n
<= LP_MAX_VECTOR_LENGTH
);
295 for(i
= 0, j
= lo_hi
*n
/2; i
< n
; i
+= 2, ++j
) {
296 elems
[i
+ 0] = LLVMConstInt(LLVMInt32Type(), 0 + j
, 0);
297 elems
[i
+ 1] = LLVMConstInt(LLVMInt32Type(), n
+ j
, 0);
300 return LLVMConstVector(elems
, n
);
305 * Build constant int vector of width 'n' and value 'c'.
308 lp_build_const_vec(LLVMTypeRef type
, unsigned n
, long long c
)
310 LLVMValueRef elems
[LP_MAX_VECTOR_LENGTH
];
313 assert(n
<= LP_MAX_VECTOR_LENGTH
);
315 for(i
= 0; i
< n
; ++i
)
316 elems
[i
] = LLVMConstInt(type
, c
, 0);
318 return LLVMConstVector(elems
, n
);
323 * Normalized 8bit multiplication.
327 * makes the following approximation to the division (Sree)
329 * a*b/255 ~= (a*(b + 1)) >> 256
331 * which is the fastest method that satisfies the following OpenGL criteria
333 * 0*0 = 0 and 255*255 = 255
337 * takes the geometric series approximation to the division
339 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
341 * in this case just the first two terms to fit in 16bit arithmetic
343 * t/255 ~= (t + (t >> 8)) >> 8
345 * note that just by itself it doesn't satisfies the OpenGL criteria, as
346 * 255*255 = 254, so the special case b = 255 must be accounted or roundoff
349 * - geometric series plus rounding
351 * when using a geometric series division instead of truncating the result
352 * use roundoff in the approximation (Jim Blinn)
354 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
356 * achieving the exact results
358 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
359 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
360 * @sa Michael Herf, The "double blend trick", May 2000,
361 * http://www.stereopsis.com/doubleblend.html
364 lp_build_mul_u8n(LLVMBuilderRef builder
,
365 LLVMValueRef a
, LLVMValueRef b
)
367 static LLVMValueRef c01
= NULL
;
368 static LLVMValueRef c08
= NULL
;
369 static LLVMValueRef c80
= NULL
;
372 if(!c01
) c01
= lp_build_const_vec(LLVMInt16Type(), 8, 0x01);
373 if(!c08
) c08
= lp_build_const_vec(LLVMInt16Type(), 8, 0x08);
374 if(!c80
) c80
= lp_build_const_vec(LLVMInt16Type(), 8, 0x80);
378 /* a*b/255 ~= (a*(b + 1)) >> 256 */
379 b
= LLVMBuildAdd(builder
, b
, c01
, "");
380 ab
= LLVMBuildMul(builder
, a
, b
, "");
384 /* t/255 ~= (t + (t >> 8) + 0x80) >> 8 */
385 ab
= LLVMBuildMul(builder
, a
, b
, "");
386 ab
= LLVMBuildAdd(builder
, ab
, LLVMBuildLShr(builder
, ab
, c08
, ""), "");
387 ab
= LLVMBuildAdd(builder
, ab
, c80
, "");
391 ab
= LLVMBuildLShr(builder
, ab
, c08
, "");
401 lp_build_mul(struct lp_build_context
*bld
,
405 const struct lp_type type
= bld
->type
;
415 if(a
== bld
->undef
|| b
== bld
->undef
)
418 if(!type
.floating
&& !type
.fixed
&& type
.norm
) {
419 if(util_cpu_caps
.has_sse2
&& type
.width
== 8 && type
.length
== 16) {
420 LLVMTypeRef i16x8
= LLVMVectorType(LLVMInt16Type(), 8);
421 LLVMTypeRef i8x16
= LLVMVectorType(LLVMInt8Type(), 16);
422 static LLVMValueRef ml
= NULL
;
423 static LLVMValueRef mh
= NULL
;
424 LLVMValueRef al
, ah
, bl
, bh
;
425 LLVMValueRef abl
, abh
;
428 if(!ml
) ml
= lp_build_unpack_shuffle(16, 0);
429 if(!mh
) mh
= lp_build_unpack_shuffle(16, 1);
431 /* PUNPCKLBW, PUNPCKHBW */
432 al
= LLVMBuildShuffleVector(bld
->builder
, a
, bld
->zero
, ml
, "");
433 bl
= LLVMBuildShuffleVector(bld
->builder
, b
, bld
->zero
, ml
, "");
434 ah
= LLVMBuildShuffleVector(bld
->builder
, a
, bld
->zero
, mh
, "");
435 bh
= LLVMBuildShuffleVector(bld
->builder
, b
, bld
->zero
, mh
, "");
438 al
= LLVMBuildBitCast(bld
->builder
, al
, i16x8
, "");
439 bl
= LLVMBuildBitCast(bld
->builder
, bl
, i16x8
, "");
440 ah
= LLVMBuildBitCast(bld
->builder
, ah
, i16x8
, "");
441 bh
= LLVMBuildBitCast(bld
->builder
, bh
, i16x8
, "");
443 /* PMULLW, PSRLW, PADDW */
444 abl
= lp_build_mul_u8n(bld
->builder
, al
, bl
);
445 abh
= lp_build_mul_u8n(bld
->builder
, ah
, bh
);
448 ab
= lp_build_intrinsic_binary(bld
->builder
, "llvm.x86.sse2.packuswb.128" , i16x8
, abl
, abh
);
451 ab
= LLVMBuildBitCast(bld
->builder
, ab
, i8x16
, "");
460 if(LLVMIsConstant(a
) && LLVMIsConstant(b
))
461 return LLVMConstMul(a
, b
);
463 return LLVMBuildMul(bld
->builder
, a
, b
, "");
471 lp_build_div(struct lp_build_context
*bld
,
475 const struct lp_type type
= bld
->type
;
480 return lp_build_rcp(bld
, b
);
485 if(a
== bld
->undef
|| b
== bld
->undef
)
488 if(LLVMIsConstant(a
) && LLVMIsConstant(b
))
489 return LLVMConstFDiv(a
, b
);
491 if(util_cpu_caps
.has_sse
&& type
.width
== 32 && type
.length
== 4)
492 return lp_build_mul(bld
, a
, lp_build_rcp(bld
, b
));
494 return LLVMBuildFDiv(bld
->builder
, a
, b
, "");
499 lp_build_lerp(struct lp_build_context
*bld
,
504 return lp_build_add(bld
, v0
, lp_build_mul(bld
, x
, lp_build_sub(bld
, v1
, v0
)));
509 lp_build_lerp_2d(struct lp_build_context
*bld
,
517 LLVMValueRef v0
= lp_build_lerp(bld
, x
, v00
, v01
);
518 LLVMValueRef v1
= lp_build_lerp(bld
, x
, v10
, v11
);
519 return lp_build_lerp(bld
, y
, v0
, v1
);
525 * Do checks for special cases.
528 lp_build_min(struct lp_build_context
*bld
,
532 if(a
== bld
->undef
|| b
== bld
->undef
)
539 if(a
== bld
->zero
|| b
== bld
->zero
)
547 return lp_build_min_simple(bld
, a
, b
);
553 * Do checks for special cases.
556 lp_build_max(struct lp_build_context
*bld
,
560 if(a
== bld
->undef
|| b
== bld
->undef
)
567 if(a
== bld
->one
|| b
== bld
->one
)
575 return lp_build_max_simple(bld
, a
, b
);
583 lp_build_abs(struct lp_build_context
*bld
,
586 const struct lp_type type
= bld
->type
;
587 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
593 /* Mask out the sign bit */
594 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(type
);
595 LLVMValueRef mask
= lp_build_int_const_scalar(type
, ((unsigned long long)1 << type
.width
) - 1);
596 a
= LLVMBuildBitCast(bld
->builder
, a
, int_vec_type
, "");
597 a
= LLVMBuildAnd(bld
->builder
, a
, mask
, "");
598 a
= LLVMBuildBitCast(bld
->builder
, a
, vec_type
, "");
602 if(type
.width
*type
.length
== 128 && util_cpu_caps
.has_ssse3
) {
605 return lp_build_intrinsic_unary(bld
->builder
, "llvm.x86.ssse3.pabs.b.128", vec_type
, a
);
607 return lp_build_intrinsic_unary(bld
->builder
, "llvm.x86.ssse3.pabs.w.128", vec_type
, a
);
609 return lp_build_intrinsic_unary(bld
->builder
, "llvm.x86.ssse3.pabs.d.128", vec_type
, a
);
613 return lp_build_max(bld
, a
, LLVMBuildNeg(bld
->builder
, a
, ""));
618 lp_build_sgn(struct lp_build_context
*bld
,
621 const struct lp_type type
= bld
->type
;
622 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
626 /* Handle non-zero case */
628 /* if not zero then sign must be positive */
631 else if(type
.floating
) {
632 /* Take the sign bit and add it to 1 constant */
633 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(type
);
634 LLVMValueRef mask
= lp_build_int_const_scalar(type
, (unsigned long long)1 << (type
.width
- 1));
637 sign
= LLVMBuildBitCast(bld
->builder
, a
, int_vec_type
, "");
638 sign
= LLVMBuildAnd(bld
->builder
, sign
, mask
, "");
639 one
= LLVMConstBitCast(bld
->one
, int_vec_type
);
640 res
= LLVMBuildOr(bld
->builder
, sign
, one
, "");
641 res
= LLVMBuildBitCast(bld
->builder
, res
, vec_type
, "");
645 LLVMValueRef minus_one
= lp_build_const_scalar(type
, -1.0);
646 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, bld
->zero
);
647 res
= lp_build_select(bld
, cond
, bld
->one
, minus_one
);
651 cond
= lp_build_cmp(bld
, PIPE_FUNC_EQUAL
, a
, bld
->zero
);
652 res
= lp_build_select(bld
, cond
, bld
->zero
, bld
->one
);
/**
 * Immediate rounding-mode operand of the SSE4.1 ROUNDPS/ROUNDPD instructions.
 */
enum lp_build_round_sse41_mode
{
   LP_BUILD_ROUND_SSE41_NEAREST = 0,
   LP_BUILD_ROUND_SSE41_FLOOR = 1,
   LP_BUILD_ROUND_SSE41_CEIL = 2,
   LP_BUILD_ROUND_SSE41_TRUNCATE = 3
};
667 static INLINE LLVMValueRef
668 lp_build_round_sse41(struct lp_build_context
*bld
,
670 enum lp_build_round_sse41_mode mode
)
672 const struct lp_type type
= bld
->type
;
673 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
674 const char *intrinsic
;
676 assert(type
.floating
);
677 assert(type
.width
*type
.length
== 128);
681 intrinsic
= "llvm.x86.sse41.round.ps";
684 intrinsic
= "llvm.x86.sse41.round.pd";
691 return lp_build_intrinsic_binary(bld
->builder
, intrinsic
, vec_type
, a
,
692 LLVMConstInt(LLVMInt32Type(), mode
, 0));
697 lp_build_round(struct lp_build_context
*bld
,
700 const struct lp_type type
= bld
->type
;
702 assert(type
.floating
);
704 if(util_cpu_caps
.has_sse4_1
)
705 return lp_build_round_sse41(bld
, a
, LP_BUILD_ROUND_SSE41_NEAREST
);
714 lp_build_floor(struct lp_build_context
*bld
,
717 const struct lp_type type
= bld
->type
;
719 assert(type
.floating
);
721 if(util_cpu_caps
.has_sse4_1
)
722 return lp_build_round_sse41(bld
, a
, LP_BUILD_ROUND_SSE41_FLOOR
);
731 lp_build_ceil(struct lp_build_context
*bld
,
734 const struct lp_type type
= bld
->type
;
736 assert(type
.floating
);
738 if(util_cpu_caps
.has_sse4_1
)
739 return lp_build_round_sse41(bld
, a
, LP_BUILD_ROUND_SSE41_CEIL
);
748 lp_build_trunc(struct lp_build_context
*bld
,
751 const struct lp_type type
= bld
->type
;
753 assert(type
.floating
);
755 if(util_cpu_caps
.has_sse4_1
)
756 return lp_build_round_sse41(bld
, a
, LP_BUILD_ROUND_SSE41_TRUNCATE
);
765 * Convert to integer, through whichever rounding method that's fastest,
766 * typically truncating to zero.
769 lp_build_int(struct lp_build_context
*bld
,
772 const struct lp_type type
= bld
->type
;
773 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(type
);
775 assert(type
.floating
);
777 return LLVMBuildFPToSI(bld
->builder
, a
, int_vec_type
, "");
782 lp_build_ifloor(struct lp_build_context
*bld
,
785 a
= lp_build_floor(bld
, a
);
786 a
= lp_build_int(bld
, a
);
792 lp_build_sqrt(struct lp_build_context
*bld
,
795 const struct lp_type type
= bld
->type
;
796 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
799 /* TODO: optimize the constant case */
800 /* TODO: optimize the constant case */
802 assert(type
.floating
);
803 util_snprintf(intrinsic
, sizeof intrinsic
, "llvm.sqrt.v%uf%u", type
.length
, type
.width
);
805 return lp_build_intrinsic_unary(bld
->builder
, intrinsic
, vec_type
, a
);
810 lp_build_rcp(struct lp_build_context
*bld
,
813 const struct lp_type type
= bld
->type
;
822 assert(type
.floating
);
824 if(LLVMIsConstant(a
))
825 return LLVMConstFDiv(bld
->one
, a
);
827 if(util_cpu_caps
.has_sse
&& type
.width
== 32 && type
.length
== 4)
828 /* FIXME: improve precision */
829 return lp_build_intrinsic_unary(bld
->builder
, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type
), a
);
831 return LLVMBuildFDiv(bld
->builder
, bld
->one
, a
, "");
839 lp_build_rsqrt(struct lp_build_context
*bld
,
842 const struct lp_type type
= bld
->type
;
844 assert(type
.floating
);
846 if(util_cpu_caps
.has_sse
&& type
.width
== 32 && type
.length
== 4)
847 return lp_build_intrinsic_unary(bld
->builder
, "llvm.x86.sse.rsqrt.ps", lp_build_vec_type(type
), a
);
849 return lp_build_rcp(bld
, lp_build_sqrt(bld
, a
));
857 lp_build_cos(struct lp_build_context
*bld
,
860 const struct lp_type type
= bld
->type
;
861 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
864 /* TODO: optimize the constant case */
866 assert(type
.floating
);
867 util_snprintf(intrinsic
, sizeof intrinsic
, "llvm.cos.v%uf%u", type
.length
, type
.width
);
869 return lp_build_intrinsic_unary(bld
->builder
, intrinsic
, vec_type
, a
);
877 lp_build_sin(struct lp_build_context
*bld
,
880 const struct lp_type type
= bld
->type
;
881 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
884 /* TODO: optimize the constant case */
886 assert(type
.floating
);
887 util_snprintf(intrinsic
, sizeof intrinsic
, "llvm.sin.v%uf%u", type
.length
, type
.width
);
889 return lp_build_intrinsic_unary(bld
->builder
, intrinsic
, vec_type
, a
);
897 lp_build_pow(struct lp_build_context
*bld
,
901 /* TODO: optimize the constant case */
902 if(LLVMIsConstant(x
) && LLVMIsConstant(y
))
903 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
906 return lp_build_exp2(bld
, lp_build_mul(bld
, lp_build_log2(bld
, x
), y
));
914 lp_build_exp(struct lp_build_context
*bld
,
917 /* log2(e) = 1/log(2) */
918 LLVMValueRef log2e
= lp_build_const_scalar(bld
->type
, 1.4426950408889634);
920 return lp_build_mul(bld
, log2e
, lp_build_exp2(bld
, x
));
928 lp_build_log(struct lp_build_context
*bld
,
932 LLVMValueRef log2
= lp_build_const_scalar(bld
->type
, 1.4426950408889634);
934 return lp_build_mul(bld
, log2
, lp_build_exp2(bld
, x
));
938 #define EXP_POLY_DEGREE 3
939 #define LOG_POLY_DEGREE 5
943 * Generate polynomial.
944 * Ex: x^2 * coeffs[0] + x * coeffs[1] + coeffs[2].
947 lp_build_polynomial(struct lp_build_context
*bld
,
949 const double *coeffs
,
952 const struct lp_type type
= bld
->type
;
953 LLVMValueRef res
= NULL
;
956 /* TODO: optimize the constant case */
957 if(LLVMIsConstant(x
))
958 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
961 for (i
= num_coeffs
; i
--; ) {
962 LLVMValueRef coeff
= lp_build_const_scalar(type
, coeffs
[i
]);
964 res
= lp_build_add(bld
, coeff
, lp_build_mul(bld
, x
, res
));
977 * Minimax polynomial fit of 2**x, in range [-0.5, 0.5[
979 const double lp_build_exp2_polynomial
[] = {
980 #if EXP_POLY_DEGREE == 5
981 9.9999994e-1, 6.9315308e-1, 2.4015361e-1, 5.5826318e-2, 8.9893397e-3, 1.8775767e-3
982 #elif EXP_POLY_DEGREE == 4
983 1.0000026, 6.9300383e-1, 2.4144275e-1, 5.2011464e-2, 1.3534167e-2
984 #elif EXP_POLY_DEGREE == 3
985 9.9992520e-1, 6.9583356e-1, 2.2606716e-1, 7.8024521e-2
986 #elif EXP_POLY_DEGREE == 2
987 1.0017247, 6.5763628e-1, 3.3718944e-1
995 lp_build_exp2_approx(struct lp_build_context
*bld
,
997 LLVMValueRef
*p_exp2_int_part
,
998 LLVMValueRef
*p_frac_part
,
999 LLVMValueRef
*p_exp2
)
1001 const struct lp_type type
= bld
->type
;
1002 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
1003 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(type
);
1004 LLVMValueRef ipart
= NULL
;
1005 LLVMValueRef fpart
= NULL
;
1006 LLVMValueRef expipart
= NULL
;
1007 LLVMValueRef expfpart
= NULL
;
1008 LLVMValueRef res
= NULL
;
1010 if(p_exp2_int_part
|| p_frac_part
|| p_exp2
) {
1011 /* TODO: optimize the constant case */
1012 if(LLVMIsConstant(x
))
1013 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1016 assert(type
.floating
&& type
.width
== 32);
1018 x
= lp_build_min(bld
, x
, lp_build_const_scalar(type
, 129.0));
1019 x
= lp_build_max(bld
, x
, lp_build_const_scalar(type
, -126.99999));
1021 /* ipart = int(x - 0.5) */
1022 ipart
= LLVMBuildSub(bld
->builder
, x
, lp_build_const_scalar(type
, 0.5f
), "");
1023 ipart
= LLVMBuildFPToSI(bld
->builder
, ipart
, int_vec_type
, "");
1025 /* fpart = x - ipart */
1026 fpart
= LLVMBuildSIToFP(bld
->builder
, ipart
, vec_type
, "");
1027 fpart
= LLVMBuildSub(bld
->builder
, x
, fpart
, "");
1030 if(p_exp2_int_part
|| p_exp2
) {
1031 /* expipart = (float) (1 << ipart) */
1032 expipart
= LLVMBuildAdd(bld
->builder
, ipart
, lp_build_int_const_scalar(type
, 127), "");
1033 expipart
= LLVMBuildShl(bld
->builder
, expipart
, lp_build_int_const_scalar(type
, 23), "");
1034 expipart
= LLVMBuildBitCast(bld
->builder
, expipart
, vec_type
, "");
1038 expfpart
= lp_build_polynomial(bld
, fpart
, lp_build_exp2_polynomial
,
1039 Elements(lp_build_exp2_polynomial
));
1041 res
= LLVMBuildMul(bld
->builder
, expipart
, expfpart
, "");
1045 *p_exp2_int_part
= expipart
;
1048 *p_frac_part
= fpart
;
1056 lp_build_exp2(struct lp_build_context
*bld
,
1060 lp_build_exp2_approx(bld
, x
, NULL
, NULL
, &res
);
1066 * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
1067 * These coefficients can be generate with
1068 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
1070 const double lp_build_log2_polynomial
[] = {
1071 #if LOG_POLY_DEGREE == 6
1072 3.11578814719469302614, -3.32419399085241980044, 2.59883907202499966007, -1.23152682416275988241, 0.318212422185251071475, -0.0344359067839062357313
1073 #elif LOG_POLY_DEGREE == 5
1074 2.8882704548164776201, -2.52074962577807006663, 1.48116647521213171641, -0.465725644288844778798, 0.0596515482674574969533
1075 #elif LOG_POLY_DEGREE == 4
1076 2.61761038894603480148, -1.75647175389045657003, 0.688243882994381274313, -0.107254423828329604454
1077 #elif LOG_POLY_DEGREE == 3
1078 2.28330284476918490682, -1.04913055217340124191, 0.204446009836232697516
1086 * See http://www.devmaster.net/forums/showthread.php?p=43580
1089 lp_build_log2_approx(struct lp_build_context
*bld
,
1091 LLVMValueRef
*p_exp
,
1092 LLVMValueRef
*p_floor_log2
,
1093 LLVMValueRef
*p_log2
)
1095 const struct lp_type type
= bld
->type
;
1096 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
1097 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(type
);
1099 LLVMValueRef expmask
= lp_build_int_const_scalar(type
, 0x7f800000);
1100 LLVMValueRef mantmask
= lp_build_int_const_scalar(type
, 0x007fffff);
1101 LLVMValueRef one
= LLVMConstBitCast(bld
->one
, int_vec_type
);
1103 LLVMValueRef i
= NULL
;
1104 LLVMValueRef exp
= NULL
;
1105 LLVMValueRef mant
= NULL
;
1106 LLVMValueRef logexp
= NULL
;
1107 LLVMValueRef logmant
= NULL
;
1108 LLVMValueRef res
= NULL
;
1110 if(p_exp
|| p_floor_log2
|| p_log2
) {
1111 /* TODO: optimize the constant case */
1112 if(LLVMIsConstant(x
))
1113 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1116 assert(type
.floating
&& type
.width
== 32);
1118 i
= LLVMBuildBitCast(bld
->builder
, x
, int_vec_type
, "");
1120 /* exp = (float) exponent(x) */
1121 exp
= LLVMBuildAnd(bld
->builder
, i
, expmask
, "");
1124 if(p_floor_log2
|| p_log2
) {
1125 logexp
= LLVMBuildLShr(bld
->builder
, exp
, lp_build_int_const_scalar(type
, 23), "");
1126 logexp
= LLVMBuildSub(bld
->builder
, logexp
, lp_build_int_const_scalar(type
, 127), "");
1127 logexp
= LLVMBuildSIToFP(bld
->builder
, logexp
, vec_type
, "");
1131 /* mant = (float) mantissa(x) */
1132 mant
= LLVMBuildAnd(bld
->builder
, i
, mantmask
, "");
1133 mant
= LLVMBuildOr(bld
->builder
, mant
, one
, "");
1134 mant
= LLVMBuildSIToFP(bld
->builder
, mant
, vec_type
, "");
1136 logmant
= lp_build_polynomial(bld
, mant
, lp_build_log2_polynomial
,
1137 Elements(lp_build_log2_polynomial
));
1139 /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
1140 logmant
= LLVMBuildMul(bld
->builder
, logmant
, LLVMBuildMul(bld
->builder
, mant
, bld
->one
, ""), "");
1142 res
= LLVMBuildAdd(bld
->builder
, logmant
, logexp
, "");
1149 *p_floor_log2
= logexp
;
1157 lp_build_log2(struct lp_build_context
*bld
,
1161 lp_build_log2_approx(bld
, x
, NULL
, NULL
, &res
);