1 /**************************************************************************
3 * Copyright 2009 VMware, Inc.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **************************************************************************/
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
38 * We also do simple expressions simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
44 * @author Jose Fonseca <jfonseca@vmware.com>
48 #include "util/u_memory.h"
49 #include "util/u_debug.h"
50 #include "util/u_math.h"
51 #include "util/u_string.h"
52 #include "util/u_cpu_detect.h"
54 #include "lp_bld_type.h"
55 #include "lp_bld_const.h"
56 #include "lp_bld_intr.h"
57 #include "lp_bld_init.h" /* for lp_build_engine */
58 #include "lp_bld_logic.h"
59 #include "lp_bld_pack.h"
60 #include "lp_bld_debug.h"
61 #include "lp_bld_arit.h"
66 * No checks for special case values of a or b = 1 or 0 are done.
69 lp_build_min_simple(struct lp_build_context
*bld
,
73 const struct lp_type type
= bld
->type
;
74 const char *intrinsic
= NULL
;
77 /* TODO: optimize the constant case */
79 if(type
.width
* type
.length
== 128) {
81 if(type
.width
== 32 && util_cpu_caps
.has_sse
)
82 intrinsic
= "llvm.x86.sse.min.ps";
83 if(type
.width
== 64 && util_cpu_caps
.has_sse2
)
84 intrinsic
= "llvm.x86.sse2.min.pd";
87 if(type
.width
== 8 && !type
.sign
&& util_cpu_caps
.has_sse2
)
88 intrinsic
= "llvm.x86.sse2.pminu.b";
89 if(type
.width
== 8 && type
.sign
&& util_cpu_caps
.has_sse4_1
)
90 intrinsic
= "llvm.x86.sse41.pminsb";
91 if(type
.width
== 16 && !type
.sign
&& util_cpu_caps
.has_sse4_1
)
92 intrinsic
= "llvm.x86.sse41.pminuw";
93 if(type
.width
== 16 && type
.sign
&& util_cpu_caps
.has_sse2
)
94 intrinsic
= "llvm.x86.sse2.pmins.w";
95 if(type
.width
== 32 && !type
.sign
&& util_cpu_caps
.has_sse4_1
)
96 intrinsic
= "llvm.x86.sse41.pminud";
97 if(type
.width
== 32 && type
.sign
&& util_cpu_caps
.has_sse4_1
)
98 intrinsic
= "llvm.x86.sse41.pminsd";
103 return lp_build_intrinsic_binary(bld
->builder
, intrinsic
, lp_build_vec_type(bld
->type
), a
, b
);
105 cond
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, a
, b
);
106 return lp_build_select(bld
, cond
, a
, b
);
112 * No checks for special case values of a or b = 1 or 0 are done.
115 lp_build_max_simple(struct lp_build_context
*bld
,
119 const struct lp_type type
= bld
->type
;
120 const char *intrinsic
= NULL
;
123 /* TODO: optimize the constant case */
125 if(type
.width
* type
.length
== 128) {
127 if(type
.width
== 32 && util_cpu_caps
.has_sse
)
128 intrinsic
= "llvm.x86.sse.max.ps";
129 if(type
.width
== 64 && util_cpu_caps
.has_sse2
)
130 intrinsic
= "llvm.x86.sse2.max.pd";
133 if(type
.width
== 8 && !type
.sign
&& util_cpu_caps
.has_sse2
)
134 intrinsic
= "llvm.x86.sse2.pmaxu.b";
135 if(type
.width
== 8 && type
.sign
&& util_cpu_caps
.has_sse4_1
)
136 intrinsic
= "llvm.x86.sse41.pmaxsb";
137 if(type
.width
== 16 && !type
.sign
&& util_cpu_caps
.has_sse4_1
)
138 intrinsic
= "llvm.x86.sse41.pmaxuw";
139 if(type
.width
== 16 && type
.sign
&& util_cpu_caps
.has_sse2
)
140 intrinsic
= "llvm.x86.sse2.pmaxs.w";
141 if(type
.width
== 32 && !type
.sign
&& util_cpu_caps
.has_sse4_1
)
142 intrinsic
= "llvm.x86.sse41.pmaxud";
143 if(type
.width
== 32 && type
.sign
&& util_cpu_caps
.has_sse4_1
)
144 intrinsic
= "llvm.x86.sse41.pmaxsd";
149 return lp_build_intrinsic_binary(bld
->builder
, intrinsic
, lp_build_vec_type(bld
->type
), a
, b
);
151 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, b
);
152 return lp_build_select(bld
, cond
, a
, b
);
157 * Generate 1 - a, or ~a depending on bld->type.
160 lp_build_comp(struct lp_build_context
*bld
,
163 const struct lp_type type
= bld
->type
;
170 if(type
.norm
&& !type
.floating
&& !type
.fixed
&& !type
.sign
) {
171 if(LLVMIsConstant(a
))
172 return LLVMConstNot(a
);
174 return LLVMBuildNot(bld
->builder
, a
, "");
177 if(LLVMIsConstant(a
))
178 return LLVMConstSub(bld
->one
, a
);
180 return LLVMBuildSub(bld
->builder
, bld
->one
, a
, "");
188 lp_build_add(struct lp_build_context
*bld
,
192 const struct lp_type type
= bld
->type
;
199 if(a
== bld
->undef
|| b
== bld
->undef
)
203 const char *intrinsic
= NULL
;
205 if(a
== bld
->one
|| b
== bld
->one
)
208 if(util_cpu_caps
.has_sse2
&&
209 type
.width
* type
.length
== 128 &&
210 !type
.floating
&& !type
.fixed
) {
212 intrinsic
= type
.sign
? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
214 intrinsic
= type
.sign
? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
218 return lp_build_intrinsic_binary(bld
->builder
, intrinsic
, lp_build_vec_type(bld
->type
), a
, b
);
221 if(LLVMIsConstant(a
) && LLVMIsConstant(b
))
222 res
= LLVMConstAdd(a
, b
);
224 res
= LLVMBuildAdd(bld
->builder
, a
, b
, "");
226 /* clamp to ceiling of 1.0 */
227 if(bld
->type
.norm
&& (bld
->type
.floating
|| bld
->type
.fixed
))
228 res
= lp_build_min_simple(bld
, res
, bld
->one
);
230 /* XXX clamp to floor of -1 or 0??? */
236 /** Return the sum of the elements of a */
238 lp_build_sum_vector(struct lp_build_context
*bld
,
241 const struct lp_type type
= bld
->type
;
242 LLVMValueRef index
, res
;
249 assert(type
.length
> 1);
251 assert(!bld
->type
.norm
);
253 index
= LLVMConstInt(LLVMInt32Type(), 0, 0);
254 res
= LLVMBuildExtractElement(bld
->builder
, a
, index
, "");
256 for (i
= 1; i
< type
.length
; i
++) {
257 index
= LLVMConstInt(LLVMInt32Type(), i
, 0);
258 res
= LLVMBuildAdd(bld
->builder
, res
,
259 LLVMBuildExtractElement(bld
->builder
, a
, index
, ""),
271 lp_build_sub(struct lp_build_context
*bld
,
275 const struct lp_type type
= bld
->type
;
280 if(a
== bld
->undef
|| b
== bld
->undef
)
286 const char *intrinsic
= NULL
;
291 if(util_cpu_caps
.has_sse2
&&
292 type
.width
* type
.length
== 128 &&
293 !type
.floating
&& !type
.fixed
) {
295 intrinsic
= type
.sign
? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
297 intrinsic
= type
.sign
? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
301 return lp_build_intrinsic_binary(bld
->builder
, intrinsic
, lp_build_vec_type(bld
->type
), a
, b
);
304 if(LLVMIsConstant(a
) && LLVMIsConstant(b
))
305 res
= LLVMConstSub(a
, b
);
307 res
= LLVMBuildSub(bld
->builder
, a
, b
, "");
309 if(bld
->type
.norm
&& (bld
->type
.floating
|| bld
->type
.fixed
))
310 res
= lp_build_max_simple(bld
, res
, bld
->zero
);
317 * Normalized 8bit multiplication.
321 * makes the following approximation to the division (Sree)
323 * a*b/255 ~= (a*(b + 1)) >> 8
325 * which is the fastest method that satisfies the following OpenGL criteria
327 * 0*0 = 0 and 255*255 = 255
331 * takes the geometric series approximation to the division
333 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
335 * in this case just the first two terms to fit in 16bit arithmetic
337 * t/255 ~= (t + (t >> 8)) >> 8
339 * note that just by itself it doesn't satisfy the OpenGL criteria, as
340 * 255*255 = 254, so the special case b = 255 must be accounted or roundoff
343 * - geometric series plus rounding
345 * when using a geometric series division instead of truncating the result
346 * use roundoff in the approximation (Jim Blinn)
348 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
350 * achieving the exact results
352 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
353 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
354 * @sa Michael Herf, The "double blend trick", May 2000,
355 * http://www.stereopsis.com/doubleblend.html
358 lp_build_mul_u8n(LLVMBuilderRef builder
,
359 struct lp_type i16_type
,
360 LLVMValueRef a
, LLVMValueRef b
)
365 c8
= lp_build_const_int_vec(i16_type
, 8);
369 /* a*b/255 ~= (a*(b + 1)) >> 256 */
370 b
= LLVMBuildAdd(builder
, b
, lp_build_const_int_vec(i16_type
, 1), "");
371 ab
= LLVMBuildMul(builder
, a
, b
, "");
375 /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */
376 ab
= LLVMBuildMul(builder
, a
, b
, "");
377 ab
= LLVMBuildAdd(builder
, ab
, LLVMBuildLShr(builder
, ab
, c8
, ""), "");
378 ab
= LLVMBuildAdd(builder
, ab
, lp_build_const_int_vec(i16_type
, 0x80), "");
382 ab
= LLVMBuildLShr(builder
, ab
, c8
, "");
392 lp_build_mul(struct lp_build_context
*bld
,
396 const struct lp_type type
= bld
->type
;
408 if(a
== bld
->undef
|| b
== bld
->undef
)
411 if(!type
.floating
&& !type
.fixed
&& type
.norm
) {
412 if(type
.width
== 8) {
413 struct lp_type i16_type
= lp_wider_type(type
);
414 LLVMValueRef al
, ah
, bl
, bh
, abl
, abh
, ab
;
416 lp_build_unpack2(bld
->builder
, type
, i16_type
, a
, &al
, &ah
);
417 lp_build_unpack2(bld
->builder
, type
, i16_type
, b
, &bl
, &bh
);
419 /* PMULLW, PSRLW, PADDW */
420 abl
= lp_build_mul_u8n(bld
->builder
, i16_type
, al
, bl
);
421 abh
= lp_build_mul_u8n(bld
->builder
, i16_type
, ah
, bh
);
423 ab
= lp_build_pack2(bld
->builder
, i16_type
, type
, abl
, abh
);
433 shift
= lp_build_const_int_vec(type
, type
.width
/2);
437 if(LLVMIsConstant(a
) && LLVMIsConstant(b
)) {
438 res
= LLVMConstMul(a
, b
);
441 res
= LLVMConstAShr(res
, shift
);
443 res
= LLVMConstLShr(res
, shift
);
447 res
= LLVMBuildMul(bld
->builder
, a
, b
, "");
450 res
= LLVMBuildAShr(bld
->builder
, res
, shift
, "");
452 res
= LLVMBuildLShr(bld
->builder
, res
, shift
, "");
461 * Small vector x scale multiplication optimization.
464 lp_build_mul_imm(struct lp_build_context
*bld
,
477 return LLVMBuildNeg(bld
->builder
, a
, "");
479 if(b
== 2 && bld
->type
.floating
)
480 return lp_build_add(bld
, a
, a
);
483 unsigned shift
= ffs(b
) - 1;
485 if(bld
->type
.floating
) {
488 * Power of two multiplication by directly manipulating the mantissa.
490 * XXX: This might not be always faster, it will introduce a small error
491 * for multiplication by zero, and it will produce wrong results
494 unsigned mantissa
= lp_mantissa(bld
->type
);
495 factor
= lp_build_const_int_vec(bld
->type
, (unsigned long long)shift
<< mantissa
);
496 a
= LLVMBuildBitCast(bld
->builder
, a
, lp_build_int_vec_type(bld
->type
), "");
497 a
= LLVMBuildAdd(bld
->builder
, a
, factor
, "");
498 a
= LLVMBuildBitCast(bld
->builder
, a
, lp_build_vec_type(bld
->type
), "");
503 factor
= lp_build_const_vec(bld
->type
, shift
);
504 return LLVMBuildShl(bld
->builder
, a
, factor
, "");
508 factor
= lp_build_const_vec(bld
->type
, (double)b
);
509 return lp_build_mul(bld
, a
, factor
);
517 lp_build_div(struct lp_build_context
*bld
,
521 const struct lp_type type
= bld
->type
;
526 return lp_build_rcp(bld
, b
);
531 if(a
== bld
->undef
|| b
== bld
->undef
)
534 if(LLVMIsConstant(a
) && LLVMIsConstant(b
))
535 return LLVMConstFDiv(a
, b
);
537 if(util_cpu_caps
.has_sse
&& type
.width
== 32 && type
.length
== 4)
538 return lp_build_mul(bld
, a
, lp_build_rcp(bld
, b
));
540 return LLVMBuildFDiv(bld
->builder
, a
, b
, "");
545 * Linear interpolation.
547 * This also works for integer values with a few caveats.
549 * @sa http://www.stereopsis.com/doubleblend.html
552 lp_build_lerp(struct lp_build_context
*bld
,
560 delta
= lp_build_sub(bld
, v1
, v0
);
562 res
= lp_build_mul(bld
, x
, delta
);
564 res
= lp_build_add(bld
, v0
, res
);
567 /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
568 * but it will be wrong for other uses. Basically we need a more
569 * powerful lp_type, capable of further distinguishing the values
570 * interpretation from the value storage. */
571 res
= LLVMBuildAnd(bld
->builder
, res
, lp_build_const_int_vec(bld
->type
, (1 << bld
->type
.width
/2) - 1), "");
578 lp_build_lerp_2d(struct lp_build_context
*bld
,
586 LLVMValueRef v0
= lp_build_lerp(bld
, x
, v00
, v01
);
587 LLVMValueRef v1
= lp_build_lerp(bld
, x
, v10
, v11
);
588 return lp_build_lerp(bld
, y
, v0
, v1
);
594 * Do checks for special cases.
597 lp_build_min(struct lp_build_context
*bld
,
601 if(a
== bld
->undef
|| b
== bld
->undef
)
608 if(a
== bld
->zero
|| b
== bld
->zero
)
616 return lp_build_min_simple(bld
, a
, b
);
622 * Do checks for special cases.
625 lp_build_max(struct lp_build_context
*bld
,
629 if(a
== bld
->undef
|| b
== bld
->undef
)
636 if(a
== bld
->one
|| b
== bld
->one
)
644 return lp_build_max_simple(bld
, a
, b
);
649 * Generate clamp(a, min, max)
650 * Do checks for special cases.
653 lp_build_clamp(struct lp_build_context
*bld
,
658 a
= lp_build_min(bld
, a
, max
);
659 a
= lp_build_max(bld
, a
, min
);
668 lp_build_abs(struct lp_build_context
*bld
,
671 const struct lp_type type
= bld
->type
;
672 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
678 /* Mask out the sign bit */
679 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(type
);
680 unsigned long long absMask
= ~(1ULL << (type
.width
- 1));
681 LLVMValueRef mask
= lp_build_const_int_vec(type
, ((unsigned long long) absMask
));
682 a
= LLVMBuildBitCast(bld
->builder
, a
, int_vec_type
, "");
683 a
= LLVMBuildAnd(bld
->builder
, a
, mask
, "");
684 a
= LLVMBuildBitCast(bld
->builder
, a
, vec_type
, "");
688 if(type
.width
*type
.length
== 128 && util_cpu_caps
.has_ssse3
) {
691 return lp_build_intrinsic_unary(bld
->builder
, "llvm.x86.ssse3.pabs.b.128", vec_type
, a
);
693 return lp_build_intrinsic_unary(bld
->builder
, "llvm.x86.ssse3.pabs.w.128", vec_type
, a
);
695 return lp_build_intrinsic_unary(bld
->builder
, "llvm.x86.ssse3.pabs.d.128", vec_type
, a
);
699 return lp_build_max(bld
, a
, LLVMBuildNeg(bld
->builder
, a
, ""));
704 lp_build_negate(struct lp_build_context
*bld
,
707 return LLVMBuildNeg(bld
->builder
, a
, "");
711 /** Return -1, 0 or +1 depending on the sign of a */
713 lp_build_sgn(struct lp_build_context
*bld
,
716 const struct lp_type type
= bld
->type
;
720 /* Handle non-zero case */
722 /* if not zero then sign must be positive */
725 else if(type
.floating
) {
726 LLVMTypeRef vec_type
;
727 LLVMTypeRef int_type
;
731 unsigned long long maskBit
= (unsigned long long)1 << (type
.width
- 1);
733 int_type
= lp_build_int_vec_type(type
);
734 vec_type
= lp_build_vec_type(type
);
735 mask
= lp_build_const_int_vec(type
, maskBit
);
737 /* Take the sign bit and add it to 1 constant */
738 sign
= LLVMBuildBitCast(bld
->builder
, a
, int_type
, "");
739 sign
= LLVMBuildAnd(bld
->builder
, sign
, mask
, "");
740 one
= LLVMConstBitCast(bld
->one
, int_type
);
741 res
= LLVMBuildOr(bld
->builder
, sign
, one
, "");
742 res
= LLVMBuildBitCast(bld
->builder
, res
, vec_type
, "");
746 LLVMValueRef minus_one
= lp_build_const_vec(type
, -1.0);
747 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, bld
->zero
);
748 res
= lp_build_select(bld
, cond
, bld
->one
, minus_one
);
752 cond
= lp_build_cmp(bld
, PIPE_FUNC_EQUAL
, a
, bld
->zero
);
753 res
= lp_build_select(bld
, cond
, bld
->zero
, res
);
760 * Set the sign of float vector 'a' according to 'sign'.
761 * If sign==0, return abs(a).
762 * If sign==1, return -abs(a);
763 * Other values for sign produce undefined results.
766 lp_build_set_sign(struct lp_build_context
*bld
,
767 LLVMValueRef a
, LLVMValueRef sign
)
769 const struct lp_type type
= bld
->type
;
770 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(type
);
771 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
772 LLVMValueRef shift
= lp_build_const_int_vec(type
, type
.width
- 1);
773 LLVMValueRef mask
= lp_build_const_int_vec(type
,
774 ~((unsigned long long) 1 << (type
.width
- 1)));
775 LLVMValueRef val
, res
;
777 assert(type
.floating
);
779 /* val = reinterpret_cast<int>(a) */
780 val
= LLVMBuildBitCast(bld
->builder
, a
, int_vec_type
, "");
781 /* val = val & mask */
782 val
= LLVMBuildAnd(bld
->builder
, val
, mask
, "");
783 /* sign = sign << shift */
784 sign
= LLVMBuildShl(bld
->builder
, sign
, shift
, "");
785 /* res = val | sign */
786 res
= LLVMBuildOr(bld
->builder
, val
, sign
, "");
787 /* res = reinterpret_cast<float>(res) */
788 res
= LLVMBuildBitCast(bld
->builder
, res
, vec_type
, "");
795 * Convert vector of (or scalar) int to vector of (or scalar) float.
798 lp_build_int_to_float(struct lp_build_context
*bld
,
801 const struct lp_type type
= bld
->type
;
802 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
804 assert(type
.floating
);
806 return LLVMBuildSIToFP(bld
->builder
, a
, vec_type
, "");
/* Rounding-mode immediates for the SSE4.1 ROUNDPS/ROUNDPD instructions;
 * values match the instruction's imm8 encoding. */
enum lp_build_round_sse41_mode
{
   LP_BUILD_ROUND_SSE41_NEAREST = 0,   /* round to nearest even */
   LP_BUILD_ROUND_SSE41_FLOOR = 1,     /* round toward -inf */
   LP_BUILD_ROUND_SSE41_CEIL = 2,      /* round toward +inf */
   LP_BUILD_ROUND_SSE41_TRUNCATE = 3   /* round toward zero */
};
820 static INLINE LLVMValueRef
821 lp_build_round_sse41(struct lp_build_context
*bld
,
823 enum lp_build_round_sse41_mode mode
)
825 const struct lp_type type
= bld
->type
;
826 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
827 const char *intrinsic
;
829 assert(type
.floating
);
830 assert(type
.width
*type
.length
== 128);
831 assert(lp_check_value(type
, a
));
832 assert(util_cpu_caps
.has_sse4_1
);
836 intrinsic
= "llvm.x86.sse41.round.ps";
839 intrinsic
= "llvm.x86.sse41.round.pd";
846 return lp_build_intrinsic_binary(bld
->builder
, intrinsic
, vec_type
, a
,
847 LLVMConstInt(LLVMInt32Type(), mode
, 0));
852 lp_build_trunc(struct lp_build_context
*bld
,
855 const struct lp_type type
= bld
->type
;
857 assert(type
.floating
);
858 assert(lp_check_value(type
, a
));
860 if (util_cpu_caps
.has_sse4_1
&& type
.width
*type
.length
== 128)
861 return lp_build_round_sse41(bld
, a
, LP_BUILD_ROUND_SSE41_TRUNCATE
);
863 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
864 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(type
);
866 res
= LLVMBuildFPToSI(bld
->builder
, a
, int_vec_type
, "");
867 res
= LLVMBuildSIToFP(bld
->builder
, res
, vec_type
, "");
874 lp_build_round(struct lp_build_context
*bld
,
877 const struct lp_type type
= bld
->type
;
879 assert(type
.floating
);
880 assert(lp_check_value(type
, a
));
882 if (util_cpu_caps
.has_sse4_1
&& type
.width
*type
.length
== 128)
883 return lp_build_round_sse41(bld
, a
, LP_BUILD_ROUND_SSE41_NEAREST
);
885 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
887 res
= lp_build_iround(bld
, a
);
888 res
= LLVMBuildSIToFP(bld
->builder
, res
, vec_type
, "");
895 lp_build_floor(struct lp_build_context
*bld
,
898 const struct lp_type type
= bld
->type
;
900 assert(type
.floating
);
901 assert(lp_check_value(type
, a
));
903 if (util_cpu_caps
.has_sse4_1
&& type
.width
*type
.length
== 128)
904 return lp_build_round_sse41(bld
, a
, LP_BUILD_ROUND_SSE41_FLOOR
);
906 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
908 res
= lp_build_ifloor(bld
, a
);
909 res
= LLVMBuildSIToFP(bld
->builder
, res
, vec_type
, "");
916 lp_build_ceil(struct lp_build_context
*bld
,
919 const struct lp_type type
= bld
->type
;
921 assert(type
.floating
);
922 assert(lp_check_value(type
, a
));
924 if (util_cpu_caps
.has_sse4_1
&& type
.width
*type
.length
== 128)
925 return lp_build_round_sse41(bld
, a
, LP_BUILD_ROUND_SSE41_CEIL
);
927 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
929 res
= lp_build_iceil(bld
, a
);
930 res
= LLVMBuildSIToFP(bld
->builder
, res
, vec_type
, "");
937 * Return fractional part of 'a' computed as a - floor(f)
938 * Typically used in texture coord arithmetic.
941 lp_build_fract(struct lp_build_context
*bld
,
944 assert(bld
->type
.floating
);
945 return lp_build_sub(bld
, a
, lp_build_floor(bld
, a
));
950 * Convert to integer, through whichever rounding method that's fastest,
951 * typically truncating toward zero.
954 lp_build_itrunc(struct lp_build_context
*bld
,
957 const struct lp_type type
= bld
->type
;
958 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(type
);
960 assert(type
.floating
);
961 assert(lp_check_value(type
, a
));
963 return LLVMBuildFPToSI(bld
->builder
, a
, int_vec_type
, "");
968 * Convert float[] to int[] with round().
971 lp_build_iround(struct lp_build_context
*bld
,
974 const struct lp_type type
= bld
->type
;
975 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(type
);
978 assert(type
.floating
);
980 assert(lp_check_value(type
, a
));
982 if (util_cpu_caps
.has_sse4_1
&& type
.width
*type
.length
== 128) {
983 res
= lp_build_round_sse41(bld
, a
, LP_BUILD_ROUND_SSE41_NEAREST
);
986 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
987 LLVMValueRef mask
= lp_build_const_int_vec(type
, (unsigned long long)1 << (type
.width
- 1));
992 sign
= LLVMBuildBitCast(bld
->builder
, a
, int_vec_type
, "");
993 sign
= LLVMBuildAnd(bld
->builder
, sign
, mask
, "");
996 half
= lp_build_const_vec(type
, 0.5);
997 half
= LLVMBuildBitCast(bld
->builder
, half
, int_vec_type
, "");
998 half
= LLVMBuildOr(bld
->builder
, sign
, half
, "");
999 half
= LLVMBuildBitCast(bld
->builder
, half
, vec_type
, "");
1001 res
= LLVMBuildAdd(bld
->builder
, a
, half
, "");
1004 res
= LLVMBuildFPToSI(bld
->builder
, res
, int_vec_type
, "");
1011 * Convert float[] to int[] with floor().
1014 lp_build_ifloor(struct lp_build_context
*bld
,
1017 const struct lp_type type
= bld
->type
;
1018 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(type
);
1021 assert(type
.floating
);
1022 assert(lp_check_value(type
, a
));
1024 if (util_cpu_caps
.has_sse4_1
&& type
.width
*type
.length
== 128) {
1025 res
= lp_build_round_sse41(bld
, a
, LP_BUILD_ROUND_SSE41_FLOOR
);
1028 /* Take the sign bit and add it to 1 constant */
1029 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
1030 unsigned mantissa
= lp_mantissa(type
);
1031 LLVMValueRef mask
= lp_build_const_int_vec(type
, (unsigned long long)1 << (type
.width
- 1));
1033 LLVMValueRef offset
;
1035 /* sign = a < 0 ? ~0 : 0 */
1036 sign
= LLVMBuildBitCast(bld
->builder
, a
, int_vec_type
, "");
1037 sign
= LLVMBuildAnd(bld
->builder
, sign
, mask
, "");
1038 sign
= LLVMBuildAShr(bld
->builder
, sign
, lp_build_const_int_vec(type
, type
.width
- 1), "");
1039 lp_build_name(sign
, "floor.sign");
1041 /* offset = -0.99999(9)f */
1042 offset
= lp_build_const_vec(type
, -(double)(((unsigned long long)1 << mantissa
) - 1)/((unsigned long long)1 << mantissa
));
1043 offset
= LLVMConstBitCast(offset
, int_vec_type
);
1045 /* offset = a < 0 ? -0.99999(9)f : 0.0f */
1046 offset
= LLVMBuildAnd(bld
->builder
, offset
, sign
, "");
1047 offset
= LLVMBuildBitCast(bld
->builder
, offset
, vec_type
, "");
1048 lp_build_name(offset
, "floor.offset");
1050 res
= LLVMBuildAdd(bld
->builder
, a
, offset
, "");
1051 lp_build_name(res
, "floor.res");
1054 res
= LLVMBuildFPToSI(bld
->builder
, res
, int_vec_type
, "");
1055 lp_build_name(res
, "floor");
1062 lp_build_iceil(struct lp_build_context
*bld
,
1065 const struct lp_type type
= bld
->type
;
1066 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(type
);
1069 assert(type
.floating
);
1070 assert(lp_check_value(type
, a
));
1072 if (util_cpu_caps
.has_sse4_1
&& type
.width
*type
.length
== 128) {
1073 res
= lp_build_round_sse41(bld
, a
, LP_BUILD_ROUND_SSE41_CEIL
);
1076 /* TODO: mimic lp_build_ifloor() here */
1081 res
= LLVMBuildFPToSI(bld
->builder
, res
, int_vec_type
, "");
1088 lp_build_sqrt(struct lp_build_context
*bld
,
1091 const struct lp_type type
= bld
->type
;
1092 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
1095 /* TODO: optimize the constant case */
1096 /* TODO: optimize the constant case */
1098 assert(type
.floating
);
1099 util_snprintf(intrinsic
, sizeof intrinsic
, "llvm.sqrt.v%uf%u", type
.length
, type
.width
);
1101 return lp_build_intrinsic_unary(bld
->builder
, intrinsic
, vec_type
, a
);
1106 lp_build_rcp(struct lp_build_context
*bld
,
1109 const struct lp_type type
= bld
->type
;
1118 assert(type
.floating
);
1120 if(LLVMIsConstant(a
))
1121 return LLVMConstFDiv(bld
->one
, a
);
1123 if(util_cpu_caps
.has_sse
&& type
.width
== 32 && type
.length
== 4) {
1125 * XXX: Added precision is not always necessary, so only enable this
1126 * when we have a better system in place to track minimum precision.
1131 * Do one Newton-Raphson step to improve precision:
1133 * x1 = (2 - a * rcp(a)) * rcp(a)
1136 LLVMValueRef two
= lp_build_const_vec(bld
->type
, 2.0);
1140 rcp_a
= lp_build_intrinsic_unary(bld
->builder
, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type
), a
);
1142 res
= LLVMBuildMul(bld
->builder
, a
, rcp_a
, "");
1143 res
= LLVMBuildSub(bld
->builder
, two
, res
, "");
1144 res
= LLVMBuildMul(bld
->builder
, res
, rcp_a
, "");
1148 return lp_build_intrinsic_unary(bld
->builder
, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type
), a
);
1152 return LLVMBuildFDiv(bld
->builder
, bld
->one
, a
, "");
1157 * Generate 1/sqrt(a)
1160 lp_build_rsqrt(struct lp_build_context
*bld
,
1163 const struct lp_type type
= bld
->type
;
1165 assert(type
.floating
);
1167 if(util_cpu_caps
.has_sse
&& type
.width
== 32 && type
.length
== 4)
1168 return lp_build_intrinsic_unary(bld
->builder
, "llvm.x86.sse.rsqrt.ps", lp_build_vec_type(type
), a
);
1170 return lp_build_rcp(bld
, lp_build_sqrt(bld
, a
));
1174 #ifdef PIPE_OS_WINDOWS
1177 * XXX: X86 backend translates llvm.cos.v4f32 to 4 calls to CRT's cosf()
1178 * which is neither efficient nor does the CRT linkage work on Windows
1179 * causing segmentation fault.
1181 * XXX: With LLVM 2.7 both schemes cause an assertion failure.
1184 lp_build_sincos(struct lp_build_context
*bld
,
1186 float (*func
)(float),
1189 LLVMModuleRef module
=
1190 LLVMGetGlobalParent(LLVMGetBasicBlockParent(LLVMGetInsertBlock(bld
->builder
)));
1191 LLVMValueRef function
;
1195 assert(bld
->type
.floating
);
1196 assert(bld
->type
.width
== 32);
1198 function
= LLVMGetNamedFunction(module
, name
);
1200 LLVMTypeRef ret_type
;
1201 LLVMTypeRef arg_types
[1];
1202 LLVMTypeRef function_type
;
1204 ret_type
= LLVMFloatType();
1205 arg_types
[0] = LLVMFloatType();
1206 function_type
= LLVMFunctionType(ret_type
, arg_types
, Elements(arg_types
), 0);
1207 function
= LLVMAddFunction(module
, name
, function_type
);
1209 LLVMSetFunctionCallConv(function
, LLVMCCallConv
);
1210 LLVMSetLinkage(function
, LLVMPrivateLinkage
);
1212 assert(LLVMIsDeclaration(function
));
1214 LLVMAddGlobalMapping(lp_build_engine
, function
, func
);
1219 for (i
= 0; i
< bld
->type
.length
; ++i
) {
1220 LLVMValueRef index
= LLVMConstInt(LLVMInt32Type(), i
, 0);
1221 LLVMValueRef args
[1];
1224 args
[0] = LLVMBuildExtractElement(bld
->builder
, a
, index
, "");
1226 tmp
= LLVMBuildCall(bld
->builder
, function
, args
, Elements(args
), "");
1228 res
= LLVMBuildInsertElement(bld
->builder
, res
, tmp
, index
, "");
1234 static float c_cosf( float f
)
1236 return (float) cos( (double) f
);
1239 static float c_sinf( float f
)
1241 return (float) sin( (double) f
);
1245 lp_build_cos(struct lp_build_context
*bld
,
1248 return lp_build_sincos(bld
, "cosf", &c_cosf
, a
);
1252 lp_build_sin(struct lp_build_context
*bld
,
1255 return lp_build_sincos(bld
, "sinf", &c_sinf
, a
);
1258 #else /* !PIPE_OS_WINDOWS */
1264 lp_build_cos(struct lp_build_context
*bld
,
1267 const struct lp_type type
= bld
->type
;
1268 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
1271 /* TODO: optimize the constant case */
1273 assert(type
.floating
);
1274 util_snprintf(intrinsic
, sizeof intrinsic
, "llvm.cos.v%uf%u", type
.length
, type
.width
);
1276 return lp_build_intrinsic_unary(bld
->builder
, intrinsic
, vec_type
, a
);
1284 lp_build_sin(struct lp_build_context
*bld
,
1287 const struct lp_type type
= bld
->type
;
1288 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
1291 /* TODO: optimize the constant case */
1293 assert(type
.floating
);
1294 util_snprintf(intrinsic
, sizeof intrinsic
, "llvm.sin.v%uf%u", type
.length
, type
.width
);
1296 return lp_build_intrinsic_unary(bld
->builder
, intrinsic
, vec_type
, a
);
1299 #endif /* !PIPE_OS_WINDOWS */
1303 * Generate pow(x, y)
1306 lp_build_pow(struct lp_build_context
*bld
,
1310 /* TODO: optimize the constant case */
1311 if(LLVMIsConstant(x
) && LLVMIsConstant(y
))
1312 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1315 return lp_build_exp2(bld
, lp_build_mul(bld
, lp_build_log2(bld
, x
), y
));
1323 lp_build_exp(struct lp_build_context
*bld
,
1326 /* log2(e) = 1/log(2) */
1327 LLVMValueRef log2e
= lp_build_const_vec(bld
->type
, 1.4426950408889634);
1329 return lp_build_mul(bld
, log2e
, lp_build_exp2(bld
, x
));
1337 lp_build_log(struct lp_build_context
*bld
,
1341 LLVMValueRef log2
= lp_build_const_vec(bld
->type
, 0.69314718055994529);
1343 return lp_build_mul(bld
, log2
, lp_build_exp2(bld
, x
));
1347 #define EXP_POLY_DEGREE 3
1348 #define LOG_POLY_DEGREE 5
1352 * Generate polynomial.
1353 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
1356 lp_build_polynomial(struct lp_build_context
*bld
,
1358 const double *coeffs
,
1359 unsigned num_coeffs
)
1361 const struct lp_type type
= bld
->type
;
1362 LLVMValueRef res
= NULL
;
1365 /* TODO: optimize the constant case */
1366 if(LLVMIsConstant(x
))
1367 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1370 for (i
= num_coeffs
; i
--; ) {
1373 coeff
= lp_build_const_vec(type
, coeffs
[i
]);
1376 res
= lp_build_add(bld
, coeff
, lp_build_mul(bld
, x
, res
));
1389 * Minimax polynomial fit of 2**x, in range [0, 1[
/* Coefficients (degree selected by EXP_POLY_DEGREE) for the minimax
 * polynomial fit of 2**x on [0, 1[, lowest-order term first, as consumed
 * by lp_build_polynomial(). */
const double lp_build_exp2_polynomial[] = {
#if EXP_POLY_DEGREE == 5
   0.999999999690134838155,
   0.583974334321735217258,
   0.164553105719676828492,
   0.0292811063701710962255,
   0.00354944426657875141846,
   0.000296253726543423377365
#elif EXP_POLY_DEGREE == 4
   1.00000001502262084505,
   0.563586057338685991394,
   0.150436017652442413623,
   0.0243220604213317927308,
   0.0025359088446580436489
#elif EXP_POLY_DEGREE == 3
   0.999925218562710312959,
   0.695833540494823811697,
   0.226067155427249155588,
   0.0780245226406372992967
#elif EXP_POLY_DEGREE == 2
   1.00172476321474503578,
   0.657636275736077639316,
   0.33718943461968720704
#else
#error
#endif
};
1421 lp_build_exp2_approx(struct lp_build_context
*bld
,
1423 LLVMValueRef
*p_exp2_int_part
,
1424 LLVMValueRef
*p_frac_part
,
1425 LLVMValueRef
*p_exp2
)
1427 const struct lp_type type
= bld
->type
;
1428 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
1429 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(type
);
1430 LLVMValueRef ipart
= NULL
;
1431 LLVMValueRef fpart
= NULL
;
1432 LLVMValueRef expipart
= NULL
;
1433 LLVMValueRef expfpart
= NULL
;
1434 LLVMValueRef res
= NULL
;
1436 if(p_exp2_int_part
|| p_frac_part
|| p_exp2
) {
1437 /* TODO: optimize the constant case */
1438 if(LLVMIsConstant(x
))
1439 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1442 assert(type
.floating
&& type
.width
== 32);
1444 x
= lp_build_min(bld
, x
, lp_build_const_vec(type
, 129.0));
1445 x
= lp_build_max(bld
, x
, lp_build_const_vec(type
, -126.99999));
1447 /* ipart = floor(x) */
1448 ipart
= lp_build_floor(bld
, x
);
1450 /* fpart = x - ipart */
1451 fpart
= LLVMBuildSub(bld
->builder
, x
, ipart
, "");
1454 if(p_exp2_int_part
|| p_exp2
) {
1455 /* expipart = (float) (1 << ipart) */
1456 ipart
= LLVMBuildFPToSI(bld
->builder
, ipart
, int_vec_type
, "");
1457 expipart
= LLVMBuildAdd(bld
->builder
, ipart
, lp_build_const_int_vec(type
, 127), "");
1458 expipart
= LLVMBuildShl(bld
->builder
, expipart
, lp_build_const_int_vec(type
, 23), "");
1459 expipart
= LLVMBuildBitCast(bld
->builder
, expipart
, vec_type
, "");
1463 expfpart
= lp_build_polynomial(bld
, fpart
, lp_build_exp2_polynomial
,
1464 Elements(lp_build_exp2_polynomial
));
1466 res
= LLVMBuildMul(bld
->builder
, expipart
, expfpart
, "");
1470 *p_exp2_int_part
= expipart
;
1473 *p_frac_part
= fpart
;
1481 lp_build_exp2(struct lp_build_context
*bld
,
1485 lp_build_exp2_approx(bld
, x
, NULL
, NULL
, &res
);
1491 * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
1492 * These coefficients can be generate with
1493 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
/* Coefficients (degree selected by LOG_POLY_DEGREE) for the minimax
 * polynomial fit of log2(x)/(x - 1) on [1, 2[, lowest-order term first,
 * as consumed by lp_build_polynomial(). */
const double lp_build_log2_polynomial[] = {
#if LOG_POLY_DEGREE == 6
   3.11578814719469302614,
   -3.32419399085241980044,
   2.59883907202499966007,
   -1.23152682416275988241,
   0.318212422185251071475,
   -0.0344359067839062357313
#elif LOG_POLY_DEGREE == 5
   2.8882704548164776201,
   -2.52074962577807006663,
   1.48116647521213171641,
   -0.465725644288844778798,
   0.0596515482674574969533
#elif LOG_POLY_DEGREE == 4
   2.61761038894603480148,
   -1.75647175389045657003,
   0.688243882994381274313,
   -0.107254423828329604454
#elif LOG_POLY_DEGREE == 3
   2.28330284476918490682,
   -1.04913055217340124191,
   0.204446009836232697516
#else
#error
#endif
};
1525 * See http://www.devmaster.net/forums/showthread.php?p=43580
1528 lp_build_log2_approx(struct lp_build_context
*bld
,
1530 LLVMValueRef
*p_exp
,
1531 LLVMValueRef
*p_floor_log2
,
1532 LLVMValueRef
*p_log2
)
1534 const struct lp_type type
= bld
->type
;
1535 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
1536 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(type
);
1538 LLVMValueRef expmask
= lp_build_const_int_vec(type
, 0x7f800000);
1539 LLVMValueRef mantmask
= lp_build_const_int_vec(type
, 0x007fffff);
1540 LLVMValueRef one
= LLVMConstBitCast(bld
->one
, int_vec_type
);
1542 LLVMValueRef i
= NULL
;
1543 LLVMValueRef exp
= NULL
;
1544 LLVMValueRef mant
= NULL
;
1545 LLVMValueRef logexp
= NULL
;
1546 LLVMValueRef logmant
= NULL
;
1547 LLVMValueRef res
= NULL
;
1549 if(p_exp
|| p_floor_log2
|| p_log2
) {
1550 /* TODO: optimize the constant case */
1551 if(LLVMIsConstant(x
))
1552 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1555 assert(type
.floating
&& type
.width
== 32);
1557 i
= LLVMBuildBitCast(bld
->builder
, x
, int_vec_type
, "");
1559 /* exp = (float) exponent(x) */
1560 exp
= LLVMBuildAnd(bld
->builder
, i
, expmask
, "");
1563 if(p_floor_log2
|| p_log2
) {
1564 logexp
= LLVMBuildLShr(bld
->builder
, exp
, lp_build_const_int_vec(type
, 23), "");
1565 logexp
= LLVMBuildSub(bld
->builder
, logexp
, lp_build_const_int_vec(type
, 127), "");
1566 logexp
= LLVMBuildSIToFP(bld
->builder
, logexp
, vec_type
, "");
1570 /* mant = (float) mantissa(x) */
1571 mant
= LLVMBuildAnd(bld
->builder
, i
, mantmask
, "");
1572 mant
= LLVMBuildOr(bld
->builder
, mant
, one
, "");
1573 mant
= LLVMBuildBitCast(bld
->builder
, mant
, vec_type
, "");
1575 logmant
= lp_build_polynomial(bld
, mant
, lp_build_log2_polynomial
,
1576 Elements(lp_build_log2_polynomial
));
1578 /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
1579 logmant
= LLVMBuildMul(bld
->builder
, logmant
, LLVMBuildSub(bld
->builder
, mant
, bld
->one
, ""), "");
1581 res
= LLVMBuildAdd(bld
->builder
, logmant
, logexp
, "");
1585 exp
= LLVMBuildBitCast(bld
->builder
, exp
, vec_type
, "");
1590 *p_floor_log2
= logexp
;
1598 lp_build_log2(struct lp_build_context
*bld
,
1602 lp_build_log2_approx(bld
, x
, NULL
, NULL
, &res
);