1 /**************************************************************************
3 * Copyright 2009 VMware, Inc.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **************************************************************************/
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
38 * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
44 * @author Jose Fonseca <jfonseca@vmware.com>
48 #include "util/u_debug.h"
49 #include "util/u_string.h"
51 #include "lp_bld_type.h"
52 #include "lp_bld_const.h"
53 #include "lp_bld_intr.h"
54 #include "lp_bld_arit.h"
/**
 * Generate min(a, b) for the vector type described by bld->type.
 *
 * Visible behaviour: on x86/x86-64 with 128-bit vectors an SSE min intrinsic
 * is picked by element width and signedness and emitted through
 * lp_build_intrinsic_binary(); the generic path compares a < b (FCmp ULT, or
 * ICmp SLT/ULT according to type.sign) and selects a when the comparison is
 * true, b otherwise.
 *
 * @param bld  build context; supplies the type and the LLVM builder
 * @return the value produced by the intrinsic call or by the select
 *
 * NOTE(review): this listing omits lines of the definition (the remaining
 * parameters, the declaration of `cond`, and the guards that choose between
 * the float/integer and the intrinsic/generic paths) -- confirm against the
 * complete file.
 * NOTE(review): the sse41 intrinsics are chosen from the type alone; no CPU
 * feature check is visible here -- confirm SSE4.1 availability is guaranteed.
 */
58 lp_build_min_simple(struct lp_build_context
*bld
,
62 const union lp_type type
= bld
->type
;
/* Stays NULL unless one of the width/sign tests below picks an intrinsic. */
63 const char *intrinsic
= NULL
;
66 /* TODO: optimize the constant case */
/* Only full 128-bit vectors are mapped onto SSE min instructions. */
68 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
69 if(type
.width
* type
.length
== 128) {
/* Float variants; presumably guarded by width 32 / 64 -- guards not visible. */
72 intrinsic
= "llvm.x86.sse.min.ps";
74 intrinsic
= "llvm.x86.sse2.min.pd";
/* Integer variants, keyed on element width and signedness. */
77 if(type
.width
== 8 && !type
.sign
)
78 intrinsic
= "llvm.x86.sse2.pminu.b";
79 if(type
.width
== 8 && type
.sign
)
80 intrinsic
= "llvm.x86.sse41.pminsb";
81 if(type
.width
== 16 && !type
.sign
)
82 intrinsic
= "llvm.x86.sse41.pminuw";
83 if(type
.width
== 16 && type
.sign
)
84 intrinsic
= "llvm.x86.sse2.pmins.w";
85 if(type
.width
== 32 && !type
.sign
)
86 intrinsic
= "llvm.x86.sse41.pminud";
87 if(type
.width
== 32 && type
.sign
)
88 intrinsic
= "llvm.x86.sse41.pminsd";
/* Intrinsic path. */
94 return lp_build_intrinsic_binary(bld
->builder
, intrinsic
, lp_build_vec_type(bld
->type
), a
, b
);
/* Generic path, float flavour: ULT is an unordered predicate. */
97 cond
= LLVMBuildFCmp(bld
->builder
, LLVMRealULT
, a
, b
, "");
/* Generic path, integer flavour: signed or unsigned less-than. */
99 cond
= LLVMBuildICmp(bld
->builder
, type
.sign
? LLVMIntSLT
: LLVMIntULT
, a
, b
, "");
/* a < b ? a : b */
100 return LLVMBuildSelect(bld
->builder
, cond
, a
, b
, "");
/**
 * Generate max(a, b) for the vector type described by bld->type.
 *
 * Visible behaviour mirrors lp_build_min_simple(): on x86/x86-64 with
 * 128-bit vectors an SSE max intrinsic is picked by element width and
 * signedness; the generic path uses the same a < b comparison as the min
 * variant but selects b when it is true and a otherwise.
 *
 * @param bld  build context; supplies the type and the LLVM builder
 * @return the value produced by the intrinsic call or by the select
 *
 * NOTE(review): this listing omits lines of the definition (the remaining
 * parameters, the declaration of `cond`, and the guards that choose between
 * the float/integer and the intrinsic/generic paths) -- confirm against the
 * complete file.
 * NOTE(review): the sse41 intrinsics are chosen from the type alone; no CPU
 * feature check is visible here -- confirm SSE4.1 availability is guaranteed.
 */
105 lp_build_max_simple(struct lp_build_context
*bld
,
109 const union lp_type type
= bld
->type
;
/* Stays NULL unless one of the width/sign tests below picks an intrinsic. */
110 const char *intrinsic
= NULL
;
113 /* TODO: optimize the constant case */
/* Only full 128-bit vectors are mapped onto SSE max instructions. */
115 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
116 if(type
.width
* type
.length
== 128) {
/* Float variants; presumably guarded by width 32 / 64 -- guards not visible. */
119 intrinsic
= "llvm.x86.sse.max.ps";
121 intrinsic
= "llvm.x86.sse2.max.pd";
/* Integer variants, keyed on element width and signedness. */
124 if(type
.width
== 8 && !type
.sign
)
125 intrinsic
= "llvm.x86.sse2.pmaxu.b";
126 if(type
.width
== 8 && type
.sign
)
127 intrinsic
= "llvm.x86.sse41.pmaxsb";
128 if(type
.width
== 16 && !type
.sign
)
129 intrinsic
= "llvm.x86.sse41.pmaxuw";
130 if(type
.width
== 16 && type
.sign
)
131 intrinsic
= "llvm.x86.sse2.pmaxs.w";
132 if(type
.width
== 32 && !type
.sign
)
133 intrinsic
= "llvm.x86.sse41.pmaxud";
134 if(type
.width
== 32 && type
.sign
)
135 intrinsic
= "llvm.x86.sse41.pmaxsd";
/* Intrinsic path. */
141 return lp_build_intrinsic_binary(bld
->builder
, intrinsic
, lp_build_vec_type(bld
->type
), a
, b
);
/* Generic path, float flavour: same a < b test as the min variant. */
144 cond
= LLVMBuildFCmp(bld
->builder
, LLVMRealULT
, a
, b
, "");
/* Generic path, integer flavour: signed or unsigned less-than. */
146 cond
= LLVMBuildICmp(bld
->builder
, type
.sign
? LLVMIntSLT
: LLVMIntULT
, a
, b
, "");
/* a < b ? b : a -- operands swapped relative to min. */
147 return LLVMBuildSelect(bld
->builder
, cond
, b
, a
, "");
/**
 * Generate the complement of a.
 *
 * Visible behaviour: for unsigned normalized integer types (norm set;
 * floating, fixed and sign all clear) the result is the bitwise NOT of a;
 * the other visible path computes bld->one - a. Both paths fold at build
 * time (LLVMConstNot / LLVMConstSub) when a is an LLVM constant.
 *
 * @param bld  build context; supplies the type, the builder and `one`
 *
 * NOTE(review): presumably bld->one is the all-ones pattern for unsigned
 * normalized integers, which makes ~a equal to one - a -- TODO confirm.
 * NOTE(review): lines are missing from this listing (second parameter,
 * braces and any earlier special cases) -- confirm against the full file.
 */
152 lp_build_comp(struct lp_build_context
*bld
,
155 const union lp_type type
= bld
->type
;
/* Unsigned normalized integers: complement is a plain bitwise NOT. */
162 if(type
.norm
&& !type
.floating
&& !type
.fixed
&& !type
.sign
) {
163 if(LLVMIsConstant(a
))
164 return LLVMConstNot(a
);
166 return LLVMBuildNot(bld
->builder
, a
, "");
/* Remaining visible path: one - a, folded when a is constant. */
169 if(LLVMIsConstant(a
))
170 return LLVMConstSub(bld
->one
, a
);
172 return LLVMBuildSub(bld
->builder
, bld
->one
, a
, "");
/**
 * Generate a + b, with simplifications and saturation for normalized types.
 *
 * Visible behaviour:
 *  - operands equal to bld->undef or bld->one are tested for before any
 *    instruction is built (the statements those tests guard are not visible);
 *  - on x86/x86-64, 128-bit non-float, non-fixed vectors use the SSE2
 *    saturating add intrinsics (padds / paddus, byte and word forms);
 *  - otherwise the sum is constant-folded when both operands are constants,
 *    or built with LLVMBuildAdd();
 *  - for normalized float or fixed types the sum is clamped from above with
 *    lp_build_min_simple(bld, res, bld->one).
 *
 * @param bld  build context; supplies the type, builder and special values
 *
 * NOTE(review): lines are missing from this listing (remaining parameters,
 * the results of the early tests, the width guards for the .b/.w intrinsics
 * and the final return) -- confirm against the full file.
 */
177 lp_build_add(struct lp_build_context
*bld
,
181 const union lp_type type
= bld
->type
;
/* Undefined operand test; guarded statement not visible. */
188 if(a
== bld
->undef
|| b
== bld
->undef
)
192 const char *intrinsic
= NULL
;
/* Operand already equal to one; guarded statement not visible. */
194 if(a
== bld
->one
|| b
== bld
->one
)
/* Saturating SSE2 adds for 128-bit integer vectors. */
197 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
198 if(type
.width
* type
.length
== 128 &&
199 !type
.floating
&& !type
.fixed
) {
/* Byte form; presumably guarded by width == 8 -- guard not visible. */
201 intrinsic
= type
.sign
? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
/* Word form; presumably guarded by width == 16 -- guard not visible. */
203 intrinsic
= type
.sign
? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
208 return lp_build_intrinsic_binary(bld
->builder
, intrinsic
, lp_build_vec_type(bld
->type
), a
, b
);
/* Plain addition: fold when both operands are constants. */
211 if(LLVMIsConstant(a
) && LLVMIsConstant(b
))
212 res
= LLVMConstAdd(a
, b
);
214 res
= LLVMBuildAdd(bld
->builder
, a
, b
, "");
/* Normalized float/fixed results are clamped so they never exceed one. */
216 if(bld
->type
.norm
&& (bld
->type
.floating
|| bld
->type
.fixed
))
217 res
= lp_build_min_simple(bld
, res
, bld
->one
);
/**
 * Generate a - b, with simplifications and saturation for normalized types.
 *
 * Visible behaviour:
 *  - operands equal to bld->undef are tested for before any instruction is
 *    built (the statement that test guards is not visible);
 *  - on x86/x86-64, 128-bit non-float, non-fixed vectors use the SSE2
 *    saturating subtract intrinsics (psubs / psubus, byte and word forms);
 *  - otherwise the difference is constant-folded when both operands are
 *    constants, or built with LLVMBuildSub();
 *  - for normalized float or fixed types the difference is clamped from
 *    below with lp_build_max_simple(bld, res, bld->zero).
 *
 * @param bld  build context; supplies the type, builder and special values
 *
 * NOTE(review): lines are missing from this listing (remaining parameters,
 * the results of the early tests, the width guards for the .b/.w intrinsics
 * and the final return) -- confirm against the full file.
 */
224 lp_build_sub(struct lp_build_context
*bld
,
228 const union lp_type type
= bld
->type
;
/* Undefined operand test; guarded statement not visible. */
233 if(a
== bld
->undef
|| b
== bld
->undef
)
239 const char *intrinsic
= NULL
;
/* Saturating SSE2 subtracts for 128-bit integer vectors. */
244 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
245 if(type
.width
* type
.length
== 128 &&
246 !type
.floating
&& !type
.fixed
) {
/* Byte form; presumably guarded by width == 8 -- guard not visible. */
248 intrinsic
= type
.sign
? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
/* Word form; presumably guarded by width == 16 -- guard not visible. */
250 intrinsic
= type
.sign
? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
255 return lp_build_intrinsic_binary(bld
->builder
, intrinsic
, lp_build_vec_type(bld
->type
), a
, b
);
/* Plain subtraction: fold when both operands are constants. */
258 if(LLVMIsConstant(a
) && LLVMIsConstant(b
))
259 res
= LLVMConstSub(a
, b
);
261 res
= LLVMBuildSub(bld
->builder
, a
, b
, "");
/* Normalized float/fixed results are clamped so they never drop below zero. */
263 if(bld
->type
.norm
&& (bld
->type
.floating
|| bld
->type
.fixed
))
264 res
= lp_build_max_simple(bld
, res
, bld
->zero
);
271 * Build shuffle vectors that match PUNPCKLxx and PUNPCKHxx instructions.
/**
 * Build a constant shuffle mask of n 32-bit indices that interleaves two
 * n-element vectors.
 *
 * Element pairs are (j, n + j) for consecutive j, i.e. element j of the
 * first shuffle operand followed by element j of the second. j starts at
 * lo_hi * n / 2, so lo_hi == 0 interleaves the low halves and lo_hi == 1
 * the high halves.
 *
 * @param n      number of mask elements; must not exceed LP_MAX_VECTOR_LENGTH
 * @param lo_hi  0 for the low half, 1 for the high half
 * @return constant vector suitable as the mask of a shufflevector
 *
 * NOTE(review): the declarations of i and j and the braces are not visible
 * in this listing.
 */
274 lp_build_unpack_shuffle(unsigned n
, unsigned lo_hi
)
276 LLVMValueRef elems
[LP_MAX_VECTOR_LENGTH
];
/* elems[] is a fixed-size stack array, so n must fit. */
279 assert(n
<= LP_MAX_VECTOR_LENGTH
);
/* Two mask entries are written per iteration: index j, then index n + j. */
282 for(i
= 0, j
= lo_hi
*n
/2; i
< n
; i
+= 2, ++j
) {
283 elems
[i
+ 0] = LLVMConstInt(LLVMInt32Type(), 0 + j
, 0);
284 elems
[i
+ 1] = LLVMConstInt(LLVMInt32Type(), n
+ j
, 0);
287 return LLVMConstVector(elems
, n
);
/**
 * Build a constant vector whose n elements all hold the integer c.
 *
 * @param type  LLVM integer type of each element
 * @param n     number of elements; must not exceed LP_MAX_VECTOR_LENGTH
 * @param c     value replicated into every element (passed to
 *              LLVMConstInt() with its last argument set to 0)
 * @return the constant vector
 *
 * NOTE(review): the declaration of i and the braces are not visible in this
 * listing.
 */
292 lp_build_const_vec(LLVMTypeRef type
, unsigned n
, long long c
)
294 LLVMValueRef elems
[LP_MAX_VECTOR_LENGTH
];
/* elems[] is a fixed-size stack array, so n must fit. */
297 assert(n
<= LP_MAX_VECTOR_LENGTH
);
299 for(i
= 0; i
< n
; ++i
)
300 elems
[i
] = LLVMConstInt(type
, c
, 0);
302 return LLVMConstVector(elems
, n
);
307 * Normalized 8bit multiplication.
311 * makes the following approximation to the division (Sree)
313 * a*b/255 ~= (a*(b + 1)) >> 8
315 * which is the fastest method that satisfies the following OpenGL criteria
317 * 0*0 = 0 and 255*255 = 255
321 * takes the geometric series approximation to the division
323 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
325 * in this case just the first two terms to fit in 16bit arithmetic
327 * t/255 ~= (t + (t >> 8)) >> 8
329 * note that just by itself it doesn't satisfy the OpenGL criteria, as
330 * 255*255 = 254, so the special case b = 255 must be accounted or roundoff
333 * - geometric series plus rounding
335 * when using a geometric series division instead of truncating the result
336 * use roundoff in the approximation (Jim Blinn)
338 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
340 * achieving the exact results
342 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
343 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
344 * @sa Michael Herf, The "double blend trick", May 2000,
345 * http://www.stereopsis.com/doubleblend.html
/**
 * Multiply two vectors of 8-bit normalized values that have already been
 * widened to eight 16-bit lanes, approximating a*b/255.
 *
 * Two formulas are visible; each ends in the common logical shift right by
 * eight:
 *  - (a * (b + 1)) >> 8
 *  - (t + (t >> 8) + 0x80) >> 8, with t = a * b
 *
 * The i16x8 splat constants 0x01, 0x08 and 0x80 are created on first use
 * and kept in function-level statics.
 *
 * @param builder  LLVM instruction builder
 * @param a, b     operands; the constants they are combined with are 8 x i16
 *
 * NOTE(review): the preprocessor lines selecting between the two formulas,
 * the declaration of `ab` and the final return are not visible in this
 * listing -- confirm against the full file.
 * NOTE(review): the cached constants are plain statics with no visible
 * synchronization -- confirm this is only reached from one thread.
 */
348 lp_build_mul_u8n(LLVMBuilderRef builder
,
349 LLVMValueRef a
, LLVMValueRef b
)
/* Lazily created splat constants, shared by every call. */
351 static LLVMValueRef c01
= NULL
;
352 static LLVMValueRef c08
= NULL
;
353 static LLVMValueRef c80
= NULL
;
356 if(!c01
) c01
= lp_build_const_vec(LLVMInt16Type(), 8, 0x01);
357 if(!c08
) c08
= lp_build_const_vec(LLVMInt16Type(), 8, 0x08);
358 if(!c80
) c80
= lp_build_const_vec(LLVMInt16Type(), 8, 0x80);
362 /* a*b/255 ~= (a*(b + 1)) >> 8 */
363 b
= LLVMBuildAdd(builder
, b
, c01
, "");
364 ab
= LLVMBuildMul(builder
, a
, b
, "");
368 /* t/255 ~= (t + (t >> 8) + 0x80) >> 8 */
369 ab
= LLVMBuildMul(builder
, a
, b
, "");
370 ab
= LLVMBuildAdd(builder
, ab
, LLVMBuildLShr(builder
, ab
, c08
, ""), "");
371 ab
= LLVMBuildAdd(builder
, ab
, c80
, "");
/* Final >> 8 shared by both formulas. */
375 ab
= LLVMBuildLShr(builder
, ab
, c08
, "");
/**
 * Generate a * b.
 *
 * Visible behaviour:
 *  - operands equal to bld->undef are tested for first (the guarded
 *    statement is not visible);
 *  - for normalized integer types (norm set, floating and fixed clear) on
 *    x86/x86-64 with 16 x 8-bit vectors, each operand is interleaved with
 *    bld->zero into a low and a high half, the halves are reinterpreted as
 *    8 x i16, multiplied with lp_build_mul_u8n(), packed back with the SSE2
 *    packuswb intrinsic and bitcast to 16 x i8;
 *  - otherwise the product is constant-folded when both operands are
 *    constants, or built with LLVMBuildMul().
 *
 * @param bld  build context; supplies the type, builder and special values
 *
 * NOTE(review): lines are missing from this listing (remaining parameters,
 * earlier special cases, the declaration of `ab`, the return of the packed
 * result and whatever handles other normalized integer layouts) -- confirm
 * against the full file.
 * NOTE(review): the packuswb call is given i16x8 as its type argument and
 * the result is then bitcast to i8x16 -- confirm this matches how
 * lp_build_intrinsic_binary() declares the intrinsic.
 */
382 lp_build_mul(struct lp_build_context
*bld
,
386 const union lp_type type
= bld
->type
;
/* Undefined operand test; guarded statement not visible. */
396 if(a
== bld
->undef
|| b
== bld
->undef
)
/* Normalized integers need a rescaled product, not a plain multiply. */
399 if(!type
.floating
&& !type
.fixed
&& type
.norm
) {
400 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
401 if(type
.width
== 8 && type
.length
== 16) {
402 LLVMTypeRef i16x8
= LLVMVectorType(LLVMInt16Type(), 8);
403 LLVMTypeRef i8x16
= LLVMVectorType(LLVMInt8Type(), 16);
/* Shuffle masks for the low and high halves, built once and cached. */
404 static LLVMValueRef ml
= NULL
;
405 static LLVMValueRef mh
= NULL
;
406 LLVMValueRef al
, ah
, bl
, bh
;
407 LLVMValueRef abl
, abh
;
410 if(!ml
) ml
= lp_build_unpack_shuffle(16, 0);
411 if(!mh
) mh
= lp_build_unpack_shuffle(16, 1);
413 /* PUNPCKLBW, PUNPCKHBW */
414 al
= LLVMBuildShuffleVector(bld
->builder
, a
, bld
->zero
, ml
, "");
415 bl
= LLVMBuildShuffleVector(bld
->builder
, b
, bld
->zero
, ml
, "");
416 ah
= LLVMBuildShuffleVector(bld
->builder
, a
, bld
->zero
, mh
, "");
417 bh
= LLVMBuildShuffleVector(bld
->builder
, b
, bld
->zero
, mh
, "");
/* Reinterpret each byte/zero pair as one 16-bit lane. */
420 al
= LLVMBuildBitCast(bld
->builder
, al
, i16x8
, "");
421 bl
= LLVMBuildBitCast(bld
->builder
, bl
, i16x8
, "");
422 ah
= LLVMBuildBitCast(bld
->builder
, ah
, i16x8
, "");
423 bh
= LLVMBuildBitCast(bld
->builder
, bh
, i16x8
, "");
425 /* PMULLW, PSRLW, PADDW */
426 abl
= lp_build_mul_u8n(bld
->builder
, al
, bl
);
427 abh
= lp_build_mul_u8n(bld
->builder
, ah
, bh
);
/* Pack the two 16-bit halves back into bytes. */
430 ab
= lp_build_intrinsic_binary(bld
->builder
, "llvm.x86.sse2.packuswb.128" , i16x8
, abl
, abh
);
433 ab
= LLVMBuildBitCast(bld
->builder
, ab
, i8x16
, "");
/* Plain multiply: fold when both operands are constants. */
443 if(LLVMIsConstant(a
) && LLVMIsConstant(b
))
444 return LLVMConstMul(a
, b
);
446 return LLVMBuildMul(bld
->builder
, a
, b
, "");
/**
 * Generate a / b.
 *
 * Visible behaviour:
 *  - one early path returns lp_build_rcp(bld, b) (its guard is not visible;
 *    presumably a == bld->one -- TODO confirm);
 *  - operands equal to bld->undef are tested for (guarded statement not
 *    visible);
 *  - two constants are folded with LLVMConstFDiv();
 *  - on x86/x86-64 with 4 x 32-bit vectors the quotient is built as
 *    a * rcp(b);
 *  - otherwise LLVMBuildFDiv() is used.
 *
 * @param bld  build context; supplies the type, builder and special values
 *
 * NOTE(review): lp_build_rcp() maps 4 x 32-bit vectors onto the SSE rcp.ps
 * intrinsic, so the x86 path is presumably lower precision than a true
 * division -- confirm that is acceptable to callers.
 * NOTE(review): lines are missing from this listing (remaining parameters
 * and several early special cases) -- confirm against the full file.
 */
451 lp_build_div(struct lp_build_context
*bld
,
455 const union lp_type type
= bld
->type
;
/* Early path: reciprocal of b; guard not visible. */
460 return lp_build_rcp(bld
, b
);
/* Undefined operand test; guarded statement not visible. */
465 if(a
== bld
->undef
|| b
== bld
->undef
)
468 if(LLVMIsConstant(a
) && LLVMIsConstant(b
))
469 return LLVMConstFDiv(a
, b
);
/* 4 x float on x86: multiply by the reciprocal instead of dividing. */
471 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
472 if(type
.width
== 32 && type
.length
== 4)
473 return lp_build_mul(bld
, a
, lp_build_rcp(bld
, b
));
476 return LLVMBuildFDiv(bld
->builder
, a
, b
, "");
/**
 * Generate min(a, b), checking for special operand values first.
 *
 * Visible behaviour: operands equal to bld->undef and operands equal to
 * bld->zero are tested for; when no shortcut applies the work is delegated
 * to lp_build_min_simple().
 *
 * @param bld  build context; supplies the special values and the type
 *
 * NOTE(review): the statements guarded by the visible tests, the remaining
 * parameters and any further shortcuts are missing from this listing --
 * confirm against the full file.
 */
481 lp_build_min(struct lp_build_context
*bld
,
/* Undefined operand test; guarded statement not visible. */
485 if(a
== bld
->undef
|| b
== bld
->undef
)
/* Zero operand test; guarded statement not visible. */
492 if(a
== bld
->zero
|| b
== bld
->zero
)
/* No shortcut applied: build the real min. */
500 return lp_build_min_simple(bld
, a
, b
);
/**
 * Generate max(a, b), checking for special operand values first.
 *
 * Visible behaviour: operands equal to bld->undef and operands equal to
 * bld->one are tested for; when no shortcut applies the work is delegated
 * to lp_build_max_simple().
 *
 * @param bld  build context; supplies the special values and the type
 *
 * NOTE(review): the statements guarded by the visible tests, the remaining
 * parameters and any further shortcuts are missing from this listing --
 * confirm against the full file.
 */
505 lp_build_max(struct lp_build_context
*bld
,
/* Undefined operand test; guarded statement not visible. */
509 if(a
== bld
->undef
|| b
== bld
->undef
)
/* One operand test; guarded statement not visible. */
516 if(a
== bld
->one
|| b
== bld
->one
)
/* No shortcut applied: build the real max. */
524 return lp_build_max_simple(bld
, a
, b
);
/**
 * Generate abs(a).
 *
 * Visible behaviour: on x86/x86-64, non-floating 128-bit vectors use the
 * SSSE3 pabs intrinsics (byte, word and dword forms); the fallback is
 * max(a, -a) through lp_build_max().
 *
 * @param bld  build context; supplies the type and the builder
 *
 * NOTE(review): the width guards in front of each pabs return, any earlier
 * shortcut and the second parameter are missing from this listing --
 * confirm against the full file.
 * NOTE(review): the pabs intrinsics are chosen from the type alone; no CPU
 * feature check is visible here -- confirm SSSE3 availability is guaranteed.
 */
529 lp_build_abs(struct lp_build_context
*bld
,
532 const union lp_type type
= bld
->type
;
537 /* XXX: is this really necessary? */
538 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
539 if(!type
.floating
&& type
.width
*type
.length
== 128) {
540 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
/* One return per element width; the width tests are not visible. */
542 return lp_build_intrinsic_unary(bld
->builder
, "llvm.x86.ssse3.pabs.b.128", vec_type
, a
);
544 return lp_build_intrinsic_unary(bld
->builder
, "llvm.x86.ssse3.pabs.w.128", vec_type
, a
);
546 return lp_build_intrinsic_unary(bld
->builder
, "llvm.x86.ssse3.pabs.d.128", vec_type
, a
);
/* Generic fallback: abs(a) == max(a, -a). */
550 return lp_build_max(bld
, a
, LLVMBuildNeg(bld
->builder
, a
, ""));
/**
 * Generate sqrt(a) through the generic LLVM sqrt intrinsic.
 *
 * The intrinsic name is formatted as "llvm.sqrt.v<length>f<width>" from the
 * build type, so it follows whatever vector shape the context describes.
 * The type must be floating point (asserted).
 *
 * @param bld  build context; supplies the type and the builder
 * @return the result of the unary intrinsic call
 *
 * NOTE(review): the second parameter and the declaration of the `intrinsic`
 * buffer are not visible in this listing.
 */
555 lp_build_sqrt(struct lp_build_context
*bld
,
558 const union lp_type type
= bld
->type
;
559 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
562 /* TODO: optimize the constant case */
563 /* TODO: optimize the constant case */
565 assert(type
.floating
);
/* Name encodes vector length and element width. */
566 util_snprintf(intrinsic
, sizeof intrinsic
, "llvm.sqrt.v%uf%u", type
.length
, type
.width
);
568 return lp_build_intrinsic_unary(bld
->builder
, intrinsic
, vec_type
, a
);
/**
 * Generate the reciprocal 1 / a.
 *
 * Visible behaviour: the type must be floating point (asserted); a constant
 * operand is folded with LLVMConstFDiv(bld->one, a); on x86/x86-64 with
 * 4 x 32-bit vectors the SSE rcp.ps intrinsic is used; otherwise the result
 * is LLVMBuildFDiv(bld->one, a).
 *
 * @param bld  build context; supplies the type, builder and `one`
 *
 * NOTE(review): rcp.ps is presumably an approximate reciprocal, so results
 * on the x86 path may differ from the FDiv path -- confirm.
 * NOTE(review): lines are missing from this listing (second parameter and
 * any earlier special cases) -- confirm against the full file.
 */
573 lp_build_rcp(struct lp_build_context
*bld
,
576 const union lp_type type
= bld
->type
;
585 assert(type
.floating
);
/* Constant operand: fold at build time. */
587 if(LLVMIsConstant(a
))
588 return LLVMConstFDiv(bld
->one
, a
);
590 /* XXX: is this really necessary? */
591 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
592 if(type
.width
== 32 && type
.length
== 4)
593 return lp_build_intrinsic_unary(bld
->builder
, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type
), a
);
/* Generic path: a real division. */
596 return LLVMBuildFDiv(bld
->builder
, bld
->one
, a
, "");
/**
 * Generate the reciprocal square root 1 / sqrt(a).
 *
 * Visible behaviour: the type must be floating point (asserted); on
 * x86/x86-64 with 4 x 32-bit vectors the SSE rsqrt.ps intrinsic is used;
 * otherwise the result is lp_build_rcp(lp_build_sqrt(a)).
 *
 * @param bld  build context; supplies the type and the builder
 *
 * NOTE(review): rsqrt.ps is presumably an approximation, so results on the
 * x86 path may differ from the generic path -- confirm.
 * NOTE(review): the second parameter is not visible in this listing.
 */
601 lp_build_rsqrt(struct lp_build_context
*bld
,
604 const union lp_type type
= bld
->type
;
606 assert(type
.floating
);
608 /* XXX: is this really necessary? */
609 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
610 if(type
.width
== 32 && type
.length
== 4)
611 return lp_build_intrinsic_unary(bld
->builder
, "llvm.x86.sse.rsqrt.ps", lp_build_vec_type(type
), a
);
/* Generic path: compose the two primitive operations. */
614 return lp_build_rcp(bld
, lp_build_sqrt(bld
, a
));
/**
 * Generate cos(a) through the generic LLVM cos intrinsic.
 *
 * The intrinsic name is formatted as "llvm.cos.v<length>f<width>" from the
 * build type. The type must be floating point (asserted).
 *
 * @param bld  build context; supplies the type and the builder
 * @return the result of the unary intrinsic call
 *
 * NOTE(review): the second parameter and the declaration of the `intrinsic`
 * buffer are not visible in this listing.
 */
619 lp_build_cos(struct lp_build_context
*bld
,
622 const union lp_type type
= bld
->type
;
623 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
626 /* TODO: optimize the constant case */
628 assert(type
.floating
);
/* Name encodes vector length and element width. */
629 util_snprintf(intrinsic
, sizeof intrinsic
, "llvm.cos.v%uf%u", type
.length
, type
.width
);
631 return lp_build_intrinsic_unary(bld
->builder
, intrinsic
, vec_type
, a
);
/**
 * Generate sin(a) through the generic LLVM sin intrinsic.
 *
 * The intrinsic name is formatted as "llvm.sin.v<length>f<width>" from the
 * build type. The type must be floating point (asserted).
 *
 * @param bld  build context; supplies the type and the builder
 * @return the result of the unary intrinsic call
 *
 * NOTE(review): the second parameter and the declaration of the `intrinsic`
 * buffer are not visible in this listing.
 */
636 lp_build_sin(struct lp_build_context
*bld
,
639 const union lp_type type
= bld
->type
;
640 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
643 /* TODO: optimize the constant case */
645 assert(type
.floating
);
/* Name encodes vector length and element width. */
646 util_snprintf(intrinsic
, sizeof intrinsic
, "llvm.sin.v%uf%u", type
.length
, type
.width
);
648 return lp_build_intrinsic_unary(bld
->builder
, intrinsic
, vec_type
, a
);
/**
 * Generate pow(a, b) through the generic LLVM pow intrinsic.
 *
 * The intrinsic name is formatted as "llvm.pow.v<length>f<width>" from the
 * build type, and a and b are passed in that order. The type must be
 * floating point (asserted).
 *
 * @param bld  build context; supplies the type and the builder
 * @return the result of the binary intrinsic call
 *
 * NOTE(review): the remaining parameters and the declaration of the
 * `intrinsic` buffer are not visible in this listing.
 */
653 lp_build_pow(struct lp_build_context
*bld
,
657 const union lp_type type
= bld
->type
;
658 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
661 /* TODO: optimize the constant case */
663 assert(type
.floating
);
/* Name encodes vector length and element width. */
664 util_snprintf(intrinsic
, sizeof intrinsic
, "llvm.pow.v%uf%u", type
.length
, type
.width
);
666 return lp_build_intrinsic_binary(bld
->builder
, intrinsic
, vec_type
, a
, b
);
/**
 * Generate e raised to the power a.
 *
 * Implemented as lp_build_pow() with a uniform constant base of
 * 2.7182818284590452354 (Euler's number) and a as the exponent.
 *
 * @param bld  build context; supplies the type for the constant
 *
 * NOTE(review): the second parameter is not visible in this listing.
 */
671 lp_build_exp(struct lp_build_context
*bld
,
674 /* FIXME: optimize */
675 return lp_build_pow(bld
, lp_build_const_uni(bld
->type
, 2.7182818284590452354), a
);
/**
 * Logarithm entry point -- presumably the natural logarithm, by analogy
 * with lp_build_exp(); TODO confirm.
 *
 * NOTE(review): only the FIXME marker below is visible in this listing; the
 * rest of the signature and the body are missing, so what is returned
 * cannot be determined from here.
 */
680 lp_build_log(struct lp_build_context
*bld
,
683 /* FIXME: implement */
688 #define EXP_POLY_DEGREE 3
689 #define LOG_POLY_DEGREE 5
/**
 * Generate the evaluation of a polynomial in x using Horner's scheme.
 *
 * Coefficients are visited from the last index down to index 0, and each
 * step computes coeffs[i] + x * res, so coeffs[0] is the constant term and
 * coeffs[num_coeffs - 1] multiplies the highest power of x.
 *
 * @param bld     build context; supplies the type for the constants
 * @param coeffs  polynomial coefficients, lowest order first
 *
 * NOTE(review): res starts out as NULL and the visible accumulation step
 * passes it to lp_build_mul(); the handling of the first iteration, the
 * remaining parameters, the declaration of i and the return are missing
 * from this listing -- confirm against the full file.
 */
693 lp_build_polynomial(struct lp_build_context
*bld
,
695 const double *coeffs
,
698 const union lp_type type
= bld
->type
;
699 LLVMValueRef res
= NULL
;
/* Counts down: the body sees i = num_coeffs - 1 ... 0. */
702 for (i
= num_coeffs
; i
--; ) {
703 LLVMValueRef coeff
= lp_build_const_uni(type
, coeffs
[i
]);
/* Horner step. */
705 res
= lp_build_add(bld
, coeff
, lp_build_mul(bld
, x
, res
));
/**
 * Generate 2 raised to the power a.
 *
 * Implemented as lp_build_pow() with a uniform constant base of 2.0 and a
 * as the exponent.
 *
 * @param bld  build context; supplies the type for the constant
 *
 * NOTE(review): the second parameter is not visible in this listing.
 */
718 lp_build_exp2(struct lp_build_context
*bld
,
721 /* FIXME: optimize */
722 return lp_build_pow(bld
, lp_build_const_uni(bld
->type
, 2.0), a
);
727 * See http://www.devmaster.net/forums/showthread.php?p=43580
730 lp_build_log2(struct lp_build_context
*bld
,
733 const union lp_type type
= bld
->type
;
734 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
735 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(type
);
737 LLVMValueRef expmask
= lp_build_int_const_uni(type
, 0x7f800000);
738 LLVMValueRef mantmask
= lp_build_int_const_uni(type
, 0x007fffff);
739 LLVMValueRef one
= LLVMConstBitCast(bld
->one
, int_vec_type
);
741 LLVMValueRef i
= LLVMBuildBitCast(bld
->builder
, x
, int_vec_type
, "");
745 LLVMValueRef logmant
;
747 /* exp = (float) exponent(x) */
748 exp
= LLVMBuildAnd(bld
->builder
, i
, expmask
, "");
749 exp
= LLVMBuildLShr(bld
->builder
, exp
, lp_build_int_const_uni(type
, 23), "");
750 exp
= LLVMBuildSub(bld
->builder
, exp
, lp_build_int_const_uni(type
, 127), "");
751 exp
= LLVMBuildSIToFP(bld
->builder
, exp
, vec_type
, "");
753 /* mant = (float) mantissa(x) */
754 mant
= LLVMBuildAnd(bld
->builder
, i
, mantmask
, "");
755 mant
= LLVMBuildOr(bld
->builder
, mant
, one
, "");
756 mant
= LLVMBuildSIToFP(bld
->builder
, mant
, vec_type
, "");
758 /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
759 * These coefficients can be generate with
760 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
762 const double polynomial
[] = {
763 #if LOG_POLY_DEGREE == 6
764 3.11578814719469302614, -3.32419399085241980044, 2.59883907202499966007, -1.23152682416275988241, 0.318212422185251071475, -0.0344359067839062357313
765 #elif LOG_POLY_DEGREE == 5
766 2.8882704548164776201, -2.52074962577807006663, 1.48116647521213171641, -0.465725644288844778798, 0.0596515482674574969533
767 #elif LOG_POLY_DEGREE == 4
768 2.61761038894603480148, -1.75647175389045657003, 0.688243882994381274313, -0.107254423828329604454
769 #elif LOG_POLY_DEGREE == 3
770 2.28330284476918490682, -1.04913055217340124191, 0.204446009836232697516
776 logmant
= lp_build_polynomial(bld
, mant
, polynomial
, sizeof(polynomial
)/sizeof(polynomial
[0]));
778 /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
779 logmant
= LLVMBuildMul(bld
->builder
, logmant
, LLVMBuildMul(bld
->builder
, mant
, bld
->one
, ""), "");
781 return LLVMBuildAdd(bld
->builder
, logmant
, exp
, "");