1 /**************************************************************************
3 * Copyright 2009-2010 VMware, Inc.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **************************************************************************/
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * notably min/max and saturated operations), and it is often necessary to
36 * resort to machine-specific intrinsics directly. The functions here hide all
38 * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
44 * @author Jose Fonseca <jfonseca@vmware.com>
48 #include "util/u_memory.h"
49 #include "util/u_debug.h"
50 #include "util/u_math.h"
51 #include "util/u_string.h"
52 #include "util/u_cpu_detect.h"
54 #include "lp_bld_type.h"
55 #include "lp_bld_const.h"
56 #include "lp_bld_intr.h"
57 #include "lp_bld_logic.h"
58 #include "lp_bld_pack.h"
59 #include "lp_bld_arit.h"
63 * XXX: Increasing eliminates some artifacts, but adds others, most
64 * noticeably corruption in the Earth halo in Google Earth.
66 #define RCP_NEWTON_STEPS 0
68 #define RSQRT_NEWTON_STEPS 0
70 #define EXP_POLY_DEGREE 3
72 #define LOG_POLY_DEGREE 5
77 * No checks for special case values of a or b = 1 or 0 are done.
80 lp_build_min_simple(struct lp_build_context
*bld
,
84 const struct lp_type type
= bld
->type
;
85 const char *intrinsic
= NULL
;
88 assert(lp_check_value(type
, a
));
89 assert(lp_check_value(type
, b
));
91 /* TODO: optimize the constant case */
93 if(type
.width
* type
.length
== 128) {
95 if(type
.width
== 32 && util_cpu_caps
.has_sse
)
96 intrinsic
= "llvm.x86.sse.min.ps";
97 if(type
.width
== 64 && util_cpu_caps
.has_sse2
)
98 intrinsic
= "llvm.x86.sse2.min.pd";
101 if(type
.width
== 8 && !type
.sign
&& util_cpu_caps
.has_sse2
)
102 intrinsic
= "llvm.x86.sse2.pminu.b";
103 if(type
.width
== 8 && type
.sign
&& util_cpu_caps
.has_sse4_1
)
104 intrinsic
= "llvm.x86.sse41.pminsb";
105 if(type
.width
== 16 && !type
.sign
&& util_cpu_caps
.has_sse4_1
)
106 intrinsic
= "llvm.x86.sse41.pminuw";
107 if(type
.width
== 16 && type
.sign
&& util_cpu_caps
.has_sse2
)
108 intrinsic
= "llvm.x86.sse2.pmins.w";
109 if(type
.width
== 32 && !type
.sign
&& util_cpu_caps
.has_sse4_1
)
110 intrinsic
= "llvm.x86.sse41.pminud";
111 if(type
.width
== 32 && type
.sign
&& util_cpu_caps
.has_sse4_1
)
112 intrinsic
= "llvm.x86.sse41.pminsd";
117 return lp_build_intrinsic_binary(bld
->builder
, intrinsic
, lp_build_vec_type(bld
->type
), a
, b
);
119 cond
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, a
, b
);
120 return lp_build_select(bld
, cond
, a
, b
);
126 * No checks for special case values of a or b = 1 or 0 are done.
129 lp_build_max_simple(struct lp_build_context
*bld
,
133 const struct lp_type type
= bld
->type
;
134 const char *intrinsic
= NULL
;
137 assert(lp_check_value(type
, a
));
138 assert(lp_check_value(type
, b
));
140 /* TODO: optimize the constant case */
142 if(type
.width
* type
.length
== 128) {
144 if(type
.width
== 32 && util_cpu_caps
.has_sse
)
145 intrinsic
= "llvm.x86.sse.max.ps";
146 if(type
.width
== 64 && util_cpu_caps
.has_sse2
)
147 intrinsic
= "llvm.x86.sse2.max.pd";
150 if(type
.width
== 8 && !type
.sign
&& util_cpu_caps
.has_sse2
)
151 intrinsic
= "llvm.x86.sse2.pmaxu.b";
152 if(type
.width
== 8 && type
.sign
&& util_cpu_caps
.has_sse4_1
)
153 intrinsic
= "llvm.x86.sse41.pmaxsb";
154 if(type
.width
== 16 && !type
.sign
&& util_cpu_caps
.has_sse4_1
)
155 intrinsic
= "llvm.x86.sse41.pmaxuw";
156 if(type
.width
== 16 && type
.sign
&& util_cpu_caps
.has_sse2
)
157 intrinsic
= "llvm.x86.sse2.pmaxs.w";
158 if(type
.width
== 32 && !type
.sign
&& util_cpu_caps
.has_sse4_1
)
159 intrinsic
= "llvm.x86.sse41.pmaxud";
160 if(type
.width
== 32 && type
.sign
&& util_cpu_caps
.has_sse4_1
)
161 intrinsic
= "llvm.x86.sse41.pmaxsd";
166 return lp_build_intrinsic_binary(bld
->builder
, intrinsic
, lp_build_vec_type(bld
->type
), a
, b
);
168 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, b
);
169 return lp_build_select(bld
, cond
, a
, b
);
174 * Generate 1 - a, or ~a depending on bld->type.
177 lp_build_comp(struct lp_build_context
*bld
,
180 const struct lp_type type
= bld
->type
;
182 assert(lp_check_value(type
, a
));
189 if(type
.norm
&& !type
.floating
&& !type
.fixed
&& !type
.sign
) {
190 if(LLVMIsConstant(a
))
191 return LLVMConstNot(a
);
193 return LLVMBuildNot(bld
->builder
, a
, "");
196 if(LLVMIsConstant(a
))
198 return LLVMConstFSub(bld
->one
, a
);
200 return LLVMConstSub(bld
->one
, a
);
203 return LLVMBuildFSub(bld
->builder
, bld
->one
, a
, "");
205 return LLVMBuildSub(bld
->builder
, bld
->one
, a
, "");
213 lp_build_add(struct lp_build_context
*bld
,
217 const struct lp_type type
= bld
->type
;
220 assert(lp_check_value(type
, a
));
221 assert(lp_check_value(type
, b
));
227 if(a
== bld
->undef
|| b
== bld
->undef
)
231 const char *intrinsic
= NULL
;
233 if(a
== bld
->one
|| b
== bld
->one
)
236 if(util_cpu_caps
.has_sse2
&&
237 type
.width
* type
.length
== 128 &&
238 !type
.floating
&& !type
.fixed
) {
240 intrinsic
= type
.sign
? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
242 intrinsic
= type
.sign
? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
246 return lp_build_intrinsic_binary(bld
->builder
, intrinsic
, lp_build_vec_type(bld
->type
), a
, b
);
249 if(LLVMIsConstant(a
) && LLVMIsConstant(b
))
251 res
= LLVMConstFAdd(a
, b
);
253 res
= LLVMConstAdd(a
, b
);
256 res
= LLVMBuildFAdd(bld
->builder
, a
, b
, "");
258 res
= LLVMBuildAdd(bld
->builder
, a
, b
, "");
260 /* clamp to ceiling of 1.0 */
261 if(bld
->type
.norm
&& (bld
->type
.floating
|| bld
->type
.fixed
))
262 res
= lp_build_min_simple(bld
, res
, bld
->one
);
264 /* XXX clamp to floor of -1 or 0??? */
270 /** Return the sum of the elements of a */
272 lp_build_sum_vector(struct lp_build_context
*bld
,
275 const struct lp_type type
= bld
->type
;
276 LLVMValueRef index
, res
;
279 assert(lp_check_value(type
, a
));
285 assert(type
.length
> 1);
287 assert(!bld
->type
.norm
);
289 index
= LLVMConstInt(LLVMInt32Type(), 0, 0);
290 res
= LLVMBuildExtractElement(bld
->builder
, a
, index
, "");
292 for (i
= 1; i
< type
.length
; i
++) {
293 index
= LLVMConstInt(LLVMInt32Type(), i
, 0);
295 res
= LLVMBuildFAdd(bld
->builder
, res
,
296 LLVMBuildExtractElement(bld
->builder
,
300 res
= LLVMBuildAdd(bld
->builder
, res
,
301 LLVMBuildExtractElement(bld
->builder
,
314 lp_build_sub(struct lp_build_context
*bld
,
318 const struct lp_type type
= bld
->type
;
321 assert(lp_check_value(type
, a
));
322 assert(lp_check_value(type
, b
));
326 if(a
== bld
->undef
|| b
== bld
->undef
)
332 const char *intrinsic
= NULL
;
337 if(util_cpu_caps
.has_sse2
&&
338 type
.width
* type
.length
== 128 &&
339 !type
.floating
&& !type
.fixed
) {
341 intrinsic
= type
.sign
? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
343 intrinsic
= type
.sign
? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
347 return lp_build_intrinsic_binary(bld
->builder
, intrinsic
, lp_build_vec_type(bld
->type
), a
, b
);
350 if(LLVMIsConstant(a
) && LLVMIsConstant(b
))
352 res
= LLVMConstFSub(a
, b
);
354 res
= LLVMConstSub(a
, b
);
357 res
= LLVMBuildFSub(bld
->builder
, a
, b
, "");
359 res
= LLVMBuildSub(bld
->builder
, a
, b
, "");
361 if(bld
->type
.norm
&& (bld
->type
.floating
|| bld
->type
.fixed
))
362 res
= lp_build_max_simple(bld
, res
, bld
->zero
);
369 * Normalized 8bit multiplication.
373 * makes the following approximation to the division (Sree)
375 * a*b/255 ~= (a*(b + 1)) >> 256
377 * which is the fastest method that satisfies the following OpenGL criteria
379 * 0*0 = 0 and 255*255 = 255
383 * takes the geometric series approximation to the division
385 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
387 * in this case just the first two terms to fit in 16bit arithmetic
389 * t/255 ~= (t + (t >> 8)) >> 8
391 * note that just by itself it doesn't satisfies the OpenGL criteria, as
392 * 255*255 = 254, so the special case b = 255 must be accounted or roundoff
395 * - geometric series plus rounding
397 * when using a geometric series division instead of truncating the result
398 * use roundoff in the approximation (Jim Blinn)
400 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
402 * achieving the exact results
404 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
405 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
406 * @sa Michael Herf, The "double blend trick", May 2000,
407 * http://www.stereopsis.com/doubleblend.html
410 lp_build_mul_u8n(LLVMBuilderRef builder
,
411 struct lp_type i16_type
,
412 LLVMValueRef a
, LLVMValueRef b
)
417 assert(!i16_type
.floating
);
418 assert(lp_check_value(i16_type
, a
));
419 assert(lp_check_value(i16_type
, b
));
421 c8
= lp_build_const_int_vec(i16_type
, 8);
425 /* a*b/255 ~= (a*(b + 1)) >> 256 */
426 b
= LLVMBuildAdd(builder
, b
, lp_build_const_int_vec(i16_type
, 1), "");
427 ab
= LLVMBuildMul(builder
, a
, b
, "");
431 /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */
432 ab
= LLVMBuildMul(builder
, a
, b
, "");
433 ab
= LLVMBuildAdd(builder
, ab
, LLVMBuildLShr(builder
, ab
, c8
, ""), "");
434 ab
= LLVMBuildAdd(builder
, ab
, lp_build_const_int_vec(i16_type
, 0x80), "");
438 ab
= LLVMBuildLShr(builder
, ab
, c8
, "");
448 lp_build_mul(struct lp_build_context
*bld
,
452 const struct lp_type type
= bld
->type
;
456 assert(lp_check_value(type
, a
));
457 assert(lp_check_value(type
, b
));
467 if(a
== bld
->undef
|| b
== bld
->undef
)
470 if(!type
.floating
&& !type
.fixed
&& type
.norm
) {
471 if(type
.width
== 8) {
472 struct lp_type i16_type
= lp_wider_type(type
);
473 LLVMValueRef al
, ah
, bl
, bh
, abl
, abh
, ab
;
475 lp_build_unpack2(bld
->builder
, type
, i16_type
, a
, &al
, &ah
);
476 lp_build_unpack2(bld
->builder
, type
, i16_type
, b
, &bl
, &bh
);
478 /* PMULLW, PSRLW, PADDW */
479 abl
= lp_build_mul_u8n(bld
->builder
, i16_type
, al
, bl
);
480 abh
= lp_build_mul_u8n(bld
->builder
, i16_type
, ah
, bh
);
482 ab
= lp_build_pack2(bld
->builder
, i16_type
, type
, abl
, abh
);
492 shift
= lp_build_const_int_vec(type
, type
.width
/2);
496 if(LLVMIsConstant(a
) && LLVMIsConstant(b
)) {
498 res
= LLVMConstFMul(a
, b
);
500 res
= LLVMConstMul(a
, b
);
503 res
= LLVMConstAShr(res
, shift
);
505 res
= LLVMConstLShr(res
, shift
);
510 res
= LLVMBuildFMul(bld
->builder
, a
, b
, "");
512 res
= LLVMBuildMul(bld
->builder
, a
, b
, "");
515 res
= LLVMBuildAShr(bld
->builder
, res
, shift
, "");
517 res
= LLVMBuildLShr(bld
->builder
, res
, shift
, "");
526 * Small vector x scale multiplication optimization.
529 lp_build_mul_imm(struct lp_build_context
*bld
,
535 assert(lp_check_value(bld
->type
, a
));
544 return lp_build_negate(bld
, a
);
546 if(b
== 2 && bld
->type
.floating
)
547 return lp_build_add(bld
, a
, a
);
550 unsigned shift
= ffs(b
) - 1;
552 if(bld
->type
.floating
) {
555 * Power of two multiplication by directly manipulating the mantissa.
557 * XXX: This might not be always faster, it will introduce a small error
558 * for multiplication by zero, and it will produce wrong results
561 unsigned mantissa
= lp_mantissa(bld
->type
);
562 factor
= lp_build_const_int_vec(bld
->type
, (unsigned long long)shift
<< mantissa
);
563 a
= LLVMBuildBitCast(bld
->builder
, a
, lp_build_int_vec_type(bld
->type
), "");
564 a
= LLVMBuildAdd(bld
->builder
, a
, factor
, "");
565 a
= LLVMBuildBitCast(bld
->builder
, a
, lp_build_vec_type(bld
->type
), "");
570 factor
= lp_build_const_vec(bld
->type
, shift
);
571 return LLVMBuildShl(bld
->builder
, a
, factor
, "");
575 factor
= lp_build_const_vec(bld
->type
, (double)b
);
576 return lp_build_mul(bld
, a
, factor
);
584 lp_build_div(struct lp_build_context
*bld
,
588 const struct lp_type type
= bld
->type
;
590 assert(lp_check_value(type
, a
));
591 assert(lp_check_value(type
, b
));
596 return lp_build_rcp(bld
, b
);
601 if(a
== bld
->undef
|| b
== bld
->undef
)
604 if(LLVMIsConstant(a
) && LLVMIsConstant(b
)) {
606 return LLVMConstFDiv(a
, b
);
608 return LLVMConstSDiv(a
, b
);
610 return LLVMConstUDiv(a
, b
);
613 if(util_cpu_caps
.has_sse
&& type
.width
== 32 && type
.length
== 4)
614 return lp_build_mul(bld
, a
, lp_build_rcp(bld
, b
));
617 return LLVMBuildFDiv(bld
->builder
, a
, b
, "");
619 return LLVMBuildSDiv(bld
->builder
, a
, b
, "");
621 return LLVMBuildUDiv(bld
->builder
, a
, b
, "");
626 * Linear interpolation.
628 * This also works for integer values with a few caveats.
630 * @sa http://www.stereopsis.com/doubleblend.html
633 lp_build_lerp(struct lp_build_context
*bld
,
641 assert(lp_check_value(bld
->type
, x
));
642 assert(lp_check_value(bld
->type
, v0
));
643 assert(lp_check_value(bld
->type
, v1
));
645 delta
= lp_build_sub(bld
, v1
, v0
);
647 res
= lp_build_mul(bld
, x
, delta
);
649 res
= lp_build_add(bld
, v0
, res
);
652 /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
653 * but it will be wrong for other uses. Basically we need a more
654 * powerful lp_type, capable of further distinguishing the values
655 * interpretation from the value storage. */
656 res
= LLVMBuildAnd(bld
->builder
, res
, lp_build_const_int_vec(bld
->type
, (1 << bld
->type
.width
/2) - 1), "");
663 lp_build_lerp_2d(struct lp_build_context
*bld
,
671 LLVMValueRef v0
= lp_build_lerp(bld
, x
, v00
, v01
);
672 LLVMValueRef v1
= lp_build_lerp(bld
, x
, v10
, v11
);
673 return lp_build_lerp(bld
, y
, v0
, v1
);
679 * Do checks for special cases.
682 lp_build_min(struct lp_build_context
*bld
,
686 assert(lp_check_value(bld
->type
, a
));
687 assert(lp_check_value(bld
->type
, b
));
689 if(a
== bld
->undef
|| b
== bld
->undef
)
696 if(a
== bld
->zero
|| b
== bld
->zero
)
704 return lp_build_min_simple(bld
, a
, b
);
710 * Do checks for special cases.
713 lp_build_max(struct lp_build_context
*bld
,
717 assert(lp_check_value(bld
->type
, a
));
718 assert(lp_check_value(bld
->type
, b
));
720 if(a
== bld
->undef
|| b
== bld
->undef
)
727 if(a
== bld
->one
|| b
== bld
->one
)
735 return lp_build_max_simple(bld
, a
, b
);
740 * Generate clamp(a, min, max)
741 * Do checks for special cases.
744 lp_build_clamp(struct lp_build_context
*bld
,
749 assert(lp_check_value(bld
->type
, a
));
750 assert(lp_check_value(bld
->type
, min
));
751 assert(lp_check_value(bld
->type
, max
));
753 a
= lp_build_min(bld
, a
, max
);
754 a
= lp_build_max(bld
, a
, min
);
763 lp_build_abs(struct lp_build_context
*bld
,
766 const struct lp_type type
= bld
->type
;
767 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
769 assert(lp_check_value(type
, a
));
775 /* Mask out the sign bit */
776 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(type
);
777 unsigned long long absMask
= ~(1ULL << (type
.width
- 1));
778 LLVMValueRef mask
= lp_build_const_int_vec(type
, ((unsigned long long) absMask
));
779 a
= LLVMBuildBitCast(bld
->builder
, a
, int_vec_type
, "");
780 a
= LLVMBuildAnd(bld
->builder
, a
, mask
, "");
781 a
= LLVMBuildBitCast(bld
->builder
, a
, vec_type
, "");
785 if(type
.width
*type
.length
== 128 && util_cpu_caps
.has_ssse3
) {
788 return lp_build_intrinsic_unary(bld
->builder
, "llvm.x86.ssse3.pabs.b.128", vec_type
, a
);
790 return lp_build_intrinsic_unary(bld
->builder
, "llvm.x86.ssse3.pabs.w.128", vec_type
, a
);
792 return lp_build_intrinsic_unary(bld
->builder
, "llvm.x86.ssse3.pabs.d.128", vec_type
, a
);
796 return lp_build_max(bld
, a
, LLVMBuildNeg(bld
->builder
, a
, ""));
801 lp_build_negate(struct lp_build_context
*bld
,
804 assert(lp_check_value(bld
->type
, a
));
806 #if HAVE_LLVM >= 0x0207
807 if (bld
->type
.floating
)
808 a
= LLVMBuildFNeg(bld
->builder
, a
, "");
811 a
= LLVMBuildNeg(bld
->builder
, a
, "");
817 /** Return -1, 0 or +1 depending on the sign of a */
819 lp_build_sgn(struct lp_build_context
*bld
,
822 const struct lp_type type
= bld
->type
;
826 assert(lp_check_value(type
, a
));
828 /* Handle non-zero case */
830 /* if not zero then sign must be positive */
833 else if(type
.floating
) {
834 LLVMTypeRef vec_type
;
835 LLVMTypeRef int_type
;
839 unsigned long long maskBit
= (unsigned long long)1 << (type
.width
- 1);
841 int_type
= lp_build_int_vec_type(type
);
842 vec_type
= lp_build_vec_type(type
);
843 mask
= lp_build_const_int_vec(type
, maskBit
);
845 /* Take the sign bit and add it to 1 constant */
846 sign
= LLVMBuildBitCast(bld
->builder
, a
, int_type
, "");
847 sign
= LLVMBuildAnd(bld
->builder
, sign
, mask
, "");
848 one
= LLVMConstBitCast(bld
->one
, int_type
);
849 res
= LLVMBuildOr(bld
->builder
, sign
, one
, "");
850 res
= LLVMBuildBitCast(bld
->builder
, res
, vec_type
, "");
854 LLVMValueRef minus_one
= lp_build_const_vec(type
, -1.0);
855 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, bld
->zero
);
856 res
= lp_build_select(bld
, cond
, bld
->one
, minus_one
);
860 cond
= lp_build_cmp(bld
, PIPE_FUNC_EQUAL
, a
, bld
->zero
);
861 res
= lp_build_select(bld
, cond
, bld
->zero
, res
);
868 * Set the sign of float vector 'a' according to 'sign'.
869 * If sign==0, return abs(a).
870 * If sign==1, return -abs(a);
871 * Other values for sign produce undefined results.
874 lp_build_set_sign(struct lp_build_context
*bld
,
875 LLVMValueRef a
, LLVMValueRef sign
)
877 const struct lp_type type
= bld
->type
;
878 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(type
);
879 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
880 LLVMValueRef shift
= lp_build_const_int_vec(type
, type
.width
- 1);
881 LLVMValueRef mask
= lp_build_const_int_vec(type
,
882 ~((unsigned long long) 1 << (type
.width
- 1)));
883 LLVMValueRef val
, res
;
885 assert(type
.floating
);
886 assert(lp_check_value(type
, a
));
888 /* val = reinterpret_cast<int>(a) */
889 val
= LLVMBuildBitCast(bld
->builder
, a
, int_vec_type
, "");
890 /* val = val & mask */
891 val
= LLVMBuildAnd(bld
->builder
, val
, mask
, "");
892 /* sign = sign << shift */
893 sign
= LLVMBuildShl(bld
->builder
, sign
, shift
, "");
894 /* res = val | sign */
895 res
= LLVMBuildOr(bld
->builder
, val
, sign
, "");
896 /* res = reinterpret_cast<float>(res) */
897 res
= LLVMBuildBitCast(bld
->builder
, res
, vec_type
, "");
904 * Convert vector of (or scalar) int to vector of (or scalar) float.
907 lp_build_int_to_float(struct lp_build_context
*bld
,
910 const struct lp_type type
= bld
->type
;
911 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
913 assert(type
.floating
);
915 return LLVMBuildSIToFP(bld
->builder
, a
, vec_type
, "");
920 enum lp_build_round_sse41_mode
922 LP_BUILD_ROUND_SSE41_NEAREST
= 0,
923 LP_BUILD_ROUND_SSE41_FLOOR
= 1,
924 LP_BUILD_ROUND_SSE41_CEIL
= 2,
925 LP_BUILD_ROUND_SSE41_TRUNCATE
= 3
929 static INLINE LLVMValueRef
930 lp_build_round_sse41(struct lp_build_context
*bld
,
932 enum lp_build_round_sse41_mode mode
)
934 const struct lp_type type
= bld
->type
;
935 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
936 const char *intrinsic
;
938 assert(type
.floating
);
939 assert(type
.width
*type
.length
== 128);
940 assert(lp_check_value(type
, a
));
941 assert(util_cpu_caps
.has_sse4_1
);
945 intrinsic
= "llvm.x86.sse41.round.ps";
948 intrinsic
= "llvm.x86.sse41.round.pd";
955 return lp_build_intrinsic_binary(bld
->builder
, intrinsic
, vec_type
, a
,
956 LLVMConstInt(LLVMInt32Type(), mode
, 0));
961 * Return the integer part of a float (vector) value. The returned value is
963 * Ex: trunc(-1.5) = 1.0
966 lp_build_trunc(struct lp_build_context
*bld
,
969 const struct lp_type type
= bld
->type
;
971 assert(type
.floating
);
972 assert(lp_check_value(type
, a
));
974 if (util_cpu_caps
.has_sse4_1
&& type
.width
*type
.length
== 128)
975 return lp_build_round_sse41(bld
, a
, LP_BUILD_ROUND_SSE41_TRUNCATE
);
977 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
978 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(type
);
980 res
= LLVMBuildFPToSI(bld
->builder
, a
, int_vec_type
, "");
981 res
= LLVMBuildSIToFP(bld
->builder
, res
, vec_type
, "");
988 * Return float (vector) rounded to nearest integer (vector). The returned
989 * value is a float (vector).
990 * Ex: round(0.9) = 1.0
991 * Ex: round(-1.5) = -2.0
994 lp_build_round(struct lp_build_context
*bld
,
997 const struct lp_type type
= bld
->type
;
999 assert(type
.floating
);
1000 assert(lp_check_value(type
, a
));
1002 if (util_cpu_caps
.has_sse4_1
&& type
.width
*type
.length
== 128)
1003 return lp_build_round_sse41(bld
, a
, LP_BUILD_ROUND_SSE41_NEAREST
);
1005 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
1007 res
= lp_build_iround(bld
, a
);
1008 res
= LLVMBuildSIToFP(bld
->builder
, res
, vec_type
, "");
1015 * Return floor of float (vector), result is a float (vector)
1016 * Ex: floor(1.1) = 1.0
1017 * Ex: floor(-1.1) = -2.0
1020 lp_build_floor(struct lp_build_context
*bld
,
1023 const struct lp_type type
= bld
->type
;
1025 assert(type
.floating
);
1026 assert(lp_check_value(type
, a
));
1028 if (util_cpu_caps
.has_sse4_1
&& type
.width
*type
.length
== 128)
1029 return lp_build_round_sse41(bld
, a
, LP_BUILD_ROUND_SSE41_FLOOR
);
1031 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
1033 res
= lp_build_ifloor(bld
, a
);
1034 res
= LLVMBuildSIToFP(bld
->builder
, res
, vec_type
, "");
1041 * Return ceiling of float (vector), returning float (vector).
1042 * Ex: ceil( 1.1) = 2.0
1043 * Ex: ceil(-1.1) = -1.0
1046 lp_build_ceil(struct lp_build_context
*bld
,
1049 const struct lp_type type
= bld
->type
;
1051 assert(type
.floating
);
1052 assert(lp_check_value(type
, a
));
1054 if (util_cpu_caps
.has_sse4_1
&& type
.width
*type
.length
== 128)
1055 return lp_build_round_sse41(bld
, a
, LP_BUILD_ROUND_SSE41_CEIL
);
1057 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
1059 res
= lp_build_iceil(bld
, a
);
1060 res
= LLVMBuildSIToFP(bld
->builder
, res
, vec_type
, "");
1067 * Return fractional part of 'a' computed as a - floor(a)
1068 * Typically used in texture coord arithmetic.
1071 lp_build_fract(struct lp_build_context
*bld
,
1074 assert(bld
->type
.floating
);
1075 return lp_build_sub(bld
, a
, lp_build_floor(bld
, a
));
1080 * Return the integer part of a float (vector) value. The returned value is
1081 * an integer (vector).
1082 * Ex: itrunc(-1.5) = 1
1085 lp_build_itrunc(struct lp_build_context
*bld
,
1088 const struct lp_type type
= bld
->type
;
1089 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(type
);
1091 assert(type
.floating
);
1092 assert(lp_check_value(type
, a
));
1094 return LLVMBuildFPToSI(bld
->builder
, a
, int_vec_type
, "");
1099 * Return float (vector) rounded to nearest integer (vector). The returned
1100 * value is an integer (vector).
1101 * Ex: iround(0.9) = 1
1102 * Ex: iround(-1.5) = -2
1105 lp_build_iround(struct lp_build_context
*bld
,
1108 const struct lp_type type
= bld
->type
;
1109 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(type
);
1112 assert(type
.floating
);
1114 assert(lp_check_value(type
, a
));
1116 if (util_cpu_caps
.has_sse4_1
&& type
.width
*type
.length
== 128) {
1117 res
= lp_build_round_sse41(bld
, a
, LP_BUILD_ROUND_SSE41_NEAREST
);
1120 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
1121 LLVMValueRef mask
= lp_build_const_int_vec(type
, (unsigned long long)1 << (type
.width
- 1));
1126 sign
= LLVMBuildBitCast(bld
->builder
, a
, int_vec_type
, "");
1127 sign
= LLVMBuildAnd(bld
->builder
, sign
, mask
, "");
1130 half
= lp_build_const_vec(type
, 0.5);
1131 half
= LLVMBuildBitCast(bld
->builder
, half
, int_vec_type
, "");
1132 half
= LLVMBuildOr(bld
->builder
, sign
, half
, "");
1133 half
= LLVMBuildBitCast(bld
->builder
, half
, vec_type
, "");
1135 res
= LLVMBuildFAdd(bld
->builder
, a
, half
, "");
1138 res
= LLVMBuildFPToSI(bld
->builder
, res
, int_vec_type
, "");
1145 * Return floor of float (vector), result is an int (vector)
1146 * Ex: ifloor(1.1) = 1.0
1147 * Ex: ifloor(-1.1) = -2.0
1150 lp_build_ifloor(struct lp_build_context
*bld
,
1153 const struct lp_type type
= bld
->type
;
1154 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(type
);
1157 assert(type
.floating
);
1158 assert(lp_check_value(type
, a
));
1160 if (util_cpu_caps
.has_sse4_1
&& type
.width
*type
.length
== 128) {
1161 res
= lp_build_round_sse41(bld
, a
, LP_BUILD_ROUND_SSE41_FLOOR
);
1164 /* Take the sign bit and add it to 1 constant */
1165 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
1166 unsigned mantissa
= lp_mantissa(type
);
1167 LLVMValueRef mask
= lp_build_const_int_vec(type
, (unsigned long long)1 << (type
.width
- 1));
1169 LLVMValueRef offset
;
1171 /* sign = a < 0 ? ~0 : 0 */
1172 sign
= LLVMBuildBitCast(bld
->builder
, a
, int_vec_type
, "");
1173 sign
= LLVMBuildAnd(bld
->builder
, sign
, mask
, "");
1174 sign
= LLVMBuildAShr(bld
->builder
, sign
, lp_build_const_int_vec(type
, type
.width
- 1), "ifloor.sign");
1176 /* offset = -0.99999(9)f */
1177 offset
= lp_build_const_vec(type
, -(double)(((unsigned long long)1 << mantissa
) - 10)/((unsigned long long)1 << mantissa
));
1178 offset
= LLVMConstBitCast(offset
, int_vec_type
);
1180 /* offset = a < 0 ? offset : 0.0f */
1181 offset
= LLVMBuildAnd(bld
->builder
, offset
, sign
, "");
1182 offset
= LLVMBuildBitCast(bld
->builder
, offset
, vec_type
, "ifloor.offset");
1184 res
= LLVMBuildFAdd(bld
->builder
, a
, offset
, "ifloor.res");
1187 /* round to nearest (toward zero) */
1188 res
= LLVMBuildFPToSI(bld
->builder
, res
, int_vec_type
, "ifloor.res");
1195 * Return ceiling of float (vector), returning int (vector).
1196 * Ex: iceil( 1.1) = 2
1197 * Ex: iceil(-1.1) = -1
1200 lp_build_iceil(struct lp_build_context
*bld
,
1203 const struct lp_type type
= bld
->type
;
1204 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(type
);
1207 assert(type
.floating
);
1208 assert(lp_check_value(type
, a
));
1210 if (util_cpu_caps
.has_sse4_1
&& type
.width
*type
.length
== 128) {
1211 res
= lp_build_round_sse41(bld
, a
, LP_BUILD_ROUND_SSE41_CEIL
);
1214 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
1215 unsigned mantissa
= lp_mantissa(type
);
1216 LLVMValueRef mask
= lp_build_const_int_vec(type
, (unsigned long long)1 << (type
.width
- 1));
1218 LLVMValueRef offset
;
1220 /* sign = a < 0 ? 0 : ~0 */
1221 sign
= LLVMBuildBitCast(bld
->builder
, a
, int_vec_type
, "");
1222 sign
= LLVMBuildAnd(bld
->builder
, sign
, mask
, "");
1223 sign
= LLVMBuildAShr(bld
->builder
, sign
, lp_build_const_int_vec(type
, type
.width
- 1), "iceil.sign");
1224 sign
= LLVMBuildNot(bld
->builder
, sign
, "iceil.not");
1226 /* offset = 0.99999(9)f */
1227 offset
= lp_build_const_vec(type
, (double)(((unsigned long long)1 << mantissa
) - 10)/((unsigned long long)1 << mantissa
));
1228 offset
= LLVMConstBitCast(offset
, int_vec_type
);
1230 /* offset = a < 0 ? 0.0 : offset */
1231 offset
= LLVMBuildAnd(bld
->builder
, offset
, sign
, "");
1232 offset
= LLVMBuildBitCast(bld
->builder
, offset
, vec_type
, "iceil.offset");
1234 res
= LLVMBuildFAdd(bld
->builder
, a
, offset
, "iceil.res");
1237 /* round to nearest (toward zero) */
1238 res
= LLVMBuildFPToSI(bld
->builder
, res
, int_vec_type
, "iceil.res");
1245 lp_build_sqrt(struct lp_build_context
*bld
,
1248 const struct lp_type type
= bld
->type
;
1249 LLVMTypeRef vec_type
= lp_build_vec_type(type
);
1252 assert(lp_check_value(type
, a
));
1254 /* TODO: optimize the constant case */
1255 /* TODO: optimize the constant case */
1257 assert(type
.floating
);
1258 util_snprintf(intrinsic
, sizeof intrinsic
, "llvm.sqrt.v%uf%u", type
.length
, type
.width
);
1260 return lp_build_intrinsic_unary(bld
->builder
, intrinsic
, vec_type
, a
);
1265 * Do one Newton-Raphson step to improve reciprocate precision:
1267 * x_{i+1} = x_i * (2 - a * x_i)
1270 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
1271 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
1273 static INLINE LLVMValueRef
1274 lp_build_rcp_refine(struct lp_build_context
*bld
,
1278 LLVMValueRef two
= lp_build_const_vec(bld
->type
, 2.0);
1281 res
= LLVMBuildFMul(bld
->builder
, a
, rcp_a
, "");
1282 res
= LLVMBuildFSub(bld
->builder
, two
, res
, "");
1283 res
= LLVMBuildFMul(bld
->builder
, rcp_a
, res
, "");
1290 lp_build_rcp(struct lp_build_context
*bld
,
1293 const struct lp_type type
= bld
->type
;
1295 assert(lp_check_value(type
, a
));
1304 assert(type
.floating
);
1306 if(LLVMIsConstant(a
))
1307 return LLVMConstFDiv(bld
->one
, a
);
1309 if(util_cpu_caps
.has_sse
&& type
.width
== 32 && type
.length
== 4) {
1313 res
= lp_build_intrinsic_unary(bld
->builder
, "llvm.x86.sse.rcp.ps", bld
->vec_type
, a
);
1315 for (i
= 0; i
< RCP_NEWTON_STEPS
; ++i
) {
1316 res
= lp_build_rcp_refine(bld
, a
, res
);
1322 return LLVMBuildFDiv(bld
->builder
, bld
->one
, a
, "");
1327 * Do one Newton-Raphson step to improve rsqrt precision:
1329 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
1332 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
1334 static INLINE LLVMValueRef
1335 lp_build_rsqrt_refine(struct lp_build_context
*bld
,
1337 LLVMValueRef rsqrt_a
)
1339 LLVMValueRef half
= lp_build_const_vec(bld
->type
, 0.5);
1340 LLVMValueRef three
= lp_build_const_vec(bld
->type
, 3.0);
1343 res
= LLVMBuildFMul(bld
->builder
, rsqrt_a
, rsqrt_a
, "");
1344 res
= LLVMBuildFMul(bld
->builder
, a
, res
, "");
1345 res
= LLVMBuildFSub(bld
->builder
, three
, res
, "");
1346 res
= LLVMBuildFMul(bld
->builder
, rsqrt_a
, res
, "");
1347 res
= LLVMBuildFMul(bld
->builder
, half
, res
, "");
1354 * Generate 1/sqrt(a)
1357 lp_build_rsqrt(struct lp_build_context
*bld
,
1360 const struct lp_type type
= bld
->type
;
1362 assert(lp_check_value(type
, a
));
1364 assert(type
.floating
);
1366 if(util_cpu_caps
.has_sse
&& type
.width
== 32 && type
.length
== 4) {
1370 res
= lp_build_intrinsic_unary(bld
->builder
, "llvm.x86.sse.rsqrt.ps", bld
->vec_type
, a
);
1372 for (i
= 0; i
< RSQRT_NEWTON_STEPS
; ++i
) {
1373 res
= lp_build_rsqrt_refine(bld
, a
, res
);
1379 return lp_build_rcp(bld
, lp_build_sqrt(bld
, a
));
1383 static inline LLVMValueRef
1384 lp_build_const_v4si(unsigned long value
)
1386 LLVMValueRef element
= LLVMConstInt(LLVMInt32Type(), value
, 0);
1387 LLVMValueRef elements
[4] = { element
, element
, element
, element
};
1388 return LLVMConstVector(elements
, 4);
1391 static inline LLVMValueRef
1392 lp_build_const_v4sf(float value
)
1394 LLVMValueRef element
= LLVMConstReal(LLVMFloatType(), value
);
1395 LLVMValueRef elements
[4] = { element
, element
, element
, element
};
1396 return LLVMConstVector(elements
, 4);
/**
 * Generate sin(a) using SSE2.
 *
 * Straight LLVM-IR port of the cephes-style vector sine from Julien
 * Pommier's sse_mathfun.h: range-reduce |a| to [0, Pi/4], evaluate the
 * sine and cosine minimax polynomials in parallel, select per lane,
 * then restore the sign.  Assumes a 4 x float32 vector (the v4sf/v4si
 * types below are hard-coded).
 */
LLVMValueRef
lp_build_sin(struct lp_build_context *bld,
             LLVMValueRef a)
{
   struct lp_type int_type = lp_int_type(bld->type);
   LLVMBuilderRef b = bld->builder;
   LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatType(), 4);
   LLVMTypeRef v4si = LLVMVectorType(LLVMInt32Type(), 4);

   /*
    * take the absolute value,
    * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
    */
   LLVMValueRef inv_sig_mask = lp_build_const_v4si(~0x80000000);
   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, v4si, "a_v4si");

   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, v4sf, "x_abs");

   /*
    * extract the sign bit (upper one)
    * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
    */
   LLVMValueRef sig_mask = lp_build_const_v4si(0x80000000);
   LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i");

   /*
    * scale by 4/Pi
    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
    */
   LLVMValueRef FOPi = lp_build_const_v4sf(1.27323954473516);
   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");

   /*
    * store the integer part of y in mm0
    * emm2 = _mm_cvttps_epi32(y);
    */
   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, v4si, "emm2_i");

   /*
    * j=(j+1) & (~1) (see the cephes sources)
    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
    */
   LLVMValueRef all_one = lp_build_const_v4si(1);
   LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");

   /*
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
    */
   LLVMValueRef inv_one = lp_build_const_v4si(~1);
   LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");

   /*
    * y = _mm_cvtepi32_ps(emm2);
    */
   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, v4sf, "y_2");

   /* get the swap sign flag
    * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
    */
   LLVMValueRef pi32_4 = lp_build_const_v4si(4);
   LLVMValueRef emm0_and = LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and");

   /*
    * emm2 = _mm_slli_epi32(emm0, 29);
    *
    * Moves bit 2 of the quadrant index up into the float sign-bit
    * position (bit 31).
    */
   LLVMValueRef const_29 = lp_build_const_v4si(29);
   LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit");

   /*
    * get the polynom selection mask
    * there is one polynom for 0 <= x <= Pi/4
    * and another one for Pi/4<x<=Pi/2
    * Both branches will be computed.
    *
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
    */
   LLVMValueRef pi32_2 = lp_build_const_v4si(2);
   LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3");
   LLVMValueRef poly_mask = lp_build_compare(b, int_type, PIPE_FUNC_EQUAL,
                                             emm2_3, lp_build_const_v4si(0));

   /*
    * sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
    */
   LLVMValueRef sign_bit_1 = LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit");

   /*
    * _PS_CONST(minus_cephes_DP1, -0.78515625);
    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
    *
    * -Pi/4 split into three parts for extended-precision reduction.
    */
   LLVMValueRef DP1 = lp_build_const_v4sf(-0.78515625);
   LLVMValueRef DP2 = lp_build_const_v4sf(-2.4187564849853515625e-4);
   LLVMValueRef DP3 = lp_build_const_v4sf(-3.77489497744594108e-8);

   /*
    * The magic pass: "Extended precision modular arithmetic"
    * x = ((x - y * DP1) - y * DP2) - y * DP3;
    * xmm1 = _mm_mul_ps(y, xmm1);
    * xmm2 = _mm_mul_ps(y, xmm2);
    * xmm3 = _mm_mul_ps(y, xmm3);
    */
   LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
   LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
   LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");

   /*
    * x = _mm_add_ps(x, xmm1);
    * x = _mm_add_ps(x, xmm2);
    * x = _mm_add_ps(x, xmm3);
    */
   LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
   LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
   LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");

   /*
    * Evaluate the first polynom (0 <= x <= Pi/4)
    *
    * z = _mm_mul_ps(x,x);
    */
   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");

   /*
    * _PS_CONST(coscof_p0, 2.443315711809948E-005);
    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
    * _PS_CONST(coscof_p2, 4.166664568298827E-002);
    */
   LLVMValueRef coscof_p0 = lp_build_const_v4sf(2.443315711809948E-005);
   LLVMValueRef coscof_p1 = lp_build_const_v4sf(-1.388731625493765E-003);
   LLVMValueRef coscof_p2 = lp_build_const_v4sf(4.166664568298827E-002);

   /*
    * y = *(v4sf*)_ps_coscof_p0;
    * y = _mm_mul_ps(y, z);
    *
    * Horner evaluation of the cosine polynomial in z = x^2.
    */
   LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
   LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
   LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
   LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");

   /*
    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
    * y = _mm_sub_ps(y, tmp);
    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
    */
   LLVMValueRef half = lp_build_const_v4sf(0.5);
   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
   LLVMValueRef one = lp_build_const_v4sf(1.0);
   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");

   /*
    * _PS_CONST(sincof_p0, -1.9515295891E-4);
    * _PS_CONST(sincof_p1, 8.3321608736E-3);
    * _PS_CONST(sincof_p2, -1.6666654611E-1);
    */
   LLVMValueRef sincof_p0 = lp_build_const_v4sf(-1.9515295891E-4);
   LLVMValueRef sincof_p1 = lp_build_const_v4sf(8.3321608736E-3);
   LLVMValueRef sincof_p2 = lp_build_const_v4sf(-1.6666654611E-1);

   /*
    * Evaluate the second polynom (Pi/4 <= x <= 0)
    *
    * y2 = *(v4sf*)_ps_sincof_p0;
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_mul_ps(y2, x);
    * y2 = _mm_add_ps(y2, x);
    */
   LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
   LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
   LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
   LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
   LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
   LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");

   /*
    * select the correct result from the two polynoms
    *
    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
    * y = _mm_andnot_ps(xmm3, y);
    * y = _mm_add_ps(y,y2);
    *
    * poly_mask is all-ones or all-zeros per lane, so the masked add
    * below is an exact per-lane select.
    */
   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, v4si, "y2_i");
   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, v4si, "y_i");
   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
   LLVMValueRef inv = lp_build_const_v4si(~0);
   LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
   LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");

   /*
    * update the sign
    * y = _mm_xor_ps(y, sign_bit);
    */
   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin");
   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, v4sf, "y_result");

   return y_result;
}
/**
 * Generate cos(a) using SSE2.
 *
 * Same cephes-style scheme as lp_build_sin() above (sse_mathfun.h),
 * with the quadrant index shifted by 2 so the sine machinery computes
 * a cosine; no separate input-sign extraction is needed since
 * cos(-a) == cos(a).  Assumes a 4 x float32 vector.
 */
LLVMValueRef
lp_build_cos(struct lp_build_context *bld,
             LLVMValueRef a)
{
   struct lp_type int_type = lp_int_type(bld->type);
   LLVMBuilderRef b = bld->builder;
   LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatType(), 4);
   LLVMTypeRef v4si = LLVMVectorType(LLVMInt32Type(), 4);

   /*
    * take the absolute value,
    * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
    */
   LLVMValueRef inv_sig_mask = lp_build_const_v4si(~0x80000000);
   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, v4si, "a_v4si");

   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, v4sf, "x_abs");

   /*
    * scale by 4/Pi
    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
    */
   LLVMValueRef FOPi = lp_build_const_v4sf(1.27323954473516);
   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");

   /*
    * store the integer part of y in mm0
    * emm2 = _mm_cvttps_epi32(y);
    */
   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, v4si, "emm2_i");

   /*
    * j=(j+1) & (~1) (see the cephes sources)
    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
    */
   LLVMValueRef all_one = lp_build_const_v4si(1);
   LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");

   /*
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
    */
   LLVMValueRef inv_one = lp_build_const_v4si(~1);
   LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");

   /*
    * y = _mm_cvtepi32_ps(emm2);
    */
   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, v4sf, "y_2");

   /*
    * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
    *
    * Quadrant shift: cos(a) = sin(a + Pi/2).
    */
   LLVMValueRef const_2 = lp_build_const_v4si(2);
   LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2");

   /* get the swap sign flag
    * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
    *
    * andnot is emulated here as (~emm2) & 4.
    */
   LLVMValueRef inv = lp_build_const_v4si(~0);
   LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not");
   LLVMValueRef pi32_4 = lp_build_const_v4si(4);
   LLVMValueRef emm0_and = LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and");

   /*
    * emm2 = _mm_slli_epi32(emm0, 29);
    *
    * Moves bit 2 up into the float sign-bit position (bit 31).
    */
   LLVMValueRef const_29 = lp_build_const_v4si(29);
   LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit");

   /*
    * get the polynom selection mask
    * there is one polynom for 0 <= x <= Pi/4
    * and another one for Pi/4<x<=Pi/2
    * Both branches will be computed.
    *
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
    */
   LLVMValueRef pi32_2 = lp_build_const_v4si(2);
   LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3");
   LLVMValueRef poly_mask = lp_build_compare(b, int_type, PIPE_FUNC_EQUAL,
                                             emm2_3, lp_build_const_v4si(0));

   /*
    * _PS_CONST(minus_cephes_DP1, -0.78515625);
    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
    *
    * -Pi/4 split into three parts for extended-precision reduction.
    */
   LLVMValueRef DP1 = lp_build_const_v4sf(-0.78515625);
   LLVMValueRef DP2 = lp_build_const_v4sf(-2.4187564849853515625e-4);
   LLVMValueRef DP3 = lp_build_const_v4sf(-3.77489497744594108e-8);

   /*
    * The magic pass: "Extended precision modular arithmetic"
    * x = ((x - y * DP1) - y * DP2) - y * DP3;
    * xmm1 = _mm_mul_ps(y, xmm1);
    * xmm2 = _mm_mul_ps(y, xmm2);
    * xmm3 = _mm_mul_ps(y, xmm3);
    */
   LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
   LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
   LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");

   /*
    * x = _mm_add_ps(x, xmm1);
    * x = _mm_add_ps(x, xmm2);
    * x = _mm_add_ps(x, xmm3);
    */
   LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
   LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
   LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");

   /*
    * Evaluate the first polynom (0 <= x <= Pi/4)
    *
    * z = _mm_mul_ps(x,x);
    */
   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");

   /*
    * _PS_CONST(coscof_p0, 2.443315711809948E-005);
    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
    * _PS_CONST(coscof_p2, 4.166664568298827E-002);
    */
   LLVMValueRef coscof_p0 = lp_build_const_v4sf(2.443315711809948E-005);
   LLVMValueRef coscof_p1 = lp_build_const_v4sf(-1.388731625493765E-003);
   LLVMValueRef coscof_p2 = lp_build_const_v4sf(4.166664568298827E-002);

   /*
    * y = *(v4sf*)_ps_coscof_p0;
    * y = _mm_mul_ps(y, z);
    *
    * Horner evaluation of the cosine polynomial in z = x^2.
    */
   LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
   LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
   LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
   LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");

   /*
    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
    * y = _mm_sub_ps(y, tmp);
    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
    */
   LLVMValueRef half = lp_build_const_v4sf(0.5);
   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
   LLVMValueRef one = lp_build_const_v4sf(1.0);
   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");

   /*
    * _PS_CONST(sincof_p0, -1.9515295891E-4);
    * _PS_CONST(sincof_p1, 8.3321608736E-3);
    * _PS_CONST(sincof_p2, -1.6666654611E-1);
    */
   LLVMValueRef sincof_p0 = lp_build_const_v4sf(-1.9515295891E-4);
   LLVMValueRef sincof_p1 = lp_build_const_v4sf(8.3321608736E-3);
   LLVMValueRef sincof_p2 = lp_build_const_v4sf(-1.6666654611E-1);

   /*
    * Evaluate the second polynom (Pi/4 <= x <= 0)
    *
    * y2 = *(v4sf*)_ps_sincof_p0;
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_mul_ps(y2, x);
    * y2 = _mm_add_ps(y2, x);
    */
   LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
   LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
   LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
   LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
   LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
   LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");

   /*
    * select the correct result from the two polynoms
    *
    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
    * y = _mm_andnot_ps(xmm3, y);
    * y = _mm_add_ps(y,y2);
    *
    * poly_mask is all-ones or all-zeros per lane, so the masked add
    * below is an exact per-lane select.
    */
   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, v4si, "y2_i");
   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, v4si, "y_i");
   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
   LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
   LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");

   /*
    * update the sign
    * y = _mm_xor_ps(y, sign_bit);
    */
   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sin");
   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, v4sf, "y_result");

   return y_result;
}
1835 * Generate pow(x, y)
1838 lp_build_pow(struct lp_build_context
*bld
,
1842 /* TODO: optimize the constant case */
1843 if(LLVMIsConstant(x
) && LLVMIsConstant(y
))
1844 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1847 return lp_build_exp2(bld
, lp_build_mul(bld
, lp_build_log2(bld
, x
), y
));
1855 lp_build_exp(struct lp_build_context
*bld
,
1858 /* log2(e) = 1/log(2) */
1859 LLVMValueRef log2e
= lp_build_const_vec(bld
->type
, 1.4426950408889634);
1861 assert(lp_check_value(bld
->type
, x
));
1863 return lp_build_mul(bld
, log2e
, lp_build_exp2(bld
, x
));
1871 lp_build_log(struct lp_build_context
*bld
,
1875 LLVMValueRef log2
= lp_build_const_vec(bld
->type
, 0.69314718055994529);
1877 assert(lp_check_value(bld
->type
, x
));
1879 return lp_build_mul(bld
, log2
, lp_build_exp2(bld
, x
));
/**
 * Generate polynomial.
 * Ex:  coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
 *
 * Evaluated with Horner's rule, walking coefficients from the highest
 * degree down: res = coeffs[i] + x * res.
 * Returns bld->undef when num_coeffs is 0.
 */
static LLVMValueRef
lp_build_polynomial(struct lp_build_context *bld,
                    LLVMValueRef x,
                    const double *coeffs,
                    unsigned num_coeffs)
{
   const struct lp_type type = bld->type;
   LLVMValueRef res = NULL;
   unsigned i;

   assert(lp_check_value(bld->type, x));

   /* TODO: optimize the constant case */
   if(LLVMIsConstant(x))
      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                   __FUNCTION__);

   for (i = num_coeffs; i--; ) {
      LLVMValueRef coeff;

      coeff = lp_build_const_vec(type, coeffs[i]);

      /* First (highest-degree) coefficient just seeds the accumulator. */
      if(res)
         res = lp_build_add(bld, coeff, lp_build_mul(bld, x, res));
      else
         res = coeff;
   }

   if(res)
      return res;
   else
      return bld->undef;
}
/**
 * Minimax polynomial fit of 2**x, in range [0, 1[
 *
 * Coefficients are listed lowest degree first, as consumed by
 * lp_build_polynomial().  EXP_POLY_DEGREE selects the accuracy/cost
 * trade-off at compile time.
 */
const double lp_build_exp2_polynomial[] = {
#if EXP_POLY_DEGREE == 5
   0.999999999690134838155,
   0.583974334321735217258,
   0.164553105719676828492,
   0.0292811063701710962255,
   0.00354944426657875141846,
   0.000296253726543423377365
#elif EXP_POLY_DEGREE == 4
   1.00000001502262084505,
   0.563586057338685991394,
   0.150436017652442413623,
   0.0243220604213317927308,
   0.0025359088446580436489
#elif EXP_POLY_DEGREE == 3
   0.999925218562710312959,
   0.695833540494823811697,
   0.226067155427249155588,
   0.0780245226406372992967
#elif EXP_POLY_DEGREE == 2
   1.00172476321474503578,
   0.657636275736077639316,
   0.33718943461968720704
#else
#error
#endif
};
/**
 * Compute 2^x (and optionally its pieces) for a 32-bit float vector.
 *
 * Splits x into integer and fractional parts; the integer part becomes
 * the exponent bits of a float built directly (2^ipart), and 2^fpart is
 * approximated with a minimax polynomial.  Any output pointer may be
 * NULL, in which case the corresponding work is skipped.
 *
 * \param p_exp2_int_part  if non-NULL, receives 2^floor(x) as a float vector
 * \param p_frac_part      if non-NULL, receives x - floor(x)
 * \param p_exp2           if non-NULL, receives the full 2^x result
 */
void
lp_build_exp2_approx(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef *p_exp2_int_part,
                     LLVMValueRef *p_frac_part,
                     LLVMValueRef *p_exp2)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
   LLVMValueRef ipart = NULL;
   LLVMValueRef fpart = NULL;
   LLVMValueRef expipart = NULL;
   LLVMValueRef expfpart = NULL;
   LLVMValueRef res = NULL;

   assert(lp_check_value(bld->type, x));

   if(p_exp2_int_part || p_frac_part || p_exp2) {
      /* TODO: optimize the constant case */
      if(LLVMIsConstant(x))
         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                      __FUNCTION__);

      /* The bit-twiddling below hard-codes the IEEE single format. */
      assert(type.floating && type.width == 32);

      /* Clamp x to the representable exponent range to avoid producing
       * Inf/denormal exponent fields below. */
      x = lp_build_min(bld, x, lp_build_const_vec(type, 129.0));
      x = lp_build_max(bld, x, lp_build_const_vec(type, -126.99999));

      /* ipart = floor(x) */
      ipart = lp_build_floor(bld, x);

      /* fpart = x - ipart */
      fpart = LLVMBuildFSub(bld->builder, x, ipart, "");
   }

   if(p_exp2_int_part || p_exp2) {
      /* expipart = (float) (1 << ipart)
       * Built by biasing the integer part by 127 and placing it in the
       * exponent field (bits 23..30) of a float. */
      ipart = LLVMBuildFPToSI(bld->builder, ipart, int_vec_type, "");
      expipart = LLVMBuildAdd(bld->builder, ipart,
                              lp_build_const_int_vec(type, 127), "");
      expipart = LLVMBuildShl(bld->builder, expipart,
                              lp_build_const_int_vec(type, 23), "");
      expipart = LLVMBuildBitCast(bld->builder, expipart, vec_type, "");
   }

   if(p_exp2) {
      /* 2^fpart via minimax polynomial, fpart in [0, 1[. */
      expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
                                     Elements(lp_build_exp2_polynomial));

      /* 2^x = 2^ipart * 2^fpart */
      res = LLVMBuildFMul(bld->builder, expipart, expfpart, "");
   }

   if(p_exp2_int_part)
      *p_exp2_int_part = expipart;

   if(p_frac_part)
      *p_frac_part = fpart;

   if(p_exp2)
      *p_exp2 = res;
}
2017 lp_build_exp2(struct lp_build_context
*bld
,
2021 lp_build_exp2_approx(bld
, x
, NULL
, NULL
, &res
);
/**
 * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
 * These coefficients can be generate with
 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
 *
 * Coefficients are listed lowest degree first, as consumed by
 * lp_build_polynomial().  LOG_POLY_DEGREE selects the accuracy/cost
 * trade-off at compile time.
 */
const double lp_build_log2_polynomial[] = {
#if LOG_POLY_DEGREE == 6
   3.11578814719469302614,
   -3.32419399085241980044,
   2.59883907202499966007,
   -1.23152682416275988241,
   0.318212422185251071475,
   -0.0344359067839062357313
#elif LOG_POLY_DEGREE == 5
   2.8882704548164776201,
   -2.52074962577807006663,
   1.48116647521213171641,
   -0.465725644288844778798,
   0.0596515482674574969533
#elif LOG_POLY_DEGREE == 4
   2.61761038894603480148,
   -1.75647175389045657003,
   0.688243882994381274313,
   -0.107254423828329604454
#elif LOG_POLY_DEGREE == 3
   2.28330284476918490682,
   -1.04913055217340124191,
   0.204446009836232697516
#else
#error
#endif
};
/**
 * Compute log2(x) (and optionally its pieces) for a 32-bit float vector.
 *
 * Extracts the IEEE exponent (the integer part of log2) directly from
 * the float bits and approximates log2 of the mantissa with a minimax
 * polynomial.  Any output pointer may be NULL, in which case the
 * corresponding work is skipped.
 *
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 *
 * \param p_exp         if non-NULL, receives the raw exponent bits bitcast
 *                      to float (NOT converted to a float value)
 * \param p_floor_log2  if non-NULL, receives floor(log2(x)) as floats
 * \param p_log2        if non-NULL, receives the full log2(x) approximation
 */
void
lp_build_log2_approx(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef *p_exp,
                     LLVMValueRef *p_floor_log2,
                     LLVMValueRef *p_log2)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);

   /* IEEE single: exponent bits 23..30, mantissa bits 0..22. */
   LLVMValueRef expmask = lp_build_const_int_vec(type, 0x7f800000);
   LLVMValueRef mantmask = lp_build_const_int_vec(type, 0x007fffff);
   LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);

   LLVMValueRef i = NULL;
   LLVMValueRef exp = NULL;
   LLVMValueRef mant = NULL;
   LLVMValueRef logexp = NULL;
   LLVMValueRef logmant = NULL;
   LLVMValueRef res = NULL;

   assert(lp_check_value(bld->type, x));

   if(p_exp || p_floor_log2 || p_log2) {
      /* TODO: optimize the constant case */
      if(LLVMIsConstant(x))
         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                      __FUNCTION__);

      /* The bit-twiddling below hard-codes the IEEE single format. */
      assert(type.floating && type.width == 32);

      i = LLVMBuildBitCast(bld->builder, x, int_vec_type, "");

      /* exp = (float) exponent(x) */
      exp = LLVMBuildAnd(bld->builder, i, expmask, "");
   }

   if(p_floor_log2 || p_log2) {
      /* Unbias the exponent field to get floor(log2(x)). */
      logexp = LLVMBuildLShr(bld->builder, exp,
                             lp_build_const_int_vec(type, 23), "");
      logexp = LLVMBuildSub(bld->builder, logexp,
                            lp_build_const_int_vec(type, 127), "");
      logexp = LLVMBuildSIToFP(bld->builder, logexp, vec_type, "");
   }

   if(p_log2) {
      /* mant = (float) mantissa(x): OR in the exponent bits of 1.0 so
       * the bit pattern reads as a float in [1, 2[. */
      mant = LLVMBuildAnd(bld->builder, i, mantmask, "");
      mant = LLVMBuildOr(bld->builder, mant, one, "");
      mant = LLVMBuildBitCast(bld->builder, mant, vec_type, "");

      logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
                                    Elements(lp_build_log2_polynomial));

      /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
      logmant = LLVMBuildFMul(bld->builder, logmant,
                              LLVMBuildFSub(bld->builder, mant, bld->one, ""), "");

      res = LLVMBuildFAdd(bld->builder, logmant, logexp, "");
   }

   if(p_exp) {
      exp = LLVMBuildBitCast(bld->builder, exp, vec_type, "");
      *p_exp = exp;
   }

   if(p_floor_log2)
      *p_floor_log2 = logexp;

   if(p_log2)
      *p_log2 = res;
}
2136 lp_build_log2(struct lp_build_context
*bld
,
2140 lp_build_log2_approx(bld
, x
, NULL
, NULL
, &res
);