1 /**************************************************************************
3 * Copyright 2009-2010 VMware, Inc.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **************************************************************************/
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
38 * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
44 * @author Jose Fonseca <jfonseca@vmware.com>
48 #include "util/u_memory.h"
49 #include "util/u_debug.h"
50 #include "util/u_math.h"
51 #include "util/u_string.h"
52 #include "util/u_cpu_detect.h"
54 #include "lp_bld_type.h"
55 #include "lp_bld_const.h"
56 #include "lp_bld_init.h"
57 #include "lp_bld_intr.h"
58 #include "lp_bld_logic.h"
59 #include "lp_bld_pack.h"
60 #include "lp_bld_debug.h"
61 #include "lp_bld_arit.h"
65 #define EXP_POLY_DEGREE 5
67 #define LOG_POLY_DEGREE 4
72 * No checks for special case values of a or b = 1 or 0 are done.
75 lp_build_min_simple(struct lp_build_context
*bld
,
79 const struct lp_type type
= bld
->type
;
80 const char *intrinsic
= NULL
;
81 unsigned intr_size
= 0;
84 assert(lp_check_value(type
, a
));
85 assert(lp_check_value(type
, b
));
87 /* TODO: optimize the constant case */
89 if (type
.floating
&& util_cpu_caps
.has_sse
) {
90 if (type
.width
== 32) {
91 if (type
.length
== 1) {
92 intrinsic
= "llvm.x86.sse.min.ss";
95 else if (type
.length
<= 4 || !util_cpu_caps
.has_avx
) {
96 intrinsic
= "llvm.x86.sse.min.ps";
100 intrinsic
= "llvm.x86.avx.min.ps.256";
104 if (type
.width
== 64 && util_cpu_caps
.has_sse2
) {
105 if (type
.length
== 1) {
106 intrinsic
= "llvm.x86.sse2.min.sd";
109 else if (type
.length
== 2 || !util_cpu_caps
.has_avx
) {
110 intrinsic
= "llvm.x86.sse2.min.pd";
114 intrinsic
= "llvm.x86.avx.min.pd.256";
119 else if (type
.floating
&& util_cpu_caps
.has_altivec
) {
120 if (type
.width
== 32 && type
.length
== 4) {
121 intrinsic
= "llvm.ppc.altivec.vminfp";
124 } else if (util_cpu_caps
.has_sse2
&& type
.length
>= 2) {
126 if ((type
.width
== 8 || type
.width
== 16) &&
127 (type
.width
* type
.length
<= 64) &&
128 (gallivm_debug
& GALLIVM_DEBUG_PERF
)) {
129 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
132 if (type
.width
== 8 && !type
.sign
) {
133 intrinsic
= "llvm.x86.sse2.pminu.b";
135 else if (type
.width
== 16 && type
.sign
) {
136 intrinsic
= "llvm.x86.sse2.pmins.w";
138 if (util_cpu_caps
.has_sse4_1
) {
139 if (type
.width
== 8 && type
.sign
) {
140 intrinsic
= "llvm.x86.sse41.pminsb";
142 if (type
.width
== 16 && !type
.sign
) {
143 intrinsic
= "llvm.x86.sse41.pminuw";
145 if (type
.width
== 32 && !type
.sign
) {
146 intrinsic
= "llvm.x86.sse41.pminud";
148 if (type
.width
== 32 && type
.sign
) {
149 intrinsic
= "llvm.x86.sse41.pminsd";
152 } else if (util_cpu_caps
.has_altivec
) {
154 if (type
.width
== 8) {
156 intrinsic
= "llvm.ppc.altivec.vminub";
158 intrinsic
= "llvm.ppc.altivec.vminsb";
160 } else if (type
.width
== 16) {
162 intrinsic
= "llvm.ppc.altivec.vminuh";
164 intrinsic
= "llvm.ppc.altivec.vminsh";
166 } else if (type
.width
== 32) {
168 intrinsic
= "llvm.ppc.altivec.vminuw";
170 intrinsic
= "llvm.ppc.altivec.vminsw";
176 return lp_build_intrinsic_binary_anylength(bld
->gallivm
, intrinsic
,
181 cond
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, a
, b
);
182 return lp_build_select(bld
, cond
, a
, b
);
188 * No checks for special case values of a or b = 1 or 0 are done.
191 lp_build_max_simple(struct lp_build_context
*bld
,
195 const struct lp_type type
= bld
->type
;
196 const char *intrinsic
= NULL
;
197 unsigned intr_size
= 0;
200 assert(lp_check_value(type
, a
));
201 assert(lp_check_value(type
, b
));
203 /* TODO: optimize the constant case */
205 if (type
.floating
&& util_cpu_caps
.has_sse
) {
206 if (type
.width
== 32) {
207 if (type
.length
== 1) {
208 intrinsic
= "llvm.x86.sse.max.ss";
211 else if (type
.length
<= 4 || !util_cpu_caps
.has_avx
) {
212 intrinsic
= "llvm.x86.sse.max.ps";
216 intrinsic
= "llvm.x86.avx.max.ps.256";
220 if (type
.width
== 64 && util_cpu_caps
.has_sse2
) {
221 if (type
.length
== 1) {
222 intrinsic
= "llvm.x86.sse2.max.sd";
225 else if (type
.length
== 2 || !util_cpu_caps
.has_avx
) {
226 intrinsic
= "llvm.x86.sse2.max.pd";
230 intrinsic
= "llvm.x86.avx.max.pd.256";
235 else if (type
.floating
&& util_cpu_caps
.has_altivec
) {
236 if (type
.width
== 32 || type
.length
== 4) {
237 intrinsic
= "llvm.ppc.altivec.vmaxfp";
240 } else if (util_cpu_caps
.has_sse2
&& type
.length
>= 2) {
242 if ((type
.width
== 8 || type
.width
== 16) &&
243 (type
.width
* type
.length
<= 64) &&
244 (gallivm_debug
& GALLIVM_DEBUG_PERF
)) {
245 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
248 if (type
.width
== 8 && !type
.sign
) {
249 intrinsic
= "llvm.x86.sse2.pmaxu.b";
252 else if (type
.width
== 16 && type
.sign
) {
253 intrinsic
= "llvm.x86.sse2.pmaxs.w";
255 if (util_cpu_caps
.has_sse4_1
) {
256 if (type
.width
== 8 && type
.sign
) {
257 intrinsic
= "llvm.x86.sse41.pmaxsb";
259 if (type
.width
== 16 && !type
.sign
) {
260 intrinsic
= "llvm.x86.sse41.pmaxuw";
262 if (type
.width
== 32 && !type
.sign
) {
263 intrinsic
= "llvm.x86.sse41.pmaxud";
265 if (type
.width
== 32 && type
.sign
) {
266 intrinsic
= "llvm.x86.sse41.pmaxsd";
269 } else if (util_cpu_caps
.has_altivec
) {
271 if (type
.width
== 8) {
273 intrinsic
= "llvm.ppc.altivec.vmaxub";
275 intrinsic
= "llvm.ppc.altivec.vmaxsb";
277 } else if (type
.width
== 16) {
279 intrinsic
= "llvm.ppc.altivec.vmaxuh";
281 intrinsic
= "llvm.ppc.altivec.vmaxsh";
283 } else if (type
.width
== 32) {
285 intrinsic
= "llvm.ppc.altivec.vmaxuw";
287 intrinsic
= "llvm.ppc.altivec.vmaxsw";
293 return lp_build_intrinsic_binary_anylength(bld
->gallivm
, intrinsic
,
298 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, b
);
299 return lp_build_select(bld
, cond
, a
, b
);
304 * Generate 1 - a, or ~a depending on bld->type.
307 lp_build_comp(struct lp_build_context
*bld
,
310 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
311 const struct lp_type type
= bld
->type
;
313 assert(lp_check_value(type
, a
));
320 if(type
.norm
&& !type
.floating
&& !type
.fixed
&& !type
.sign
) {
321 if(LLVMIsConstant(a
))
322 return LLVMConstNot(a
);
324 return LLVMBuildNot(builder
, a
, "");
327 if(LLVMIsConstant(a
))
329 return LLVMConstFSub(bld
->one
, a
);
331 return LLVMConstSub(bld
->one
, a
);
334 return LLVMBuildFSub(builder
, bld
->one
, a
, "");
336 return LLVMBuildSub(builder
, bld
->one
, a
, "");
344 lp_build_add(struct lp_build_context
*bld
,
348 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
349 const struct lp_type type
= bld
->type
;
352 assert(lp_check_value(type
, a
));
353 assert(lp_check_value(type
, b
));
359 if(a
== bld
->undef
|| b
== bld
->undef
)
363 const char *intrinsic
= NULL
;
365 if(a
== bld
->one
|| b
== bld
->one
)
368 if (type
.width
* type
.length
== 128 &&
369 !type
.floating
&& !type
.fixed
) {
370 if(util_cpu_caps
.has_sse2
) {
372 intrinsic
= type
.sign
? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
374 intrinsic
= type
.sign
? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
375 } else if (util_cpu_caps
.has_altivec
) {
377 intrinsic
= type
.sign
? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
379 intrinsic
= type
.sign
? "llvm.ppc.altivec.vaddsws" : "llvm.ppc.altivec.vadduws";
384 return lp_build_intrinsic_binary(builder
, intrinsic
, lp_build_vec_type(bld
->gallivm
, bld
->type
), a
, b
);
387 if(LLVMIsConstant(a
) && LLVMIsConstant(b
))
389 res
= LLVMConstFAdd(a
, b
);
391 res
= LLVMConstAdd(a
, b
);
394 res
= LLVMBuildFAdd(builder
, a
, b
, "");
396 res
= LLVMBuildAdd(builder
, a
, b
, "");
398 /* clamp to ceiling of 1.0 */
399 if(bld
->type
.norm
&& (bld
->type
.floating
|| bld
->type
.fixed
))
400 res
= lp_build_min_simple(bld
, res
, bld
->one
);
402 /* XXX clamp to floor of -1 or 0??? */
408 /** Return the scalar sum of the elements of a.
409 * Should avoid this operation whenever possible.
412 lp_build_horizontal_add(struct lp_build_context
*bld
,
415 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
416 const struct lp_type type
= bld
->type
;
417 LLVMValueRef index
, res
;
419 LLVMValueRef shuffles1
[LP_MAX_VECTOR_LENGTH
/ 2];
420 LLVMValueRef shuffles2
[LP_MAX_VECTOR_LENGTH
/ 2];
421 LLVMValueRef vecres
, elem2
;
423 assert(lp_check_value(type
, a
));
425 if (type
.length
== 1) {
429 assert(!bld
->type
.norm
);
432 * for byte vectors can do much better with psadbw.
433 * Using repeated shuffle/adds here. Note with multiple vectors
434 * this can be done more efficiently as outlined in the intel
435 * optimization manual.
436 * Note: could cause data rearrangement if used with smaller element
441 length
= type
.length
/ 2;
443 LLVMValueRef vec1
, vec2
;
444 for (i
= 0; i
< length
; i
++) {
445 shuffles1
[i
] = lp_build_const_int32(bld
->gallivm
, i
);
446 shuffles2
[i
] = lp_build_const_int32(bld
->gallivm
, i
+ length
);
448 vec1
= LLVMBuildShuffleVector(builder
, vecres
, vecres
,
449 LLVMConstVector(shuffles1
, length
), "");
450 vec2
= LLVMBuildShuffleVector(builder
, vecres
, vecres
,
451 LLVMConstVector(shuffles2
, length
), "");
453 vecres
= LLVMBuildFAdd(builder
, vec1
, vec2
, "");
456 vecres
= LLVMBuildAdd(builder
, vec1
, vec2
, "");
458 length
= length
>> 1;
461 /* always have vector of size 2 here */
464 index
= lp_build_const_int32(bld
->gallivm
, 0);
465 res
= LLVMBuildExtractElement(builder
, vecres
, index
, "");
466 index
= lp_build_const_int32(bld
->gallivm
, 1);
467 elem2
= LLVMBuildExtractElement(builder
, vecres
, index
, "");
470 res
= LLVMBuildFAdd(builder
, res
, elem2
, "");
472 res
= LLVMBuildAdd(builder
, res
, elem2
, "");
478 * Return the horizontal sums of 4 float vectors as a float4 vector.
479 * This uses the technique as outlined in Intel Optimization Manual.
482 lp_build_horizontal_add4x4f(struct lp_build_context
*bld
,
485 struct gallivm_state
*gallivm
= bld
->gallivm
;
486 LLVMBuilderRef builder
= gallivm
->builder
;
487 LLVMValueRef shuffles
[4];
489 LLVMValueRef sumtmp
[2], shuftmp
[2];
491 /* lower half of regs */
492 shuffles
[0] = lp_build_const_int32(gallivm
, 0);
493 shuffles
[1] = lp_build_const_int32(gallivm
, 1);
494 shuffles
[2] = lp_build_const_int32(gallivm
, 4);
495 shuffles
[3] = lp_build_const_int32(gallivm
, 5);
496 tmp
[0] = LLVMBuildShuffleVector(builder
, src
[0], src
[1],
497 LLVMConstVector(shuffles
, 4), "");
498 tmp
[2] = LLVMBuildShuffleVector(builder
, src
[2], src
[3],
499 LLVMConstVector(shuffles
, 4), "");
501 /* upper half of regs */
502 shuffles
[0] = lp_build_const_int32(gallivm
, 2);
503 shuffles
[1] = lp_build_const_int32(gallivm
, 3);
504 shuffles
[2] = lp_build_const_int32(gallivm
, 6);
505 shuffles
[3] = lp_build_const_int32(gallivm
, 7);
506 tmp
[1] = LLVMBuildShuffleVector(builder
, src
[0], src
[1],
507 LLVMConstVector(shuffles
, 4), "");
508 tmp
[3] = LLVMBuildShuffleVector(builder
, src
[2], src
[3],
509 LLVMConstVector(shuffles
, 4), "");
511 sumtmp
[0] = LLVMBuildFAdd(builder
, tmp
[0], tmp
[1], "");
512 sumtmp
[1] = LLVMBuildFAdd(builder
, tmp
[2], tmp
[3], "");
514 shuffles
[0] = lp_build_const_int32(gallivm
, 0);
515 shuffles
[1] = lp_build_const_int32(gallivm
, 2);
516 shuffles
[2] = lp_build_const_int32(gallivm
, 4);
517 shuffles
[3] = lp_build_const_int32(gallivm
, 6);
518 shuftmp
[0] = LLVMBuildShuffleVector(builder
, sumtmp
[0], sumtmp
[1],
519 LLVMConstVector(shuffles
, 4), "");
521 shuffles
[0] = lp_build_const_int32(gallivm
, 1);
522 shuffles
[1] = lp_build_const_int32(gallivm
, 3);
523 shuffles
[2] = lp_build_const_int32(gallivm
, 5);
524 shuffles
[3] = lp_build_const_int32(gallivm
, 7);
525 shuftmp
[1] = LLVMBuildShuffleVector(builder
, sumtmp
[0], sumtmp
[1],
526 LLVMConstVector(shuffles
, 4), "");
528 return LLVMBuildFAdd(builder
, shuftmp
[0], shuftmp
[1], "");
533 * partially horizontally add 2-4 float vectors with length nx4,
534 * i.e. only four adjacent values in each vector will be added,
535 * assuming values are really grouped in 4 which also determines
538 * Return a vector of the same length as the initial vectors,
539 * with the excess elements (if any) being undefined.
540 * The element order is independent of number of input vectors.
541 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
542 * the output order thus will be
543 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
546 lp_build_hadd_partial4(struct lp_build_context
*bld
,
547 LLVMValueRef vectors
[],
550 struct gallivm_state
*gallivm
= bld
->gallivm
;
551 LLVMBuilderRef builder
= gallivm
->builder
;
552 LLVMValueRef ret_vec
;
554 const char *intrinsic
= NULL
;
556 assert(num_vecs
>= 2 && num_vecs
<= 4);
557 assert(bld
->type
.floating
);
559 /* only use this with at least 2 vectors, as it is sort of expensive
560 * (depending on cpu) and we always need two horizontal adds anyway,
561 * so a shuffle/add approach might be better.
567 tmp
[2] = num_vecs
> 2 ? vectors
[2] : vectors
[0];
568 tmp
[3] = num_vecs
> 3 ? vectors
[3] : vectors
[0];
570 if (util_cpu_caps
.has_sse3
&& bld
->type
.width
== 32 &&
571 bld
->type
.length
== 4) {
572 intrinsic
= "llvm.x86.sse3.hadd.ps";
574 else if (util_cpu_caps
.has_avx
&& bld
->type
.width
== 32 &&
575 bld
->type
.length
== 8) {
576 intrinsic
= "llvm.x86.avx.hadd.ps.256";
579 tmp
[0] = lp_build_intrinsic_binary(builder
, intrinsic
,
580 lp_build_vec_type(gallivm
, bld
->type
),
583 tmp
[1] = lp_build_intrinsic_binary(builder
, intrinsic
,
584 lp_build_vec_type(gallivm
, bld
->type
),
590 return lp_build_intrinsic_binary(builder
, intrinsic
,
591 lp_build_vec_type(gallivm
, bld
->type
),
595 if (bld
->type
.length
== 4) {
596 ret_vec
= lp_build_horizontal_add4x4f(bld
, tmp
);
599 LLVMValueRef partres
[LP_MAX_VECTOR_LENGTH
/4];
601 unsigned num_iter
= bld
->type
.length
/ 4;
602 struct lp_type parttype
= bld
->type
;
604 for (j
= 0; j
< num_iter
; j
++) {
605 LLVMValueRef partsrc
[4];
607 for (i
= 0; i
< 4; i
++) {
608 partsrc
[i
] = lp_build_extract_range(gallivm
, tmp
[i
], j
*4, 4);
610 partres
[j
] = lp_build_horizontal_add4x4f(bld
, partsrc
);
612 ret_vec
= lp_build_concat(gallivm
, partres
, parttype
, num_iter
);
621 lp_build_sub(struct lp_build_context
*bld
,
625 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
626 const struct lp_type type
= bld
->type
;
629 assert(lp_check_value(type
, a
));
630 assert(lp_check_value(type
, b
));
634 if(a
== bld
->undef
|| b
== bld
->undef
)
640 const char *intrinsic
= NULL
;
645 if (type
.width
* type
.length
== 128 &&
646 !type
.floating
&& !type
.fixed
) {
647 if (util_cpu_caps
.has_sse2
) {
649 intrinsic
= type
.sign
? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
651 intrinsic
= type
.sign
? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
652 } else if (util_cpu_caps
.has_altivec
) {
654 intrinsic
= type
.sign
? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
656 intrinsic
= type
.sign
? "llvm.ppc.altivec.vsubsws" : "llvm.ppc.altivec.vsubuws";
661 return lp_build_intrinsic_binary(builder
, intrinsic
, lp_build_vec_type(bld
->gallivm
, bld
->type
), a
, b
);
664 if(LLVMIsConstant(a
) && LLVMIsConstant(b
))
666 res
= LLVMConstFSub(a
, b
);
668 res
= LLVMConstSub(a
, b
);
671 res
= LLVMBuildFSub(builder
, a
, b
, "");
673 res
= LLVMBuildSub(builder
, a
, b
, "");
675 if(bld
->type
.norm
&& (bld
->type
.floating
|| bld
->type
.fixed
))
676 res
= lp_build_max_simple(bld
, res
, bld
->zero
);
683 * Normalized 8bit multiplication.
687 * makes the following approximation to the division (Sree)
689 * a*b/255 ~= (a*(b + 1)) >> 8
691 * which is the fastest method that satisfies the following OpenGL criteria
693 * 0*0 = 0 and 255*255 = 255
697 * takes the geometric series approximation to the division
699 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
701 * in this case just the first two terms to fit in 16bit arithmetic
703 * t/255 ~= (t + (t >> 8)) >> 8
705 * note that just by itself it doesn't satisfy the OpenGL criteria, as
706 * 255*255 = 254, so the special case b = 255 must be accounted or roundoff
709 * - geometric series plus rounding
711 * when using a geometric series division instead of truncating the result
712 * use roundoff in the approximation (Jim Blinn)
714 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
716 * achieving the exact results
718 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
719 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
720 * @sa Michael Herf, The "double blend trick", May 2000,
721 * http://www.stereopsis.com/doubleblend.html
724 lp_build_mul_u8n(struct gallivm_state
*gallivm
,
725 struct lp_type i16_type
,
726 LLVMValueRef a
, LLVMValueRef b
)
728 LLVMBuilderRef builder
= gallivm
->builder
;
732 assert(!i16_type
.floating
);
733 assert(lp_check_value(i16_type
, a
));
734 assert(lp_check_value(i16_type
, b
));
736 c8
= lp_build_const_int_vec(gallivm
, i16_type
, 8);
740 /* a*b/255 ~= (a*(b + 1)) >> 256 */
741 b
= LLVMBuildAdd(builder
, b
, lp_build_const_int_vec(gallium
, i16_type
, 1), "");
742 ab
= LLVMBuildMul(builder
, a
, b
, "");
746 /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */
747 ab
= LLVMBuildMul(builder
, a
, b
, "");
748 ab
= LLVMBuildAdd(builder
, ab
, LLVMBuildLShr(builder
, ab
, c8
, ""), "");
749 ab
= LLVMBuildAdd(builder
, ab
, lp_build_const_int_vec(gallivm
, i16_type
, 0x80), "");
753 ab
= LLVMBuildLShr(builder
, ab
, c8
, "");
759 * Normalized 16bit multiplication.
761 * Utilises same principle as above code.
764 lp_build_mul_u16n(struct gallivm_state
*gallivm
,
765 struct lp_type i32_type
,
766 LLVMValueRef a
, LLVMValueRef b
)
768 LLVMBuilderRef builder
= gallivm
->builder
;
772 assert(!i32_type
.floating
);
773 assert(lp_check_value(i32_type
, a
));
774 assert(lp_check_value(i32_type
, b
));
776 c16
= lp_build_const_int_vec(gallivm
, i32_type
, 16);
778 /* ab/65535 ~= (ab + (ab >> 16) + 0x8000) >> 16 */
779 ab
= LLVMBuildMul(builder
, a
, b
, "");
780 ab
= LLVMBuildAdd(builder
, ab
, LLVMBuildLShr(builder
, ab
, c16
, ""), "");
781 ab
= LLVMBuildAdd(builder
, ab
, lp_build_const_int_vec(gallivm
, i32_type
, 0x8000), "");
783 ab
= LLVMBuildLShr(builder
, ab
, c16
, "");
792 lp_build_mul(struct lp_build_context
*bld
,
796 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
797 const struct lp_type type
= bld
->type
;
801 assert(lp_check_value(type
, a
));
802 assert(lp_check_value(type
, b
));
812 if(a
== bld
->undef
|| b
== bld
->undef
)
815 if(!type
.floating
&& !type
.fixed
&& type
.norm
) {
816 if(type
.width
== 8) {
817 struct lp_type i16_type
= lp_wider_type(type
);
818 LLVMValueRef al
, ah
, bl
, bh
, abl
, abh
, ab
;
820 lp_build_unpack2(bld
->gallivm
, type
, i16_type
, a
, &al
, &ah
);
821 lp_build_unpack2(bld
->gallivm
, type
, i16_type
, b
, &bl
, &bh
);
823 /* PMULLW, PSRLW, PADDW */
824 abl
= lp_build_mul_u8n(bld
->gallivm
, i16_type
, al
, bl
);
825 abh
= lp_build_mul_u8n(bld
->gallivm
, i16_type
, ah
, bh
);
827 ab
= lp_build_pack2(bld
->gallivm
, i16_type
, type
, abl
, abh
);
832 if(type
.width
== 16) {
833 struct lp_type i32_type
= lp_wider_type(type
);
834 LLVMValueRef al
, ah
, bl
, bh
, abl
, abh
, ab
;
836 lp_build_unpack2(bld
->gallivm
, type
, i32_type
, a
, &al
, &ah
);
837 lp_build_unpack2(bld
->gallivm
, type
, i32_type
, b
, &bl
, &bh
);
839 /* PMULLW, PSRLW, PADDW */
840 abl
= lp_build_mul_u16n(bld
->gallivm
, i32_type
, al
, bl
);
841 abh
= lp_build_mul_u16n(bld
->gallivm
, i32_type
, ah
, bh
);
843 ab
= lp_build_pack2(bld
->gallivm
, i32_type
, type
, abl
, abh
);
853 shift
= lp_build_const_int_vec(bld
->gallivm
, type
, type
.width
/2);
857 if(LLVMIsConstant(a
) && LLVMIsConstant(b
)) {
859 res
= LLVMConstFMul(a
, b
);
861 res
= LLVMConstMul(a
, b
);
864 res
= LLVMConstAShr(res
, shift
);
866 res
= LLVMConstLShr(res
, shift
);
871 res
= LLVMBuildFMul(builder
, a
, b
, "");
873 res
= LLVMBuildMul(builder
, a
, b
, "");
876 res
= LLVMBuildAShr(builder
, res
, shift
, "");
878 res
= LLVMBuildLShr(builder
, res
, shift
, "");
887 * Small vector x scale multiplication optimization.
890 lp_build_mul_imm(struct lp_build_context
*bld
,
894 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
897 assert(lp_check_value(bld
->type
, a
));
906 return lp_build_negate(bld
, a
);
908 if(b
== 2 && bld
->type
.floating
)
909 return lp_build_add(bld
, a
, a
);
911 if(util_is_power_of_two(b
)) {
912 unsigned shift
= ffs(b
) - 1;
914 if(bld
->type
.floating
) {
917 * Power of two multiplication by directly manipulating the exponent.
919 * XXX: This might not be always faster, it will introduce a small error
920 * for multiplication by zero, and it will produce wrong results
923 unsigned mantissa
= lp_mantissa(bld
->type
);
924 factor
= lp_build_const_int_vec(bld
->gallivm
, bld
->type
, (unsigned long long)shift
<< mantissa
);
925 a
= LLVMBuildBitCast(builder
, a
, lp_build_int_vec_type(bld
->type
), "");
926 a
= LLVMBuildAdd(builder
, a
, factor
, "");
927 a
= LLVMBuildBitCast(builder
, a
, lp_build_vec_type(bld
->gallivm
, bld
->type
), "");
932 factor
= lp_build_const_vec(bld
->gallivm
, bld
->type
, shift
);
933 return LLVMBuildShl(builder
, a
, factor
, "");
937 factor
= lp_build_const_vec(bld
->gallivm
, bld
->type
, (double)b
);
938 return lp_build_mul(bld
, a
, factor
);
946 lp_build_div(struct lp_build_context
*bld
,
950 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
951 const struct lp_type type
= bld
->type
;
953 assert(lp_check_value(type
, a
));
954 assert(lp_check_value(type
, b
));
959 return lp_build_rcp(bld
, b
);
964 if(a
== bld
->undef
|| b
== bld
->undef
)
967 if(LLVMIsConstant(a
) && LLVMIsConstant(b
)) {
969 return LLVMConstFDiv(a
, b
);
971 return LLVMConstSDiv(a
, b
);
973 return LLVMConstUDiv(a
, b
);
976 if(((util_cpu_caps
.has_sse
&& type
.width
== 32 && type
.length
== 4) ||
977 (util_cpu_caps
.has_avx
&& type
.width
== 32 && type
.length
== 8)) &&
979 return lp_build_mul(bld
, a
, lp_build_rcp(bld
, b
));
982 return LLVMBuildFDiv(builder
, a
, b
, "");
984 return LLVMBuildSDiv(builder
, a
, b
, "");
986 return LLVMBuildUDiv(builder
, a
, b
, "");
991 * Linear interpolation -- without any checks.
993 * @sa http://www.stereopsis.com/doubleblend.html
995 static INLINE LLVMValueRef
996 lp_build_lerp_simple(struct lp_build_context
*bld
,
1001 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1005 assert(lp_check_value(bld
->type
, x
));
1006 assert(lp_check_value(bld
->type
, v0
));
1007 assert(lp_check_value(bld
->type
, v1
));
1009 delta
= lp_build_sub(bld
, v1
, v0
);
1011 res
= lp_build_mul(bld
, x
, delta
);
1013 res
= lp_build_add(bld
, v0
, res
);
1015 if (bld
->type
.fixed
) {
1016 /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
1017 * but it will be wrong for other uses. Basically we need a more
1018 * powerful lp_type, capable of further distinguishing the values
1019 * interpretation from the value storage. */
1020 res
= LLVMBuildAnd(builder
, res
, lp_build_const_int_vec(bld
->gallivm
, bld
->type
, (1 << bld
->type
.width
/2) - 1), "");
1028 * Linear interpolation.
1031 lp_build_lerp(struct lp_build_context
*bld
,
1036 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1037 const struct lp_type type
= bld
->type
;
1040 assert(lp_check_value(type
, x
));
1041 assert(lp_check_value(type
, v0
));
1042 assert(lp_check_value(type
, v1
));
1045 struct lp_type wide_type
;
1046 struct lp_build_context wide_bld
;
1047 LLVMValueRef xl
, xh
, v0l
, v0h
, v1l
, v1h
, resl
, resh
;
1050 assert(type
.length
>= 2);
1054 * Create a wider type, enough to hold the intermediate result of the
1057 memset(&wide_type
, 0, sizeof wide_type
);
1058 wide_type
.fixed
= TRUE
;
1059 wide_type
.width
= type
.width
*2;
1060 wide_type
.length
= type
.length
/2;
1062 lp_build_context_init(&wide_bld
, bld
->gallivm
, wide_type
);
1064 lp_build_unpack2(bld
->gallivm
, type
, wide_type
, x
, &xl
, &xh
);
1065 lp_build_unpack2(bld
->gallivm
, type
, wide_type
, v0
, &v0l
, &v0h
);
1066 lp_build_unpack2(bld
->gallivm
, type
, wide_type
, v1
, &v1l
, &v1h
);
1069 * Scale x from [0, 255] to [0, 256]
1072 shift
= lp_build_const_int_vec(bld
->gallivm
, wide_type
, type
.width
- 1);
1074 xl
= lp_build_add(&wide_bld
, xl
,
1075 LLVMBuildAShr(builder
, xl
, shift
, ""));
1076 xh
= lp_build_add(&wide_bld
, xh
,
1077 LLVMBuildAShr(builder
, xh
, shift
, ""));
1083 resl
= lp_build_lerp_simple(&wide_bld
, xl
, v0l
, v1l
);
1084 resh
= lp_build_lerp_simple(&wide_bld
, xh
, v0h
, v1h
);
1086 res
= lp_build_pack2(bld
->gallivm
, wide_type
, type
, resl
, resh
);
1088 res
= lp_build_lerp_simple(bld
, x
, v0
, v1
);
1096 lp_build_lerp_2d(struct lp_build_context
*bld
,
1104 LLVMValueRef v0
= lp_build_lerp(bld
, x
, v00
, v01
);
1105 LLVMValueRef v1
= lp_build_lerp(bld
, x
, v10
, v11
);
1106 return lp_build_lerp(bld
, y
, v0
, v1
);
1111 * Generate min(a, b)
1112 * Do checks for special cases.
1115 lp_build_min(struct lp_build_context
*bld
,
1119 assert(lp_check_value(bld
->type
, a
));
1120 assert(lp_check_value(bld
->type
, b
));
1122 if(a
== bld
->undef
|| b
== bld
->undef
)
1128 if (bld
->type
.norm
) {
1129 if (!bld
->type
.sign
) {
1130 if (a
== bld
->zero
|| b
== bld
->zero
) {
1140 return lp_build_min_simple(bld
, a
, b
);
1145 * Generate max(a, b)
1146 * Do checks for special cases.
1149 lp_build_max(struct lp_build_context
*bld
,
1153 assert(lp_check_value(bld
->type
, a
));
1154 assert(lp_check_value(bld
->type
, b
));
1156 if(a
== bld
->undef
|| b
== bld
->undef
)
1162 if(bld
->type
.norm
) {
1163 if(a
== bld
->one
|| b
== bld
->one
)
1165 if (!bld
->type
.sign
) {
1166 if (a
== bld
->zero
) {
1169 if (b
== bld
->zero
) {
1175 return lp_build_max_simple(bld
, a
, b
);
1180 * Generate clamp(a, min, max)
1181 * Do checks for special cases.
1184 lp_build_clamp(struct lp_build_context
*bld
,
1189 assert(lp_check_value(bld
->type
, a
));
1190 assert(lp_check_value(bld
->type
, min
));
1191 assert(lp_check_value(bld
->type
, max
));
1193 a
= lp_build_min(bld
, a
, max
);
1194 a
= lp_build_max(bld
, a
, min
);
1203 lp_build_abs(struct lp_build_context
*bld
,
1206 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1207 const struct lp_type type
= bld
->type
;
1208 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
1210 assert(lp_check_value(type
, a
));
1216 /* Mask out the sign bit */
1217 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
1218 unsigned long long absMask
= ~(1ULL << (type
.width
- 1));
1219 LLVMValueRef mask
= lp_build_const_int_vec(bld
->gallivm
, type
, ((unsigned long long) absMask
));
1220 a
= LLVMBuildBitCast(builder
, a
, int_vec_type
, "");
1221 a
= LLVMBuildAnd(builder
, a
, mask
, "");
1222 a
= LLVMBuildBitCast(builder
, a
, vec_type
, "");
1226 if(type
.width
*type
.length
== 128 && util_cpu_caps
.has_ssse3
) {
1227 switch(type
.width
) {
1229 return lp_build_intrinsic_unary(builder
, "llvm.x86.ssse3.pabs.b.128", vec_type
, a
);
1231 return lp_build_intrinsic_unary(builder
, "llvm.x86.ssse3.pabs.w.128", vec_type
, a
);
1233 return lp_build_intrinsic_unary(builder
, "llvm.x86.ssse3.pabs.d.128", vec_type
, a
);
1236 else if (type
.width
*type
.length
== 256 && util_cpu_caps
.has_ssse3
&&
1237 (gallivm_debug
& GALLIVM_DEBUG_PERF
) &&
1238 (type
.width
== 8 || type
.width
== 16 || type
.width
== 32)) {
1239 debug_printf("%s: inefficient code, should split vectors manually\n",
1243 return lp_build_max(bld
, a
, LLVMBuildNeg(builder
, a
, ""));
1248 lp_build_negate(struct lp_build_context
*bld
,
1251 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1253 assert(lp_check_value(bld
->type
, a
));
1255 #if HAVE_LLVM >= 0x0207
1256 if (bld
->type
.floating
)
1257 a
= LLVMBuildFNeg(builder
, a
, "");
1260 a
= LLVMBuildNeg(builder
, a
, "");
1266 /** Return -1, 0 or +1 depending on the sign of a */
1268 lp_build_sgn(struct lp_build_context
*bld
,
1271 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1272 const struct lp_type type
= bld
->type
;
1276 assert(lp_check_value(type
, a
));
1278 /* Handle non-zero case */
1280 /* if not zero then sign must be positive */
1283 else if(type
.floating
) {
1284 LLVMTypeRef vec_type
;
1285 LLVMTypeRef int_type
;
1289 unsigned long long maskBit
= (unsigned long long)1 << (type
.width
- 1);
1291 int_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
1292 vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
1293 mask
= lp_build_const_int_vec(bld
->gallivm
, type
, maskBit
);
1295 /* Take the sign bit and add it to 1 constant */
1296 sign
= LLVMBuildBitCast(builder
, a
, int_type
, "");
1297 sign
= LLVMBuildAnd(builder
, sign
, mask
, "");
1298 one
= LLVMConstBitCast(bld
->one
, int_type
);
1299 res
= LLVMBuildOr(builder
, sign
, one
, "");
1300 res
= LLVMBuildBitCast(builder
, res
, vec_type
, "");
1304 /* signed int/norm/fixed point */
1305 /* could use psign with sse3 and appropriate vectors here */
1306 LLVMValueRef minus_one
= lp_build_const_vec(bld
->gallivm
, type
, -1.0);
1307 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, bld
->zero
);
1308 res
= lp_build_select(bld
, cond
, bld
->one
, minus_one
);
1312 cond
= lp_build_cmp(bld
, PIPE_FUNC_EQUAL
, a
, bld
->zero
);
1313 res
= lp_build_select(bld
, cond
, bld
->zero
, res
);
1320 * Set the sign of float vector 'a' according to 'sign'.
1321 * If sign==0, return abs(a).
1322 * If sign==1, return -abs(a);
1323 * Other values for sign produce undefined results.
1326 lp_build_set_sign(struct lp_build_context
*bld
,
1327 LLVMValueRef a
, LLVMValueRef sign
)
1329 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1330 const struct lp_type type
= bld
->type
;
1331 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
1332 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
1333 LLVMValueRef shift
= lp_build_const_int_vec(bld
->gallivm
, type
, type
.width
- 1);
1334 LLVMValueRef mask
= lp_build_const_int_vec(bld
->gallivm
, type
,
1335 ~((unsigned long long) 1 << (type
.width
- 1)));
1336 LLVMValueRef val
, res
;
1338 assert(type
.floating
);
1339 assert(lp_check_value(type
, a
));
1341 /* val = reinterpret_cast<int>(a) */
1342 val
= LLVMBuildBitCast(builder
, a
, int_vec_type
, "");
1343 /* val = val & mask */
1344 val
= LLVMBuildAnd(builder
, val
, mask
, "");
1345 /* sign = sign << shift */
1346 sign
= LLVMBuildShl(builder
, sign
, shift
, "");
1347 /* res = val | sign */
1348 res
= LLVMBuildOr(builder
, val
, sign
, "");
1349 /* res = reinterpret_cast<float>(res) */
1350 res
= LLVMBuildBitCast(builder
, res
, vec_type
, "");
1357 * Convert vector of (or scalar) int to vector of (or scalar) float.
1360 lp_build_int_to_float(struct lp_build_context
*bld
,
1363 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1364 const struct lp_type type
= bld
->type
;
1365 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
1367 assert(type
.floating
);
1369 return LLVMBuildSIToFP(builder
, a
, vec_type
, "");
1373 arch_rounding_available(const struct lp_type type
)
1375 if ((util_cpu_caps
.has_sse4_1
&&
1376 (type
.length
== 1 || type
.width
*type
.length
== 128)) ||
1377 (util_cpu_caps
.has_avx
&& type
.width
*type
.length
== 256))
1379 else if ((util_cpu_caps
.has_altivec
&&
1380 (type
.width
== 32 && type
.length
== 4)))
/**
 * Rounding modes understood by the arch-specific rounding helpers.
 *
 * The numeric values matter: the SSE4.1 helper passes the mode straight
 * through as the immediate operand of the round intrinsic.
 */
enum lp_build_round_mode
{
   LP_BUILD_ROUND_NEAREST  = 0,  /* round to nearest */
   LP_BUILD_ROUND_FLOOR    = 1,  /* round toward -infinity */
   LP_BUILD_ROUND_CEIL     = 2,  /* round toward +infinity */
   LP_BUILD_ROUND_TRUNCATE = 3   /* round toward zero */
};
1395 * Helper for SSE4.1's ROUNDxx instructions.
1397 * NOTE: In the SSE4.1's nearest mode, if two values are equally close, the
1398 * result is the even value. That is, rounding 2.5 will be 2.0, and not 3.0.
1400 static INLINE LLVMValueRef
1401 lp_build_round_sse41(struct lp_build_context
*bld
,
1403 enum lp_build_round_mode mode
)
1405 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1406 const struct lp_type type
= bld
->type
;
1407 LLVMTypeRef i32t
= LLVMInt32TypeInContext(bld
->gallivm
->context
);
1408 const char *intrinsic
;
1411 assert(type
.floating
);
1413 assert(lp_check_value(type
, a
));
1414 assert(util_cpu_caps
.has_sse4_1
);
1416 if (type
.length
== 1) {
1417 LLVMTypeRef vec_type
;
1419 LLVMValueRef args
[3];
1420 LLVMValueRef index0
= LLVMConstInt(i32t
, 0, 0);
1422 switch(type
.width
) {
1424 intrinsic
= "llvm.x86.sse41.round.ss";
1427 intrinsic
= "llvm.x86.sse41.round.sd";
1434 vec_type
= LLVMVectorType(bld
->elem_type
, 4);
1436 undef
= LLVMGetUndef(vec_type
);
1439 args
[1] = LLVMBuildInsertElement(builder
, undef
, a
, index0
, "");
1440 args
[2] = LLVMConstInt(i32t
, mode
, 0);
1442 res
= lp_build_intrinsic(builder
, intrinsic
,
1443 vec_type
, args
, Elements(args
));
1445 res
= LLVMBuildExtractElement(builder
, res
, index0
, "");
1448 if (type
.width
* type
.length
== 128) {
1449 switch(type
.width
) {
1451 intrinsic
= "llvm.x86.sse41.round.ps";
1454 intrinsic
= "llvm.x86.sse41.round.pd";
1462 assert(type
.width
* type
.length
== 256);
1463 assert(util_cpu_caps
.has_avx
);
1465 switch(type
.width
) {
1467 intrinsic
= "llvm.x86.avx.round.ps.256";
1470 intrinsic
= "llvm.x86.avx.round.pd.256";
1478 res
= lp_build_intrinsic_binary(builder
, intrinsic
,
1480 LLVMConstInt(i32t
, mode
, 0));
1487 static INLINE LLVMValueRef
1488 lp_build_iround_nearest_sse2(struct lp_build_context
*bld
,
1491 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1492 const struct lp_type type
= bld
->type
;
1493 LLVMTypeRef i32t
= LLVMInt32TypeInContext(bld
->gallivm
->context
);
1494 LLVMTypeRef ret_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
1495 const char *intrinsic
;
1498 assert(type
.floating
);
1499 /* using the double precision conversions is a bit more complicated */
1500 assert(type
.width
== 32);
1502 assert(lp_check_value(type
, a
));
1503 assert(util_cpu_caps
.has_sse2
);
1505 /* This is relying on MXCSR rounding mode, which should always be nearest. */
1506 if (type
.length
== 1) {
1507 LLVMTypeRef vec_type
;
1510 LLVMValueRef index0
= LLVMConstInt(i32t
, 0, 0);
1512 vec_type
= LLVMVectorType(bld
->elem_type
, 4);
1514 intrinsic
= "llvm.x86.sse.cvtss2si";
1516 undef
= LLVMGetUndef(vec_type
);
1518 arg
= LLVMBuildInsertElement(builder
, undef
, a
, index0
, "");
1520 res
= lp_build_intrinsic_unary(builder
, intrinsic
,
1524 if (type
.width
* type
.length
== 128) {
1525 intrinsic
= "llvm.x86.sse2.cvtps2dq";
1528 assert(type
.width
*type
.length
== 256);
1529 assert(util_cpu_caps
.has_avx
);
1531 intrinsic
= "llvm.x86.avx.cvt.ps2dq.256";
1533 res
= lp_build_intrinsic_unary(builder
, intrinsic
,
1543 static INLINE LLVMValueRef
1544 lp_build_round_altivec(struct lp_build_context
*bld
,
1546 enum lp_build_round_mode mode
)
1548 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1549 const struct lp_type type
= bld
->type
;
1550 const char *intrinsic
= NULL
;
1552 assert(type
.floating
);
1554 assert(lp_check_value(type
, a
));
1555 assert(util_cpu_caps
.has_altivec
);
1558 case LP_BUILD_ROUND_NEAREST
:
1559 intrinsic
= "llvm.ppc.altivec.vrfin";
1561 case LP_BUILD_ROUND_FLOOR
:
1562 intrinsic
= "llvm.ppc.altivec.vrfim";
1564 case LP_BUILD_ROUND_CEIL
:
1565 intrinsic
= "llvm.ppc.altivec.vrfip";
1567 case LP_BUILD_ROUND_TRUNCATE
:
1568 intrinsic
= "llvm.ppc.altivec.vrfiz";
1572 return lp_build_intrinsic_unary(builder
, intrinsic
, bld
->vec_type
, a
);
1575 static INLINE LLVMValueRef
1576 lp_build_round_arch(struct lp_build_context
*bld
,
1578 enum lp_build_round_mode mode
)
1580 if (util_cpu_caps
.has_sse4_1
)
1581 return lp_build_round_sse41(bld
, a
, mode
);
1582 else /* (util_cpu_caps.has_altivec) */
1583 return lp_build_round_altivec(bld
, a
, mode
);
1587 * Return the integer part of a float (vector) value (== round toward zero).
1588 * The returned value is a float (vector).
1589 * Ex: trunc(-1.5) = -1.0
1592 lp_build_trunc(struct lp_build_context
*bld
,
1595 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1596 const struct lp_type type
= bld
->type
;
1598 assert(type
.floating
);
1599 assert(lp_check_value(type
, a
));
1601 if (arch_rounding_available(type
)) {
1602 return lp_build_round_arch(bld
, a
, LP_BUILD_ROUND_TRUNCATE
);
1605 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
1606 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
1608 res
= LLVMBuildFPToSI(builder
, a
, int_vec_type
, "");
1609 res
= LLVMBuildSIToFP(builder
, res
, vec_type
, "");
1616 * Return float (vector) rounded to nearest integer (vector). The returned
1617 * value is a float (vector).
1618 * Ex: round(0.9) = 1.0
1619 * Ex: round(-1.5) = -2.0
1622 lp_build_round(struct lp_build_context
*bld
,
1625 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1626 const struct lp_type type
= bld
->type
;
1628 assert(type
.floating
);
1629 assert(lp_check_value(type
, a
));
1631 if (arch_rounding_available(type
)) {
1632 return lp_build_round_arch(bld
, a
, LP_BUILD_ROUND_NEAREST
);
1635 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
1637 res
= lp_build_iround(bld
, a
);
1638 res
= LLVMBuildSIToFP(builder
, res
, vec_type
, "");
1645 * Return floor of float (vector), result is a float (vector)
1646 * Ex: floor(1.1) = 1.0
1647 * Ex: floor(-1.1) = -2.0
1650 lp_build_floor(struct lp_build_context
*bld
,
1653 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1654 const struct lp_type type
= bld
->type
;
1656 assert(type
.floating
);
1657 assert(lp_check_value(type
, a
));
1659 if (arch_rounding_available(type
)) {
1660 return lp_build_round_arch(bld
, a
, LP_BUILD_ROUND_FLOOR
);
1663 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
1665 res
= lp_build_ifloor(bld
, a
);
1666 res
= LLVMBuildSIToFP(builder
, res
, vec_type
, "");
1673 * Return ceiling of float (vector), returning float (vector).
1674 * Ex: ceil( 1.1) = 2.0
1675 * Ex: ceil(-1.1) = -1.0
1678 lp_build_ceil(struct lp_build_context
*bld
,
1681 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1682 const struct lp_type type
= bld
->type
;
1684 assert(type
.floating
);
1685 assert(lp_check_value(type
, a
));
1687 if (arch_rounding_available(type
)) {
1688 return lp_build_round_arch(bld
, a
, LP_BUILD_ROUND_CEIL
);
1691 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
1693 res
= lp_build_iceil(bld
, a
);
1694 res
= LLVMBuildSIToFP(builder
, res
, vec_type
, "");
1701 * Return fractional part of 'a' computed as a - floor(a)
1702 * Typically used in texture coord arithmetic.
1705 lp_build_fract(struct lp_build_context
*bld
,
1708 assert(bld
->type
.floating
);
1709 return lp_build_sub(bld
, a
, lp_build_floor(bld
, a
));
1714 * Prevent returning a fractional part of 1.0 for very small negative values of
1715 * 'a' by clamping against 0.99999(9).
1717 static inline LLVMValueRef
1718 clamp_fract(struct lp_build_context
*bld
, LLVMValueRef fract
)
1722 /* this is the largest number smaller than 1.0 representable as float */
1723 max
= lp_build_const_vec(bld
->gallivm
, bld
->type
,
1724 1.0 - 1.0/(1LL << (lp_mantissa(bld
->type
) + 1)));
1725 return lp_build_min(bld
, fract
, max
);
1730 * Same as lp_build_fract, but guarantees that the result is always smaller
1734 lp_build_fract_safe(struct lp_build_context
*bld
,
1737 return clamp_fract(bld
, lp_build_fract(bld
, a
));
1742 * Return the integer part of a float (vector) value (== round toward zero).
1743 * The returned value is an integer (vector).
1744 * Ex: itrunc(-1.5) = -1
1747 lp_build_itrunc(struct lp_build_context
*bld
,
1750 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1751 const struct lp_type type
= bld
->type
;
1752 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
1754 assert(type
.floating
);
1755 assert(lp_check_value(type
, a
));
1757 return LLVMBuildFPToSI(builder
, a
, int_vec_type
, "");
1762 * Return float (vector) rounded to nearest integer (vector). The returned
1763 * value is an integer (vector).
1764 * Ex: iround(0.9) = 1
1765 * Ex: iround(-1.5) = -2
1768 lp_build_iround(struct lp_build_context
*bld
,
1771 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1772 const struct lp_type type
= bld
->type
;
1773 LLVMTypeRef int_vec_type
= bld
->int_vec_type
;
1776 assert(type
.floating
);
1778 assert(lp_check_value(type
, a
));
1780 if ((util_cpu_caps
.has_sse2
&&
1781 ((type
.width
== 32) && (type
.length
== 1 || type
.length
== 4))) ||
1782 (util_cpu_caps
.has_avx
&& type
.width
== 32 && type
.length
== 8)) {
1783 return lp_build_iround_nearest_sse2(bld
, a
);
1785 if (arch_rounding_available(type
)) {
1786 res
= lp_build_round_arch(bld
, a
, LP_BUILD_ROUND_NEAREST
);
1791 half
= lp_build_const_vec(bld
->gallivm
, type
, 0.5);
1794 LLVMTypeRef vec_type
= bld
->vec_type
;
1795 LLVMValueRef mask
= lp_build_const_int_vec(bld
->gallivm
, type
,
1796 (unsigned long long)1 << (type
.width
- 1));
1800 sign
= LLVMBuildBitCast(builder
, a
, int_vec_type
, "");
1801 sign
= LLVMBuildAnd(builder
, sign
, mask
, "");
1804 half
= LLVMBuildBitCast(builder
, half
, int_vec_type
, "");
1805 half
= LLVMBuildOr(builder
, sign
, half
, "");
1806 half
= LLVMBuildBitCast(builder
, half
, vec_type
, "");
1809 res
= LLVMBuildFAdd(builder
, a
, half
, "");
1812 res
= LLVMBuildFPToSI(builder
, res
, int_vec_type
, "");
1819 * Return floor of float (vector), result is an int (vector)
1820 * Ex: ifloor(1.1) = 1.0
1821 * Ex: ifloor(-1.1) = -2.0
1824 lp_build_ifloor(struct lp_build_context
*bld
,
1827 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1828 const struct lp_type type
= bld
->type
;
1829 LLVMTypeRef int_vec_type
= bld
->int_vec_type
;
1832 assert(type
.floating
);
1833 assert(lp_check_value(type
, a
));
1837 if (arch_rounding_available(type
)) {
1838 res
= lp_build_round_arch(bld
, a
, LP_BUILD_ROUND_FLOOR
);
1841 /* Take the sign bit and add it to 1 constant */
1842 LLVMTypeRef vec_type
= bld
->vec_type
;
1843 unsigned mantissa
= lp_mantissa(type
);
1844 LLVMValueRef mask
= lp_build_const_int_vec(bld
->gallivm
, type
,
1845 (unsigned long long)1 << (type
.width
- 1));
1847 LLVMValueRef offset
;
1849 /* sign = a < 0 ? ~0 : 0 */
1850 sign
= LLVMBuildBitCast(builder
, a
, int_vec_type
, "");
1851 sign
= LLVMBuildAnd(builder
, sign
, mask
, "");
1852 sign
= LLVMBuildAShr(builder
, sign
,
1853 lp_build_const_int_vec(bld
->gallivm
, type
,
1857 /* offset = -0.99999(9)f */
1858 offset
= lp_build_const_vec(bld
->gallivm
, type
,
1859 -(double)(((unsigned long long)1 << mantissa
) - 10)/((unsigned long long)1 << mantissa
));
1860 offset
= LLVMConstBitCast(offset
, int_vec_type
);
1862 /* offset = a < 0 ? offset : 0.0f */
1863 offset
= LLVMBuildAnd(builder
, offset
, sign
, "");
1864 offset
= LLVMBuildBitCast(builder
, offset
, vec_type
, "ifloor.offset");
1866 res
= LLVMBuildFAdd(builder
, res
, offset
, "ifloor.res");
1870 /* round to nearest (toward zero) */
1871 res
= LLVMBuildFPToSI(builder
, res
, int_vec_type
, "ifloor.res");
1878 * Return ceiling of float (vector), returning int (vector).
1879 * Ex: iceil( 1.1) = 2
1880 * Ex: iceil(-1.1) = -1
1883 lp_build_iceil(struct lp_build_context
*bld
,
1886 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1887 const struct lp_type type
= bld
->type
;
1888 LLVMTypeRef int_vec_type
= bld
->int_vec_type
;
1891 assert(type
.floating
);
1892 assert(lp_check_value(type
, a
));
1894 if (arch_rounding_available(type
)) {
1895 res
= lp_build_round_arch(bld
, a
, LP_BUILD_ROUND_CEIL
);
1898 LLVMTypeRef vec_type
= bld
->vec_type
;
1899 unsigned mantissa
= lp_mantissa(type
);
1900 LLVMValueRef offset
;
1902 /* offset = 0.99999(9)f */
1903 offset
= lp_build_const_vec(bld
->gallivm
, type
,
1904 (double)(((unsigned long long)1 << mantissa
) - 10)/((unsigned long long)1 << mantissa
));
1907 LLVMValueRef mask
= lp_build_const_int_vec(bld
->gallivm
, type
,
1908 (unsigned long long)1 << (type
.width
- 1));
1911 /* sign = a < 0 ? 0 : ~0 */
1912 sign
= LLVMBuildBitCast(builder
, a
, int_vec_type
, "");
1913 sign
= LLVMBuildAnd(builder
, sign
, mask
, "");
1914 sign
= LLVMBuildAShr(builder
, sign
,
1915 lp_build_const_int_vec(bld
->gallivm
, type
,
1918 sign
= LLVMBuildNot(builder
, sign
, "iceil.not");
1920 /* offset = a < 0 ? 0.0 : offset */
1921 offset
= LLVMConstBitCast(offset
, int_vec_type
);
1922 offset
= LLVMBuildAnd(builder
, offset
, sign
, "");
1923 offset
= LLVMBuildBitCast(builder
, offset
, vec_type
, "iceil.offset");
1926 res
= LLVMBuildFAdd(builder
, a
, offset
, "iceil.res");
1929 /* round to nearest (toward zero) */
1930 res
= LLVMBuildFPToSI(builder
, res
, int_vec_type
, "iceil.res");
1937 * Combined ifloor() & fract().
1939 * Preferred to calling the functions separately, as it will ensure that the
1940 * strategy (floor() vs ifloor()) that results in less redundant work is used.
1943 lp_build_ifloor_fract(struct lp_build_context
*bld
,
1945 LLVMValueRef
*out_ipart
,
1946 LLVMValueRef
*out_fpart
)
1948 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1949 const struct lp_type type
= bld
->type
;
1952 assert(type
.floating
);
1953 assert(lp_check_value(type
, a
));
1955 if (arch_rounding_available(type
)) {
1957 * floor() is easier.
1960 ipart
= lp_build_floor(bld
, a
);
1961 *out_fpart
= LLVMBuildFSub(builder
, a
, ipart
, "fpart");
1962 *out_ipart
= LLVMBuildFPToSI(builder
, ipart
, bld
->int_vec_type
, "ipart");
1966 * ifloor() is easier.
1969 *out_ipart
= lp_build_ifloor(bld
, a
);
1970 ipart
= LLVMBuildSIToFP(builder
, *out_ipart
, bld
->vec_type
, "ipart");
1971 *out_fpart
= LLVMBuildFSub(builder
, a
, ipart
, "fpart");
1977 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
1978 * always smaller than one.
1981 lp_build_ifloor_fract_safe(struct lp_build_context
*bld
,
1983 LLVMValueRef
*out_ipart
,
1984 LLVMValueRef
*out_fpart
)
1986 lp_build_ifloor_fract(bld
, a
, out_ipart
, out_fpart
);
1987 *out_fpart
= clamp_fract(bld
, *out_fpart
);
1992 lp_build_sqrt(struct lp_build_context
*bld
,
1995 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1996 const struct lp_type type
= bld
->type
;
1997 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
2000 assert(lp_check_value(type
, a
));
2002 /* TODO: optimize the constant case */
2004 assert(type
.floating
);
2005 if (type
.length
== 1) {
2006 util_snprintf(intrinsic
, sizeof intrinsic
, "llvm.sqrt.f%u", type
.width
);
2009 util_snprintf(intrinsic
, sizeof intrinsic
, "llvm.sqrt.v%uf%u", type
.length
, type
.width
);
2012 return lp_build_intrinsic_unary(builder
, intrinsic
, vec_type
, a
);
2017 * Do one Newton-Raphson step to improve reciprocate precision:
2019 * x_{i+1} = x_i * (2 - a * x_i)
2021 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2022 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
2023 * such as Google Earth, which does RCP(RSQRT(0.0) when drawing the Earth's
2024 * halo. It would be necessary to clamp the argument to prevent this.
2027 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2028 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2030 static INLINE LLVMValueRef
2031 lp_build_rcp_refine(struct lp_build_context
*bld
,
2035 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2036 LLVMValueRef two
= lp_build_const_vec(bld
->gallivm
, bld
->type
, 2.0);
2039 res
= LLVMBuildFMul(builder
, a
, rcp_a
, "");
2040 res
= LLVMBuildFSub(builder
, two
, res
, "");
2041 res
= LLVMBuildFMul(builder
, rcp_a
, res
, "");
2048 lp_build_rcp(struct lp_build_context
*bld
,
2051 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2052 const struct lp_type type
= bld
->type
;
2054 assert(lp_check_value(type
, a
));
2063 assert(type
.floating
);
2065 if(LLVMIsConstant(a
))
2066 return LLVMConstFDiv(bld
->one
, a
);
2069 * We don't use RCPPS because:
2070 * - it only has 10bits of precision
2071 * - it doesn't even get the reciprocate of 1.0 exactly
2072 * - doing Newton-Rapshon steps yields wrong (NaN) values for 0.0 or Inf
2073 * - for recent processors the benefit over DIVPS is marginal, a case
2076 * We could still use it on certain processors if benchmarks show that the
2077 * RCPPS plus necessary workarounds are still preferrable to DIVPS; or for
2078 * particular uses that require less workarounds.
2081 if (FALSE
&& ((util_cpu_caps
.has_sse
&& type
.width
== 32 && type
.length
== 4) ||
2082 (util_cpu_caps
.has_avx
&& type
.width
== 32 && type
.length
== 8))){
2083 const unsigned num_iterations
= 0;
2086 const char *intrinsic
= NULL
;
2088 if (type
.length
== 4) {
2089 intrinsic
= "llvm.x86.sse.rcp.ps";
2092 intrinsic
= "llvm.x86.avx.rcp.ps.256";
2095 res
= lp_build_intrinsic_unary(builder
, intrinsic
, bld
->vec_type
, a
);
2097 for (i
= 0; i
< num_iterations
; ++i
) {
2098 res
= lp_build_rcp_refine(bld
, a
, res
);
2104 return LLVMBuildFDiv(builder
, bld
->one
, a
, "");
2109 * Do one Newton-Raphson step to improve rsqrt precision:
2111 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2113 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2115 static INLINE LLVMValueRef
2116 lp_build_rsqrt_refine(struct lp_build_context
*bld
,
2118 LLVMValueRef rsqrt_a
)
2120 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2121 LLVMValueRef half
= lp_build_const_vec(bld
->gallivm
, bld
->type
, 0.5);
2122 LLVMValueRef three
= lp_build_const_vec(bld
->gallivm
, bld
->type
, 3.0);
2125 res
= LLVMBuildFMul(builder
, rsqrt_a
, rsqrt_a
, "");
2126 res
= LLVMBuildFMul(builder
, a
, res
, "");
2127 res
= LLVMBuildFSub(builder
, three
, res
, "");
2128 res
= LLVMBuildFMul(builder
, rsqrt_a
, res
, "");
2129 res
= LLVMBuildFMul(builder
, half
, res
, "");
2136 * Generate 1/sqrt(a).
2137 * Result is undefined for values < 0, infinity for +0.
2140 lp_build_rsqrt(struct lp_build_context
*bld
,
2143 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2144 const struct lp_type type
= bld
->type
;
2146 assert(lp_check_value(type
, a
));
2148 assert(type
.floating
);
2151 * This should be faster but all denormals will end up as infinity.
2153 if (0 && ((util_cpu_caps
.has_sse
&& type
.width
== 32 && type
.length
== 4) ||
2154 (util_cpu_caps
.has_avx
&& type
.width
== 32 && type
.length
== 8))) {
2155 const unsigned num_iterations
= 1;
2158 const char *intrinsic
= NULL
;
2160 if (type
.length
== 4) {
2161 intrinsic
= "llvm.x86.sse.rsqrt.ps";
2164 intrinsic
= "llvm.x86.avx.rsqrt.ps.256";
2166 if (num_iterations
) {
2168 * Newton-Raphson will result in NaN instead of infinity for zero,
2169 * and NaN instead of zero for infinity.
2170 * Also, need to ensure rsqrt(1.0) == 1.0.
2171 * All numbers smaller than FLT_MIN will result in +infinity
2172 * (rsqrtps treats all denormals as zero).
2175 * Certain non-c99 compilers don't know INFINITY and might not support
2176 * hacks to evaluate it at compile time neither.
2178 const unsigned posinf_int
= 0x7F800000;
2180 LLVMValueRef flt_min
= lp_build_const_vec(bld
->gallivm
, type
, FLT_MIN
);
2181 LLVMValueRef inf
= lp_build_const_int_vec(bld
->gallivm
, type
, posinf_int
);
2183 inf
= LLVMBuildBitCast(builder
, inf
, lp_build_vec_type(bld
->gallivm
, type
), "");
2185 res
= lp_build_intrinsic_unary(builder
, intrinsic
, bld
->vec_type
, a
);
2187 for (i
= 0; i
< num_iterations
; ++i
) {
2188 res
= lp_build_rsqrt_refine(bld
, a
, res
);
2190 cmp
= lp_build_compare(bld
->gallivm
, type
, PIPE_FUNC_LESS
, a
, flt_min
);
2191 res
= lp_build_select(bld
, cmp
, inf
, res
);
2192 cmp
= lp_build_compare(bld
->gallivm
, type
, PIPE_FUNC_EQUAL
, a
, inf
);
2193 res
= lp_build_select(bld
, cmp
, bld
->zero
, res
);
2194 cmp
= lp_build_compare(bld
->gallivm
, type
, PIPE_FUNC_EQUAL
, a
, bld
->one
);
2195 res
= lp_build_select(bld
, cmp
, bld
->one
, res
);
2198 /* rsqrt(1.0) != 1.0 here */
2199 res
= lp_build_intrinsic_unary(builder
, intrinsic
, bld
->vec_type
, a
);
2206 return lp_build_rcp(bld
, lp_build_sqrt(bld
, a
));
2211 * Generate sin(a) using SSE2
2214 lp_build_sin(struct lp_build_context
*bld
,
2217 struct gallivm_state
*gallivm
= bld
->gallivm
;
2218 LLVMBuilderRef builder
= gallivm
->builder
;
2219 struct lp_type int_type
= lp_int_type(bld
->type
);
2220 LLVMBuilderRef b
= builder
;
2223 * take the absolute value,
2224 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2227 LLVMValueRef inv_sig_mask
= lp_build_const_int_vec(gallivm
, bld
->type
, ~0x80000000);
2228 LLVMValueRef a_v4si
= LLVMBuildBitCast(b
, a
, bld
->int_vec_type
, "a_v4si");
2230 LLVMValueRef absi
= LLVMBuildAnd(b
, a_v4si
, inv_sig_mask
, "absi");
2231 LLVMValueRef x_abs
= LLVMBuildBitCast(b
, absi
, bld
->vec_type
, "x_abs");
2234 * extract the sign bit (upper one)
2235 * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
2237 LLVMValueRef sig_mask
= lp_build_const_int_vec(gallivm
, bld
->type
, 0x80000000);
2238 LLVMValueRef sign_bit_i
= LLVMBuildAnd(b
, a_v4si
, sig_mask
, "sign_bit_i");
2242 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2245 LLVMValueRef FOPi
= lp_build_const_vec(gallivm
, bld
->type
, 1.27323954473516);
2246 LLVMValueRef scale_y
= LLVMBuildFMul(b
, x_abs
, FOPi
, "scale_y");
2249 * store the integer part of y in mm0
2250 * emm2 = _mm_cvttps_epi32(y);
2253 LLVMValueRef emm2_i
= LLVMBuildFPToSI(b
, scale_y
, bld
->int_vec_type
, "emm2_i");
2256 * j=(j+1) & (~1) (see the cephes sources)
2257 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2260 LLVMValueRef all_one
= lp_build_const_int_vec(gallivm
, bld
->type
, 1);
2261 LLVMValueRef emm2_add
= LLVMBuildAdd(b
, emm2_i
, all_one
, "emm2_add");
2263 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2265 LLVMValueRef inv_one
= lp_build_const_int_vec(gallivm
, bld
->type
, ~1);
2266 LLVMValueRef emm2_and
= LLVMBuildAnd(b
, emm2_add
, inv_one
, "emm2_and");
2269 * y = _mm_cvtepi32_ps(emm2);
2271 LLVMValueRef y_2
= LLVMBuildSIToFP(b
, emm2_and
, bld
->vec_type
, "y_2");
2273 /* get the swap sign flag
2274 * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
2276 LLVMValueRef pi32_4
= lp_build_const_int_vec(gallivm
, bld
->type
, 4);
2277 LLVMValueRef emm0_and
= LLVMBuildAnd(b
, emm2_add
, pi32_4
, "emm0_and");
2280 * emm2 = _mm_slli_epi32(emm0, 29);
2282 LLVMValueRef const_29
= lp_build_const_int_vec(gallivm
, bld
->type
, 29);
2283 LLVMValueRef swap_sign_bit
= LLVMBuildShl(b
, emm0_and
, const_29
, "swap_sign_bit");
2286 * get the polynom selection mask
2287 * there is one polynom for 0 <= x <= Pi/4
2288 * and another one for Pi/4<x<=Pi/2
2289 * Both branches will be computed.
2291 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2292 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2295 LLVMValueRef pi32_2
= lp_build_const_int_vec(gallivm
, bld
->type
, 2);
2296 LLVMValueRef emm2_3
= LLVMBuildAnd(b
, emm2_and
, pi32_2
, "emm2_3");
2297 LLVMValueRef poly_mask
= lp_build_compare(gallivm
,
2298 int_type
, PIPE_FUNC_EQUAL
,
2299 emm2_3
, lp_build_const_int_vec(gallivm
, bld
->type
, 0));
2301 * sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
2303 LLVMValueRef sign_bit_1
= LLVMBuildXor(b
, sign_bit_i
, swap_sign_bit
, "sign_bit");
2306 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2307 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2308 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2310 LLVMValueRef DP1
= lp_build_const_vec(gallivm
, bld
->type
, -0.78515625);
2311 LLVMValueRef DP2
= lp_build_const_vec(gallivm
, bld
->type
, -2.4187564849853515625e-4);
2312 LLVMValueRef DP3
= lp_build_const_vec(gallivm
, bld
->type
, -3.77489497744594108e-8);
2315 * The magic pass: "Extended precision modular arithmetic"
2316 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2317 * xmm1 = _mm_mul_ps(y, xmm1);
2318 * xmm2 = _mm_mul_ps(y, xmm2);
2319 * xmm3 = _mm_mul_ps(y, xmm3);
2321 LLVMValueRef xmm1
= LLVMBuildFMul(b
, y_2
, DP1
, "xmm1");
2322 LLVMValueRef xmm2
= LLVMBuildFMul(b
, y_2
, DP2
, "xmm2");
2323 LLVMValueRef xmm3
= LLVMBuildFMul(b
, y_2
, DP3
, "xmm3");
2326 * x = _mm_add_ps(x, xmm1);
2327 * x = _mm_add_ps(x, xmm2);
2328 * x = _mm_add_ps(x, xmm3);
2331 LLVMValueRef x_1
= LLVMBuildFAdd(b
, x_abs
, xmm1
, "x_1");
2332 LLVMValueRef x_2
= LLVMBuildFAdd(b
, x_1
, xmm2
, "x_2");
2333 LLVMValueRef x_3
= LLVMBuildFAdd(b
, x_2
, xmm3
, "x_3");
2336 * Evaluate the first polynom (0 <= x <= Pi/4)
2338 * z = _mm_mul_ps(x,x);
2340 LLVMValueRef z
= LLVMBuildFMul(b
, x_3
, x_3
, "z");
2343 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2344 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2345 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2347 LLVMValueRef coscof_p0
= lp_build_const_vec(gallivm
, bld
->type
, 2.443315711809948E-005);
2348 LLVMValueRef coscof_p1
= lp_build_const_vec(gallivm
, bld
->type
, -1.388731625493765E-003);
2349 LLVMValueRef coscof_p2
= lp_build_const_vec(gallivm
, bld
->type
, 4.166664568298827E-002);
2352 * y = *(v4sf*)_ps_coscof_p0;
2353 * y = _mm_mul_ps(y, z);
2355 LLVMValueRef y_3
= LLVMBuildFMul(b
, z
, coscof_p0
, "y_3");
2356 LLVMValueRef y_4
= LLVMBuildFAdd(b
, y_3
, coscof_p1
, "y_4");
2357 LLVMValueRef y_5
= LLVMBuildFMul(b
, y_4
, z
, "y_5");
2358 LLVMValueRef y_6
= LLVMBuildFAdd(b
, y_5
, coscof_p2
, "y_6");
2359 LLVMValueRef y_7
= LLVMBuildFMul(b
, y_6
, z
, "y_7");
2360 LLVMValueRef y_8
= LLVMBuildFMul(b
, y_7
, z
, "y_8");
2364 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2365 * y = _mm_sub_ps(y, tmp);
2366 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2368 LLVMValueRef half
= lp_build_const_vec(gallivm
, bld
->type
, 0.5);
2369 LLVMValueRef tmp
= LLVMBuildFMul(b
, z
, half
, "tmp");
2370 LLVMValueRef y_9
= LLVMBuildFSub(b
, y_8
, tmp
, "y_8");
2371 LLVMValueRef one
= lp_build_const_vec(gallivm
, bld
->type
, 1.0);
2372 LLVMValueRef y_10
= LLVMBuildFAdd(b
, y_9
, one
, "y_9");
2375 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2376 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2377 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2379 LLVMValueRef sincof_p0
= lp_build_const_vec(gallivm
, bld
->type
, -1.9515295891E-4);
2380 LLVMValueRef sincof_p1
= lp_build_const_vec(gallivm
, bld
->type
, 8.3321608736E-3);
2381 LLVMValueRef sincof_p2
= lp_build_const_vec(gallivm
, bld
->type
, -1.6666654611E-1);
2384 * Evaluate the second polynom (Pi/4 <= x <= 0)
2386 * y2 = *(v4sf*)_ps_sincof_p0;
2387 * y2 = _mm_mul_ps(y2, z);
2388 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2389 * y2 = _mm_mul_ps(y2, z);
2390 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2391 * y2 = _mm_mul_ps(y2, z);
2392 * y2 = _mm_mul_ps(y2, x);
2393 * y2 = _mm_add_ps(y2, x);
2396 LLVMValueRef y2_3
= LLVMBuildFMul(b
, z
, sincof_p0
, "y2_3");
2397 LLVMValueRef y2_4
= LLVMBuildFAdd(b
, y2_3
, sincof_p1
, "y2_4");
2398 LLVMValueRef y2_5
= LLVMBuildFMul(b
, y2_4
, z
, "y2_5");
2399 LLVMValueRef y2_6
= LLVMBuildFAdd(b
, y2_5
, sincof_p2
, "y2_6");
2400 LLVMValueRef y2_7
= LLVMBuildFMul(b
, y2_6
, z
, "y2_7");
2401 LLVMValueRef y2_8
= LLVMBuildFMul(b
, y2_7
, x_3
, "y2_8");
2402 LLVMValueRef y2_9
= LLVMBuildFAdd(b
, y2_8
, x_3
, "y2_9");
2405 * select the correct result from the two polynoms
2407 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2408 * y = _mm_andnot_ps(xmm3, y);
2409 * y = _mm_add_ps(y,y2);
2411 LLVMValueRef y2_i
= LLVMBuildBitCast(b
, y2_9
, bld
->int_vec_type
, "y2_i");
2412 LLVMValueRef y_i
= LLVMBuildBitCast(b
, y_10
, bld
->int_vec_type
, "y_i");
2413 LLVMValueRef y2_and
= LLVMBuildAnd(b
, y2_i
, poly_mask
, "y2_and");
2414 LLVMValueRef inv
= lp_build_const_int_vec(gallivm
, bld
->type
, ~0);
2415 LLVMValueRef poly_mask_inv
= LLVMBuildXor(b
, poly_mask
, inv
, "poly_mask_inv");
2416 LLVMValueRef y_and
= LLVMBuildAnd(b
, y_i
, poly_mask_inv
, "y_and");
2417 LLVMValueRef y_combine
= LLVMBuildAdd(b
, y_and
, y2_and
, "y_combine");
2421 * y = _mm_xor_ps(y, sign_bit);
2423 LLVMValueRef y_sign
= LLVMBuildXor(b
, y_combine
, sign_bit_1
, "y_sin");
2424 LLVMValueRef y_result
= LLVMBuildBitCast(b
, y_sign
, bld
->vec_type
, "y_result");
2430 * Generate cos(a) using SSE2
2433 lp_build_cos(struct lp_build_context
*bld
,
2436 struct gallivm_state
*gallivm
= bld
->gallivm
;
2437 LLVMBuilderRef builder
= gallivm
->builder
;
2438 struct lp_type int_type
= lp_int_type(bld
->type
);
2439 LLVMBuilderRef b
= builder
;
2442 * take the absolute value,
2443 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2446 LLVMValueRef inv_sig_mask
= lp_build_const_int_vec(gallivm
, bld
->type
, ~0x80000000);
2447 LLVMValueRef a_v4si
= LLVMBuildBitCast(b
, a
, bld
->int_vec_type
, "a_v4si");
2449 LLVMValueRef absi
= LLVMBuildAnd(b
, a_v4si
, inv_sig_mask
, "absi");
2450 LLVMValueRef x_abs
= LLVMBuildBitCast(b
, absi
, bld
->vec_type
, "x_abs");
2454 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2457 LLVMValueRef FOPi
= lp_build_const_vec(gallivm
, bld
->type
, 1.27323954473516);
2458 LLVMValueRef scale_y
= LLVMBuildFMul(b
, x_abs
, FOPi
, "scale_y");
2461 * store the integer part of y in mm0
2462 * emm2 = _mm_cvttps_epi32(y);
2465 LLVMValueRef emm2_i
= LLVMBuildFPToSI(b
, scale_y
, bld
->int_vec_type
, "emm2_i");
2468 * j=(j+1) & (~1) (see the cephes sources)
2469 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2472 LLVMValueRef all_one
= lp_build_const_int_vec(gallivm
, bld
->type
, 1);
2473 LLVMValueRef emm2_add
= LLVMBuildAdd(b
, emm2_i
, all_one
, "emm2_add");
2475 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2477 LLVMValueRef inv_one
= lp_build_const_int_vec(gallivm
, bld
->type
, ~1);
2478 LLVMValueRef emm2_and
= LLVMBuildAnd(b
, emm2_add
, inv_one
, "emm2_and");
2481 * y = _mm_cvtepi32_ps(emm2);
2483 LLVMValueRef y_2
= LLVMBuildSIToFP(b
, emm2_and
, bld
->vec_type
, "y_2");
2487 * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
2489 LLVMValueRef const_2
= lp_build_const_int_vec(gallivm
, bld
->type
, 2);
2490 LLVMValueRef emm2_2
= LLVMBuildSub(b
, emm2_and
, const_2
, "emm2_2");
2493 /* get the swap sign flag
2494 * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
2496 LLVMValueRef inv
= lp_build_const_int_vec(gallivm
, bld
->type
, ~0);
2497 LLVMValueRef emm0_not
= LLVMBuildXor(b
, emm2_2
, inv
, "emm0_not");
2498 LLVMValueRef pi32_4
= lp_build_const_int_vec(gallivm
, bld
->type
, 4);
2499 LLVMValueRef emm0_and
= LLVMBuildAnd(b
, emm0_not
, pi32_4
, "emm0_and");
2502 * emm2 = _mm_slli_epi32(emm0, 29);
2504 LLVMValueRef const_29
= lp_build_const_int_vec(gallivm
, bld
->type
, 29);
2505 LLVMValueRef sign_bit
= LLVMBuildShl(b
, emm0_and
, const_29
, "sign_bit");
2508 * get the polynom selection mask
2509 * there is one polynom for 0 <= x <= Pi/4
2510 * and another one for Pi/4<x<=Pi/2
2511 * Both branches will be computed.
2513 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2514 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2517 LLVMValueRef pi32_2
= lp_build_const_int_vec(gallivm
, bld
->type
, 2);
2518 LLVMValueRef emm2_3
= LLVMBuildAnd(b
, emm2_2
, pi32_2
, "emm2_3");
2519 LLVMValueRef poly_mask
= lp_build_compare(gallivm
,
2520 int_type
, PIPE_FUNC_EQUAL
,
2521 emm2_3
, lp_build_const_int_vec(gallivm
, bld
->type
, 0));
2524 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2525 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2526 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2528 LLVMValueRef DP1
= lp_build_const_vec(gallivm
, bld
->type
, -0.78515625);
2529 LLVMValueRef DP2
= lp_build_const_vec(gallivm
, bld
->type
, -2.4187564849853515625e-4);
2530 LLVMValueRef DP3
= lp_build_const_vec(gallivm
, bld
->type
, -3.77489497744594108e-8);
2533 * The magic pass: "Extended precision modular arithmetic"
2534 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2535 * xmm1 = _mm_mul_ps(y, xmm1);
2536 * xmm2 = _mm_mul_ps(y, xmm2);
2537 * xmm3 = _mm_mul_ps(y, xmm3);
2539 LLVMValueRef xmm1
= LLVMBuildFMul(b
, y_2
, DP1
, "xmm1");
2540 LLVMValueRef xmm2
= LLVMBuildFMul(b
, y_2
, DP2
, "xmm2");
2541 LLVMValueRef xmm3
= LLVMBuildFMul(b
, y_2
, DP3
, "xmm3");
2544 * x = _mm_add_ps(x, xmm1);
2545 * x = _mm_add_ps(x, xmm2);
2546 * x = _mm_add_ps(x, xmm3);
2549 LLVMValueRef x_1
= LLVMBuildFAdd(b
, x_abs
, xmm1
, "x_1");
2550 LLVMValueRef x_2
= LLVMBuildFAdd(b
, x_1
, xmm2
, "x_2");
2551 LLVMValueRef x_3
= LLVMBuildFAdd(b
, x_2
, xmm3
, "x_3");
2554 * Evaluate the first polynom (0 <= x <= Pi/4)
2556 * z = _mm_mul_ps(x,x);
2558 LLVMValueRef z
= LLVMBuildFMul(b
, x_3
, x_3
, "z");
2561 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2562 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2563 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2565 LLVMValueRef coscof_p0
= lp_build_const_vec(gallivm
, bld
->type
, 2.443315711809948E-005);
2566 LLVMValueRef coscof_p1
= lp_build_const_vec(gallivm
, bld
->type
, -1.388731625493765E-003);
2567 LLVMValueRef coscof_p2
= lp_build_const_vec(gallivm
, bld
->type
, 4.166664568298827E-002);
2570 * y = *(v4sf*)_ps_coscof_p0;
2571 * y = _mm_mul_ps(y, z);
2573 LLVMValueRef y_3
= LLVMBuildFMul(b
, z
, coscof_p0
, "y_3");
2574 LLVMValueRef y_4
= LLVMBuildFAdd(b
, y_3
, coscof_p1
, "y_4");
2575 LLVMValueRef y_5
= LLVMBuildFMul(b
, y_4
, z
, "y_5");
2576 LLVMValueRef y_6
= LLVMBuildFAdd(b
, y_5
, coscof_p2
, "y_6");
2577 LLVMValueRef y_7
= LLVMBuildFMul(b
, y_6
, z
, "y_7");
2578 LLVMValueRef y_8
= LLVMBuildFMul(b
, y_7
, z
, "y_8");
2582 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2583 * y = _mm_sub_ps(y, tmp);
2584 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2586 LLVMValueRef half
= lp_build_const_vec(gallivm
, bld
->type
, 0.5);
2587 LLVMValueRef tmp
= LLVMBuildFMul(b
, z
, half
, "tmp");
2588 LLVMValueRef y_9
= LLVMBuildFSub(b
, y_8
, tmp
, "y_8");
2589 LLVMValueRef one
= lp_build_const_vec(gallivm
, bld
->type
, 1.0);
2590 LLVMValueRef y_10
= LLVMBuildFAdd(b
, y_9
, one
, "y_9");
2593 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2594 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2595 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2597 LLVMValueRef sincof_p0
= lp_build_const_vec(gallivm
, bld
->type
, -1.9515295891E-4);
2598 LLVMValueRef sincof_p1
= lp_build_const_vec(gallivm
, bld
->type
, 8.3321608736E-3);
2599 LLVMValueRef sincof_p2
= lp_build_const_vec(gallivm
, bld
->type
, -1.6666654611E-1);
2602 * Evaluate the second polynom (Pi/4 <= x <= 0)
2604 * y2 = *(v4sf*)_ps_sincof_p0;
2605 * y2 = _mm_mul_ps(y2, z);
2606 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2607 * y2 = _mm_mul_ps(y2, z);
2608 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2609 * y2 = _mm_mul_ps(y2, z);
2610 * y2 = _mm_mul_ps(y2, x);
2611 * y2 = _mm_add_ps(y2, x);
2614 LLVMValueRef y2_3
= LLVMBuildFMul(b
, z
, sincof_p0
, "y2_3");
2615 LLVMValueRef y2_4
= LLVMBuildFAdd(b
, y2_3
, sincof_p1
, "y2_4");
2616 LLVMValueRef y2_5
= LLVMBuildFMul(b
, y2_4
, z
, "y2_5");
2617 LLVMValueRef y2_6
= LLVMBuildFAdd(b
, y2_5
, sincof_p2
, "y2_6");
2618 LLVMValueRef y2_7
= LLVMBuildFMul(b
, y2_6
, z
, "y2_7");
2619 LLVMValueRef y2_8
= LLVMBuildFMul(b
, y2_7
, x_3
, "y2_8");
2620 LLVMValueRef y2_9
= LLVMBuildFAdd(b
, y2_8
, x_3
, "y2_9");
2623 * select the correct result from the two polynoms
2625 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2626 * y = _mm_andnot_ps(xmm3, y);
2627 * y = _mm_add_ps(y,y2);
2629 LLVMValueRef y2_i
= LLVMBuildBitCast(b
, y2_9
, bld
->int_vec_type
, "y2_i");
2630 LLVMValueRef y_i
= LLVMBuildBitCast(b
, y_10
, bld
->int_vec_type
, "y_i");
2631 LLVMValueRef y2_and
= LLVMBuildAnd(b
, y2_i
, poly_mask
, "y2_and");
2632 LLVMValueRef poly_mask_inv
= LLVMBuildXor(b
, poly_mask
, inv
, "poly_mask_inv");
2633 LLVMValueRef y_and
= LLVMBuildAnd(b
, y_i
, poly_mask_inv
, "y_and");
2634 LLVMValueRef y_combine
= LLVMBuildAdd(b
, y_and
, y2_and
, "y_combine");
2638 * y = _mm_xor_ps(y, sign_bit);
2640 LLVMValueRef y_sign
= LLVMBuildXor(b
, y_combine
, sign_bit
, "y_sin");
2641 LLVMValueRef y_result
= LLVMBuildBitCast(b
, y_sign
, bld
->vec_type
, "y_result");
2647 * Generate pow(x, y)
2650 lp_build_pow(struct lp_build_context
*bld
,
2654 /* TODO: optimize the constant case */
2655 if (gallivm_debug
& GALLIVM_DEBUG_PERF
&&
2656 LLVMIsConstant(x
) && LLVMIsConstant(y
)) {
2657 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2661 return lp_build_exp2(bld
, lp_build_mul(bld
, lp_build_log2(bld
, x
), y
));
2669 lp_build_exp(struct lp_build_context
*bld
,
2672 /* log2(e) = 1/log(2) */
2673 LLVMValueRef log2e
= lp_build_const_vec(bld
->gallivm
, bld
->type
,
2674 1.4426950408889634);
2676 assert(lp_check_value(bld
->type
, x
));
2678 return lp_build_exp2(bld
, lp_build_mul(bld
, log2e
, x
));
2686 lp_build_log(struct lp_build_context
*bld
,
2690 LLVMValueRef log2
= lp_build_const_vec(bld
->gallivm
, bld
->type
,
2691 0.69314718055994529);
2693 assert(lp_check_value(bld
->type
, x
));
2695 return lp_build_mul(bld
, log2
, lp_build_log2(bld
, x
));
2700 * Generate polynomial.
2701 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
2704 lp_build_polynomial(struct lp_build_context
*bld
,
2706 const double *coeffs
,
2707 unsigned num_coeffs
)
2709 const struct lp_type type
= bld
->type
;
2710 LLVMValueRef even
= NULL
, odd
= NULL
;
2714 assert(lp_check_value(bld
->type
, x
));
2716 /* TODO: optimize the constant case */
2717 if (gallivm_debug
& GALLIVM_DEBUG_PERF
&&
2718 LLVMIsConstant(x
)) {
2719 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2724 * Calculate odd and even terms seperately to decrease data dependency
2726 * c[0] + x^2 * c[2] + x^4 * c[4] ...
2727 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
2729 x2
= lp_build_mul(bld
, x
, x
);
2731 for (i
= num_coeffs
; i
--; ) {
2734 coeff
= lp_build_const_vec(bld
->gallivm
, type
, coeffs
[i
]);
2738 even
= lp_build_add(bld
, coeff
, lp_build_mul(bld
, x2
, even
));
2743 odd
= lp_build_add(bld
, coeff
, lp_build_mul(bld
, x2
, odd
));
2750 return lp_build_add(bld
, lp_build_mul(bld
, odd
, x
), even
);
2759 * Minimax polynomial fit of 2**x, in range [0, 1[
2761 const double lp_build_exp2_polynomial
[] = {
2762 #if EXP_POLY_DEGREE == 5
2763 0.999999925063526176901,
2764 0.693153073200168932794,
2765 0.240153617044375388211,
2766 0.0558263180532956664775,
2767 0.00898934009049466391101,
2768 0.00187757667519147912699
2769 #elif EXP_POLY_DEGREE == 4
2770 1.00000259337069434683,
2771 0.693003834469974940458,
2772 0.24144275689150793076,
2773 0.0520114606103070150235,
2774 0.0135341679161270268764
2775 #elif EXP_POLY_DEGREE == 3
2776 0.999925218562710312959,
2777 0.695833540494823811697,
2778 0.226067155427249155588,
2779 0.0780245226406372992967
2780 #elif EXP_POLY_DEGREE == 2
2781 1.00172476321474503578,
2782 0.657636275736077639316,
2783 0.33718943461968720704
2791 lp_build_exp2_approx(struct lp_build_context
*bld
,
2793 LLVMValueRef
*p_exp2_int_part
,
2794 LLVMValueRef
*p_frac_part
,
2795 LLVMValueRef
*p_exp2
)
2797 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2798 const struct lp_type type
= bld
->type
;
2799 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
2800 LLVMValueRef ipart
= NULL
;
2801 LLVMValueRef fpart
= NULL
;
2802 LLVMValueRef expipart
= NULL
;
2803 LLVMValueRef expfpart
= NULL
;
2804 LLVMValueRef res
= NULL
;
2806 assert(lp_check_value(bld
->type
, x
));
2808 if(p_exp2_int_part
|| p_frac_part
|| p_exp2
) {
2809 /* TODO: optimize the constant case */
2810 if (gallivm_debug
& GALLIVM_DEBUG_PERF
&&
2811 LLVMIsConstant(x
)) {
2812 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2816 assert(type
.floating
&& type
.width
== 32);
2818 x
= lp_build_min(bld
, x
, lp_build_const_vec(bld
->gallivm
, type
, 129.0));
2819 x
= lp_build_max(bld
, x
, lp_build_const_vec(bld
->gallivm
, type
, -126.99999));
2821 /* ipart = floor(x) */
2822 /* fpart = x - ipart */
2823 lp_build_ifloor_fract(bld
, x
, &ipart
, &fpart
);
2826 if(p_exp2_int_part
|| p_exp2
) {
2827 /* expipart = (float) (1 << ipart) */
2828 expipart
= LLVMBuildAdd(builder
, ipart
,
2829 lp_build_const_int_vec(bld
->gallivm
, type
, 127), "");
2830 expipart
= LLVMBuildShl(builder
, expipart
,
2831 lp_build_const_int_vec(bld
->gallivm
, type
, 23), "");
2832 expipart
= LLVMBuildBitCast(builder
, expipart
, vec_type
, "");
2836 expfpart
= lp_build_polynomial(bld
, fpart
, lp_build_exp2_polynomial
,
2837 Elements(lp_build_exp2_polynomial
));
2839 res
= LLVMBuildFMul(builder
, expipart
, expfpart
, "");
2843 *p_exp2_int_part
= expipart
;
2846 *p_frac_part
= fpart
;
2854 lp_build_exp2(struct lp_build_context
*bld
,
2858 lp_build_exp2_approx(bld
, x
, NULL
, NULL
, &res
);
2864 * Extract the exponent of a IEEE-754 floating point value.
2866 * Optionally apply an integer bias.
2868 * Result is an integer value with
2870 * ifloor(log2(x)) + bias
2873 lp_build_extract_exponent(struct lp_build_context
*bld
,
2877 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2878 const struct lp_type type
= bld
->type
;
2879 unsigned mantissa
= lp_mantissa(type
);
2882 assert(type
.floating
);
2884 assert(lp_check_value(bld
->type
, x
));
2886 x
= LLVMBuildBitCast(builder
, x
, bld
->int_vec_type
, "");
2888 res
= LLVMBuildLShr(builder
, x
,
2889 lp_build_const_int_vec(bld
->gallivm
, type
, mantissa
), "");
2890 res
= LLVMBuildAnd(builder
, res
,
2891 lp_build_const_int_vec(bld
->gallivm
, type
, 255), "");
2892 res
= LLVMBuildSub(builder
, res
,
2893 lp_build_const_int_vec(bld
->gallivm
, type
, 127 - bias
), "");
2900 * Extract the mantissa of the a floating.
2902 * Result is a floating point value with
2904 * x / floor(log2(x))
2907 lp_build_extract_mantissa(struct lp_build_context
*bld
,
2910 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2911 const struct lp_type type
= bld
->type
;
2912 unsigned mantissa
= lp_mantissa(type
);
2913 LLVMValueRef mantmask
= lp_build_const_int_vec(bld
->gallivm
, type
,
2914 (1ULL << mantissa
) - 1);
2915 LLVMValueRef one
= LLVMConstBitCast(bld
->one
, bld
->int_vec_type
);
2918 assert(lp_check_value(bld
->type
, x
));
2920 assert(type
.floating
);
2922 x
= LLVMBuildBitCast(builder
, x
, bld
->int_vec_type
, "");
2924 /* res = x / 2**ipart */
2925 res
= LLVMBuildAnd(builder
, x
, mantmask
, "");
2926 res
= LLVMBuildOr(builder
, res
, one
, "");
2927 res
= LLVMBuildBitCast(builder
, res
, bld
->vec_type
, "");
2935 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x) ,for x in range of [0, 1/9[
2936 * These coefficients can be generate with
2937 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
2939 const double lp_build_log2_polynomial
[] = {
2940 #if LOG_POLY_DEGREE == 5
2941 2.88539008148777786488L,
2942 0.961796878841293367824L,
2943 0.577058946784739859012L,
2944 0.412914355135828735411L,
2945 0.308591899232910175289L,
2946 0.352376952300281371868L,
2947 #elif LOG_POLY_DEGREE == 4
2948 2.88539009343309178325L,
2949 0.961791550404184197881L,
2950 0.577440339438736392009L,
2951 0.403343858251329912514L,
2952 0.406718052498846252698L,
2953 #elif LOG_POLY_DEGREE == 3
2954 2.88538959748872753838L,
2955 0.961932915889597772928L,
2956 0.571118517972136195241L,
2957 0.493997535084709500285L,
2964 * See http://www.devmaster.net/forums/showthread.php?p=43580
2965 * http://en.wikipedia.org/wiki/Logarithm#Calculation
2966 * http://www.nezumi.demon.co.uk/consult/logx.htm
2969 lp_build_log2_approx(struct lp_build_context
*bld
,
2971 LLVMValueRef
*p_exp
,
2972 LLVMValueRef
*p_floor_log2
,
2973 LLVMValueRef
*p_log2
)
2975 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2976 const struct lp_type type
= bld
->type
;
2977 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
2978 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
2980 LLVMValueRef expmask
= lp_build_const_int_vec(bld
->gallivm
, type
, 0x7f800000);
2981 LLVMValueRef mantmask
= lp_build_const_int_vec(bld
->gallivm
, type
, 0x007fffff);
2982 LLVMValueRef one
= LLVMConstBitCast(bld
->one
, int_vec_type
);
2984 LLVMValueRef i
= NULL
;
2985 LLVMValueRef y
= NULL
;
2986 LLVMValueRef z
= NULL
;
2987 LLVMValueRef exp
= NULL
;
2988 LLVMValueRef mant
= NULL
;
2989 LLVMValueRef logexp
= NULL
;
2990 LLVMValueRef logmant
= NULL
;
2991 LLVMValueRef res
= NULL
;
2993 assert(lp_check_value(bld
->type
, x
));
2995 if(p_exp
|| p_floor_log2
|| p_log2
) {
2996 /* TODO: optimize the constant case */
2997 if (gallivm_debug
& GALLIVM_DEBUG_PERF
&&
2998 LLVMIsConstant(x
)) {
2999 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3003 assert(type
.floating
&& type
.width
== 32);
3006 * We don't explicitly handle denormalized numbers. They will yield a
3007 * result in the neighbourhood of -127, which appears to be adequate
3011 i
= LLVMBuildBitCast(builder
, x
, int_vec_type
, "");
3013 /* exp = (float) exponent(x) */
3014 exp
= LLVMBuildAnd(builder
, i
, expmask
, "");
3017 if(p_floor_log2
|| p_log2
) {
3018 logexp
= LLVMBuildLShr(builder
, exp
, lp_build_const_int_vec(bld
->gallivm
, type
, 23), "");
3019 logexp
= LLVMBuildSub(builder
, logexp
, lp_build_const_int_vec(bld
->gallivm
, type
, 127), "");
3020 logexp
= LLVMBuildSIToFP(builder
, logexp
, vec_type
, "");
3024 /* mant = 1 + (float) mantissa(x) */
3025 mant
= LLVMBuildAnd(builder
, i
, mantmask
, "");
3026 mant
= LLVMBuildOr(builder
, mant
, one
, "");
3027 mant
= LLVMBuildBitCast(builder
, mant
, vec_type
, "");
3029 /* y = (mant - 1) / (mant + 1) */
3030 y
= lp_build_div(bld
,
3031 lp_build_sub(bld
, mant
, bld
->one
),
3032 lp_build_add(bld
, mant
, bld
->one
)
3036 z
= lp_build_mul(bld
, y
, y
);
3039 logmant
= lp_build_polynomial(bld
, z
, lp_build_log2_polynomial
,
3040 Elements(lp_build_log2_polynomial
));
3042 /* logmant = y * P(z) */
3043 logmant
= lp_build_mul(bld
, y
, logmant
);
3045 res
= lp_build_add(bld
, logmant
, logexp
);
3049 exp
= LLVMBuildBitCast(builder
, exp
, vec_type
, "");
3054 *p_floor_log2
= logexp
;
3062 lp_build_log2(struct lp_build_context
*bld
,
3066 lp_build_log2_approx(bld
, x
, NULL
, NULL
, &res
);
3072 * Faster (and less accurate) log2.
3074 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3076 * Piece-wise linear approximation, with exact results when x is a
3079 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3082 lp_build_fast_log2(struct lp_build_context
*bld
,
3085 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
3089 assert(lp_check_value(bld
->type
, x
));
3091 assert(bld
->type
.floating
);
3093 /* ipart = floor(log2(x)) - 1 */
3094 ipart
= lp_build_extract_exponent(bld
, x
, -1);
3095 ipart
= LLVMBuildSIToFP(builder
, ipart
, bld
->vec_type
, "");
3097 /* fpart = x / 2**ipart */
3098 fpart
= lp_build_extract_mantissa(bld
, x
);
3101 return LLVMBuildFAdd(builder
, ipart
, fpart
, "");
3106 * Fast implementation of iround(log2(x)).
3108 * Not an approximation -- it should give accurate results all the time.
3111 lp_build_ilog2(struct lp_build_context
*bld
,
3114 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
3115 LLVMValueRef sqrt2
= lp_build_const_vec(bld
->gallivm
, bld
->type
, M_SQRT2
);
3118 assert(bld
->type
.floating
);
3120 assert(lp_check_value(bld
->type
, x
));
3122 /* x * 2^(0.5) i.e., add 0.5 to the log2(x) */
3123 x
= LLVMBuildFMul(builder
, x
, sqrt2
, "");
3125 /* ipart = floor(log2(x) + 0.5) */
3126 ipart
= lp_build_extract_exponent(bld
, x
, 0);
3132 lp_build_mod(struct lp_build_context
*bld
,
3136 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
3138 const struct lp_type type
= bld
->type
;
3140 assert(lp_check_value(type
, x
));
3141 assert(lp_check_value(type
, y
));
3144 res
= LLVMBuildFRem(builder
, x
, y
, "");
3146 res
= LLVMBuildSRem(builder
, x
, y
, "");
3148 res
= LLVMBuildURem(builder
, x
, y
, "");