1 /**************************************************************************
3 * Copyright 2009-2010 VMware, Inc.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **************************************************************************/
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
38 * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
44 * @author Jose Fonseca <jfonseca@vmware.com>
50 #include "util/u_memory.h"
51 #include "util/u_debug.h"
52 #include "util/u_math.h"
53 #include "util/u_string.h"
54 #include "util/u_cpu_detect.h"
56 #include "lp_bld_type.h"
57 #include "lp_bld_const.h"
58 #include "lp_bld_init.h"
59 #include "lp_bld_intr.h"
60 #include "lp_bld_logic.h"
61 #include "lp_bld_pack.h"
62 #include "lp_bld_debug.h"
63 #include "lp_bld_bitarit.h"
64 #include "lp_bld_arit.h"
67 #define EXP_POLY_DEGREE 5
69 #define LOG_POLY_DEGREE 4
74 * No checks for special case values of a or b = 1 or 0 are done.
77 lp_build_min_simple(struct lp_build_context
*bld
,
81 const struct lp_type type
= bld
->type
;
82 const char *intrinsic
= NULL
;
83 unsigned intr_size
= 0;
86 assert(lp_check_value(type
, a
));
87 assert(lp_check_value(type
, b
));
89 /* TODO: optimize the constant case */
91 if (type
.floating
&& util_cpu_caps
.has_sse
) {
92 if (type
.width
== 32) {
93 if (type
.length
== 1) {
94 intrinsic
= "llvm.x86.sse.min.ss";
97 else if (type
.length
<= 4 || !util_cpu_caps
.has_avx
) {
98 intrinsic
= "llvm.x86.sse.min.ps";
102 intrinsic
= "llvm.x86.avx.min.ps.256";
106 if (type
.width
== 64 && util_cpu_caps
.has_sse2
) {
107 if (type
.length
== 1) {
108 intrinsic
= "llvm.x86.sse2.min.sd";
111 else if (type
.length
== 2 || !util_cpu_caps
.has_avx
) {
112 intrinsic
= "llvm.x86.sse2.min.pd";
116 intrinsic
= "llvm.x86.avx.min.pd.256";
121 else if (type
.floating
&& util_cpu_caps
.has_altivec
) {
122 if (type
.width
== 32 && type
.length
== 4) {
123 intrinsic
= "llvm.ppc.altivec.vminfp";
126 } else if (util_cpu_caps
.has_sse2
&& type
.length
>= 2) {
128 if ((type
.width
== 8 || type
.width
== 16) &&
129 (type
.width
* type
.length
<= 64) &&
130 (gallivm_debug
& GALLIVM_DEBUG_PERF
)) {
131 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
134 if (type
.width
== 8 && !type
.sign
) {
135 intrinsic
= "llvm.x86.sse2.pminu.b";
137 else if (type
.width
== 16 && type
.sign
) {
138 intrinsic
= "llvm.x86.sse2.pmins.w";
140 if (util_cpu_caps
.has_sse4_1
) {
141 if (type
.width
== 8 && type
.sign
) {
142 intrinsic
= "llvm.x86.sse41.pminsb";
144 if (type
.width
== 16 && !type
.sign
) {
145 intrinsic
= "llvm.x86.sse41.pminuw";
147 if (type
.width
== 32 && !type
.sign
) {
148 intrinsic
= "llvm.x86.sse41.pminud";
150 if (type
.width
== 32 && type
.sign
) {
151 intrinsic
= "llvm.x86.sse41.pminsd";
154 } else if (util_cpu_caps
.has_altivec
) {
156 if (type
.width
== 8) {
158 intrinsic
= "llvm.ppc.altivec.vminub";
160 intrinsic
= "llvm.ppc.altivec.vminsb";
162 } else if (type
.width
== 16) {
164 intrinsic
= "llvm.ppc.altivec.vminuh";
166 intrinsic
= "llvm.ppc.altivec.vminsh";
168 } else if (type
.width
== 32) {
170 intrinsic
= "llvm.ppc.altivec.vminuw";
172 intrinsic
= "llvm.ppc.altivec.vminsw";
178 return lp_build_intrinsic_binary_anylength(bld
->gallivm
, intrinsic
,
183 cond
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, a
, b
);
184 return lp_build_select(bld
, cond
, a
, b
);
190 * No checks for special case values of a or b = 1 or 0 are done.
193 lp_build_max_simple(struct lp_build_context
*bld
,
197 const struct lp_type type
= bld
->type
;
198 const char *intrinsic
= NULL
;
199 unsigned intr_size
= 0;
202 assert(lp_check_value(type
, a
));
203 assert(lp_check_value(type
, b
));
205 /* TODO: optimize the constant case */
207 if (type
.floating
&& util_cpu_caps
.has_sse
) {
208 if (type
.width
== 32) {
209 if (type
.length
== 1) {
210 intrinsic
= "llvm.x86.sse.max.ss";
213 else if (type
.length
<= 4 || !util_cpu_caps
.has_avx
) {
214 intrinsic
= "llvm.x86.sse.max.ps";
218 intrinsic
= "llvm.x86.avx.max.ps.256";
222 if (type
.width
== 64 && util_cpu_caps
.has_sse2
) {
223 if (type
.length
== 1) {
224 intrinsic
= "llvm.x86.sse2.max.sd";
227 else if (type
.length
== 2 || !util_cpu_caps
.has_avx
) {
228 intrinsic
= "llvm.x86.sse2.max.pd";
232 intrinsic
= "llvm.x86.avx.max.pd.256";
237 else if (type
.floating
&& util_cpu_caps
.has_altivec
) {
238 if (type
.width
== 32 || type
.length
== 4) {
239 intrinsic
= "llvm.ppc.altivec.vmaxfp";
242 } else if (util_cpu_caps
.has_sse2
&& type
.length
>= 2) {
244 if ((type
.width
== 8 || type
.width
== 16) &&
245 (type
.width
* type
.length
<= 64) &&
246 (gallivm_debug
& GALLIVM_DEBUG_PERF
)) {
247 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
250 if (type
.width
== 8 && !type
.sign
) {
251 intrinsic
= "llvm.x86.sse2.pmaxu.b";
254 else if (type
.width
== 16 && type
.sign
) {
255 intrinsic
= "llvm.x86.sse2.pmaxs.w";
257 if (util_cpu_caps
.has_sse4_1
) {
258 if (type
.width
== 8 && type
.sign
) {
259 intrinsic
= "llvm.x86.sse41.pmaxsb";
261 if (type
.width
== 16 && !type
.sign
) {
262 intrinsic
= "llvm.x86.sse41.pmaxuw";
264 if (type
.width
== 32 && !type
.sign
) {
265 intrinsic
= "llvm.x86.sse41.pmaxud";
267 if (type
.width
== 32 && type
.sign
) {
268 intrinsic
= "llvm.x86.sse41.pmaxsd";
271 } else if (util_cpu_caps
.has_altivec
) {
273 if (type
.width
== 8) {
275 intrinsic
= "llvm.ppc.altivec.vmaxub";
277 intrinsic
= "llvm.ppc.altivec.vmaxsb";
279 } else if (type
.width
== 16) {
281 intrinsic
= "llvm.ppc.altivec.vmaxuh";
283 intrinsic
= "llvm.ppc.altivec.vmaxsh";
285 } else if (type
.width
== 32) {
287 intrinsic
= "llvm.ppc.altivec.vmaxuw";
289 intrinsic
= "llvm.ppc.altivec.vmaxsw";
295 return lp_build_intrinsic_binary_anylength(bld
->gallivm
, intrinsic
,
300 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, b
);
301 return lp_build_select(bld
, cond
, a
, b
);
306 * Generate 1 - a, or ~a depending on bld->type.
309 lp_build_comp(struct lp_build_context
*bld
,
312 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
313 const struct lp_type type
= bld
->type
;
315 assert(lp_check_value(type
, a
));
322 if(type
.norm
&& !type
.floating
&& !type
.fixed
&& !type
.sign
) {
323 if(LLVMIsConstant(a
))
324 return LLVMConstNot(a
);
326 return LLVMBuildNot(builder
, a
, "");
329 if(LLVMIsConstant(a
))
331 return LLVMConstFSub(bld
->one
, a
);
333 return LLVMConstSub(bld
->one
, a
);
336 return LLVMBuildFSub(builder
, bld
->one
, a
, "");
338 return LLVMBuildSub(builder
, bld
->one
, a
, "");
346 lp_build_add(struct lp_build_context
*bld
,
350 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
351 const struct lp_type type
= bld
->type
;
354 assert(lp_check_value(type
, a
));
355 assert(lp_check_value(type
, b
));
361 if(a
== bld
->undef
|| b
== bld
->undef
)
365 const char *intrinsic
= NULL
;
367 if(a
== bld
->one
|| b
== bld
->one
)
370 if (type
.width
* type
.length
== 128 &&
371 !type
.floating
&& !type
.fixed
) {
372 if(util_cpu_caps
.has_sse2
) {
374 intrinsic
= type
.sign
? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
376 intrinsic
= type
.sign
? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
377 } else if (util_cpu_caps
.has_altivec
) {
379 intrinsic
= type
.sign
? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
381 intrinsic
= type
.sign
? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
386 return lp_build_intrinsic_binary(builder
, intrinsic
, lp_build_vec_type(bld
->gallivm
, bld
->type
), a
, b
);
389 if(LLVMIsConstant(a
) && LLVMIsConstant(b
))
391 res
= LLVMConstFAdd(a
, b
);
393 res
= LLVMConstAdd(a
, b
);
396 res
= LLVMBuildFAdd(builder
, a
, b
, "");
398 res
= LLVMBuildAdd(builder
, a
, b
, "");
400 /* clamp to ceiling of 1.0 */
401 if(bld
->type
.norm
&& (bld
->type
.floating
|| bld
->type
.fixed
))
402 res
= lp_build_min_simple(bld
, res
, bld
->one
);
404 /* XXX clamp to floor of -1 or 0??? */
410 /** Return the scalar sum of the elements of a.
411 * Should avoid this operation whenever possible.
414 lp_build_horizontal_add(struct lp_build_context
*bld
,
417 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
418 const struct lp_type type
= bld
->type
;
419 LLVMValueRef index
, res
;
421 LLVMValueRef shuffles1
[LP_MAX_VECTOR_LENGTH
/ 2];
422 LLVMValueRef shuffles2
[LP_MAX_VECTOR_LENGTH
/ 2];
423 LLVMValueRef vecres
, elem2
;
425 assert(lp_check_value(type
, a
));
427 if (type
.length
== 1) {
431 assert(!bld
->type
.norm
);
434 * for byte vectors can do much better with psadbw.
435 * Using repeated shuffle/adds here. Note with multiple vectors
436 * this can be done more efficiently as outlined in the intel
437 * optimization manual.
438 * Note: could cause data rearrangement if used with smaller element
443 length
= type
.length
/ 2;
445 LLVMValueRef vec1
, vec2
;
446 for (i
= 0; i
< length
; i
++) {
447 shuffles1
[i
] = lp_build_const_int32(bld
->gallivm
, i
);
448 shuffles2
[i
] = lp_build_const_int32(bld
->gallivm
, i
+ length
);
450 vec1
= LLVMBuildShuffleVector(builder
, vecres
, vecres
,
451 LLVMConstVector(shuffles1
, length
), "");
452 vec2
= LLVMBuildShuffleVector(builder
, vecres
, vecres
,
453 LLVMConstVector(shuffles2
, length
), "");
455 vecres
= LLVMBuildFAdd(builder
, vec1
, vec2
, "");
458 vecres
= LLVMBuildAdd(builder
, vec1
, vec2
, "");
460 length
= length
>> 1;
463 /* always have vector of size 2 here */
466 index
= lp_build_const_int32(bld
->gallivm
, 0);
467 res
= LLVMBuildExtractElement(builder
, vecres
, index
, "");
468 index
= lp_build_const_int32(bld
->gallivm
, 1);
469 elem2
= LLVMBuildExtractElement(builder
, vecres
, index
, "");
472 res
= LLVMBuildFAdd(builder
, res
, elem2
, "");
474 res
= LLVMBuildAdd(builder
, res
, elem2
, "");
480 * Return the horizontal sums of 4 float vectors as a float4 vector.
481 * This uses the technique as outlined in Intel Optimization Manual.
484 lp_build_horizontal_add4x4f(struct lp_build_context
*bld
,
487 struct gallivm_state
*gallivm
= bld
->gallivm
;
488 LLVMBuilderRef builder
= gallivm
->builder
;
489 LLVMValueRef shuffles
[4];
491 LLVMValueRef sumtmp
[2], shuftmp
[2];
493 /* lower half of regs */
494 shuffles
[0] = lp_build_const_int32(gallivm
, 0);
495 shuffles
[1] = lp_build_const_int32(gallivm
, 1);
496 shuffles
[2] = lp_build_const_int32(gallivm
, 4);
497 shuffles
[3] = lp_build_const_int32(gallivm
, 5);
498 tmp
[0] = LLVMBuildShuffleVector(builder
, src
[0], src
[1],
499 LLVMConstVector(shuffles
, 4), "");
500 tmp
[2] = LLVMBuildShuffleVector(builder
, src
[2], src
[3],
501 LLVMConstVector(shuffles
, 4), "");
503 /* upper half of regs */
504 shuffles
[0] = lp_build_const_int32(gallivm
, 2);
505 shuffles
[1] = lp_build_const_int32(gallivm
, 3);
506 shuffles
[2] = lp_build_const_int32(gallivm
, 6);
507 shuffles
[3] = lp_build_const_int32(gallivm
, 7);
508 tmp
[1] = LLVMBuildShuffleVector(builder
, src
[0], src
[1],
509 LLVMConstVector(shuffles
, 4), "");
510 tmp
[3] = LLVMBuildShuffleVector(builder
, src
[2], src
[3],
511 LLVMConstVector(shuffles
, 4), "");
513 sumtmp
[0] = LLVMBuildFAdd(builder
, tmp
[0], tmp
[1], "");
514 sumtmp
[1] = LLVMBuildFAdd(builder
, tmp
[2], tmp
[3], "");
516 shuffles
[0] = lp_build_const_int32(gallivm
, 0);
517 shuffles
[1] = lp_build_const_int32(gallivm
, 2);
518 shuffles
[2] = lp_build_const_int32(gallivm
, 4);
519 shuffles
[3] = lp_build_const_int32(gallivm
, 6);
520 shuftmp
[0] = LLVMBuildShuffleVector(builder
, sumtmp
[0], sumtmp
[1],
521 LLVMConstVector(shuffles
, 4), "");
523 shuffles
[0] = lp_build_const_int32(gallivm
, 1);
524 shuffles
[1] = lp_build_const_int32(gallivm
, 3);
525 shuffles
[2] = lp_build_const_int32(gallivm
, 5);
526 shuffles
[3] = lp_build_const_int32(gallivm
, 7);
527 shuftmp
[1] = LLVMBuildShuffleVector(builder
, sumtmp
[0], sumtmp
[1],
528 LLVMConstVector(shuffles
, 4), "");
530 return LLVMBuildFAdd(builder
, shuftmp
[0], shuftmp
[1], "");
535 * partially horizontally add 2-4 float vectors with length nx4,
536 * i.e. only four adjacent values in each vector will be added,
537 * assuming values are really grouped in 4 which also determines
540 * Return a vector of the same length as the initial vectors,
541 * with the excess elements (if any) being undefined.
542 * The element order is independent of number of input vectors.
543 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
544 * the output order thus will be
545 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
548 lp_build_hadd_partial4(struct lp_build_context
*bld
,
549 LLVMValueRef vectors
[],
552 struct gallivm_state
*gallivm
= bld
->gallivm
;
553 LLVMBuilderRef builder
= gallivm
->builder
;
554 LLVMValueRef ret_vec
;
556 const char *intrinsic
= NULL
;
558 assert(num_vecs
>= 2 && num_vecs
<= 4);
559 assert(bld
->type
.floating
);
561 /* only use this with at least 2 vectors, as it is sort of expensive
562 * (depending on cpu) and we always need two horizontal adds anyway,
563 * so a shuffle/add approach might be better.
569 tmp
[2] = num_vecs
> 2 ? vectors
[2] : vectors
[0];
570 tmp
[3] = num_vecs
> 3 ? vectors
[3] : vectors
[0];
572 if (util_cpu_caps
.has_sse3
&& bld
->type
.width
== 32 &&
573 bld
->type
.length
== 4) {
574 intrinsic
= "llvm.x86.sse3.hadd.ps";
576 else if (util_cpu_caps
.has_avx
&& bld
->type
.width
== 32 &&
577 bld
->type
.length
== 8) {
578 intrinsic
= "llvm.x86.avx.hadd.ps.256";
581 tmp
[0] = lp_build_intrinsic_binary(builder
, intrinsic
,
582 lp_build_vec_type(gallivm
, bld
->type
),
585 tmp
[1] = lp_build_intrinsic_binary(builder
, intrinsic
,
586 lp_build_vec_type(gallivm
, bld
->type
),
592 return lp_build_intrinsic_binary(builder
, intrinsic
,
593 lp_build_vec_type(gallivm
, bld
->type
),
597 if (bld
->type
.length
== 4) {
598 ret_vec
= lp_build_horizontal_add4x4f(bld
, tmp
);
601 LLVMValueRef partres
[LP_MAX_VECTOR_LENGTH
/4];
603 unsigned num_iter
= bld
->type
.length
/ 4;
604 struct lp_type parttype
= bld
->type
;
606 for (j
= 0; j
< num_iter
; j
++) {
607 LLVMValueRef partsrc
[4];
609 for (i
= 0; i
< 4; i
++) {
610 partsrc
[i
] = lp_build_extract_range(gallivm
, tmp
[i
], j
*4, 4);
612 partres
[j
] = lp_build_horizontal_add4x4f(bld
, partsrc
);
614 ret_vec
= lp_build_concat(gallivm
, partres
, parttype
, num_iter
);
623 lp_build_sub(struct lp_build_context
*bld
,
627 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
628 const struct lp_type type
= bld
->type
;
631 assert(lp_check_value(type
, a
));
632 assert(lp_check_value(type
, b
));
636 if(a
== bld
->undef
|| b
== bld
->undef
)
642 const char *intrinsic
= NULL
;
647 if (type
.width
* type
.length
== 128 &&
648 !type
.floating
&& !type
.fixed
) {
649 if (util_cpu_caps
.has_sse2
) {
651 intrinsic
= type
.sign
? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
653 intrinsic
= type
.sign
? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
654 } else if (util_cpu_caps
.has_altivec
) {
656 intrinsic
= type
.sign
? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
658 intrinsic
= type
.sign
? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
663 return lp_build_intrinsic_binary(builder
, intrinsic
, lp_build_vec_type(bld
->gallivm
, bld
->type
), a
, b
);
666 if(LLVMIsConstant(a
) && LLVMIsConstant(b
))
668 res
= LLVMConstFSub(a
, b
);
670 res
= LLVMConstSub(a
, b
);
673 res
= LLVMBuildFSub(builder
, a
, b
, "");
675 res
= LLVMBuildSub(builder
, a
, b
, "");
677 if(bld
->type
.norm
&& (bld
->type
.floating
|| bld
->type
.fixed
))
678 res
= lp_build_max_simple(bld
, res
, bld
->zero
);
686 * Normalized multiplication.
688 * There are several approaches for (using 8-bit normalized multiplication as
693 * makes the following approximation to the division (Sree)
695 * a*b/255 ~= (a*(b + 1)) >> 8
697 * which is the fastest method that satisfies the following OpenGL criteria of
699 * 0*0 = 0 and 255*255 = 255
703 * takes the geometric series approximation to the division
705 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
707 * in this case just the first two terms to fit in 16bit arithmetic
709 * t/255 ~= (t + (t >> 8)) >> 8
711 * note that just by itself it doesn't satisfy the OpenGL criteria, as
712 * 255*255 = 254, so the special case b = 255 must be accounted for or roundoff
715 * - geometric series plus rounding
717 * when using a geometric series division instead of truncating the result
718 * use roundoff in the approximation (Jim Blinn)
720 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
722 * achieving the exact results.
726 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
727 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
728 * @sa Michael Herf, The "double blend trick", May 2000,
729 * http://www.stereopsis.com/doubleblend.html
732 lp_build_mul_norm(struct gallivm_state
*gallivm
,
733 struct lp_type wide_type
,
734 LLVMValueRef a
, LLVMValueRef b
)
736 LLVMBuilderRef builder
= gallivm
->builder
;
737 struct lp_build_context bld
;
742 assert(!wide_type
.floating
);
743 assert(lp_check_value(wide_type
, a
));
744 assert(lp_check_value(wide_type
, b
));
746 lp_build_context_init(&bld
, gallivm
, wide_type
);
748 n
= wide_type
.width
/ 2;
749 if (wide_type
.sign
) {
754 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
755 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
759 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
762 ab
= LLVMBuildMul(builder
, a
, b
, "");
763 ab
= LLVMBuildAdd(builder
, ab
, lp_build_shr_imm(&bld
, ab
, n
), "");
766 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
769 half
= lp_build_const_int_vec(gallivm
, wide_type
, 1 << (n
- 1));
770 if (wide_type
.sign
) {
771 LLVMValueRef minus_half
= LLVMBuildNeg(builder
, half
, "");
772 LLVMValueRef sign
= lp_build_shr_imm(&bld
, ab
, wide_type
.width
- 1);
773 half
= lp_build_select(&bld
, sign
, minus_half
, half
);
775 ab
= LLVMBuildAdd(builder
, ab
, half
, "");
778 ab
= lp_build_shr_imm(&bld
, ab
, n
);
787 lp_build_mul(struct lp_build_context
*bld
,
791 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
792 const struct lp_type type
= bld
->type
;
796 assert(lp_check_value(type
, a
));
797 assert(lp_check_value(type
, b
));
807 if(a
== bld
->undef
|| b
== bld
->undef
)
810 if (!type
.floating
&& !type
.fixed
&& type
.norm
) {
811 struct lp_type wide_type
= lp_wider_type(type
);
812 LLVMValueRef al
, ah
, bl
, bh
, abl
, abh
, ab
;
814 lp_build_unpack2(bld
->gallivm
, type
, wide_type
, a
, &al
, &ah
);
815 lp_build_unpack2(bld
->gallivm
, type
, wide_type
, b
, &bl
, &bh
);
817 /* PMULLW, PSRLW, PADDW */
818 abl
= lp_build_mul_norm(bld
->gallivm
, wide_type
, al
, bl
);
819 abh
= lp_build_mul_norm(bld
->gallivm
, wide_type
, ah
, bh
);
821 ab
= lp_build_pack2(bld
->gallivm
, wide_type
, type
, abl
, abh
);
827 shift
= lp_build_const_int_vec(bld
->gallivm
, type
, type
.width
/2);
831 if(LLVMIsConstant(a
) && LLVMIsConstant(b
)) {
833 res
= LLVMConstFMul(a
, b
);
835 res
= LLVMConstMul(a
, b
);
838 res
= LLVMConstAShr(res
, shift
);
840 res
= LLVMConstLShr(res
, shift
);
845 res
= LLVMBuildFMul(builder
, a
, b
, "");
847 res
= LLVMBuildMul(builder
, a
, b
, "");
850 res
= LLVMBuildAShr(builder
, res
, shift
, "");
852 res
= LLVMBuildLShr(builder
, res
, shift
, "");
861 * Small vector x scale multiplication optimization.
864 lp_build_mul_imm(struct lp_build_context
*bld
,
868 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
871 assert(lp_check_value(bld
->type
, a
));
880 return lp_build_negate(bld
, a
);
882 if(b
== 2 && bld
->type
.floating
)
883 return lp_build_add(bld
, a
, a
);
885 if(util_is_power_of_two(b
)) {
886 unsigned shift
= ffs(b
) - 1;
888 if(bld
->type
.floating
) {
891 * Power of two multiplication by directly manipulating the exponent.
893 * XXX: This might not be always faster, it will introduce a small error
894 * for multiplication by zero, and it will produce wrong results
897 unsigned mantissa
= lp_mantissa(bld
->type
);
898 factor
= lp_build_const_int_vec(bld
->gallivm
, bld
->type
, (unsigned long long)shift
<< mantissa
);
899 a
= LLVMBuildBitCast(builder
, a
, lp_build_int_vec_type(bld
->type
), "");
900 a
= LLVMBuildAdd(builder
, a
, factor
, "");
901 a
= LLVMBuildBitCast(builder
, a
, lp_build_vec_type(bld
->gallivm
, bld
->type
), "");
906 factor
= lp_build_const_vec(bld
->gallivm
, bld
->type
, shift
);
907 return LLVMBuildShl(builder
, a
, factor
, "");
911 factor
= lp_build_const_vec(bld
->gallivm
, bld
->type
, (double)b
);
912 return lp_build_mul(bld
, a
, factor
);
920 lp_build_div(struct lp_build_context
*bld
,
924 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
925 const struct lp_type type
= bld
->type
;
927 assert(lp_check_value(type
, a
));
928 assert(lp_check_value(type
, b
));
933 return lp_build_rcp(bld
, b
);
938 if(a
== bld
->undef
|| b
== bld
->undef
)
941 if(LLVMIsConstant(a
) && LLVMIsConstant(b
)) {
943 return LLVMConstFDiv(a
, b
);
945 return LLVMConstSDiv(a
, b
);
947 return LLVMConstUDiv(a
, b
);
950 if(((util_cpu_caps
.has_sse
&& type
.width
== 32 && type
.length
== 4) ||
951 (util_cpu_caps
.has_avx
&& type
.width
== 32 && type
.length
== 8)) &&
953 return lp_build_mul(bld
, a
, lp_build_rcp(bld
, b
));
956 return LLVMBuildFDiv(builder
, a
, b
, "");
958 return LLVMBuildSDiv(builder
, a
, b
, "");
960 return LLVMBuildUDiv(builder
, a
, b
, "");
965 * Linear interpolation helper.
967 * @param normalized whether we are interpolating normalized values,
968 * encoded in normalized integers, twice as wide.
970 * @sa http://www.stereopsis.com/doubleblend.html
972 static INLINE LLVMValueRef
973 lp_build_lerp_simple(struct lp_build_context
*bld
,
979 unsigned half_width
= bld
->type
.width
/2;
980 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
984 assert(lp_check_value(bld
->type
, x
));
985 assert(lp_check_value(bld
->type
, v0
));
986 assert(lp_check_value(bld
->type
, v1
));
988 delta
= lp_build_sub(bld
, v1
, v0
);
990 if (flags
& LP_BLD_LERP_WIDE_NORMALIZED
) {
991 if (!bld
->type
.sign
) {
992 if (!(flags
& LP_BLD_LERP_PRESCALED_WEIGHTS
)) {
994 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
995 * most-significant bit to the least-significant bit, so that
996 * later we can just divide by 2**n instead of 2**n - 1.
999 x
= lp_build_add(bld
, x
, lp_build_shr_imm(bld
, x
, half_width
- 1));
1002 /* (x * delta) >> n */
1003 res
= lp_build_mul(bld
, x
, delta
);
1004 res
= lp_build_shr_imm(bld
, res
, half_width
);
1007 * The rescaling trick above doesn't work for signed numbers, so
1008 * use the 2**n - 1 division approximation in lp_build_mul_norm
1011 assert(!(flags
& LP_BLD_LERP_PRESCALED_WEIGHTS
));
1012 res
= lp_build_mul_norm(bld
->gallivm
, bld
->type
, x
, delta
);
1015 assert(!(flags
& LP_BLD_LERP_PRESCALED_WEIGHTS
));
1016 res
= lp_build_mul(bld
, x
, delta
);
1019 res
= lp_build_add(bld
, v0
, res
);
1021 if (((flags
& LP_BLD_LERP_WIDE_NORMALIZED
) && !bld
->type
.sign
) ||
1023 /* We need to mask out the high order bits when lerping 8bit normalized colors stored on 16bits */
1024 /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
1025 * but it will be wrong for true fixed point use cases. Basically we need
1026 * a more powerful lp_type, capable of further distinguishing the values
1027 * interpretation from the value storage. */
1028 res
= LLVMBuildAnd(builder
, res
, lp_build_const_int_vec(bld
->gallivm
, bld
->type
, (1 << half_width
) - 1), "");
1036 * Linear interpolation.
1039 lp_build_lerp(struct lp_build_context
*bld
,
1045 const struct lp_type type
= bld
->type
;
1048 assert(lp_check_value(type
, x
));
1049 assert(lp_check_value(type
, v0
));
1050 assert(lp_check_value(type
, v1
));
1052 assert(!(flags
& LP_BLD_LERP_WIDE_NORMALIZED
));
1055 struct lp_type wide_type
;
1056 struct lp_build_context wide_bld
;
1057 LLVMValueRef xl
, xh
, v0l
, v0h
, v1l
, v1h
, resl
, resh
;
1059 assert(type
.length
>= 2);
1062 * Create a wider integer type, enough to hold the
1063 * intermediate result of the multiplication.
1065 memset(&wide_type
, 0, sizeof wide_type
);
1066 wide_type
.sign
= type
.sign
;
1067 wide_type
.width
= type
.width
*2;
1068 wide_type
.length
= type
.length
/2;
1070 lp_build_context_init(&wide_bld
, bld
->gallivm
, wide_type
);
1072 lp_build_unpack2(bld
->gallivm
, type
, wide_type
, x
, &xl
, &xh
);
1073 lp_build_unpack2(bld
->gallivm
, type
, wide_type
, v0
, &v0l
, &v0h
);
1074 lp_build_unpack2(bld
->gallivm
, type
, wide_type
, v1
, &v1l
, &v1h
);
1080 flags
|= LP_BLD_LERP_WIDE_NORMALIZED
;
1082 resl
= lp_build_lerp_simple(&wide_bld
, xl
, v0l
, v1l
, flags
);
1083 resh
= lp_build_lerp_simple(&wide_bld
, xh
, v0h
, v1h
, flags
);
1085 res
= lp_build_pack2(bld
->gallivm
, wide_type
, type
, resl
, resh
);
1087 res
= lp_build_lerp_simple(bld
, x
, v0
, v1
, flags
);
1095 * Bilinear interpolation.
1097 * Value indices are in v_{yx}.
1100 lp_build_lerp_2d(struct lp_build_context
*bld
,
1109 LLVMValueRef v0
= lp_build_lerp(bld
, x
, v00
, v01
, flags
);
1110 LLVMValueRef v1
= lp_build_lerp(bld
, x
, v10
, v11
, flags
);
1111 return lp_build_lerp(bld
, y
, v0
, v1
, flags
);
1116 lp_build_lerp_3d(struct lp_build_context
*bld
,
1130 LLVMValueRef v0
= lp_build_lerp_2d(bld
, x
, y
, v000
, v001
, v010
, v011
, flags
);
1131 LLVMValueRef v1
= lp_build_lerp_2d(bld
, x
, y
, v100
, v101
, v110
, v111
, flags
);
1132 return lp_build_lerp(bld
, z
, v0
, v1
, flags
);
1137 * Generate min(a, b)
1138 * Do checks for special cases.
1141 lp_build_min(struct lp_build_context
*bld
,
1145 assert(lp_check_value(bld
->type
, a
));
1146 assert(lp_check_value(bld
->type
, b
));
1148 if(a
== bld
->undef
|| b
== bld
->undef
)
1154 if (bld
->type
.norm
) {
1155 if (!bld
->type
.sign
) {
1156 if (a
== bld
->zero
|| b
== bld
->zero
) {
1166 return lp_build_min_simple(bld
, a
, b
);
1171 * Generate max(a, b)
1172 * Do checks for special cases.
1175 lp_build_max(struct lp_build_context
*bld
,
1179 assert(lp_check_value(bld
->type
, a
));
1180 assert(lp_check_value(bld
->type
, b
));
1182 if(a
== bld
->undef
|| b
== bld
->undef
)
1188 if(bld
->type
.norm
) {
1189 if(a
== bld
->one
|| b
== bld
->one
)
1191 if (!bld
->type
.sign
) {
1192 if (a
== bld
->zero
) {
1195 if (b
== bld
->zero
) {
1201 return lp_build_max_simple(bld
, a
, b
);
1206 * Generate clamp(a, min, max)
1207 * Do checks for special cases.
1210 lp_build_clamp(struct lp_build_context
*bld
,
1215 assert(lp_check_value(bld
->type
, a
));
1216 assert(lp_check_value(bld
->type
, min
));
1217 assert(lp_check_value(bld
->type
, max
));
1219 a
= lp_build_min(bld
, a
, max
);
1220 a
= lp_build_max(bld
, a
, min
);
1229 lp_build_abs(struct lp_build_context
*bld
,
1232 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1233 const struct lp_type type
= bld
->type
;
1234 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
1236 assert(lp_check_value(type
, a
));
1242 /* Mask out the sign bit */
1243 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
1244 unsigned long long absMask
= ~(1ULL << (type
.width
- 1));
1245 LLVMValueRef mask
= lp_build_const_int_vec(bld
->gallivm
, type
, ((unsigned long long) absMask
));
1246 a
= LLVMBuildBitCast(builder
, a
, int_vec_type
, "");
1247 a
= LLVMBuildAnd(builder
, a
, mask
, "");
1248 a
= LLVMBuildBitCast(builder
, a
, vec_type
, "");
1252 if(type
.width
*type
.length
== 128 && util_cpu_caps
.has_ssse3
) {
1253 switch(type
.width
) {
1255 return lp_build_intrinsic_unary(builder
, "llvm.x86.ssse3.pabs.b.128", vec_type
, a
);
1257 return lp_build_intrinsic_unary(builder
, "llvm.x86.ssse3.pabs.w.128", vec_type
, a
);
1259 return lp_build_intrinsic_unary(builder
, "llvm.x86.ssse3.pabs.d.128", vec_type
, a
);
1262 else if (type
.width
*type
.length
== 256 && util_cpu_caps
.has_ssse3
&&
1263 (gallivm_debug
& GALLIVM_DEBUG_PERF
) &&
1264 (type
.width
== 8 || type
.width
== 16 || type
.width
== 32)) {
1265 debug_printf("%s: inefficient code, should split vectors manually\n",
1269 return lp_build_max(bld
, a
, LLVMBuildNeg(builder
, a
, ""));
1274 lp_build_negate(struct lp_build_context
*bld
,
1277 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1279 assert(lp_check_value(bld
->type
, a
));
1281 #if HAVE_LLVM >= 0x0207
1282 if (bld
->type
.floating
)
1283 a
= LLVMBuildFNeg(builder
, a
, "");
1286 a
= LLVMBuildNeg(builder
, a
, "");
1292 /** Return -1, 0 or +1 depending on the sign of a */
1294 lp_build_sgn(struct lp_build_context
*bld
,
1297 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1298 const struct lp_type type
= bld
->type
;
1302 assert(lp_check_value(type
, a
));
1304 /* Handle non-zero case */
1306 /* if not zero then sign must be positive */
1309 else if(type
.floating
) {
1310 LLVMTypeRef vec_type
;
1311 LLVMTypeRef int_type
;
1315 unsigned long long maskBit
= (unsigned long long)1 << (type
.width
- 1);
1317 int_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
1318 vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
1319 mask
= lp_build_const_int_vec(bld
->gallivm
, type
, maskBit
);
1321 /* Take the sign bit and add it to 1 constant */
1322 sign
= LLVMBuildBitCast(builder
, a
, int_type
, "");
1323 sign
= LLVMBuildAnd(builder
, sign
, mask
, "");
1324 one
= LLVMConstBitCast(bld
->one
, int_type
);
1325 res
= LLVMBuildOr(builder
, sign
, one
, "");
1326 res
= LLVMBuildBitCast(builder
, res
, vec_type
, "");
1330 /* signed int/norm/fixed point */
1331 /* could use psign with sse3 and appropriate vectors here */
1332 LLVMValueRef minus_one
= lp_build_const_vec(bld
->gallivm
, type
, -1.0);
1333 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, bld
->zero
);
1334 res
= lp_build_select(bld
, cond
, bld
->one
, minus_one
);
1338 cond
= lp_build_cmp(bld
, PIPE_FUNC_EQUAL
, a
, bld
->zero
);
1339 res
= lp_build_select(bld
, cond
, bld
->zero
, res
);
1346 * Set the sign of float vector 'a' according to 'sign'.
1347 * If sign==0, return abs(a).
1348 * If sign==1, return -abs(a);
1349 * Other values for sign produce undefined results.
1352 lp_build_set_sign(struct lp_build_context
*bld
,
1353 LLVMValueRef a
, LLVMValueRef sign
)
1355 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1356 const struct lp_type type
= bld
->type
;
1357 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
1358 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
1359 LLVMValueRef shift
= lp_build_const_int_vec(bld
->gallivm
, type
, type
.width
- 1);
1360 LLVMValueRef mask
= lp_build_const_int_vec(bld
->gallivm
, type
,
1361 ~((unsigned long long) 1 << (type
.width
- 1)));
1362 LLVMValueRef val
, res
;
1364 assert(type
.floating
);
1365 assert(lp_check_value(type
, a
));
1367 /* val = reinterpret_cast<int>(a) */
1368 val
= LLVMBuildBitCast(builder
, a
, int_vec_type
, "");
1369 /* val = val & mask */
1370 val
= LLVMBuildAnd(builder
, val
, mask
, "");
1371 /* sign = sign << shift */
1372 sign
= LLVMBuildShl(builder
, sign
, shift
, "");
1373 /* res = val | sign */
1374 res
= LLVMBuildOr(builder
, val
, sign
, "");
1375 /* res = reinterpret_cast<float>(res) */
1376 res
= LLVMBuildBitCast(builder
, res
, vec_type
, "");
1383 * Convert vector of (or scalar) int to vector of (or scalar) float.
1386 lp_build_int_to_float(struct lp_build_context
*bld
,
1389 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1390 const struct lp_type type
= bld
->type
;
1391 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
1393 assert(type
.floating
);
1395 return LLVMBuildSIToFP(builder
, a
, vec_type
, "");
1399 arch_rounding_available(const struct lp_type type
)
1401 if ((util_cpu_caps
.has_sse4_1
&&
1402 (type
.length
== 1 || type
.width
*type
.length
== 128)) ||
1403 (util_cpu_caps
.has_avx
&& type
.width
*type
.length
== 256))
1405 else if ((util_cpu_caps
.has_altivec
&&
1406 (type
.width
== 32 && type
.length
== 4)))
/*
 * Rounding modes understood by the lp_build_round_*() helpers.
 *
 * The numeric values are passed unchanged as the immediate operand of the
 * x86 round intrinsics, so they must not be renumbered.
 */
enum lp_build_round_mode
{
   LP_BUILD_ROUND_NEAREST  = 0,  /* round to nearest integer */
   LP_BUILD_ROUND_FLOOR    = 1,  /* round toward minus infinity */
   LP_BUILD_ROUND_CEIL     = 2,  /* round toward plus infinity */
   LP_BUILD_ROUND_TRUNCATE = 3   /* round toward zero */
};
1421 * Helper for SSE4.1's ROUNDxx instructions.
1423 * NOTE: In the SSE4.1's nearest mode, if two values are equally close, the
1424 * result is the even value. That is, rounding 2.5 will be 2.0, and not 3.0.
1426 static INLINE LLVMValueRef
1427 lp_build_round_sse41(struct lp_build_context
*bld
,
1429 enum lp_build_round_mode mode
)
1431 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1432 const struct lp_type type
= bld
->type
;
1433 LLVMTypeRef i32t
= LLVMInt32TypeInContext(bld
->gallivm
->context
);
1434 const char *intrinsic
;
1437 assert(type
.floating
);
1439 assert(lp_check_value(type
, a
));
1440 assert(util_cpu_caps
.has_sse4_1
);
1442 if (type
.length
== 1) {
1443 LLVMTypeRef vec_type
;
1445 LLVMValueRef args
[3];
1446 LLVMValueRef index0
= LLVMConstInt(i32t
, 0, 0);
1448 switch(type
.width
) {
1450 intrinsic
= "llvm.x86.sse41.round.ss";
1453 intrinsic
= "llvm.x86.sse41.round.sd";
1460 vec_type
= LLVMVectorType(bld
->elem_type
, 4);
1462 undef
= LLVMGetUndef(vec_type
);
1465 args
[1] = LLVMBuildInsertElement(builder
, undef
, a
, index0
, "");
1466 args
[2] = LLVMConstInt(i32t
, mode
, 0);
1468 res
= lp_build_intrinsic(builder
, intrinsic
,
1469 vec_type
, args
, Elements(args
));
1471 res
= LLVMBuildExtractElement(builder
, res
, index0
, "");
1474 if (type
.width
* type
.length
== 128) {
1475 switch(type
.width
) {
1477 intrinsic
= "llvm.x86.sse41.round.ps";
1480 intrinsic
= "llvm.x86.sse41.round.pd";
1488 assert(type
.width
* type
.length
== 256);
1489 assert(util_cpu_caps
.has_avx
);
1491 switch(type
.width
) {
1493 intrinsic
= "llvm.x86.avx.round.ps.256";
1496 intrinsic
= "llvm.x86.avx.round.pd.256";
1504 res
= lp_build_intrinsic_binary(builder
, intrinsic
,
1506 LLVMConstInt(i32t
, mode
, 0));
1513 static INLINE LLVMValueRef
1514 lp_build_iround_nearest_sse2(struct lp_build_context
*bld
,
1517 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1518 const struct lp_type type
= bld
->type
;
1519 LLVMTypeRef i32t
= LLVMInt32TypeInContext(bld
->gallivm
->context
);
1520 LLVMTypeRef ret_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
1521 const char *intrinsic
;
1524 assert(type
.floating
);
1525 /* using the double precision conversions is a bit more complicated */
1526 assert(type
.width
== 32);
1528 assert(lp_check_value(type
, a
));
1529 assert(util_cpu_caps
.has_sse2
);
1531 /* This is relying on MXCSR rounding mode, which should always be nearest. */
1532 if (type
.length
== 1) {
1533 LLVMTypeRef vec_type
;
1536 LLVMValueRef index0
= LLVMConstInt(i32t
, 0, 0);
1538 vec_type
= LLVMVectorType(bld
->elem_type
, 4);
1540 intrinsic
= "llvm.x86.sse.cvtss2si";
1542 undef
= LLVMGetUndef(vec_type
);
1544 arg
= LLVMBuildInsertElement(builder
, undef
, a
, index0
, "");
1546 res
= lp_build_intrinsic_unary(builder
, intrinsic
,
1550 if (type
.width
* type
.length
== 128) {
1551 intrinsic
= "llvm.x86.sse2.cvtps2dq";
1554 assert(type
.width
*type
.length
== 256);
1555 assert(util_cpu_caps
.has_avx
);
1557 intrinsic
= "llvm.x86.avx.cvt.ps2dq.256";
1559 res
= lp_build_intrinsic_unary(builder
, intrinsic
,
1569 static INLINE LLVMValueRef
1570 lp_build_round_altivec(struct lp_build_context
*bld
,
1572 enum lp_build_round_mode mode
)
1574 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1575 const struct lp_type type
= bld
->type
;
1576 const char *intrinsic
= NULL
;
1578 assert(type
.floating
);
1580 assert(lp_check_value(type
, a
));
1581 assert(util_cpu_caps
.has_altivec
);
1584 case LP_BUILD_ROUND_NEAREST
:
1585 intrinsic
= "llvm.ppc.altivec.vrfin";
1587 case LP_BUILD_ROUND_FLOOR
:
1588 intrinsic
= "llvm.ppc.altivec.vrfim";
1590 case LP_BUILD_ROUND_CEIL
:
1591 intrinsic
= "llvm.ppc.altivec.vrfip";
1593 case LP_BUILD_ROUND_TRUNCATE
:
1594 intrinsic
= "llvm.ppc.altivec.vrfiz";
1598 return lp_build_intrinsic_unary(builder
, intrinsic
, bld
->vec_type
, a
);
1601 static INLINE LLVMValueRef
1602 lp_build_round_arch(struct lp_build_context
*bld
,
1604 enum lp_build_round_mode mode
)
1606 if (util_cpu_caps
.has_sse4_1
)
1607 return lp_build_round_sse41(bld
, a
, mode
);
1608 else /* (util_cpu_caps.has_altivec) */
1609 return lp_build_round_altivec(bld
, a
, mode
);
1613 * Return the integer part of a float (vector) value (== round toward zero).
1614 * The returned value is a float (vector).
1615 * Ex: trunc(-1.5) = -1.0
1618 lp_build_trunc(struct lp_build_context
*bld
,
1621 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1622 const struct lp_type type
= bld
->type
;
1624 assert(type
.floating
);
1625 assert(lp_check_value(type
, a
));
1627 if (arch_rounding_available(type
)) {
1628 return lp_build_round_arch(bld
, a
, LP_BUILD_ROUND_TRUNCATE
);
1631 const struct lp_type type
= bld
->type
;
1632 struct lp_type inttype
;
1633 struct lp_build_context intbld
;
1634 LLVMValueRef cmpval
= lp_build_const_vec(bld
->gallivm
, type
, 2^24);
1635 LLVMValueRef trunc
, res
, anosign
, mask
;
1636 LLVMTypeRef int_vec_type
= bld
->int_vec_type
;
1637 LLVMTypeRef vec_type
= bld
->vec_type
;
1639 assert(type
.width
== 32); /* might want to handle doubles at some point */
1642 inttype
.floating
= 0;
1643 lp_build_context_init(&intbld
, bld
->gallivm
, inttype
);
1645 /* round by truncation */
1646 trunc
= LLVMBuildFPToSI(builder
, a
, int_vec_type
, "");
1647 res
= LLVMBuildSIToFP(builder
, trunc
, vec_type
, "floor.trunc");
1649 /* mask out sign bit */
1650 anosign
= lp_build_abs(bld
, a
);
1652 * mask out all values if anosign > 2^24
1653 * This should work both for large ints (all rounding is no-op for them
1654 * because such floats are always exact) as well as special cases like
1655 * NaNs, Infs (taking advantage of the fact they use max exponent).
1656 * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
1658 anosign
= LLVMBuildBitCast(builder
, anosign
, int_vec_type
, "");
1659 cmpval
= LLVMBuildBitCast(builder
, cmpval
, int_vec_type
, "");
1660 mask
= lp_build_cmp(&intbld
, PIPE_FUNC_GREATER
, anosign
, cmpval
);
1661 return lp_build_select(bld
, mask
, a
, res
);
1667 * Return float (vector) rounded to nearest integer (vector). The returned
1668 * value is a float (vector).
1669 * Ex: round(0.9) = 1.0
1670 * Ex: round(-1.5) = -2.0
1673 lp_build_round(struct lp_build_context
*bld
,
1676 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1677 const struct lp_type type
= bld
->type
;
1679 assert(type
.floating
);
1680 assert(lp_check_value(type
, a
));
1682 if (arch_rounding_available(type
)) {
1683 return lp_build_round_arch(bld
, a
, LP_BUILD_ROUND_NEAREST
);
1686 const struct lp_type type
= bld
->type
;
1687 struct lp_type inttype
;
1688 struct lp_build_context intbld
;
1689 LLVMValueRef cmpval
= lp_build_const_vec(bld
->gallivm
, type
, 2^24);
1690 LLVMValueRef res
, anosign
, mask
;
1691 LLVMTypeRef int_vec_type
= bld
->int_vec_type
;
1692 LLVMTypeRef vec_type
= bld
->vec_type
;
1694 assert(type
.width
== 32); /* might want to handle doubles at some point */
1697 inttype
.floating
= 0;
1698 lp_build_context_init(&intbld
, bld
->gallivm
, inttype
);
1700 res
= lp_build_iround(bld
, a
);
1701 res
= LLVMBuildSIToFP(builder
, res
, vec_type
, "");
1703 /* mask out sign bit */
1704 anosign
= lp_build_abs(bld
, a
);
1706 * mask out all values if anosign > 2^24
1707 * This should work both for large ints (all rounding is no-op for them
1708 * because such floats are always exact) as well as special cases like
1709 * NaNs, Infs (taking advantage of the fact they use max exponent).
1710 * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
1712 anosign
= LLVMBuildBitCast(builder
, anosign
, int_vec_type
, "");
1713 cmpval
= LLVMBuildBitCast(builder
, cmpval
, int_vec_type
, "");
1714 mask
= lp_build_cmp(&intbld
, PIPE_FUNC_GREATER
, anosign
, cmpval
);
1715 return lp_build_select(bld
, mask
, a
, res
);
1721 * Return floor of float (vector), result is a float (vector)
1722 * Ex: floor(1.1) = 1.0
1723 * Ex: floor(-1.1) = -2.0
1726 lp_build_floor(struct lp_build_context
*bld
,
1729 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1730 const struct lp_type type
= bld
->type
;
1732 assert(type
.floating
);
1733 assert(lp_check_value(type
, a
));
1735 if (arch_rounding_available(type
)) {
1736 return lp_build_round_arch(bld
, a
, LP_BUILD_ROUND_FLOOR
);
1739 const struct lp_type type
= bld
->type
;
1740 struct lp_type inttype
;
1741 struct lp_build_context intbld
;
1742 LLVMValueRef cmpval
= lp_build_const_vec(bld
->gallivm
, type
, 2^24);
1743 LLVMValueRef trunc
, res
, anosign
, mask
;
1744 LLVMTypeRef int_vec_type
= bld
->int_vec_type
;
1745 LLVMTypeRef vec_type
= bld
->vec_type
;
1747 assert(type
.width
== 32); /* might want to handle doubles at some point */
1750 inttype
.floating
= 0;
1751 lp_build_context_init(&intbld
, bld
->gallivm
, inttype
);
1753 /* round by truncation */
1754 trunc
= LLVMBuildFPToSI(builder
, a
, int_vec_type
, "");
1755 res
= LLVMBuildSIToFP(builder
, trunc
, vec_type
, "floor.trunc");
1761 * fix values if rounding is wrong (for non-special cases)
1762 * - this is the case if trunc > a
1764 mask
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, res
, a
);
1765 /* tmp = trunc > a ? 1.0 : 0.0 */
1766 tmp
= LLVMBuildBitCast(builder
, bld
->one
, int_vec_type
, "");
1767 tmp
= lp_build_and(&intbld
, mask
, tmp
);
1768 tmp
= LLVMBuildBitCast(builder
, tmp
, vec_type
, "");
1769 res
= lp_build_sub(bld
, res
, tmp
);
1772 /* mask out sign bit */
1773 anosign
= lp_build_abs(bld
, a
);
1775 * mask out all values if anosign > 2^24
1776 * This should work both for large ints (all rounding is no-op for them
1777 * because such floats are always exact) as well as special cases like
1778 * NaNs, Infs (taking advantage of the fact they use max exponent).
1779 * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
1781 anosign
= LLVMBuildBitCast(builder
, anosign
, int_vec_type
, "");
1782 cmpval
= LLVMBuildBitCast(builder
, cmpval
, int_vec_type
, "");
1783 mask
= lp_build_cmp(&intbld
, PIPE_FUNC_GREATER
, anosign
, cmpval
);
1784 return lp_build_select(bld
, mask
, a
, res
);
1790 * Return ceiling of float (vector), returning float (vector).
1791 * Ex: ceil( 1.1) = 2.0
1792 * Ex: ceil(-1.1) = -1.0
1795 lp_build_ceil(struct lp_build_context
*bld
,
1798 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1799 const struct lp_type type
= bld
->type
;
1801 assert(type
.floating
);
1802 assert(lp_check_value(type
, a
));
1804 if (arch_rounding_available(type
)) {
1805 return lp_build_round_arch(bld
, a
, LP_BUILD_ROUND_CEIL
);
1808 const struct lp_type type
= bld
->type
;
1809 struct lp_type inttype
;
1810 struct lp_build_context intbld
;
1811 LLVMValueRef cmpval
= lp_build_const_vec(bld
->gallivm
, type
, 2^24);
1812 LLVMValueRef trunc
, res
, anosign
, mask
, tmp
;
1813 LLVMTypeRef int_vec_type
= bld
->int_vec_type
;
1814 LLVMTypeRef vec_type
= bld
->vec_type
;
1816 assert(type
.width
== 32); /* might want to handle doubles at some point */
1819 inttype
.floating
= 0;
1820 lp_build_context_init(&intbld
, bld
->gallivm
, inttype
);
1822 /* round by truncation */
1823 trunc
= LLVMBuildFPToSI(builder
, a
, int_vec_type
, "");
1824 trunc
= LLVMBuildSIToFP(builder
, trunc
, vec_type
, "ceil.trunc");
1827 * fix values if rounding is wrong (for non-special cases)
1828 * - this is the case if trunc < a
1830 mask
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, trunc
, a
);
1831 /* tmp = trunc < a ? 1.0 : 0.0 */
1832 tmp
= LLVMBuildBitCast(builder
, bld
->one
, int_vec_type
, "");
1833 tmp
= lp_build_and(&intbld
, mask
, tmp
);
1834 tmp
= LLVMBuildBitCast(builder
, tmp
, vec_type
, "");
1835 res
= lp_build_add(bld
, trunc
, tmp
);
1837 /* mask out sign bit */
1838 anosign
= lp_build_abs(bld
, a
);
1840 * mask out all values if anosign > 2^24
1841 * This should work both for large ints (all rounding is no-op for them
1842 * because such floats are always exact) as well as special cases like
1843 * NaNs, Infs (taking advantage of the fact they use max exponent).
1844 * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
1846 anosign
= LLVMBuildBitCast(builder
, anosign
, int_vec_type
, "");
1847 cmpval
= LLVMBuildBitCast(builder
, cmpval
, int_vec_type
, "");
1848 mask
= lp_build_cmp(&intbld
, PIPE_FUNC_GREATER
, anosign
, cmpval
);
1849 return lp_build_select(bld
, mask
, a
, res
);
1855 * Return fractional part of 'a' computed as a - floor(a)
1856 * Typically used in texture coord arithmetic.
1859 lp_build_fract(struct lp_build_context
*bld
,
1862 assert(bld
->type
.floating
);
1863 return lp_build_sub(bld
, a
, lp_build_floor(bld
, a
));
1868 * Prevent returning a fractional part of 1.0 for very small negative values of
1869 * 'a' by clamping against 0.99999(9).
1871 static inline LLVMValueRef
1872 clamp_fract(struct lp_build_context
*bld
, LLVMValueRef fract
)
1876 /* this is the largest number smaller than 1.0 representable as float */
1877 max
= lp_build_const_vec(bld
->gallivm
, bld
->type
,
1878 1.0 - 1.0/(1LL << (lp_mantissa(bld
->type
) + 1)));
1879 return lp_build_min(bld
, fract
, max
);
1884 * Same as lp_build_fract, but guarantees that the result is always smaller
1888 lp_build_fract_safe(struct lp_build_context
*bld
,
1891 return clamp_fract(bld
, lp_build_fract(bld
, a
));
1896 * Return the integer part of a float (vector) value (== round toward zero).
1897 * The returned value is an integer (vector).
1898 * Ex: itrunc(-1.5) = -1
1901 lp_build_itrunc(struct lp_build_context
*bld
,
1904 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1905 const struct lp_type type
= bld
->type
;
1906 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
1908 assert(type
.floating
);
1909 assert(lp_check_value(type
, a
));
1911 return LLVMBuildFPToSI(builder
, a
, int_vec_type
, "");
1916 * Return float (vector) rounded to nearest integer (vector). The returned
1917 * value is an integer (vector).
1918 * Ex: iround(0.9) = 1
1919 * Ex: iround(-1.5) = -2
1922 lp_build_iround(struct lp_build_context
*bld
,
1925 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1926 const struct lp_type type
= bld
->type
;
1927 LLVMTypeRef int_vec_type
= bld
->int_vec_type
;
1930 assert(type
.floating
);
1932 assert(lp_check_value(type
, a
));
1934 if ((util_cpu_caps
.has_sse2
&&
1935 ((type
.width
== 32) && (type
.length
== 1 || type
.length
== 4))) ||
1936 (util_cpu_caps
.has_avx
&& type
.width
== 32 && type
.length
== 8)) {
1937 return lp_build_iround_nearest_sse2(bld
, a
);
1939 if (arch_rounding_available(type
)) {
1940 res
= lp_build_round_arch(bld
, a
, LP_BUILD_ROUND_NEAREST
);
1945 half
= lp_build_const_vec(bld
->gallivm
, type
, 0.5);
1948 LLVMTypeRef vec_type
= bld
->vec_type
;
1949 LLVMValueRef mask
= lp_build_const_int_vec(bld
->gallivm
, type
,
1950 (unsigned long long)1 << (type
.width
- 1));
1954 sign
= LLVMBuildBitCast(builder
, a
, int_vec_type
, "");
1955 sign
= LLVMBuildAnd(builder
, sign
, mask
, "");
1958 half
= LLVMBuildBitCast(builder
, half
, int_vec_type
, "");
1959 half
= LLVMBuildOr(builder
, sign
, half
, "");
1960 half
= LLVMBuildBitCast(builder
, half
, vec_type
, "");
1963 res
= LLVMBuildFAdd(builder
, a
, half
, "");
1966 res
= LLVMBuildFPToSI(builder
, res
, int_vec_type
, "");
1973 * Return floor of float (vector), result is an int (vector)
1974 * Ex: ifloor(1.1) = 1.0
1975 * Ex: ifloor(-1.1) = -2.0
1978 lp_build_ifloor(struct lp_build_context
*bld
,
1981 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1982 const struct lp_type type
= bld
->type
;
1983 LLVMTypeRef int_vec_type
= bld
->int_vec_type
;
1986 assert(type
.floating
);
1987 assert(lp_check_value(type
, a
));
1991 if (arch_rounding_available(type
)) {
1992 res
= lp_build_round_arch(bld
, a
, LP_BUILD_ROUND_FLOOR
);
1995 struct lp_type inttype
;
1996 struct lp_build_context intbld
;
1997 LLVMValueRef trunc
, itrunc
, mask
;
1999 assert(type
.floating
);
2000 assert(lp_check_value(type
, a
));
2003 inttype
.floating
= 0;
2004 lp_build_context_init(&intbld
, bld
->gallivm
, inttype
);
2006 /* round by truncation */
2007 itrunc
= LLVMBuildFPToSI(builder
, a
, int_vec_type
, "");
2008 trunc
= LLVMBuildSIToFP(builder
, itrunc
, bld
->vec_type
, "ifloor.trunc");
2011 * fix values if rounding is wrong (for non-special cases)
2012 * - this is the case if trunc > a
2013 * The results of doing this with NaNs, very large values etc.
2014 * are undefined but this seems to be the case anyway.
2016 mask
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, trunc
, a
);
2017 /* cheapie minus one with mask since the mask is minus one / zero */
2018 return lp_build_add(&intbld
, itrunc
, mask
);
2022 /* round to nearest (toward zero) */
2023 res
= LLVMBuildFPToSI(builder
, res
, int_vec_type
, "ifloor.res");
2030 * Return ceiling of float (vector), returning int (vector).
2031 * Ex: iceil( 1.1) = 2
2032 * Ex: iceil(-1.1) = -1
2035 lp_build_iceil(struct lp_build_context
*bld
,
2038 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2039 const struct lp_type type
= bld
->type
;
2040 LLVMTypeRef int_vec_type
= bld
->int_vec_type
;
2043 assert(type
.floating
);
2044 assert(lp_check_value(type
, a
));
2046 if (arch_rounding_available(type
)) {
2047 res
= lp_build_round_arch(bld
, a
, LP_BUILD_ROUND_CEIL
);
2050 struct lp_type inttype
;
2051 struct lp_build_context intbld
;
2052 LLVMValueRef trunc
, itrunc
, mask
;
2054 assert(type
.floating
);
2055 assert(lp_check_value(type
, a
));
2058 inttype
.floating
= 0;
2059 lp_build_context_init(&intbld
, bld
->gallivm
, inttype
);
2061 /* round by truncation */
2062 itrunc
= LLVMBuildFPToSI(builder
, a
, int_vec_type
, "");
2063 trunc
= LLVMBuildSIToFP(builder
, itrunc
, bld
->vec_type
, "iceil.trunc");
2066 * fix values if rounding is wrong (for non-special cases)
2067 * - this is the case if trunc < a
2068 * The results of doing this with NaNs, very large values etc.
2069 * are undefined but this seems to be the case anyway.
2071 mask
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, trunc
, a
);
2072 /* cheapie plus one with mask since the mask is minus one / zero */
2073 return lp_build_sub(&intbld
, itrunc
, mask
);
2076 /* round to nearest (toward zero) */
2077 res
= LLVMBuildFPToSI(builder
, res
, int_vec_type
, "iceil.res");
2084 * Combined ifloor() & fract().
2086 * Preferred to calling the functions separately, as it will ensure that the
2087 * strategy (floor() vs ifloor()) that results in less redundant work is used.
2090 lp_build_ifloor_fract(struct lp_build_context
*bld
,
2092 LLVMValueRef
*out_ipart
,
2093 LLVMValueRef
*out_fpart
)
2095 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2096 const struct lp_type type
= bld
->type
;
2099 assert(type
.floating
);
2100 assert(lp_check_value(type
, a
));
2102 if (arch_rounding_available(type
)) {
2104 * floor() is easier.
2107 ipart
= lp_build_floor(bld
, a
);
2108 *out_fpart
= LLVMBuildFSub(builder
, a
, ipart
, "fpart");
2109 *out_ipart
= LLVMBuildFPToSI(builder
, ipart
, bld
->int_vec_type
, "ipart");
2113 * ifloor() is easier.
2116 *out_ipart
= lp_build_ifloor(bld
, a
);
2117 ipart
= LLVMBuildSIToFP(builder
, *out_ipart
, bld
->vec_type
, "ipart");
2118 *out_fpart
= LLVMBuildFSub(builder
, a
, ipart
, "fpart");
2124 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2125 * always smaller than one.
2128 lp_build_ifloor_fract_safe(struct lp_build_context
*bld
,
2130 LLVMValueRef
*out_ipart
,
2131 LLVMValueRef
*out_fpart
)
2133 lp_build_ifloor_fract(bld
, a
, out_ipart
, out_fpart
);
2134 *out_fpart
= clamp_fract(bld
, *out_fpart
);
2139 lp_build_sqrt(struct lp_build_context
*bld
,
2142 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2143 const struct lp_type type
= bld
->type
;
2144 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
2147 assert(lp_check_value(type
, a
));
2149 /* TODO: optimize the constant case */
2151 assert(type
.floating
);
2152 if (type
.length
== 1) {
2153 util_snprintf(intrinsic
, sizeof intrinsic
, "llvm.sqrt.f%u", type
.width
);
2156 util_snprintf(intrinsic
, sizeof intrinsic
, "llvm.sqrt.v%uf%u", type
.length
, type
.width
);
2159 return lp_build_intrinsic_unary(builder
, intrinsic
, vec_type
, a
);
2164 * Do one Newton-Raphson step to improve reciprocate precision:
2166 * x_{i+1} = x_i * (2 - a * x_i)
2168 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2169 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
2170 * such as Google Earth, which does RCP(RSQRT(0.0) when drawing the Earth's
2171 * halo. It would be necessary to clamp the argument to prevent this.
2174 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2175 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2177 static INLINE LLVMValueRef
2178 lp_build_rcp_refine(struct lp_build_context
*bld
,
2182 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2183 LLVMValueRef two
= lp_build_const_vec(bld
->gallivm
, bld
->type
, 2.0);
2186 res
= LLVMBuildFMul(builder
, a
, rcp_a
, "");
2187 res
= LLVMBuildFSub(builder
, two
, res
, "");
2188 res
= LLVMBuildFMul(builder
, rcp_a
, res
, "");
2195 lp_build_rcp(struct lp_build_context
*bld
,
2198 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2199 const struct lp_type type
= bld
->type
;
2201 assert(lp_check_value(type
, a
));
2210 assert(type
.floating
);
2212 if(LLVMIsConstant(a
))
2213 return LLVMConstFDiv(bld
->one
, a
);
2216 * We don't use RCPPS because:
2217 * - it only has 10bits of precision
2218 * - it doesn't even get the reciprocate of 1.0 exactly
2219 * - doing Newton-Rapshon steps yields wrong (NaN) values for 0.0 or Inf
2220 * - for recent processors the benefit over DIVPS is marginal, a case
2223 * We could still use it on certain processors if benchmarks show that the
2224 * RCPPS plus necessary workarounds are still preferrable to DIVPS; or for
2225 * particular uses that require less workarounds.
2228 if (FALSE
&& ((util_cpu_caps
.has_sse
&& type
.width
== 32 && type
.length
== 4) ||
2229 (util_cpu_caps
.has_avx
&& type
.width
== 32 && type
.length
== 8))){
2230 const unsigned num_iterations
= 0;
2233 const char *intrinsic
= NULL
;
2235 if (type
.length
== 4) {
2236 intrinsic
= "llvm.x86.sse.rcp.ps";
2239 intrinsic
= "llvm.x86.avx.rcp.ps.256";
2242 res
= lp_build_intrinsic_unary(builder
, intrinsic
, bld
->vec_type
, a
);
2244 for (i
= 0; i
< num_iterations
; ++i
) {
2245 res
= lp_build_rcp_refine(bld
, a
, res
);
2251 return LLVMBuildFDiv(builder
, bld
->one
, a
, "");
2256 * Do one Newton-Raphson step to improve rsqrt precision:
2258 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2260 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2262 static INLINE LLVMValueRef
2263 lp_build_rsqrt_refine(struct lp_build_context
*bld
,
2265 LLVMValueRef rsqrt_a
)
2267 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2268 LLVMValueRef half
= lp_build_const_vec(bld
->gallivm
, bld
->type
, 0.5);
2269 LLVMValueRef three
= lp_build_const_vec(bld
->gallivm
, bld
->type
, 3.0);
2272 res
= LLVMBuildFMul(builder
, rsqrt_a
, rsqrt_a
, "");
2273 res
= LLVMBuildFMul(builder
, a
, res
, "");
2274 res
= LLVMBuildFSub(builder
, three
, res
, "");
2275 res
= LLVMBuildFMul(builder
, rsqrt_a
, res
, "");
2276 res
= LLVMBuildFMul(builder
, half
, res
, "");
2283 * Generate 1/sqrt(a).
2284 * Result is undefined for values < 0, infinity for +0.
2287 lp_build_rsqrt(struct lp_build_context
*bld
,
2290 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2291 const struct lp_type type
= bld
->type
;
2293 assert(lp_check_value(type
, a
));
2295 assert(type
.floating
);
2298 * This should be faster but all denormals will end up as infinity.
2300 if (0 && ((util_cpu_caps
.has_sse
&& type
.width
== 32 && type
.length
== 4) ||
2301 (util_cpu_caps
.has_avx
&& type
.width
== 32 && type
.length
== 8))) {
2302 const unsigned num_iterations
= 1;
2305 const char *intrinsic
= NULL
;
2307 if (type
.length
== 4) {
2308 intrinsic
= "llvm.x86.sse.rsqrt.ps";
2311 intrinsic
= "llvm.x86.avx.rsqrt.ps.256";
2313 if (num_iterations
) {
2315 * Newton-Raphson will result in NaN instead of infinity for zero,
2316 * and NaN instead of zero for infinity.
2317 * Also, need to ensure rsqrt(1.0) == 1.0.
2318 * All numbers smaller than FLT_MIN will result in +infinity
2319 * (rsqrtps treats all denormals as zero).
2322 * Certain non-c99 compilers don't know INFINITY and might not support
2323 * hacks to evaluate it at compile time neither.
2325 const unsigned posinf_int
= 0x7F800000;
2327 LLVMValueRef flt_min
= lp_build_const_vec(bld
->gallivm
, type
, FLT_MIN
);
2328 LLVMValueRef inf
= lp_build_const_int_vec(bld
->gallivm
, type
, posinf_int
);
2330 inf
= LLVMBuildBitCast(builder
, inf
, lp_build_vec_type(bld
->gallivm
, type
), "");
2332 res
= lp_build_intrinsic_unary(builder
, intrinsic
, bld
->vec_type
, a
);
2334 for (i
= 0; i
< num_iterations
; ++i
) {
2335 res
= lp_build_rsqrt_refine(bld
, a
, res
);
2337 cmp
= lp_build_compare(bld
->gallivm
, type
, PIPE_FUNC_LESS
, a
, flt_min
);
2338 res
= lp_build_select(bld
, cmp
, inf
, res
);
2339 cmp
= lp_build_compare(bld
->gallivm
, type
, PIPE_FUNC_EQUAL
, a
, inf
);
2340 res
= lp_build_select(bld
, cmp
, bld
->zero
, res
);
2341 cmp
= lp_build_compare(bld
->gallivm
, type
, PIPE_FUNC_EQUAL
, a
, bld
->one
);
2342 res
= lp_build_select(bld
, cmp
, bld
->one
, res
);
2345 /* rsqrt(1.0) != 1.0 here */
2346 res
= lp_build_intrinsic_unary(builder
, intrinsic
, bld
->vec_type
, a
);
2353 return lp_build_rcp(bld
, lp_build_sqrt(bld
, a
));
2358 * Generate sin(a) using SSE2
2361 lp_build_sin(struct lp_build_context
*bld
,
2364 struct gallivm_state
*gallivm
= bld
->gallivm
;
2365 LLVMBuilderRef builder
= gallivm
->builder
;
2366 struct lp_type int_type
= lp_int_type(bld
->type
);
2367 LLVMBuilderRef b
= builder
;
2370 * take the absolute value,
2371 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2374 LLVMValueRef inv_sig_mask
= lp_build_const_int_vec(gallivm
, bld
->type
, ~0x80000000);
2375 LLVMValueRef a_v4si
= LLVMBuildBitCast(b
, a
, bld
->int_vec_type
, "a_v4si");
2377 LLVMValueRef absi
= LLVMBuildAnd(b
, a_v4si
, inv_sig_mask
, "absi");
2378 LLVMValueRef x_abs
= LLVMBuildBitCast(b
, absi
, bld
->vec_type
, "x_abs");
2381 * extract the sign bit (upper one)
2382 * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
2384 LLVMValueRef sig_mask
= lp_build_const_int_vec(gallivm
, bld
->type
, 0x80000000);
2385 LLVMValueRef sign_bit_i
= LLVMBuildAnd(b
, a_v4si
, sig_mask
, "sign_bit_i");
2389 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2392 LLVMValueRef FOPi
= lp_build_const_vec(gallivm
, bld
->type
, 1.27323954473516);
2393 LLVMValueRef scale_y
= LLVMBuildFMul(b
, x_abs
, FOPi
, "scale_y");
2396 * store the integer part of y in mm0
2397 * emm2 = _mm_cvttps_epi32(y);
2400 LLVMValueRef emm2_i
= LLVMBuildFPToSI(b
, scale_y
, bld
->int_vec_type
, "emm2_i");
2403 * j=(j+1) & (~1) (see the cephes sources)
2404 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2407 LLVMValueRef all_one
= lp_build_const_int_vec(gallivm
, bld
->type
, 1);
2408 LLVMValueRef emm2_add
= LLVMBuildAdd(b
, emm2_i
, all_one
, "emm2_add");
2410 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2412 LLVMValueRef inv_one
= lp_build_const_int_vec(gallivm
, bld
->type
, ~1);
2413 LLVMValueRef emm2_and
= LLVMBuildAnd(b
, emm2_add
, inv_one
, "emm2_and");
2416 * y = _mm_cvtepi32_ps(emm2);
2418 LLVMValueRef y_2
= LLVMBuildSIToFP(b
, emm2_and
, bld
->vec_type
, "y_2");
2420 /* get the swap sign flag
2421 * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
2423 LLVMValueRef pi32_4
= lp_build_const_int_vec(gallivm
, bld
->type
, 4);
2424 LLVMValueRef emm0_and
= LLVMBuildAnd(b
, emm2_add
, pi32_4
, "emm0_and");
2427 * emm2 = _mm_slli_epi32(emm0, 29);
2429 LLVMValueRef const_29
= lp_build_const_int_vec(gallivm
, bld
->type
, 29);
2430 LLVMValueRef swap_sign_bit
= LLVMBuildShl(b
, emm0_and
, const_29
, "swap_sign_bit");
2433 * get the polynom selection mask
2434 * there is one polynom for 0 <= x <= Pi/4
2435 * and another one for Pi/4<x<=Pi/2
2436 * Both branches will be computed.
2438 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2439 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2442 LLVMValueRef pi32_2
= lp_build_const_int_vec(gallivm
, bld
->type
, 2);
2443 LLVMValueRef emm2_3
= LLVMBuildAnd(b
, emm2_and
, pi32_2
, "emm2_3");
2444 LLVMValueRef poly_mask
= lp_build_compare(gallivm
,
2445 int_type
, PIPE_FUNC_EQUAL
,
2446 emm2_3
, lp_build_const_int_vec(gallivm
, bld
->type
, 0));
2448 * sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
2450 LLVMValueRef sign_bit_1
= LLVMBuildXor(b
, sign_bit_i
, swap_sign_bit
, "sign_bit");
2453 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2454 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2455 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2457 LLVMValueRef DP1
= lp_build_const_vec(gallivm
, bld
->type
, -0.78515625);
2458 LLVMValueRef DP2
= lp_build_const_vec(gallivm
, bld
->type
, -2.4187564849853515625e-4);
2459 LLVMValueRef DP3
= lp_build_const_vec(gallivm
, bld
->type
, -3.77489497744594108e-8);
2462 * The magic pass: "Extended precision modular arithmetic"
2463 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2464 * xmm1 = _mm_mul_ps(y, xmm1);
2465 * xmm2 = _mm_mul_ps(y, xmm2);
2466 * xmm3 = _mm_mul_ps(y, xmm3);
2468 LLVMValueRef xmm1
= LLVMBuildFMul(b
, y_2
, DP1
, "xmm1");
2469 LLVMValueRef xmm2
= LLVMBuildFMul(b
, y_2
, DP2
, "xmm2");
2470 LLVMValueRef xmm3
= LLVMBuildFMul(b
, y_2
, DP3
, "xmm3");
2473 * x = _mm_add_ps(x, xmm1);
2474 * x = _mm_add_ps(x, xmm2);
2475 * x = _mm_add_ps(x, xmm3);
2478 LLVMValueRef x_1
= LLVMBuildFAdd(b
, x_abs
, xmm1
, "x_1");
2479 LLVMValueRef x_2
= LLVMBuildFAdd(b
, x_1
, xmm2
, "x_2");
2480 LLVMValueRef x_3
= LLVMBuildFAdd(b
, x_2
, xmm3
, "x_3");
2483 * Evaluate the first polynom (0 <= x <= Pi/4)
2485 * z = _mm_mul_ps(x,x);
2487 LLVMValueRef z
= LLVMBuildFMul(b
, x_3
, x_3
, "z");
2490 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2491 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2492 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2494 LLVMValueRef coscof_p0
= lp_build_const_vec(gallivm
, bld
->type
, 2.443315711809948E-005);
2495 LLVMValueRef coscof_p1
= lp_build_const_vec(gallivm
, bld
->type
, -1.388731625493765E-003);
2496 LLVMValueRef coscof_p2
= lp_build_const_vec(gallivm
, bld
->type
, 4.166664568298827E-002);
2499 * y = *(v4sf*)_ps_coscof_p0;
2500 * y = _mm_mul_ps(y, z);
2502 LLVMValueRef y_3
= LLVMBuildFMul(b
, z
, coscof_p0
, "y_3");
2503 LLVMValueRef y_4
= LLVMBuildFAdd(b
, y_3
, coscof_p1
, "y_4");
2504 LLVMValueRef y_5
= LLVMBuildFMul(b
, y_4
, z
, "y_5");
2505 LLVMValueRef y_6
= LLVMBuildFAdd(b
, y_5
, coscof_p2
, "y_6");
2506 LLVMValueRef y_7
= LLVMBuildFMul(b
, y_6
, z
, "y_7");
2507 LLVMValueRef y_8
= LLVMBuildFMul(b
, y_7
, z
, "y_8");
2511 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2512 * y = _mm_sub_ps(y, tmp);
2513 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2515 LLVMValueRef half
= lp_build_const_vec(gallivm
, bld
->type
, 0.5);
2516 LLVMValueRef tmp
= LLVMBuildFMul(b
, z
, half
, "tmp");
2517 LLVMValueRef y_9
= LLVMBuildFSub(b
, y_8
, tmp
, "y_8");
2518 LLVMValueRef one
= lp_build_const_vec(gallivm
, bld
->type
, 1.0);
2519 LLVMValueRef y_10
= LLVMBuildFAdd(b
, y_9
, one
, "y_9");
2522 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2523 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2524 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2526 LLVMValueRef sincof_p0
= lp_build_const_vec(gallivm
, bld
->type
, -1.9515295891E-4);
2527 LLVMValueRef sincof_p1
= lp_build_const_vec(gallivm
, bld
->type
, 8.3321608736E-3);
2528 LLVMValueRef sincof_p2
= lp_build_const_vec(gallivm
, bld
->type
, -1.6666654611E-1);
2531 * Evaluate the second polynom (Pi/4 <= x <= 0)
2533 * y2 = *(v4sf*)_ps_sincof_p0;
2534 * y2 = _mm_mul_ps(y2, z);
2535 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2536 * y2 = _mm_mul_ps(y2, z);
2537 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2538 * y2 = _mm_mul_ps(y2, z);
2539 * y2 = _mm_mul_ps(y2, x);
2540 * y2 = _mm_add_ps(y2, x);
2543 LLVMValueRef y2_3
= LLVMBuildFMul(b
, z
, sincof_p0
, "y2_3");
2544 LLVMValueRef y2_4
= LLVMBuildFAdd(b
, y2_3
, sincof_p1
, "y2_4");
2545 LLVMValueRef y2_5
= LLVMBuildFMul(b
, y2_4
, z
, "y2_5");
2546 LLVMValueRef y2_6
= LLVMBuildFAdd(b
, y2_5
, sincof_p2
, "y2_6");
2547 LLVMValueRef y2_7
= LLVMBuildFMul(b
, y2_6
, z
, "y2_7");
2548 LLVMValueRef y2_8
= LLVMBuildFMul(b
, y2_7
, x_3
, "y2_8");
2549 LLVMValueRef y2_9
= LLVMBuildFAdd(b
, y2_8
, x_3
, "y2_9");
2552 * select the correct result from the two polynoms
2554 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2555 * y = _mm_andnot_ps(xmm3, y);
2556 * y = _mm_add_ps(y,y2);
2558 LLVMValueRef y2_i
= LLVMBuildBitCast(b
, y2_9
, bld
->int_vec_type
, "y2_i");
2559 LLVMValueRef y_i
= LLVMBuildBitCast(b
, y_10
, bld
->int_vec_type
, "y_i");
2560 LLVMValueRef y2_and
= LLVMBuildAnd(b
, y2_i
, poly_mask
, "y2_and");
2561 LLVMValueRef inv
= lp_build_const_int_vec(gallivm
, bld
->type
, ~0);
2562 LLVMValueRef poly_mask_inv
= LLVMBuildXor(b
, poly_mask
, inv
, "poly_mask_inv");
2563 LLVMValueRef y_and
= LLVMBuildAnd(b
, y_i
, poly_mask_inv
, "y_and");
2564 LLVMValueRef y_combine
= LLVMBuildAdd(b
, y_and
, y2_and
, "y_combine");
2568 * y = _mm_xor_ps(y, sign_bit);
2570 LLVMValueRef y_sign
= LLVMBuildXor(b
, y_combine
, sign_bit_1
, "y_sin");
2571 LLVMValueRef y_result
= LLVMBuildBitCast(b
, y_sign
, bld
->vec_type
, "y_result");
2577 * Generate cos(a) using SSE2
2580 lp_build_cos(struct lp_build_context
*bld
,
2583 struct gallivm_state
*gallivm
= bld
->gallivm
;
2584 LLVMBuilderRef builder
= gallivm
->builder
;
2585 struct lp_type int_type
= lp_int_type(bld
->type
);
2586 LLVMBuilderRef b
= builder
;
2589 * take the absolute value,
2590 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2593 LLVMValueRef inv_sig_mask
= lp_build_const_int_vec(gallivm
, bld
->type
, ~0x80000000);
2594 LLVMValueRef a_v4si
= LLVMBuildBitCast(b
, a
, bld
->int_vec_type
, "a_v4si");
2596 LLVMValueRef absi
= LLVMBuildAnd(b
, a_v4si
, inv_sig_mask
, "absi");
2597 LLVMValueRef x_abs
= LLVMBuildBitCast(b
, absi
, bld
->vec_type
, "x_abs");
2601 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2604 LLVMValueRef FOPi
= lp_build_const_vec(gallivm
, bld
->type
, 1.27323954473516);
2605 LLVMValueRef scale_y
= LLVMBuildFMul(b
, x_abs
, FOPi
, "scale_y");
2608 * store the integer part of y in mm0
2609 * emm2 = _mm_cvttps_epi32(y);
2612 LLVMValueRef emm2_i
= LLVMBuildFPToSI(b
, scale_y
, bld
->int_vec_type
, "emm2_i");
2615 * j=(j+1) & (~1) (see the cephes sources)
2616 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2619 LLVMValueRef all_one
= lp_build_const_int_vec(gallivm
, bld
->type
, 1);
2620 LLVMValueRef emm2_add
= LLVMBuildAdd(b
, emm2_i
, all_one
, "emm2_add");
2622 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2624 LLVMValueRef inv_one
= lp_build_const_int_vec(gallivm
, bld
->type
, ~1);
2625 LLVMValueRef emm2_and
= LLVMBuildAnd(b
, emm2_add
, inv_one
, "emm2_and");
2628 * y = _mm_cvtepi32_ps(emm2);
2630 LLVMValueRef y_2
= LLVMBuildSIToFP(b
, emm2_and
, bld
->vec_type
, "y_2");
2634 * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
2636 LLVMValueRef const_2
= lp_build_const_int_vec(gallivm
, bld
->type
, 2);
2637 LLVMValueRef emm2_2
= LLVMBuildSub(b
, emm2_and
, const_2
, "emm2_2");
2640 /* get the swap sign flag
2641 * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
2643 LLVMValueRef inv
= lp_build_const_int_vec(gallivm
, bld
->type
, ~0);
2644 LLVMValueRef emm0_not
= LLVMBuildXor(b
, emm2_2
, inv
, "emm0_not");
2645 LLVMValueRef pi32_4
= lp_build_const_int_vec(gallivm
, bld
->type
, 4);
2646 LLVMValueRef emm0_and
= LLVMBuildAnd(b
, emm0_not
, pi32_4
, "emm0_and");
2649 * emm2 = _mm_slli_epi32(emm0, 29);
2651 LLVMValueRef const_29
= lp_build_const_int_vec(gallivm
, bld
->type
, 29);
2652 LLVMValueRef sign_bit
= LLVMBuildShl(b
, emm0_and
, const_29
, "sign_bit");
2655 * get the polynom selection mask
2656 * there is one polynom for 0 <= x <= Pi/4
2657 * and another one for Pi/4<x<=Pi/2
2658 * Both branches will be computed.
2660 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2661 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2664 LLVMValueRef pi32_2
= lp_build_const_int_vec(gallivm
, bld
->type
, 2);
2665 LLVMValueRef emm2_3
= LLVMBuildAnd(b
, emm2_2
, pi32_2
, "emm2_3");
2666 LLVMValueRef poly_mask
= lp_build_compare(gallivm
,
2667 int_type
, PIPE_FUNC_EQUAL
,
2668 emm2_3
, lp_build_const_int_vec(gallivm
, bld
->type
, 0));
2671 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2672 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2673 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2675 LLVMValueRef DP1
= lp_build_const_vec(gallivm
, bld
->type
, -0.78515625);
2676 LLVMValueRef DP2
= lp_build_const_vec(gallivm
, bld
->type
, -2.4187564849853515625e-4);
2677 LLVMValueRef DP3
= lp_build_const_vec(gallivm
, bld
->type
, -3.77489497744594108e-8);
2680 * The magic pass: "Extended precision modular arithmetic"
2681 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2682 * xmm1 = _mm_mul_ps(y, xmm1);
2683 * xmm2 = _mm_mul_ps(y, xmm2);
2684 * xmm3 = _mm_mul_ps(y, xmm3);
2686 LLVMValueRef xmm1
= LLVMBuildFMul(b
, y_2
, DP1
, "xmm1");
2687 LLVMValueRef xmm2
= LLVMBuildFMul(b
, y_2
, DP2
, "xmm2");
2688 LLVMValueRef xmm3
= LLVMBuildFMul(b
, y_2
, DP3
, "xmm3");
2691 * x = _mm_add_ps(x, xmm1);
2692 * x = _mm_add_ps(x, xmm2);
2693 * x = _mm_add_ps(x, xmm3);
2696 LLVMValueRef x_1
= LLVMBuildFAdd(b
, x_abs
, xmm1
, "x_1");
2697 LLVMValueRef x_2
= LLVMBuildFAdd(b
, x_1
, xmm2
, "x_2");
2698 LLVMValueRef x_3
= LLVMBuildFAdd(b
, x_2
, xmm3
, "x_3");
2701 * Evaluate the first polynom (0 <= x <= Pi/4)
2703 * z = _mm_mul_ps(x,x);
2705 LLVMValueRef z
= LLVMBuildFMul(b
, x_3
, x_3
, "z");
2708 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2709 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2710 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2712 LLVMValueRef coscof_p0
= lp_build_const_vec(gallivm
, bld
->type
, 2.443315711809948E-005);
2713 LLVMValueRef coscof_p1
= lp_build_const_vec(gallivm
, bld
->type
, -1.388731625493765E-003);
2714 LLVMValueRef coscof_p2
= lp_build_const_vec(gallivm
, bld
->type
, 4.166664568298827E-002);
2717 * y = *(v4sf*)_ps_coscof_p0;
2718 * y = _mm_mul_ps(y, z);
2720 LLVMValueRef y_3
= LLVMBuildFMul(b
, z
, coscof_p0
, "y_3");
2721 LLVMValueRef y_4
= LLVMBuildFAdd(b
, y_3
, coscof_p1
, "y_4");
2722 LLVMValueRef y_5
= LLVMBuildFMul(b
, y_4
, z
, "y_5");
2723 LLVMValueRef y_6
= LLVMBuildFAdd(b
, y_5
, coscof_p2
, "y_6");
2724 LLVMValueRef y_7
= LLVMBuildFMul(b
, y_6
, z
, "y_7");
2725 LLVMValueRef y_8
= LLVMBuildFMul(b
, y_7
, z
, "y_8");
2729 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2730 * y = _mm_sub_ps(y, tmp);
2731 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2733 LLVMValueRef half
= lp_build_const_vec(gallivm
, bld
->type
, 0.5);
2734 LLVMValueRef tmp
= LLVMBuildFMul(b
, z
, half
, "tmp");
2735 LLVMValueRef y_9
= LLVMBuildFSub(b
, y_8
, tmp
, "y_8");
2736 LLVMValueRef one
= lp_build_const_vec(gallivm
, bld
->type
, 1.0);
2737 LLVMValueRef y_10
= LLVMBuildFAdd(b
, y_9
, one
, "y_9");
2740 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2741 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2742 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2744 LLVMValueRef sincof_p0
= lp_build_const_vec(gallivm
, bld
->type
, -1.9515295891E-4);
2745 LLVMValueRef sincof_p1
= lp_build_const_vec(gallivm
, bld
->type
, 8.3321608736E-3);
2746 LLVMValueRef sincof_p2
= lp_build_const_vec(gallivm
, bld
->type
, -1.6666654611E-1);
2749 * Evaluate the second polynom (Pi/4 <= x <= 0)
2751 * y2 = *(v4sf*)_ps_sincof_p0;
2752 * y2 = _mm_mul_ps(y2, z);
2753 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2754 * y2 = _mm_mul_ps(y2, z);
2755 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2756 * y2 = _mm_mul_ps(y2, z);
2757 * y2 = _mm_mul_ps(y2, x);
2758 * y2 = _mm_add_ps(y2, x);
2761 LLVMValueRef y2_3
= LLVMBuildFMul(b
, z
, sincof_p0
, "y2_3");
2762 LLVMValueRef y2_4
= LLVMBuildFAdd(b
, y2_3
, sincof_p1
, "y2_4");
2763 LLVMValueRef y2_5
= LLVMBuildFMul(b
, y2_4
, z
, "y2_5");
2764 LLVMValueRef y2_6
= LLVMBuildFAdd(b
, y2_5
, sincof_p2
, "y2_6");
2765 LLVMValueRef y2_7
= LLVMBuildFMul(b
, y2_6
, z
, "y2_7");
2766 LLVMValueRef y2_8
= LLVMBuildFMul(b
, y2_7
, x_3
, "y2_8");
2767 LLVMValueRef y2_9
= LLVMBuildFAdd(b
, y2_8
, x_3
, "y2_9");
2770 * select the correct result from the two polynoms
2772 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2773 * y = _mm_andnot_ps(xmm3, y);
2774 * y = _mm_add_ps(y,y2);
2776 LLVMValueRef y2_i
= LLVMBuildBitCast(b
, y2_9
, bld
->int_vec_type
, "y2_i");
2777 LLVMValueRef y_i
= LLVMBuildBitCast(b
, y_10
, bld
->int_vec_type
, "y_i");
2778 LLVMValueRef y2_and
= LLVMBuildAnd(b
, y2_i
, poly_mask
, "y2_and");
2779 LLVMValueRef poly_mask_inv
= LLVMBuildXor(b
, poly_mask
, inv
, "poly_mask_inv");
2780 LLVMValueRef y_and
= LLVMBuildAnd(b
, y_i
, poly_mask_inv
, "y_and");
2781 LLVMValueRef y_combine
= LLVMBuildAdd(b
, y_and
, y2_and
, "y_combine");
2785 * y = _mm_xor_ps(y, sign_bit);
2787 LLVMValueRef y_sign
= LLVMBuildXor(b
, y_combine
, sign_bit
, "y_sin");
2788 LLVMValueRef y_result
= LLVMBuildBitCast(b
, y_sign
, bld
->vec_type
, "y_result");
2794 * Generate pow(x, y)
2797 lp_build_pow(struct lp_build_context
*bld
,
2801 /* TODO: optimize the constant case */
2802 if (gallivm_debug
& GALLIVM_DEBUG_PERF
&&
2803 LLVMIsConstant(x
) && LLVMIsConstant(y
)) {
2804 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2808 return lp_build_exp2(bld
, lp_build_mul(bld
, lp_build_log2(bld
, x
), y
));
2816 lp_build_exp(struct lp_build_context
*bld
,
2819 /* log2(e) = 1/log(2) */
2820 LLVMValueRef log2e
= lp_build_const_vec(bld
->gallivm
, bld
->type
,
2821 1.4426950408889634);
2823 assert(lp_check_value(bld
->type
, x
));
2825 return lp_build_exp2(bld
, lp_build_mul(bld
, log2e
, x
));
2833 lp_build_log(struct lp_build_context
*bld
,
2837 LLVMValueRef log2
= lp_build_const_vec(bld
->gallivm
, bld
->type
,
2838 0.69314718055994529);
2840 assert(lp_check_value(bld
->type
, x
));
2842 return lp_build_mul(bld
, log2
, lp_build_log2(bld
, x
));
2847 * Generate polynomial.
2848 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
2851 lp_build_polynomial(struct lp_build_context
*bld
,
2853 const double *coeffs
,
2854 unsigned num_coeffs
)
2856 const struct lp_type type
= bld
->type
;
2857 LLVMValueRef even
= NULL
, odd
= NULL
;
2861 assert(lp_check_value(bld
->type
, x
));
2863 /* TODO: optimize the constant case */
2864 if (gallivm_debug
& GALLIVM_DEBUG_PERF
&&
2865 LLVMIsConstant(x
)) {
2866 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2871 * Calculate odd and even terms seperately to decrease data dependency
2873 * c[0] + x^2 * c[2] + x^4 * c[4] ...
2874 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
2876 x2
= lp_build_mul(bld
, x
, x
);
2878 for (i
= num_coeffs
; i
--; ) {
2881 coeff
= lp_build_const_vec(bld
->gallivm
, type
, coeffs
[i
]);
2885 even
= lp_build_add(bld
, coeff
, lp_build_mul(bld
, x2
, even
));
2890 odd
= lp_build_add(bld
, coeff
, lp_build_mul(bld
, x2
, odd
));
2897 return lp_build_add(bld
, lp_build_mul(bld
, odd
, x
), even
);
2906 * Minimax polynomial fit of 2**x, in range [0, 1[
2908 const double lp_build_exp2_polynomial
[] = {
2909 #if EXP_POLY_DEGREE == 5
2910 0.999999925063526176901,
2911 0.693153073200168932794,
2912 0.240153617044375388211,
2913 0.0558263180532956664775,
2914 0.00898934009049466391101,
2915 0.00187757667519147912699
2916 #elif EXP_POLY_DEGREE == 4
2917 1.00000259337069434683,
2918 0.693003834469974940458,
2919 0.24144275689150793076,
2920 0.0520114606103070150235,
2921 0.0135341679161270268764
2922 #elif EXP_POLY_DEGREE == 3
2923 0.999925218562710312959,
2924 0.695833540494823811697,
2925 0.226067155427249155588,
2926 0.0780245226406372992967
2927 #elif EXP_POLY_DEGREE == 2
2928 1.00172476321474503578,
2929 0.657636275736077639316,
2930 0.33718943461968720704
2938 lp_build_exp2_approx(struct lp_build_context
*bld
,
2940 LLVMValueRef
*p_exp2_int_part
,
2941 LLVMValueRef
*p_frac_part
,
2942 LLVMValueRef
*p_exp2
)
2944 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2945 const struct lp_type type
= bld
->type
;
2946 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
2947 LLVMValueRef ipart
= NULL
;
2948 LLVMValueRef fpart
= NULL
;
2949 LLVMValueRef expipart
= NULL
;
2950 LLVMValueRef expfpart
= NULL
;
2951 LLVMValueRef res
= NULL
;
2953 assert(lp_check_value(bld
->type
, x
));
2955 if(p_exp2_int_part
|| p_frac_part
|| p_exp2
) {
2956 /* TODO: optimize the constant case */
2957 if (gallivm_debug
& GALLIVM_DEBUG_PERF
&&
2958 LLVMIsConstant(x
)) {
2959 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2963 assert(type
.floating
&& type
.width
== 32);
2965 x
= lp_build_min(bld
, x
, lp_build_const_vec(bld
->gallivm
, type
, 129.0));
2966 x
= lp_build_max(bld
, x
, lp_build_const_vec(bld
->gallivm
, type
, -126.99999));
2968 /* ipart = floor(x) */
2969 /* fpart = x - ipart */
2970 lp_build_ifloor_fract(bld
, x
, &ipart
, &fpart
);
2973 if(p_exp2_int_part
|| p_exp2
) {
2974 /* expipart = (float) (1 << ipart) */
2975 expipart
= LLVMBuildAdd(builder
, ipart
,
2976 lp_build_const_int_vec(bld
->gallivm
, type
, 127), "");
2977 expipart
= LLVMBuildShl(builder
, expipart
,
2978 lp_build_const_int_vec(bld
->gallivm
, type
, 23), "");
2979 expipart
= LLVMBuildBitCast(builder
, expipart
, vec_type
, "");
2983 expfpart
= lp_build_polynomial(bld
, fpart
, lp_build_exp2_polynomial
,
2984 Elements(lp_build_exp2_polynomial
));
2986 res
= LLVMBuildFMul(builder
, expipart
, expfpart
, "");
2990 *p_exp2_int_part
= expipart
;
2993 *p_frac_part
= fpart
;
3001 lp_build_exp2(struct lp_build_context
*bld
,
3005 lp_build_exp2_approx(bld
, x
, NULL
, NULL
, &res
);
3011 * Extract the exponent of a IEEE-754 floating point value.
3013 * Optionally apply an integer bias.
3015 * Result is an integer value with
3017 * ifloor(log2(x)) + bias
3020 lp_build_extract_exponent(struct lp_build_context
*bld
,
3024 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
3025 const struct lp_type type
= bld
->type
;
3026 unsigned mantissa
= lp_mantissa(type
);
3029 assert(type
.floating
);
3031 assert(lp_check_value(bld
->type
, x
));
3033 x
= LLVMBuildBitCast(builder
, x
, bld
->int_vec_type
, "");
3035 res
= LLVMBuildLShr(builder
, x
,
3036 lp_build_const_int_vec(bld
->gallivm
, type
, mantissa
), "");
3037 res
= LLVMBuildAnd(builder
, res
,
3038 lp_build_const_int_vec(bld
->gallivm
, type
, 255), "");
3039 res
= LLVMBuildSub(builder
, res
,
3040 lp_build_const_int_vec(bld
->gallivm
, type
, 127 - bias
), "");
3047 * Extract the mantissa of the a floating.
3049 * Result is a floating point value with
3051 * x / floor(log2(x))
3054 lp_build_extract_mantissa(struct lp_build_context
*bld
,
3057 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
3058 const struct lp_type type
= bld
->type
;
3059 unsigned mantissa
= lp_mantissa(type
);
3060 LLVMValueRef mantmask
= lp_build_const_int_vec(bld
->gallivm
, type
,
3061 (1ULL << mantissa
) - 1);
3062 LLVMValueRef one
= LLVMConstBitCast(bld
->one
, bld
->int_vec_type
);
3065 assert(lp_check_value(bld
->type
, x
));
3067 assert(type
.floating
);
3069 x
= LLVMBuildBitCast(builder
, x
, bld
->int_vec_type
, "");
3071 /* res = x / 2**ipart */
3072 res
= LLVMBuildAnd(builder
, x
, mantmask
, "");
3073 res
= LLVMBuildOr(builder
, res
, one
, "");
3074 res
= LLVMBuildBitCast(builder
, res
, bld
->vec_type
, "");
3082 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x) ,for x in range of [0, 1/9[
3083 * These coefficients can be generate with
3084 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3086 const double lp_build_log2_polynomial
[] = {
3087 #if LOG_POLY_DEGREE == 5
3088 2.88539008148777786488L,
3089 0.961796878841293367824L,
3090 0.577058946784739859012L,
3091 0.412914355135828735411L,
3092 0.308591899232910175289L,
3093 0.352376952300281371868L,
3094 #elif LOG_POLY_DEGREE == 4
3095 2.88539009343309178325L,
3096 0.961791550404184197881L,
3097 0.577440339438736392009L,
3098 0.403343858251329912514L,
3099 0.406718052498846252698L,
3100 #elif LOG_POLY_DEGREE == 3
3101 2.88538959748872753838L,
3102 0.961932915889597772928L,
3103 0.571118517972136195241L,
3104 0.493997535084709500285L,
3111 * See http://www.devmaster.net/forums/showthread.php?p=43580
3112 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3113 * http://www.nezumi.demon.co.uk/consult/logx.htm
3116 lp_build_log2_approx(struct lp_build_context
*bld
,
3118 LLVMValueRef
*p_exp
,
3119 LLVMValueRef
*p_floor_log2
,
3120 LLVMValueRef
*p_log2
)
3122 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
3123 const struct lp_type type
= bld
->type
;
3124 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
3125 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
3127 LLVMValueRef expmask
= lp_build_const_int_vec(bld
->gallivm
, type
, 0x7f800000);
3128 LLVMValueRef mantmask
= lp_build_const_int_vec(bld
->gallivm
, type
, 0x007fffff);
3129 LLVMValueRef one
= LLVMConstBitCast(bld
->one
, int_vec_type
);
3131 LLVMValueRef i
= NULL
;
3132 LLVMValueRef y
= NULL
;
3133 LLVMValueRef z
= NULL
;
3134 LLVMValueRef exp
= NULL
;
3135 LLVMValueRef mant
= NULL
;
3136 LLVMValueRef logexp
= NULL
;
3137 LLVMValueRef logmant
= NULL
;
3138 LLVMValueRef res
= NULL
;
3140 assert(lp_check_value(bld
->type
, x
));
3142 if(p_exp
|| p_floor_log2
|| p_log2
) {
3143 /* TODO: optimize the constant case */
3144 if (gallivm_debug
& GALLIVM_DEBUG_PERF
&&
3145 LLVMIsConstant(x
)) {
3146 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3150 assert(type
.floating
&& type
.width
== 32);
3153 * We don't explicitly handle denormalized numbers. They will yield a
3154 * result in the neighbourhood of -127, which appears to be adequate
3158 i
= LLVMBuildBitCast(builder
, x
, int_vec_type
, "");
3160 /* exp = (float) exponent(x) */
3161 exp
= LLVMBuildAnd(builder
, i
, expmask
, "");
3164 if(p_floor_log2
|| p_log2
) {
3165 logexp
= LLVMBuildLShr(builder
, exp
, lp_build_const_int_vec(bld
->gallivm
, type
, 23), "");
3166 logexp
= LLVMBuildSub(builder
, logexp
, lp_build_const_int_vec(bld
->gallivm
, type
, 127), "");
3167 logexp
= LLVMBuildSIToFP(builder
, logexp
, vec_type
, "");
3171 /* mant = 1 + (float) mantissa(x) */
3172 mant
= LLVMBuildAnd(builder
, i
, mantmask
, "");
3173 mant
= LLVMBuildOr(builder
, mant
, one
, "");
3174 mant
= LLVMBuildBitCast(builder
, mant
, vec_type
, "");
3176 /* y = (mant - 1) / (mant + 1) */
3177 y
= lp_build_div(bld
,
3178 lp_build_sub(bld
, mant
, bld
->one
),
3179 lp_build_add(bld
, mant
, bld
->one
)
3183 z
= lp_build_mul(bld
, y
, y
);
3186 logmant
= lp_build_polynomial(bld
, z
, lp_build_log2_polynomial
,
3187 Elements(lp_build_log2_polynomial
));
3189 /* logmant = y * P(z) */
3190 logmant
= lp_build_mul(bld
, y
, logmant
);
3192 res
= lp_build_add(bld
, logmant
, logexp
);
3196 exp
= LLVMBuildBitCast(builder
, exp
, vec_type
, "");
3201 *p_floor_log2
= logexp
;
3209 lp_build_log2(struct lp_build_context
*bld
,
3213 lp_build_log2_approx(bld
, x
, NULL
, NULL
, &res
);
3219 * Faster (and less accurate) log2.
3221 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3223 * Piece-wise linear approximation, with exact results when x is a
3226 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3229 lp_build_fast_log2(struct lp_build_context
*bld
,
3232 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
3236 assert(lp_check_value(bld
->type
, x
));
3238 assert(bld
->type
.floating
);
3240 /* ipart = floor(log2(x)) - 1 */
3241 ipart
= lp_build_extract_exponent(bld
, x
, -1);
3242 ipart
= LLVMBuildSIToFP(builder
, ipart
, bld
->vec_type
, "");
3244 /* fpart = x / 2**ipart */
3245 fpart
= lp_build_extract_mantissa(bld
, x
);
3248 return LLVMBuildFAdd(builder
, ipart
, fpart
, "");
3253 * Fast implementation of iround(log2(x)).
3255 * Not an approximation -- it should give accurate results all the time.
3258 lp_build_ilog2(struct lp_build_context
*bld
,
3261 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
3262 LLVMValueRef sqrt2
= lp_build_const_vec(bld
->gallivm
, bld
->type
, M_SQRT2
);
3265 assert(bld
->type
.floating
);
3267 assert(lp_check_value(bld
->type
, x
));
3269 /* x * 2^(0.5) i.e., add 0.5 to the log2(x) */
3270 x
= LLVMBuildFMul(builder
, x
, sqrt2
, "");
3272 /* ipart = floor(log2(x) + 0.5) */
3273 ipart
= lp_build_extract_exponent(bld
, x
, 0);
3279 lp_build_mod(struct lp_build_context
*bld
,
3283 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
3285 const struct lp_type type
= bld
->type
;
3287 assert(lp_check_value(type
, x
));
3288 assert(lp_check_value(type
, y
));
3291 res
= LLVMBuildFRem(builder
, x
, y
, "");
3293 res
= LLVMBuildSRem(builder
, x
, y
, "");
3295 res
= LLVMBuildURem(builder
, x
, y
, "");