1 /**************************************************************************
3 * Copyright 2009-2010 VMware, Inc.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **************************************************************************/
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
38 * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
44 * @author Jose Fonseca <jfonseca@vmware.com>
48 #include "util/u_memory.h"
49 #include "util/u_debug.h"
50 #include "util/u_math.h"
51 #include "util/u_string.h"
52 #include "util/u_cpu_detect.h"
54 #include "lp_bld_type.h"
55 #include "lp_bld_const.h"
56 #include "lp_bld_init.h"
57 #include "lp_bld_intr.h"
58 #include "lp_bld_logic.h"
59 #include "lp_bld_pack.h"
60 #include "lp_bld_debug.h"
61 #include "lp_bld_arit.h"
65 #define EXP_POLY_DEGREE 5
67 #define LOG_POLY_DEGREE 4
72 * No checks for special case values of a or b = 1 or 0 are done.
75 lp_build_min_simple(struct lp_build_context
*bld
,
79 const struct lp_type type
= bld
->type
;
80 const char *intrinsic
= NULL
;
81 unsigned intr_size
= 0;
84 assert(lp_check_value(type
, a
));
85 assert(lp_check_value(type
, b
));
87 /* TODO: optimize the constant case */
89 if (type
.floating
&& util_cpu_caps
.has_sse
) {
90 if (type
.width
== 32) {
91 if (type
.length
== 1) {
92 intrinsic
= "llvm.x86.sse.min.ss";
95 else if (type
.length
<= 4 || !util_cpu_caps
.has_avx
) {
96 intrinsic
= "llvm.x86.sse.min.ps";
100 intrinsic
= "llvm.x86.avx.min.ps.256";
104 if (type
.width
== 64 && util_cpu_caps
.has_sse2
) {
105 if (type
.length
== 1) {
106 intrinsic
= "llvm.x86.sse2.min.sd";
109 else if (type
.length
== 2 || !util_cpu_caps
.has_avx
) {
110 intrinsic
= "llvm.x86.sse2.min.pd";
114 intrinsic
= "llvm.x86.avx.min.pd.256";
119 else if (util_cpu_caps
.has_sse2
&& type
.length
>= 2) {
121 if ((type
.width
== 8 || type
.width
== 16) &&
122 (type
.width
* type
.length
<= 64) &&
123 (gallivm_debug
& GALLIVM_DEBUG_PERF
)) {
124 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
127 if (type
.width
== 8 && !type
.sign
) {
128 intrinsic
= "llvm.x86.sse2.pminu.b";
130 else if (type
.width
== 16 && type
.sign
) {
131 intrinsic
= "llvm.x86.sse2.pmins.w";
133 if (util_cpu_caps
.has_sse4_1
) {
134 if (type
.width
== 8 && type
.sign
) {
135 intrinsic
= "llvm.x86.sse41.pminsb";
137 if (type
.width
== 16 && !type
.sign
) {
138 intrinsic
= "llvm.x86.sse41.pminuw";
140 if (type
.width
== 32 && !type
.sign
) {
141 intrinsic
= "llvm.x86.sse41.pminud";
143 if (type
.width
== 32 && type
.sign
) {
144 intrinsic
= "llvm.x86.sse41.pminsd";
150 return lp_build_intrinsic_binary_anylength(bld
->gallivm
, intrinsic
,
155 cond
= lp_build_cmp(bld
, PIPE_FUNC_LESS
, a
, b
);
156 return lp_build_select(bld
, cond
, a
, b
);
162 * No checks for special case values of a or b = 1 or 0 are done.
165 lp_build_max_simple(struct lp_build_context
*bld
,
169 const struct lp_type type
= bld
->type
;
170 const char *intrinsic
= NULL
;
171 unsigned intr_size
= 0;
174 assert(lp_check_value(type
, a
));
175 assert(lp_check_value(type
, b
));
177 /* TODO: optimize the constant case */
179 if (type
.floating
&& util_cpu_caps
.has_sse
) {
180 if (type
.width
== 32) {
181 if (type
.length
== 1) {
182 intrinsic
= "llvm.x86.sse.max.ss";
185 else if (type
.length
<= 4 || !util_cpu_caps
.has_avx
) {
186 intrinsic
= "llvm.x86.sse.max.ps";
190 intrinsic
= "llvm.x86.avx.max.ps.256";
194 if (type
.width
== 64 && util_cpu_caps
.has_sse2
) {
195 if (type
.length
== 1) {
196 intrinsic
= "llvm.x86.sse2.max.sd";
199 else if (type
.length
== 2 || !util_cpu_caps
.has_avx
) {
200 intrinsic
= "llvm.x86.sse2.max.pd";
204 intrinsic
= "llvm.x86.avx.max.pd.256";
209 else if (util_cpu_caps
.has_sse2
&& type
.length
>= 2) {
211 if ((type
.width
== 8 || type
.width
== 16) &&
212 (type
.width
* type
.length
<= 64) &&
213 (gallivm_debug
& GALLIVM_DEBUG_PERF
)) {
214 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
217 if (type
.width
== 8 && !type
.sign
) {
218 intrinsic
= "llvm.x86.sse2.pmaxu.b";
221 else if (type
.width
== 16 && type
.sign
) {
222 intrinsic
= "llvm.x86.sse2.pmaxs.w";
224 if (util_cpu_caps
.has_sse4_1
) {
225 if (type
.width
== 8 && type
.sign
) {
226 intrinsic
= "llvm.x86.sse41.pmaxsb";
228 if (type
.width
== 16 && !type
.sign
) {
229 intrinsic
= "llvm.x86.sse41.pmaxuw";
231 if (type
.width
== 32 && !type
.sign
) {
232 intrinsic
= "llvm.x86.sse41.pmaxud";
234 if (type
.width
== 32 && type
.sign
) {
235 intrinsic
= "llvm.x86.sse41.pmaxsd";
241 return lp_build_intrinsic_binary_anylength(bld
->gallivm
, intrinsic
,
246 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, b
);
247 return lp_build_select(bld
, cond
, a
, b
);
252 * Generate 1 - a, or ~a depending on bld->type.
255 lp_build_comp(struct lp_build_context
*bld
,
258 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
259 const struct lp_type type
= bld
->type
;
261 assert(lp_check_value(type
, a
));
268 if(type
.norm
&& !type
.floating
&& !type
.fixed
&& !type
.sign
) {
269 if(LLVMIsConstant(a
))
270 return LLVMConstNot(a
);
272 return LLVMBuildNot(builder
, a
, "");
275 if(LLVMIsConstant(a
))
277 return LLVMConstFSub(bld
->one
, a
);
279 return LLVMConstSub(bld
->one
, a
);
282 return LLVMBuildFSub(builder
, bld
->one
, a
, "");
284 return LLVMBuildSub(builder
, bld
->one
, a
, "");
292 lp_build_add(struct lp_build_context
*bld
,
296 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
297 const struct lp_type type
= bld
->type
;
300 assert(lp_check_value(type
, a
));
301 assert(lp_check_value(type
, b
));
307 if(a
== bld
->undef
|| b
== bld
->undef
)
311 const char *intrinsic
= NULL
;
313 if(a
== bld
->one
|| b
== bld
->one
)
316 if(util_cpu_caps
.has_sse2
&&
317 type
.width
* type
.length
== 128 &&
318 !type
.floating
&& !type
.fixed
) {
320 intrinsic
= type
.sign
? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
322 intrinsic
= type
.sign
? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
326 return lp_build_intrinsic_binary(builder
, intrinsic
, lp_build_vec_type(bld
->gallivm
, bld
->type
), a
, b
);
329 if(LLVMIsConstant(a
) && LLVMIsConstant(b
))
331 res
= LLVMConstFAdd(a
, b
);
333 res
= LLVMConstAdd(a
, b
);
336 res
= LLVMBuildFAdd(builder
, a
, b
, "");
338 res
= LLVMBuildAdd(builder
, a
, b
, "");
340 /* clamp to ceiling of 1.0 */
341 if(bld
->type
.norm
&& (bld
->type
.floating
|| bld
->type
.fixed
))
342 res
= lp_build_min_simple(bld
, res
, bld
->one
);
344 /* XXX clamp to floor of -1 or 0??? */
350 /** Return the scalar sum of the elements of a.
351 * Should avoid this operation whenever possible.
354 lp_build_horizontal_add(struct lp_build_context
*bld
,
357 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
358 const struct lp_type type
= bld
->type
;
359 LLVMValueRef index
, res
;
361 LLVMValueRef shuffles1
[LP_MAX_VECTOR_LENGTH
/ 2];
362 LLVMValueRef shuffles2
[LP_MAX_VECTOR_LENGTH
/ 2];
363 LLVMValueRef vecres
, elem2
;
365 assert(lp_check_value(type
, a
));
367 if (type
.length
== 1) {
371 assert(!bld
->type
.norm
);
374 * for byte vectors can do much better with psadbw.
375 * Using repeated shuffle/adds here. Note with multiple vectors
376 * this can be done more efficiently as outlined in the intel
377 * optimization manual.
378 * Note: could cause data rearrangement if used with smaller element
383 length
= type
.length
/ 2;
385 LLVMValueRef vec1
, vec2
;
386 for (i
= 0; i
< length
; i
++) {
387 shuffles1
[i
] = lp_build_const_int32(bld
->gallivm
, i
);
388 shuffles2
[i
] = lp_build_const_int32(bld
->gallivm
, i
+ length
);
390 vec1
= LLVMBuildShuffleVector(builder
, vecres
, vecres
,
391 LLVMConstVector(shuffles1
, length
), "");
392 vec2
= LLVMBuildShuffleVector(builder
, vecres
, vecres
,
393 LLVMConstVector(shuffles2
, length
), "");
395 vecres
= LLVMBuildFAdd(builder
, vec1
, vec2
, "");
398 vecres
= LLVMBuildAdd(builder
, vec1
, vec2
, "");
400 length
= length
>> 1;
403 /* always have vector of size 2 here */
406 index
= lp_build_const_int32(bld
->gallivm
, 0);
407 res
= LLVMBuildExtractElement(builder
, vecres
, index
, "");
408 index
= lp_build_const_int32(bld
->gallivm
, 1);
409 elem2
= LLVMBuildExtractElement(builder
, vecres
, index
, "");
412 res
= LLVMBuildFAdd(builder
, res
, elem2
, "");
414 res
= LLVMBuildAdd(builder
, res
, elem2
, "");
420 * Return the horizontal sums of 4 float vectors as a float4 vector.
421 * This uses the technique outlined in the Intel Optimization Manual.
424 lp_build_horizontal_add4x4f(struct lp_build_context
*bld
,
427 struct gallivm_state
*gallivm
= bld
->gallivm
;
428 LLVMBuilderRef builder
= gallivm
->builder
;
429 LLVMValueRef shuffles
[4];
431 LLVMValueRef sumtmp
[2], shuftmp
[2];
433 /* lower half of regs */
434 shuffles
[0] = lp_build_const_int32(gallivm
, 0);
435 shuffles
[1] = lp_build_const_int32(gallivm
, 1);
436 shuffles
[2] = lp_build_const_int32(gallivm
, 4);
437 shuffles
[3] = lp_build_const_int32(gallivm
, 5);
438 tmp
[0] = LLVMBuildShuffleVector(builder
, src
[0], src
[1],
439 LLVMConstVector(shuffles
, 4), "");
440 tmp
[2] = LLVMBuildShuffleVector(builder
, src
[2], src
[3],
441 LLVMConstVector(shuffles
, 4), "");
443 /* upper half of regs */
444 shuffles
[0] = lp_build_const_int32(gallivm
, 2);
445 shuffles
[1] = lp_build_const_int32(gallivm
, 3);
446 shuffles
[2] = lp_build_const_int32(gallivm
, 6);
447 shuffles
[3] = lp_build_const_int32(gallivm
, 7);
448 tmp
[1] = LLVMBuildShuffleVector(builder
, src
[0], src
[1],
449 LLVMConstVector(shuffles
, 4), "");
450 tmp
[3] = LLVMBuildShuffleVector(builder
, src
[2], src
[3],
451 LLVMConstVector(shuffles
, 4), "");
453 sumtmp
[0] = LLVMBuildFAdd(builder
, tmp
[0], tmp
[1], "");
454 sumtmp
[1] = LLVMBuildFAdd(builder
, tmp
[2], tmp
[3], "");
456 shuffles
[0] = lp_build_const_int32(gallivm
, 0);
457 shuffles
[1] = lp_build_const_int32(gallivm
, 2);
458 shuffles
[2] = lp_build_const_int32(gallivm
, 4);
459 shuffles
[3] = lp_build_const_int32(gallivm
, 6);
460 shuftmp
[0] = LLVMBuildShuffleVector(builder
, sumtmp
[0], sumtmp
[1],
461 LLVMConstVector(shuffles
, 4), "");
463 shuffles
[0] = lp_build_const_int32(gallivm
, 1);
464 shuffles
[1] = lp_build_const_int32(gallivm
, 3);
465 shuffles
[2] = lp_build_const_int32(gallivm
, 5);
466 shuffles
[3] = lp_build_const_int32(gallivm
, 7);
467 shuftmp
[1] = LLVMBuildShuffleVector(builder
, sumtmp
[0], sumtmp
[1],
468 LLVMConstVector(shuffles
, 4), "");
470 return LLVMBuildFAdd(builder
, shuftmp
[0], shuftmp
[1], "");
475 * partially horizontally add 2-4 float vectors with length nx4,
476 * i.e. only four adjacent values in each vector will be added,
477 * assuming values are really grouped in 4 which also determines
480 * Return a vector of the same length as the initial vectors,
481 * with the excess elements (if any) being undefined.
482 * The element order is independent of number of input vectors.
483 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
484 * the output order thus will be
485 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4z7,undef
488 lp_build_hadd_partial4(struct lp_build_context
*bld
,
489 LLVMValueRef vectors
[],
492 struct gallivm_state
*gallivm
= bld
->gallivm
;
493 LLVMBuilderRef builder
= gallivm
->builder
;
494 LLVMValueRef ret_vec
;
496 const char *intrinsic
= NULL
;
498 assert(num_vecs
>= 2 && num_vecs
<= 4);
499 assert(bld
->type
.floating
);
501 /* only use this with at least 2 vectors, as it is sort of expensive
502 * (depending on cpu) and we always need two horizontal adds anyway,
503 * so a shuffle/add approach might be better.
509 tmp
[2] = num_vecs
> 2 ? vectors
[2] : vectors
[0];
510 tmp
[3] = num_vecs
> 3 ? vectors
[3] : vectors
[0];
512 if (util_cpu_caps
.has_sse3
&& bld
->type
.width
== 32 &&
513 bld
->type
.length
== 4) {
514 intrinsic
= "llvm.x86.sse3.hadd.ps";
516 else if (util_cpu_caps
.has_avx
&& bld
->type
.width
== 32 &&
517 bld
->type
.length
== 8) {
518 intrinsic
= "llvm.x86.avx.hadd.ps.256";
521 tmp
[0] = lp_build_intrinsic_binary(builder
, intrinsic
,
522 lp_build_vec_type(gallivm
, bld
->type
),
525 tmp
[1] = lp_build_intrinsic_binary(builder
, intrinsic
,
526 lp_build_vec_type(gallivm
, bld
->type
),
532 return lp_build_intrinsic_binary(builder
, intrinsic
,
533 lp_build_vec_type(gallivm
, bld
->type
),
537 if (bld
->type
.length
== 4) {
538 ret_vec
= lp_build_horizontal_add4x4f(bld
, tmp
);
541 LLVMValueRef partres
[LP_MAX_VECTOR_LENGTH
/4];
543 unsigned num_iter
= bld
->type
.length
/ 4;
544 struct lp_type parttype
= bld
->type
;
546 for (j
= 0; j
< num_iter
; j
++) {
547 LLVMValueRef partsrc
[4];
549 for (i
= 0; i
< 4; i
++) {
550 partsrc
[i
] = lp_build_extract_range(gallivm
, tmp
[i
], j
*4, 4);
552 partres
[j
] = lp_build_horizontal_add4x4f(bld
, partsrc
);
554 ret_vec
= lp_build_concat(gallivm
, partres
, parttype
, num_iter
);
563 lp_build_sub(struct lp_build_context
*bld
,
567 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
568 const struct lp_type type
= bld
->type
;
571 assert(lp_check_value(type
, a
));
572 assert(lp_check_value(type
, b
));
576 if(a
== bld
->undef
|| b
== bld
->undef
)
582 const char *intrinsic
= NULL
;
587 if(util_cpu_caps
.has_sse2
&&
588 type
.width
* type
.length
== 128 &&
589 !type
.floating
&& !type
.fixed
) {
591 intrinsic
= type
.sign
? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
593 intrinsic
= type
.sign
? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
597 return lp_build_intrinsic_binary(builder
, intrinsic
, lp_build_vec_type(bld
->gallivm
, bld
->type
), a
, b
);
600 if(LLVMIsConstant(a
) && LLVMIsConstant(b
))
602 res
= LLVMConstFSub(a
, b
);
604 res
= LLVMConstSub(a
, b
);
607 res
= LLVMBuildFSub(builder
, a
, b
, "");
609 res
= LLVMBuildSub(builder
, a
, b
, "");
611 if(bld
->type
.norm
&& (bld
->type
.floating
|| bld
->type
.fixed
))
612 res
= lp_build_max_simple(bld
, res
, bld
->zero
);
619 * Normalized 8bit multiplication.
623 * makes the following approximation to the division (Sree)
625 * a*b/255 ~= (a*(b + 1)) >> 8
627 * which is the fastest method that satisfies the following OpenGL criteria
629 * 0*0 = 0 and 255*255 = 255
633 * takes the geometric series approximation to the division
635 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
637 * in this case just the first two terms to fit in 16bit arithmetic
639 * t/255 ~= (t + (t >> 8)) >> 8
641 * note that just by itself it doesn't satisfy the OpenGL criteria, as
642 * 255*255 = 254, so the special case b = 255 must be accounted for or roundoff
645 * - geometric series plus rounding
647 * when using a geometric series division instead of truncating the result
648 * use roundoff in the approximation (Jim Blinn)
650 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
652 * achieving the exact results
654 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
655 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
656 * @sa Michael Herf, The "double blend trick", May 2000,
657 * http://www.stereopsis.com/doubleblend.html
660 lp_build_mul_u8n(struct gallivm_state
*gallivm
,
661 struct lp_type i16_type
,
662 LLVMValueRef a
, LLVMValueRef b
)
664 LLVMBuilderRef builder
= gallivm
->builder
;
668 assert(!i16_type
.floating
);
669 assert(lp_check_value(i16_type
, a
));
670 assert(lp_check_value(i16_type
, b
));
672 c8
= lp_build_const_int_vec(gallivm
, i16_type
, 8);
676 /* a*b/255 ~= (a*(b + 1)) >> 8 */
677 b
= LLVMBuildAdd(builder
, b
, lp_build_const_int_vec(gallium
, i16_type
, 1), "");
678 ab
= LLVMBuildMul(builder
, a
, b
, "");
682 /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */
683 ab
= LLVMBuildMul(builder
, a
, b
, "");
684 ab
= LLVMBuildAdd(builder
, ab
, LLVMBuildLShr(builder
, ab
, c8
, ""), "");
685 ab
= LLVMBuildAdd(builder
, ab
, lp_build_const_int_vec(gallivm
, i16_type
, 0x80), "");
689 ab
= LLVMBuildLShr(builder
, ab
, c8
, "");
699 lp_build_mul(struct lp_build_context
*bld
,
703 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
704 const struct lp_type type
= bld
->type
;
708 assert(lp_check_value(type
, a
));
709 assert(lp_check_value(type
, b
));
719 if(a
== bld
->undef
|| b
== bld
->undef
)
722 if(!type
.floating
&& !type
.fixed
&& type
.norm
) {
723 if(type
.width
== 8) {
724 struct lp_type i16_type
= lp_wider_type(type
);
725 LLVMValueRef al
, ah
, bl
, bh
, abl
, abh
, ab
;
727 lp_build_unpack2(bld
->gallivm
, type
, i16_type
, a
, &al
, &ah
);
728 lp_build_unpack2(bld
->gallivm
, type
, i16_type
, b
, &bl
, &bh
);
730 /* PMULLW, PSRLW, PADDW */
731 abl
= lp_build_mul_u8n(bld
->gallivm
, i16_type
, al
, bl
);
732 abh
= lp_build_mul_u8n(bld
->gallivm
, i16_type
, ah
, bh
);
734 ab
= lp_build_pack2(bld
->gallivm
, i16_type
, type
, abl
, abh
);
744 shift
= lp_build_const_int_vec(bld
->gallivm
, type
, type
.width
/2);
748 if(LLVMIsConstant(a
) && LLVMIsConstant(b
)) {
750 res
= LLVMConstFMul(a
, b
);
752 res
= LLVMConstMul(a
, b
);
755 res
= LLVMConstAShr(res
, shift
);
757 res
= LLVMConstLShr(res
, shift
);
762 res
= LLVMBuildFMul(builder
, a
, b
, "");
764 res
= LLVMBuildMul(builder
, a
, b
, "");
767 res
= LLVMBuildAShr(builder
, res
, shift
, "");
769 res
= LLVMBuildLShr(builder
, res
, shift
, "");
778 * Small vector x scale multiplication optimization.
781 lp_build_mul_imm(struct lp_build_context
*bld
,
785 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
788 assert(lp_check_value(bld
->type
, a
));
797 return lp_build_negate(bld
, a
);
799 if(b
== 2 && bld
->type
.floating
)
800 return lp_build_add(bld
, a
, a
);
802 if(util_is_power_of_two(b
)) {
803 unsigned shift
= ffs(b
) - 1;
805 if(bld
->type
.floating
) {
808 * Power of two multiplication by directly manipulating the exponent.
810 * XXX: This might not be always faster, it will introduce a small error
811 * for multiplication by zero, and it will produce wrong results
814 unsigned mantissa
= lp_mantissa(bld
->type
);
815 factor
= lp_build_const_int_vec(bld
->gallivm
, bld
->type
, (unsigned long long)shift
<< mantissa
);
816 a
= LLVMBuildBitCast(builder
, a
, lp_build_int_vec_type(bld
->type
), "");
817 a
= LLVMBuildAdd(builder
, a
, factor
, "");
818 a
= LLVMBuildBitCast(builder
, a
, lp_build_vec_type(bld
->gallivm
, bld
->type
), "");
823 factor
= lp_build_const_vec(bld
->gallivm
, bld
->type
, shift
);
824 return LLVMBuildShl(builder
, a
, factor
, "");
828 factor
= lp_build_const_vec(bld
->gallivm
, bld
->type
, (double)b
);
829 return lp_build_mul(bld
, a
, factor
);
837 lp_build_div(struct lp_build_context
*bld
,
841 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
842 const struct lp_type type
= bld
->type
;
844 assert(lp_check_value(type
, a
));
845 assert(lp_check_value(type
, b
));
850 return lp_build_rcp(bld
, b
);
855 if(a
== bld
->undef
|| b
== bld
->undef
)
858 if(LLVMIsConstant(a
) && LLVMIsConstant(b
)) {
860 return LLVMConstFDiv(a
, b
);
862 return LLVMConstSDiv(a
, b
);
864 return LLVMConstUDiv(a
, b
);
867 if(((util_cpu_caps
.has_sse
&& type
.width
== 32 && type
.length
== 4) ||
868 (util_cpu_caps
.has_avx
&& type
.width
== 32 && type
.length
== 8)) &&
870 return lp_build_mul(bld
, a
, lp_build_rcp(bld
, b
));
873 return LLVMBuildFDiv(builder
, a
, b
, "");
875 return LLVMBuildSDiv(builder
, a
, b
, "");
877 return LLVMBuildUDiv(builder
, a
, b
, "");
882 * Linear interpolation -- without any checks.
884 * @sa http://www.stereopsis.com/doubleblend.html
886 static INLINE LLVMValueRef
887 lp_build_lerp_simple(struct lp_build_context
*bld
,
892 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
896 assert(lp_check_value(bld
->type
, x
));
897 assert(lp_check_value(bld
->type
, v0
));
898 assert(lp_check_value(bld
->type
, v1
));
900 delta
= lp_build_sub(bld
, v1
, v0
);
902 res
= lp_build_mul(bld
, x
, delta
);
904 res
= lp_build_add(bld
, v0
, res
);
906 if (bld
->type
.fixed
) {
907 /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
908 * but it will be wrong for other uses. Basically we need a more
909 * powerful lp_type, capable of further distinguishing the values
910 * interpretation from the value storage. */
911 res
= LLVMBuildAnd(builder
, res
, lp_build_const_int_vec(bld
->gallivm
, bld
->type
, (1 << bld
->type
.width
/2) - 1), "");
919 * Linear interpolation.
922 lp_build_lerp(struct lp_build_context
*bld
,
927 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
928 const struct lp_type type
= bld
->type
;
931 assert(lp_check_value(type
, x
));
932 assert(lp_check_value(type
, v0
));
933 assert(lp_check_value(type
, v1
));
936 struct lp_type wide_type
;
937 struct lp_build_context wide_bld
;
938 LLVMValueRef xl
, xh
, v0l
, v0h
, v1l
, v1h
, resl
, resh
;
941 assert(type
.length
>= 2);
945 * Create a wider type, enough to hold the intermediate result of the
948 memset(&wide_type
, 0, sizeof wide_type
);
949 wide_type
.fixed
= TRUE
;
950 wide_type
.width
= type
.width
*2;
951 wide_type
.length
= type
.length
/2;
953 lp_build_context_init(&wide_bld
, bld
->gallivm
, wide_type
);
955 lp_build_unpack2(bld
->gallivm
, type
, wide_type
, x
, &xl
, &xh
);
956 lp_build_unpack2(bld
->gallivm
, type
, wide_type
, v0
, &v0l
, &v0h
);
957 lp_build_unpack2(bld
->gallivm
, type
, wide_type
, v1
, &v1l
, &v1h
);
960 * Scale x from [0, 255] to [0, 256]
963 shift
= lp_build_const_int_vec(bld
->gallivm
, wide_type
, type
.width
- 1);
965 xl
= lp_build_add(&wide_bld
, xl
,
966 LLVMBuildAShr(builder
, xl
, shift
, ""));
967 xh
= lp_build_add(&wide_bld
, xh
,
968 LLVMBuildAShr(builder
, xh
, shift
, ""));
974 resl
= lp_build_lerp_simple(&wide_bld
, xl
, v0l
, v1l
);
975 resh
= lp_build_lerp_simple(&wide_bld
, xh
, v0h
, v1h
);
977 res
= lp_build_pack2(bld
->gallivm
, wide_type
, type
, resl
, resh
);
979 res
= lp_build_lerp_simple(bld
, x
, v0
, v1
);
987 lp_build_lerp_2d(struct lp_build_context
*bld
,
995 LLVMValueRef v0
= lp_build_lerp(bld
, x
, v00
, v01
);
996 LLVMValueRef v1
= lp_build_lerp(bld
, x
, v10
, v11
);
997 return lp_build_lerp(bld
, y
, v0
, v1
);
1002 * Generate min(a, b)
1003 * Do checks for special cases.
1006 lp_build_min(struct lp_build_context
*bld
,
1010 assert(lp_check_value(bld
->type
, a
));
1011 assert(lp_check_value(bld
->type
, b
));
1013 if(a
== bld
->undef
|| b
== bld
->undef
)
1019 if (bld
->type
.norm
) {
1020 if (!bld
->type
.sign
) {
1021 if (a
== bld
->zero
|| b
== bld
->zero
) {
1031 return lp_build_min_simple(bld
, a
, b
);
1036 * Generate max(a, b)
1037 * Do checks for special cases.
1040 lp_build_max(struct lp_build_context
*bld
,
1044 assert(lp_check_value(bld
->type
, a
));
1045 assert(lp_check_value(bld
->type
, b
));
1047 if(a
== bld
->undef
|| b
== bld
->undef
)
1053 if(bld
->type
.norm
) {
1054 if(a
== bld
->one
|| b
== bld
->one
)
1056 if (!bld
->type
.sign
) {
1057 if (a
== bld
->zero
) {
1060 if (b
== bld
->zero
) {
1066 return lp_build_max_simple(bld
, a
, b
);
1071 * Generate clamp(a, min, max)
1072 * Do checks for special cases.
1075 lp_build_clamp(struct lp_build_context
*bld
,
1080 assert(lp_check_value(bld
->type
, a
));
1081 assert(lp_check_value(bld
->type
, min
));
1082 assert(lp_check_value(bld
->type
, max
));
1084 a
= lp_build_min(bld
, a
, max
);
1085 a
= lp_build_max(bld
, a
, min
);
1094 lp_build_abs(struct lp_build_context
*bld
,
1097 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1098 const struct lp_type type
= bld
->type
;
1099 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
1101 assert(lp_check_value(type
, a
));
1107 /* Mask out the sign bit */
1108 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
1109 unsigned long long absMask
= ~(1ULL << (type
.width
- 1));
1110 LLVMValueRef mask
= lp_build_const_int_vec(bld
->gallivm
, type
, ((unsigned long long) absMask
));
1111 a
= LLVMBuildBitCast(builder
, a
, int_vec_type
, "");
1112 a
= LLVMBuildAnd(builder
, a
, mask
, "");
1113 a
= LLVMBuildBitCast(builder
, a
, vec_type
, "");
1117 if(type
.width
*type
.length
== 128 && util_cpu_caps
.has_ssse3
) {
1118 switch(type
.width
) {
1120 return lp_build_intrinsic_unary(builder
, "llvm.x86.ssse3.pabs.b.128", vec_type
, a
);
1122 return lp_build_intrinsic_unary(builder
, "llvm.x86.ssse3.pabs.w.128", vec_type
, a
);
1124 return lp_build_intrinsic_unary(builder
, "llvm.x86.ssse3.pabs.d.128", vec_type
, a
);
1127 else if (type
.width
*type
.length
== 256 && util_cpu_caps
.has_ssse3
&&
1128 (gallivm_debug
& GALLIVM_DEBUG_PERF
) &&
1129 (type
.width
== 8 || type
.width
== 16 || type
.width
== 32)) {
1130 debug_printf("%s: inefficient code, should split vectors manually\n",
1134 return lp_build_max(bld
, a
, LLVMBuildNeg(builder
, a
, ""));
1139 lp_build_negate(struct lp_build_context
*bld
,
1142 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1144 assert(lp_check_value(bld
->type
, a
));
1146 #if HAVE_LLVM >= 0x0207
1147 if (bld
->type
.floating
)
1148 a
= LLVMBuildFNeg(builder
, a
, "");
1151 a
= LLVMBuildNeg(builder
, a
, "");
1157 /** Return -1, 0 or +1 depending on the sign of a */
1159 lp_build_sgn(struct lp_build_context
*bld
,
1162 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1163 const struct lp_type type
= bld
->type
;
1167 assert(lp_check_value(type
, a
));
1169 /* Handle non-zero case */
1171 /* if not zero then sign must be positive */
1174 else if(type
.floating
) {
1175 LLVMTypeRef vec_type
;
1176 LLVMTypeRef int_type
;
1180 unsigned long long maskBit
= (unsigned long long)1 << (type
.width
- 1);
1182 int_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
1183 vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
1184 mask
= lp_build_const_int_vec(bld
->gallivm
, type
, maskBit
);
1186 /* Take the sign bit and add it to 1 constant */
1187 sign
= LLVMBuildBitCast(builder
, a
, int_type
, "");
1188 sign
= LLVMBuildAnd(builder
, sign
, mask
, "");
1189 one
= LLVMConstBitCast(bld
->one
, int_type
);
1190 res
= LLVMBuildOr(builder
, sign
, one
, "");
1191 res
= LLVMBuildBitCast(builder
, res
, vec_type
, "");
1195 /* signed int/norm/fixed point */
1196 /* could use psign with sse3 and appropriate vectors here */
1197 LLVMValueRef minus_one
= lp_build_const_vec(bld
->gallivm
, type
, -1.0);
1198 cond
= lp_build_cmp(bld
, PIPE_FUNC_GREATER
, a
, bld
->zero
);
1199 res
= lp_build_select(bld
, cond
, bld
->one
, minus_one
);
1203 cond
= lp_build_cmp(bld
, PIPE_FUNC_EQUAL
, a
, bld
->zero
);
1204 res
= lp_build_select(bld
, cond
, bld
->zero
, res
);
1211 * Set the sign of float vector 'a' according to 'sign'.
1212 * If sign==0, return abs(a).
1213 * If sign==1, return -abs(a);
1214 * Other values for sign produce undefined results.
1217 lp_build_set_sign(struct lp_build_context
*bld
,
1218 LLVMValueRef a
, LLVMValueRef sign
)
1220 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1221 const struct lp_type type
= bld
->type
;
1222 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
1223 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
1224 LLVMValueRef shift
= lp_build_const_int_vec(bld
->gallivm
, type
, type
.width
- 1);
1225 LLVMValueRef mask
= lp_build_const_int_vec(bld
->gallivm
, type
,
1226 ~((unsigned long long) 1 << (type
.width
- 1)));
1227 LLVMValueRef val
, res
;
1229 assert(type
.floating
);
1230 assert(lp_check_value(type
, a
));
1232 /* val = reinterpret_cast<int>(a) */
1233 val
= LLVMBuildBitCast(builder
, a
, int_vec_type
, "");
1234 /* val = val & mask */
1235 val
= LLVMBuildAnd(builder
, val
, mask
, "");
1236 /* sign = sign << shift */
1237 sign
= LLVMBuildShl(builder
, sign
, shift
, "");
1238 /* res = val | sign */
1239 res
= LLVMBuildOr(builder
, val
, sign
, "");
1240 /* res = reinterpret_cast<float>(res) */
1241 res
= LLVMBuildBitCast(builder
, res
, vec_type
, "");
1248 * Convert vector of (or scalar) int to vector of (or scalar) float.
1251 lp_build_int_to_float(struct lp_build_context
*bld
,
1254 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1255 const struct lp_type type
= bld
->type
;
1256 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
1258 assert(type
.floating
);
1260 return LLVMBuildSIToFP(builder
, a
, vec_type
, "");
1264 sse41_rounding_available(const struct lp_type type
)
1266 if ((util_cpu_caps
.has_sse4_1
&&
1267 (type
.length
== 1 || type
.width
*type
.length
== 128)) ||
1268 (util_cpu_caps
.has_avx
&& type
.width
*type
.length
== 256))
1274 enum lp_build_round_sse41_mode
1276 LP_BUILD_ROUND_SSE41_NEAREST
= 0,
1277 LP_BUILD_ROUND_SSE41_FLOOR
= 1,
1278 LP_BUILD_ROUND_SSE41_CEIL
= 2,
1279 LP_BUILD_ROUND_SSE41_TRUNCATE
= 3
1284 * Helper for SSE4.1's ROUNDxx instructions.
1286 * NOTE: In the SSE4.1's nearest mode, if two values are equally close, the
1287 * result is the even value. That is, rounding 2.5 will be 2.0, and not 3.0.
1289 static INLINE LLVMValueRef
1290 lp_build_round_sse41(struct lp_build_context
*bld
,
1292 enum lp_build_round_sse41_mode mode
)
1294 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1295 const struct lp_type type
= bld
->type
;
1296 LLVMTypeRef i32t
= LLVMInt32TypeInContext(bld
->gallivm
->context
);
1297 const char *intrinsic
;
1300 assert(type
.floating
);
1302 assert(lp_check_value(type
, a
));
1303 assert(util_cpu_caps
.has_sse4_1
);
1305 if (type
.length
== 1) {
1306 LLVMTypeRef vec_type
;
1308 LLVMValueRef args
[3];
1309 LLVMValueRef index0
= LLVMConstInt(i32t
, 0, 0);
1311 switch(type
.width
) {
1313 intrinsic
= "llvm.x86.sse41.round.ss";
1316 intrinsic
= "llvm.x86.sse41.round.sd";
1323 vec_type
= LLVMVectorType(bld
->elem_type
, 4);
1325 undef
= LLVMGetUndef(vec_type
);
1328 args
[1] = LLVMBuildInsertElement(builder
, undef
, a
, index0
, "");
1329 args
[2] = LLVMConstInt(i32t
, mode
, 0);
1331 res
= lp_build_intrinsic(builder
, intrinsic
,
1332 vec_type
, args
, Elements(args
));
1334 res
= LLVMBuildExtractElement(builder
, res
, index0
, "");
1337 if (type
.width
* type
.length
== 128) {
1338 switch(type
.width
) {
1340 intrinsic
= "llvm.x86.sse41.round.ps";
1343 intrinsic
= "llvm.x86.sse41.round.pd";
1351 assert(type
.width
* type
.length
== 256);
1352 assert(util_cpu_caps
.has_avx
);
1354 switch(type
.width
) {
1356 intrinsic
= "llvm.x86.avx.round.ps.256";
1359 intrinsic
= "llvm.x86.avx.round.pd.256";
1367 res
= lp_build_intrinsic_binary(builder
, intrinsic
,
1369 LLVMConstInt(i32t
, mode
, 0));
1376 static INLINE LLVMValueRef
1377 lp_build_iround_nearest_sse2(struct lp_build_context
*bld
,
1380 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1381 const struct lp_type type
= bld
->type
;
1382 LLVMTypeRef i32t
= LLVMInt32TypeInContext(bld
->gallivm
->context
);
1383 LLVMTypeRef ret_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
1384 const char *intrinsic
;
1387 assert(type
.floating
);
1388 /* using the double precision conversions is a bit more complicated */
1389 assert(type
.width
== 32);
1391 assert(lp_check_value(type
, a
));
1392 assert(util_cpu_caps
.has_sse2
);
1394 /* This is relying on MXCSR rounding mode, which should always be nearest. */
1395 if (type
.length
== 1) {
1396 LLVMTypeRef vec_type
;
1399 LLVMValueRef index0
= LLVMConstInt(i32t
, 0, 0);
1401 vec_type
= LLVMVectorType(bld
->elem_type
, 4);
1403 intrinsic
= "llvm.x86.sse.cvtss2si";
1405 undef
= LLVMGetUndef(vec_type
);
1407 arg
= LLVMBuildInsertElement(builder
, undef
, a
, index0
, "");
1409 res
= lp_build_intrinsic_unary(builder
, intrinsic
,
1413 if (type
.width
* type
.length
== 128) {
1414 intrinsic
= "llvm.x86.sse2.cvtps2dq";
1417 assert(type
.width
*type
.length
== 256);
1418 assert(util_cpu_caps
.has_avx
);
1420 intrinsic
= "llvm.x86.avx.cvt.ps2dq.256";
1422 res
= lp_build_intrinsic_unary(builder
, intrinsic
,
1431 * Return the integer part of a float (vector) value (== round toward zero).
1432 * The returned value is a float (vector).
1433 * Ex: trunc(-1.5) = -1.0
1436 lp_build_trunc(struct lp_build_context
*bld
,
1439 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1440 const struct lp_type type
= bld
->type
;
1442 assert(type
.floating
);
1443 assert(lp_check_value(type
, a
));
1445 if (sse41_rounding_available(type
)) {
1446 return lp_build_round_sse41(bld
, a
, LP_BUILD_ROUND_SSE41_TRUNCATE
);
1449 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
1450 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
1452 res
= LLVMBuildFPToSI(builder
, a
, int_vec_type
, "");
1453 res
= LLVMBuildSIToFP(builder
, res
, vec_type
, "");
1460 * Return float (vector) rounded to nearest integer (vector). The returned
1461 * value is a float (vector).
1462 * Ex: round(0.9) = 1.0
1463 * Ex: round(-1.5) = -2.0
1466 lp_build_round(struct lp_build_context
*bld
,
1469 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1470 const struct lp_type type
= bld
->type
;
1472 assert(type
.floating
);
1473 assert(lp_check_value(type
, a
));
1475 if (sse41_rounding_available(type
)) {
1476 return lp_build_round_sse41(bld
, a
, LP_BUILD_ROUND_SSE41_NEAREST
);
1479 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
1481 res
= lp_build_iround(bld
, a
);
1482 res
= LLVMBuildSIToFP(builder
, res
, vec_type
, "");
1489 * Return floor of float (vector), result is a float (vector)
1490 * Ex: floor(1.1) = 1.0
1491 * Ex: floor(-1.1) = -2.0
1494 lp_build_floor(struct lp_build_context
*bld
,
1497 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1498 const struct lp_type type
= bld
->type
;
1500 assert(type
.floating
);
1501 assert(lp_check_value(type
, a
));
1503 if (sse41_rounding_available(type
)) {
1504 return lp_build_round_sse41(bld
, a
, LP_BUILD_ROUND_SSE41_FLOOR
);
1507 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
1509 res
= lp_build_ifloor(bld
, a
);
1510 res
= LLVMBuildSIToFP(builder
, res
, vec_type
, "");
1517 * Return ceiling of float (vector), returning float (vector).
1518 * Ex: ceil( 1.1) = 2.0
1519 * Ex: ceil(-1.1) = -1.0
1522 lp_build_ceil(struct lp_build_context
*bld
,
1525 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1526 const struct lp_type type
= bld
->type
;
1528 assert(type
.floating
);
1529 assert(lp_check_value(type
, a
));
1531 if (sse41_rounding_available(type
)) {
1532 return lp_build_round_sse41(bld
, a
, LP_BUILD_ROUND_SSE41_CEIL
);
1535 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
1537 res
= lp_build_iceil(bld
, a
);
1538 res
= LLVMBuildSIToFP(builder
, res
, vec_type
, "");
1545 * Return fractional part of 'a' computed as a - floor(a)
1546 * Typically used in texture coord arithmetic.
1549 lp_build_fract(struct lp_build_context
*bld
,
1552 assert(bld
->type
.floating
);
1553 return lp_build_sub(bld
, a
, lp_build_floor(bld
, a
));
1558 * Prevent returning a fractional part of 1.0 for very small negative values of
1559 * 'a' by clamping against 0.99999(9).
1561 static inline LLVMValueRef
1562 clamp_fract(struct lp_build_context
*bld
, LLVMValueRef fract
)
1566 /* this is the largest number smaller than 1.0 representable as float */
1567 max
= lp_build_const_vec(bld
->gallivm
, bld
->type
,
1568 1.0 - 1.0/(1LL << (lp_mantissa(bld
->type
) + 1)));
1569 return lp_build_min(bld
, fract
, max
);
1574 * Same as lp_build_fract, but guarantees that the result is always smaller
1578 lp_build_fract_safe(struct lp_build_context
*bld
,
1581 return clamp_fract(bld
, lp_build_fract(bld
, a
));
1586 * Return the integer part of a float (vector) value (== round toward zero).
1587 * The returned value is an integer (vector).
1588 * Ex: itrunc(-1.5) = -1
1591 lp_build_itrunc(struct lp_build_context
*bld
,
1594 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1595 const struct lp_type type
= bld
->type
;
1596 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
1598 assert(type
.floating
);
1599 assert(lp_check_value(type
, a
));
1601 return LLVMBuildFPToSI(builder
, a
, int_vec_type
, "");
1606 * Return float (vector) rounded to nearest integer (vector). The returned
1607 * value is an integer (vector).
1608 * Ex: iround(0.9) = 1
1609 * Ex: iround(-1.5) = -2
1612 lp_build_iround(struct lp_build_context
*bld
,
1615 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1616 const struct lp_type type
= bld
->type
;
1617 LLVMTypeRef int_vec_type
= bld
->int_vec_type
;
1620 assert(type
.floating
);
1622 assert(lp_check_value(type
, a
));
1624 if ((util_cpu_caps
.has_sse2
&&
1625 ((type
.width
== 32) && (type
.length
== 1 || type
.length
== 4))) ||
1626 (util_cpu_caps
.has_avx
&& type
.width
== 32 && type
.length
== 8)) {
1627 return lp_build_iround_nearest_sse2(bld
, a
);
1629 if (sse41_rounding_available(type
)) {
1630 res
= lp_build_round_sse41(bld
, a
, LP_BUILD_ROUND_SSE41_NEAREST
);
1635 half
= lp_build_const_vec(bld
->gallivm
, type
, 0.5);
1638 LLVMTypeRef vec_type
= bld
->vec_type
;
1639 LLVMValueRef mask
= lp_build_const_int_vec(bld
->gallivm
, type
,
1640 (unsigned long long)1 << (type
.width
- 1));
1644 sign
= LLVMBuildBitCast(builder
, a
, int_vec_type
, "");
1645 sign
= LLVMBuildAnd(builder
, sign
, mask
, "");
1648 half
= LLVMBuildBitCast(builder
, half
, int_vec_type
, "");
1649 half
= LLVMBuildOr(builder
, sign
, half
, "");
1650 half
= LLVMBuildBitCast(builder
, half
, vec_type
, "");
1653 res
= LLVMBuildFAdd(builder
, a
, half
, "");
1656 res
= LLVMBuildFPToSI(builder
, res
, int_vec_type
, "");
1663 * Return floor of float (vector), result is an int (vector)
1664 * Ex: ifloor(1.1) = 1.0
1665 * Ex: ifloor(-1.1) = -2.0
1668 lp_build_ifloor(struct lp_build_context
*bld
,
1671 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1672 const struct lp_type type
= bld
->type
;
1673 LLVMTypeRef int_vec_type
= bld
->int_vec_type
;
1676 assert(type
.floating
);
1677 assert(lp_check_value(type
, a
));
1681 if (sse41_rounding_available(type
)) {
1682 res
= lp_build_round_sse41(bld
, a
, LP_BUILD_ROUND_SSE41_FLOOR
);
1685 /* Take the sign bit and add it to 1 constant */
1686 LLVMTypeRef vec_type
= bld
->vec_type
;
1687 unsigned mantissa
= lp_mantissa(type
);
1688 LLVMValueRef mask
= lp_build_const_int_vec(bld
->gallivm
, type
,
1689 (unsigned long long)1 << (type
.width
- 1));
1691 LLVMValueRef offset
;
1693 /* sign = a < 0 ? ~0 : 0 */
1694 sign
= LLVMBuildBitCast(builder
, a
, int_vec_type
, "");
1695 sign
= LLVMBuildAnd(builder
, sign
, mask
, "");
1696 sign
= LLVMBuildAShr(builder
, sign
,
1697 lp_build_const_int_vec(bld
->gallivm
, type
,
1701 /* offset = -0.99999(9)f */
1702 offset
= lp_build_const_vec(bld
->gallivm
, type
,
1703 -(double)(((unsigned long long)1 << mantissa
) - 10)/((unsigned long long)1 << mantissa
));
1704 offset
= LLVMConstBitCast(offset
, int_vec_type
);
1706 /* offset = a < 0 ? offset : 0.0f */
1707 offset
= LLVMBuildAnd(builder
, offset
, sign
, "");
1708 offset
= LLVMBuildBitCast(builder
, offset
, vec_type
, "ifloor.offset");
1710 res
= LLVMBuildFAdd(builder
, res
, offset
, "ifloor.res");
1714 /* round to nearest (toward zero) */
1715 res
= LLVMBuildFPToSI(builder
, res
, int_vec_type
, "ifloor.res");
1722 * Return ceiling of float (vector), returning int (vector).
1723 * Ex: iceil( 1.1) = 2
1724 * Ex: iceil(-1.1) = -1
1727 lp_build_iceil(struct lp_build_context
*bld
,
1730 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1731 const struct lp_type type
= bld
->type
;
1732 LLVMTypeRef int_vec_type
= bld
->int_vec_type
;
1735 assert(type
.floating
);
1736 assert(lp_check_value(type
, a
));
1738 if (sse41_rounding_available(type
)) {
1739 res
= lp_build_round_sse41(bld
, a
, LP_BUILD_ROUND_SSE41_CEIL
);
1742 LLVMTypeRef vec_type
= bld
->vec_type
;
1743 unsigned mantissa
= lp_mantissa(type
);
1744 LLVMValueRef offset
;
1746 /* offset = 0.99999(9)f */
1747 offset
= lp_build_const_vec(bld
->gallivm
, type
,
1748 (double)(((unsigned long long)1 << mantissa
) - 10)/((unsigned long long)1 << mantissa
));
1751 LLVMValueRef mask
= lp_build_const_int_vec(bld
->gallivm
, type
,
1752 (unsigned long long)1 << (type
.width
- 1));
1755 /* sign = a < 0 ? 0 : ~0 */
1756 sign
= LLVMBuildBitCast(builder
, a
, int_vec_type
, "");
1757 sign
= LLVMBuildAnd(builder
, sign
, mask
, "");
1758 sign
= LLVMBuildAShr(builder
, sign
,
1759 lp_build_const_int_vec(bld
->gallivm
, type
,
1762 sign
= LLVMBuildNot(builder
, sign
, "iceil.not");
1764 /* offset = a < 0 ? 0.0 : offset */
1765 offset
= LLVMConstBitCast(offset
, int_vec_type
);
1766 offset
= LLVMBuildAnd(builder
, offset
, sign
, "");
1767 offset
= LLVMBuildBitCast(builder
, offset
, vec_type
, "iceil.offset");
1770 res
= LLVMBuildFAdd(builder
, a
, offset
, "iceil.res");
1773 /* round to nearest (toward zero) */
1774 res
= LLVMBuildFPToSI(builder
, res
, int_vec_type
, "iceil.res");
1781 * Combined ifloor() & fract().
1783 * Preferred to calling the functions separately, as it will ensure that the
1784 * strategy (floor() vs ifloor()) that results in less redundant work is used.
1787 lp_build_ifloor_fract(struct lp_build_context
*bld
,
1789 LLVMValueRef
*out_ipart
,
1790 LLVMValueRef
*out_fpart
)
1792 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1793 const struct lp_type type
= bld
->type
;
1796 assert(type
.floating
);
1797 assert(lp_check_value(type
, a
));
1799 if (sse41_rounding_available(type
)) {
1801 * floor() is easier.
1804 ipart
= lp_build_floor(bld
, a
);
1805 *out_fpart
= LLVMBuildFSub(builder
, a
, ipart
, "fpart");
1806 *out_ipart
= LLVMBuildFPToSI(builder
, ipart
, bld
->int_vec_type
, "ipart");
1810 * ifloor() is easier.
1813 *out_ipart
= lp_build_ifloor(bld
, a
);
1814 ipart
= LLVMBuildSIToFP(builder
, *out_ipart
, bld
->vec_type
, "ipart");
1815 *out_fpart
= LLVMBuildFSub(builder
, a
, ipart
, "fpart");
1821 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
1822 * always smaller than one.
1825 lp_build_ifloor_fract_safe(struct lp_build_context
*bld
,
1827 LLVMValueRef
*out_ipart
,
1828 LLVMValueRef
*out_fpart
)
1830 lp_build_ifloor_fract(bld
, a
, out_ipart
, out_fpart
);
1831 *out_fpart
= clamp_fract(bld
, *out_fpart
);
1836 lp_build_sqrt(struct lp_build_context
*bld
,
1839 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1840 const struct lp_type type
= bld
->type
;
1841 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
1844 assert(lp_check_value(type
, a
));
1846 /* TODO: optimize the constant case */
1848 assert(type
.floating
);
1849 if (type
.length
== 1) {
1850 util_snprintf(intrinsic
, sizeof intrinsic
, "llvm.sqrt.f%u", type
.width
);
1853 util_snprintf(intrinsic
, sizeof intrinsic
, "llvm.sqrt.v%uf%u", type
.length
, type
.width
);
1856 return lp_build_intrinsic_unary(builder
, intrinsic
, vec_type
, a
);
1861 * Do one Newton-Raphson step to improve reciprocate precision:
1863 * x_{i+1} = x_i * (2 - a * x_i)
1865 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
1866 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
1867 * such as Google Earth, which does RCP(RSQRT(0.0) when drawing the Earth's
1868 * halo. It would be necessary to clamp the argument to prevent this.
1871 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
1872 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
1874 static INLINE LLVMValueRef
1875 lp_build_rcp_refine(struct lp_build_context
*bld
,
1879 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1880 LLVMValueRef two
= lp_build_const_vec(bld
->gallivm
, bld
->type
, 2.0);
1883 res
= LLVMBuildFMul(builder
, a
, rcp_a
, "");
1884 res
= LLVMBuildFSub(builder
, two
, res
, "");
1885 res
= LLVMBuildFMul(builder
, rcp_a
, res
, "");
1892 lp_build_rcp(struct lp_build_context
*bld
,
1895 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1896 const struct lp_type type
= bld
->type
;
1898 assert(lp_check_value(type
, a
));
1907 assert(type
.floating
);
1909 if(LLVMIsConstant(a
))
1910 return LLVMConstFDiv(bld
->one
, a
);
1913 * We don't use RCPPS because:
1914 * - it only has 10bits of precision
1915 * - it doesn't even get the reciprocate of 1.0 exactly
1916 * - doing Newton-Rapshon steps yields wrong (NaN) values for 0.0 or Inf
1917 * - for recent processors the benefit over DIVPS is marginal, a case
1920 * We could still use it on certain processors if benchmarks show that the
1921 * RCPPS plus necessary workarounds are still preferrable to DIVPS; or for
1922 * particular uses that require less workarounds.
1925 if (FALSE
&& ((util_cpu_caps
.has_sse
&& type
.width
== 32 && type
.length
== 4) ||
1926 (util_cpu_caps
.has_avx
&& type
.width
== 32 && type
.length
== 8))){
1927 const unsigned num_iterations
= 0;
1930 const char *intrinsic
= NULL
;
1932 if (type
.length
== 4) {
1933 intrinsic
= "llvm.x86.sse.rcp.ps";
1936 intrinsic
= "llvm.x86.avx.rcp.ps.256";
1939 res
= lp_build_intrinsic_unary(builder
, intrinsic
, bld
->vec_type
, a
);
1941 for (i
= 0; i
< num_iterations
; ++i
) {
1942 res
= lp_build_rcp_refine(bld
, a
, res
);
1948 return LLVMBuildFDiv(builder
, bld
->one
, a
, "");
1953 * Do one Newton-Raphson step to improve rsqrt precision:
1955 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
1957 * See also Intel 64 and IA-32 Architectures Optimization Manual.
1959 static INLINE LLVMValueRef
1960 lp_build_rsqrt_refine(struct lp_build_context
*bld
,
1962 LLVMValueRef rsqrt_a
)
1964 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1965 LLVMValueRef half
= lp_build_const_vec(bld
->gallivm
, bld
->type
, 0.5);
1966 LLVMValueRef three
= lp_build_const_vec(bld
->gallivm
, bld
->type
, 3.0);
1969 res
= LLVMBuildFMul(builder
, rsqrt_a
, rsqrt_a
, "");
1970 res
= LLVMBuildFMul(builder
, a
, res
, "");
1971 res
= LLVMBuildFSub(builder
, three
, res
, "");
1972 res
= LLVMBuildFMul(builder
, rsqrt_a
, res
, "");
1973 res
= LLVMBuildFMul(builder
, half
, res
, "");
1980 * Generate 1/sqrt(a).
1981 * Result is undefined for values < 0, infinity for +0.
1984 lp_build_rsqrt(struct lp_build_context
*bld
,
1987 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1988 const struct lp_type type
= bld
->type
;
1990 assert(lp_check_value(type
, a
));
1992 assert(type
.floating
);
1995 * This should be faster but all denormals will end up as infinity.
1997 if (0 && ((util_cpu_caps
.has_sse
&& type
.width
== 32 && type
.length
== 4) ||
1998 (util_cpu_caps
.has_avx
&& type
.width
== 32 && type
.length
== 8))) {
1999 const unsigned num_iterations
= 1;
2002 const char *intrinsic
= NULL
;
2004 if (type
.length
== 4) {
2005 intrinsic
= "llvm.x86.sse.rsqrt.ps";
2008 intrinsic
= "llvm.x86.avx.rsqrt.ps.256";
2010 if (num_iterations
) {
2012 * Newton-Raphson will result in NaN instead of infinity for zero,
2013 * and NaN instead of zero for infinity.
2014 * Also, need to ensure rsqrt(1.0) == 1.0.
2015 * All numbers smaller than FLT_MIN will result in +infinity
2016 * (rsqrtps treats all denormals as zero).
2019 * Certain non-c99 compilers don't know INFINITY and might not support
2020 * hacks to evaluate it at compile time neither.
2022 const unsigned posinf_int
= 0x7F800000;
2024 LLVMValueRef flt_min
= lp_build_const_vec(bld
->gallivm
, type
, FLT_MIN
);
2025 LLVMValueRef inf
= lp_build_const_int_vec(bld
->gallivm
, type
, posinf_int
);
2027 inf
= LLVMBuildBitCast(builder
, inf
, lp_build_vec_type(bld
->gallivm
, type
), "");
2029 res
= lp_build_intrinsic_unary(builder
, intrinsic
, bld
->vec_type
, a
);
2031 for (i
= 0; i
< num_iterations
; ++i
) {
2032 res
= lp_build_rsqrt_refine(bld
, a
, res
);
2034 cmp
= lp_build_compare(bld
->gallivm
, type
, PIPE_FUNC_LESS
, a
, flt_min
);
2035 res
= lp_build_select(bld
, cmp
, inf
, res
);
2036 cmp
= lp_build_compare(bld
->gallivm
, type
, PIPE_FUNC_EQUAL
, a
, inf
);
2037 res
= lp_build_select(bld
, cmp
, bld
->zero
, res
);
2038 cmp
= lp_build_compare(bld
->gallivm
, type
, PIPE_FUNC_EQUAL
, a
, bld
->one
);
2039 res
= lp_build_select(bld
, cmp
, bld
->one
, res
);
2042 /* rsqrt(1.0) != 1.0 here */
2043 res
= lp_build_intrinsic_unary(builder
, intrinsic
, bld
->vec_type
, a
);
2050 return lp_build_rcp(bld
, lp_build_sqrt(bld
, a
));
2055 * Generate sin(a) using SSE2
2058 lp_build_sin(struct lp_build_context
*bld
,
2061 struct gallivm_state
*gallivm
= bld
->gallivm
;
2062 LLVMBuilderRef builder
= gallivm
->builder
;
2063 struct lp_type int_type
= lp_int_type(bld
->type
);
2064 LLVMBuilderRef b
= builder
;
2067 * take the absolute value,
2068 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2071 LLVMValueRef inv_sig_mask
= lp_build_const_int_vec(gallivm
, bld
->type
, ~0x80000000);
2072 LLVMValueRef a_v4si
= LLVMBuildBitCast(b
, a
, bld
->int_vec_type
, "a_v4si");
2074 LLVMValueRef absi
= LLVMBuildAnd(b
, a_v4si
, inv_sig_mask
, "absi");
2075 LLVMValueRef x_abs
= LLVMBuildBitCast(b
, absi
, bld
->vec_type
, "x_abs");
2078 * extract the sign bit (upper one)
2079 * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
2081 LLVMValueRef sig_mask
= lp_build_const_int_vec(gallivm
, bld
->type
, 0x80000000);
2082 LLVMValueRef sign_bit_i
= LLVMBuildAnd(b
, a_v4si
, sig_mask
, "sign_bit_i");
2086 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2089 LLVMValueRef FOPi
= lp_build_const_vec(gallivm
, bld
->type
, 1.27323954473516);
2090 LLVMValueRef scale_y
= LLVMBuildFMul(b
, x_abs
, FOPi
, "scale_y");
2093 * store the integer part of y in mm0
2094 * emm2 = _mm_cvttps_epi32(y);
2097 LLVMValueRef emm2_i
= LLVMBuildFPToSI(b
, scale_y
, bld
->int_vec_type
, "emm2_i");
2100 * j=(j+1) & (~1) (see the cephes sources)
2101 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2104 LLVMValueRef all_one
= lp_build_const_int_vec(gallivm
, bld
->type
, 1);
2105 LLVMValueRef emm2_add
= LLVMBuildAdd(b
, emm2_i
, all_one
, "emm2_add");
2107 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2109 LLVMValueRef inv_one
= lp_build_const_int_vec(gallivm
, bld
->type
, ~1);
2110 LLVMValueRef emm2_and
= LLVMBuildAnd(b
, emm2_add
, inv_one
, "emm2_and");
2113 * y = _mm_cvtepi32_ps(emm2);
2115 LLVMValueRef y_2
= LLVMBuildSIToFP(b
, emm2_and
, bld
->vec_type
, "y_2");
2117 /* get the swap sign flag
2118 * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
2120 LLVMValueRef pi32_4
= lp_build_const_int_vec(gallivm
, bld
->type
, 4);
2121 LLVMValueRef emm0_and
= LLVMBuildAnd(b
, emm2_add
, pi32_4
, "emm0_and");
2124 * emm2 = _mm_slli_epi32(emm0, 29);
2126 LLVMValueRef const_29
= lp_build_const_int_vec(gallivm
, bld
->type
, 29);
2127 LLVMValueRef swap_sign_bit
= LLVMBuildShl(b
, emm0_and
, const_29
, "swap_sign_bit");
2130 * get the polynom selection mask
2131 * there is one polynom for 0 <= x <= Pi/4
2132 * and another one for Pi/4<x<=Pi/2
2133 * Both branches will be computed.
2135 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2136 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2139 LLVMValueRef pi32_2
= lp_build_const_int_vec(gallivm
, bld
->type
, 2);
2140 LLVMValueRef emm2_3
= LLVMBuildAnd(b
, emm2_and
, pi32_2
, "emm2_3");
2141 LLVMValueRef poly_mask
= lp_build_compare(gallivm
,
2142 int_type
, PIPE_FUNC_EQUAL
,
2143 emm2_3
, lp_build_const_int_vec(gallivm
, bld
->type
, 0));
2145 * sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
2147 LLVMValueRef sign_bit_1
= LLVMBuildXor(b
, sign_bit_i
, swap_sign_bit
, "sign_bit");
2150 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2151 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2152 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2154 LLVMValueRef DP1
= lp_build_const_vec(gallivm
, bld
->type
, -0.78515625);
2155 LLVMValueRef DP2
= lp_build_const_vec(gallivm
, bld
->type
, -2.4187564849853515625e-4);
2156 LLVMValueRef DP3
= lp_build_const_vec(gallivm
, bld
->type
, -3.77489497744594108e-8);
2159 * The magic pass: "Extended precision modular arithmetic"
2160 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2161 * xmm1 = _mm_mul_ps(y, xmm1);
2162 * xmm2 = _mm_mul_ps(y, xmm2);
2163 * xmm3 = _mm_mul_ps(y, xmm3);
2165 LLVMValueRef xmm1
= LLVMBuildFMul(b
, y_2
, DP1
, "xmm1");
2166 LLVMValueRef xmm2
= LLVMBuildFMul(b
, y_2
, DP2
, "xmm2");
2167 LLVMValueRef xmm3
= LLVMBuildFMul(b
, y_2
, DP3
, "xmm3");
2170 * x = _mm_add_ps(x, xmm1);
2171 * x = _mm_add_ps(x, xmm2);
2172 * x = _mm_add_ps(x, xmm3);
2175 LLVMValueRef x_1
= LLVMBuildFAdd(b
, x_abs
, xmm1
, "x_1");
2176 LLVMValueRef x_2
= LLVMBuildFAdd(b
, x_1
, xmm2
, "x_2");
2177 LLVMValueRef x_3
= LLVMBuildFAdd(b
, x_2
, xmm3
, "x_3");
2180 * Evaluate the first polynom (0 <= x <= Pi/4)
2182 * z = _mm_mul_ps(x,x);
2184 LLVMValueRef z
= LLVMBuildFMul(b
, x_3
, x_3
, "z");
2187 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2188 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2189 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2191 LLVMValueRef coscof_p0
= lp_build_const_vec(gallivm
, bld
->type
, 2.443315711809948E-005);
2192 LLVMValueRef coscof_p1
= lp_build_const_vec(gallivm
, bld
->type
, -1.388731625493765E-003);
2193 LLVMValueRef coscof_p2
= lp_build_const_vec(gallivm
, bld
->type
, 4.166664568298827E-002);
2196 * y = *(v4sf*)_ps_coscof_p0;
2197 * y = _mm_mul_ps(y, z);
2199 LLVMValueRef y_3
= LLVMBuildFMul(b
, z
, coscof_p0
, "y_3");
2200 LLVMValueRef y_4
= LLVMBuildFAdd(b
, y_3
, coscof_p1
, "y_4");
2201 LLVMValueRef y_5
= LLVMBuildFMul(b
, y_4
, z
, "y_5");
2202 LLVMValueRef y_6
= LLVMBuildFAdd(b
, y_5
, coscof_p2
, "y_6");
2203 LLVMValueRef y_7
= LLVMBuildFMul(b
, y_6
, z
, "y_7");
2204 LLVMValueRef y_8
= LLVMBuildFMul(b
, y_7
, z
, "y_8");
2208 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2209 * y = _mm_sub_ps(y, tmp);
2210 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2212 LLVMValueRef half
= lp_build_const_vec(gallivm
, bld
->type
, 0.5);
2213 LLVMValueRef tmp
= LLVMBuildFMul(b
, z
, half
, "tmp");
2214 LLVMValueRef y_9
= LLVMBuildFSub(b
, y_8
, tmp
, "y_8");
2215 LLVMValueRef one
= lp_build_const_vec(gallivm
, bld
->type
, 1.0);
2216 LLVMValueRef y_10
= LLVMBuildFAdd(b
, y_9
, one
, "y_9");
2219 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2220 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2221 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2223 LLVMValueRef sincof_p0
= lp_build_const_vec(gallivm
, bld
->type
, -1.9515295891E-4);
2224 LLVMValueRef sincof_p1
= lp_build_const_vec(gallivm
, bld
->type
, 8.3321608736E-3);
2225 LLVMValueRef sincof_p2
= lp_build_const_vec(gallivm
, bld
->type
, -1.6666654611E-1);
2228 * Evaluate the second polynom (Pi/4 <= x <= 0)
2230 * y2 = *(v4sf*)_ps_sincof_p0;
2231 * y2 = _mm_mul_ps(y2, z);
2232 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2233 * y2 = _mm_mul_ps(y2, z);
2234 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2235 * y2 = _mm_mul_ps(y2, z);
2236 * y2 = _mm_mul_ps(y2, x);
2237 * y2 = _mm_add_ps(y2, x);
2240 LLVMValueRef y2_3
= LLVMBuildFMul(b
, z
, sincof_p0
, "y2_3");
2241 LLVMValueRef y2_4
= LLVMBuildFAdd(b
, y2_3
, sincof_p1
, "y2_4");
2242 LLVMValueRef y2_5
= LLVMBuildFMul(b
, y2_4
, z
, "y2_5");
2243 LLVMValueRef y2_6
= LLVMBuildFAdd(b
, y2_5
, sincof_p2
, "y2_6");
2244 LLVMValueRef y2_7
= LLVMBuildFMul(b
, y2_6
, z
, "y2_7");
2245 LLVMValueRef y2_8
= LLVMBuildFMul(b
, y2_7
, x_3
, "y2_8");
2246 LLVMValueRef y2_9
= LLVMBuildFAdd(b
, y2_8
, x_3
, "y2_9");
2249 * select the correct result from the two polynoms
2251 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2252 * y = _mm_andnot_ps(xmm3, y);
2253 * y = _mm_add_ps(y,y2);
2255 LLVMValueRef y2_i
= LLVMBuildBitCast(b
, y2_9
, bld
->int_vec_type
, "y2_i");
2256 LLVMValueRef y_i
= LLVMBuildBitCast(b
, y_10
, bld
->int_vec_type
, "y_i");
2257 LLVMValueRef y2_and
= LLVMBuildAnd(b
, y2_i
, poly_mask
, "y2_and");
2258 LLVMValueRef inv
= lp_build_const_int_vec(gallivm
, bld
->type
, ~0);
2259 LLVMValueRef poly_mask_inv
= LLVMBuildXor(b
, poly_mask
, inv
, "poly_mask_inv");
2260 LLVMValueRef y_and
= LLVMBuildAnd(b
, y_i
, poly_mask_inv
, "y_and");
2261 LLVMValueRef y_combine
= LLVMBuildAdd(b
, y_and
, y2_and
, "y_combine");
2265 * y = _mm_xor_ps(y, sign_bit);
2267 LLVMValueRef y_sign
= LLVMBuildXor(b
, y_combine
, sign_bit_1
, "y_sin");
2268 LLVMValueRef y_result
= LLVMBuildBitCast(b
, y_sign
, bld
->vec_type
, "y_result");
2274 * Generate cos(a) using SSE2
2277 lp_build_cos(struct lp_build_context
*bld
,
2280 struct gallivm_state
*gallivm
= bld
->gallivm
;
2281 LLVMBuilderRef builder
= gallivm
->builder
;
2282 struct lp_type int_type
= lp_int_type(bld
->type
);
2283 LLVMBuilderRef b
= builder
;
2286 * take the absolute value,
2287 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2290 LLVMValueRef inv_sig_mask
= lp_build_const_int_vec(gallivm
, bld
->type
, ~0x80000000);
2291 LLVMValueRef a_v4si
= LLVMBuildBitCast(b
, a
, bld
->int_vec_type
, "a_v4si");
2293 LLVMValueRef absi
= LLVMBuildAnd(b
, a_v4si
, inv_sig_mask
, "absi");
2294 LLVMValueRef x_abs
= LLVMBuildBitCast(b
, absi
, bld
->vec_type
, "x_abs");
2298 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2301 LLVMValueRef FOPi
= lp_build_const_vec(gallivm
, bld
->type
, 1.27323954473516);
2302 LLVMValueRef scale_y
= LLVMBuildFMul(b
, x_abs
, FOPi
, "scale_y");
2305 * store the integer part of y in mm0
2306 * emm2 = _mm_cvttps_epi32(y);
2309 LLVMValueRef emm2_i
= LLVMBuildFPToSI(b
, scale_y
, bld
->int_vec_type
, "emm2_i");
2312 * j=(j+1) & (~1) (see the cephes sources)
2313 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2316 LLVMValueRef all_one
= lp_build_const_int_vec(gallivm
, bld
->type
, 1);
2317 LLVMValueRef emm2_add
= LLVMBuildAdd(b
, emm2_i
, all_one
, "emm2_add");
2319 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2321 LLVMValueRef inv_one
= lp_build_const_int_vec(gallivm
, bld
->type
, ~1);
2322 LLVMValueRef emm2_and
= LLVMBuildAnd(b
, emm2_add
, inv_one
, "emm2_and");
2325 * y = _mm_cvtepi32_ps(emm2);
2327 LLVMValueRef y_2
= LLVMBuildSIToFP(b
, emm2_and
, bld
->vec_type
, "y_2");
2331 * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
2333 LLVMValueRef const_2
= lp_build_const_int_vec(gallivm
, bld
->type
, 2);
2334 LLVMValueRef emm2_2
= LLVMBuildSub(b
, emm2_and
, const_2
, "emm2_2");
2337 /* get the swap sign flag
2338 * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
2340 LLVMValueRef inv
= lp_build_const_int_vec(gallivm
, bld
->type
, ~0);
2341 LLVMValueRef emm0_not
= LLVMBuildXor(b
, emm2_2
, inv
, "emm0_not");
2342 LLVMValueRef pi32_4
= lp_build_const_int_vec(gallivm
, bld
->type
, 4);
2343 LLVMValueRef emm0_and
= LLVMBuildAnd(b
, emm0_not
, pi32_4
, "emm0_and");
2346 * emm2 = _mm_slli_epi32(emm0, 29);
2348 LLVMValueRef const_29
= lp_build_const_int_vec(gallivm
, bld
->type
, 29);
2349 LLVMValueRef sign_bit
= LLVMBuildShl(b
, emm0_and
, const_29
, "sign_bit");
2352 * get the polynom selection mask
2353 * there is one polynom for 0 <= x <= Pi/4
2354 * and another one for Pi/4<x<=Pi/2
2355 * Both branches will be computed.
2357 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2358 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2361 LLVMValueRef pi32_2
= lp_build_const_int_vec(gallivm
, bld
->type
, 2);
2362 LLVMValueRef emm2_3
= LLVMBuildAnd(b
, emm2_2
, pi32_2
, "emm2_3");
2363 LLVMValueRef poly_mask
= lp_build_compare(gallivm
,
2364 int_type
, PIPE_FUNC_EQUAL
,
2365 emm2_3
, lp_build_const_int_vec(gallivm
, bld
->type
, 0));
2368 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2369 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2370 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2372 LLVMValueRef DP1
= lp_build_const_vec(gallivm
, bld
->type
, -0.78515625);
2373 LLVMValueRef DP2
= lp_build_const_vec(gallivm
, bld
->type
, -2.4187564849853515625e-4);
2374 LLVMValueRef DP3
= lp_build_const_vec(gallivm
, bld
->type
, -3.77489497744594108e-8);
2377 * The magic pass: "Extended precision modular arithmetic"
2378 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2379 * xmm1 = _mm_mul_ps(y, xmm1);
2380 * xmm2 = _mm_mul_ps(y, xmm2);
2381 * xmm3 = _mm_mul_ps(y, xmm3);
2383 LLVMValueRef xmm1
= LLVMBuildFMul(b
, y_2
, DP1
, "xmm1");
2384 LLVMValueRef xmm2
= LLVMBuildFMul(b
, y_2
, DP2
, "xmm2");
2385 LLVMValueRef xmm3
= LLVMBuildFMul(b
, y_2
, DP3
, "xmm3");
2388 * x = _mm_add_ps(x, xmm1);
2389 * x = _mm_add_ps(x, xmm2);
2390 * x = _mm_add_ps(x, xmm3);
2393 LLVMValueRef x_1
= LLVMBuildFAdd(b
, x_abs
, xmm1
, "x_1");
2394 LLVMValueRef x_2
= LLVMBuildFAdd(b
, x_1
, xmm2
, "x_2");
2395 LLVMValueRef x_3
= LLVMBuildFAdd(b
, x_2
, xmm3
, "x_3");
2398 * Evaluate the first polynom (0 <= x <= Pi/4)
2400 * z = _mm_mul_ps(x,x);
2402 LLVMValueRef z
= LLVMBuildFMul(b
, x_3
, x_3
, "z");
2405 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2406 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2407 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2409 LLVMValueRef coscof_p0
= lp_build_const_vec(gallivm
, bld
->type
, 2.443315711809948E-005);
2410 LLVMValueRef coscof_p1
= lp_build_const_vec(gallivm
, bld
->type
, -1.388731625493765E-003);
2411 LLVMValueRef coscof_p2
= lp_build_const_vec(gallivm
, bld
->type
, 4.166664568298827E-002);
2414 * y = *(v4sf*)_ps_coscof_p0;
2415 * y = _mm_mul_ps(y, z);
2417 LLVMValueRef y_3
= LLVMBuildFMul(b
, z
, coscof_p0
, "y_3");
2418 LLVMValueRef y_4
= LLVMBuildFAdd(b
, y_3
, coscof_p1
, "y_4");
2419 LLVMValueRef y_5
= LLVMBuildFMul(b
, y_4
, z
, "y_5");
2420 LLVMValueRef y_6
= LLVMBuildFAdd(b
, y_5
, coscof_p2
, "y_6");
2421 LLVMValueRef y_7
= LLVMBuildFMul(b
, y_6
, z
, "y_7");
2422 LLVMValueRef y_8
= LLVMBuildFMul(b
, y_7
, z
, "y_8");
2426 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2427 * y = _mm_sub_ps(y, tmp);
2428 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2430 LLVMValueRef half
= lp_build_const_vec(gallivm
, bld
->type
, 0.5);
2431 LLVMValueRef tmp
= LLVMBuildFMul(b
, z
, half
, "tmp");
2432 LLVMValueRef y_9
= LLVMBuildFSub(b
, y_8
, tmp
, "y_8");
2433 LLVMValueRef one
= lp_build_const_vec(gallivm
, bld
->type
, 1.0);
2434 LLVMValueRef y_10
= LLVMBuildFAdd(b
, y_9
, one
, "y_9");
2437 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2438 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2439 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2441 LLVMValueRef sincof_p0
= lp_build_const_vec(gallivm
, bld
->type
, -1.9515295891E-4);
2442 LLVMValueRef sincof_p1
= lp_build_const_vec(gallivm
, bld
->type
, 8.3321608736E-3);
2443 LLVMValueRef sincof_p2
= lp_build_const_vec(gallivm
, bld
->type
, -1.6666654611E-1);
2446 * Evaluate the second polynom (Pi/4 <= x <= 0)
2448 * y2 = *(v4sf*)_ps_sincof_p0;
2449 * y2 = _mm_mul_ps(y2, z);
2450 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2451 * y2 = _mm_mul_ps(y2, z);
2452 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2453 * y2 = _mm_mul_ps(y2, z);
2454 * y2 = _mm_mul_ps(y2, x);
2455 * y2 = _mm_add_ps(y2, x);
2458 LLVMValueRef y2_3
= LLVMBuildFMul(b
, z
, sincof_p0
, "y2_3");
2459 LLVMValueRef y2_4
= LLVMBuildFAdd(b
, y2_3
, sincof_p1
, "y2_4");
2460 LLVMValueRef y2_5
= LLVMBuildFMul(b
, y2_4
, z
, "y2_5");
2461 LLVMValueRef y2_6
= LLVMBuildFAdd(b
, y2_5
, sincof_p2
, "y2_6");
2462 LLVMValueRef y2_7
= LLVMBuildFMul(b
, y2_6
, z
, "y2_7");
2463 LLVMValueRef y2_8
= LLVMBuildFMul(b
, y2_7
, x_3
, "y2_8");
2464 LLVMValueRef y2_9
= LLVMBuildFAdd(b
, y2_8
, x_3
, "y2_9");
2467 * select the correct result from the two polynoms
2469 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2470 * y = _mm_andnot_ps(xmm3, y);
2471 * y = _mm_add_ps(y,y2);
2473 LLVMValueRef y2_i
= LLVMBuildBitCast(b
, y2_9
, bld
->int_vec_type
, "y2_i");
2474 LLVMValueRef y_i
= LLVMBuildBitCast(b
, y_10
, bld
->int_vec_type
, "y_i");
2475 LLVMValueRef y2_and
= LLVMBuildAnd(b
, y2_i
, poly_mask
, "y2_and");
2476 LLVMValueRef poly_mask_inv
= LLVMBuildXor(b
, poly_mask
, inv
, "poly_mask_inv");
2477 LLVMValueRef y_and
= LLVMBuildAnd(b
, y_i
, poly_mask_inv
, "y_and");
2478 LLVMValueRef y_combine
= LLVMBuildAdd(b
, y_and
, y2_and
, "y_combine");
2482 * y = _mm_xor_ps(y, sign_bit);
2484 LLVMValueRef y_sign
= LLVMBuildXor(b
, y_combine
, sign_bit
, "y_sin");
2485 LLVMValueRef y_result
= LLVMBuildBitCast(b
, y_sign
, bld
->vec_type
, "y_result");
2491 * Generate pow(x, y)
2494 lp_build_pow(struct lp_build_context
*bld
,
2498 /* TODO: optimize the constant case */
2499 if (gallivm_debug
& GALLIVM_DEBUG_PERF
&&
2500 LLVMIsConstant(x
) && LLVMIsConstant(y
)) {
2501 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2505 return lp_build_exp2(bld
, lp_build_mul(bld
, lp_build_log2(bld
, x
), y
));
2513 lp_build_exp(struct lp_build_context
*bld
,
2516 /* log2(e) = 1/log(2) */
2517 LLVMValueRef log2e
= lp_build_const_vec(bld
->gallivm
, bld
->type
,
2518 1.4426950408889634);
2520 assert(lp_check_value(bld
->type
, x
));
2522 return lp_build_exp2(bld
, lp_build_mul(bld
, log2e
, x
));
2530 lp_build_log(struct lp_build_context
*bld
,
2534 LLVMValueRef log2
= lp_build_const_vec(bld
->gallivm
, bld
->type
,
2535 0.69314718055994529);
2537 assert(lp_check_value(bld
->type
, x
));
2539 return lp_build_mul(bld
, log2
, lp_build_log2(bld
, x
));
/*
 * lp_build_polynomial: emit code evaluating a polynomial in x.
 *
 * coeffs holds num_coeffs coefficients, lowest power first (coeffs[0] is the
 * constant term, as in the example below).
 *
 * NOTE(review): the return-type line, the declarations of x2, coeff and i,
 * and the statements that choose between the "even" and "odd" updates are not
 * present in this excerpt -- confirm against the full file before editing.
 */
2544 * Generate polynomial.
2545 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
2548 lp_build_polynomial(struct lp_build_context
*bld
,
2550 const double *coeffs
,
2551 unsigned num_coeffs
)
2553 const struct lp_type type
= bld
->type
;
/* Both accumulators start as NULL, i.e. "no term accumulated yet". */
2554 LLVMValueRef even
= NULL
, odd
= NULL
;
2558 assert(lp_check_value(bld
->type
, x
));
2560 /* TODO: optimize the constant case */
2561 if (gallivm_debug
& GALLIVM_DEBUG_PERF
&&
2562 LLVMIsConstant(x
)) {
2563 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2568 * Calculate odd and even terms seperately to decrease data dependency
2570 * c[0] + x^2 * c[2] + x^4 * c[4] ...
2571 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
/* x^2 is computed once and shared by both Horner chains. */
2573 x2
= lp_build_mul(bld
, x
, x
);
/*
 * Walk the coefficients from the highest index down to 0: the `i--` test
 * checks the old value and leaves the decremented one for the loop body.
 */
2575 for (i
= num_coeffs
; i
--; ) {
2578 coeff
= lp_build_const_vec(bld
->gallivm
, type
, coeffs
[i
]);
/* Horner step on the even chain: even = coeff + x^2 * even. */
2582 even
= lp_build_add(bld
, coeff
, lp_build_mul(bld
, x2
, even
));
/* Horner step on the odd chain: odd = coeff + x^2 * odd. */
2587 odd
= lp_build_add(bld
, coeff
, lp_build_mul(bld
, x2
, odd
));
/* Recombine the two chains: result = odd * x + even. */
2594 return lp_build_add(bld
, lp_build_mul(bld
, odd
, x
), even
);
2603 * Minimax polynomial fit of 2**x, in range [0, 1[
2605 const double lp_build_exp2_polynomial
[] = {
2606 #if EXP_POLY_DEGREE == 5
2607 0.999999925063526176901,
2608 0.693153073200168932794,
2609 0.240153617044375388211,
2610 0.0558263180532956664775,
2611 0.00898934009049466391101,
2612 0.00187757667519147912699
2613 #elif EXP_POLY_DEGREE == 4
2614 1.00000259337069434683,
2615 0.693003834469974940458,
2616 0.24144275689150793076,
2617 0.0520114606103070150235,
2618 0.0135341679161270268764
2619 #elif EXP_POLY_DEGREE == 3
2620 0.999925218562710312959,
2621 0.695833540494823811697,
2622 0.226067155427249155588,
2623 0.0780245226406372992967
2624 #elif EXP_POLY_DEGREE == 2
2625 1.00172476321474503578,
2626 0.657636275736077639316,
2627 0.33718943461968720704
2635 lp_build_exp2_approx(struct lp_build_context
*bld
,
2637 LLVMValueRef
*p_exp2_int_part
,
2638 LLVMValueRef
*p_frac_part
,
2639 LLVMValueRef
*p_exp2
)
2641 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2642 const struct lp_type type
= bld
->type
;
2643 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
2644 LLVMValueRef ipart
= NULL
;
2645 LLVMValueRef fpart
= NULL
;
2646 LLVMValueRef expipart
= NULL
;
2647 LLVMValueRef expfpart
= NULL
;
2648 LLVMValueRef res
= NULL
;
2650 assert(lp_check_value(bld
->type
, x
));
2652 if(p_exp2_int_part
|| p_frac_part
|| p_exp2
) {
2653 /* TODO: optimize the constant case */
2654 if (gallivm_debug
& GALLIVM_DEBUG_PERF
&&
2655 LLVMIsConstant(x
)) {
2656 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2660 assert(type
.floating
&& type
.width
== 32);
2662 x
= lp_build_min(bld
, x
, lp_build_const_vec(bld
->gallivm
, type
, 129.0));
2663 x
= lp_build_max(bld
, x
, lp_build_const_vec(bld
->gallivm
, type
, -126.99999));
2665 /* ipart = floor(x) */
2666 /* fpart = x - ipart */
2667 lp_build_ifloor_fract(bld
, x
, &ipart
, &fpart
);
2670 if(p_exp2_int_part
|| p_exp2
) {
2671 /* expipart = (float) (1 << ipart) */
2672 expipart
= LLVMBuildAdd(builder
, ipart
,
2673 lp_build_const_int_vec(bld
->gallivm
, type
, 127), "");
2674 expipart
= LLVMBuildShl(builder
, expipart
,
2675 lp_build_const_int_vec(bld
->gallivm
, type
, 23), "");
2676 expipart
= LLVMBuildBitCast(builder
, expipart
, vec_type
, "");
2680 expfpart
= lp_build_polynomial(bld
, fpart
, lp_build_exp2_polynomial
,
2681 Elements(lp_build_exp2_polynomial
));
2683 res
= LLVMBuildFMul(builder
, expipart
, expfpart
, "");
2687 *p_exp2_int_part
= expipart
;
2690 *p_frac_part
= fpart
;
2698 lp_build_exp2(struct lp_build_context
*bld
,
2702 lp_build_exp2_approx(bld
, x
, NULL
, NULL
, &res
);
2708 * Extract the exponent of a IEEE-754 floating point value.
2710 * Optionally apply an integer bias.
2712 * Result is an integer value with
2714 * ifloor(log2(x)) + bias
2717 lp_build_extract_exponent(struct lp_build_context
*bld
,
2721 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2722 const struct lp_type type
= bld
->type
;
2723 unsigned mantissa
= lp_mantissa(type
);
2726 assert(type
.floating
);
2728 assert(lp_check_value(bld
->type
, x
));
2730 x
= LLVMBuildBitCast(builder
, x
, bld
->int_vec_type
, "");
2732 res
= LLVMBuildLShr(builder
, x
,
2733 lp_build_const_int_vec(bld
->gallivm
, type
, mantissa
), "");
2734 res
= LLVMBuildAnd(builder
, res
,
2735 lp_build_const_int_vec(bld
->gallivm
, type
, 255), "");
2736 res
= LLVMBuildSub(builder
, res
,
2737 lp_build_const_int_vec(bld
->gallivm
, type
, 127 - bias
), "");
2744 * Extract the mantissa of the a floating.
2746 * Result is a floating point value with
2748 * x / floor(log2(x))
2751 lp_build_extract_mantissa(struct lp_build_context
*bld
,
2754 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2755 const struct lp_type type
= bld
->type
;
2756 unsigned mantissa
= lp_mantissa(type
);
2757 LLVMValueRef mantmask
= lp_build_const_int_vec(bld
->gallivm
, type
,
2758 (1ULL << mantissa
) - 1);
2759 LLVMValueRef one
= LLVMConstBitCast(bld
->one
, bld
->int_vec_type
);
2762 assert(lp_check_value(bld
->type
, x
));
2764 assert(type
.floating
);
2766 x
= LLVMBuildBitCast(builder
, x
, bld
->int_vec_type
, "");
2768 /* res = x / 2**ipart */
2769 res
= LLVMBuildAnd(builder
, x
, mantmask
, "");
2770 res
= LLVMBuildOr(builder
, res
, one
, "");
2771 res
= LLVMBuildBitCast(builder
, res
, bld
->vec_type
, "");
2779 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x) ,for x in range of [0, 1/9[
2780 * These coefficients can be generate with
2781 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
2783 const double lp_build_log2_polynomial
[] = {
2784 #if LOG_POLY_DEGREE == 5
2785 2.88539008148777786488L,
2786 0.961796878841293367824L,
2787 0.577058946784739859012L,
2788 0.412914355135828735411L,
2789 0.308591899232910175289L,
2790 0.352376952300281371868L,
2791 #elif LOG_POLY_DEGREE == 4
2792 2.88539009343309178325L,
2793 0.961791550404184197881L,
2794 0.577440339438736392009L,
2795 0.403343858251329912514L,
2796 0.406718052498846252698L,
2797 #elif LOG_POLY_DEGREE == 3
2798 2.88538959748872753838L,
2799 0.961932915889597772928L,
2800 0.571118517972136195241L,
2801 0.493997535084709500285L,
2808 * See http://www.devmaster.net/forums/showthread.php?p=43580
2809 * http://en.wikipedia.org/wiki/Logarithm#Calculation
2810 * http://www.nezumi.demon.co.uk/consult/logx.htm
2813 lp_build_log2_approx(struct lp_build_context
*bld
,
2815 LLVMValueRef
*p_exp
,
2816 LLVMValueRef
*p_floor_log2
,
2817 LLVMValueRef
*p_log2
)
2819 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2820 const struct lp_type type
= bld
->type
;
2821 LLVMTypeRef vec_type
= lp_build_vec_type(bld
->gallivm
, type
);
2822 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(bld
->gallivm
, type
);
2824 LLVMValueRef expmask
= lp_build_const_int_vec(bld
->gallivm
, type
, 0x7f800000);
2825 LLVMValueRef mantmask
= lp_build_const_int_vec(bld
->gallivm
, type
, 0x007fffff);
2826 LLVMValueRef one
= LLVMConstBitCast(bld
->one
, int_vec_type
);
2828 LLVMValueRef i
= NULL
;
2829 LLVMValueRef y
= NULL
;
2830 LLVMValueRef z
= NULL
;
2831 LLVMValueRef exp
= NULL
;
2832 LLVMValueRef mant
= NULL
;
2833 LLVMValueRef logexp
= NULL
;
2834 LLVMValueRef logmant
= NULL
;
2835 LLVMValueRef res
= NULL
;
2837 assert(lp_check_value(bld
->type
, x
));
2839 if(p_exp
|| p_floor_log2
|| p_log2
) {
2840 /* TODO: optimize the constant case */
2841 if (gallivm_debug
& GALLIVM_DEBUG_PERF
&&
2842 LLVMIsConstant(x
)) {
2843 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2847 assert(type
.floating
&& type
.width
== 32);
2850 * We don't explicitly handle denormalized numbers. They will yield a
2851 * result in the neighbourhood of -127, which appears to be adequate
2855 i
= LLVMBuildBitCast(builder
, x
, int_vec_type
, "");
2857 /* exp = (float) exponent(x) */
2858 exp
= LLVMBuildAnd(builder
, i
, expmask
, "");
2861 if(p_floor_log2
|| p_log2
) {
2862 logexp
= LLVMBuildLShr(builder
, exp
, lp_build_const_int_vec(bld
->gallivm
, type
, 23), "");
2863 logexp
= LLVMBuildSub(builder
, logexp
, lp_build_const_int_vec(bld
->gallivm
, type
, 127), "");
2864 logexp
= LLVMBuildSIToFP(builder
, logexp
, vec_type
, "");
2868 /* mant = 1 + (float) mantissa(x) */
2869 mant
= LLVMBuildAnd(builder
, i
, mantmask
, "");
2870 mant
= LLVMBuildOr(builder
, mant
, one
, "");
2871 mant
= LLVMBuildBitCast(builder
, mant
, vec_type
, "");
2873 /* y = (mant - 1) / (mant + 1) */
2874 y
= lp_build_div(bld
,
2875 lp_build_sub(bld
, mant
, bld
->one
),
2876 lp_build_add(bld
, mant
, bld
->one
)
2880 z
= lp_build_mul(bld
, y
, y
);
2883 logmant
= lp_build_polynomial(bld
, z
, lp_build_log2_polynomial
,
2884 Elements(lp_build_log2_polynomial
));
2886 /* logmant = y * P(z) */
2887 logmant
= lp_build_mul(bld
, y
, logmant
);
2889 res
= lp_build_add(bld
, logmant
, logexp
);
2893 exp
= LLVMBuildBitCast(builder
, exp
, vec_type
, "");
2898 *p_floor_log2
= logexp
;
2906 lp_build_log2(struct lp_build_context
*bld
,
2910 lp_build_log2_approx(bld
, x
, NULL
, NULL
, &res
);
2916 * Faster (and less accurate) log2.
2918 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
2920 * Piece-wise linear approximation, with exact results when x is a
2923 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
2926 lp_build_fast_log2(struct lp_build_context
*bld
,
2929 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2933 assert(lp_check_value(bld
->type
, x
));
2935 assert(bld
->type
.floating
);
2937 /* ipart = floor(log2(x)) - 1 */
2938 ipart
= lp_build_extract_exponent(bld
, x
, -1);
2939 ipart
= LLVMBuildSIToFP(builder
, ipart
, bld
->vec_type
, "");
2941 /* fpart = x / 2**ipart */
2942 fpart
= lp_build_extract_mantissa(bld
, x
);
2945 return LLVMBuildFAdd(builder
, ipart
, fpart
, "");
2950 * Fast implementation of iround(log2(x)).
2952 * Not an approximation -- it should give accurate results all the time.
2955 lp_build_ilog2(struct lp_build_context
*bld
,
2958 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2959 LLVMValueRef sqrt2
= lp_build_const_vec(bld
->gallivm
, bld
->type
, M_SQRT2
);
2962 assert(bld
->type
.floating
);
2964 assert(lp_check_value(bld
->type
, x
));
2966 /* x * 2^(0.5) i.e., add 0.5 to the log2(x) */
2967 x
= LLVMBuildFMul(builder
, x
, sqrt2
, "");
2969 /* ipart = floor(log2(x) + 0.5) */
2970 ipart
= lp_build_extract_exponent(bld
, x
, 0);
2976 lp_build_mod(struct lp_build_context
*bld
,
2980 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
2982 const struct lp_type type
= bld
->type
;
2984 assert(lp_check_value(type
, x
));
2985 assert(lp_check_value(type
, y
));
2988 res
= LLVMBuildFRem(builder
, x
, y
, "");
2990 res
= LLVMBuildSRem(builder
, x
, y
, "");
2992 res
= LLVMBuildURem(builder
, x
, y
, "");