gallivm: do per-sample depth comparison instead of doing it post-filter
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort machine-specific intrinsics directly. The functions here hide all
36 * resort to machine-specific intrinsics directly. The functions here hide all
37 *
38 * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include <float.h>
49
50 #include "util/u_memory.h"
51 #include "util/u_debug.h"
52 #include "util/u_math.h"
53 #include "util/u_string.h"
54 #include "util/u_cpu_detect.h"
55
56 #include "lp_bld_type.h"
57 #include "lp_bld_const.h"
58 #include "lp_bld_init.h"
59 #include "lp_bld_intr.h"
60 #include "lp_bld_logic.h"
61 #include "lp_bld_pack.h"
62 #include "lp_bld_debug.h"
63 #include "lp_bld_bitarit.h"
64 #include "lp_bld_arit.h"
65 #include "lp_bld_flow.h"
66
67
68 #define EXP_POLY_DEGREE 5
69
70 #define LOG_POLY_DEGREE 4
71
72
73 /**
74 * Generate min(a, b)
75 * No checks for the special-case values a or b = 1 or 0 are done.
76 * NaNs are handled according to the behavior specified by the
77 * nan_behavior argument.
78 */
79 static LLVMValueRef
80 lp_build_min_simple(struct lp_build_context *bld,
81 LLVMValueRef a,
82 LLVMValueRef b,
83 enum gallivm_nan_behavior nan_behavior)
84 {
85 const struct lp_type type = bld->type;
86 const char *intrinsic = NULL;
87 unsigned intr_size = 0;
88 LLVMValueRef cond;
89
90 assert(lp_check_value(type, a));
91 assert(lp_check_value(type, b));
92
93 /* TODO: optimize the constant case */
94
95 if (type.floating && util_cpu_caps.has_sse) {
96 if (type.width == 32) {
97 if (type.length == 1) {
98 intrinsic = "llvm.x86.sse.min.ss";
99 intr_size = 128;
100 }
101 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
102 intrinsic = "llvm.x86.sse.min.ps";
103 intr_size = 128;
104 }
105 else {
106 intrinsic = "llvm.x86.avx.min.ps.256";
107 intr_size = 256;
108 }
109 }
110 if (type.width == 64 && util_cpu_caps.has_sse2) {
111 if (type.length == 1) {
112 intrinsic = "llvm.x86.sse2.min.sd";
113 intr_size = 128;
114 }
115 else if (type.length == 2 || !util_cpu_caps.has_avx) {
116 intrinsic = "llvm.x86.sse2.min.pd";
117 intr_size = 128;
118 }
119 else {
120 intrinsic = "llvm.x86.avx.min.pd.256";
121 intr_size = 256;
122 }
123 }
124 }
125 else if (type.floating && util_cpu_caps.has_altivec) {
126 debug_printf("%s: altivec doesn't support nan behavior modes\n",
127 __FUNCTION__);
128 if (type.width == 32 && type.length == 4) {
129 intrinsic = "llvm.ppc.altivec.vminfp";
130 intr_size = 128;
131 }
132 } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
133 intr_size = 128;
134 if ((type.width == 8 || type.width == 16) &&
135 (type.width * type.length <= 64) &&
136 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
137 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
138 __FUNCTION__);
139 }
140 if (type.width == 8 && !type.sign) {
141 intrinsic = "llvm.x86.sse2.pminu.b";
142 }
143 else if (type.width == 16 && type.sign) {
144 intrinsic = "llvm.x86.sse2.pmins.w";
145 }
146 if (util_cpu_caps.has_sse4_1) {
147 if (type.width == 8 && type.sign) {
148 intrinsic = "llvm.x86.sse41.pminsb";
149 }
150 if (type.width == 16 && !type.sign) {
151 intrinsic = "llvm.x86.sse41.pminuw";
152 }
153 if (type.width == 32 && !type.sign) {
154 intrinsic = "llvm.x86.sse41.pminud";
155 }
156 if (type.width == 32 && type.sign) {
157 intrinsic = "llvm.x86.sse41.pminsd";
158 }
159 }
160 } else if (util_cpu_caps.has_altivec) {
161 intr_size = 128;
162 debug_printf("%s: altivec doesn't support nan behavior modes\n",
163 __FUNCTION__);
164 if (type.width == 8) {
165 if (!type.sign) {
166 intrinsic = "llvm.ppc.altivec.vminub";
167 } else {
168 intrinsic = "llvm.ppc.altivec.vminsb";
169 }
170 } else if (type.width == 16) {
171 if (!type.sign) {
172 intrinsic = "llvm.ppc.altivec.vminuh";
173 } else {
174 intrinsic = "llvm.ppc.altivec.vminsh";
175 }
176 } else if (type.width == 32) {
177 if (!type.sign) {
178 intrinsic = "llvm.ppc.altivec.vminuw";
179 } else {
180 intrinsic = "llvm.ppc.altivec.vminsw";
181 }
182 }
183 }
184
185 if(intrinsic) {
186 /* We need to handle NaNs for floating point numbers. If one of the
187 * inputs is NaN the other should be returned (required by both D3D10+
188 * and OpenCL).
189 * The sse intrinsics return the second operand in case of NaN by
190 * default, so we need special code to handle those cases.
191 */
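      /*
       * Roughly: minps/maxps return their second operand whenever the first
       * one is a NaN, so for GALLIVM_NAN_RETURN_OTHER only the "b is NaN"
       * case needs the isnan/select fixup below, and for
       * GALLIVM_NAN_RETURN_NAN only the "a is NaN" case does.
       */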
192 if (util_cpu_caps.has_sse && type.floating &&
193 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
194 nan_behavior != GALLIVM_NAN_RETURN_SECOND) {
195 LLVMValueRef isnan, max;
196 max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
197 type,
198 intr_size, a, b);
199 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
200 isnan = lp_build_isnan(bld, b);
201 return lp_build_select(bld, isnan, a, max);
202 } else {
203 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
204 isnan = lp_build_isnan(bld, a);
205 return lp_build_select(bld, isnan, a, max);
206 }
207 } else {
208 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
209 type,
210 intr_size, a, b);
211 }
212 }
213
214 if (type.floating) {
215 switch (nan_behavior) {
216 case GALLIVM_NAN_RETURN_NAN: {
217 LLVMValueRef isnan = lp_build_isnan(bld, b);
218 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
219 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
220 return lp_build_select(bld, cond, a, b);
221 }
222 break;
223 case GALLIVM_NAN_RETURN_OTHER: {
224 LLVMValueRef isnan = lp_build_isnan(bld, a);
225 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
226 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
227 return lp_build_select(bld, cond, a, b);
228 }
229 break;
230 case GALLIVM_NAN_RETURN_SECOND:
231 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
232 return lp_build_select(bld, cond, a, b);
233 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
234 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
235 return lp_build_select(bld, cond, a, b);
236 break;
237 default:
238 assert(0);
239 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
240 return lp_build_select(bld, cond, a, b);
241 }
242 } else {
243 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
244 return lp_build_select(bld, cond, a, b);
245 }
246 }
247
248
249 /**
250 * Generate max(a, b)
251 * No checks for the special-case values a or b = 1 or 0 are done.
252 * NaNs are handled according to the behavior specified by the
253 * nan_behavior argument.
254 */
255 static LLVMValueRef
256 lp_build_max_simple(struct lp_build_context *bld,
257 LLVMValueRef a,
258 LLVMValueRef b,
259 enum gallivm_nan_behavior nan_behavior)
260 {
261 const struct lp_type type = bld->type;
262 const char *intrinsic = NULL;
263 unsigned intr_size = 0;
264 LLVMValueRef cond;
265
266 assert(lp_check_value(type, a));
267 assert(lp_check_value(type, b));
268
269 /* TODO: optimize the constant case */
270
271 if (type.floating && util_cpu_caps.has_sse) {
272 if (type.width == 32) {
273 if (type.length == 1) {
274 intrinsic = "llvm.x86.sse.max.ss";
275 intr_size = 128;
276 }
277 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
278 intrinsic = "llvm.x86.sse.max.ps";
279 intr_size = 128;
280 }
281 else {
282 intrinsic = "llvm.x86.avx.max.ps.256";
283 intr_size = 256;
284 }
285 }
286 if (type.width == 64 && util_cpu_caps.has_sse2) {
287 if (type.length == 1) {
288 intrinsic = "llvm.x86.sse2.max.sd";
289 intr_size = 128;
290 }
291 else if (type.length == 2 || !util_cpu_caps.has_avx) {
292 intrinsic = "llvm.x86.sse2.max.pd";
293 intr_size = 128;
294 }
295 else {
296 intrinsic = "llvm.x86.avx.max.pd.256";
297 intr_size = 256;
298 }
299 }
300 }
301 else if (type.floating && util_cpu_caps.has_altivec) {
302 debug_printf("%s: altivec doesn't support nan behavior modes\n",
303 __FUNCTION__);
304 if (type.width == 32 && type.length == 4) {
305 intrinsic = "llvm.ppc.altivec.vmaxfp";
306 intr_size = 128;
307 }
308 } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
309 intr_size = 128;
310 if ((type.width == 8 || type.width == 16) &&
311 (type.width * type.length <= 64) &&
312 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
313 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
314 __FUNCTION__);
315 }
316 if (type.width == 8 && !type.sign) {
317 intrinsic = "llvm.x86.sse2.pmaxu.b";
318 intr_size = 128;
319 }
320 else if (type.width == 16 && type.sign) {
321 intrinsic = "llvm.x86.sse2.pmaxs.w";
322 }
323 if (util_cpu_caps.has_sse4_1) {
324 if (type.width == 8 && type.sign) {
325 intrinsic = "llvm.x86.sse41.pmaxsb";
326 }
327 if (type.width == 16 && !type.sign) {
328 intrinsic = "llvm.x86.sse41.pmaxuw";
329 }
330 if (type.width == 32 && !type.sign) {
331 intrinsic = "llvm.x86.sse41.pmaxud";
332 }
333 if (type.width == 32 && type.sign) {
334 intrinsic = "llvm.x86.sse41.pmaxsd";
335 }
336 }
337 } else if (util_cpu_caps.has_altivec) {
338 intr_size = 128;
339 debug_printf("%s: altivec doesn't support nan behavior modes\n",
340 __FUNCTION__);
341 if (type.width == 8) {
342 if (!type.sign) {
343 intrinsic = "llvm.ppc.altivec.vmaxub";
344 } else {
345 intrinsic = "llvm.ppc.altivec.vmaxsb";
346 }
347 } else if (type.width == 16) {
348 if (!type.sign) {
349 intrinsic = "llvm.ppc.altivec.vmaxuh";
350 } else {
351 intrinsic = "llvm.ppc.altivec.vmaxsh";
352 }
353 } else if (type.width == 32) {
354 if (!type.sign) {
355 intrinsic = "llvm.ppc.altivec.vmaxuw";
356 } else {
357 intrinsic = "llvm.ppc.altivec.vmaxsw";
358 }
359 }
360 }
361
362 if(intrinsic) {
363 if (util_cpu_caps.has_sse && type.floating &&
364 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
365 nan_behavior != GALLIVM_NAN_RETURN_SECOND) {
366 LLVMValueRef isnan, min;
367 min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
368 type,
369 intr_size, a, b);
370 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
371 isnan = lp_build_isnan(bld, b);
372 return lp_build_select(bld, isnan, a, min);
373 } else {
374 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
375 isnan = lp_build_isnan(bld, a);
376 return lp_build_select(bld, isnan, a, min);
377 }
378 } else {
379 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
380 type,
381 intr_size, a, b);
382 }
383 }
384
385 if (type.floating) {
386 switch (nan_behavior) {
387 case GALLIVM_NAN_RETURN_NAN: {
388 LLVMValueRef isnan = lp_build_isnan(bld, b);
389 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
390 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
391 return lp_build_select(bld, cond, a, b);
392 }
393 break;
394 case GALLIVM_NAN_RETURN_OTHER: {
395 LLVMValueRef isnan = lp_build_isnan(bld, a);
396 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
397 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
398 return lp_build_select(bld, cond, a, b);
399 }
400 break;
401 case GALLIVM_NAN_RETURN_SECOND:
402 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
403 return lp_build_select(bld, cond, a, b);
404 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
405 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
406 return lp_build_select(bld, cond, a, b);
407 break;
408 default:
409 assert(0);
410 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
411 return lp_build_select(bld, cond, a, b);
412 }
413 } else {
414 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
415 return lp_build_select(bld, cond, a, b);
416 }
417 }
418
419
420 /**
421 * Generate 1 - a, or ~a depending on bld->type.
422 */
423 LLVMValueRef
424 lp_build_comp(struct lp_build_context *bld,
425 LLVMValueRef a)
426 {
427 LLVMBuilderRef builder = bld->gallivm->builder;
428 const struct lp_type type = bld->type;
429
430 assert(lp_check_value(type, a));
431
432 if(a == bld->one)
433 return bld->zero;
434 if(a == bld->zero)
435 return bld->one;
436
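   /*
    * For unsigned normalized types 1.0 is the all-ones bit pattern, so
    * 1 - a is exactly the bitwise complement ~a, which the branch below
    * relies on.
    */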
437 if(type.norm && !type.floating && !type.fixed && !type.sign) {
438 if(LLVMIsConstant(a))
439 return LLVMConstNot(a);
440 else
441 return LLVMBuildNot(builder, a, "");
442 }
443
444 if(LLVMIsConstant(a))
445 if (type.floating)
446 return LLVMConstFSub(bld->one, a);
447 else
448 return LLVMConstSub(bld->one, a);
449 else
450 if (type.floating)
451 return LLVMBuildFSub(builder, bld->one, a, "");
452 else
453 return LLVMBuildSub(builder, bld->one, a, "");
454 }
455
456
457 /**
458 * Generate a + b
459 */
460 LLVMValueRef
461 lp_build_add(struct lp_build_context *bld,
462 LLVMValueRef a,
463 LLVMValueRef b)
464 {
465 LLVMBuilderRef builder = bld->gallivm->builder;
466 const struct lp_type type = bld->type;
467 LLVMValueRef res;
468
469 assert(lp_check_value(type, a));
470 assert(lp_check_value(type, b));
471
472 if(a == bld->zero)
473 return b;
474 if(b == bld->zero)
475 return a;
476 if(a == bld->undef || b == bld->undef)
477 return bld->undef;
478
479 if(bld->type.norm) {
480 const char *intrinsic = NULL;
481
482 if(a == bld->one || b == bld->one)
483 return bld->one;
484
485 if (type.width * type.length == 128 &&
486 !type.floating && !type.fixed) {
487 if(util_cpu_caps.has_sse2) {
488 if(type.width == 8)
489 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
490 if(type.width == 16)
491 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
492 } else if (util_cpu_caps.has_altivec) {
493 if(type.width == 8)
494 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
495 if(type.width == 16)
496 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
497 }
498 }
499
500 if(intrinsic)
501 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
502 }
503
504 /* TODO: handle signed case */
505 if(type.norm && !type.floating && !type.fixed && !type.sign)
506 a = lp_build_min_simple(bld, a, lp_build_comp(bld, b), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
507
508 if(LLVMIsConstant(a) && LLVMIsConstant(b))
509 if (type.floating)
510 res = LLVMConstFAdd(a, b);
511 else
512 res = LLVMConstAdd(a, b);
513 else
514 if (type.floating)
515 res = LLVMBuildFAdd(builder, a, b, "");
516 else
517 res = LLVMBuildAdd(builder, a, b, "");
518
519 /* clamp to ceiling of 1.0 */
520 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
521 res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
522
523 /* XXX clamp to floor of -1 or 0??? */
524
525 return res;
526 }
527
528
529 /** Return the scalar sum of the elements of a.
530 * Should avoid this operation whenever possible.
531 */
532 LLVMValueRef
533 lp_build_horizontal_add(struct lp_build_context *bld,
534 LLVMValueRef a)
535 {
536 LLVMBuilderRef builder = bld->gallivm->builder;
537 const struct lp_type type = bld->type;
538 LLVMValueRef index, res;
539 unsigned i, length;
540 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
541 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
542 LLVMValueRef vecres, elem2;
543
544 assert(lp_check_value(type, a));
545
546 if (type.length == 1) {
547 return a;
548 }
549
550 assert(!bld->type.norm);
551
552 /*
553 * For byte vectors this could be done much better with psadbw;
554 * we use repeated shuffles/adds here. Note that with multiple vectors
555 * this can be done more efficiently, as outlined in the Intel
556 * optimization manual.
557 * Note: could cause data rearrangement if used with smaller element
558 * sizes.
559 */
560
561 vecres = a;
562 length = type.length / 2;
563 while (length > 1) {
564 LLVMValueRef vec1, vec2;
565 for (i = 0; i < length; i++) {
566 shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
567 shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
568 }
569 vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
570 LLVMConstVector(shuffles1, length), "");
571 vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
572 LLVMConstVector(shuffles2, length), "");
573 if (type.floating) {
574 vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
575 }
576 else {
577 vecres = LLVMBuildAdd(builder, vec1, vec2, "");
578 }
579 length = length >> 1;
580 }
581
582 /* always have vector of size 2 here */
583 assert(length == 1);
584
585 index = lp_build_const_int32(bld->gallivm, 0);
586 res = LLVMBuildExtractElement(builder, vecres, index, "");
587 index = lp_build_const_int32(bld->gallivm, 1);
588 elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
589
590 if (type.floating)
591 res = LLVMBuildFAdd(builder, res, elem2, "");
592 else
593 res = LLVMBuildAdd(builder, res, elem2, "");
594
595 return res;
596 }
597
598 /**
599 * Return the horizontal sums of 4 float vectors as a float4 vector.
600 * This uses the technique outlined in the Intel Optimization Manual.
601 */
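/*
 * Sketch of the data flow below, writing src[0..3] as {x0,x1,x2,x3},
 * {y0,y1,y2,y3}, {z0,z1,z2,z3}, {w0,w1,w2,w3}:
 *   tmp[0] = {x0,x1,y0,y1}   tmp[1] = {x2,x3,y2,y3}
 *   tmp[2] = {z0,z1,w0,w1}   tmp[3] = {z2,z3,w2,w3}
 *   sumtmp[0] = {x0+x2, x1+x3, y0+y2, y1+y3}
 *   sumtmp[1] = {z0+z2, z1+z3, w0+w2, w1+w3}
 * and the final shuffle/add pair yields {sum(x), sum(y), sum(z), sum(w)}.
 */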
602 static LLVMValueRef
603 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
604 LLVMValueRef src[4])
605 {
606 struct gallivm_state *gallivm = bld->gallivm;
607 LLVMBuilderRef builder = gallivm->builder;
608 LLVMValueRef shuffles[4];
609 LLVMValueRef tmp[4];
610 LLVMValueRef sumtmp[2], shuftmp[2];
611
612 /* lower half of regs */
613 shuffles[0] = lp_build_const_int32(gallivm, 0);
614 shuffles[1] = lp_build_const_int32(gallivm, 1);
615 shuffles[2] = lp_build_const_int32(gallivm, 4);
616 shuffles[3] = lp_build_const_int32(gallivm, 5);
617 tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
618 LLVMConstVector(shuffles, 4), "");
619 tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
620 LLVMConstVector(shuffles, 4), "");
621
622 /* upper half of regs */
623 shuffles[0] = lp_build_const_int32(gallivm, 2);
624 shuffles[1] = lp_build_const_int32(gallivm, 3);
625 shuffles[2] = lp_build_const_int32(gallivm, 6);
626 shuffles[3] = lp_build_const_int32(gallivm, 7);
627 tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
628 LLVMConstVector(shuffles, 4), "");
629 tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
630 LLVMConstVector(shuffles, 4), "");
631
632 sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
633 sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
634
635 shuffles[0] = lp_build_const_int32(gallivm, 0);
636 shuffles[1] = lp_build_const_int32(gallivm, 2);
637 shuffles[2] = lp_build_const_int32(gallivm, 4);
638 shuffles[3] = lp_build_const_int32(gallivm, 6);
639 shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
640 LLVMConstVector(shuffles, 4), "");
641
642 shuffles[0] = lp_build_const_int32(gallivm, 1);
643 shuffles[1] = lp_build_const_int32(gallivm, 3);
644 shuffles[2] = lp_build_const_int32(gallivm, 5);
645 shuffles[3] = lp_build_const_int32(gallivm, 7);
646 shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
647 LLVMConstVector(shuffles, 4), "");
648
649 return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
650 }
651
652
653 /*
654 * partially horizontally add 2-4 float vectors with length nx4,
655 * i.e. only four adjacent values in each vector will be added,
656 * assuming values are really grouped in 4 which also determines
657 * output order.
658 *
659 * Return a vector of the same length as the initial vectors,
660 * with the excess elements (if any) being undefined.
661 * The element order is independent of number of input vectors.
662 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
663 * the output order thus will be
664 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
665 */
666 LLVMValueRef
667 lp_build_hadd_partial4(struct lp_build_context *bld,
668 LLVMValueRef vectors[],
669 unsigned num_vecs)
670 {
671 struct gallivm_state *gallivm = bld->gallivm;
672 LLVMBuilderRef builder = gallivm->builder;
673 LLVMValueRef ret_vec;
674 LLVMValueRef tmp[4];
675 const char *intrinsic = NULL;
676
677 assert(num_vecs >= 2 && num_vecs <= 4);
678 assert(bld->type.floating);
679
680 /* only use this with at least 2 vectors, as it is sort of expensive
681 * (depending on cpu) and we always need two horizontal adds anyway,
682 * so a shuffle/add approach might be better.
683 */
684
685 tmp[0] = vectors[0];
686 tmp[1] = vectors[1];
687
688 tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
689 tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
690
691 if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
692 bld->type.length == 4) {
693 intrinsic = "llvm.x86.sse3.hadd.ps";
694 }
695 else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
696 bld->type.length == 8) {
697 intrinsic = "llvm.x86.avx.hadd.ps.256";
698 }
699 if (intrinsic) {
700 tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
701 lp_build_vec_type(gallivm, bld->type),
702 tmp[0], tmp[1]);
703 if (num_vecs > 2) {
704 tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
705 lp_build_vec_type(gallivm, bld->type),
706 tmp[2], tmp[3]);
707 }
708 else {
709 tmp[1] = tmp[0];
710 }
711 return lp_build_intrinsic_binary(builder, intrinsic,
712 lp_build_vec_type(gallivm, bld->type),
713 tmp[0], tmp[1]);
714 }
715
716 if (bld->type.length == 4) {
717 ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
718 }
719 else {
720 LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
721 unsigned j;
722 unsigned num_iter = bld->type.length / 4;
723 struct lp_type parttype = bld->type;
724 parttype.length = 4;
725 for (j = 0; j < num_iter; j++) {
726 LLVMValueRef partsrc[4];
727 unsigned i;
728 for (i = 0; i < 4; i++) {
729 partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
730 }
731 partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
732 }
733 ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
734 }
735 return ret_vec;
736 }
737
738 /**
739 * Generate a - b
740 */
741 LLVMValueRef
742 lp_build_sub(struct lp_build_context *bld,
743 LLVMValueRef a,
744 LLVMValueRef b)
745 {
746 LLVMBuilderRef builder = bld->gallivm->builder;
747 const struct lp_type type = bld->type;
748 LLVMValueRef res;
749
750 assert(lp_check_value(type, a));
751 assert(lp_check_value(type, b));
752
753 if(b == bld->zero)
754 return a;
755 if(a == bld->undef || b == bld->undef)
756 return bld->undef;
757 if(a == b)
758 return bld->zero;
759
760 if(bld->type.norm) {
761 const char *intrinsic = NULL;
762
763 if(b == bld->one)
764 return bld->zero;
765
766 if (type.width * type.length == 128 &&
767 !type.floating && !type.fixed) {
768 if (util_cpu_caps.has_sse2) {
769 if(type.width == 8)
770 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
771 if(type.width == 16)
772 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
773 } else if (util_cpu_caps.has_altivec) {
774 if(type.width == 8)
775 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
776 if(type.width == 16)
777 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
778 }
779 }
780
781 if(intrinsic)
782 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
783 }
784
785 /* TODO: handle signed case */
786 if(type.norm && !type.floating && !type.fixed && !type.sign)
787 a = lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
788
789 if(LLVMIsConstant(a) && LLVMIsConstant(b))
790 if (type.floating)
791 res = LLVMConstFSub(a, b);
792 else
793 res = LLVMConstSub(a, b);
794 else
795 if (type.floating)
796 res = LLVMBuildFSub(builder, a, b, "");
797 else
798 res = LLVMBuildSub(builder, a, b, "");
799
800 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
801 res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
802
803 return res;
804 }
805
806
807
808 /**
809 * Normalized multiplication.
810 *
811 * There are several approaches for (using 8-bit normalized multiplication as
812 * an example):
813 *
814 * - alpha plus one
815 *
816 * makes the following approximation to the division (Sree)
817 *
818 * a*b/255 ~= (a*(b + 1)) >> 8
819 *
820 * which is the fastest method that satisfies the following OpenGL criteria of
821 *
822 * 0*0 = 0 and 255*255 = 255
823 *
824 * - geometric series
825 *
826 * takes the geometric series approximation to the division
827 *
828 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
829 *
830 * in this case just the first two terms to fit in 16bit arithmetic
831 *
832 * t/255 ~= (t + (t >> 8)) >> 8
833 *
834 * note that just by itself it doesn't satisfy the OpenGL criteria, as
835 * 255*255 = 254, so the special case b = 255 must be accounted for, or
836 * rounding must be used.
837 *
838 * - geometric series plus rounding
839 *
840 * when using the geometric series division, instead of truncating the
841 * result, use rounding in the approximation (Jim Blinn)
842 *
843 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
844 *
845 * achieving exact results.
846 *
847 *
848 *
849 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
850 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
851 * @sa Michael Herf, The "double blend trick", May 2000,
852 * http://www.stereopsis.com/doubleblend.html
853 */
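/*
 * Quick sanity check of the "geometric series plus rounding" variant for
 * 8-bit unorm values (n = 8): a = b = 255 gives t = 65025, and
 * (65025 + (65025 >> 8) + 0x80) >> 8 = (65025 + 254 + 128) >> 8 = 255,
 * while a = b = 0 gives 0, so both OpenGL criteria are met exactly.
 */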
854 static LLVMValueRef
855 lp_build_mul_norm(struct gallivm_state *gallivm,
856 struct lp_type wide_type,
857 LLVMValueRef a, LLVMValueRef b)
858 {
859 LLVMBuilderRef builder = gallivm->builder;
860 struct lp_build_context bld;
861 unsigned n;
862 LLVMValueRef half;
863 LLVMValueRef ab;
864
865 assert(!wide_type.floating);
866 assert(lp_check_value(wide_type, a));
867 assert(lp_check_value(wide_type, b));
868
869 lp_build_context_init(&bld, gallivm, wide_type);
870
871 n = wide_type.width / 2;
872 if (wide_type.sign) {
873 --n;
874 }
875
876 /*
877 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
878 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
879 */
880
881 /*
882 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
883 */
884
885 ab = LLVMBuildMul(builder, a, b, "");
886 ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
887
888 /*
889 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
890 */
891
892 half = lp_build_const_int_vec(gallivm, wide_type, 1 << (n - 1));
893 if (wide_type.sign) {
894 LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
895 LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
896 half = lp_build_select(&bld, sign, minus_half, half);
897 }
898 ab = LLVMBuildAdd(builder, ab, half, "");
899
900 /* Final division */
901 ab = lp_build_shr_imm(&bld, ab, n);
902
903 return ab;
904 }
905
906 /**
907 * Generate a * b
908 */
909 LLVMValueRef
910 lp_build_mul(struct lp_build_context *bld,
911 LLVMValueRef a,
912 LLVMValueRef b)
913 {
914 LLVMBuilderRef builder = bld->gallivm->builder;
915 const struct lp_type type = bld->type;
916 LLVMValueRef shift;
917 LLVMValueRef res;
918
919 assert(lp_check_value(type, a));
920 assert(lp_check_value(type, b));
921
922 if(a == bld->zero)
923 return bld->zero;
924 if(a == bld->one)
925 return b;
926 if(b == bld->zero)
927 return bld->zero;
928 if(b == bld->one)
929 return a;
930 if(a == bld->undef || b == bld->undef)
931 return bld->undef;
932
933 if (!type.floating && !type.fixed && type.norm) {
934 struct lp_type wide_type = lp_wider_type(type);
935 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
936
937 lp_build_unpack2(bld->gallivm, type, wide_type, a, &al, &ah);
938 lp_build_unpack2(bld->gallivm, type, wide_type, b, &bl, &bh);
939
940 /* PMULLW, PSRLW, PADDW */
941 abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
942 abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
943
944 ab = lp_build_pack2(bld->gallivm, wide_type, type, abl, abh);
945
946 return ab;
947 }
948
949 if(type.fixed)
950 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
951 else
952 shift = NULL;
953
954 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
955 if (type.floating)
956 res = LLVMConstFMul(a, b);
957 else
958 res = LLVMConstMul(a, b);
959 if(shift) {
960 if(type.sign)
961 res = LLVMConstAShr(res, shift);
962 else
963 res = LLVMConstLShr(res, shift);
964 }
965 }
966 else {
967 if (type.floating)
968 res = LLVMBuildFMul(builder, a, b, "");
969 else
970 res = LLVMBuildMul(builder, a, b, "");
971 if(shift) {
972 if(type.sign)
973 res = LLVMBuildAShr(builder, res, shift, "");
974 else
975 res = LLVMBuildLShr(builder, res, shift, "");
976 }
977 }
978
979 return res;
980 }
981
982
983 /**
984 * Small vector x scale multiplication optimization.
985 */
986 LLVMValueRef
987 lp_build_mul_imm(struct lp_build_context *bld,
988 LLVMValueRef a,
989 int b)
990 {
991 LLVMBuilderRef builder = bld->gallivm->builder;
992 LLVMValueRef factor;
993
994 assert(lp_check_value(bld->type, a));
995
996 if(b == 0)
997 return bld->zero;
998
999 if(b == 1)
1000 return a;
1001
1002 if(b == -1)
1003 return lp_build_negate(bld, a);
1004
1005 if(b == 2 && bld->type.floating)
1006 return lp_build_add(bld, a, a);
1007
1008 if(util_is_power_of_two(b)) {
1009 unsigned shift = ffs(b) - 1;
1010
1011 if(bld->type.floating) {
1012 #if 0
1013 /*
1014 * Power of two multiplication by directly manipulating the exponent.
1015 *
1016 * XXX: This might not be always faster, it will introduce a small error
1017 * for multiplication by zero, and it will produce wrong results
1018 * for Inf and NaN.
1019 */
1020 unsigned mantissa = lp_mantissa(bld->type);
1021 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1022 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1023 a = LLVMBuildAdd(builder, a, factor, "");
1024 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1025 return a;
1026 #endif
1027 }
1028 else {
1029 factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1030 return LLVMBuildShl(builder, a, factor, "");
1031 }
1032 }
1033
1034 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1035 return lp_build_mul(bld, a, factor);
1036 }
1037
1038
1039 /**
1040 * Generate a / b
1041 */
1042 LLVMValueRef
1043 lp_build_div(struct lp_build_context *bld,
1044 LLVMValueRef a,
1045 LLVMValueRef b)
1046 {
1047 LLVMBuilderRef builder = bld->gallivm->builder;
1048 const struct lp_type type = bld->type;
1049
1050 assert(lp_check_value(type, a));
1051 assert(lp_check_value(type, b));
1052
1053 if(a == bld->zero)
1054 return bld->zero;
1055 if(a == bld->one)
1056 return lp_build_rcp(bld, b);
1057 if(b == bld->zero)
1058 return bld->undef;
1059 if(b == bld->one)
1060 return a;
1061 if(a == bld->undef || b == bld->undef)
1062 return bld->undef;
1063
1064 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1065 if (type.floating)
1066 return LLVMConstFDiv(a, b);
1067 else if (type.sign)
1068 return LLVMConstSDiv(a, b);
1069 else
1070 return LLVMConstUDiv(a, b);
1071 }
1072
1073 if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1074 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
1075 type.floating)
1076 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1077
1078 if (type.floating)
1079 return LLVMBuildFDiv(builder, a, b, "");
1080 else if (type.sign)
1081 return LLVMBuildSDiv(builder, a, b, "");
1082 else
1083 return LLVMBuildUDiv(builder, a, b, "");
1084 }
1085
1086
1087 /**
1088 * Linear interpolation helper.
1089 *
1090 * @param flags if LP_BLD_LERP_WIDE_NORMALIZED is set, we are interpolating
1091 * normalized values encoded in normalized integers twice as wide.
1092 *
1093 * @sa http://www.stereopsis.com/doubleblend.html
1094 */
1095 static INLINE LLVMValueRef
1096 lp_build_lerp_simple(struct lp_build_context *bld,
1097 LLVMValueRef x,
1098 LLVMValueRef v0,
1099 LLVMValueRef v1,
1100 unsigned flags)
1101 {
1102 unsigned half_width = bld->type.width/2;
1103 LLVMBuilderRef builder = bld->gallivm->builder;
1104 LLVMValueRef delta;
1105 LLVMValueRef res;
1106
1107 assert(lp_check_value(bld->type, x));
1108 assert(lp_check_value(bld->type, v0));
1109 assert(lp_check_value(bld->type, v1));
1110
1111 delta = lp_build_sub(bld, v1, v0);
1112
1113 if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1114 if (!bld->type.sign) {
1115 if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1116 /*
1117 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1118 * most significant bit to the least significant bit, so that
1119 * later we can just divide by 2**n instead of 2**n - 1.
1120 */
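            /*
             * E.g. with 8-bit weights widened to 16 bits (half_width == 8):
             * x = 255 becomes 255 + (255 >> 7) = 256, so (x * delta) >> 8
             * yields exactly delta, while x = 0 stays 0.
             */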
1121
1122 x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1123 }
1124
1125 /* (x * delta) >> n */
1126 res = lp_build_mul(bld, x, delta);
1127 res = lp_build_shr_imm(bld, res, half_width);
1128 } else {
1129 /*
1130 * The rescaling trick above doesn't work for signed numbers, so
1131 * use the 2**n - 1 division approximation in lp_build_mul_norm
1132 * instead.
1133 */
1134 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1135 res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1136 }
1137 } else {
1138 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1139 res = lp_build_mul(bld, x, delta);
1140 }
1141
1142 res = lp_build_add(bld, v0, res);
1143
1144 if (((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) ||
1145 bld->type.fixed) {
1146 /* We need to mask out the high order bits when lerping 8-bit normalized colors stored in 16 bits */
1147 /* XXX: This step is necessary for lerping 8-bit colors stored in 16 bits,
1148 * but it will be wrong for true fixed point use cases. Basically we need
1149 * a more powerful lp_type, capable of further distinguishing the values
1150 * interpretation from the value storage. */
1151 res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1), "");
1152 }
1153
1154 return res;
1155 }
1156
1157
1158 /**
1159 * Linear interpolation.
1160 */
1161 LLVMValueRef
1162 lp_build_lerp(struct lp_build_context *bld,
1163 LLVMValueRef x,
1164 LLVMValueRef v0,
1165 LLVMValueRef v1,
1166 unsigned flags)
1167 {
1168 const struct lp_type type = bld->type;
1169 LLVMValueRef res;
1170
1171 assert(lp_check_value(type, x));
1172 assert(lp_check_value(type, v0));
1173 assert(lp_check_value(type, v1));
1174
1175 assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1176
1177 if (type.norm) {
1178 struct lp_type wide_type;
1179 struct lp_build_context wide_bld;
1180 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1181
1182 assert(type.length >= 2);
1183
1184 /*
1185 * Create a wider integer type, enough to hold the
1186 * intermediate result of the multiplication.
1187 */
1188 memset(&wide_type, 0, sizeof wide_type);
1189 wide_type.sign = type.sign;
1190 wide_type.width = type.width*2;
1191 wide_type.length = type.length/2;
1192
1193 lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1194
1195 lp_build_unpack2(bld->gallivm, type, wide_type, x, &xl, &xh);
1196 lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1197 lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1198
1199 /*
1200 * Lerp both halves.
1201 */
1202
1203 flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1204
1205 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1206 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1207
1208 res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
1209 } else {
1210 res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1211 }
1212
1213 return res;
1214 }
1215
1216
1217 /**
1218 * Bilinear interpolation.
1219 *
1220 * Value indices are in v_{yx}.
1221 */
1222 LLVMValueRef
1223 lp_build_lerp_2d(struct lp_build_context *bld,
1224 LLVMValueRef x,
1225 LLVMValueRef y,
1226 LLVMValueRef v00,
1227 LLVMValueRef v01,
1228 LLVMValueRef v10,
1229 LLVMValueRef v11,
1230 unsigned flags)
1231 {
1232 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1233 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1234 return lp_build_lerp(bld, y, v0, v1, flags);
1235 }
1236
1237
1238 LLVMValueRef
1239 lp_build_lerp_3d(struct lp_build_context *bld,
1240 LLVMValueRef x,
1241 LLVMValueRef y,
1242 LLVMValueRef z,
1243 LLVMValueRef v000,
1244 LLVMValueRef v001,
1245 LLVMValueRef v010,
1246 LLVMValueRef v011,
1247 LLVMValueRef v100,
1248 LLVMValueRef v101,
1249 LLVMValueRef v110,
1250 LLVMValueRef v111,
1251 unsigned flags)
1252 {
1253 LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1254 LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1255 return lp_build_lerp(bld, z, v0, v1, flags);
1256 }
1257
1258
1259 /**
1260 * Generate min(a, b)
1261 * Do checks for special cases but not for NaNs.
1262 */
1263 LLVMValueRef
1264 lp_build_min(struct lp_build_context *bld,
1265 LLVMValueRef a,
1266 LLVMValueRef b)
1267 {
1268 assert(lp_check_value(bld->type, a));
1269 assert(lp_check_value(bld->type, b));
1270
1271 if(a == bld->undef || b == bld->undef)
1272 return bld->undef;
1273
1274 if(a == b)
1275 return a;
1276
1277 if (bld->type.norm) {
1278 if (!bld->type.sign) {
1279 if (a == bld->zero || b == bld->zero) {
1280 return bld->zero;
1281 }
1282 }
1283 if(a == bld->one)
1284 return b;
1285 if(b == bld->one)
1286 return a;
1287 }
1288
1289 return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1290 }
1291
1292
1293 /**
1294 * Generate min(a, b)
1295 * NaNs are handled according to the behavior specified by the
1296 * nan_behavior argument.
1297 */
1298 LLVMValueRef
1299 lp_build_min_ext(struct lp_build_context *bld,
1300 LLVMValueRef a,
1301 LLVMValueRef b,
1302 enum gallivm_nan_behavior nan_behavior)
1303 {
1304 assert(lp_check_value(bld->type, a));
1305 assert(lp_check_value(bld->type, b));
1306
1307 if(a == bld->undef || b == bld->undef)
1308 return bld->undef;
1309
1310 if(a == b)
1311 return a;
1312
1313 if (bld->type.norm) {
1314 if (!bld->type.sign) {
1315 if (a == bld->zero || b == bld->zero) {
1316 return bld->zero;
1317 }
1318 }
1319 if(a == bld->one)
1320 return b;
1321 if(b == bld->one)
1322 return a;
1323 }
1324
1325 return lp_build_min_simple(bld, a, b, nan_behavior);
1326 }
1327
1328 /**
1329 * Generate max(a, b)
1330 * Do checks for special cases, but NaN behavior is undefined.
1331 */
1332 LLVMValueRef
1333 lp_build_max(struct lp_build_context *bld,
1334 LLVMValueRef a,
1335 LLVMValueRef b)
1336 {
1337 assert(lp_check_value(bld->type, a));
1338 assert(lp_check_value(bld->type, b));
1339
1340 if(a == bld->undef || b == bld->undef)
1341 return bld->undef;
1342
1343 if(a == b)
1344 return a;
1345
1346 if(bld->type.norm) {
1347 if(a == bld->one || b == bld->one)
1348 return bld->one;
1349 if (!bld->type.sign) {
1350 if (a == bld->zero) {
1351 return b;
1352 }
1353 if (b == bld->zero) {
1354 return a;
1355 }
1356 }
1357 }
1358
1359 return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1360 }
1361
1362
1363 /**
1364 * Generate max(a, b)
1365 * Checks for special cases.
1366 * NaNs are handled according to the behavior specified by the
1367 * nan_behavior argument.
1368 */
1369 LLVMValueRef
1370 lp_build_max_ext(struct lp_build_context *bld,
1371 LLVMValueRef a,
1372 LLVMValueRef b,
1373 enum gallivm_nan_behavior nan_behavior)
1374 {
1375 assert(lp_check_value(bld->type, a));
1376 assert(lp_check_value(bld->type, b));
1377
1378 if(a == bld->undef || b == bld->undef)
1379 return bld->undef;
1380
1381 if(a == b)
1382 return a;
1383
1384 if(bld->type.norm) {
1385 if(a == bld->one || b == bld->one)
1386 return bld->one;
1387 if (!bld->type.sign) {
1388 if (a == bld->zero) {
1389 return b;
1390 }
1391 if (b == bld->zero) {
1392 return a;
1393 }
1394 }
1395 }
1396
1397 return lp_build_max_simple(bld, a, b, nan_behavior);
1398 }
1399
1400 /**
1401 * Generate clamp(a, min, max)
1402 * Do checks for special cases.
1403 */
1404 LLVMValueRef
1405 lp_build_clamp(struct lp_build_context *bld,
1406 LLVMValueRef a,
1407 LLVMValueRef min,
1408 LLVMValueRef max)
1409 {
1410 assert(lp_check_value(bld->type, a));
1411 assert(lp_check_value(bld->type, min));
1412 assert(lp_check_value(bld->type, max));
1413
1414 /*
1415 * XXX dark magic warning: The order of min/max here matters (!).
1416 * The reason is that a typical use case is clamp(a, 0.0, 1.0)
1417 * (for example for float->unorm conversion) and on x86 sse2
1418 * this will give 0.0 for NaNs, whereas doing min first would
1419 * give 1.0 for NaN, which makes d3d10 angry...
1420 * This is very much not guaranteed behavior though; it just
1421 * happens to work on x86 sse2 (and up), and it obviously won't help
1422 * for other non-zero clamps (say -1.0/1.0 in a SNORM conversion) either,
1423 * so this needs to be fixed for real...
1424 */
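   /*
    * Concretely, for clamp(NaN, 0.0, 1.0) on sse2: max(NaN, 0.0) yields 0.0
    * (maxps returns its second operand when the first one is a NaN), and
    * min(0.0, 1.0) then gives 0.0; with min applied first the result would
    * be 1.0 instead.
    */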
1425 a = lp_build_max(bld, a, min);
1426 a = lp_build_min(bld, a, max);
1427 return a;
1428 }
1429
1430
1431 /**
1432 * Generate abs(a)
1433 */
1434 LLVMValueRef
1435 lp_build_abs(struct lp_build_context *bld,
1436 LLVMValueRef a)
1437 {
1438 LLVMBuilderRef builder = bld->gallivm->builder;
1439 const struct lp_type type = bld->type;
1440 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1441
1442 assert(lp_check_value(type, a));
1443
1444 if(!type.sign)
1445 return a;
1446
1447 if(type.floating) {
1448 /* Mask out the sign bit */
1449 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1450 unsigned long long absMask = ~(1ULL << (type.width - 1));
1451 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
1452 a = LLVMBuildBitCast(builder, a, int_vec_type, "");
1453 a = LLVMBuildAnd(builder, a, mask, "");
1454 a = LLVMBuildBitCast(builder, a, vec_type, "");
1455 return a;
1456 }
1457
1458 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
1459 switch(type.width) {
1460 case 8:
1461 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1462 case 16:
1463 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1464 case 32:
1465 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1466 }
1467 }
1468 else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
1469 (gallivm_debug & GALLIVM_DEBUG_PERF) &&
1470 (type.width == 8 || type.width == 16 || type.width == 32)) {
1471 debug_printf("%s: inefficient code, should split vectors manually\n",
1472 __FUNCTION__);
1473 }
1474
1475 return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
1476 }
1477
1478
1479 LLVMValueRef
1480 lp_build_negate(struct lp_build_context *bld,
1481 LLVMValueRef a)
1482 {
1483 LLVMBuilderRef builder = bld->gallivm->builder;
1484
1485 assert(lp_check_value(bld->type, a));
1486
1487 #if HAVE_LLVM >= 0x0207
1488 if (bld->type.floating)
1489 a = LLVMBuildFNeg(builder, a, "");
1490 else
1491 #endif
1492 a = LLVMBuildNeg(builder, a, "");
1493
1494 return a;
1495 }
1496
1497
1498 /** Return -1, 0 or +1 depending on the sign of a */
1499 LLVMValueRef
1500 lp_build_sgn(struct lp_build_context *bld,
1501 LLVMValueRef a)
1502 {
1503 LLVMBuilderRef builder = bld->gallivm->builder;
1504 const struct lp_type type = bld->type;
1505 LLVMValueRef cond;
1506 LLVMValueRef res;
1507
1508 assert(lp_check_value(type, a));
1509
1510 /* Handle non-zero case */
1511 if(!type.sign) {
1512 /* if not zero then sign must be positive */
1513 res = bld->one;
1514 }
1515 else if(type.floating) {
1516 LLVMTypeRef vec_type;
1517 LLVMTypeRef int_type;
1518 LLVMValueRef mask;
1519 LLVMValueRef sign;
1520 LLVMValueRef one;
1521 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1522
1523 int_type = lp_build_int_vec_type(bld->gallivm, type);
1524 vec_type = lp_build_vec_type(bld->gallivm, type);
1525 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1526
1527 /* Take the sign bit and add it to 1 constant */
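      /*
       * E.g. for a = -3.5f the sign bit 0x80000000 or'ed with the bits of
       * 1.0f (0x3f800000) gives 0xbf800000, i.e. -1.0f.
       */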
1528 sign = LLVMBuildBitCast(builder, a, int_type, "");
1529 sign = LLVMBuildAnd(builder, sign, mask, "");
1530 one = LLVMConstBitCast(bld->one, int_type);
1531 res = LLVMBuildOr(builder, sign, one, "");
1532 res = LLVMBuildBitCast(builder, res, vec_type, "");
1533 }
1534 else
1535 {
1536 /* signed int/norm/fixed point */
1537 /* could use psign with sse3 and appropriate vectors here */
1538 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1539 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1540 res = lp_build_select(bld, cond, bld->one, minus_one);
1541 }
1542
1543 /* Handle zero */
1544 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1545 res = lp_build_select(bld, cond, bld->zero, res);
1546
1547 return res;
1548 }
1549
1550
1551 /**
1552 * Set the sign of float vector 'a' according to 'sign'.
1553 * If sign==0, return abs(a).
1554 * If sign==1, return -abs(a);
1555 * Other values for sign produce undefined results.
1556 */
1557 LLVMValueRef
1558 lp_build_set_sign(struct lp_build_context *bld,
1559 LLVMValueRef a, LLVMValueRef sign)
1560 {
1561 LLVMBuilderRef builder = bld->gallivm->builder;
1562 const struct lp_type type = bld->type;
1563 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1564 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1565 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1566 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1567 ~((unsigned long long) 1 << (type.width - 1)));
1568 LLVMValueRef val, res;
1569
1570 assert(type.floating);
1571 assert(lp_check_value(type, a));
1572
1573 /* val = reinterpret_cast<int>(a) */
1574 val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1575 /* val = val & mask */
1576 val = LLVMBuildAnd(builder, val, mask, "");
1577 /* sign = sign << shift */
1578 sign = LLVMBuildShl(builder, sign, shift, "");
1579 /* res = val | sign */
1580 res = LLVMBuildOr(builder, val, sign, "");
1581 /* res = reinterpret_cast<float>(res) */
1582 res = LLVMBuildBitCast(builder, res, vec_type, "");
1583
1584 return res;
1585 }
1586
1587
1588 /**
1589 * Convert vector of (or scalar) int to vector of (or scalar) float.
1590 */
1591 LLVMValueRef
1592 lp_build_int_to_float(struct lp_build_context *bld,
1593 LLVMValueRef a)
1594 {
1595 LLVMBuilderRef builder = bld->gallivm->builder;
1596 const struct lp_type type = bld->type;
1597 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1598
1599 assert(type.floating);
1600
1601 return LLVMBuildSIToFP(builder, a, vec_type, "");
1602 }
1603
1604 static boolean
1605 arch_rounding_available(const struct lp_type type)
1606 {
1607 if ((util_cpu_caps.has_sse4_1 &&
1608 (type.length == 1 || type.width*type.length == 128)) ||
1609 (util_cpu_caps.has_avx && type.width*type.length == 256))
1610 return TRUE;
1611 else if ((util_cpu_caps.has_altivec &&
1612 (type.width == 32 && type.length == 4)))
1613 return TRUE;
1614
1615 return FALSE;
1616 }
1617
1618 enum lp_build_round_mode
1619 {
1620 LP_BUILD_ROUND_NEAREST = 0,
1621 LP_BUILD_ROUND_FLOOR = 1,
1622 LP_BUILD_ROUND_CEIL = 2,
1623 LP_BUILD_ROUND_TRUNCATE = 3
1624 };
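/*
 * Note these values are passed straight through as the rounding-control
 * immediate of the SSE4.1 ROUNDxx instructions below, whose encoding is
 * 0 = nearest (even), 1 = floor, 2 = ceil, 3 = truncate.
 */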
1625
1626 /**
1627 * Helper for SSE4.1's ROUNDxx instructions.
1628 *
1629 * NOTE: In SSE4.1's nearest mode, if two values are equally close, the
1630 * result is the even value. That is, rounding 2.5 will be 2.0, and not 3.0.
1631 */
1632 static INLINE LLVMValueRef
1633 lp_build_round_sse41(struct lp_build_context *bld,
1634 LLVMValueRef a,
1635 enum lp_build_round_mode mode)
1636 {
1637 LLVMBuilderRef builder = bld->gallivm->builder;
1638 const struct lp_type type = bld->type;
1639 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1640 const char *intrinsic;
1641 LLVMValueRef res;
1642
1643 assert(type.floating);
1644
1645 assert(lp_check_value(type, a));
1646 assert(util_cpu_caps.has_sse4_1);
1647
1648 if (type.length == 1) {
1649 LLVMTypeRef vec_type;
1650 LLVMValueRef undef;
1651 LLVMValueRef args[3];
1652 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1653
1654 switch(type.width) {
1655 case 32:
1656 intrinsic = "llvm.x86.sse41.round.ss";
1657 break;
1658 case 64:
1659 intrinsic = "llvm.x86.sse41.round.sd";
1660 break;
1661 default:
1662 assert(0);
1663 return bld->undef;
1664 }
1665
1666 vec_type = LLVMVectorType(bld->elem_type, 4);
1667
1668 undef = LLVMGetUndef(vec_type);
1669
1670 args[0] = undef;
1671 args[1] = LLVMBuildInsertElement(builder, undef, a, index0, "");
1672 args[2] = LLVMConstInt(i32t, mode, 0);
1673
1674 res = lp_build_intrinsic(builder, intrinsic,
1675 vec_type, args, Elements(args));
1676
1677 res = LLVMBuildExtractElement(builder, res, index0, "");
1678 }
1679 else {
1680 if (type.width * type.length == 128) {
1681 switch(type.width) {
1682 case 32:
1683 intrinsic = "llvm.x86.sse41.round.ps";
1684 break;
1685 case 64:
1686 intrinsic = "llvm.x86.sse41.round.pd";
1687 break;
1688 default:
1689 assert(0);
1690 return bld->undef;
1691 }
1692 }
1693 else {
1694 assert(type.width * type.length == 256);
1695 assert(util_cpu_caps.has_avx);
1696
1697 switch(type.width) {
1698 case 32:
1699 intrinsic = "llvm.x86.avx.round.ps.256";
1700 break;
1701 case 64:
1702 intrinsic = "llvm.x86.avx.round.pd.256";
1703 break;
1704 default:
1705 assert(0);
1706 return bld->undef;
1707 }
1708 }
1709
1710 res = lp_build_intrinsic_binary(builder, intrinsic,
1711 bld->vec_type, a,
1712 LLVMConstInt(i32t, mode, 0));
1713 }
1714
1715 return res;
1716 }
1717
1718
1719 static INLINE LLVMValueRef
1720 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1721 LLVMValueRef a)
1722 {
1723 LLVMBuilderRef builder = bld->gallivm->builder;
1724 const struct lp_type type = bld->type;
1725 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1726 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1727 const char *intrinsic;
1728 LLVMValueRef res;
1729
1730 assert(type.floating);
1731 /* using the double precision conversions is a bit more complicated */
1732 assert(type.width == 32);
1733
1734 assert(lp_check_value(type, a));
1735 assert(util_cpu_caps.has_sse2);
1736
1737 /* This is relying on MXCSR rounding mode, which should always be nearest. */
1738 if (type.length == 1) {
1739 LLVMTypeRef vec_type;
1740 LLVMValueRef undef;
1741 LLVMValueRef arg;
1742 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1743
1744 vec_type = LLVMVectorType(bld->elem_type, 4);
1745
1746 intrinsic = "llvm.x86.sse.cvtss2si";
1747
1748 undef = LLVMGetUndef(vec_type);
1749
1750 arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
1751
1752 res = lp_build_intrinsic_unary(builder, intrinsic,
1753 ret_type, arg);
1754 }
1755 else {
1756 if (type.width* type.length == 128) {
1757 intrinsic = "llvm.x86.sse2.cvtps2dq";
1758 }
1759 else {
1760 assert(type.width*type.length == 256);
1761 assert(util_cpu_caps.has_avx);
1762
1763 intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
1764 }
1765 res = lp_build_intrinsic_unary(builder, intrinsic,
1766 ret_type, a);
1767 }
1768
1769 return res;
1770 }
1771
1772
1773 /* Helper for rounding a float vector using Altivec's vrfin/vrfim/vrfip/vrfiz
1774 * instructions; used by lp_build_round_arch(). */
1775 static INLINE LLVMValueRef
1776 lp_build_round_altivec(struct lp_build_context *bld,
1777 LLVMValueRef a,
1778 enum lp_build_round_mode mode)
1779 {
1780 LLVMBuilderRef builder = bld->gallivm->builder;
1781 const struct lp_type type = bld->type;
1782 const char *intrinsic = NULL;
1783
1784 assert(type.floating);
1785
1786 assert(lp_check_value(type, a));
1787 assert(util_cpu_caps.has_altivec);
1788
1789 switch (mode) {
1790 case LP_BUILD_ROUND_NEAREST:
1791 intrinsic = "llvm.ppc.altivec.vrfin";
1792 break;
1793 case LP_BUILD_ROUND_FLOOR:
1794 intrinsic = "llvm.ppc.altivec.vrfim";
1795 break;
1796 case LP_BUILD_ROUND_CEIL:
1797 intrinsic = "llvm.ppc.altivec.vrfip";
1798 break;
1799 case LP_BUILD_ROUND_TRUNCATE:
1800 intrinsic = "llvm.ppc.altivec.vrfiz";
1801 break;
1802 }
1803
1804 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1805 }
1806
1807 static INLINE LLVMValueRef
1808 lp_build_round_arch(struct lp_build_context *bld,
1809 LLVMValueRef a,
1810 enum lp_build_round_mode mode)
1811 {
1812 if (util_cpu_caps.has_sse4_1)
1813 return lp_build_round_sse41(bld, a, mode);
1814 else /* (util_cpu_caps.has_altivec) */
1815 return lp_build_round_altivec(bld, a, mode);
1816 }
1817
1818 /**
1819 * Return the integer part of a float (vector) value (== round toward zero).
1820 * The returned value is a float (vector).
1821 * Ex: trunc(-1.5) = -1.0
1822 */
1823 LLVMValueRef
1824 lp_build_trunc(struct lp_build_context *bld,
1825 LLVMValueRef a)
1826 {
1827 LLVMBuilderRef builder = bld->gallivm->builder;
1828 const struct lp_type type = bld->type;
1829
1830 assert(type.floating);
1831 assert(lp_check_value(type, a));
1832
1833 if (arch_rounding_available(type)) {
1834 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
1835 }
1836 else {
1837 const struct lp_type type = bld->type;
1838 struct lp_type inttype;
1839 struct lp_build_context intbld;
1840 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
1841 LLVMValueRef trunc, res, anosign, mask;
1842 LLVMTypeRef int_vec_type = bld->int_vec_type;
1843 LLVMTypeRef vec_type = bld->vec_type;
1844
1845 assert(type.width == 32); /* might want to handle doubles at some point */
1846
1847 inttype = type;
1848 inttype.floating = 0;
1849 lp_build_context_init(&intbld, bld->gallivm, inttype);
1850
1851 /* round by truncation */
1852 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1853 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
1854
1855 /* mask out sign bit */
1856 anosign = lp_build_abs(bld, a);
1857 /*
1858 * mask out all values if anosign > 2^24
1859 * This should work both for large ints (all rounding is no-op for them
1860 * because such floats are always exact) as well as special cases like
1861 * NaNs, Infs (taking advantage of the fact they use max exponent).
1862 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
1863 */
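      /*
       * (For example, 16777216.5 is not representable as a 32-bit float:
       * every float with magnitude >= 2^24 is already an integer, so
       * returning such values unrounded is exact.)
       */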
1864 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1865 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1866 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1867 return lp_build_select(bld, mask, a, res);
1868 }
1869 }
1870
1871
1872 /**
1873 * Return float (vector) rounded to nearest integer (vector). The returned
1874 * value is a float (vector).
1875 * Ex: round(0.9) = 1.0
1876 * Ex: round(-1.5) = -2.0
1877 */
1878 LLVMValueRef
1879 lp_build_round(struct lp_build_context *bld,
1880 LLVMValueRef a)
1881 {
1882 LLVMBuilderRef builder = bld->gallivm->builder;
1883 const struct lp_type type = bld->type;
1884
1885 assert(type.floating);
1886 assert(lp_check_value(type, a));
1887
1888 if (arch_rounding_available(type)) {
1889 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
1890 }
1891 else {
1892 const struct lp_type type = bld->type;
1893 struct lp_type inttype;
1894 struct lp_build_context intbld;
1895 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
1896 LLVMValueRef res, anosign, mask;
1897 LLVMTypeRef int_vec_type = bld->int_vec_type;
1898 LLVMTypeRef vec_type = bld->vec_type;
1899
1900 assert(type.width == 32); /* might want to handle doubles at some point */
1901
1902 inttype = type;
1903 inttype.floating = 0;
1904 lp_build_context_init(&intbld, bld->gallivm, inttype);
1905
1906 res = lp_build_iround(bld, a);
1907 res = LLVMBuildSIToFP(builder, res, vec_type, "");
1908
1909 /* mask out sign bit */
1910 anosign = lp_build_abs(bld, a);
1911 /*
1912 * mask out all values if anosign > 2^24
1913 * This should work both for large ints (all rounding is no-op for them
1914 * because such floats are always exact) as well as special cases like
1915 * NaNs, Infs (taking advantage of the fact they use max exponent).
1916 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
1917 */
1918 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1919 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1920 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1921 return lp_build_select(bld, mask, a, res);
1922 }
1923 }
1924
1925
1926 /**
1927 * Return floor of float (vector), result is a float (vector)
1928 * Ex: floor(1.1) = 1.0
1929 * Ex: floor(-1.1) = -2.0
1930 */
1931 LLVMValueRef
1932 lp_build_floor(struct lp_build_context *bld,
1933 LLVMValueRef a)
1934 {
1935 LLVMBuilderRef builder = bld->gallivm->builder;
1936 const struct lp_type type = bld->type;
1937
1938 assert(type.floating);
1939 assert(lp_check_value(type, a));
1940
1941 if (arch_rounding_available(type)) {
1942 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
1943 }
1944 else {
1945 const struct lp_type type = bld->type;
1946 struct lp_type inttype;
1947 struct lp_build_context intbld;
1948       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
1949 LLVMValueRef trunc, res, anosign, mask;
1950 LLVMTypeRef int_vec_type = bld->int_vec_type;
1951 LLVMTypeRef vec_type = bld->vec_type;
1952
1953 assert(type.width == 32); /* might want to handle doubles at some point */
1954
1955 inttype = type;
1956 inttype.floating = 0;
1957 lp_build_context_init(&intbld, bld->gallivm, inttype);
1958
1959 /* round by truncation */
1960 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1961 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
1962
1963 if (type.sign) {
1964 LLVMValueRef tmp;
1965
1966 /*
1967 * fix values if rounding is wrong (for non-special cases)
1968 * - this is the case if trunc > a
1969 */
1970 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
1971 /* tmp = trunc > a ? 1.0 : 0.0 */
1972 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
1973 tmp = lp_build_and(&intbld, mask, tmp);
1974 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
1975 res = lp_build_sub(bld, res, tmp);
1976 }
1977
1978 /* mask out sign bit */
1979 anosign = lp_build_abs(bld, a);
1980 /*
1981 * mask out all values if anosign > 2^24
1982 * This should work both for large ints (all rounding is no-op for them
1983 * because such floats are always exact) as well as special cases like
1984 * NaNs, Infs (taking advantage of the fact they use max exponent).
1985       * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
1986 */
1987 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1988 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1989 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1990 return lp_build_select(bld, mask, a, res);
1991 }
1992 }
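/*
 * Illustration of the sign fixup above: the comparison mask is all-ones or
 * all-zeros per channel, so ANDing it with the bit pattern of 1.0 yields
 * exactly 1.0 or 0.0 without a branch. E.g. floor(-1.3): truncation gives
 * -1.0, which is > -1.3, so tmp = 1.0 and res = -1.0 - 1.0 = -2.0;
 * floor(1.3): truncation gives 1.0, which is not > 1.3, so tmp = 0.0 and
 * res stays 1.0.
 */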
1993
1994
1995 /**
1996 * Return ceiling of float (vector), returning float (vector).
1997 * Ex: ceil( 1.1) = 2.0
1998 * Ex: ceil(-1.1) = -1.0
1999 */
2000 LLVMValueRef
2001 lp_build_ceil(struct lp_build_context *bld,
2002 LLVMValueRef a)
2003 {
2004 LLVMBuilderRef builder = bld->gallivm->builder;
2005 const struct lp_type type = bld->type;
2006
2007 assert(type.floating);
2008 assert(lp_check_value(type, a));
2009
2010 if (arch_rounding_available(type)) {
2011 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2012 }
2013 else {
2014 const struct lp_type type = bld->type;
2015 struct lp_type inttype;
2016 struct lp_build_context intbld;
2017       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
2018 LLVMValueRef trunc, res, anosign, mask, tmp;
2019 LLVMTypeRef int_vec_type = bld->int_vec_type;
2020 LLVMTypeRef vec_type = bld->vec_type;
2021
2022 assert(type.width == 32); /* might want to handle doubles at some point */
2023
2024 inttype = type;
2025 inttype.floating = 0;
2026 lp_build_context_init(&intbld, bld->gallivm, inttype);
2027
2028 /* round by truncation */
2029 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2030 trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2031
2032 /*
2033 * fix values if rounding is wrong (for non-special cases)
2034 * - this is the case if trunc < a
2035 */
2036 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2037 /* tmp = trunc < a ? 1.0 : 0.0 */
2038 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2039 tmp = lp_build_and(&intbld, mask, tmp);
2040 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2041 res = lp_build_add(bld, trunc, tmp);
2042
2043 /* mask out sign bit */
2044 anosign = lp_build_abs(bld, a);
2045 /*
2046 * mask out all values if anosign > 2^24
2047 * This should work both for large ints (all rounding is no-op for them
2048 * because such floats are always exact) as well as special cases like
2049 * NaNs, Infs (taking advantage of the fact they use max exponent).
2050       * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2051 */
2052 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2053 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2054 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2055 return lp_build_select(bld, mask, a, res);
2056 }
2057 }
2058
2059
2060 /**
2061 * Return fractional part of 'a' computed as a - floor(a)
2062 * Typically used in texture coord arithmetic.
2063 */
2064 LLVMValueRef
2065 lp_build_fract(struct lp_build_context *bld,
2066 LLVMValueRef a)
2067 {
2068 assert(bld->type.floating);
2069 return lp_build_sub(bld, a, lp_build_floor(bld, a));
2070 }
2071
2072
2073 /**
2074 * Prevent returning a fractional part of 1.0 for very small negative values of
2075 * 'a' by clamping against 0.99999(9).
2076 */
2077 static inline LLVMValueRef
2078 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2079 {
2080 LLVMValueRef max;
2081
2082 /* this is the largest number smaller than 1.0 representable as float */
2083 max = lp_build_const_vec(bld->gallivm, bld->type,
2084 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2085 return lp_build_min(bld, fract, max);
2086 }
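/*
 * For 32-bit floats (lp_mantissa() == 23) the clamp value above works out to
 * 1.0 - 1.0/2^24 = 0.99999994, the largest float strictly below 1.0, so the
 * clamped fract can never round up to exactly 1.0.
 */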
2087
2088
2089 /**
2090 * Same as lp_build_fract, but guarantees that the result is always smaller
2091 * than one.
2092 */
2093 LLVMValueRef
2094 lp_build_fract_safe(struct lp_build_context *bld,
2095 LLVMValueRef a)
2096 {
2097 return clamp_fract(bld, lp_build_fract(bld, a));
2098 }
2099
2100
2101 /**
2102 * Return the integer part of a float (vector) value (== round toward zero).
2103 * The returned value is an integer (vector).
2104 * Ex: itrunc(-1.5) = -1
2105 */
2106 LLVMValueRef
2107 lp_build_itrunc(struct lp_build_context *bld,
2108 LLVMValueRef a)
2109 {
2110 LLVMBuilderRef builder = bld->gallivm->builder;
2111 const struct lp_type type = bld->type;
2112 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2113
2114 assert(type.floating);
2115 assert(lp_check_value(type, a));
2116
2117 return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2118 }
2119
2120
2121 /**
2122 * Return float (vector) rounded to nearest integer (vector). The returned
2123 * value is an integer (vector).
2124 * Ex: iround(0.9) = 1
2125 * Ex: iround(-1.5) = -2
2126 */
2127 LLVMValueRef
2128 lp_build_iround(struct lp_build_context *bld,
2129 LLVMValueRef a)
2130 {
2131 LLVMBuilderRef builder = bld->gallivm->builder;
2132 const struct lp_type type = bld->type;
2133 LLVMTypeRef int_vec_type = bld->int_vec_type;
2134 LLVMValueRef res;
2135
2136 assert(type.floating);
2137
2138 assert(lp_check_value(type, a));
2139
2140 if ((util_cpu_caps.has_sse2 &&
2141 ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2142 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2143 return lp_build_iround_nearest_sse2(bld, a);
2144 }
2145 if (arch_rounding_available(type)) {
2146 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2147 }
2148 else {
2149 LLVMValueRef half;
2150
2151 half = lp_build_const_vec(bld->gallivm, type, 0.5);
2152
2153 if (type.sign) {
2154 LLVMTypeRef vec_type = bld->vec_type;
2155 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2156 (unsigned long long)1 << (type.width - 1));
2157 LLVMValueRef sign;
2158
2159 /* get sign bit */
2160 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2161 sign = LLVMBuildAnd(builder, sign, mask, "");
2162
2163 /* sign * 0.5 */
2164 half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2165 half = LLVMBuildOr(builder, sign, half, "");
2166 half = LLVMBuildBitCast(builder, half, vec_type, "");
2167 }
2168
2169 res = LLVMBuildFAdd(builder, a, half, "");
2170 }
2171
2172 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2173
2174 return res;
2175 }
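/*
 * Quick sanity check of the add-half fallback above: the sign bit of 'a' is
 * ORed into 0.5, so we effectively add copysign(0.5, a) before truncating.
 * E.g. a = 2.3 -> 2.8 -> trunc 2; a = -1.5 -> -2.0 -> trunc -2, matching the
 * iround(-1.5) = -2 example in the comment above.
 */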
2176
2177
2178 /**
2179 * Return floor of float (vector), result is an int (vector)
2180  * Ex: ifloor(1.1) = 1
2181  * Ex: ifloor(-1.1) = -2
2182 */
2183 LLVMValueRef
2184 lp_build_ifloor(struct lp_build_context *bld,
2185 LLVMValueRef a)
2186 {
2187 LLVMBuilderRef builder = bld->gallivm->builder;
2188 const struct lp_type type = bld->type;
2189 LLVMTypeRef int_vec_type = bld->int_vec_type;
2190 LLVMValueRef res;
2191
2192 assert(type.floating);
2193 assert(lp_check_value(type, a));
2194
2195 res = a;
2196 if (type.sign) {
2197 if (arch_rounding_available(type)) {
2198 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2199 }
2200 else {
2201 struct lp_type inttype;
2202 struct lp_build_context intbld;
2203 LLVMValueRef trunc, itrunc, mask;
2204
2205 assert(type.floating);
2206 assert(lp_check_value(type, a));
2207
2208 inttype = type;
2209 inttype.floating = 0;
2210 lp_build_context_init(&intbld, bld->gallivm, inttype);
2211
2212 /* round by truncation */
2213 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2214 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2215
2216 /*
2217 * fix values if rounding is wrong (for non-special cases)
2218 * - this is the case if trunc > a
2219 * The results of doing this with NaNs, very large values etc.
2220 * are undefined but this seems to be the case anyway.
2221 */
2222 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2223 /* cheapie minus one with mask since the mask is minus one / zero */
2224 return lp_build_add(&intbld, itrunc, mask);
2225 }
2226 }
2227
2228    /* truncate (round toward zero) */
2229 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2230
2231 return res;
2232 }
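/*
 * The "cheapie minus one" above relies on lp_build_cmp() producing integer
 * masks of -1 (true) or 0 (false), so an integer add of the mask subtracts
 * one exactly where needed. E.g. ifloor(-1.3): itrunc = -1, trunc = -1.0 is
 * > -1.3, mask = -1, result -1 + (-1) = -2; ifloor(1.3): mask = 0, result
 * stays 1.
 */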
2233
2234
2235 /**
2236 * Return ceiling of float (vector), returning int (vector).
2237 * Ex: iceil( 1.1) = 2
2238 * Ex: iceil(-1.1) = -1
2239 */
2240 LLVMValueRef
2241 lp_build_iceil(struct lp_build_context *bld,
2242 LLVMValueRef a)
2243 {
2244 LLVMBuilderRef builder = bld->gallivm->builder;
2245 const struct lp_type type = bld->type;
2246 LLVMTypeRef int_vec_type = bld->int_vec_type;
2247 LLVMValueRef res;
2248
2249 assert(type.floating);
2250 assert(lp_check_value(type, a));
2251
2252 if (arch_rounding_available(type)) {
2253 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2254 }
2255 else {
2256 struct lp_type inttype;
2257 struct lp_build_context intbld;
2258 LLVMValueRef trunc, itrunc, mask;
2259
2260 assert(type.floating);
2261 assert(lp_check_value(type, a));
2262
2263 inttype = type;
2264 inttype.floating = 0;
2265 lp_build_context_init(&intbld, bld->gallivm, inttype);
2266
2267 /* round by truncation */
2268 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2269 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2270
2271 /*
2272 * fix values if rounding is wrong (for non-special cases)
2273 * - this is the case if trunc < a
2274 * The results of doing this with NaNs, very large values etc.
2275 * are undefined but this seems to be the case anyway.
2276 */
2277 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2278 /* cheapie plus one with mask since the mask is minus one / zero */
2279 return lp_build_sub(&intbld, itrunc, mask);
2280 }
2281
2282    /* truncate (round toward zero) */
2283 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2284
2285 return res;
2286 }
2287
2288
2289 /**
2290 * Combined ifloor() & fract().
2291 *
2292 * Preferred to calling the functions separately, as it will ensure that the
2293 * strategy (floor() vs ifloor()) that results in less redundant work is used.
2294 */
2295 void
2296 lp_build_ifloor_fract(struct lp_build_context *bld,
2297 LLVMValueRef a,
2298 LLVMValueRef *out_ipart,
2299 LLVMValueRef *out_fpart)
2300 {
2301 LLVMBuilderRef builder = bld->gallivm->builder;
2302 const struct lp_type type = bld->type;
2303 LLVMValueRef ipart;
2304
2305 assert(type.floating);
2306 assert(lp_check_value(type, a));
2307
2308 if (arch_rounding_available(type)) {
2309 /*
2310 * floor() is easier.
2311 */
2312
2313 ipart = lp_build_floor(bld, a);
2314 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2315 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2316 }
2317 else {
2318 /*
2319 * ifloor() is easier.
2320 */
2321
2322 *out_ipart = lp_build_ifloor(bld, a);
2323 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2324 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2325 }
2326 }
2327
2328
2329 /**
2330 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2331 * always smaller than one.
2332 */
2333 void
2334 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2335 LLVMValueRef a,
2336 LLVMValueRef *out_ipart,
2337 LLVMValueRef *out_fpart)
2338 {
2339 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2340 *out_fpart = clamp_fract(bld, *out_fpart);
2341 }
2342
2343
2344 LLVMValueRef
2345 lp_build_sqrt(struct lp_build_context *bld,
2346 LLVMValueRef a)
2347 {
2348 LLVMBuilderRef builder = bld->gallivm->builder;
2349 const struct lp_type type = bld->type;
2350 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2351 char intrinsic[32];
2352
2353 assert(lp_check_value(type, a));
2354
2355 /* TODO: optimize the constant case */
2356
2357 assert(type.floating);
2358 if (type.length == 1) {
2359 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.f%u", type.width);
2360 }
2361 else {
2362 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
2363 }
2364
2365 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2366 }
2367
2368
2369 /**
2370  * Do one Newton-Raphson step to improve reciprocal precision:
2371 *
2372 * x_{i+1} = x_i * (2 - a * x_i)
2373 *
2374 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2375 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
2376  * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
2377 * halo. It would be necessary to clamp the argument to prevent this.
2378 *
2379 * See also:
2380 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2381 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2382 */
2383 static INLINE LLVMValueRef
2384 lp_build_rcp_refine(struct lp_build_context *bld,
2385 LLVMValueRef a,
2386 LLVMValueRef rcp_a)
2387 {
2388 LLVMBuilderRef builder = bld->gallivm->builder;
2389 LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
2390 LLVMValueRef res;
2391
2392 res = LLVMBuildFMul(builder, a, rcp_a, "");
2393 res = LLVMBuildFSub(builder, two, res, "");
2394 res = LLVMBuildFMul(builder, rcp_a, res, "");
2395
2396 return res;
2397 }
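/*
 * One refinement step roughly doubles the number of correct bits. For example,
 * with a = 3.0 and a ~12-bit estimate rcp_a = 0.333, the step above gives
 * 0.333 * (2 - 3.0 * 0.333) = 0.333 * 1.001 = 0.333333, i.e. the error drops
 * from about 3e-4 to about 3e-7 in a single iteration.
 */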
2398
2399
2400 LLVMValueRef
2401 lp_build_rcp(struct lp_build_context *bld,
2402 LLVMValueRef a)
2403 {
2404 LLVMBuilderRef builder = bld->gallivm->builder;
2405 const struct lp_type type = bld->type;
2406
2407 assert(lp_check_value(type, a));
2408
2409 if(a == bld->zero)
2410 return bld->undef;
2411 if(a == bld->one)
2412 return bld->one;
2413 if(a == bld->undef)
2414 return bld->undef;
2415
2416 assert(type.floating);
2417
2418 if(LLVMIsConstant(a))
2419 return LLVMConstFDiv(bld->one, a);
2420
2421 /*
2422 * We don't use RCPPS because:
2423     * - it only has 10 bits of precision
2424     * - it doesn't even get the reciprocal of 1.0 exactly
2425     * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2426     * - for recent processors the benefit over DIVPS is marginal, and case
2427     *   dependent
2428     *
2429     * We could still use it on certain processors if benchmarks show that the
2430     * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
2431     *   particular uses that require fewer workarounds.
2432 */
2433
2434 if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2435 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2436 const unsigned num_iterations = 0;
2437 LLVMValueRef res;
2438 unsigned i;
2439 const char *intrinsic = NULL;
2440
2441 if (type.length == 4) {
2442 intrinsic = "llvm.x86.sse.rcp.ps";
2443 }
2444 else {
2445 intrinsic = "llvm.x86.avx.rcp.ps.256";
2446 }
2447
2448 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2449
2450 for (i = 0; i < num_iterations; ++i) {
2451 res = lp_build_rcp_refine(bld, a, res);
2452 }
2453
2454 return res;
2455 }
2456
2457 return LLVMBuildFDiv(builder, bld->one, a, "");
2458 }
2459
2460
2461 /**
2462 * Do one Newton-Raphson step to improve rsqrt precision:
2463 *
2464 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2465 *
2466 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2467 */
2468 static INLINE LLVMValueRef
2469 lp_build_rsqrt_refine(struct lp_build_context *bld,
2470 LLVMValueRef a,
2471 LLVMValueRef rsqrt_a)
2472 {
2473 LLVMBuilderRef builder = bld->gallivm->builder;
2474 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2475 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2476 LLVMValueRef res;
2477
2478 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2479 res = LLVMBuildFMul(builder, a, res, "");
2480 res = LLVMBuildFSub(builder, three, res, "");
2481 res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2482 res = LLVMBuildFMul(builder, half, res, "");
2483
2484 return res;
2485 }
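/*
 * Same quadratic convergence as the rcp refinement: with a = 4.0 and a crude
 * estimate rsqrt_a = 0.49, the step above computes
 * 0.5 * 0.49 * (3 - 4.0 * 0.49 * 0.49) = 0.5 * 0.49 * 2.0396 = 0.4997,
 * closing most of the gap to the exact value 0.5 in a single iteration.
 */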
2486
2487
2488 /**
2489 * Generate 1/sqrt(a).
2490 * Result is undefined for values < 0, infinity for +0.
2491 */
2492 LLVMValueRef
2493 lp_build_rsqrt(struct lp_build_context *bld,
2494 LLVMValueRef a)
2495 {
2496 LLVMBuilderRef builder = bld->gallivm->builder;
2497 const struct lp_type type = bld->type;
2498
2499 assert(lp_check_value(type, a));
2500
2501 assert(type.floating);
2502
2503 /*
2504 * This should be faster but all denormals will end up as infinity.
2505 */
2506 if (0 && lp_build_fast_rsqrt_available(type)) {
2507 const unsigned num_iterations = 1;
2508 LLVMValueRef res;
2509 unsigned i;
2510
2511 /* rsqrt(1.0) != 1.0 here */
2512 res = lp_build_fast_rsqrt(bld, a);
2513
2514 if (num_iterations) {
2515 /*
2516 * Newton-Raphson will result in NaN instead of infinity for zero,
2517 * and NaN instead of zero for infinity.
2518 * Also, need to ensure rsqrt(1.0) == 1.0.
2519 * All numbers smaller than FLT_MIN will result in +infinity
2520 * (rsqrtps treats all denormals as zero).
2521 */
2522 /*
2523 * Certain non-c99 compilers don't know INFINITY and might not support
2524        * hacks to evaluate it at compile time either.
2525 */
2526 const unsigned posinf_int = 0x7F800000;
2527 LLVMValueRef cmp;
2528 LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2529 LLVMValueRef inf = lp_build_const_int_vec(bld->gallivm, type, posinf_int);
2530
2531 inf = LLVMBuildBitCast(builder, inf, lp_build_vec_type(bld->gallivm, type), "");
2532
2533 for (i = 0; i < num_iterations; ++i) {
2534 res = lp_build_rsqrt_refine(bld, a, res);
2535 }
2536 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2537 res = lp_build_select(bld, cmp, inf, res);
2538 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2539 res = lp_build_select(bld, cmp, bld->zero, res);
2540 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2541 res = lp_build_select(bld, cmp, bld->one, res);
2542 }
2543
2544 return res;
2545 }
2546
2547 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2548 }
2549
2550 /**
2551  * Report whether a fast (inaccurate) rsqrt instruction is available.
2552  * Callers may want to avoid calling rsqrt_fast if it's not available:
2553  * e.g. x^0.5 can be computed as rsqrt_fast(x) * x, but if the fast path is
2554  * unavailable that would expand to sqrt/div/mul, so it's obviously much
2555  * better to just call sqrt directly, skipping both the div and the mul.
2556 */
2557 boolean
2558 lp_build_fast_rsqrt_available(struct lp_type type)
2559 {
2560 assert(type.floating);
2561
2562 if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2563 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2564 return true;
2565 }
2566 return false;
2567 }
2568
2569
2570 /**
2571 * Generate 1/sqrt(a).
2572 * Result is undefined for values < 0, infinity for +0.
2573 * Precision is limited, only ~10 bits guaranteed
2574 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2575 */
2576 LLVMValueRef
2577 lp_build_fast_rsqrt(struct lp_build_context *bld,
2578 LLVMValueRef a)
2579 {
2580 LLVMBuilderRef builder = bld->gallivm->builder;
2581 const struct lp_type type = bld->type;
2582
2583 assert(lp_check_value(type, a));
2584
2585 if (lp_build_fast_rsqrt_available(type)) {
2586 const char *intrinsic = NULL;
2587
2588 if (type.length == 4) {
2589 intrinsic = "llvm.x86.sse.rsqrt.ps";
2590 }
2591 else {
2592 intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2593 }
2594 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2595 }
2596 else {
2597 debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2598 }
2599 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2600 }
2601
2602
2603 /**
2604 * Generate sin(a) using SSE2
2605 */
2606 LLVMValueRef
2607 lp_build_sin(struct lp_build_context *bld,
2608 LLVMValueRef a)
2609 {
2610 struct gallivm_state *gallivm = bld->gallivm;
2611 LLVMBuilderRef builder = gallivm->builder;
2612 struct lp_type int_type = lp_int_type(bld->type);
2613 LLVMBuilderRef b = builder;
2614
2615 /*
2616 * take the absolute value,
2617 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2618 */
2619
2620 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2621 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2622
2623 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2624 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2625
2626 /*
2627 * extract the sign bit (upper one)
2628 * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
2629 */
2630 LLVMValueRef sig_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2631 LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i");
2632
2633 /*
2634 * scale by 4/Pi
2635 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2636 */
2637
2638 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2639 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2640
2641 /*
2642 * store the integer part of y in mm0
2643 * emm2 = _mm_cvttps_epi32(y);
2644 */
2645
2646 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2647
2648 /*
2649 * j=(j+1) & (~1) (see the cephes sources)
2650 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2651 */
2652
2653 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2654 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2655 /*
2656 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2657 */
2658 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2659 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2660
2661 /*
2662 * y = _mm_cvtepi32_ps(emm2);
2663 */
2664 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2665
2666 /* get the swap sign flag
2667 * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
2668 */
2669 LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2670 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and");
2671
2672 /*
2673 * emm2 = _mm_slli_epi32(emm0, 29);
2674 */
2675 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2676 LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit");
2677
2678 /*
2679      * get the polynomial selection mask
2680      * there is one polynomial for 0 <= x <= Pi/4
2681      * and another one for Pi/4 < x <= Pi/2
2682 * Both branches will be computed.
2683 *
2684 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2685 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2686 */
2687
2688 LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2689 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3");
2690 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2691 int_type, PIPE_FUNC_EQUAL,
2692 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2693 /*
2694 * sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
2695 */
2696 LLVMValueRef sign_bit_1 = LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit");
2697
2698 /*
2699 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2700 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2701 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2702 */
2703 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2704 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2705 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2706
2707 /*
2708 * The magic pass: "Extended precision modular arithmetic"
2709 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2710 * xmm1 = _mm_mul_ps(y, xmm1);
2711 * xmm2 = _mm_mul_ps(y, xmm2);
2712 * xmm3 = _mm_mul_ps(y, xmm3);
2713 */
2714 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
2715 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
2716 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
2717
2718 /*
2719 * x = _mm_add_ps(x, xmm1);
2720 * x = _mm_add_ps(x, xmm2);
2721 * x = _mm_add_ps(x, xmm3);
2722 */
2723
2724 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
2725 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
2726 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
2727
2728 /*
2729      * Evaluate the first polynomial (0 <= x <= Pi/4)
2730 *
2731 * z = _mm_mul_ps(x,x);
2732 */
2733 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2734
2735 /*
2736 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2737 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2738 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2739 */
2740 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2741 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2742 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2743
2744 /*
2745 * y = *(v4sf*)_ps_coscof_p0;
2746 * y = _mm_mul_ps(y, z);
2747 */
2748 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
2749 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
2750 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
2751 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
2752 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2753 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2754
2755
2756 /*
2757 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2758 * y = _mm_sub_ps(y, tmp);
2759 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2760 */
2761 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2762 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2763 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2764 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2765 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2766
2767 /*
2768 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2769 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2770 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2771 */
2772 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2773 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2774 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2775
2776 /*
2777      * Evaluate the second polynomial (Pi/4 < x <= Pi/2)
2778 *
2779 * y2 = *(v4sf*)_ps_sincof_p0;
2780 * y2 = _mm_mul_ps(y2, z);
2781 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2782 * y2 = _mm_mul_ps(y2, z);
2783 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2784 * y2 = _mm_mul_ps(y2, z);
2785 * y2 = _mm_mul_ps(y2, x);
2786 * y2 = _mm_add_ps(y2, x);
2787 */
2788
2789 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
2790 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
2791 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
2792 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
2793 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2794 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
2795 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
2796
2797 /*
2798      * select the correct result from the two polynomials
2799 * xmm3 = poly_mask;
2800 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2801 * y = _mm_andnot_ps(xmm3, y);
2802 * y = _mm_or_ps(y,y2);
2803 */
2804 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2805 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2806 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2807 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
2808 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2809 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
2810
2811 /*
2812 * update the sign
2813 * y = _mm_xor_ps(y, sign_bit);
2814 */
2815 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin");
2816 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
2817 LLVMValueRef isfinite = lp_build_isfinite(bld, a);
2818
2819 /* clamp output to be within [-1, 1] */
2820 y_result = lp_build_clamp(bld, y_result,
2821 lp_build_const_vec(bld->gallivm, bld->type, -1.f),
2822 lp_build_const_vec(bld->gallivm, bld->type, 1.f));
2823 /* If a is -inf, inf or NaN then return NaN */
2824 y_result = lp_build_select(bld, isfinite, y_result,
2825 lp_build_const_vec(bld->gallivm, bld->type, NAN));
2826 return y_result;
2827 }
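/*
 * Both lp_build_sin() above and lp_build_cos() below follow the classic
 * cephes/sse_mathfun scheme: scale |a| by 4/Pi, round the quadrant count to an
 * even value via j = (j+1) & ~1, subtract j * Pi/4 in three pieces (the DP1/
 * DP2/DP3 constants sum to -Pi/4, split for extra precision), then evaluate
 * either the sine or the cosine minimax polynomial on the reduced argument and
 * patch the sign back in from the quadrant bits. E.g. a = 3.0: y = a * 4/Pi is
 * about 3.82, the rounding step yields quadrant 4, the reduced argument is
 * 3.0 - 4 * Pi/4 = -0.1416, and the polynomial plus sign fixup reproduce
 * sin(3.0) = 0.1411.
 */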
2828
2829
2830 /**
2831 * Generate cos(a) using SSE2
2832 */
2833 LLVMValueRef
2834 lp_build_cos(struct lp_build_context *bld,
2835 LLVMValueRef a)
2836 {
2837 struct gallivm_state *gallivm = bld->gallivm;
2838 LLVMBuilderRef builder = gallivm->builder;
2839 struct lp_type int_type = lp_int_type(bld->type);
2840 LLVMBuilderRef b = builder;
2841
2842 /*
2843 * take the absolute value,
2844 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2845 */
2846
2847 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2848 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2849
2850 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2851 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2852
2853 /*
2854 * scale by 4/Pi
2855 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2856 */
2857
2858 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2859 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2860
2861 /*
2862 * store the integer part of y in mm0
2863 * emm2 = _mm_cvttps_epi32(y);
2864 */
2865
2866 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2867
2868 /*
2869 * j=(j+1) & (~1) (see the cephes sources)
2870 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2871 */
2872
2873 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2874 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2875 /*
2876 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2877 */
2878 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2879 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2880
2881 /*
2882 * y = _mm_cvtepi32_ps(emm2);
2883 */
2884 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2885
2886
2887 /*
2888 * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
2889 */
2890 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2891 LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2");
2892
2893
2894 /* get the swap sign flag
2895 * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
2896 */
2897 LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0);
2898 LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not");
2899 LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2900 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and");
2901
2902 /*
2903 * emm2 = _mm_slli_epi32(emm0, 29);
2904 */
2905 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2906 LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit");
2907
2908 /*
2909      * get the polynomial selection mask
2910      * there is one polynomial for 0 <= x <= Pi/4
2911      * and another one for Pi/4 < x <= Pi/2
2912 * Both branches will be computed.
2913 *
2914 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2915 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2916 */
2917
2918 LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2919 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3");
2920 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2921 int_type, PIPE_FUNC_EQUAL,
2922 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2923
2924 /*
2925 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2926 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2927 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2928 */
2929 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2930 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2931 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2932
2933 /*
2934 * The magic pass: "Extended precision modular arithmetic"
2935 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2936 * xmm1 = _mm_mul_ps(y, xmm1);
2937 * xmm2 = _mm_mul_ps(y, xmm2);
2938 * xmm3 = _mm_mul_ps(y, xmm3);
2939 */
2940 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
2941 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
2942 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
2943
2944 /*
2945 * x = _mm_add_ps(x, xmm1);
2946 * x = _mm_add_ps(x, xmm2);
2947 * x = _mm_add_ps(x, xmm3);
2948 */
2949
2950 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
2951 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
2952 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
2953
2954 /*
2955      * Evaluate the first polynomial (0 <= x <= Pi/4)
2956 *
2957 * z = _mm_mul_ps(x,x);
2958 */
2959 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2960
2961 /*
2962 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2963 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2964 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2965 */
2966 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2967 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2968 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2969
2970 /*
2971 * y = *(v4sf*)_ps_coscof_p0;
2972 * y = _mm_mul_ps(y, z);
2973 */
2974 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
2975 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
2976 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
2977 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
2978 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2979 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2980
2981
2982 /*
2983 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2984 * y = _mm_sub_ps(y, tmp);
2985 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2986 */
2987 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2988 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2989 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2990 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2991 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2992
2993 /*
2994 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2995 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2996 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2997 */
2998 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2999 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
3000 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
3001
3002 /*
3003      * Evaluate the second polynomial (Pi/4 < x <= Pi/2)
3004 *
3005 * y2 = *(v4sf*)_ps_sincof_p0;
3006 * y2 = _mm_mul_ps(y2, z);
3007 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
3008 * y2 = _mm_mul_ps(y2, z);
3009 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
3010 * y2 = _mm_mul_ps(y2, z);
3011 * y2 = _mm_mul_ps(y2, x);
3012 * y2 = _mm_add_ps(y2, x);
3013 */
3014
3015 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
3016 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
3017 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
3018 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
3019 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
3020 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
3021 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
3022
3023 /*
3024      * select the correct result from the two polynomials
3025 * xmm3 = poly_mask;
3026 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
3027 * y = _mm_andnot_ps(xmm3, y);
3028 * y = _mm_or_ps(y,y2);
3029 */
3030 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
3031 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
3032 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
3033 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
3034 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
3035 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
3036
3037 /*
3038 * update the sign
3039 * y = _mm_xor_ps(y, sign_bit);
3040 */
3041 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sin");
3042 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
3043 LLVMValueRef isfinite = lp_build_isfinite(bld, a);
3044
3045 /* clamp output to be within [-1, 1] */
3046 y_result = lp_build_clamp(bld, y_result,
3047 lp_build_const_vec(bld->gallivm, bld->type, -1.f),
3048 lp_build_const_vec(bld->gallivm, bld->type, 1.f));
3049 /* If a is -inf, inf or NaN then return NaN */
3050 y_result = lp_build_select(bld, isfinite, y_result,
3051 lp_build_const_vec(bld->gallivm, bld->type, NAN));
3052 return y_result;
3053 }
3054
3055
3056 /**
3057 * Generate pow(x, y)
3058 */
3059 LLVMValueRef
3060 lp_build_pow(struct lp_build_context *bld,
3061 LLVMValueRef x,
3062 LLVMValueRef y)
3063 {
3064 /* TODO: optimize the constant case */
3065 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3066 LLVMIsConstant(x) && LLVMIsConstant(y)) {
3067 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3068 __FUNCTION__);
3069 }
3070
3071 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
3072 }
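/*
 * The identity used above is x^y = 2^(y * log2(x)); e.g. pow(2.0, 10.0) goes
 * through exp2(10.0 * 1.0) = 1024.0. Accuracy is therefore bounded by the
 * exp2/log2 polynomial approximations below rather than by a dedicated pow
 * routine.
 */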
3073
3074
3075 /**
3076 * Generate exp(x)
3077 */
3078 LLVMValueRef
3079 lp_build_exp(struct lp_build_context *bld,
3080 LLVMValueRef x)
3081 {
3082 /* log2(e) = 1/log(2) */
3083 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
3084 1.4426950408889634);
3085
3086 assert(lp_check_value(bld->type, x));
3087
3088 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
3089 }
3090
3091
3092 /**
3093 * Generate log(x)
3094 * Behavior is undefined with infs, 0s and nans
3095 */
3096 LLVMValueRef
3097 lp_build_log(struct lp_build_context *bld,
3098 LLVMValueRef x)
3099 {
3100 /* log(2) */
3101 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3102 0.69314718055994529);
3103
3104 assert(lp_check_value(bld->type, x));
3105
3106 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
3107 }
3108
3109 /**
3110 * Generate log(x) that handles edge cases (infs, 0s and nans)
3111 */
3112 LLVMValueRef
3113 lp_build_log_safe(struct lp_build_context *bld,
3114 LLVMValueRef x)
3115 {
3116 /* log(2) */
3117 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3118 0.69314718055994529);
3119
3120 assert(lp_check_value(bld->type, x));
3121
3122 return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
3123 }
3124
3125
3126 /**
3127 * Generate polynomial.
3128 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
3129 */
3130 LLVMValueRef
3131 lp_build_polynomial(struct lp_build_context *bld,
3132 LLVMValueRef x,
3133 const double *coeffs,
3134 unsigned num_coeffs)
3135 {
3136 const struct lp_type type = bld->type;
3137 LLVMValueRef even = NULL, odd = NULL;
3138 LLVMValueRef x2;
3139 unsigned i;
3140
3141 assert(lp_check_value(bld->type, x));
3142
3143 /* TODO: optimize the constant case */
3144 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3145 LLVMIsConstant(x)) {
3146 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3147 __FUNCTION__);
3148 }
3149
3150 /*
3151     * Calculate odd and even terms separately to decrease data dependency
3152 * Ex:
3153 * c[0] + x^2 * c[2] + x^4 * c[4] ...
3154 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
3155 */
3156 x2 = lp_build_mul(bld, x, x);
3157
3158 for (i = num_coeffs; i--; ) {
3159 LLVMValueRef coeff;
3160
3161 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
3162
3163 if (i % 2 == 0) {
3164 if (even)
3165 even = lp_build_add(bld, coeff, lp_build_mul(bld, x2, even));
3166 else
3167 even = coeff;
3168 } else {
3169 if (odd)
3170 odd = lp_build_add(bld, coeff, lp_build_mul(bld, x2, odd));
3171 else
3172 odd = coeff;
3173 }
3174 }
3175
3176 if (odd)
3177 return lp_build_add(bld, lp_build_mul(bld, odd, x), even);
3178 else if (even)
3179 return even;
3180 else
3181 return bld->undef;
3182 }
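/*
 * For example, with four coefficients the loop above effectively computes
 *
 *    even = c[0] + x2 * c[2]
 *    odd  = c[1] + x2 * c[3]
 *    res  = even + x * odd
 *
 * so the even and odd Horner chains can issue in parallel instead of forming
 * one long dependency chain c[0] + x*(c[1] + x*(c[2] + x*c[3])).
 */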
3183
3184
3185 /**
3186 * Minimax polynomial fit of 2**x, in range [0, 1[
3187 */
3188 const double lp_build_exp2_polynomial[] = {
3189 #if EXP_POLY_DEGREE == 5
3190 1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
3191 0.693153073200168932794,
3192 0.240153617044375388211,
3193 0.0558263180532956664775,
3194 0.00898934009049466391101,
3195 0.00187757667519147912699
3196 #elif EXP_POLY_DEGREE == 4
3197 1.00000259337069434683,
3198 0.693003834469974940458,
3199 0.24144275689150793076,
3200 0.0520114606103070150235,
3201 0.0135341679161270268764
3202 #elif EXP_POLY_DEGREE == 3
3203 0.999925218562710312959,
3204 0.695833540494823811697,
3205 0.226067155427249155588,
3206 0.0780245226406372992967
3207 #elif EXP_POLY_DEGREE == 2
3208 1.00172476321474503578,
3209 0.657636275736077639316,
3210 0.33718943461968720704
3211 #else
3212 #error
3213 #endif
3214 };
3215
3216
3217 void
3218 lp_build_exp2_approx(struct lp_build_context *bld,
3219 LLVMValueRef x,
3220 LLVMValueRef *p_exp2_int_part,
3221 LLVMValueRef *p_frac_part,
3222 LLVMValueRef *p_exp2)
3223 {
3224 LLVMBuilderRef builder = bld->gallivm->builder;
3225 const struct lp_type type = bld->type;
3226 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3227 LLVMValueRef ipart = NULL;
3228 LLVMValueRef fpart = NULL;
3229 LLVMValueRef expipart = NULL;
3230 LLVMValueRef expfpart = NULL;
3231 LLVMValueRef res = NULL;
3232
3233 assert(lp_check_value(bld->type, x));
3234
3235 if(p_exp2_int_part || p_frac_part || p_exp2) {
3236 /* TODO: optimize the constant case */
3237 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3238 LLVMIsConstant(x)) {
3239 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3240 __FUNCTION__);
3241 }
3242
3243 assert(type.floating && type.width == 32);
3244
3245       /* We want to preserve NaN and make sure that for exp2, if x > 128
3246        * the result is INF, and if it's smaller than -126.9 the result is 0 */
3247 x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
3248 GALLIVM_NAN_RETURN_SECOND);
3249 x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999), x,
3250 GALLIVM_NAN_RETURN_SECOND);
3251
3252 /* ipart = floor(x) */
3253 /* fpart = x - ipart */
3254 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3255 }
3256
3257 if(p_exp2_int_part || p_exp2) {
3258 /* expipart = (float) (1 << ipart) */
3259 expipart = LLVMBuildAdd(builder, ipart,
3260 lp_build_const_int_vec(bld->gallivm, type, 127), "");
3261 expipart = LLVMBuildShl(builder, expipart,
3262 lp_build_const_int_vec(bld->gallivm, type, 23), "");
3263 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
3264 }
3265
3266 if(p_exp2) {
3267 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3268 Elements(lp_build_exp2_polynomial));
3269
3270 res = LLVMBuildFMul(builder, expipart, expfpart, "");
3271 }
3272
3273 if(p_exp2_int_part)
3274 *p_exp2_int_part = expipart;
3275
3276 if(p_frac_part)
3277 *p_frac_part = fpart;
3278
3279 if(p_exp2)
3280 *p_exp2 = res;
3281 }
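/*
 * Rough trace of the decomposition above for x = 3.5: ipart = 3, fpart = 0.5;
 * expipart builds the float 2^3 = 8.0 directly in the exponent field as
 * (3 + 127) << 23; the minimax polynomial evaluates 2^0.5 to about 1.41421,
 * and the final product gives exp2(3.5) ~= 11.3137.
 */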
3282
3283
3284 LLVMValueRef
3285 lp_build_exp2(struct lp_build_context *bld,
3286 LLVMValueRef x)
3287 {
3288 LLVMValueRef res;
3289 lp_build_exp2_approx(bld, x, NULL, NULL, &res);
3290 return res;
3291 }
3292
3293
3294 /**
3295  * Extract the exponent of an IEEE-754 floating point value.
3296 *
3297 * Optionally apply an integer bias.
3298 *
3299 * Result is an integer value with
3300 *
3301 * ifloor(log2(x)) + bias
3302 */
3303 LLVMValueRef
3304 lp_build_extract_exponent(struct lp_build_context *bld,
3305 LLVMValueRef x,
3306 int bias)
3307 {
3308 LLVMBuilderRef builder = bld->gallivm->builder;
3309 const struct lp_type type = bld->type;
3310 unsigned mantissa = lp_mantissa(type);
3311 LLVMValueRef res;
3312
3313 assert(type.floating);
3314
3315 assert(lp_check_value(bld->type, x));
3316
3317 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3318
3319 res = LLVMBuildLShr(builder, x,
3320 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3321 res = LLVMBuildAnd(builder, res,
3322 lp_build_const_int_vec(bld->gallivm, type, 255), "");
3323 res = LLVMBuildSub(builder, res,
3324 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3325
3326 return res;
3327 }
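/*
 * Example: for x = 6.0 the stored exponent field is 129, so with bias = 0 the
 * result is 129 - 127 = 2 == ifloor(log2(6.0)); lp_build_fast_log2() below
 * passes bias = -1 to get floor(log2(x)) - 1 in one step.
 */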
3328
3329
3330 /**
3331  * Extract the mantissa of a floating point value.
3332  *
3333  * Result is a floating point value with
3334  *
3335  *    x / 2**floor(log2(x))
3336 */
3337 LLVMValueRef
3338 lp_build_extract_mantissa(struct lp_build_context *bld,
3339 LLVMValueRef x)
3340 {
3341 LLVMBuilderRef builder = bld->gallivm->builder;
3342 const struct lp_type type = bld->type;
3343 unsigned mantissa = lp_mantissa(type);
3344 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3345 (1ULL << mantissa) - 1);
3346 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3347 LLVMValueRef res;
3348
3349 assert(lp_check_value(bld->type, x));
3350
3351 assert(type.floating);
3352
3353 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3354
3355 /* res = x / 2**ipart */
3356 res = LLVMBuildAnd(builder, x, mantmask, "");
3357 res = LLVMBuildOr(builder, res, one, "");
3358 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3359
3360 return res;
3361 }
3362
3363
3364
3365 /**
3366  * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in range of [0, 1/9[
3367  * These coefficients can be generated with
3368 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3369 */
3370 const double lp_build_log2_polynomial[] = {
3371 #if LOG_POLY_DEGREE == 5
3372 2.88539008148777786488L,
3373 0.961796878841293367824L,
3374 0.577058946784739859012L,
3375 0.412914355135828735411L,
3376 0.308591899232910175289L,
3377 0.352376952300281371868L,
3378 #elif LOG_POLY_DEGREE == 4
3379 2.88539009343309178325L,
3380 0.961791550404184197881L,
3381 0.577440339438736392009L,
3382 0.403343858251329912514L,
3383 0.406718052498846252698L,
3384 #elif LOG_POLY_DEGREE == 3
3385 2.88538959748872753838L,
3386 0.961932915889597772928L,
3387 0.571118517972136195241L,
3388 0.493997535084709500285L,
3389 #else
3390 #error
3391 #endif
3392 };
3393
3394 /**
3395 * See http://www.devmaster.net/forums/showthread.php?p=43580
3396 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3397 * http://www.nezumi.demon.co.uk/consult/logx.htm
3398 *
3399 * If handle_edge_cases is true the function will perform computations
3400 * to match the required D3D10+ behavior for each of the edge cases.
3401 * That means that if input is:
3402 * - less than zero (to and including -inf) then NaN will be returned
3403 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3404 * - +infinity, then +infinity will be returned
3405 * - NaN, then NaN will be returned
3406 *
3407 * Those checks are fairly expensive so if you don't need them make sure
3408 * handle_edge_cases is false.
3409 */
3410 void
3411 lp_build_log2_approx(struct lp_build_context *bld,
3412 LLVMValueRef x,
3413 LLVMValueRef *p_exp,
3414 LLVMValueRef *p_floor_log2,
3415 LLVMValueRef *p_log2,
3416 boolean handle_edge_cases)
3417 {
3418 LLVMBuilderRef builder = bld->gallivm->builder;
3419 const struct lp_type type = bld->type;
3420 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3421 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3422
3423 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3424 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3425 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3426
3427 LLVMValueRef i = NULL;
3428 LLVMValueRef y = NULL;
3429 LLVMValueRef z = NULL;
3430 LLVMValueRef exp = NULL;
3431 LLVMValueRef mant = NULL;
3432 LLVMValueRef logexp = NULL;
3433 LLVMValueRef logmant = NULL;
3434 LLVMValueRef res = NULL;
3435
3436 assert(lp_check_value(bld->type, x));
3437
3438 if(p_exp || p_floor_log2 || p_log2) {
3439 /* TODO: optimize the constant case */
3440 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3441 LLVMIsConstant(x)) {
3442 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3443 __FUNCTION__);
3444 }
3445
3446 assert(type.floating && type.width == 32);
3447
3448 /*
3449 * We don't explicitly handle denormalized numbers. They will yield a
3450        * result in the neighbourhood of -127, which appears to be
3451        * adequate.
3452 */
3453
3454 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3455
3456 /* exp = (float) exponent(x) */
3457 exp = LLVMBuildAnd(builder, i, expmask, "");
3458 }
3459
3460 if(p_floor_log2 || p_log2) {
3461 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3462 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3463 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3464 }
3465
3466 if(p_log2) {
3467 /* mant = 1 + (float) mantissa(x) */
3468 mant = LLVMBuildAnd(builder, i, mantmask, "");
3469 mant = LLVMBuildOr(builder, mant, one, "");
3470 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3471
3472 /* y = (mant - 1) / (mant + 1) */
3473 y = lp_build_div(bld,
3474 lp_build_sub(bld, mant, bld->one),
3475 lp_build_add(bld, mant, bld->one)
3476 );
3477
3478 /* z = y^2 */
3479 z = lp_build_mul(bld, y, y);
3480
3481 /* compute P(z) */
3482 logmant = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3483 Elements(lp_build_log2_polynomial));
3484
3485 /* logmant = y * P(z) */
3486 logmant = lp_build_mul(bld, y, logmant);
3487
3488 res = lp_build_add(bld, logmant, logexp);
3489
3490 if (type.floating && handle_edge_cases) {
3491 LLVMValueRef negmask, infmask, zmask;
3492 negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3493 lp_build_const_vec(bld->gallivm, type, 0.0f));
3494 zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3495 lp_build_const_vec(bld->gallivm, type, 0.0f));
3496 infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3497 lp_build_const_vec(bld->gallivm, type, INFINITY));
3498
3499          /* If x is equal to inf make sure we return inf */
3500 res = lp_build_select(bld, infmask,
3501 lp_build_const_vec(bld->gallivm, type, INFINITY),
3502 res);
3503          /* If x is equal to 0, return -inf */
3504 res = lp_build_select(bld, zmask,
3505 lp_build_const_vec(bld->gallivm, type, -INFINITY),
3506 res);
3507 /* If x is nan or less than 0, return nan */
3508 res = lp_build_select(bld, negmask,
3509 lp_build_const_vec(bld->gallivm, type, NAN),
3510 res);
3511 }
3512 }
3513
3514 if(p_exp) {
3515 exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3516 *p_exp = exp;
3517 }
3518
3519 if(p_floor_log2)
3520 *p_floor_log2 = logexp;
3521
3522 if(p_log2)
3523 *p_log2 = res;
3524 }
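/*
 * The mantissa path above uses the identity log2(m) = 2/ln(2) * atanh(y) with
 * y = (m - 1) / (m + 1), which converges well because m is confined to [1, 2)
 * and therefore |y| < 1/3 (hence the [0, 1/9[ fit range for z = y^2 in
 * lp_build_log2_polynomial). The final result is then simply
 * log2(x) = exponent + y * P(y^2).
 */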
3525
3526
3527 /*
3528 * log2 implementation which doesn't have special code to
3529 * handle edge cases (-inf, 0, inf, NaN). It's faster but
3530 * the results for those cases are undefined.
3531 */
3532 LLVMValueRef
3533 lp_build_log2(struct lp_build_context *bld,
3534 LLVMValueRef x)
3535 {
3536 LLVMValueRef res;
3537 lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
3538 return res;
3539 }
3540
3541 /*
3542 * Version of log2 which handles all edge cases.
3543 * Look at documentation of lp_build_log2_approx for
3544 * description of the behavior for each of the edge cases.
3545 */
3546 LLVMValueRef
3547 lp_build_log2_safe(struct lp_build_context *bld,
3548 LLVMValueRef x)
3549 {
3550 LLVMValueRef res;
3551 lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
3552 return res;
3553 }
3554
3555
3556 /**
3557 * Faster (and less accurate) log2.
3558 *
3559 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3560 *
3561 * Piece-wise linear approximation, with exact results when x is a
3562 * power of two.
3563 *
3564 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3565 */
3566 LLVMValueRef
3567 lp_build_fast_log2(struct lp_build_context *bld,
3568 LLVMValueRef x)
3569 {
3570 LLVMBuilderRef builder = bld->gallivm->builder;
3571 LLVMValueRef ipart;
3572 LLVMValueRef fpart;
3573
3574 assert(lp_check_value(bld->type, x));
3575
3576 assert(bld->type.floating);
3577
3578 /* ipart = floor(log2(x)) - 1 */
3579 ipart = lp_build_extract_exponent(bld, x, -1);
3580 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3581
3582 /* fpart = x / 2**ipart */
3583 fpart = lp_build_extract_mantissa(bld, x);
3584
3585 /* ipart + fpart */
3586 return LLVMBuildFAdd(builder, ipart, fpart, "");
3587 }
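/*
 * Worked example for the piece-wise linear approximation above: x = 8.0 gives
 * ipart = 3 - 1 = 2 and fpart = 1.0, so the result is exactly 3.0; x = 6.0
 * gives ipart = 1 and fpart = 1.5, i.e. 2.5 versus the true log2(6.0) = 2.585.
 */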
3588
3589
3590 /**
3591 * Fast implementation of iround(log2(x)).
3592 *
3593 * Not an approximation -- it should give accurate results all the time.
3594 */
3595 LLVMValueRef
3596 lp_build_ilog2(struct lp_build_context *bld,
3597 LLVMValueRef x)
3598 {
3599 LLVMBuilderRef builder = bld->gallivm->builder;
3600 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3601 LLVMValueRef ipart;
3602
3603 assert(bld->type.floating);
3604
3605 assert(lp_check_value(bld->type, x));
3606
3607 /* x * 2^(0.5) i.e., add 0.5 to the log2(x) */
3608 x = LLVMBuildFMul(builder, x, sqrt2, "");
3609
3610 /* ipart = floor(log2(x) + 0.5) */
3611 ipart = lp_build_extract_exponent(bld, x, 0);
3612
3613 return ipart;
3614 }
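/*
 * Multiplying by sqrt(2) shifts the exponent boundaries by half a power of
 * two, which turns the floor in the exponent extraction into a round:
 * e.g. x = 5.0 -> 5.0 * 1.4142 = 7.07, whose exponent is 2, matching
 * iround(log2(5.0)) = iround(2.32) = 2, while x = 6.0 -> 8.49 yields 3.
 */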
3615
3616 LLVMValueRef
3617 lp_build_mod(struct lp_build_context *bld,
3618 LLVMValueRef x,
3619 LLVMValueRef y)
3620 {
3621 LLVMBuilderRef builder = bld->gallivm->builder;
3622 LLVMValueRef res;
3623 const struct lp_type type = bld->type;
3624
3625 assert(lp_check_value(type, x));
3626 assert(lp_check_value(type, y));
3627
3628 if (type.floating)
3629 res = LLVMBuildFRem(builder, x, y, "");
3630 else if (type.sign)
3631 res = LLVMBuildSRem(builder, x, y, "");
3632 else
3633 res = LLVMBuildURem(builder, x, y, "");
3634 return res;
3635 }
3636
3637
3638 /*
3639 * For floating inputs it creates and returns a mask
3640 * which is all 1's for channels which are NaN.
3641 * Channels inside x which are not NaN will be 0.
3642 */
3643 LLVMValueRef
3644 lp_build_isnan(struct lp_build_context *bld,
3645 LLVMValueRef x)
3646 {
3647 LLVMValueRef mask;
3648 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3649
3650 assert(bld->type.floating);
3651 assert(lp_check_value(bld->type, x));
3652
3653 mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3654 "isnotnan");
3655 mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3656 mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3657 return mask;
3658 }
3659
3660 /* Returns all 1's for floating point numbers that are
3661  * finite and returns all zeros for -inf,
3662  * inf and NaNs */
3663 LLVMValueRef
3664 lp_build_isfinite(struct lp_build_context *bld,
3665 LLVMValueRef x)
3666 {
3667 LLVMBuilderRef builder = bld->gallivm->builder;
3668 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3669 struct lp_type int_type = lp_int_type(bld->type);
3670 LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3671 LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3672 0x7f800000);
3673
3674 if (!bld->type.floating) {
3675 return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3676 }
3677 assert(bld->type.floating);
3678 assert(lp_check_value(bld->type, x));
3679 assert(bld->type.width == 32);
3680
3681 intx = LLVMBuildAnd(builder, intx, infornan32, "");
3682 return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3683 intx, infornan32);
3684 }