gallivm: handle -inf, inf and nan's in sin/cos instructions
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expression simplification here. The reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in the [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include <float.h>
49
50 #include "util/u_memory.h"
51 #include "util/u_debug.h"
52 #include "util/u_math.h"
53 #include "util/u_string.h"
54 #include "util/u_cpu_detect.h"
55
56 #include "lp_bld_type.h"
57 #include "lp_bld_const.h"
58 #include "lp_bld_init.h"
59 #include "lp_bld_intr.h"
60 #include "lp_bld_logic.h"
61 #include "lp_bld_pack.h"
62 #include "lp_bld_debug.h"
63 #include "lp_bld_bitarit.h"
64 #include "lp_bld_arit.h"
65 #include "lp_bld_flow.h"
66
67
68 #define EXP_POLY_DEGREE 5
69
70 #define LOG_POLY_DEGREE 4
71
72
73 /**
74 * Generate min(a, b)
75 * No checks for special-case values of a or b (such as 0 or 1) are done.
76 * NaN's are handled according to the behavior specified by the
77 * nan_behavior argument.
78 */
79 static LLVMValueRef
80 lp_build_min_simple(struct lp_build_context *bld,
81 LLVMValueRef a,
82 LLVMValueRef b,
83 enum gallivm_nan_behavior nan_behavior)
84 {
85 const struct lp_type type = bld->type;
86 const char *intrinsic = NULL;
87 unsigned intr_size = 0;
88 LLVMValueRef cond;
89
90 assert(lp_check_value(type, a));
91 assert(lp_check_value(type, b));
92
93 /* TODO: optimize the constant case */
94
95 if (type.floating && util_cpu_caps.has_sse) {
96 if (type.width == 32) {
97 if (type.length == 1) {
98 intrinsic = "llvm.x86.sse.min.ss";
99 intr_size = 128;
100 }
101 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
102 intrinsic = "llvm.x86.sse.min.ps";
103 intr_size = 128;
104 }
105 else {
106 intrinsic = "llvm.x86.avx.min.ps.256";
107 intr_size = 256;
108 }
109 }
110 if (type.width == 64 && util_cpu_caps.has_sse2) {
111 if (type.length == 1) {
112 intrinsic = "llvm.x86.sse2.min.sd";
113 intr_size = 128;
114 }
115 else if (type.length == 2 || !util_cpu_caps.has_avx) {
116 intrinsic = "llvm.x86.sse2.min.pd";
117 intr_size = 128;
118 }
119 else {
120 intrinsic = "llvm.x86.avx.min.pd.256";
121 intr_size = 256;
122 }
123 }
124 }
125 else if (type.floating && util_cpu_caps.has_altivec) {
126 debug_printf("%s: altivec doesn't support nan behavior modes\n",
127 __FUNCTION__);
128 if (type.width == 32 && type.length == 4) {
129 intrinsic = "llvm.ppc.altivec.vminfp";
130 intr_size = 128;
131 }
132 } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
133 intr_size = 128;
134 if ((type.width == 8 || type.width == 16) &&
135 (type.width * type.length <= 64) &&
136 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
137 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
138 __FUNCTION__);
139 }
140 if (type.width == 8 && !type.sign) {
141 intrinsic = "llvm.x86.sse2.pminu.b";
142 }
143 else if (type.width == 16 && type.sign) {
144 intrinsic = "llvm.x86.sse2.pmins.w";
145 }
146 if (util_cpu_caps.has_sse4_1) {
147 if (type.width == 8 && type.sign) {
148 intrinsic = "llvm.x86.sse41.pminsb";
149 }
150 if (type.width == 16 && !type.sign) {
151 intrinsic = "llvm.x86.sse41.pminuw";
152 }
153 if (type.width == 32 && !type.sign) {
154 intrinsic = "llvm.x86.sse41.pminud";
155 }
156 if (type.width == 32 && type.sign) {
157 intrinsic = "llvm.x86.sse41.pminsd";
158 }
159 }
160 } else if (util_cpu_caps.has_altivec) {
161 intr_size = 128;
162 debug_printf("%s: altivec doesn't support nan behavior modes\n",
163 __FUNCTION__);
164 if (type.width == 8) {
165 if (!type.sign) {
166 intrinsic = "llvm.ppc.altivec.vminub";
167 } else {
168 intrinsic = "llvm.ppc.altivec.vminsb";
169 }
170 } else if (type.width == 16) {
171 if (!type.sign) {
172 intrinsic = "llvm.ppc.altivec.vminuh";
173 } else {
174 intrinsic = "llvm.ppc.altivec.vminsh";
175 }
176 } else if (type.width == 32) {
177 if (!type.sign) {
178 intrinsic = "llvm.ppc.altivec.vminuw";
179 } else {
180 intrinsic = "llvm.ppc.altivec.vminsw";
181 }
182 }
183 }
184
185 if(intrinsic) {
186 /* We need to handle nan's for floating point numbers. If one of the
187 * inputs is nan the other should be returned (required by both D3D10+
188 * and OpenCL).
189 * The sse intrinsics return the second operand in case of nan by
190 * default so we need special code to handle those.
191 */
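/* Concretely, the sse min intrinsics return their second operand when
 * either input is NaN: a NaN in 'a' already yields 'b' (right for
 * RETURN_OTHER), and a NaN in 'b' already yields NaN (right for
 * RETURN_NAN); the isnan/select below fixes up the remaining case.
 */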
192 if (util_cpu_caps.has_sse && type.floating &&
193 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
194 nan_behavior != GALLIVM_NAN_RETURN_SECOND) {
195 LLVMValueRef isnan, max;
196 max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
197 type,
198 intr_size, a, b);
199 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
200 isnan = lp_build_isnan(bld, b);
201 return lp_build_select(bld, isnan, a, max);
202 } else {
203 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
204 isnan = lp_build_isnan(bld, a);
205 return lp_build_select(bld, isnan, a, max);
206 }
207 } else {
208 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
209 type,
210 intr_size, a, b);
211 }
212 }
213
214 if (type.floating) {
215 switch (nan_behavior) {
216 case GALLIVM_NAN_RETURN_NAN: {
217 LLVMValueRef isnan = lp_build_isnan(bld, b);
218 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
219 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
220 return lp_build_select(bld, cond, a, b);
221 }
222 break;
223 case GALLIVM_NAN_RETURN_OTHER: {
224 LLVMValueRef isnan = lp_build_isnan(bld, a);
225 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
226 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
227 return lp_build_select(bld, cond, a, b);
228 }
229 break;
230 case GALLIVM_NAN_RETURN_SECOND:
231 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
232 return lp_build_select(bld, cond, a, b);
233 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
234 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
235 return lp_build_select(bld, cond, a, b);
236 break;
237 default:
238 assert(0);
239 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
240 return lp_build_select(bld, cond, a, b);
241 }
242 } else {
243 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
244 return lp_build_select(bld, cond, a, b);
245 }
246 }
247
248
249 /**
250 * Generate max(a, b)
251 * No checks for special-case values of a or b (such as 0 or 1) are done.
252 * NaN's are handled according to the behavior specified by the
253 * nan_behavior argument.
254 */
255 static LLVMValueRef
256 lp_build_max_simple(struct lp_build_context *bld,
257 LLVMValueRef a,
258 LLVMValueRef b,
259 enum gallivm_nan_behavior nan_behavior)
260 {
261 const struct lp_type type = bld->type;
262 const char *intrinsic = NULL;
263 unsigned intr_size = 0;
264 LLVMValueRef cond;
265
266 assert(lp_check_value(type, a));
267 assert(lp_check_value(type, b));
268
269 /* TODO: optimize the constant case */
270
271 if (type.floating && util_cpu_caps.has_sse) {
272 if (type.width == 32) {
273 if (type.length == 1) {
274 intrinsic = "llvm.x86.sse.max.ss";
275 intr_size = 128;
276 }
277 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
278 intrinsic = "llvm.x86.sse.max.ps";
279 intr_size = 128;
280 }
281 else {
282 intrinsic = "llvm.x86.avx.max.ps.256";
283 intr_size = 256;
284 }
285 }
286 if (type.width == 64 && util_cpu_caps.has_sse2) {
287 if (type.length == 1) {
288 intrinsic = "llvm.x86.sse2.max.sd";
289 intr_size = 128;
290 }
291 else if (type.length == 2 || !util_cpu_caps.has_avx) {
292 intrinsic = "llvm.x86.sse2.max.pd";
293 intr_size = 128;
294 }
295 else {
296 intrinsic = "llvm.x86.avx.max.pd.256";
297 intr_size = 256;
298 }
299 }
300 }
301 else if (type.floating && util_cpu_caps.has_altivec) {
302 debug_printf("%s: altivec doesn't support nan behavior modes\n",
303 __FUNCTION__);
304 if (type.width == 32 && type.length == 4) {
305 intrinsic = "llvm.ppc.altivec.vmaxfp";
306 intr_size = 128;
307 }
308 } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
309 intr_size = 128;
310 if ((type.width == 8 || type.width == 16) &&
311 (type.width * type.length <= 64) &&
312 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
313 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
314 __FUNCTION__);
315 }
316 if (type.width == 8 && !type.sign) {
317 intrinsic = "llvm.x86.sse2.pmaxu.b";
318 intr_size = 128;
319 }
320 else if (type.width == 16 && type.sign) {
321 intrinsic = "llvm.x86.sse2.pmaxs.w";
322 }
323 if (util_cpu_caps.has_sse4_1) {
324 if (type.width == 8 && type.sign) {
325 intrinsic = "llvm.x86.sse41.pmaxsb";
326 }
327 if (type.width == 16 && !type.sign) {
328 intrinsic = "llvm.x86.sse41.pmaxuw";
329 }
330 if (type.width == 32 && !type.sign) {
331 intrinsic = "llvm.x86.sse41.pmaxud";
332 }
333 if (type.width == 32 && type.sign) {
334 intrinsic = "llvm.x86.sse41.pmaxsd";
335 }
336 }
337 } else if (util_cpu_caps.has_altivec) {
338 intr_size = 128;
339 debug_printf("%s: altivec doesn't support nan behavior modes\n",
340 __FUNCTION__);
341 if (type.width == 8) {
342 if (!type.sign) {
343 intrinsic = "llvm.ppc.altivec.vmaxub";
344 } else {
345 intrinsic = "llvm.ppc.altivec.vmaxsb";
346 }
347 } else if (type.width == 16) {
348 if (!type.sign) {
349 intrinsic = "llvm.ppc.altivec.vmaxuh";
350 } else {
351 intrinsic = "llvm.ppc.altivec.vmaxsh";
352 }
353 } else if (type.width == 32) {
354 if (!type.sign) {
355 intrinsic = "llvm.ppc.altivec.vmaxuw";
356 } else {
357 intrinsic = "llvm.ppc.altivec.vmaxsw";
358 }
359 }
360 }
361
362 if(intrinsic) {
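/* As in lp_build_min_simple: the sse max intrinsics return their second
 * operand whenever either input is NaN, so only one case per nan_behavior
 * mode needs the extra isnan/select fixup below.
 */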
363 if (util_cpu_caps.has_sse && type.floating &&
364 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
365 nan_behavior != GALLIVM_NAN_RETURN_SECOND) {
366 LLVMValueRef isnan, min;
367 min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
368 type,
369 intr_size, a, b);
370 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
371 isnan = lp_build_isnan(bld, b);
372 return lp_build_select(bld, isnan, a, min);
373 } else {
374 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
375 isnan = lp_build_isnan(bld, a);
376 return lp_build_select(bld, isnan, a, min);
377 }
378 } else {
379 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
380 type,
381 intr_size, a, b);
382 }
383 }
384
385 if (type.floating) {
386 switch (nan_behavior) {
387 case GALLIVM_NAN_RETURN_NAN: {
388 LLVMValueRef isnan = lp_build_isnan(bld, b);
389 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
390 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
391 return lp_build_select(bld, cond, a, b);
392 }
393 break;
394 case GALLIVM_NAN_RETURN_OTHER: {
395 LLVMValueRef isnan = lp_build_isnan(bld, a);
396 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
397 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
398 return lp_build_select(bld, cond, a, b);
399 }
400 break;
401 case GALLIVM_NAN_RETURN_SECOND:
402 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
403 return lp_build_select(bld, cond, a, b);
404 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
405 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
406 return lp_build_select(bld, cond, a, b);
407 break;
408 default:
409 assert(0);
410 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
411 return lp_build_select(bld, cond, a, b);
412 }
413 } else {
414 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
415 return lp_build_select(bld, cond, a, b);
416 }
417 }
418
419
420 /**
421 * Generate 1 - a, or ~a depending on bld->type.
422 */
423 LLVMValueRef
424 lp_build_comp(struct lp_build_context *bld,
425 LLVMValueRef a)
426 {
427 LLVMBuilderRef builder = bld->gallivm->builder;
428 const struct lp_type type = bld->type;
429
430 assert(lp_check_value(type, a));
431
432 if(a == bld->one)
433 return bld->zero;
434 if(a == bld->zero)
435 return bld->one;
436
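/* For unsigned normalized values the bitwise complement is exactly
 * max - a, i.e. the same as 1.0 - a, so a single NOT suffices.
 */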
437 if(type.norm && !type.floating && !type.fixed && !type.sign) {
438 if(LLVMIsConstant(a))
439 return LLVMConstNot(a);
440 else
441 return LLVMBuildNot(builder, a, "");
442 }
443
444 if(LLVMIsConstant(a))
445 if (type.floating)
446 return LLVMConstFSub(bld->one, a);
447 else
448 return LLVMConstSub(bld->one, a);
449 else
450 if (type.floating)
451 return LLVMBuildFSub(builder, bld->one, a, "");
452 else
453 return LLVMBuildSub(builder, bld->one, a, "");
454 }
455
456
457 /**
458 * Generate a + b
459 */
460 LLVMValueRef
461 lp_build_add(struct lp_build_context *bld,
462 LLVMValueRef a,
463 LLVMValueRef b)
464 {
465 LLVMBuilderRef builder = bld->gallivm->builder;
466 const struct lp_type type = bld->type;
467 LLVMValueRef res;
468
469 assert(lp_check_value(type, a));
470 assert(lp_check_value(type, b));
471
472 if(a == bld->zero)
473 return b;
474 if(b == bld->zero)
475 return a;
476 if(a == bld->undef || b == bld->undef)
477 return bld->undef;
478
479 if(bld->type.norm) {
480 const char *intrinsic = NULL;
481
482 if(a == bld->one || b == bld->one)
483 return bld->one;
484
485 if (type.width * type.length == 128 &&
486 !type.floating && !type.fixed) {
487 if(util_cpu_caps.has_sse2) {
488 if(type.width == 8)
489 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
490 if(type.width == 16)
491 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
492 } else if (util_cpu_caps.has_altivec) {
493 if(type.width == 8)
494 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
495 if(type.width == 16)
496 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
497 }
498 }
499
500 if(intrinsic)
501 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
502 }
503
504 /* TODO: handle signed case */
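/* For unsigned normalized values, clamp a to comp(b) = 1.0 - b first,
 * so the plain add below cannot wrap and thus saturates at 1.0.
 */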
505 if(type.norm && !type.floating && !type.fixed && !type.sign)
506 a = lp_build_min_simple(bld, a, lp_build_comp(bld, b), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
507
508 if(LLVMIsConstant(a) && LLVMIsConstant(b))
509 if (type.floating)
510 res = LLVMConstFAdd(a, b);
511 else
512 res = LLVMConstAdd(a, b);
513 else
514 if (type.floating)
515 res = LLVMBuildFAdd(builder, a, b, "");
516 else
517 res = LLVMBuildAdd(builder, a, b, "");
518
519 /* clamp to ceiling of 1.0 */
520 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
521 res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
522
523 /* XXX clamp to floor of -1 or 0??? */
524
525 return res;
526 }
527
528
529 /** Return the scalar sum of the elements of a.
530 * Callers should avoid this operation whenever possible.
531 */
532 LLVMValueRef
533 lp_build_horizontal_add(struct lp_build_context *bld,
534 LLVMValueRef a)
535 {
536 LLVMBuilderRef builder = bld->gallivm->builder;
537 const struct lp_type type = bld->type;
538 LLVMValueRef index, res;
539 unsigned i, length;
540 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
541 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
542 LLVMValueRef vecres, elem2;
543
544 assert(lp_check_value(type, a));
545
546 if (type.length == 1) {
547 return a;
548 }
549
550 assert(!bld->type.norm);
551
552 /*
553 * For byte vectors we could do much better with psadbw.
554 * Using repeated shuffle/adds here. Note that with multiple vectors
555 * this can be done more efficiently as outlined in the intel
556 * optimization manual.
557 * Note: could cause data rearrangement if used with smaller element
558 * sizes.
559 */
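/* E.g. for a 4-wide vector the loop below produces <a0+a2, a1+a3> and the
 * final extract/add yields a0+a1+a2+a3.
 */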
560
561 vecres = a;
562 length = type.length / 2;
563 while (length > 1) {
564 LLVMValueRef vec1, vec2;
565 for (i = 0; i < length; i++) {
566 shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
567 shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
568 }
569 vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
570 LLVMConstVector(shuffles1, length), "");
571 vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
572 LLVMConstVector(shuffles2, length), "");
573 if (type.floating) {
574 vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
575 }
576 else {
577 vecres = LLVMBuildAdd(builder, vec1, vec2, "");
578 }
579 length = length >> 1;
580 }
581
582 /* always have vector of size 2 here */
583 assert(length == 1);
584
585 index = lp_build_const_int32(bld->gallivm, 0);
586 res = LLVMBuildExtractElement(builder, vecres, index, "");
587 index = lp_build_const_int32(bld->gallivm, 1);
588 elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
589
590 if (type.floating)
591 res = LLVMBuildFAdd(builder, res, elem2, "");
592 else
593 res = LLVMBuildAdd(builder, res, elem2, "");
594
595 return res;
596 }
597
598 /**
599 * Return the horizontal sums of 4 float vectors as a float4 vector.
600 * This uses the technique as outlined in Intel Optimization Manual.
601 */
602 static LLVMValueRef
603 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
604 LLVMValueRef src[4])
605 {
606 struct gallivm_state *gallivm = bld->gallivm;
607 LLVMBuilderRef builder = gallivm->builder;
608 LLVMValueRef shuffles[4];
609 LLVMValueRef tmp[4];
610 LLVMValueRef sumtmp[2], shuftmp[2];
611
612 /* lower half of regs */
613 shuffles[0] = lp_build_const_int32(gallivm, 0);
614 shuffles[1] = lp_build_const_int32(gallivm, 1);
615 shuffles[2] = lp_build_const_int32(gallivm, 4);
616 shuffles[3] = lp_build_const_int32(gallivm, 5);
617 tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
618 LLVMConstVector(shuffles, 4), "");
619 tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
620 LLVMConstVector(shuffles, 4), "");
621
622 /* upper half of regs */
623 shuffles[0] = lp_build_const_int32(gallivm, 2);
624 shuffles[1] = lp_build_const_int32(gallivm, 3);
625 shuffles[2] = lp_build_const_int32(gallivm, 6);
626 shuffles[3] = lp_build_const_int32(gallivm, 7);
627 tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
628 LLVMConstVector(shuffles, 4), "");
629 tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
630 LLVMConstVector(shuffles, 4), "");
631
632 sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
633 sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
634
635 shuffles[0] = lp_build_const_int32(gallivm, 0);
636 shuffles[1] = lp_build_const_int32(gallivm, 2);
637 shuffles[2] = lp_build_const_int32(gallivm, 4);
638 shuffles[3] = lp_build_const_int32(gallivm, 6);
639 shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
640 LLVMConstVector(shuffles, 4), "");
641
642 shuffles[0] = lp_build_const_int32(gallivm, 1);
643 shuffles[1] = lp_build_const_int32(gallivm, 3);
644 shuffles[2] = lp_build_const_int32(gallivm, 5);
645 shuffles[3] = lp_build_const_int32(gallivm, 7);
646 shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
647 LLVMConstVector(shuffles, 4), "");
648
649 return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
650 }
651
652
653 /*
654 * Partially horizontally add 2-4 float vectors with length nx4,
655 * i.e. only four adjacent values in each vector will be added,
656 * assuming the values are really grouped in fours, which also determines
657 * the output order.
658 *
659 * Return a vector of the same length as the initial vectors,
660 * with the excess elements (if any) being undefined.
661 * The element order is independent of number of input vectors.
662 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
663 * the output order thus will be
664 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
665 */
666 LLVMValueRef
667 lp_build_hadd_partial4(struct lp_build_context *bld,
668 LLVMValueRef vectors[],
669 unsigned num_vecs)
670 {
671 struct gallivm_state *gallivm = bld->gallivm;
672 LLVMBuilderRef builder = gallivm->builder;
673 LLVMValueRef ret_vec;
674 LLVMValueRef tmp[4];
675 const char *intrinsic = NULL;
676
677 assert(num_vecs >= 2 && num_vecs <= 4);
678 assert(bld->type.floating);
679
680 /* only use this with at least 2 vectors, as it is sort of expensive
681 * (depending on cpu) and we always need two horizontal adds anyway,
682 * so a shuffle/add approach might be better.
683 */
684
685 tmp[0] = vectors[0];
686 tmp[1] = vectors[1];
687
688 tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
689 tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
690
691 if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
692 bld->type.length == 4) {
693 intrinsic = "llvm.x86.sse3.hadd.ps";
694 }
695 else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
696 bld->type.length == 8) {
697 intrinsic = "llvm.x86.avx.hadd.ps.256";
698 }
699 if (intrinsic) {
700 tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
701 lp_build_vec_type(gallivm, bld->type),
702 tmp[0], tmp[1]);
703 if (num_vecs > 2) {
704 tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
705 lp_build_vec_type(gallivm, bld->type),
706 tmp[2], tmp[3]);
707 }
708 else {
709 tmp[1] = tmp[0];
710 }
711 return lp_build_intrinsic_binary(builder, intrinsic,
712 lp_build_vec_type(gallivm, bld->type),
713 tmp[0], tmp[1]);
714 }
715
716 if (bld->type.length == 4) {
717 ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
718 }
719 else {
720 LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
721 unsigned j;
722 unsigned num_iter = bld->type.length / 4;
723 struct lp_type parttype = bld->type;
724 parttype.length = 4;
725 for (j = 0; j < num_iter; j++) {
726 LLVMValueRef partsrc[4];
727 unsigned i;
728 for (i = 0; i < 4; i++) {
729 partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
730 }
731 partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
732 }
733 ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
734 }
735 return ret_vec;
736 }
737
738 /**
739 * Generate a - b
740 */
741 LLVMValueRef
742 lp_build_sub(struct lp_build_context *bld,
743 LLVMValueRef a,
744 LLVMValueRef b)
745 {
746 LLVMBuilderRef builder = bld->gallivm->builder;
747 const struct lp_type type = bld->type;
748 LLVMValueRef res;
749
750 assert(lp_check_value(type, a));
751 assert(lp_check_value(type, b));
752
753 if(b == bld->zero)
754 return a;
755 if(a == bld->undef || b == bld->undef)
756 return bld->undef;
757 if(a == b)
758 return bld->zero;
759
760 if(bld->type.norm) {
761 const char *intrinsic = NULL;
762
763 if(b == bld->one)
764 return bld->zero;
765
766 if (type.width * type.length == 128 &&
767 !type.floating && !type.fixed) {
768 if (util_cpu_caps.has_sse2) {
769 if(type.width == 8)
770 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
771 if(type.width == 16)
772 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
773 } else if (util_cpu_caps.has_altivec) {
774 if(type.width == 8)
775 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
776 if(type.width == 16)
777 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
778 }
779 }
780
781 if(intrinsic)
782 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
783 }
784
785 /* TODO: handle signed case */
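/* For unsigned normalized values, clamp a to at least b first, so the
 * plain subtract below cannot wrap and thus saturates at 0.
 */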
786 if(type.norm && !type.floating && !type.fixed && !type.sign)
787 a = lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
788
789 if(LLVMIsConstant(a) && LLVMIsConstant(b))
790 if (type.floating)
791 res = LLVMConstFSub(a, b);
792 else
793 res = LLVMConstSub(a, b);
794 else
795 if (type.floating)
796 res = LLVMBuildFSub(builder, a, b, "");
797 else
798 res = LLVMBuildSub(builder, a, b, "");
799
800 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
801 res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
802
803 return res;
804 }
805
806
807
808 /**
809 * Normalized multiplication.
810 *
811 * There are several approaches for (using 8-bit normalized multiplication as
812 * an example):
813 *
814 * - alpha plus one
815 *
816 * makes the following approximation to the division (Sree)
817 *
818 * a*b/255 ~= (a*(b + 1)) >> 8
819 *
820 * which is the fastest method that satisfies the following OpenGL criteria of
821 *
822 * 0*0 = 0 and 255*255 = 255
823 *
824 * - geometric series
825 *
826 * takes the geometric series approximation to the division
827 *
828 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
829 *
830 * in this case just the first two terms are used, to fit in 16-bit arithmetic
831 *
832 * t/255 ~= (t + (t >> 8)) >> 8
833 *
834 * note that just by itself it doesn't satisfy the OpenGL criteria, as
835 * 255*255 = 254, so the special case b = 255 must be accounted for, or
836 * roundoff must be used.
837 *
838 * - geometric series plus rounding
839 *
840 * when using the geometric series division, instead of truncating the result,
841 * use roundoff in the approximation (Jim Blinn)
842 *
843 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
844 *
845 * achieving the exact results.
846 *
847 *
848 *
849 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
850 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
851 * @sa Michael Herf, The "double blend trick", May 2000,
852 * http://www.stereopsis.com/doubleblend.html
853 */
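/*
 * Worked example of the rounded formula with n = 8: a = b = 255 gives
 * t = 65025, (t + (t >> 8) + 0x80) >> 8 = 255, and a = b = 0 gives 0, so
 * both OpenGL anchor cases above hold exactly.
 */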
854 static LLVMValueRef
855 lp_build_mul_norm(struct gallivm_state *gallivm,
856 struct lp_type wide_type,
857 LLVMValueRef a, LLVMValueRef b)
858 {
859 LLVMBuilderRef builder = gallivm->builder;
860 struct lp_build_context bld;
861 unsigned n;
862 LLVMValueRef half;
863 LLVMValueRef ab;
864
865 assert(!wide_type.floating);
866 assert(lp_check_value(wide_type, a));
867 assert(lp_check_value(wide_type, b));
868
869 lp_build_context_init(&bld, gallivm, wide_type);
870
871 n = wide_type.width / 2;
872 if (wide_type.sign) {
873 --n;
874 }
875
876 /*
877 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
878 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
879 */
880
881 /*
882 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
883 */
884
885 ab = LLVMBuildMul(builder, a, b, "");
886 ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
887
888 /*
889 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
890 */
891
892 half = lp_build_const_int_vec(gallivm, wide_type, 1 << (n - 1));
893 if (wide_type.sign) {
894 LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
895 LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
896 half = lp_build_select(&bld, sign, minus_half, half);
897 }
898 ab = LLVMBuildAdd(builder, ab, half, "");
899
900 /* Final division */
901 ab = lp_build_shr_imm(&bld, ab, n);
902
903 return ab;
904 }
905
906 /**
907 * Generate a * b
908 */
909 LLVMValueRef
910 lp_build_mul(struct lp_build_context *bld,
911 LLVMValueRef a,
912 LLVMValueRef b)
913 {
914 LLVMBuilderRef builder = bld->gallivm->builder;
915 const struct lp_type type = bld->type;
916 LLVMValueRef shift;
917 LLVMValueRef res;
918
919 assert(lp_check_value(type, a));
920 assert(lp_check_value(type, b));
921
922 if(a == bld->zero)
923 return bld->zero;
924 if(a == bld->one)
925 return b;
926 if(b == bld->zero)
927 return bld->zero;
928 if(b == bld->one)
929 return a;
930 if(a == bld->undef || b == bld->undef)
931 return bld->undef;
932
933 if (!type.floating && !type.fixed && type.norm) {
934 struct lp_type wide_type = lp_wider_type(type);
935 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
936
937 lp_build_unpack2(bld->gallivm, type, wide_type, a, &al, &ah);
938 lp_build_unpack2(bld->gallivm, type, wide_type, b, &bl, &bh);
939
940 /* PMULLW, PSRLW, PADDW */
941 abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
942 abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
943
944 ab = lp_build_pack2(bld->gallivm, wide_type, type, abl, abh);
945
946 return ab;
947 }
948
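/* For fixed point the raw product carries twice as many fraction bits,
 * so shift right by width/2 afterwards to restore the format.
 */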
949 if(type.fixed)
950 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
951 else
952 shift = NULL;
953
954 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
955 if (type.floating)
956 res = LLVMConstFMul(a, b);
957 else
958 res = LLVMConstMul(a, b);
959 if(shift) {
960 if(type.sign)
961 res = LLVMConstAShr(res, shift);
962 else
963 res = LLVMConstLShr(res, shift);
964 }
965 }
966 else {
967 if (type.floating)
968 res = LLVMBuildFMul(builder, a, b, "");
969 else
970 res = LLVMBuildMul(builder, a, b, "");
971 if(shift) {
972 if(type.sign)
973 res = LLVMBuildAShr(builder, res, shift, "");
974 else
975 res = LLVMBuildLShr(builder, res, shift, "");
976 }
977 }
978
979 return res;
980 }
981
982
983 /**
984 * Small vector x scale multiplication optimization.
985 */
986 LLVMValueRef
987 lp_build_mul_imm(struct lp_build_context *bld,
988 LLVMValueRef a,
989 int b)
990 {
991 LLVMBuilderRef builder = bld->gallivm->builder;
992 LLVMValueRef factor;
993
994 assert(lp_check_value(bld->type, a));
995
996 if(b == 0)
997 return bld->zero;
998
999 if(b == 1)
1000 return a;
1001
1002 if(b == -1)
1003 return lp_build_negate(bld, a);
1004
1005 if(b == 2 && bld->type.floating)
1006 return lp_build_add(bld, a, a);
1007
1008 if(util_is_power_of_two(b)) {
1009 unsigned shift = ffs(b) - 1;
1010
1011 if(bld->type.floating) {
1012 #if 0
1013 /*
1014 * Power of two multiplication by directly manipulating the exponent.
1015 *
1016 * XXX: This might not be always faster, it will introduce a small error
1017 * for multiplication by zero, and it will produce wrong results
1018 * for Inf and NaN.
1019 */
1020 unsigned mantissa = lp_mantissa(bld->type);
1021 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1022 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1023 a = LLVMBuildAdd(builder, a, factor, "");
1024 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1025 return a;
1026 #endif
1027 }
1028 else {
1029 factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1030 return LLVMBuildShl(builder, a, factor, "");
1031 }
1032 }
1033
1034 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1035 return lp_build_mul(bld, a, factor);
1036 }
1037
1038
1039 /**
1040 * Generate a / b
1041 */
1042 LLVMValueRef
1043 lp_build_div(struct lp_build_context *bld,
1044 LLVMValueRef a,
1045 LLVMValueRef b)
1046 {
1047 LLVMBuilderRef builder = bld->gallivm->builder;
1048 const struct lp_type type = bld->type;
1049
1050 assert(lp_check_value(type, a));
1051 assert(lp_check_value(type, b));
1052
1053 if(a == bld->zero)
1054 return bld->zero;
1055 if(a == bld->one)
1056 return lp_build_rcp(bld, b);
1057 if(b == bld->zero)
1058 return bld->undef;
1059 if(b == bld->one)
1060 return a;
1061 if(a == bld->undef || b == bld->undef)
1062 return bld->undef;
1063
1064 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1065 if (type.floating)
1066 return LLVMConstFDiv(a, b);
1067 else if (type.sign)
1068 return LLVMConstSDiv(a, b);
1069 else
1070 return LLVMConstUDiv(a, b);
1071 }
1072
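/* On SSE/AVX, multiplying by the reciprocal is considerably faster than a
 * true divide, at a small cost in precision.
 */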
1073 if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1074 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
1075 type.floating)
1076 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1077
1078 if (type.floating)
1079 return LLVMBuildFDiv(builder, a, b, "");
1080 else if (type.sign)
1081 return LLVMBuildSDiv(builder, a, b, "");
1082 else
1083 return LLVMBuildUDiv(builder, a, b, "");
1084 }
1085
1086
1087 /**
1088 * Linear interpolation helper.
1089 *
1090 * @param flags LP_BLD_LERP_WIDE_NORMALIZED if we are interpolating normalized
1091 * values, encoded in integers twice as wide.
1092 *
1093 * @sa http://www.stereopsis.com/doubleblend.html
1094 */
1095 static INLINE LLVMValueRef
1096 lp_build_lerp_simple(struct lp_build_context *bld,
1097 LLVMValueRef x,
1098 LLVMValueRef v0,
1099 LLVMValueRef v1,
1100 unsigned flags)
1101 {
1102 unsigned half_width = bld->type.width/2;
1103 LLVMBuilderRef builder = bld->gallivm->builder;
1104 LLVMValueRef delta;
1105 LLVMValueRef res;
1106
1107 assert(lp_check_value(bld->type, x));
1108 assert(lp_check_value(bld->type, v0));
1109 assert(lp_check_value(bld->type, v1));
1110
1111 delta = lp_build_sub(bld, v1, v0);
1112
1113 if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1114 if (!bld->type.sign) {
1115 if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1116 /*
1117 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1118 * most-significant bit to the least-significant bit, so that
1119 * later we can just divide by 2**n instead of 2**n - 1.
1120 */
1121
1122 x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
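/* E.g. with n = 8 a weight of 255 becomes 255 + (255 >> 7) = 256,
 * so the shift by 8 below maps maximum weight to exactly delta.
 */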
1123 }
1124
1125 /* (x * delta) >> n */
1126 res = lp_build_mul(bld, x, delta);
1127 res = lp_build_shr_imm(bld, res, half_width);
1128 } else {
1129 /*
1130 * The rescaling trick above doesn't work for signed numbers, so
1131 * use the 2**n - 1 division approximation in lp_build_mul_norm
1132 * instead.
1133 */
1134 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1135 res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1136 }
1137 } else {
1138 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1139 res = lp_build_mul(bld, x, delta);
1140 }
1141
1142 res = lp_build_add(bld, v0, res);
1143
1144 if (((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) ||
1145 bld->type.fixed) {
1146 /* We need to mask out the high order bits when lerping 8-bit normalized colors stored in 16 bits */
1147 /* XXX: This step is necessary for lerping 8-bit colors stored in 16 bits,
1148 * but it will be wrong for true fixed point use cases. Basically we need
1149 * a more powerful lp_type, capable of further distinguishing the value
1150 * interpretation from the value storage. */
1151 res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1), "");
1152 }
1153
1154 return res;
1155 }
1156
1157
1158 /**
1159 * Linear interpolation.
1160 */
1161 LLVMValueRef
1162 lp_build_lerp(struct lp_build_context *bld,
1163 LLVMValueRef x,
1164 LLVMValueRef v0,
1165 LLVMValueRef v1,
1166 unsigned flags)
1167 {
1168 const struct lp_type type = bld->type;
1169 LLVMValueRef res;
1170
1171 assert(lp_check_value(type, x));
1172 assert(lp_check_value(type, v0));
1173 assert(lp_check_value(type, v1));
1174
1175 assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1176
1177 if (type.norm) {
1178 struct lp_type wide_type;
1179 struct lp_build_context wide_bld;
1180 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1181
1182 assert(type.length >= 2);
1183
1184 /*
1185 * Create a wider integer type, enough to hold the
1186 * intermediate result of the multiplication.
1187 */
1188 memset(&wide_type, 0, sizeof wide_type);
1189 wide_type.sign = type.sign;
1190 wide_type.width = type.width*2;
1191 wide_type.length = type.length/2;
1192
1193 lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1194
1195 lp_build_unpack2(bld->gallivm, type, wide_type, x, &xl, &xh);
1196 lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1197 lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1198
1199 /*
1200 * Lerp both halves.
1201 */
1202
1203 flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1204
1205 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1206 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1207
1208 res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
1209 } else {
1210 res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1211 }
1212
1213 return res;
1214 }
1215
1216
1217 /**
1218 * Bilinear interpolation.
1219 *
1220 * Value indices are in v_{yx}.
1221 */
1222 LLVMValueRef
1223 lp_build_lerp_2d(struct lp_build_context *bld,
1224 LLVMValueRef x,
1225 LLVMValueRef y,
1226 LLVMValueRef v00,
1227 LLVMValueRef v01,
1228 LLVMValueRef v10,
1229 LLVMValueRef v11,
1230 unsigned flags)
1231 {
1232 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1233 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1234 return lp_build_lerp(bld, y, v0, v1, flags);
1235 }
1236
1237
1238 LLVMValueRef
1239 lp_build_lerp_3d(struct lp_build_context *bld,
1240 LLVMValueRef x,
1241 LLVMValueRef y,
1242 LLVMValueRef z,
1243 LLVMValueRef v000,
1244 LLVMValueRef v001,
1245 LLVMValueRef v010,
1246 LLVMValueRef v011,
1247 LLVMValueRef v100,
1248 LLVMValueRef v101,
1249 LLVMValueRef v110,
1250 LLVMValueRef v111,
1251 unsigned flags)
1252 {
1253 LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1254 LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1255 return lp_build_lerp(bld, z, v0, v1, flags);
1256 }
1257
1258
1259 /**
1260 * Generate min(a, b)
1261 * Do checks for special cases, but NaN behavior is undefined.
1262 */
1263 LLVMValueRef
1264 lp_build_min(struct lp_build_context *bld,
1265 LLVMValueRef a,
1266 LLVMValueRef b)
1267 {
1268 assert(lp_check_value(bld->type, a));
1269 assert(lp_check_value(bld->type, b));
1270
1271 if(a == bld->undef || b == bld->undef)
1272 return bld->undef;
1273
1274 if(a == b)
1275 return a;
1276
1277 if (bld->type.norm) {
1278 if (!bld->type.sign) {
1279 if (a == bld->zero || b == bld->zero) {
1280 return bld->zero;
1281 }
1282 }
1283 if(a == bld->one)
1284 return b;
1285 if(b == bld->one)
1286 return a;
1287 }
1288
1289 return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1290 }
1291
1292
1293 /**
1294 * Generate min(a, b)
1295 * NaN's are handled according to the behavior specified by the
1296 * nan_behavior argument.
1297 */
1298 LLVMValueRef
1299 lp_build_min_ext(struct lp_build_context *bld,
1300 LLVMValueRef a,
1301 LLVMValueRef b,
1302 enum gallivm_nan_behavior nan_behavior)
1303 {
1304 assert(lp_check_value(bld->type, a));
1305 assert(lp_check_value(bld->type, b));
1306
1307 if(a == bld->undef || b == bld->undef)
1308 return bld->undef;
1309
1310 if(a == b)
1311 return a;
1312
1313 if (bld->type.norm) {
1314 if (!bld->type.sign) {
1315 if (a == bld->zero || b == bld->zero) {
1316 return bld->zero;
1317 }
1318 }
1319 if(a == bld->one)
1320 return b;
1321 if(b == bld->one)
1322 return a;
1323 }
1324
1325 return lp_build_min_simple(bld, a, b, nan_behavior);
1326 }
1327
1328 /**
1329 * Generate max(a, b)
1330 * Do checks for special cases, but NaN behavior is undefined.
1331 */
1332 LLVMValueRef
1333 lp_build_max(struct lp_build_context *bld,
1334 LLVMValueRef a,
1335 LLVMValueRef b)
1336 {
1337 assert(lp_check_value(bld->type, a));
1338 assert(lp_check_value(bld->type, b));
1339
1340 if(a == bld->undef || b == bld->undef)
1341 return bld->undef;
1342
1343 if(a == b)
1344 return a;
1345
1346 if(bld->type.norm) {
1347 if(a == bld->one || b == bld->one)
1348 return bld->one;
1349 if (!bld->type.sign) {
1350 if (a == bld->zero) {
1351 return b;
1352 }
1353 if (b == bld->zero) {
1354 return a;
1355 }
1356 }
1357 }
1358
1359 return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1360 }
1361
1362
1363 /**
1364 * Generate max(a, b)
1365 * Checks for special cases.
1366 * NaN's are handled according to the behavior specified by the
1367 * nan_behavior argument.
1368 */
1369 LLVMValueRef
1370 lp_build_max_ext(struct lp_build_context *bld,
1371 LLVMValueRef a,
1372 LLVMValueRef b,
1373 enum gallivm_nan_behavior nan_behavior)
1374 {
1375 assert(lp_check_value(bld->type, a));
1376 assert(lp_check_value(bld->type, b));
1377
1378 if(a == bld->undef || b == bld->undef)
1379 return bld->undef;
1380
1381 if(a == b)
1382 return a;
1383
1384 if(bld->type.norm) {
1385 if(a == bld->one || b == bld->one)
1386 return bld->one;
1387 if (!bld->type.sign) {
1388 if (a == bld->zero) {
1389 return b;
1390 }
1391 if (b == bld->zero) {
1392 return a;
1393 }
1394 }
1395 }
1396
1397 return lp_build_max_simple(bld, a, b, nan_behavior);
1398 }
1399
1400 /**
1401 * Generate clamp(a, min, max)
1402 * Do checks for special cases.
1403 */
1404 LLVMValueRef
1405 lp_build_clamp(struct lp_build_context *bld,
1406 LLVMValueRef a,
1407 LLVMValueRef min,
1408 LLVMValueRef max)
1409 {
1410 assert(lp_check_value(bld->type, a));
1411 assert(lp_check_value(bld->type, min));
1412 assert(lp_check_value(bld->type, max));
1413
1414 a = lp_build_min(bld, a, max);
1415 a = lp_build_max(bld, a, min);
1416 return a;
1417 }
1418
1419
1420 /**
1421 * Generate abs(a)
1422 */
1423 LLVMValueRef
1424 lp_build_abs(struct lp_build_context *bld,
1425 LLVMValueRef a)
1426 {
1427 LLVMBuilderRef builder = bld->gallivm->builder;
1428 const struct lp_type type = bld->type;
1429 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1430
1431 assert(lp_check_value(type, a));
1432
1433 if(!type.sign)
1434 return a;
1435
1436 if(type.floating) {
1437 /* Mask out the sign bit */
1438 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1439 unsigned long long absMask = ~(1ULL << (type.width - 1));
1440 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
1441 a = LLVMBuildBitCast(builder, a, int_vec_type, "");
1442 a = LLVMBuildAnd(builder, a, mask, "");
1443 a = LLVMBuildBitCast(builder, a, vec_type, "");
1444 return a;
1445 }
1446
1447 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
1448 switch(type.width) {
1449 case 8:
1450 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1451 case 16:
1452 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1453 case 32:
1454 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1455 }
1456 }
1457 else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
1458 (gallivm_debug & GALLIVM_DEBUG_PERF) &&
1459 (type.width == 8 || type.width == 16 || type.width == 32)) {
1460 debug_printf("%s: inefficient code, should split vectors manually\n",
1461 __FUNCTION__);
1462 }
1463
1464 return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
1465 }
1466
1467
1468 LLVMValueRef
1469 lp_build_negate(struct lp_build_context *bld,
1470 LLVMValueRef a)
1471 {
1472 LLVMBuilderRef builder = bld->gallivm->builder;
1473
1474 assert(lp_check_value(bld->type, a));
1475
1476 #if HAVE_LLVM >= 0x0207
1477 if (bld->type.floating)
1478 a = LLVMBuildFNeg(builder, a, "");
1479 else
1480 #endif
1481 a = LLVMBuildNeg(builder, a, "");
1482
1483 return a;
1484 }
1485
1486
1487 /** Return -1, 0 or +1 depending on the sign of a */
1488 LLVMValueRef
1489 lp_build_sgn(struct lp_build_context *bld,
1490 LLVMValueRef a)
1491 {
1492 LLVMBuilderRef builder = bld->gallivm->builder;
1493 const struct lp_type type = bld->type;
1494 LLVMValueRef cond;
1495 LLVMValueRef res;
1496
1497 assert(lp_check_value(type, a));
1498
1499 /* Handle non-zero case */
1500 if(!type.sign) {
1501 /* if not zero then sign must be positive */
1502 res = bld->one;
1503 }
1504 else if(type.floating) {
1505 LLVMTypeRef vec_type;
1506 LLVMTypeRef int_type;
1507 LLVMValueRef mask;
1508 LLVMValueRef sign;
1509 LLVMValueRef one;
1510 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1511
1512 int_type = lp_build_int_vec_type(bld->gallivm, type);
1513 vec_type = lp_build_vec_type(bld->gallivm, type);
1514 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1515
1516 /* Take the sign bit and or it into the constant 1.0 */
1517 sign = LLVMBuildBitCast(builder, a, int_type, "");
1518 sign = LLVMBuildAnd(builder, sign, mask, "");
1519 one = LLVMConstBitCast(bld->one, int_type);
1520 res = LLVMBuildOr(builder, sign, one, "");
1521 res = LLVMBuildBitCast(builder, res, vec_type, "");
1522 }
1523 else
1524 {
1525 /* signed int/norm/fixed point */
1526 /* could use psign with sse3 and appropriate vectors here */
1527 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1528 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1529 res = lp_build_select(bld, cond, bld->one, minus_one);
1530 }
1531
1532 /* Handle zero */
1533 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1534 res = lp_build_select(bld, cond, bld->zero, res);
1535
1536 return res;
1537 }
1538
1539
1540 /**
1541 * Set the sign of float vector 'a' according to 'sign'.
1542 * If sign==0, return abs(a).
1543 * If sign==1, return -abs(a);
1544 * Other values for sign produce undefined results.
1545 */
1546 LLVMValueRef
1547 lp_build_set_sign(struct lp_build_context *bld,
1548 LLVMValueRef a, LLVMValueRef sign)
1549 {
1550 LLVMBuilderRef builder = bld->gallivm->builder;
1551 const struct lp_type type = bld->type;
1552 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1553 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1554 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1555 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1556 ~((unsigned long long) 1 << (type.width - 1)));
1557 LLVMValueRef val, res;
1558
1559 assert(type.floating);
1560 assert(lp_check_value(type, a));
1561
1562 /* val = reinterpret_cast<int>(a) */
1563 val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1564 /* val = val & mask */
1565 val = LLVMBuildAnd(builder, val, mask, "");
1566 /* sign = sign << shift */
1567 sign = LLVMBuildShl(builder, sign, shift, "");
1568 /* res = val | sign */
1569 res = LLVMBuildOr(builder, val, sign, "");
1570 /* res = reinterpret_cast<float>(res) */
1571 res = LLVMBuildBitCast(builder, res, vec_type, "");
1572
1573 return res;
1574 }
1575
1576
1577 /**
1578 * Convert vector of (or scalar) int to vector of (or scalar) float.
1579 */
1580 LLVMValueRef
1581 lp_build_int_to_float(struct lp_build_context *bld,
1582 LLVMValueRef a)
1583 {
1584 LLVMBuilderRef builder = bld->gallivm->builder;
1585 const struct lp_type type = bld->type;
1586 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1587
1588 assert(type.floating);
1589
1590 return LLVMBuildSIToFP(builder, a, vec_type, "");
1591 }
1592
1593 static boolean
1594 arch_rounding_available(const struct lp_type type)
1595 {
1596 if ((util_cpu_caps.has_sse4_1 &&
1597 (type.length == 1 || type.width*type.length == 128)) ||
1598 (util_cpu_caps.has_avx && type.width*type.length == 256))
1599 return TRUE;
1600 else if ((util_cpu_caps.has_altivec &&
1601 (type.width == 32 && type.length == 4)))
1602 return TRUE;
1603
1604 return FALSE;
1605 }
1606
1607 enum lp_build_round_mode
1608 {
1609 LP_BUILD_ROUND_NEAREST = 0,
1610 LP_BUILD_ROUND_FLOOR = 1,
1611 LP_BUILD_ROUND_CEIL = 2,
1612 LP_BUILD_ROUND_TRUNCATE = 3
1613 };
1614
1615 /**
1616 * Helper for SSE4.1's ROUNDxx instructions.
1617 *
1618 * NOTE: In SSE4.1's nearest mode, if two values are equally close, the
1619 * result is the even value. That is, rounding 2.5 will be 2.0, and not 3.0.
1620 */
1621 static INLINE LLVMValueRef
1622 lp_build_round_sse41(struct lp_build_context *bld,
1623 LLVMValueRef a,
1624 enum lp_build_round_mode mode)
1625 {
1626 LLVMBuilderRef builder = bld->gallivm->builder;
1627 const struct lp_type type = bld->type;
1628 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1629 const char *intrinsic;
1630 LLVMValueRef res;
1631
1632 assert(type.floating);
1633
1634 assert(lp_check_value(type, a));
1635 assert(util_cpu_caps.has_sse4_1);
1636
1637 if (type.length == 1) {
1638 LLVMTypeRef vec_type;
1639 LLVMValueRef undef;
1640 LLVMValueRef args[3];
1641 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1642
1643 switch(type.width) {
1644 case 32:
1645 intrinsic = "llvm.x86.sse41.round.ss";
1646 break;
1647 case 64:
1648 intrinsic = "llvm.x86.sse41.round.sd";
1649 break;
1650 default:
1651 assert(0);
1652 return bld->undef;
1653 }
1654
1655 vec_type = LLVMVectorType(bld->elem_type, 4);
1656
1657 undef = LLVMGetUndef(vec_type);
1658
1659 args[0] = undef;
1660 args[1] = LLVMBuildInsertElement(builder, undef, a, index0, "");
1661 args[2] = LLVMConstInt(i32t, mode, 0);
1662
1663 res = lp_build_intrinsic(builder, intrinsic,
1664 vec_type, args, Elements(args));
1665
1666 res = LLVMBuildExtractElement(builder, res, index0, "");
1667 }
1668 else {
1669 if (type.width * type.length == 128) {
1670 switch(type.width) {
1671 case 32:
1672 intrinsic = "llvm.x86.sse41.round.ps";
1673 break;
1674 case 64:
1675 intrinsic = "llvm.x86.sse41.round.pd";
1676 break;
1677 default:
1678 assert(0);
1679 return bld->undef;
1680 }
1681 }
1682 else {
1683 assert(type.width * type.length == 256);
1684 assert(util_cpu_caps.has_avx);
1685
1686 switch(type.width) {
1687 case 32:
1688 intrinsic = "llvm.x86.avx.round.ps.256";
1689 break;
1690 case 64:
1691 intrinsic = "llvm.x86.avx.round.pd.256";
1692 break;
1693 default:
1694 assert(0);
1695 return bld->undef;
1696 }
1697 }
1698
1699 res = lp_build_intrinsic_binary(builder, intrinsic,
1700 bld->vec_type, a,
1701 LLVMConstInt(i32t, mode, 0));
1702 }
1703
1704 return res;
1705 }
1706
1707
1708 static INLINE LLVMValueRef
1709 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1710 LLVMValueRef a)
1711 {
1712 LLVMBuilderRef builder = bld->gallivm->builder;
1713 const struct lp_type type = bld->type;
1714 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1715 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1716 const char *intrinsic;
1717 LLVMValueRef res;
1718
1719 assert(type.floating);
1720 /* using the double precision conversions is a bit more complicated */
1721 assert(type.width == 32);
1722
1723 assert(lp_check_value(type, a));
1724 assert(util_cpu_caps.has_sse2);
1725
1726 /* This is relying on MXCSR rounding mode, which should always be nearest. */
1727 if (type.length == 1) {
1728 LLVMTypeRef vec_type;
1729 LLVMValueRef undef;
1730 LLVMValueRef arg;
1731 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1732
1733 vec_type = LLVMVectorType(bld->elem_type, 4);
1734
1735 intrinsic = "llvm.x86.sse.cvtss2si";
1736
1737 undef = LLVMGetUndef(vec_type);
1738
1739 arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
1740
1741 res = lp_build_intrinsic_unary(builder, intrinsic,
1742 ret_type, arg);
1743 }
1744 else {
1745 if (type.width * type.length == 128) {
1746 intrinsic = "llvm.x86.sse2.cvtps2dq";
1747 }
1748 else {
1749 assert(type.width*type.length == 256);
1750 assert(util_cpu_caps.has_avx);
1751
1752 intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
1753 }
1754 res = lp_build_intrinsic_unary(builder, intrinsic,
1755 ret_type, a);
1756 }
1757
1758 return res;
1759 }
1760
1761
1762 /*
1763 * Round using the AltiVec vrfin/vrfim/vrfip/vrfiz instructions, selected by mode. */
1764 static INLINE LLVMValueRef
1765 lp_build_round_altivec(struct lp_build_context *bld,
1766 LLVMValueRef a,
1767 enum lp_build_round_mode mode)
1768 {
1769 LLVMBuilderRef builder = bld->gallivm->builder;
1770 const struct lp_type type = bld->type;
1771 const char *intrinsic = NULL;
1772
1773 assert(type.floating);
1774
1775 assert(lp_check_value(type, a));
1776 assert(util_cpu_caps.has_altivec);
1777
1778 switch (mode) {
1779 case LP_BUILD_ROUND_NEAREST:
1780 intrinsic = "llvm.ppc.altivec.vrfin";
1781 break;
1782 case LP_BUILD_ROUND_FLOOR:
1783 intrinsic = "llvm.ppc.altivec.vrfim";
1784 break;
1785 case LP_BUILD_ROUND_CEIL:
1786 intrinsic = "llvm.ppc.altivec.vrfip";
1787 break;
1788 case LP_BUILD_ROUND_TRUNCATE:
1789 intrinsic = "llvm.ppc.altivec.vrfiz";
1790 break;
1791 }
1792
1793 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1794 }
1795
1796 static INLINE LLVMValueRef
1797 lp_build_round_arch(struct lp_build_context *bld,
1798 LLVMValueRef a,
1799 enum lp_build_round_mode mode)
1800 {
1801 if (util_cpu_caps.has_sse4_1)
1802 return lp_build_round_sse41(bld, a, mode);
1803 else /* (util_cpu_caps.has_altivec) */
1804 return lp_build_round_altivec(bld, a, mode);
1805 }
1806
1807 /**
1808 * Return the integer part of a float (vector) value (== round toward zero).
1809 * The returned value is a float (vector).
1810 * Ex: trunc(-1.5) = -1.0
1811 */
1812 LLVMValueRef
1813 lp_build_trunc(struct lp_build_context *bld,
1814 LLVMValueRef a)
1815 {
1816 LLVMBuilderRef builder = bld->gallivm->builder;
1817 const struct lp_type type = bld->type;
1818
1819 assert(type.floating);
1820 assert(lp_check_value(type, a));
1821
1822 if (arch_rounding_available(type)) {
1823 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
1824 }
1825 else {
1826 const struct lp_type type = bld->type;
1827 struct lp_type inttype;
1828 struct lp_build_context intbld;
1829 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
1830 LLVMValueRef trunc, res, anosign, mask;
1831 LLVMTypeRef int_vec_type = bld->int_vec_type;
1832 LLVMTypeRef vec_type = bld->vec_type;
1833
1834 assert(type.width == 32); /* might want to handle doubles at some point */
1835
1836 inttype = type;
1837 inttype.floating = 0;
1838 lp_build_context_init(&intbld, bld->gallivm, inttype);
1839
1840 /* round by truncation */
1841 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1842 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
1843
1844 /* mask out sign bit */
1845 anosign = lp_build_abs(bld, a);
1846 /*
1847 * mask out all values if anosign > 2^24
1848 * This should work both for large ints (all rounding is no-op for them
1849 * because such floats are always exact) as well as special cases like
1850 * NaNs, Infs (taking advantage of the fact they use max exponent).
1851 * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
1852 */
1853 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1854 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1855 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1856 return lp_build_select(bld, mask, a, res);
1857 }
1858 }
1859
1860
1861 /**
1862 * Return float (vector) rounded to nearest integer (vector). The returned
1863 * value is a float (vector).
1864 * Ex: round(0.9) = 1.0
1865 * Ex: round(-1.5) = -2.0
1866 */
1867 LLVMValueRef
1868 lp_build_round(struct lp_build_context *bld,
1869 LLVMValueRef a)
1870 {
1871 LLVMBuilderRef builder = bld->gallivm->builder;
1872 const struct lp_type type = bld->type;
1873
1874 assert(type.floating);
1875 assert(lp_check_value(type, a));
1876
1877 if (arch_rounding_available(type)) {
1878 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
1879 }
1880 else {
1881 const struct lp_type type = bld->type;
1882 struct lp_type inttype;
1883 struct lp_build_context intbld;
1884 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
1885 LLVMValueRef res, anosign, mask;
1886 LLVMTypeRef int_vec_type = bld->int_vec_type;
1887 LLVMTypeRef vec_type = bld->vec_type;
1888
1889 assert(type.width == 32); /* might want to handle doubles at some point */
1890
1891 inttype = type;
1892 inttype.floating = 0;
1893 lp_build_context_init(&intbld, bld->gallivm, inttype);
1894
1895 res = lp_build_iround(bld, a);
1896 res = LLVMBuildSIToFP(builder, res, vec_type, "");
1897
1898 /* mask out sign bit */
1899 anosign = lp_build_abs(bld, a);
1900 /*
1901 * mask out all values if anosign > 2^24
1902 * This should work both for large ints (all rounding is no-op for them
1903 * because such floats are always exact) as well as special cases like
1904 * NaNs, Infs (taking advantage of the fact they use max exponent).
1905 * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
1906 */
1907 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1908 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1909 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1910 return lp_build_select(bld, mask, a, res);
1911 }
1912 }
1913
1914
1915 /**
1916 * Return floor of float (vector), result is a float (vector)
1917 * Ex: floor(1.1) = 1.0
1918 * Ex: floor(-1.1) = -2.0
1919 */
1920 LLVMValueRef
1921 lp_build_floor(struct lp_build_context *bld,
1922 LLVMValueRef a)
1923 {
1924 LLVMBuilderRef builder = bld->gallivm->builder;
1925 const struct lp_type type = bld->type;
1926
1927 assert(type.floating);
1928 assert(lp_check_value(type, a));
1929
1930 if (arch_rounding_available(type)) {
1931 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
1932 }
1933 else {
1934 const struct lp_type type = bld->type;
1935 struct lp_type inttype;
1936 struct lp_build_context intbld;
1937 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 16777216.0); /* 2^24 */
1938 LLVMValueRef trunc, res, anosign, mask;
1939 LLVMTypeRef int_vec_type = bld->int_vec_type;
1940 LLVMTypeRef vec_type = bld->vec_type;
1941
1942 assert(type.width == 32); /* might want to handle doubles at some point */
1943
1944 inttype = type;
1945 inttype.floating = 0;
1946 lp_build_context_init(&intbld, bld->gallivm, inttype);
1947
1948 /* round by truncation */
1949 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1950 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
1951
1952 if (type.sign) {
1953 LLVMValueRef tmp;
1954
1955 /*
1956 * fix values if rounding is wrong (for non-special cases)
1957 * - this is the case if trunc > a
1958 */
1959 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
1960 /* tmp = trunc > a ? 1.0 : 0.0 */
1961 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
1962 tmp = lp_build_and(&intbld, mask, tmp);
1963 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
1964 res = lp_build_sub(bld, res, tmp);
1965 }
1966
1967 /* mask out sign bit */
1968 anosign = lp_build_abs(bld, a);
1969 /*
1970 * mask out all values if anosign > 2^24
1971 * This should work both for large ints (all rounding is no-op for them
1972 * because such floats are always exact) as well as special cases like
1973 * NaNs, Infs (taking advantage of the fact they use max exponent).
1974 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
1975 */
1976 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1977 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1978 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1979 return lp_build_select(bld, mask, a, res);
1980 }
1981 }
1982
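/*
 * For reference, a scalar sketch of the non-SSE4.1 floor path above
 * (illustrative only; assumes 32-bit IEEE-754 floats):
 *
 *    float ref_floor(float a)
 *    {
 *       float absa = a < 0.0f ? -a : a;
 *       float res  = (float)(int)a;            // truncate toward zero
 *       if (res > a)                           // truncation overshot for negative a,
 *          res -= 1.0f;                        // e.g. trunc(-1.1) = -1.0 -> -2.0
 *       return absa > 16777216.0f ? a : res;   // 2^24 cutoff as above
 *    }
 */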
1983
1984 /**
1985 * Return ceiling of float (vector), returning float (vector).
1986 * Ex: ceil( 1.1) = 2.0
1987 * Ex: ceil(-1.1) = -1.0
1988 */
1989 LLVMValueRef
1990 lp_build_ceil(struct lp_build_context *bld,
1991 LLVMValueRef a)
1992 {
1993 LLVMBuilderRef builder = bld->gallivm->builder;
1994 const struct lp_type type = bld->type;
1995
1996 assert(type.floating);
1997 assert(lp_check_value(type, a));
1998
1999 if (arch_rounding_available(type)) {
2000 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2001 }
2002 else {
2003 const struct lp_type type = bld->type;
2004 struct lp_type inttype;
2005 struct lp_build_context intbld;
2006 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 16777216.0); /* 2^24 */
2007 LLVMValueRef trunc, res, anosign, mask, tmp;
2008 LLVMTypeRef int_vec_type = bld->int_vec_type;
2009 LLVMTypeRef vec_type = bld->vec_type;
2010
2011 assert(type.width == 32); /* might want to handle doubles at some point */
2012
2013 inttype = type;
2014 inttype.floating = 0;
2015 lp_build_context_init(&intbld, bld->gallivm, inttype);
2016
2017 /* round by truncation */
2018 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2019 trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2020
2021 /*
2022 * fix values if rounding is wrong (for non-special cases)
2023 * - this is the case if trunc < a
2024 */
2025 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2026 /* tmp = trunc < a ? 1.0 : 0.0 */
2027 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2028 tmp = lp_build_and(&intbld, mask, tmp);
2029 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2030 res = lp_build_add(bld, trunc, tmp);
2031
2032 /* mask out sign bit */
2033 anosign = lp_build_abs(bld, a);
2034 /*
2035 * mask out all values if anosign > 2^24
2036 * This should work both for large ints (all rounding is no-op for them
2037 * because such floats are always exact) as well as special cases like
2038 * NaNs, Infs (taking advantage of the fact they use max exponent).
2039 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2040 */
2041 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2042 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2043 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2044 return lp_build_select(bld, mask, a, res);
2045 }
2046 }
2047
2048
2049 /**
2050 * Return fractional part of 'a' computed as a - floor(a)
2051 * Typically used in texture coord arithmetic.
2052 */
2053 LLVMValueRef
2054 lp_build_fract(struct lp_build_context *bld,
2055 LLVMValueRef a)
2056 {
2057 assert(bld->type.floating);
2058 return lp_build_sub(bld, a, lp_build_floor(bld, a));
2059 }
2060
2061
2062 /**
2063 * Prevent returning a fractional part of 1.0 for very small negative values of
2064 * 'a' by clamping against 0.99999(9).
2065 */
2066 static inline LLVMValueRef
2067 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2068 {
2069 LLVMValueRef max;
2070
2071 /* this is the largest number smaller than 1.0 representable as float */
2072 max = lp_build_const_vec(bld->gallivm, bld->type,
2073 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2074 return lp_build_min(bld, fract, max);
2075 }
2076
2077
2078 /**
2079 * Same as lp_build_fract, but guarantees that the result is always smaller
2080 * than one.
2081 */
2082 LLVMValueRef
2083 lp_build_fract_safe(struct lp_build_context *bld,
2084 LLVMValueRef a)
2085 {
2086 return clamp_fract(bld, lp_build_fract(bld, a));
2087 }
2088
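/*
 * Why the clamp is needed, in scalar form (illustrative only; assumes
 * <math.h> floorf and 32-bit floats):
 *
 *    float ref_fract_safe(float a)
 *    {
 *       float f = a - floorf(a);                 // for a = -1e-10f this rounds up to 1.0f
 *       const float max_below_one = 0.99999994f; // 1.0 - 2^-24, largest float below 1.0
 *       return f < max_below_one ? f : max_below_one;
 *    }
 *
 * so callers are guaranteed a result strictly smaller than one.
 */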
2089
2090 /**
2091 * Return the integer part of a float (vector) value (== round toward zero).
2092 * The returned value is an integer (vector).
2093 * Ex: itrunc(-1.5) = -1
2094 */
2095 LLVMValueRef
2096 lp_build_itrunc(struct lp_build_context *bld,
2097 LLVMValueRef a)
2098 {
2099 LLVMBuilderRef builder = bld->gallivm->builder;
2100 const struct lp_type type = bld->type;
2101 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2102
2103 assert(type.floating);
2104 assert(lp_check_value(type, a));
2105
2106 return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2107 }
2108
2109
2110 /**
2111 * Return float (vector) rounded to nearest integer (vector). The returned
2112 * value is an integer (vector).
2113 * Ex: iround(0.9) = 1
2114 * Ex: iround(-1.5) = -2
2115 */
2116 LLVMValueRef
2117 lp_build_iround(struct lp_build_context *bld,
2118 LLVMValueRef a)
2119 {
2120 LLVMBuilderRef builder = bld->gallivm->builder;
2121 const struct lp_type type = bld->type;
2122 LLVMTypeRef int_vec_type = bld->int_vec_type;
2123 LLVMValueRef res;
2124
2125 assert(type.floating);
2126
2127 assert(lp_check_value(type, a));
2128
2129 if ((util_cpu_caps.has_sse2 &&
2130 ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2131 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2132 return lp_build_iround_nearest_sse2(bld, a);
2133 }
2134 if (arch_rounding_available(type)) {
2135 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2136 }
2137 else {
2138 LLVMValueRef half;
2139
2140 half = lp_build_const_vec(bld->gallivm, type, 0.5);
2141
2142 if (type.sign) {
2143 LLVMTypeRef vec_type = bld->vec_type;
2144 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2145 (unsigned long long)1 << (type.width - 1));
2146 LLVMValueRef sign;
2147
2148 /* get sign bit */
2149 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2150 sign = LLVMBuildAnd(builder, sign, mask, "");
2151
2152 /* sign * 0.5 */
2153 half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2154 half = LLVMBuildOr(builder, sign, half, "");
2155 half = LLVMBuildBitCast(builder, half, vec_type, "");
2156 }
2157
2158 res = LLVMBuildFAdd(builder, a, half, "");
2159 }
2160
2161 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2162
2163 return res;
2164 }
2165
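/*
 * Scalar sketch of the fallback path above for the signed case (illustrative
 * only): copy the sign of 'a' onto 0.5, add, then truncate.
 *
 *    int ref_iround(float a)
 *    {
 *       float half = a < 0.0f ? -0.5f : 0.5f;
 *       return (int)(a + half);   // truncation now rounds to nearest,
 *                                 // halfway cases away from zero
 *    }
 */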
2166
2167 /**
2168 * Return floor of float (vector), result is an int (vector)
2169 * Ex: ifloor(1.1) = 1
2170 * Ex: ifloor(-1.1) = -2
2171 */
2172 LLVMValueRef
2173 lp_build_ifloor(struct lp_build_context *bld,
2174 LLVMValueRef a)
2175 {
2176 LLVMBuilderRef builder = bld->gallivm->builder;
2177 const struct lp_type type = bld->type;
2178 LLVMTypeRef int_vec_type = bld->int_vec_type;
2179 LLVMValueRef res;
2180
2181 assert(type.floating);
2182 assert(lp_check_value(type, a));
2183
2184 res = a;
2185 if (type.sign) {
2186 if (arch_rounding_available(type)) {
2187 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2188 }
2189 else {
2190 struct lp_type inttype;
2191 struct lp_build_context intbld;
2192 LLVMValueRef trunc, itrunc, mask;
2193
2194 assert(type.floating);
2195 assert(lp_check_value(type, a));
2196
2197 inttype = type;
2198 inttype.floating = 0;
2199 lp_build_context_init(&intbld, bld->gallivm, inttype);
2200
2201 /* round by truncation */
2202 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2203 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2204
2205 /*
2206 * fix values if rounding is wrong (for non-special cases)
2207 * - this is the case if trunc > a
2208 * The results of doing this with NaNs, very large values etc.
2209 * are undefined, but those inputs give undefined results here anyway.
2210 */
2211 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2212 /* cheapie minus one with mask since the mask is minus one / zero */
2213 return lp_build_add(&intbld, itrunc, mask);
2214 }
2215 }
2216
2217 /* convert to int, truncating toward zero */
2218 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2219
2220 return res;
2221 }
2222
2223
2224 /**
2225 * Return ceiling of float (vector), returning int (vector).
2226 * Ex: iceil( 1.1) = 2
2227 * Ex: iceil(-1.1) = -1
2228 */
2229 LLVMValueRef
2230 lp_build_iceil(struct lp_build_context *bld,
2231 LLVMValueRef a)
2232 {
2233 LLVMBuilderRef builder = bld->gallivm->builder;
2234 const struct lp_type type = bld->type;
2235 LLVMTypeRef int_vec_type = bld->int_vec_type;
2236 LLVMValueRef res;
2237
2238 assert(type.floating);
2239 assert(lp_check_value(type, a));
2240
2241 if (arch_rounding_available(type)) {
2242 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2243 }
2244 else {
2245 struct lp_type inttype;
2246 struct lp_build_context intbld;
2247 LLVMValueRef trunc, itrunc, mask;
2248
2249 assert(type.floating);
2250 assert(lp_check_value(type, a));
2251
2252 inttype = type;
2253 inttype.floating = 0;
2254 lp_build_context_init(&intbld, bld->gallivm, inttype);
2255
2256 /* round by truncation */
2257 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2258 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2259
2260 /*
2261 * fix values if rounding is wrong (for non-special cases)
2262 * - this is the case if trunc < a
2263 * The results of doing this with NaNs, very large values etc.
2264 * are undefined, but those inputs give undefined results here anyway.
2265 */
2266 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2267 /* cheapie plus one with mask since the mask is minus one / zero */
2268 return lp_build_sub(&intbld, itrunc, mask);
2269 }
2270
2271 /* convert to int, truncating toward zero */
2272 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2273
2274 return res;
2275 }
2276
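/*
 * The "cheapie" adjustments above rely on lp_build_cmp returning all ones
 * (i.e. -1) for true and 0 for false. Scalar sketch (illustrative only):
 *
 *    int ref_ifloor(float a)
 *    {
 *       int   itrunc = (int)a;
 *       float trunc  = (float)itrunc;
 *       int   mask   = trunc > a ? -1 : 0;   // truncation overshot
 *       return itrunc + mask;                // adding -1 subtracts one
 *    }
 *
 *    int ref_iceil(float a)
 *    {
 *       int   itrunc = (int)a;
 *       float trunc  = (float)itrunc;
 *       int   mask   = trunc < a ? -1 : 0;   // truncation undershot
 *       return itrunc - mask;                // subtracting -1 adds one
 *    }
 */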
2277
2278 /**
2279 * Combined ifloor() & fract().
2280 *
2281 * Preferred to calling the functions separately, as it will ensure that the
2282 * strategy (floor() vs ifloor()) that results in less redundant work is used.
2283 */
2284 void
2285 lp_build_ifloor_fract(struct lp_build_context *bld,
2286 LLVMValueRef a,
2287 LLVMValueRef *out_ipart,
2288 LLVMValueRef *out_fpart)
2289 {
2290 LLVMBuilderRef builder = bld->gallivm->builder;
2291 const struct lp_type type = bld->type;
2292 LLVMValueRef ipart;
2293
2294 assert(type.floating);
2295 assert(lp_check_value(type, a));
2296
2297 if (arch_rounding_available(type)) {
2298 /*
2299 * floor() is easier.
2300 */
2301
2302 ipart = lp_build_floor(bld, a);
2303 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2304 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2305 }
2306 else {
2307 /*
2308 * ifloor() is easier.
2309 */
2310
2311 *out_ipart = lp_build_ifloor(bld, a);
2312 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2313 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2314 }
2315 }
2316
2317
2318 /**
2319 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2320 * always smaller than one.
2321 */
2322 void
2323 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2324 LLVMValueRef a,
2325 LLVMValueRef *out_ipart,
2326 LLVMValueRef *out_fpart)
2327 {
2328 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2329 *out_fpart = clamp_fract(bld, *out_fpart);
2330 }
2331
2332
2333 LLVMValueRef
2334 lp_build_sqrt(struct lp_build_context *bld,
2335 LLVMValueRef a)
2336 {
2337 LLVMBuilderRef builder = bld->gallivm->builder;
2338 const struct lp_type type = bld->type;
2339 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2340 char intrinsic[32];
2341
2342 assert(lp_check_value(type, a));
2343
2344 /* TODO: optimize the constant case */
2345
2346 assert(type.floating);
2347 if (type.length == 1) {
2348 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.f%u", type.width);
2349 }
2350 else {
2351 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
2352 }
2353
2354 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2355 }
2356
2357
2358 /**
2359 * Do one Newton-Raphson step to improve reciprocal precision:
2360 *
2361 * x_{i+1} = x_i * (2 - a * x_i)
2362 *
2363 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2364 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
2365 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
2366 * halo. It would be necessary to clamp the argument to prevent this.
2367 *
2368 * See also:
2369 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2370 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2371 */
2372 static INLINE LLVMValueRef
2373 lp_build_rcp_refine(struct lp_build_context *bld,
2374 LLVMValueRef a,
2375 LLVMValueRef rcp_a)
2376 {
2377 LLVMBuilderRef builder = bld->gallivm->builder;
2378 LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
2379 LLVMValueRef res;
2380
2381 res = LLVMBuildFMul(builder, a, rcp_a, "");
2382 res = LLVMBuildFSub(builder, two, res, "");
2383 res = LLVMBuildFMul(builder, rcp_a, res, "");
2384
2385 return res;
2386 }
2387
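/*
 * Numerically, one such step roughly doubles the number of correct bits of
 * the initial estimate. Scalar sketch plus a worked example (illustrative
 * only):
 *
 *    float ref_rcp_refine(float a, float rcp_a)   // rcp_a ~= 1/a, e.g. from rcpps
 *    {
 *       return rcp_a * (2.0f - a * rcp_a);
 *    }
 *
 * E.g. a = 3.0, rcp_a = 0.333: 0.333 * (2 - 0.999) = 0.333333, already good
 * to ~6 digits. For a = 0 the estimate is +Inf and the step gives
 * Inf * (2 - NaN) = NaN, which is the non-conformant case noted above.
 */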
2388
2389 LLVMValueRef
2390 lp_build_rcp(struct lp_build_context *bld,
2391 LLVMValueRef a)
2392 {
2393 LLVMBuilderRef builder = bld->gallivm->builder;
2394 const struct lp_type type = bld->type;
2395
2396 assert(lp_check_value(type, a));
2397
2398 if(a == bld->zero)
2399 return bld->undef;
2400 if(a == bld->one)
2401 return bld->one;
2402 if(a == bld->undef)
2403 return bld->undef;
2404
2405 assert(type.floating);
2406
2407 if(LLVMIsConstant(a))
2408 return LLVMConstFDiv(bld->one, a);
2409
2410 /*
2411 * We don't use RCPPS because:
2412 * - it only has 10 bits of precision
2413 * - it doesn't even get the reciprocal of 1.0 exactly
2414 * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2415 * - for recent processors the benefit over DIVPS is marginal and case
2416 *   dependent
2417 *
2418 * We could still use it on certain processors if benchmarks show that the
2419 * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
2420 * particular uses that require fewer workarounds.
2421 */
2422
2423 if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2424 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2425 const unsigned num_iterations = 0;
2426 LLVMValueRef res;
2427 unsigned i;
2428 const char *intrinsic = NULL;
2429
2430 if (type.length == 4) {
2431 intrinsic = "llvm.x86.sse.rcp.ps";
2432 }
2433 else {
2434 intrinsic = "llvm.x86.avx.rcp.ps.256";
2435 }
2436
2437 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2438
2439 for (i = 0; i < num_iterations; ++i) {
2440 res = lp_build_rcp_refine(bld, a, res);
2441 }
2442
2443 return res;
2444 }
2445
2446 return LLVMBuildFDiv(builder, bld->one, a, "");
2447 }
2448
2449
2450 /**
2451 * Do one Newton-Raphson step to improve rsqrt precision:
2452 *
2453 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2454 *
2455 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2456 */
2457 static INLINE LLVMValueRef
2458 lp_build_rsqrt_refine(struct lp_build_context *bld,
2459 LLVMValueRef a,
2460 LLVMValueRef rsqrt_a)
2461 {
2462 LLVMBuilderRef builder = bld->gallivm->builder;
2463 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2464 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2465 LLVMValueRef res;
2466
2467 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2468 res = LLVMBuildFMul(builder, a, res, "");
2469 res = LLVMBuildFSub(builder, three, res, "");
2470 res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2471 res = LLVMBuildFMul(builder, half, res, "");
2472
2473 return res;
2474 }
2475
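/*
 * Scalar sketch with a worked example (illustrative only):
 *
 *    float ref_rsqrt_refine(float a, float r)   // r ~= 1/sqrt(a), e.g. from rsqrtps
 *    {
 *       return 0.5f * r * (3.0f - a * r * r);
 *    }
 *
 * E.g. a = 4.0, r = 0.49: 0.5 * 0.49 * (3 - 4 * 0.2401) = 0.4997 versus the
 * exact 0.5; a second step recovers essentially all float bits.
 */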
2476
2477 /**
2478 * Generate 1/sqrt(a).
2479 * Result is undefined for values < 0, infinity for +0.
2480 */
2481 LLVMValueRef
2482 lp_build_rsqrt(struct lp_build_context *bld,
2483 LLVMValueRef a)
2484 {
2485 LLVMBuilderRef builder = bld->gallivm->builder;
2486 const struct lp_type type = bld->type;
2487
2488 assert(lp_check_value(type, a));
2489
2490 assert(type.floating);
2491
2492 /*
2493 * This should be faster but all denormals will end up as infinity.
2494 */
2495 if (0 && lp_build_fast_rsqrt_available(type)) {
2496 const unsigned num_iterations = 1;
2497 LLVMValueRef res;
2498 unsigned i;
2499
2500 /* rsqrt(1.0) != 1.0 here */
2501 res = lp_build_fast_rsqrt(bld, a);
2502
2503 if (num_iterations) {
2504 /*
2505 * Newton-Raphson will result in NaN instead of infinity for zero,
2506 * and NaN instead of zero for infinity.
2507 * Also, need to ensure rsqrt(1.0) == 1.0.
2508 * All numbers smaller than FLT_MIN will result in +infinity
2509 * (rsqrtps treats all denormals as zero).
2510 */
2511 /*
2512 * Certain non-C99 compilers don't know INFINITY and might not support
2513 * hacks to evaluate it at compile time either.
2514 */
2515 const unsigned posinf_int = 0x7F800000;
2516 LLVMValueRef cmp;
2517 LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2518 LLVMValueRef inf = lp_build_const_int_vec(bld->gallivm, type, posinf_int);
2519
2520 inf = LLVMBuildBitCast(builder, inf, lp_build_vec_type(bld->gallivm, type), "");
2521
2522 for (i = 0; i < num_iterations; ++i) {
2523 res = lp_build_rsqrt_refine(bld, a, res);
2524 }
2525 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2526 res = lp_build_select(bld, cmp, inf, res);
2527 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2528 res = lp_build_select(bld, cmp, bld->zero, res);
2529 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2530 res = lp_build_select(bld, cmp, bld->one, res);
2531 }
2532
2533 return res;
2534 }
2535
2536 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2537 }
2538
2539 /**
2540 * Report whether a fast (but inaccurate) rsqrt instruction is available.
2541 * (Callers may want to avoid rsqrt_fast when it is not: x^0.5 can be
2542 * computed as rsqrt_fast(x) * x, but without a native rsqrt that turns
2543 * into sqrt/div/mul, so it is obviously better to just call sqrt,
2544 * skipping both the div and the mul.)
2545 */
2546 boolean
2547 lp_build_fast_rsqrt_available(struct lp_type type)
2548 {
2549 assert(type.floating);
2550
2551 if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2552 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2553 return true;
2554 }
2555 return false;
2556 }
2557
2558
2559 /**
2560 * Generate 1/sqrt(a).
2561 * Result is undefined for values < 0, infinity for +0.
2562 * Precision is limited, only ~10 bits guaranteed
2563 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2564 */
2565 LLVMValueRef
2566 lp_build_fast_rsqrt(struct lp_build_context *bld,
2567 LLVMValueRef a)
2568 {
2569 LLVMBuilderRef builder = bld->gallivm->builder;
2570 const struct lp_type type = bld->type;
2571
2572 assert(lp_check_value(type, a));
2573
2574 if (lp_build_fast_rsqrt_available(type)) {
2575 const char *intrinsic = NULL;
2576
2577 if (type.length == 4) {
2578 intrinsic = "llvm.x86.sse.rsqrt.ps";
2579 }
2580 else {
2581 intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2582 }
2583 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2584 }
2585 else {
2586 debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2587 }
2588 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2589 }
2590
2591
2592 /**
2593 * Generate sin(a) using SSE2
2594 */
2595 LLVMValueRef
2596 lp_build_sin(struct lp_build_context *bld,
2597 LLVMValueRef a)
2598 {
2599 struct gallivm_state *gallivm = bld->gallivm;
2600 LLVMBuilderRef builder = gallivm->builder;
2601 struct lp_type int_type = lp_int_type(bld->type);
2602 LLVMBuilderRef b = builder;
2603
2604 /*
2605 * take the absolute value,
2606 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2607 */
2608
2609 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2610 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2611
2612 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2613 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2614
2615 /*
2616 * extract the sign bit (upper one)
2617 * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
2618 */
2619 LLVMValueRef sig_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2620 LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i");
2621
2622 /*
2623 * scale by 4/Pi
2624 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2625 */
2626
2627 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2628 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2629
2630 /*
2631 * store the integer part of y in mm0
2632 * emm2 = _mm_cvttps_epi32(y);
2633 */
2634
2635 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2636
2637 /*
2638 * j=(j+1) & (~1) (see the cephes sources)
2639 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2640 */
2641
2642 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2643 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2644 /*
2645 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2646 */
2647 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2648 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2649
2650 /*
2651 * y = _mm_cvtepi32_ps(emm2);
2652 */
2653 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2654
2655 /* get the swap sign flag
2656 * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
2657 */
2658 LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2659 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and");
2660
2661 /*
2662 * emm2 = _mm_slli_epi32(emm0, 29);
2663 */
2664 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2665 LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit");
2666
2667 /*
2668 * get the polynomial selection mask
2669 * there is one polynomial for 0 <= x <= Pi/4
2670 * and another one for Pi/4 < x <= Pi/2
2671 * Both branches will be computed.
2672 *
2673 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2674 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2675 */
2676
2677 LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2678 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3");
2679 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2680 int_type, PIPE_FUNC_EQUAL,
2681 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2682 /*
2683 * sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
2684 */
2685 LLVMValueRef sign_bit_1 = LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit");
2686
2687 /*
2688 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2689 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2690 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2691 */
2692 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2693 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2694 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2695
2696 /*
2697 * The magic pass: "Extended precision modular arithmetic"
2698 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2699 * xmm1 = _mm_mul_ps(y, xmm1);
2700 * xmm2 = _mm_mul_ps(y, xmm2);
2701 * xmm3 = _mm_mul_ps(y, xmm3);
2702 */
2703 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
2704 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
2705 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
2706
2707 /*
2708 * x = _mm_add_ps(x, xmm1);
2709 * x = _mm_add_ps(x, xmm2);
2710 * x = _mm_add_ps(x, xmm3);
2711 */
2712
2713 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
2714 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
2715 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
2716
2717 /*
2718 * Evaluate the first polynomial (0 <= x <= Pi/4)
2719 *
2720 * z = _mm_mul_ps(x,x);
2721 */
2722 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2723
2724 /*
2725 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2726 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2727 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2728 */
2729 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2730 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2731 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2732
2733 /*
2734 * y = *(v4sf*)_ps_coscof_p0;
2735 * y = _mm_mul_ps(y, z);
2736 */
2737 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
2738 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
2739 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
2740 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
2741 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2742 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2743
2744
2745 /*
2746 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2747 * y = _mm_sub_ps(y, tmp);
2748 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2749 */
2750 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2751 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2752 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2753 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2754 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2755
2756 /*
2757 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2758 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2759 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2760 */
2761 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2762 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2763 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2764
2765 /*
2766 * Evaluate the second polynomial (Pi/4 < x <= Pi/2)
2767 *
2768 * y2 = *(v4sf*)_ps_sincof_p0;
2769 * y2 = _mm_mul_ps(y2, z);
2770 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2771 * y2 = _mm_mul_ps(y2, z);
2772 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2773 * y2 = _mm_mul_ps(y2, z);
2774 * y2 = _mm_mul_ps(y2, x);
2775 * y2 = _mm_add_ps(y2, x);
2776 */
2777
2778 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
2779 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
2780 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
2781 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
2782 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2783 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
2784 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
2785
2786 /*
2787 * select the correct result from the two polynomials
2788 * xmm3 = poly_mask;
2789 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2790 * y = _mm_andnot_ps(xmm3, y);
2791 * y = _mm_or_ps(y,y2);
2792 */
2793 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2794 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2795 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2796 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
2797 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2798 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
2799
2800 /*
2801 * update the sign
2802 * y = _mm_xor_ps(y, sign_bit);
2803 */
2804 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin");
2805 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
2806 LLVMValueRef isfinite = lp_build_isfinite(bld, a);
2807
2808 /* clamp output to be within [-1, 1] */
2809 y_result = lp_build_clamp(bld, y_result,
2810 lp_build_const_vec(bld->gallivm, bld->type, -1.f),
2811 lp_build_const_vec(bld->gallivm, bld->type, 1.f));
2812 /* If a is -inf, inf or NaN then return NaN */
2813 y_result = lp_build_select(bld, isfinite, y_result,
2814 lp_build_const_vec(bld->gallivm, bld->type, NAN));
2815 return y_result;
2816 }
2817
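/*
 * Scalar sketch of the argument reduction performed above (illustrative only;
 * ref_sin_poly/ref_cos_poly are hypothetical stand-ins for the two minimax
 * polynomials evaluated above, <math.h> is assumed, and the final clamp /
 * NaN selection is omitted):
 *
 *    float ref_sin(float x)
 *    {
 *       int   sign = x < 0.0f;                        // sign handled separately
 *       float ax   = fabsf(x);
 *       int   j    = (int)(ax * 1.27323954473516f);   // |x| * 4/pi
 *       j = (j + 1) & ~1;                             // j = (j+1) & ~1: force j even
 *       float y = (float)j;
 *       // "Extended precision modular arithmetic": ax - j*pi/4 in three steps
 *       ax = ((ax - y * 0.78515625f)
 *                 - y * 2.4187564849853515625e-4f)
 *                 - y * 3.77489497744594108e-8f;
 *       if (j & 4)                                    // octants 4..7 flip the sign
 *          sign = !sign;
 *       float p = (j & 2) ? ref_cos_poly(ax) : ref_sin_poly(ax);
 *       return sign ? -p : p;
 *    }
 */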
2818
2819 /**
2820 * Generate cos(a) using SSE2
2821 */
2822 LLVMValueRef
2823 lp_build_cos(struct lp_build_context *bld,
2824 LLVMValueRef a)
2825 {
2826 struct gallivm_state *gallivm = bld->gallivm;
2827 LLVMBuilderRef builder = gallivm->builder;
2828 struct lp_type int_type = lp_int_type(bld->type);
2829 LLVMBuilderRef b = builder;
2830
2831 /*
2832 * take the absolute value,
2833 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2834 */
2835
2836 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2837 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2838
2839 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2840 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2841
2842 /*
2843 * scale by 4/Pi
2844 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2845 */
2846
2847 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2848 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2849
2850 /*
2851 * store the integer part of y in mm0
2852 * emm2 = _mm_cvttps_epi32(y);
2853 */
2854
2855 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2856
2857 /*
2858 * j=(j+1) & (~1) (see the cephes sources)
2859 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2860 */
2861
2862 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2863 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2864 /*
2865 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2866 */
2867 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2868 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2869
2870 /*
2871 * y = _mm_cvtepi32_ps(emm2);
2872 */
2873 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2874
2875
2876 /*
2877 * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
2878 */
2879 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2880 LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2");
2881
2882
2883 /* get the swap sign flag
2884 * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
2885 */
2886 LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0);
2887 LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not");
2888 LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2889 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and");
2890
2891 /*
2892 * emm2 = _mm_slli_epi32(emm0, 29);
2893 */
2894 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2895 LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit");
2896
2897 /*
2898 * get the polynomial selection mask
2899 * there is one polynomial for 0 <= x <= Pi/4
2900 * and another one for Pi/4 < x <= Pi/2
2901 * Both branches will be computed.
2902 *
2903 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2904 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2905 */
2906
2907 LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2908 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3");
2909 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2910 int_type, PIPE_FUNC_EQUAL,
2911 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2912
2913 /*
2914 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2915 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2916 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2917 */
2918 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2919 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2920 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2921
2922 /*
2923 * The magic pass: "Extended precision modular arithmetic"
2924 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2925 * xmm1 = _mm_mul_ps(y, xmm1);
2926 * xmm2 = _mm_mul_ps(y, xmm2);
2927 * xmm3 = _mm_mul_ps(y, xmm3);
2928 */
2929 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
2930 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
2931 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
2932
2933 /*
2934 * x = _mm_add_ps(x, xmm1);
2935 * x = _mm_add_ps(x, xmm2);
2936 * x = _mm_add_ps(x, xmm3);
2937 */
2938
2939 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
2940 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
2941 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
2942
2943 /*
2944 * Evaluate the first polynomial (0 <= x <= Pi/4)
2945 *
2946 * z = _mm_mul_ps(x,x);
2947 */
2948 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2949
2950 /*
2951 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2952 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2953 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2954 */
2955 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2956 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2957 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2958
2959 /*
2960 * y = *(v4sf*)_ps_coscof_p0;
2961 * y = _mm_mul_ps(y, z);
2962 */
2963 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
2964 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
2965 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
2966 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
2967 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2968 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2969
2970
2971 /*
2972 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2973 * y = _mm_sub_ps(y, tmp);
2974 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2975 */
2976 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2977 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2978 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2979 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2980 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2981
2982 /*
2983 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2984 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2985 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2986 */
2987 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2988 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2989 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2990
2991 /*
2992 * Evaluate the second polynomial (Pi/4 < x <= Pi/2)
2993 *
2994 * y2 = *(v4sf*)_ps_sincof_p0;
2995 * y2 = _mm_mul_ps(y2, z);
2996 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2997 * y2 = _mm_mul_ps(y2, z);
2998 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2999 * y2 = _mm_mul_ps(y2, z);
3000 * y2 = _mm_mul_ps(y2, x);
3001 * y2 = _mm_add_ps(y2, x);
3002 */
3003
3004 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
3005 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
3006 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
3007 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
3008 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
3009 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
3010 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
3011
3012 /*
3013 * select the correct result from the two polynomials
3014 * xmm3 = poly_mask;
3015 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
3016 * y = _mm_andnot_ps(xmm3, y);
3017 * y = _mm_or_ps(y,y2);
3018 */
3019 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
3020 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
3021 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
3022 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
3023 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
3024 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
3025
3026 /*
3027 * update the sign
3028 * y = _mm_xor_ps(y, sign_bit);
3029 */
3030 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sin");
3031 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
3032 LLVMValueRef isfinite = lp_build_isfinite(bld, a);
3033
3034 /* clamp output to be within [-1, 1] */
3035 y_result = lp_build_clamp(bld, y_result,
3036 lp_build_const_vec(bld->gallivm, bld->type, -1.f),
3037 lp_build_const_vec(bld->gallivm, bld->type, 1.f));
3038 /* If a is -inf, inf or NaN then return NaN */
3039 y_result = lp_build_select(bld, isfinite, y_result,
3040 lp_build_const_vec(bld->gallivm, bld->type, NAN));
3041 return y_result;
3042 }
3043
3044
3045 /**
3046 * Generate pow(x, y)
3047 */
3048 LLVMValueRef
3049 lp_build_pow(struct lp_build_context *bld,
3050 LLVMValueRef x,
3051 LLVMValueRef y)
3052 {
3053 /* TODO: optimize the constant case */
3054 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3055 LLVMIsConstant(x) && LLVMIsConstant(y)) {
3056 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3057 __FUNCTION__);
3058 }
3059
3060 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
3061 }
3062
3063
3064 /**
3065 * Generate exp(x)
3066 */
3067 LLVMValueRef
3068 lp_build_exp(struct lp_build_context *bld,
3069 LLVMValueRef x)
3070 {
3071 /* log2(e) = 1/log(2) */
3072 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
3073 1.4426950408889634);
3074
3075 assert(lp_check_value(bld->type, x));
3076
3077 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
3078 }
3079
3080
3081 /**
3082 * Generate log(x)
3083 */
3084 LLVMValueRef
3085 lp_build_log(struct lp_build_context *bld,
3086 LLVMValueRef x)
3087 {
3088 /* log(2) */
3089 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3090 0.69314718055994529);
3091
3092 assert(lp_check_value(bld->type, x));
3093
3094 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
3095 }
3096
3097
3098 /**
3099 * Generate polynomial.
3100 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
3101 */
3102 LLVMValueRef
3103 lp_build_polynomial(struct lp_build_context *bld,
3104 LLVMValueRef x,
3105 const double *coeffs,
3106 unsigned num_coeffs)
3107 {
3108 const struct lp_type type = bld->type;
3109 LLVMValueRef even = NULL, odd = NULL;
3110 LLVMValueRef x2;
3111 unsigned i;
3112
3113 assert(lp_check_value(bld->type, x));
3114
3115 /* TODO: optimize the constant case */
3116 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3117 LLVMIsConstant(x)) {
3118 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3119 __FUNCTION__);
3120 }
3121
3122 /*
3123 * Calculate odd and even terms separately to decrease data dependency
3124 * Ex:
3125 * c[0] + x^2 * c[2] + x^4 * c[4] ...
3126 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
3127 */
3128 x2 = lp_build_mul(bld, x, x);
3129
3130 for (i = num_coeffs; i--; ) {
3131 LLVMValueRef coeff;
3132
3133 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
3134
3135 if (i % 2 == 0) {
3136 if (even)
3137 even = lp_build_add(bld, coeff, lp_build_mul(bld, x2, even));
3138 else
3139 even = coeff;
3140 } else {
3141 if (odd)
3142 odd = lp_build_add(bld, coeff, lp_build_mul(bld, x2, odd));
3143 else
3144 odd = coeff;
3145 }
3146 }
3147
3148 if (odd)
3149 return lp_build_add(bld, lp_build_mul(bld, odd, x), even);
3150 else if (even)
3151 return even;
3152 else
3153 return bld->undef;
3154 }
3155
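/*
 * Scalar sketch of the even/odd split for a 4-coefficient case (illustrative
 * only): both halves are short Horner chains in x^2 that can issue in
 * parallel and are recombined at the end.
 *
 *    // c[0] + c[1]*x + c[2]*x^2 + c[3]*x^3
 *    float ref_poly4(float x, const float c[4])
 *    {
 *       float x2   = x * x;
 *       float even = c[0] + x2 * c[2];
 *       float odd  = c[1] + x2 * c[3];
 *       return odd * x + even;
 *    }
 */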
3156
3157 /**
3158 * Minimax polynomial fit of 2**x, in range [0, 1[
3159 */
3160 const double lp_build_exp2_polynomial[] = {
3161 #if EXP_POLY_DEGREE == 5
3162 1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
3163 0.693153073200168932794,
3164 0.240153617044375388211,
3165 0.0558263180532956664775,
3166 0.00898934009049466391101,
3167 0.00187757667519147912699
3168 #elif EXP_POLY_DEGREE == 4
3169 1.00000259337069434683,
3170 0.693003834469974940458,
3171 0.24144275689150793076,
3172 0.0520114606103070150235,
3173 0.0135341679161270268764
3174 #elif EXP_POLY_DEGREE == 3
3175 0.999925218562710312959,
3176 0.695833540494823811697,
3177 0.226067155427249155588,
3178 0.0780245226406372992967
3179 #elif EXP_POLY_DEGREE == 2
3180 1.00172476321474503578,
3181 0.657636275736077639316,
3182 0.33718943461968720704
3183 #else
3184 #error
3185 #endif
3186 };
3187
3188
3189 void
3190 lp_build_exp2_approx(struct lp_build_context *bld,
3191 LLVMValueRef x,
3192 LLVMValueRef *p_exp2_int_part,
3193 LLVMValueRef *p_frac_part,
3194 LLVMValueRef *p_exp2)
3195 {
3196 LLVMBuilderRef builder = bld->gallivm->builder;
3197 const struct lp_type type = bld->type;
3198 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3199 LLVMValueRef ipart = NULL;
3200 LLVMValueRef fpart = NULL;
3201 LLVMValueRef expipart = NULL;
3202 LLVMValueRef expfpart = NULL;
3203 LLVMValueRef res = NULL;
3204
3205 assert(lp_check_value(bld->type, x));
3206
3207 if(p_exp2_int_part || p_frac_part || p_exp2) {
3208 /* TODO: optimize the constant case */
3209 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3210 LLVMIsConstant(x)) {
3211 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3212 __FUNCTION__);
3213 }
3214
3215 assert(type.floating && type.width == 32);
3216
3217 /* We want to preserve NaN and make sure that for exp2 if x > 128,
3218 * the result is INF and if it's smaller than -126.9 the result is 0 */
3219 x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
3220 GALLIVM_NAN_RETURN_SECOND);
3221 x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999), x,
3222 GALLIVM_NAN_RETURN_SECOND);
3223
3224 /* ipart = floor(x) */
3225 /* fpart = x - ipart */
3226 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3227 }
3228
3229 if(p_exp2_int_part || p_exp2) {
3230 /* expipart = (float) (1 << ipart) */
3231 expipart = LLVMBuildAdd(builder, ipart,
3232 lp_build_const_int_vec(bld->gallivm, type, 127), "");
3233 expipart = LLVMBuildShl(builder, expipart,
3234 lp_build_const_int_vec(bld->gallivm, type, 23), "");
3235 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
3236 }
3237
3238 if(p_exp2) {
3239 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3240 Elements(lp_build_exp2_polynomial));
3241
3242 res = LLVMBuildFMul(builder, expipart, expfpart, "");
3243 }
3244
3245 if(p_exp2_int_part)
3246 *p_exp2_int_part = expipart;
3247
3248 if(p_frac_part)
3249 *p_frac_part = fpart;
3250
3251 if(p_exp2)
3252 *p_exp2 = res;
3253 }
3254
3255
3256 LLVMValueRef
3257 lp_build_exp2(struct lp_build_context *bld,
3258 LLVMValueRef x)
3259 {
3260 LLVMValueRef res;
3261 lp_build_exp2_approx(bld, x, NULL, NULL, &res);
3262 return res;
3263 }
3264
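/*
 * The decomposition above in scalar form (illustrative only; ref_exp2_poly is
 * a hypothetical stand-in for lp_build_exp2_polynomial, <math.h> is assumed,
 * and the NaN-preserving min/max of the real code is reduced to a plain clamp):
 *
 *    float ref_exp2(float x)
 *    {
 *       x = x > 128.0f ? 128.0f : (x < -126.99999f ? -126.99999f : x);
 *       int   ipart = (int)floorf(x);
 *       float fpart = x - (float)ipart;           // in [0, 1)
 *       union { int i; float f; } u;
 *       u.i = (ipart + 127) << 23;                // builds 2^ipart directly
 *       return u.f * ref_exp2_poly(fpart);        // 2^ipart * 2^fpart
 *    }
 */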
3265
3266 /**
3267 * Extract the exponent of an IEEE-754 floating point value.
3268 *
3269 * Optionally apply an integer bias.
3270 *
3271 * Result is an integer value with
3272 *
3273 * ifloor(log2(x)) + bias
3274 */
3275 LLVMValueRef
3276 lp_build_extract_exponent(struct lp_build_context *bld,
3277 LLVMValueRef x,
3278 int bias)
3279 {
3280 LLVMBuilderRef builder = bld->gallivm->builder;
3281 const struct lp_type type = bld->type;
3282 unsigned mantissa = lp_mantissa(type);
3283 LLVMValueRef res;
3284
3285 assert(type.floating);
3286
3287 assert(lp_check_value(bld->type, x));
3288
3289 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3290
3291 res = LLVMBuildLShr(builder, x,
3292 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3293 res = LLVMBuildAnd(builder, res,
3294 lp_build_const_int_vec(bld->gallivm, type, 255), "");
3295 res = LLVMBuildSub(builder, res,
3296 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3297
3298 return res;
3299 }
3300
3301
3302 /**
3303 * Extract the mantissa of a floating point value.
3304 *
3305 * Result is a floating point value with
3306 *
3307 * x / 2**floor(log2(x))
3308 */
3309 LLVMValueRef
3310 lp_build_extract_mantissa(struct lp_build_context *bld,
3311 LLVMValueRef x)
3312 {
3313 LLVMBuilderRef builder = bld->gallivm->builder;
3314 const struct lp_type type = bld->type;
3315 unsigned mantissa = lp_mantissa(type);
3316 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3317 (1ULL << mantissa) - 1);
3318 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3319 LLVMValueRef res;
3320
3321 assert(lp_check_value(bld->type, x));
3322
3323 assert(type.floating);
3324
3325 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3326
3327 /* res = x / 2**ipart */
3328 res = LLVMBuildAnd(builder, x, mantmask, "");
3329 res = LLVMBuildOr(builder, res, one, "");
3330 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3331
3332 return res;
3333 }
3334
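/*
 * Both extractions are plain IEEE-754 single-precision bit twiddling:
 * sign(1) | exponent(8, bias 127) | mantissa(23). Scalar equivalents
 * (illustrative only, helper names made up):
 *
 *    int ref_extract_exponent(float x, int bias)
 *    {
 *       union { float f; unsigned u; } v = { x };
 *       return (int)((v.u >> 23) & 0xff) - 127 + bias;
 *    }
 *
 *    float ref_extract_mantissa(float x)   // x / 2**floor(log2(x)), in [1, 2)
 *    {
 *       union { float f; unsigned u; } v = { x };
 *       v.u = (v.u & 0x007fffff) | 0x3f800000;   // keep mantissa, force exponent of 1.0
 *       return v.f;
 *    }
 */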
3335
3336
3337 /**
3338 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in the range [0, 1/9[
3339 * These coefficients can be generated with
3340 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3341 */
3342 const double lp_build_log2_polynomial[] = {
3343 #if LOG_POLY_DEGREE == 5
3344 2.88539008148777786488L,
3345 0.961796878841293367824L,
3346 0.577058946784739859012L,
3347 0.412914355135828735411L,
3348 0.308591899232910175289L,
3349 0.352376952300281371868L,
3350 #elif LOG_POLY_DEGREE == 4
3351 2.88539009343309178325L,
3352 0.961791550404184197881L,
3353 0.577440339438736392009L,
3354 0.403343858251329912514L,
3355 0.406718052498846252698L,
3356 #elif LOG_POLY_DEGREE == 3
3357 2.88538959748872753838L,
3358 0.961932915889597772928L,
3359 0.571118517972136195241L,
3360 0.493997535084709500285L,
3361 #else
3362 #error
3363 #endif
3364 };
3365
3366 /**
3367 * See http://www.devmaster.net/forums/showthread.php?p=43580
3368 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3369 * http://www.nezumi.demon.co.uk/consult/logx.htm
3370 *
3371 * If handle_edge_cases is true the function will perform computations
3372 * to match the required D3D10+ behavior for each of the edge cases.
3373 * That means that if input is:
3374 * - less than zero (down to and including -inf) then NaN will be returned
3375 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3376 * - +infinity, then +infinity will be returned
3377 * - NaN, then NaN will be returned
3378 *
3379 * Those checks are fairly expensive so if you don't need them make sure
3380 * handle_edge_cases is false.
3381 */
3382 void
3383 lp_build_log2_approx(struct lp_build_context *bld,
3384 LLVMValueRef x,
3385 LLVMValueRef *p_exp,
3386 LLVMValueRef *p_floor_log2,
3387 LLVMValueRef *p_log2,
3388 boolean handle_edge_cases)
3389 {
3390 LLVMBuilderRef builder = bld->gallivm->builder;
3391 const struct lp_type type = bld->type;
3392 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3393 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3394
3395 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3396 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3397 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3398
3399 LLVMValueRef i = NULL;
3400 LLVMValueRef y = NULL;
3401 LLVMValueRef z = NULL;
3402 LLVMValueRef exp = NULL;
3403 LLVMValueRef mant = NULL;
3404 LLVMValueRef logexp = NULL;
3405 LLVMValueRef logmant = NULL;
3406 LLVMValueRef res = NULL;
3407
3408 assert(lp_check_value(bld->type, x));
3409
3410 if(p_exp || p_floor_log2 || p_log2) {
3411 /* TODO: optimize the constant case */
3412 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3413 LLVMIsConstant(x)) {
3414 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3415 __FUNCTION__);
3416 }
3417
3418 assert(type.floating && type.width == 32);
3419
3420 /*
3421 * We don't explicitly handle denormalized numbers. They will yield a
3422 * result in the neighbourhood of -127, which appears to be
3423 * adequate.
3424 */
3425
3426 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3427
3428 /* exp = (float) exponent(x) */
3429 exp = LLVMBuildAnd(builder, i, expmask, "");
3430 }
3431
3432 if(p_floor_log2 || p_log2) {
3433 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3434 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3435 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3436 }
3437
3438 if(p_log2) {
3439 /* mant = 1 + (float) mantissa(x) */
3440 mant = LLVMBuildAnd(builder, i, mantmask, "");
3441 mant = LLVMBuildOr(builder, mant, one, "");
3442 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3443
3444 /* y = (mant - 1) / (mant + 1) */
3445 y = lp_build_div(bld,
3446 lp_build_sub(bld, mant, bld->one),
3447 lp_build_add(bld, mant, bld->one)
3448 );
3449
3450 /* z = y^2 */
3451 z = lp_build_mul(bld, y, y);
3452
3453 /* compute P(z) */
3454 logmant = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3455 Elements(lp_build_log2_polynomial));
3456
3457 /* logmant = y * P(z) */
3458 logmant = lp_build_mul(bld, y, logmant);
3459
3460 res = lp_build_add(bld, logmant, logexp);
3461
3462 if (type.floating && handle_edge_cases) {
3463 LLVMValueRef negmask, infmask, zmask;
3464 negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3465 lp_build_const_vec(bld->gallivm, type, 0.0f));
3466 zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3467 lp_build_const_vec(bld->gallivm, type, 0.0f));
3468 infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3469 lp_build_const_vec(bld->gallivm, type, INFINITY));
3470
3471 /* If x is equal to inf make sure we return inf */
3472 res = lp_build_select(bld, infmask,
3473 lp_build_const_vec(bld->gallivm, type, INFINITY),
3474 res);
3475 /* If x is equal to 0, return -inf */
3476 res = lp_build_select(bld, zmask,
3477 lp_build_const_vec(bld->gallivm, type, -INFINITY),
3478 res);
3479 /* If x is nan or less than 0, return nan */
3480 res = lp_build_select(bld, negmask,
3481 lp_build_const_vec(bld->gallivm, type, NAN),
3482 res);
3483 }
3484 }
3485
3486 if(p_exp) {
3487 exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3488 *p_exp = exp;
3489 }
3490
3491 if(p_floor_log2)
3492 *p_floor_log2 = logexp;
3493
3494 if(p_log2)
3495 *p_log2 = res;
3496 }
3497
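/*
 * The core of the computation above, in scalar form (illustrative only;
 * ref_extract_exponent/ref_extract_mantissa are the sketches given earlier,
 * ref_log2_poly stands for lp_build_log2_polynomial, edge cases omitted):
 *
 *    float ref_log2(float x)
 *    {
 *       int   e = ref_extract_exponent(x, 0);    // floor(log2(x))
 *       float m = ref_extract_mantissa(x);       // x / 2^e, in [1, 2)
 *       float y = (m - 1.0f) / (m + 1.0f);       // in [0, 1/3), so y^2 < 1/9
 *       float z = y * y;
 *       return (float)e + y * ref_log2_poly(z);  // log2(m) = y * P(y^2)
 *    }
 */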
3498
3499 /*
3500 * log2 implementation which doesn't have special code to
3501 * handle edge cases (-inf, 0, inf, NaN). It's faster but
3502 * the results for those cases are undefined.
3503 */
3504 LLVMValueRef
3505 lp_build_log2(struct lp_build_context *bld,
3506 LLVMValueRef x)
3507 {
3508 LLVMValueRef res;
3509 lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
3510 return res;
3511 }
3512
3513 /*
3514 * Version of log2 which handles all edge cases.
3515 * Look at documentation of lp_build_log2_approx for
3516 * description of the behavior for each of the edge cases.
3517 */
3518 LLVMValueRef
3519 lp_build_log2_safe(struct lp_build_context *bld,
3520 LLVMValueRef x)
3521 {
3522 LLVMValueRef res;
3523 lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
3524 return res;
3525 }
3526
3527
3528 /**
3529 * Faster (and less accurate) log2.
3530 *
3531 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3532 *
3533 * Piece-wise linear approximation, with exact results when x is a
3534 * power of two.
3535 *
3536 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3537 */
3538 LLVMValueRef
3539 lp_build_fast_log2(struct lp_build_context *bld,
3540 LLVMValueRef x)
3541 {
3542 LLVMBuilderRef builder = bld->gallivm->builder;
3543 LLVMValueRef ipart;
3544 LLVMValueRef fpart;
3545
3546 assert(lp_check_value(bld->type, x));
3547
3548 assert(bld->type.floating);
3549
3550 /* ipart = floor(log2(x)) - 1 */
3551 ipart = lp_build_extract_exponent(bld, x, -1);
3552 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3553
3554 /* fpart = x / 2**ipart */
3555 fpart = lp_build_extract_mantissa(bld, x);
3556
3557 /* ipart + fpart */
3558 return LLVMBuildFAdd(builder, ipart, fpart, "");
3559 }
3560
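/*
 * Scalar sketch (illustrative only, reusing the extraction helpers sketched
 * earlier):
 *
 *    float ref_fast_log2(float x)
 *    {
 *       int   e = ref_extract_exponent(x, -1);   // floor(log2(x)) - 1
 *       float m = ref_extract_mantissa(x);       // x / 2**floor(log2(x)), in [1, 2)
 *       return (float)e + m;                     // exact when x is a power of two
 *    }
 */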
3561
3562 /**
3563 * Fast implementation of iround(log2(x)).
3564 *
3565 * Not an approximation -- it should give accurate results all the time.
3566 */
3567 LLVMValueRef
3568 lp_build_ilog2(struct lp_build_context *bld,
3569 LLVMValueRef x)
3570 {
3571 LLVMBuilderRef builder = bld->gallivm->builder;
3572 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3573 LLVMValueRef ipart;
3574
3575 assert(bld->type.floating);
3576
3577 assert(lp_check_value(bld->type, x));
3578
3579 /* x * 2^(0.5) i.e., add 0.5 to the log2(x) */
3580 x = LLVMBuildFMul(builder, x, sqrt2, "");
3581
3582 /* ipart = floor(log2(x) + 0.5) */
3583 ipart = lp_build_extract_exponent(bld, x, 0);
3584
3585 return ipart;
3586 }
3587
3588 LLVMValueRef
3589 lp_build_mod(struct lp_build_context *bld,
3590 LLVMValueRef x,
3591 LLVMValueRef y)
3592 {
3593 LLVMBuilderRef builder = bld->gallivm->builder;
3594 LLVMValueRef res;
3595 const struct lp_type type = bld->type;
3596
3597 assert(lp_check_value(type, x));
3598 assert(lp_check_value(type, y));
3599
3600 if (type.floating)
3601 res = LLVMBuildFRem(builder, x, y, "");
3602 else if (type.sign)
3603 res = LLVMBuildSRem(builder, x, y, "");
3604 else
3605 res = LLVMBuildURem(builder, x, y, "");
3606 return res;
3607 }
3608
3609
3610 /*
3611 * For floating inputs it creates and returns a mask
3612 * which is all 1's for channels which are NaN.
3613 * Channels inside x which are not NaN will be 0.
3614 */
3615 LLVMValueRef
3616 lp_build_isnan(struct lp_build_context *bld,
3617 LLVMValueRef x)
3618 {
3619 LLVMValueRef mask;
3620 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3621
3622 assert(bld->type.floating);
3623 assert(lp_check_value(bld->type, x));
3624
3625 mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3626 "isnotnan");
3627 mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3628 mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3629 return mask;
3630 }
3631
3632 /* Returns all 1's for floating point numbers that are
3633 * finite and returns all zeros for -inf,
3634 * +inf and NaN. */
3635 LLVMValueRef
3636 lp_build_isfinite(struct lp_build_context *bld,
3637 LLVMValueRef x)
3638 {
3639 LLVMBuilderRef builder = bld->gallivm->builder;
3640 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3641 struct lp_type int_type = lp_int_type(bld->type);
3642 LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3643 LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3644 0x7f800000);
3645
3646 if (!bld->type.floating) {
3647 return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3648 }
3649 assert(bld->type.floating);
3650 assert(lp_check_value(bld->type, x));
3651 assert(bld->type.width == 32);
3652
3653 intx = LLVMBuildAnd(builder, intx, infornan32, "");
3654 return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3655 intx, infornan32);
3656 }