gallivm,llvmpipe: fix float->srgb conversion to handle NaNs
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in the [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include <float.h>
49
50 #include "util/u_memory.h"
51 #include "util/u_debug.h"
52 #include "util/u_math.h"
53 #include "util/u_string.h"
54 #include "util/u_cpu_detect.h"
55
56 #include "lp_bld_type.h"
57 #include "lp_bld_const.h"
58 #include "lp_bld_init.h"
59 #include "lp_bld_intr.h"
60 #include "lp_bld_logic.h"
61 #include "lp_bld_pack.h"
62 #include "lp_bld_debug.h"
63 #include "lp_bld_bitarit.h"
64 #include "lp_bld_arit.h"
65 #include "lp_bld_flow.h"
66
67
68 #define EXP_POLY_DEGREE 5
69
70 #define LOG_POLY_DEGREE 4
71
72
73 /**
74 * Generate min(a, b)
75 * No checks for special case values (a or b being 0 or 1) are done.
76 * NaNs are handled according to the behavior specified by the
77 * nan_behavior argument.
78 */
79 static LLVMValueRef
80 lp_build_min_simple(struct lp_build_context *bld,
81 LLVMValueRef a,
82 LLVMValueRef b,
83 enum gallivm_nan_behavior nan_behavior)
84 {
85 const struct lp_type type = bld->type;
86 const char *intrinsic = NULL;
87 unsigned intr_size = 0;
88 LLVMValueRef cond;
89
90 assert(lp_check_value(type, a));
91 assert(lp_check_value(type, b));
92
93 /* TODO: optimize the constant case */
94
95 if (type.floating && util_cpu_caps.has_sse) {
96 if (type.width == 32) {
97 if (type.length == 1) {
98 intrinsic = "llvm.x86.sse.min.ss";
99 intr_size = 128;
100 }
101 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
102 intrinsic = "llvm.x86.sse.min.ps";
103 intr_size = 128;
104 }
105 else {
106 intrinsic = "llvm.x86.avx.min.ps.256";
107 intr_size = 256;
108 }
109 }
110 if (type.width == 64 && util_cpu_caps.has_sse2) {
111 if (type.length == 1) {
112 intrinsic = "llvm.x86.sse2.min.sd";
113 intr_size = 128;
114 }
115 else if (type.length == 2 || !util_cpu_caps.has_avx) {
116 intrinsic = "llvm.x86.sse2.min.pd";
117 intr_size = 128;
118 }
119 else {
120 intrinsic = "llvm.x86.avx.min.pd.256";
121 intr_size = 256;
122 }
123 }
124 }
125 else if (type.floating && util_cpu_caps.has_altivec) {
126 if (nan_behavior == GALLIVM_NAN_RETURN_NAN) {
127 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
128 __FUNCTION__);
129 }
130 if (type.width == 32 && type.length == 4) {
131 intrinsic = "llvm.ppc.altivec.vminfp";
132 intr_size = 128;
133 }
134 } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
135 intr_size = 128;
136 if ((type.width == 8 || type.width == 16) &&
137 (type.width * type.length <= 64) &&
138 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
139 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
140 __FUNCTION__);
141 }
142 if (type.width == 8 && !type.sign) {
143 intrinsic = "llvm.x86.sse2.pminu.b";
144 }
145 else if (type.width == 16 && type.sign) {
146 intrinsic = "llvm.x86.sse2.pmins.w";
147 }
148 if (util_cpu_caps.has_sse4_1) {
149 if (type.width == 8 && type.sign) {
150 intrinsic = "llvm.x86.sse41.pminsb";
151 }
152 if (type.width == 16 && !type.sign) {
153 intrinsic = "llvm.x86.sse41.pminuw";
154 }
155 if (type.width == 32 && !type.sign) {
156 intrinsic = "llvm.x86.sse41.pminud";
157 }
158 if (type.width == 32 && type.sign) {
159 intrinsic = "llvm.x86.sse41.pminsd";
160 }
161 }
162 } else if (util_cpu_caps.has_altivec) {
163 intr_size = 128;
164 if (type.width == 8) {
165 if (!type.sign) {
166 intrinsic = "llvm.ppc.altivec.vminub";
167 } else {
168 intrinsic = "llvm.ppc.altivec.vminsb";
169 }
170 } else if (type.width == 16) {
171 if (!type.sign) {
172 intrinsic = "llvm.ppc.altivec.vminuh";
173 } else {
174 intrinsic = "llvm.ppc.altivec.vminsh";
175 }
176 } else if (type.width == 32) {
177 if (!type.sign) {
178 intrinsic = "llvm.ppc.altivec.vminuw";
179 } else {
180 intrinsic = "llvm.ppc.altivec.vminsw";
181 }
182 }
183 }
184
185 if(intrinsic) {
186 /* We need to handle NaNs for floating point numbers. If one of the
187 * inputs is NaN the other should be returned (required by both D3D10+
188 * and OpenCL).
189 * The SSE intrinsics return the second operand in case of NaN by
190 * default, so we need special code to handle those cases.
191 */
192 if (util_cpu_caps.has_sse && type.floating &&
193 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
194 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN) {
195 LLVMValueRef isnan, max;
196 max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
197 type,
198 intr_size, a, b);
199 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
200 isnan = lp_build_isnan(bld, b);
201 return lp_build_select(bld, isnan, a, max);
202 } else {
203 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
204 isnan = lp_build_isnan(bld, a);
205 return lp_build_select(bld, isnan, a, max);
206 }
207 } else {
208 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
209 type,
210 intr_size, a, b);
211 }
212 }
213
214 if (type.floating) {
215 switch (nan_behavior) {
216 case GALLIVM_NAN_RETURN_NAN: {
217 LLVMValueRef isnan = lp_build_isnan(bld, b);
218 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
219 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
220 return lp_build_select(bld, cond, a, b);
221 }
222 break;
223 case GALLIVM_NAN_RETURN_OTHER: {
224 LLVMValueRef isnan = lp_build_isnan(bld, a);
225 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
226 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
227 return lp_build_select(bld, cond, a, b);
228 }
229 break;
230 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
231 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
232 return lp_build_select(bld, cond, a, b);
233 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
234 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
235 return lp_build_select(bld, cond, a, b);
236 break;
237 default:
238 assert(0);
239 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
240 return lp_build_select(bld, cond, a, b);
241 }
242 } else {
243 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
244 return lp_build_select(bld, cond, a, b);
245 }
246 }
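
/*
 * For reference, a scalar sketch of the NaN fixups applied around the SSE
 * intrinsic path above (illustration only; these helpers are not part of
 * this file's API, and "x != x" is used as a header-free NaN test):
 */
static float
example_sse_min(float a, float b)
{
   /* mimics minps: the second operand is returned when either input is NaN */
   return a < b ? a : b;
}

static float
example_min_return_other(float a, float b)
{
   float res = example_sse_min(a, b);   /* if a is NaN this already gives b */
   return (b != b) ? a : res;           /* if b is NaN, take a instead */
}

static float
example_min_return_nan(float a, float b)
{
   float res = example_sse_min(a, b);   /* if b is NaN this already gives b */
   return (a != a) ? a : res;           /* if a is NaN, keep the NaN */
}
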
247
248
249 /**
250 * Generate max(a, b)
251 * No checks for special case values (a or b being 0 or 1) are done.
252 * NaNs are handled according to the behavior specified by the
253 * nan_behavior argument.
254 */
255 static LLVMValueRef
256 lp_build_max_simple(struct lp_build_context *bld,
257 LLVMValueRef a,
258 LLVMValueRef b,
259 enum gallivm_nan_behavior nan_behavior)
260 {
261 const struct lp_type type = bld->type;
262 const char *intrinsic = NULL;
263 unsigned intr_size = 0;
264 LLVMValueRef cond;
265
266 assert(lp_check_value(type, a));
267 assert(lp_check_value(type, b));
268
269 /* TODO: optimize the constant case */
270
271 if (type.floating && util_cpu_caps.has_sse) {
272 if (type.width == 32) {
273 if (type.length == 1) {
274 intrinsic = "llvm.x86.sse.max.ss";
275 intr_size = 128;
276 }
277 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
278 intrinsic = "llvm.x86.sse.max.ps";
279 intr_size = 128;
280 }
281 else {
282 intrinsic = "llvm.x86.avx.max.ps.256";
283 intr_size = 256;
284 }
285 }
286 if (type.width == 64 && util_cpu_caps.has_sse2) {
287 if (type.length == 1) {
288 intrinsic = "llvm.x86.sse2.max.sd";
289 intr_size = 128;
290 }
291 else if (type.length == 2 || !util_cpu_caps.has_avx) {
292 intrinsic = "llvm.x86.sse2.max.pd";
293 intr_size = 128;
294 }
295 else {
296 intrinsic = "llvm.x86.avx.max.pd.256";
297 intr_size = 256;
298 }
299 }
300 }
301 else if (type.floating && util_cpu_caps.has_altivec) {
302 if (nan_behavior == GALLIVM_NAN_RETURN_NAN) {
303 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
304 __FUNCTION__);
305 }
306 if (type.width == 32 && type.length == 4) {
307 intrinsic = "llvm.ppc.altivec.vmaxfp";
308 intr_size = 128;
309 }
310 } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
311 intr_size = 128;
312 if ((type.width == 8 || type.width == 16) &&
313 (type.width * type.length <= 64) &&
314 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
315 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
316 __FUNCTION__);
317 }
318 if (type.width == 8 && !type.sign) {
319 intrinsic = "llvm.x86.sse2.pmaxu.b";
320 intr_size = 128;
321 }
322 else if (type.width == 16 && type.sign) {
323 intrinsic = "llvm.x86.sse2.pmaxs.w";
324 }
325 if (util_cpu_caps.has_sse4_1) {
326 if (type.width == 8 && type.sign) {
327 intrinsic = "llvm.x86.sse41.pmaxsb";
328 }
329 if (type.width == 16 && !type.sign) {
330 intrinsic = "llvm.x86.sse41.pmaxuw";
331 }
332 if (type.width == 32 && !type.sign) {
333 intrinsic = "llvm.x86.sse41.pmaxud";
334 }
335 if (type.width == 32 && type.sign) {
336 intrinsic = "llvm.x86.sse41.pmaxsd";
337 }
338 }
339 } else if (util_cpu_caps.has_altivec) {
340 intr_size = 128;
341 if (type.width == 8) {
342 if (!type.sign) {
343 intrinsic = "llvm.ppc.altivec.vmaxub";
344 } else {
345 intrinsic = "llvm.ppc.altivec.vmaxsb";
346 }
347 } else if (type.width == 16) {
348 if (!type.sign) {
349 intrinsic = "llvm.ppc.altivec.vmaxuh";
350 } else {
351 intrinsic = "llvm.ppc.altivec.vmaxsh";
352 }
353 } else if (type.width == 32) {
354 if (!type.sign) {
355 intrinsic = "llvm.ppc.altivec.vmaxuw";
356 } else {
357 intrinsic = "llvm.ppc.altivec.vmaxsw";
358 }
359 }
360 }
361
362 if(intrinsic) {
363 if (util_cpu_caps.has_sse && type.floating &&
364 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
365 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN) {
366 LLVMValueRef isnan, min;
367 min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
368 type,
369 intr_size, a, b);
370 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
371 isnan = lp_build_isnan(bld, b);
372 return lp_build_select(bld, isnan, a, min);
373 } else {
374 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
375 isnan = lp_build_isnan(bld, a);
376 return lp_build_select(bld, isnan, a, min);
377 }
378 } else {
379 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
380 type,
381 intr_size, a, b);
382 }
383 }
384
385 if (type.floating) {
386 switch (nan_behavior) {
387 case GALLIVM_NAN_RETURN_NAN: {
388 LLVMValueRef isnan = lp_build_isnan(bld, b);
389 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
390 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
391 return lp_build_select(bld, cond, a, b);
392 }
393 break;
394 case GALLIVM_NAN_RETURN_OTHER: {
395 LLVMValueRef isnan = lp_build_isnan(bld, a);
396 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
397 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
398 return lp_build_select(bld, cond, a, b);
399 }
400 break;
401 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
402 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
403 return lp_build_select(bld, cond, a, b);
404 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
405 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
406 return lp_build_select(bld, cond, a, b);
407 break;
408 default:
409 assert(0);
410 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
411 return lp_build_select(bld, cond, a, b);
412 }
413 } else {
414 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
415 return lp_build_select(bld, cond, a, b);
416 }
417 }
418
419
420 /**
421 * Generate 1 - a, or ~a depending on bld->type.
422 */
423 LLVMValueRef
424 lp_build_comp(struct lp_build_context *bld,
425 LLVMValueRef a)
426 {
427 LLVMBuilderRef builder = bld->gallivm->builder;
428 const struct lp_type type = bld->type;
429
430 assert(lp_check_value(type, a));
431
432 if(a == bld->one)
433 return bld->zero;
434 if(a == bld->zero)
435 return bld->one;
436
437 if(type.norm && !type.floating && !type.fixed && !type.sign) {
438 if(LLVMIsConstant(a))
439 return LLVMConstNot(a);
440 else
441 return LLVMBuildNot(builder, a, "");
442 }
443
444 if(LLVMIsConstant(a))
445 if (type.floating)
446 return LLVMConstFSub(bld->one, a);
447 else
448 return LLVMConstSub(bld->one, a);
449 else
450 if (type.floating)
451 return LLVMBuildFSub(builder, bld->one, a, "");
452 else
453 return LLVMBuildSub(builder, bld->one, a, "");
454 }
455
456
457 /**
458 * Generate a + b
459 */
460 LLVMValueRef
461 lp_build_add(struct lp_build_context *bld,
462 LLVMValueRef a,
463 LLVMValueRef b)
464 {
465 LLVMBuilderRef builder = bld->gallivm->builder;
466 const struct lp_type type = bld->type;
467 LLVMValueRef res;
468
469 assert(lp_check_value(type, a));
470 assert(lp_check_value(type, b));
471
472 if(a == bld->zero)
473 return b;
474 if(b == bld->zero)
475 return a;
476 if(a == bld->undef || b == bld->undef)
477 return bld->undef;
478
479 if(bld->type.norm) {
480 const char *intrinsic = NULL;
481
482 if(a == bld->one || b == bld->one)
483 return bld->one;
484
485 if (type.width * type.length == 128 &&
486 !type.floating && !type.fixed) {
487 if(util_cpu_caps.has_sse2) {
488 if(type.width == 8)
489 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
490 if(type.width == 16)
491 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
492 } else if (util_cpu_caps.has_altivec) {
493 if(type.width == 8)
494 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
495 if(type.width == 16)
496 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
497 }
498 }
499
500 if(intrinsic)
501 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
502 }
503
504 /* TODO: handle signed case */
505 if(type.norm && !type.floating && !type.fixed && !type.sign)
506 a = lp_build_min_simple(bld, a, lp_build_comp(bld, b), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
507
508 if(LLVMIsConstant(a) && LLVMIsConstant(b))
509 if (type.floating)
510 res = LLVMConstFAdd(a, b);
511 else
512 res = LLVMConstAdd(a, b);
513 else
514 if (type.floating)
515 res = LLVMBuildFAdd(builder, a, b, "");
516 else
517 res = LLVMBuildAdd(builder, a, b, "");
518
519 /* clamp to ceiling of 1.0 */
520 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
521 res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
522
523 /* XXX clamp to floor of -1 or 0??? */
524
525 return res;
526 }
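
/*
 * A scalar sketch of the unsigned-normalized clamping used above when no
 * saturating intrinsic applies (8-bit case, illustration only): since
 * ~b == 255 - b, clamping a to that bound makes the plain add saturate.
 * The helper name is ours, not part of this file's API.
 */
static unsigned
example_u8_add_sat(unsigned a, unsigned b)    /* a, b in [0, 255] */
{
   unsigned limit = 255 - b;                  /* lp_build_comp(b) for unorm8 */
   if (a > limit)
      a = limit;                              /* lp_build_min_simple(a, ~b) */
   return a + b;                              /* can no longer exceed 255 */
}
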
527
528
529 /** Return the scalar sum of the elements of a.
530 * Callers should avoid this operation whenever possible.
531 */
532 LLVMValueRef
533 lp_build_horizontal_add(struct lp_build_context *bld,
534 LLVMValueRef a)
535 {
536 LLVMBuilderRef builder = bld->gallivm->builder;
537 const struct lp_type type = bld->type;
538 LLVMValueRef index, res;
539 unsigned i, length;
540 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
541 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
542 LLVMValueRef vecres, elem2;
543
544 assert(lp_check_value(type, a));
545
546 if (type.length == 1) {
547 return a;
548 }
549
550 assert(!bld->type.norm);
551
552 /*
553 * For byte vectors one could do much better with psadbw.
554 * We use repeated shuffle/adds here. Note that with multiple vectors
555 * this can be done more efficiently as outlined in the Intel
556 * optimization manual.
557 * Note: could cause data rearrangement if used with smaller element
558 * sizes.
559 */
560
561 vecres = a;
562 length = type.length / 2;
563 while (length > 1) {
564 LLVMValueRef vec1, vec2;
565 for (i = 0; i < length; i++) {
566 shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
567 shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
568 }
569 vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
570 LLVMConstVector(shuffles1, length), "");
571 vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
572 LLVMConstVector(shuffles2, length), "");
573 if (type.floating) {
574 vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
575 }
576 else {
577 vecres = LLVMBuildAdd(builder, vec1, vec2, "");
578 }
579 length = length >> 1;
580 }
581
582 /* always have vector of size 2 here */
583 assert(length == 1);
584
585 index = lp_build_const_int32(bld->gallivm, 0);
586 res = LLVMBuildExtractElement(builder, vecres, index, "");
587 index = lp_build_const_int32(bld->gallivm, 1);
588 elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
589
590 if (type.floating)
591 res = LLVMBuildFAdd(builder, res, elem2, "");
592 else
593 res = LLVMBuildAdd(builder, res, elem2, "");
594
595 return res;
596 }
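
/*
 * Scalar sketch of the reduction performed above (illustration only):
 * repeatedly fold the upper half of the vector onto the lower half until a
 * single element remains. The helper below modifies v in place and assumes
 * a power-of-two length; it is not part of this file's API.
 */
static float
example_horizontal_add(float v[], unsigned length)
{
   unsigned i, half;
   for (half = length / 2; half >= 1; half /= 2) {
      for (i = 0; i < half; i++) {
         v[i] = v[i] + v[i + half];   /* one shuffle + add per iteration */
      }
   }
   return v[0];
}
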
597
598 /**
599 * Return the horizontal sums of 4 float vectors as a float4 vector.
600 * This uses the technique outlined in the Intel Optimization Manual.
601 */
602 static LLVMValueRef
603 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
604 LLVMValueRef src[4])
605 {
606 struct gallivm_state *gallivm = bld->gallivm;
607 LLVMBuilderRef builder = gallivm->builder;
608 LLVMValueRef shuffles[4];
609 LLVMValueRef tmp[4];
610 LLVMValueRef sumtmp[2], shuftmp[2];
611
612 /* lower half of regs */
613 shuffles[0] = lp_build_const_int32(gallivm, 0);
614 shuffles[1] = lp_build_const_int32(gallivm, 1);
615 shuffles[2] = lp_build_const_int32(gallivm, 4);
616 shuffles[3] = lp_build_const_int32(gallivm, 5);
617 tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
618 LLVMConstVector(shuffles, 4), "");
619 tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
620 LLVMConstVector(shuffles, 4), "");
621
622 /* upper half of regs */
623 shuffles[0] = lp_build_const_int32(gallivm, 2);
624 shuffles[1] = lp_build_const_int32(gallivm, 3);
625 shuffles[2] = lp_build_const_int32(gallivm, 6);
626 shuffles[3] = lp_build_const_int32(gallivm, 7);
627 tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
628 LLVMConstVector(shuffles, 4), "");
629 tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
630 LLVMConstVector(shuffles, 4), "");
631
632 sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
633 sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
634
635 shuffles[0] = lp_build_const_int32(gallivm, 0);
636 shuffles[1] = lp_build_const_int32(gallivm, 2);
637 shuffles[2] = lp_build_const_int32(gallivm, 4);
638 shuffles[3] = lp_build_const_int32(gallivm, 6);
639 shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
640 LLVMConstVector(shuffles, 4), "");
641
642 shuffles[0] = lp_build_const_int32(gallivm, 1);
643 shuffles[1] = lp_build_const_int32(gallivm, 3);
644 shuffles[2] = lp_build_const_int32(gallivm, 5);
645 shuffles[3] = lp_build_const_int32(gallivm, 7);
646 shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
647 LLVMConstVector(shuffles, 4), "");
648
649 return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
650 }
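
/*
 * The net effect of the shuffle network above, as a scalar sketch
 * (illustration only; the helper is not part of this file's API):
 * each output lane is the sum of the four lanes of one source vector.
 */
static void
example_horizontal_add4x4f(const float src[4][4], float out[4])
{
   unsigned i;
   for (i = 0; i < 4; i++) {
      out[i] = src[i][0] + src[i][1] + src[i][2] + src[i][3];
   }
}
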
651
652
653 /*
654 * partially horizontally add 2-4 float vectors with length nx4,
655 * i.e. only four adjacent values in each vector will be added,
656 * assuming the values are grouped in fours, which also determines the
657 * output order.
658 *
659 * Return a vector of the same length as the initial vectors,
660 * with the excess elements (if any) being undefined.
661 * The element order is independent of number of input vectors.
662 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
663 * the output order thus will be
664 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
665 */
666 LLVMValueRef
667 lp_build_hadd_partial4(struct lp_build_context *bld,
668 LLVMValueRef vectors[],
669 unsigned num_vecs)
670 {
671 struct gallivm_state *gallivm = bld->gallivm;
672 LLVMBuilderRef builder = gallivm->builder;
673 LLVMValueRef ret_vec;
674 LLVMValueRef tmp[4];
675 const char *intrinsic = NULL;
676
677 assert(num_vecs >= 2 && num_vecs <= 4);
678 assert(bld->type.floating);
679
680 /* only use this with at least 2 vectors, as it is sort of expensive
681 * (depending on cpu) and we always need two horizontal adds anyway,
682 * so a shuffle/add approach might be better.
683 */
684
685 tmp[0] = vectors[0];
686 tmp[1] = vectors[1];
687
688 tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
689 tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
690
691 if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
692 bld->type.length == 4) {
693 intrinsic = "llvm.x86.sse3.hadd.ps";
694 }
695 else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
696 bld->type.length == 8) {
697 intrinsic = "llvm.x86.avx.hadd.ps.256";
698 }
699 if (intrinsic) {
700 tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
701 lp_build_vec_type(gallivm, bld->type),
702 tmp[0], tmp[1]);
703 if (num_vecs > 2) {
704 tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
705 lp_build_vec_type(gallivm, bld->type),
706 tmp[2], tmp[3]);
707 }
708 else {
709 tmp[1] = tmp[0];
710 }
711 return lp_build_intrinsic_binary(builder, intrinsic,
712 lp_build_vec_type(gallivm, bld->type),
713 tmp[0], tmp[1]);
714 }
715
716 if (bld->type.length == 4) {
717 ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
718 }
719 else {
720 LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
721 unsigned j;
722 unsigned num_iter = bld->type.length / 4;
723 struct lp_type parttype = bld->type;
724 parttype.length = 4;
725 for (j = 0; j < num_iter; j++) {
726 LLVMValueRef partsrc[4];
727 unsigned i;
728 for (i = 0; i < 4; i++) {
729 partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
730 }
731 partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
732 }
733 ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
734 }
735 return ret_vec;
736 }
737
738 /**
739 * Generate a - b
740 */
741 LLVMValueRef
742 lp_build_sub(struct lp_build_context *bld,
743 LLVMValueRef a,
744 LLVMValueRef b)
745 {
746 LLVMBuilderRef builder = bld->gallivm->builder;
747 const struct lp_type type = bld->type;
748 LLVMValueRef res;
749
750 assert(lp_check_value(type, a));
751 assert(lp_check_value(type, b));
752
753 if(b == bld->zero)
754 return a;
755 if(a == bld->undef || b == bld->undef)
756 return bld->undef;
757 if(a == b)
758 return bld->zero;
759
760 if(bld->type.norm) {
761 const char *intrinsic = NULL;
762
763 if(b == bld->one)
764 return bld->zero;
765
766 if (type.width * type.length == 128 &&
767 !type.floating && !type.fixed) {
768 if (util_cpu_caps.has_sse2) {
769 if(type.width == 8)
770 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
771 if(type.width == 16)
772 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
773 } else if (util_cpu_caps.has_altivec) {
774 if(type.width == 8)
775 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
776 if(type.width == 16)
777 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
778 }
779 }
780
781 if(intrinsic)
782 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
783 }
784
785 /* TODO: handle signed case */
786 if(type.norm && !type.floating && !type.fixed && !type.sign)
787 a = lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
788
789 if(LLVMIsConstant(a) && LLVMIsConstant(b))
790 if (type.floating)
791 res = LLVMConstFSub(a, b);
792 else
793 res = LLVMConstSub(a, b);
794 else
795 if (type.floating)
796 res = LLVMBuildFSub(builder, a, b, "");
797 else
798 res = LLVMBuildSub(builder, a, b, "");
799
800 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
801 res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
802
803 return res;
804 }
805
806
807
808 /**
809 * Normalized multiplication.
810 *
811 * There are several approaches for (using 8-bit normalized multiplication as
812 * an example):
813 *
814 * - alpha plus one
815 *
816 * makes the following approximation to the division (Sree)
817 *
818 * a*b/255 ~= (a*(b + 1)) >> 8
819 *
820 * which is the fastest method that satisfies the following OpenGL criteria of
821 *
822 * 0*0 = 0 and 255*255 = 255
823 *
824 * - geometric series
825 *
826 * takes the geometric series approximation to the division
827 *
828 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
829 *
830 * in this case just the first two terms are used, to fit in 16-bit arithmetic
831 *
832 * t/255 ~= (t + (t >> 8)) >> 8
833 *
834 * note that just by itself it doesn't satisfy the OpenGL criteria, as
835 * 255*255 yields 254, so the special case b = 255 must be accounted for, or
836 * roundoff must be used.
837 *
838 * - geometric series plus rounding
839 *
840 * when using the geometric series division, instead of truncating the result,
841 * use roundoff in the approximation (Jim Blinn)
842 *
843 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
844 *
845 * achieving exact results.
846 *
847 *
848 *
849 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
850 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
851 * @sa Michael Herf, The "double blend trick", May 2000,
852 * http://www.stereopsis.com/doubleblend.html
853 */
854 static LLVMValueRef
855 lp_build_mul_norm(struct gallivm_state *gallivm,
856 struct lp_type wide_type,
857 LLVMValueRef a, LLVMValueRef b)
858 {
859 LLVMBuilderRef builder = gallivm->builder;
860 struct lp_build_context bld;
861 unsigned n;
862 LLVMValueRef half;
863 LLVMValueRef ab;
864
865 assert(!wide_type.floating);
866 assert(lp_check_value(wide_type, a));
867 assert(lp_check_value(wide_type, b));
868
869 lp_build_context_init(&bld, gallivm, wide_type);
870
871 n = wide_type.width / 2;
872 if (wide_type.sign) {
873 --n;
874 }
875
876 /*
877 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
878 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
879 */
880
881 /*
882 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
883 */
884
885 ab = LLVMBuildMul(builder, a, b, "");
886 ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
887
888 /*
889 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
890 */
891
892 half = lp_build_const_int_vec(gallivm, wide_type, 1 << (n - 1));
893 if (wide_type.sign) {
894 LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
895 LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
896 half = lp_build_select(&bld, sign, minus_half, half);
897 }
898 ab = LLVMBuildAdd(builder, ab, half, "");
899
900 /* Final division */
901 ab = lp_build_shr_imm(&bld, ab, n);
902
903 return ab;
904 }
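
/*
 * A scalar sketch of the unsigned 8-bit case generated above (n = 8, with a
 * 16-bit wide type), following the same operation order as the code; the
 * helper is for illustration only and not part of this file's API:
 */
static unsigned
example_u8_mul_norm(unsigned a, unsigned b)   /* a, b in [0, 255] */
{
   unsigned ab = a * b;                       /* at most 255*255, fits in 16 bits */
   ab = ab + (ab >> 8);                       /* geometric series term */
   ab = ab + 0x80;                            /* roundoff instead of truncation */
   return ab >> 8;                            /* 0*0 -> 0, 255*255 -> 255 */
}
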
905
906 /**
907 * Generate a * b
908 */
909 LLVMValueRef
910 lp_build_mul(struct lp_build_context *bld,
911 LLVMValueRef a,
912 LLVMValueRef b)
913 {
914 LLVMBuilderRef builder = bld->gallivm->builder;
915 const struct lp_type type = bld->type;
916 LLVMValueRef shift;
917 LLVMValueRef res;
918
919 assert(lp_check_value(type, a));
920 assert(lp_check_value(type, b));
921
922 if(a == bld->zero)
923 return bld->zero;
924 if(a == bld->one)
925 return b;
926 if(b == bld->zero)
927 return bld->zero;
928 if(b == bld->one)
929 return a;
930 if(a == bld->undef || b == bld->undef)
931 return bld->undef;
932
933 if (!type.floating && !type.fixed && type.norm) {
934 struct lp_type wide_type = lp_wider_type(type);
935 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
936
937 lp_build_unpack2(bld->gallivm, type, wide_type, a, &al, &ah);
938 lp_build_unpack2(bld->gallivm, type, wide_type, b, &bl, &bh);
939
940 /* PMULLW, PSRLW, PADDW */
941 abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
942 abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
943
944 ab = lp_build_pack2(bld->gallivm, wide_type, type, abl, abh);
945
946 return ab;
947 }
948
949 if(type.fixed)
950 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
951 else
952 shift = NULL;
953
954 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
955 if (type.floating)
956 res = LLVMConstFMul(a, b);
957 else
958 res = LLVMConstMul(a, b);
959 if(shift) {
960 if(type.sign)
961 res = LLVMConstAShr(res, shift);
962 else
963 res = LLVMConstLShr(res, shift);
964 }
965 }
966 else {
967 if (type.floating)
968 res = LLVMBuildFMul(builder, a, b, "");
969 else
970 res = LLVMBuildMul(builder, a, b, "");
971 if(shift) {
972 if(type.sign)
973 res = LLVMBuildAShr(builder, res, shift, "");
974 else
975 res = LLVMBuildLShr(builder, res, shift, "");
976 }
977 }
978
979 return res;
980 }
981
982
983 /**
984 * Small vector x scale multiplication optimization.
985 */
986 LLVMValueRef
987 lp_build_mul_imm(struct lp_build_context *bld,
988 LLVMValueRef a,
989 int b)
990 {
991 LLVMBuilderRef builder = bld->gallivm->builder;
992 LLVMValueRef factor;
993
994 assert(lp_check_value(bld->type, a));
995
996 if(b == 0)
997 return bld->zero;
998
999 if(b == 1)
1000 return a;
1001
1002 if(b == -1)
1003 return lp_build_negate(bld, a);
1004
1005 if(b == 2 && bld->type.floating)
1006 return lp_build_add(bld, a, a);
1007
1008 if(util_is_power_of_two(b)) {
1009 unsigned shift = ffs(b) - 1;
1010
1011 if(bld->type.floating) {
1012 #if 0
1013 /*
1014 * Power of two multiplication by directly manipulating the exponent.
1015 *
1016 * XXX: This might not be always faster, it will introduce a small error
1017 * for multiplication by zero, and it will produce wrong results
1018 * for Inf and NaN.
1019 */
1020 unsigned mantissa = lp_mantissa(bld->type);
1021 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1022 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1023 a = LLVMBuildAdd(builder, a, factor, "");
1024 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1025 return a;
1026 #endif
1027 }
1028 else {
1029 factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1030 return LLVMBuildShl(builder, a, factor, "");
1031 }
1032 }
1033
1034 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1035 return lp_build_mul(bld, a, factor);
1036 }
1037
1038
1039 /**
1040 * Generate a / b
1041 */
1042 LLVMValueRef
1043 lp_build_div(struct lp_build_context *bld,
1044 LLVMValueRef a,
1045 LLVMValueRef b)
1046 {
1047 LLVMBuilderRef builder = bld->gallivm->builder;
1048 const struct lp_type type = bld->type;
1049
1050 assert(lp_check_value(type, a));
1051 assert(lp_check_value(type, b));
1052
1053 if(a == bld->zero)
1054 return bld->zero;
1055 if(a == bld->one)
1056 return lp_build_rcp(bld, b);
1057 if(b == bld->zero)
1058 return bld->undef;
1059 if(b == bld->one)
1060 return a;
1061 if(a == bld->undef || b == bld->undef)
1062 return bld->undef;
1063
1064 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1065 if (type.floating)
1066 return LLVMConstFDiv(a, b);
1067 else if (type.sign)
1068 return LLVMConstSDiv(a, b);
1069 else
1070 return LLVMConstUDiv(a, b);
1071 }
1072
1073 if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1074 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
1075 type.floating)
1076 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1077
1078 if (type.floating)
1079 return LLVMBuildFDiv(builder, a, b, "");
1080 else if (type.sign)
1081 return LLVMBuildSDiv(builder, a, b, "");
1082 else
1083 return LLVMBuildUDiv(builder, a, b, "");
1084 }
1085
1086
1087 /**
1088 * Linear interpolation helper.
1089 *
1090 * @param flags LP_BLD_LERP_WIDE_NORMALIZED when interpolating normalized
1091 * values encoded in integers twice their natural width.
1092 *
1093 * @sa http://www.stereopsis.com/doubleblend.html
1094 */
1095 static INLINE LLVMValueRef
1096 lp_build_lerp_simple(struct lp_build_context *bld,
1097 LLVMValueRef x,
1098 LLVMValueRef v0,
1099 LLVMValueRef v1,
1100 unsigned flags)
1101 {
1102 unsigned half_width = bld->type.width/2;
1103 LLVMBuilderRef builder = bld->gallivm->builder;
1104 LLVMValueRef delta;
1105 LLVMValueRef res;
1106
1107 assert(lp_check_value(bld->type, x));
1108 assert(lp_check_value(bld->type, v0));
1109 assert(lp_check_value(bld->type, v1));
1110
1111 delta = lp_build_sub(bld, v1, v0);
1112
1113 if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1114 if (!bld->type.sign) {
1115 if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1116 /*
1117 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1118 * most-significant-bit to the least-significant-bit, so that
1119 * later we can just divide by 2**n instead of 2**n - 1.
1120 */
1121
1122 x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1123 }
1124
1125 /* (x * delta) >> n */
1126 res = lp_build_mul(bld, x, delta);
1127 res = lp_build_shr_imm(bld, res, half_width);
1128 } else {
1129 /*
1130 * The rescaling trick above doesn't work for signed numbers, so
1131 * use the 2**n - 1 division approximation in lp_build_mul_norm
1132 * instead.
1133 */
1134 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1135 res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1136 }
1137 } else {
1138 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1139 res = lp_build_mul(bld, x, delta);
1140 }
1141
1142 res = lp_build_add(bld, v0, res);
1143
1144 if (((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) ||
1145 bld->type.fixed) {
1146 /* We need to mask out the high order bits when lerping 8-bit normalized colors stored in 16 bits */
1147 /* XXX: This step is necessary for lerping 8-bit colors stored in 16 bits,
1148 * but it will be wrong for true fixed point use cases. Basically we need
1149 * a more powerful lp_type, capable of further distinguishing the value's
1150 * interpretation from the value's storage. */
1151 res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1), "");
1152 }
1153
1154 return res;
1155 }
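
/*
 * Scalar sketch of the unsigned LP_BLD_LERP_WIDE_NORMALIZED path above, for
 * 8-bit values held in a 16-bit wide type (half_width = 8). The final mask
 * is what keeps the modular 16-bit arithmetic correct even when v1 < v0.
 * Illustration only; the helper is not part of this file's API.
 */
static unsigned
example_lerp_u8(unsigned x, unsigned v0, unsigned v1)   /* all in [0, 255] */
{
   unsigned delta = (v1 - v0) & 0xffff;   /* wraps like the 16-bit IR value */
   unsigned res;
   x = x + (x >> 7);                      /* scale [0, 255] to [0, 256] */
   res = ((x * delta) & 0xffff) >> 8;     /* (x * delta) >> 8, 16 bits wide */
   return (v0 + res) & 0xff;              /* add v0, mask out the high bits */
}
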
1156
1157
1158 /**
1159 * Linear interpolation.
1160 */
1161 LLVMValueRef
1162 lp_build_lerp(struct lp_build_context *bld,
1163 LLVMValueRef x,
1164 LLVMValueRef v0,
1165 LLVMValueRef v1,
1166 unsigned flags)
1167 {
1168 const struct lp_type type = bld->type;
1169 LLVMValueRef res;
1170
1171 assert(lp_check_value(type, x));
1172 assert(lp_check_value(type, v0));
1173 assert(lp_check_value(type, v1));
1174
1175 assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1176
1177 if (type.norm) {
1178 struct lp_type wide_type;
1179 struct lp_build_context wide_bld;
1180 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1181
1182 assert(type.length >= 2);
1183
1184 /*
1185 * Create a wider integer type, enough to hold the
1186 * intermediate result of the multiplication.
1187 */
1188 memset(&wide_type, 0, sizeof wide_type);
1189 wide_type.sign = type.sign;
1190 wide_type.width = type.width*2;
1191 wide_type.length = type.length/2;
1192
1193 lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1194
1195 lp_build_unpack2(bld->gallivm, type, wide_type, x, &xl, &xh);
1196 lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1197 lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1198
1199 /*
1200 * Lerp both halves.
1201 */
1202
1203 flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1204
1205 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1206 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1207
1208 res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
1209 } else {
1210 res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1211 }
1212
1213 return res;
1214 }
1215
1216
1217 /**
1218 * Bilinear interpolation.
1219 *
1220 * Value indices are in v_{yx}.
1221 */
1222 LLVMValueRef
1223 lp_build_lerp_2d(struct lp_build_context *bld,
1224 LLVMValueRef x,
1225 LLVMValueRef y,
1226 LLVMValueRef v00,
1227 LLVMValueRef v01,
1228 LLVMValueRef v10,
1229 LLVMValueRef v11,
1230 unsigned flags)
1231 {
1232 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1233 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1234 return lp_build_lerp(bld, y, v0, v1, flags);
1235 }
1236
1237
1238 LLVMValueRef
1239 lp_build_lerp_3d(struct lp_build_context *bld,
1240 LLVMValueRef x,
1241 LLVMValueRef y,
1242 LLVMValueRef z,
1243 LLVMValueRef v000,
1244 LLVMValueRef v001,
1245 LLVMValueRef v010,
1246 LLVMValueRef v011,
1247 LLVMValueRef v100,
1248 LLVMValueRef v101,
1249 LLVMValueRef v110,
1250 LLVMValueRef v111,
1251 unsigned flags)
1252 {
1253 LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1254 LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1255 return lp_build_lerp(bld, z, v0, v1, flags);
1256 }
1257
1258
1259 /**
1260 * Generate min(a, b)
1261 * Do checks for special cases but not for NaNs.
1262 */
1263 LLVMValueRef
1264 lp_build_min(struct lp_build_context *bld,
1265 LLVMValueRef a,
1266 LLVMValueRef b)
1267 {
1268 assert(lp_check_value(bld->type, a));
1269 assert(lp_check_value(bld->type, b));
1270
1271 if(a == bld->undef || b == bld->undef)
1272 return bld->undef;
1273
1274 if(a == b)
1275 return a;
1276
1277 if (bld->type.norm) {
1278 if (!bld->type.sign) {
1279 if (a == bld->zero || b == bld->zero) {
1280 return bld->zero;
1281 }
1282 }
1283 if(a == bld->one)
1284 return b;
1285 if(b == bld->one)
1286 return a;
1287 }
1288
1289 return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1290 }
1291
1292
1293 /**
1294 * Generate min(a, b)
1295 * NaNs are handled according to the behavior specified by the
1296 * nan_behavior argument.
1297 */
1298 LLVMValueRef
1299 lp_build_min_ext(struct lp_build_context *bld,
1300 LLVMValueRef a,
1301 LLVMValueRef b,
1302 enum gallivm_nan_behavior nan_behavior)
1303 {
1304 assert(lp_check_value(bld->type, a));
1305 assert(lp_check_value(bld->type, b));
1306
1307 if(a == bld->undef || b == bld->undef)
1308 return bld->undef;
1309
1310 if(a == b)
1311 return a;
1312
1313 if (bld->type.norm) {
1314 if (!bld->type.sign) {
1315 if (a == bld->zero || b == bld->zero) {
1316 return bld->zero;
1317 }
1318 }
1319 if(a == bld->one)
1320 return b;
1321 if(b == bld->one)
1322 return a;
1323 }
1324
1325 return lp_build_min_simple(bld, a, b, nan_behavior);
1326 }
1327
1328 /**
1329 * Generate max(a, b)
1330 * Do checks for special cases, but NaN behavior is undefined.
1331 */
1332 LLVMValueRef
1333 lp_build_max(struct lp_build_context *bld,
1334 LLVMValueRef a,
1335 LLVMValueRef b)
1336 {
1337 assert(lp_check_value(bld->type, a));
1338 assert(lp_check_value(bld->type, b));
1339
1340 if(a == bld->undef || b == bld->undef)
1341 return bld->undef;
1342
1343 if(a == b)
1344 return a;
1345
1346 if(bld->type.norm) {
1347 if(a == bld->one || b == bld->one)
1348 return bld->one;
1349 if (!bld->type.sign) {
1350 if (a == bld->zero) {
1351 return b;
1352 }
1353 if (b == bld->zero) {
1354 return a;
1355 }
1356 }
1357 }
1358
1359 return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1360 }
1361
1362
1363 /**
1364 * Generate max(a, b)
1365 * Checks for special cases.
1366 * NaNs are handled according to the behavior specified by the
1367 * nan_behavior argument.
1368 */
1369 LLVMValueRef
1370 lp_build_max_ext(struct lp_build_context *bld,
1371 LLVMValueRef a,
1372 LLVMValueRef b,
1373 enum gallivm_nan_behavior nan_behavior)
1374 {
1375 assert(lp_check_value(bld->type, a));
1376 assert(lp_check_value(bld->type, b));
1377
1378 if(a == bld->undef || b == bld->undef)
1379 return bld->undef;
1380
1381 if(a == b)
1382 return a;
1383
1384 if(bld->type.norm) {
1385 if(a == bld->one || b == bld->one)
1386 return bld->one;
1387 if (!bld->type.sign) {
1388 if (a == bld->zero) {
1389 return b;
1390 }
1391 if (b == bld->zero) {
1392 return a;
1393 }
1394 }
1395 }
1396
1397 return lp_build_max_simple(bld, a, b, nan_behavior);
1398 }
1399
1400 /**
1401 * Generate clamp(a, min, max)
1402 * NaN behavior (for any of a, min, max) is undefined.
1403 * Do checks for special cases.
1404 */
1405 LLVMValueRef
1406 lp_build_clamp(struct lp_build_context *bld,
1407 LLVMValueRef a,
1408 LLVMValueRef min,
1409 LLVMValueRef max)
1410 {
1411 assert(lp_check_value(bld->type, a));
1412 assert(lp_check_value(bld->type, min));
1413 assert(lp_check_value(bld->type, max));
1414
1415 a = lp_build_min(bld, a, max);
1416 a = lp_build_max(bld, a, min);
1417 return a;
1418 }
1419
1420
1421 /**
1422 * Generate clamp(a, 0, 1)
1423 * A NaN will get converted to zero.
1424 */
1425 LLVMValueRef
1426 lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
1427 LLVMValueRef a)
1428 {
1429 a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1430 a = lp_build_min(bld, a, bld->one);
1431 return a;
1432 }
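
/*
 * Scalar sketch of the clamp above (illustration only; the helper is not
 * part of this file's API): the ordered greater-than compare used for the
 * max is false for NaN, so a NaN input becomes zero before the min.
 */
static float
example_clamp01_nanzero(float a)
{
   a = (a > 0.0f) ? a : 0.0f;   /* NaN fails the ordered compare -> 0.0 */
   return (a < 1.0f) ? a : 1.0f;
}
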
1433
1434
1435 /**
1436 * Generate abs(a)
1437 */
1438 LLVMValueRef
1439 lp_build_abs(struct lp_build_context *bld,
1440 LLVMValueRef a)
1441 {
1442 LLVMBuilderRef builder = bld->gallivm->builder;
1443 const struct lp_type type = bld->type;
1444 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1445
1446 assert(lp_check_value(type, a));
1447
1448 if(!type.sign)
1449 return a;
1450
1451 if(type.floating) {
1452 /* Mask out the sign bit */
1453 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1454 unsigned long long absMask = ~(1ULL << (type.width - 1));
1455 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
1456 a = LLVMBuildBitCast(builder, a, int_vec_type, "");
1457 a = LLVMBuildAnd(builder, a, mask, "");
1458 a = LLVMBuildBitCast(builder, a, vec_type, "");
1459 return a;
1460 }
1461
1462 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
1463 switch(type.width) {
1464 case 8:
1465 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1466 case 16:
1467 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1468 case 32:
1469 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1470 }
1471 }
1472 else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
1473 (gallivm_debug & GALLIVM_DEBUG_PERF) &&
1474 (type.width == 8 || type.width == 16 || type.width == 32)) {
1475 debug_printf("%s: inefficient code, should split vectors manually\n",
1476 __FUNCTION__);
1477 }
1478
1479 return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
1480 }
1481
1482
1483 LLVMValueRef
1484 lp_build_negate(struct lp_build_context *bld,
1485 LLVMValueRef a)
1486 {
1487 LLVMBuilderRef builder = bld->gallivm->builder;
1488
1489 assert(lp_check_value(bld->type, a));
1490
1491 #if HAVE_LLVM >= 0x0207
1492 if (bld->type.floating)
1493 a = LLVMBuildFNeg(builder, a, "");
1494 else
1495 #endif
1496 a = LLVMBuildNeg(builder, a, "");
1497
1498 return a;
1499 }
1500
1501
1502 /** Return -1, 0 or +1 depending on the sign of a */
1503 LLVMValueRef
1504 lp_build_sgn(struct lp_build_context *bld,
1505 LLVMValueRef a)
1506 {
1507 LLVMBuilderRef builder = bld->gallivm->builder;
1508 const struct lp_type type = bld->type;
1509 LLVMValueRef cond;
1510 LLVMValueRef res;
1511
1512 assert(lp_check_value(type, a));
1513
1514 /* Handle non-zero case */
1515 if(!type.sign) {
1516 /* if not zero then sign must be positive */
1517 res = bld->one;
1518 }
1519 else if(type.floating) {
1520 LLVMTypeRef vec_type;
1521 LLVMTypeRef int_type;
1522 LLVMValueRef mask;
1523 LLVMValueRef sign;
1524 LLVMValueRef one;
1525 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1526
1527 int_type = lp_build_int_vec_type(bld->gallivm, type);
1528 vec_type = lp_build_vec_type(bld->gallivm, type);
1529 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1530
1531 /* Take the sign bit and or it into the constant 1.0 */
1532 sign = LLVMBuildBitCast(builder, a, int_type, "");
1533 sign = LLVMBuildAnd(builder, sign, mask, "");
1534 one = LLVMConstBitCast(bld->one, int_type);
1535 res = LLVMBuildOr(builder, sign, one, "");
1536 res = LLVMBuildBitCast(builder, res, vec_type, "");
1537 }
1538 else
1539 {
1540 /* signed int/norm/fixed point */
1541 /* could use psign with sse3 and appropriate vectors here */
1542 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1543 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1544 res = lp_build_select(bld, cond, bld->one, minus_one);
1545 }
1546
1547 /* Handle zero */
1548 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1549 res = lp_build_select(bld, cond, bld->zero, res);
1550
1551 return res;
1552 }
1553
1554
1555 /**
1556 * Set the sign of float vector 'a' according to 'sign'.
1557 * If sign==0, return abs(a).
1558 * If sign==1, return -abs(a);
1559 * Other values for sign produce undefined results.
1560 */
1561 LLVMValueRef
1562 lp_build_set_sign(struct lp_build_context *bld,
1563 LLVMValueRef a, LLVMValueRef sign)
1564 {
1565 LLVMBuilderRef builder = bld->gallivm->builder;
1566 const struct lp_type type = bld->type;
1567 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1568 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1569 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1570 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1571 ~((unsigned long long) 1 << (type.width - 1)));
1572 LLVMValueRef val, res;
1573
1574 assert(type.floating);
1575 assert(lp_check_value(type, a));
1576
1577 /* val = reinterpret_cast<int>(a) */
1578 val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1579 /* val = val & mask */
1580 val = LLVMBuildAnd(builder, val, mask, "");
1581 /* sign = sign << shift */
1582 sign = LLVMBuildShl(builder, sign, shift, "");
1583 /* res = val | sign */
1584 res = LLVMBuildOr(builder, val, sign, "");
1585 /* res = reinterpret_cast<float>(res) */
1586 res = LLVMBuildBitCast(builder, res, vec_type, "");
1587
1588 return res;
1589 }
1590
1591
1592 /**
1593 * Convert vector of (or scalar) int to vector of (or scalar) float.
1594 */
1595 LLVMValueRef
1596 lp_build_int_to_float(struct lp_build_context *bld,
1597 LLVMValueRef a)
1598 {
1599 LLVMBuilderRef builder = bld->gallivm->builder;
1600 const struct lp_type type = bld->type;
1601 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1602
1603 assert(type.floating);
1604
1605 return LLVMBuildSIToFP(builder, a, vec_type, "");
1606 }
1607
1608 static boolean
1609 arch_rounding_available(const struct lp_type type)
1610 {
1611 if ((util_cpu_caps.has_sse4_1 &&
1612 (type.length == 1 || type.width*type.length == 128)) ||
1613 (util_cpu_caps.has_avx && type.width*type.length == 256))
1614 return TRUE;
1615 else if ((util_cpu_caps.has_altivec &&
1616 (type.width == 32 && type.length == 4)))
1617 return TRUE;
1618
1619 return FALSE;
1620 }
1621
1622 enum lp_build_round_mode
1623 {
1624 LP_BUILD_ROUND_NEAREST = 0,
1625 LP_BUILD_ROUND_FLOOR = 1,
1626 LP_BUILD_ROUND_CEIL = 2,
1627 LP_BUILD_ROUND_TRUNCATE = 3
1628 };
1629
1630 /**
1631 * Helper for SSE4.1's ROUNDxx instructions.
1632 *
1633 * NOTE: In SSE4.1's nearest mode, if two values are equally close, the
1634 * result is the even value. That is, rounding 2.5 will be 2.0, and not 3.0.
1635 */
1636 static INLINE LLVMValueRef
1637 lp_build_round_sse41(struct lp_build_context *bld,
1638 LLVMValueRef a,
1639 enum lp_build_round_mode mode)
1640 {
1641 LLVMBuilderRef builder = bld->gallivm->builder;
1642 const struct lp_type type = bld->type;
1643 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1644 const char *intrinsic;
1645 LLVMValueRef res;
1646
1647 assert(type.floating);
1648
1649 assert(lp_check_value(type, a));
1650 assert(util_cpu_caps.has_sse4_1);
1651
1652 if (type.length == 1) {
1653 LLVMTypeRef vec_type;
1654 LLVMValueRef undef;
1655 LLVMValueRef args[3];
1656 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1657
1658 switch(type.width) {
1659 case 32:
1660 intrinsic = "llvm.x86.sse41.round.ss";
1661 break;
1662 case 64:
1663 intrinsic = "llvm.x86.sse41.round.sd";
1664 break;
1665 default:
1666 assert(0);
1667 return bld->undef;
1668 }
1669
1670 vec_type = LLVMVectorType(bld->elem_type, 4);
1671
1672 undef = LLVMGetUndef(vec_type);
1673
1674 args[0] = undef;
1675 args[1] = LLVMBuildInsertElement(builder, undef, a, index0, "");
1676 args[2] = LLVMConstInt(i32t, mode, 0);
1677
1678 res = lp_build_intrinsic(builder, intrinsic,
1679 vec_type, args, Elements(args));
1680
1681 res = LLVMBuildExtractElement(builder, res, index0, "");
1682 }
1683 else {
1684 if (type.width * type.length == 128) {
1685 switch(type.width) {
1686 case 32:
1687 intrinsic = "llvm.x86.sse41.round.ps";
1688 break;
1689 case 64:
1690 intrinsic = "llvm.x86.sse41.round.pd";
1691 break;
1692 default:
1693 assert(0);
1694 return bld->undef;
1695 }
1696 }
1697 else {
1698 assert(type.width * type.length == 256);
1699 assert(util_cpu_caps.has_avx);
1700
1701 switch(type.width) {
1702 case 32:
1703 intrinsic = "llvm.x86.avx.round.ps.256";
1704 break;
1705 case 64:
1706 intrinsic = "llvm.x86.avx.round.pd.256";
1707 break;
1708 default:
1709 assert(0);
1710 return bld->undef;
1711 }
1712 }
1713
1714 res = lp_build_intrinsic_binary(builder, intrinsic,
1715 bld->vec_type, a,
1716 LLVMConstInt(i32t, mode, 0));
1717 }
1718
1719 return res;
1720 }
1721
1722
1723 static INLINE LLVMValueRef
1724 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1725 LLVMValueRef a)
1726 {
1727 LLVMBuilderRef builder = bld->gallivm->builder;
1728 const struct lp_type type = bld->type;
1729 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1730 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1731 const char *intrinsic;
1732 LLVMValueRef res;
1733
1734 assert(type.floating);
1735 /* using the double precision conversions is a bit more complicated */
1736 assert(type.width == 32);
1737
1738 assert(lp_check_value(type, a));
1739 assert(util_cpu_caps.has_sse2);
1740
1741 /* This relies on the MXCSR rounding mode, which should always be nearest. */
1742 if (type.length == 1) {
1743 LLVMTypeRef vec_type;
1744 LLVMValueRef undef;
1745 LLVMValueRef arg;
1746 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1747
1748 vec_type = LLVMVectorType(bld->elem_type, 4);
1749
1750 intrinsic = "llvm.x86.sse.cvtss2si";
1751
1752 undef = LLVMGetUndef(vec_type);
1753
1754 arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
1755
1756 res = lp_build_intrinsic_unary(builder, intrinsic,
1757 ret_type, arg);
1758 }
1759 else {
1760 if (type.width* type.length == 128) {
1761 intrinsic = "llvm.x86.sse2.cvtps2dq";
1762 }
1763 else {
1764 assert(type.width*type.length == 256);
1765 assert(util_cpu_caps.has_avx);
1766
1767 intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
1768 }
1769 res = lp_build_intrinsic_unary(builder, intrinsic,
1770 ret_type, a);
1771 }
1772
1773 return res;
1774 }
1775
1776
1777 /* Round to an integral float value using the AltiVec vrfi* instructions,
1778 * according to the given rounding mode. */
1779 static INLINE LLVMValueRef
1780 lp_build_round_altivec(struct lp_build_context *bld,
1781 LLVMValueRef a,
1782 enum lp_build_round_mode mode)
1783 {
1784 LLVMBuilderRef builder = bld->gallivm->builder;
1785 const struct lp_type type = bld->type;
1786 const char *intrinsic = NULL;
1787
1788 assert(type.floating);
1789
1790 assert(lp_check_value(type, a));
1791 assert(util_cpu_caps.has_altivec);
1792
1793 switch (mode) {
1794 case LP_BUILD_ROUND_NEAREST:
1795 intrinsic = "llvm.ppc.altivec.vrfin";
1796 break;
1797 case LP_BUILD_ROUND_FLOOR:
1798 intrinsic = "llvm.ppc.altivec.vrfim";
1799 break;
1800 case LP_BUILD_ROUND_CEIL:
1801 intrinsic = "llvm.ppc.altivec.vrfip";
1802 break;
1803 case LP_BUILD_ROUND_TRUNCATE:
1804 intrinsic = "llvm.ppc.altivec.vrfiz";
1805 break;
1806 }
1807
1808 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1809 }
1810
1811 static INLINE LLVMValueRef
1812 lp_build_round_arch(struct lp_build_context *bld,
1813 LLVMValueRef a,
1814 enum lp_build_round_mode mode)
1815 {
1816 if (util_cpu_caps.has_sse4_1)
1817 return lp_build_round_sse41(bld, a, mode);
1818 else /* (util_cpu_caps.has_altivec) */
1819 return lp_build_round_altivec(bld, a, mode);
1820 }
1821
1822 /**
1823 * Return the integer part of a float (vector) value (== round toward zero).
1824 * The returned value is a float (vector).
1825 * Ex: trunc(-1.5) = -1.0
1826 */
1827 LLVMValueRef
1828 lp_build_trunc(struct lp_build_context *bld,
1829 LLVMValueRef a)
1830 {
1831 LLVMBuilderRef builder = bld->gallivm->builder;
1832 const struct lp_type type = bld->type;
1833
1834 assert(type.floating);
1835 assert(lp_check_value(type, a));
1836
1837 if (arch_rounding_available(type)) {
1838 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
1839 }
1840 else {
1841 const struct lp_type type = bld->type;
1842 struct lp_type inttype;
1843 struct lp_build_context intbld;
1844 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
1845 LLVMValueRef trunc, res, anosign, mask;
1846 LLVMTypeRef int_vec_type = bld->int_vec_type;
1847 LLVMTypeRef vec_type = bld->vec_type;
1848
1849 assert(type.width == 32); /* might want to handle doubles at some point */
1850
1851 inttype = type;
1852 inttype.floating = 0;
1853 lp_build_context_init(&intbld, bld->gallivm, inttype);
1854
1855 /* round by truncation */
1856 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1857 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
1858
1859 /* mask out sign bit */
1860 anosign = lp_build_abs(bld, a);
1861 /*
1862 * mask out all values if anosign > 2^24
1863 * This should work both for large ints (all rounding is no-op for them
1864 * because such floats are always exact) as well as special cases like
1865 * NaNs, Infs (taking advantage of the fact they use max exponent).
1866 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
1867 */
1868 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1869 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1870 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1871 return lp_build_select(bld, mask, a, res);
1872 }
1873 }
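
/*
 * Scalar sketch of the non-SSE4.1 fallback above (illustration only; the
 * helper is not part of this file's API). The generated code always computes
 * the int round-trip and then selects, but the selection logic is the same:
 * compare |a| against 2^24 on the float bit pattern, so large values, Infs
 * and NaNs all keep their original value.
 */
static float
example_trunc_fallback(float a)
{
   union { float f; unsigned u; } v, c;
   v.f = a;
   v.u &= 0x7fffffffu;          /* mask out the sign bit: |a| */
   c.f = 16777216.0f;           /* 2^24 */
   if (v.u > c.u) {
      return a;                 /* already integral, or Inf/NaN */
   }
   return (float)(int)a;        /* round toward zero via the int round-trip */
}
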
1874
1875
1876 /**
1877 * Return float (vector) rounded to nearest integer (vector). The returned
1878 * value is a float (vector).
1879 * Ex: round(0.9) = 1.0
1880 * Ex: round(-1.5) = -2.0
1881 */
1882 LLVMValueRef
1883 lp_build_round(struct lp_build_context *bld,
1884 LLVMValueRef a)
1885 {
1886 LLVMBuilderRef builder = bld->gallivm->builder;
1887 const struct lp_type type = bld->type;
1888
1889 assert(type.floating);
1890 assert(lp_check_value(type, a));
1891
1892 if (arch_rounding_available(type)) {
1893 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
1894 }
1895 else {
1896 const struct lp_type type = bld->type;
1897 struct lp_type inttype;
1898 struct lp_build_context intbld;
1899 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
1900 LLVMValueRef res, anosign, mask;
1901 LLVMTypeRef int_vec_type = bld->int_vec_type;
1902 LLVMTypeRef vec_type = bld->vec_type;
1903
1904 assert(type.width == 32); /* might want to handle doubles at some point */
1905
1906 inttype = type;
1907 inttype.floating = 0;
1908 lp_build_context_init(&intbld, bld->gallivm, inttype);
1909
1910 res = lp_build_iround(bld, a);
1911 res = LLVMBuildSIToFP(builder, res, vec_type, "");
1912
1913 /* mask out sign bit */
1914 anosign = lp_build_abs(bld, a);
1915 /*
1916 * mask out all values if anosign > 2^24
1917 * This should work both for large ints (all rounding is no-op for them
1918 * because such floats are always exact) as well as special cases like
1919 * NaNs, Infs (taking advantage of the fact they use max exponent).
1920 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
1921 */
1922 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1923 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1924 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1925 return lp_build_select(bld, mask, a, res);
1926 }
1927 }
1928
1929
1930 /**
1931 * Return floor of float (vector), result is a float (vector)
1932 * Ex: floor(1.1) = 1.0
1933 * Ex: floor(-1.1) = -2.0
1934 */
1935 LLVMValueRef
1936 lp_build_floor(struct lp_build_context *bld,
1937 LLVMValueRef a)
1938 {
1939 LLVMBuilderRef builder = bld->gallivm->builder;
1940 const struct lp_type type = bld->type;
1941
1942 assert(type.floating);
1943 assert(lp_check_value(type, a));
1944
1945 if (arch_rounding_available(type)) {
1946 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
1947 }
1948 else {
1949 const struct lp_type type = bld->type;
1950 struct lp_type inttype;
1951 struct lp_build_context intbld;
1952 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
1953 LLVMValueRef trunc, res, anosign, mask;
1954 LLVMTypeRef int_vec_type = bld->int_vec_type;
1955 LLVMTypeRef vec_type = bld->vec_type;
1956
1957 assert(type.width == 32); /* might want to handle doubles at some point */
1958
1959 inttype = type;
1960 inttype.floating = 0;
1961 lp_build_context_init(&intbld, bld->gallivm, inttype);
1962
1963 /* round by truncation */
1964 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1965 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
1966
1967 if (type.sign) {
1968 LLVMValueRef tmp;
1969
1970 /*
1971 * fix values if rounding is wrong (for non-special cases)
1972 * - this is the case if trunc > a
1973 */
1974 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
1975 /* tmp = trunc > a ? 1.0 : 0.0 */
1976 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
1977 tmp = lp_build_and(&intbld, mask, tmp);
1978 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
1979 res = lp_build_sub(bld, res, tmp);
1980 }
1981
1982 /* mask out sign bit */
1983 anosign = lp_build_abs(bld, a);
1984 /*
1985 * mask out all values if anosign > 2^24
1986 * This should work both for large ints (all rounding is no-op for them
1987 * because such floats are always exact) as well as special cases like
1988 * NaNs, Infs (taking advantage of the fact they use max exponent).
1989 * (2^24 is arbitrary, anything between 2^24 and 2^31 should work.)
1990 */
1991 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1992 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1993 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1994 return lp_build_select(bld, mask, a, res);
1995 }
1996 }
1997
1998
1999 /**
2000 * Return ceiling of float (vector), returning float (vector).
2001 * Ex: ceil( 1.1) = 2.0
2002 * Ex: ceil(-1.1) = -1.0
2003 */
2004 LLVMValueRef
2005 lp_build_ceil(struct lp_build_context *bld,
2006 LLVMValueRef a)
2007 {
2008 LLVMBuilderRef builder = bld->gallivm->builder;
2009 const struct lp_type type = bld->type;
2010
2011 assert(type.floating);
2012 assert(lp_check_value(type, a));
2013
2014 if (arch_rounding_available(type)) {
2015 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2016 }
2017 else {
2018 const struct lp_type type = bld->type;
2019 struct lp_type inttype;
2020 struct lp_build_context intbld;
2021 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
2022 LLVMValueRef trunc, res, anosign, mask, tmp;
2023 LLVMTypeRef int_vec_type = bld->int_vec_type;
2024 LLVMTypeRef vec_type = bld->vec_type;
2025
2026 assert(type.width == 32); /* might want to handle doubles at some point */
2027
2028 inttype = type;
2029 inttype.floating = 0;
2030 lp_build_context_init(&intbld, bld->gallivm, inttype);
2031
2032 /* round by truncation */
2033 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2034 trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2035
2036 /*
2037 * fix values if rounding is wrong (for non-special cases)
2038 * - this is the case if trunc < a
2039 */
2040 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2041 /* tmp = trunc < a ? 1.0 : 0.0 */
2042 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2043 tmp = lp_build_and(&intbld, mask, tmp);
2044 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2045 res = lp_build_add(bld, trunc, tmp);
2046
2047 /* mask out sign bit */
2048 anosign = lp_build_abs(bld, a);
2049 /*
2050 * mask out all values if anosign > 2^24
2051 * This should work both for large ints (all rounding is no-op for them
2052 * because such floats are always exact) as well as special cases like
2053 * NaNs, Infs (taking advantage of the fact they use max exponent).
2054 * (2^24 is arbitrary, anything between 2^24 and 2^31 should work.)
2055 */
2056 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2057 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2058 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2059 return lp_build_select(bld, mask, a, res);
2060 }
2061 }
2062
2063
2064 /**
2065 * Return fractional part of 'a' computed as a - floor(a)
2066 * Typically used in texture coord arithmetic.
2067 */
2068 LLVMValueRef
2069 lp_build_fract(struct lp_build_context *bld,
2070 LLVMValueRef a)
2071 {
2072 assert(bld->type.floating);
2073 return lp_build_sub(bld, a, lp_build_floor(bld, a));
2074 }
2075
2076
2077 /**
2078 * Prevent returning a fractional part of 1.0 for very small negative values of
2079 * 'a' by clamping against 0.99999(9).
2080 */
2081 static inline LLVMValueRef
2082 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2083 {
2084 LLVMValueRef max;
2085
2086 /* this is the largest number smaller than 1.0 representable as float */
2087 max = lp_build_const_vec(bld->gallivm, bld->type,
2088 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2089 return lp_build_min(bld, fract, max);
2090 }
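
/*
 * For 32-bit floats lp_mantissa() is 23, so the clamp value above works out
 * to 1.0 - 2^-24 = 0.99999994f, which is indeed the largest float strictly
 * below 1.0.
 */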
2091
2092
2093 /**
2094 * Same as lp_build_fract, but guarantees that the result is always smaller
2095 * than one.
2096 */
2097 LLVMValueRef
2098 lp_build_fract_safe(struct lp_build_context *bld,
2099 LLVMValueRef a)
2100 {
2101 return clamp_fract(bld, lp_build_fract(bld, a));
2102 }
2103
2104
2105 /**
2106 * Return the integer part of a float (vector) value (== round toward zero).
2107 * The returned value is an integer (vector).
2108 * Ex: itrunc(-1.5) = -1
2109 */
2110 LLVMValueRef
2111 lp_build_itrunc(struct lp_build_context *bld,
2112 LLVMValueRef a)
2113 {
2114 LLVMBuilderRef builder = bld->gallivm->builder;
2115 const struct lp_type type = bld->type;
2116 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2117
2118 assert(type.floating);
2119 assert(lp_check_value(type, a));
2120
2121 return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2122 }
2123
2124
2125 /**
2126 * Return float (vector) rounded to nearest integer (vector). The returned
2127 * value is an integer (vector).
2128 * Ex: iround(0.9) = 1
2129 * Ex: iround(-1.5) = -2
2130 */
2131 LLVMValueRef
2132 lp_build_iround(struct lp_build_context *bld,
2133 LLVMValueRef a)
2134 {
2135 LLVMBuilderRef builder = bld->gallivm->builder;
2136 const struct lp_type type = bld->type;
2137 LLVMTypeRef int_vec_type = bld->int_vec_type;
2138 LLVMValueRef res;
2139
2140 assert(type.floating);
2141
2142 assert(lp_check_value(type, a));
2143
2144 if ((util_cpu_caps.has_sse2 &&
2145 ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2146 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2147 return lp_build_iround_nearest_sse2(bld, a);
2148 }
2149 if (arch_rounding_available(type)) {
2150 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2151 }
2152 else {
2153 LLVMValueRef half;
2154
2155 half = lp_build_const_vec(bld->gallivm, type, 0.5);
2156
2157 if (type.sign) {
2158 LLVMTypeRef vec_type = bld->vec_type;
2159 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2160 (unsigned long long)1 << (type.width - 1));
2161 LLVMValueRef sign;
2162
2163 /* get sign bit */
2164 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2165 sign = LLVMBuildAnd(builder, sign, mask, "");
2166
2167 /* sign * 0.5 */
2168 half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2169 half = LLVMBuildOr(builder, sign, half, "");
2170 half = LLVMBuildBitCast(builder, half, vec_type, "");
2171 }
2172
2173 res = LLVMBuildFAdd(builder, a, half, "");
2174 }
2175
2176 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2177
2178 return res;
2179 }
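
/*
 * Example of the sign-copied 0.5 bias in the fallback above:
 * iround(-1.5) computes -1.5 + (-0.5) = -2.0 and truncates to -2, while
 * iround(1.4) computes 1.4 + 0.5 = 1.9 and truncates to 1.  Note that exact
 * halfway cases round away from zero on this path, whereas the hardware
 * rounding paths typically round ties to even.
 */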
2180
2181
2182 /**
2183 * Return floor of float (vector), result is an int (vector)
2184 * Ex: ifloor(1.1) = 1
2185 * Ex: ifloor(-1.1) = -2
2186 */
2187 LLVMValueRef
2188 lp_build_ifloor(struct lp_build_context *bld,
2189 LLVMValueRef a)
2190 {
2191 LLVMBuilderRef builder = bld->gallivm->builder;
2192 const struct lp_type type = bld->type;
2193 LLVMTypeRef int_vec_type = bld->int_vec_type;
2194 LLVMValueRef res;
2195
2196 assert(type.floating);
2197 assert(lp_check_value(type, a));
2198
2199 res = a;
2200 if (type.sign) {
2201 if (arch_rounding_available(type)) {
2202 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2203 }
2204 else {
2205 struct lp_type inttype;
2206 struct lp_build_context intbld;
2207 LLVMValueRef trunc, itrunc, mask;
2208
2209 assert(type.floating);
2210 assert(lp_check_value(type, a));
2211
2212 inttype = type;
2213 inttype.floating = 0;
2214 lp_build_context_init(&intbld, bld->gallivm, inttype);
2215
2216 /* round by truncation */
2217 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2218 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2219
2220 /*
2221 * fix values if rounding is wrong (for non-special cases)
2222 * - this is the case if trunc > a
2223 * The results of doing this with NaNs, very large values etc.
2224 * are undefined but this seems to be the case anyway.
2225 */
2226 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2227 /* cheapie minus one with mask since the mask is minus one / zero */
2228 return lp_build_add(&intbld, itrunc, mask);
2229 }
2230 }
2231
2232 /* convert to int, truncating toward zero */
2233 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2234
2235 return res;
2236 }
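
/*
 * Example of the "cheapie minus one" above: for a = -1.1 the truncation
 * gives itrunc = -1, the converted -1.0 is greater than -1.1, so the
 * comparison mask is all ones (the integer -1) and itrunc + mask = -2.
 * For a = -2.0 the mask is zero and the result stays -2.
 */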
2237
2238
2239 /**
2240 * Return ceiling of float (vector), returning int (vector).
2241 * Ex: iceil( 1.1) = 2
2242 * Ex: iceil(-1.1) = -1
2243 */
2244 LLVMValueRef
2245 lp_build_iceil(struct lp_build_context *bld,
2246 LLVMValueRef a)
2247 {
2248 LLVMBuilderRef builder = bld->gallivm->builder;
2249 const struct lp_type type = bld->type;
2250 LLVMTypeRef int_vec_type = bld->int_vec_type;
2251 LLVMValueRef res;
2252
2253 assert(type.floating);
2254 assert(lp_check_value(type, a));
2255
2256 if (arch_rounding_available(type)) {
2257 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2258 }
2259 else {
2260 struct lp_type inttype;
2261 struct lp_build_context intbld;
2262 LLVMValueRef trunc, itrunc, mask;
2263
2264 assert(type.floating);
2265 assert(lp_check_value(type, a));
2266
2267 inttype = type;
2268 inttype.floating = 0;
2269 lp_build_context_init(&intbld, bld->gallivm, inttype);
2270
2271 /* round by truncation */
2272 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2273 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2274
2275 /*
2276 * fix values if rounding is wrong (for non-special cases)
2277 * - this is the case if trunc < a
2278 * The results of doing this with NaNs, very large values etc.
2279 * are undefined but this seems to be the case anyway.
2280 */
2281 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2282 /* cheapie plus one with mask since the mask is minus one / zero */
2283 return lp_build_sub(&intbld, itrunc, mask);
2284 }
2285
2286 /* convert to int, truncating toward zero */
2287 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2288
2289 return res;
2290 }
2291
2292
2293 /**
2294 * Combined ifloor() & fract().
2295 *
2296 * Preferred to calling the functions separately, as it will ensure that the
2297 * strategy (floor() vs ifloor()) that results in less redundant work is used.
2298 */
2299 void
2300 lp_build_ifloor_fract(struct lp_build_context *bld,
2301 LLVMValueRef a,
2302 LLVMValueRef *out_ipart,
2303 LLVMValueRef *out_fpart)
2304 {
2305 LLVMBuilderRef builder = bld->gallivm->builder;
2306 const struct lp_type type = bld->type;
2307 LLVMValueRef ipart;
2308
2309 assert(type.floating);
2310 assert(lp_check_value(type, a));
2311
2312 if (arch_rounding_available(type)) {
2313 /*
2314 * floor() is easier.
2315 */
2316
2317 ipart = lp_build_floor(bld, a);
2318 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2319 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2320 }
2321 else {
2322 /*
2323 * ifloor() is easier.
2324 */
2325
2326 *out_ipart = lp_build_ifloor(bld, a);
2327 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2328 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2329 }
2330 }
2331
2332
2333 /**
2334 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2335 * always smaller than one.
2336 */
2337 void
2338 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2339 LLVMValueRef a,
2340 LLVMValueRef *out_ipart,
2341 LLVMValueRef *out_fpart)
2342 {
2343 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2344 *out_fpart = clamp_fract(bld, *out_fpart);
2345 }
2346
2347
2348 LLVMValueRef
2349 lp_build_sqrt(struct lp_build_context *bld,
2350 LLVMValueRef a)
2351 {
2352 LLVMBuilderRef builder = bld->gallivm->builder;
2353 const struct lp_type type = bld->type;
2354 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2355 char intrinsic[32];
2356
2357 assert(lp_check_value(type, a));
2358
2359 /* TODO: optimize the constant case */
2360
2361 assert(type.floating);
2362 if (type.length == 1) {
2363 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.f%u", type.width);
2364 }
2365 else {
2366 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
2367 }
2368
2369 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2370 }
2371
2372
2373 /**
2374 * Do one Newton-Raphson step to improve reciprocal precision:
2375 *
2376 * x_{i+1} = x_i * (2 - a * x_i)
2377 *
2378 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2379 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
2380 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
2381 * halo. It would be necessary to clamp the argument to prevent this.
2382 *
2383 * See also:
2384 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2385 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2386 */
2387 static INLINE LLVMValueRef
2388 lp_build_rcp_refine(struct lp_build_context *bld,
2389 LLVMValueRef a,
2390 LLVMValueRef rcp_a)
2391 {
2392 LLVMBuilderRef builder = bld->gallivm->builder;
2393 LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
2394 LLVMValueRef res;
2395
2396 res = LLVMBuildFMul(builder, a, rcp_a, "");
2397 res = LLVMBuildFSub(builder, two, res, "");
2398 res = LLVMBuildFMul(builder, rcp_a, res, "");
2399
2400 return res;
2401 }
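
/*
 * Convergence sketch for the step above, ignoring rounding of the
 * intermediate operations: if rcp_a = (1 + e)/a for some relative error e,
 * then a * rcp_a = 1 + e and
 *
 *    rcp_a * (2 - a * rcp_a) = (1 + e) * (1 - e) / a = (1 - e^2) / a
 *
 * i.e. the relative error is squared, roughly doubling the number of
 * correct mantissa bits per iteration.
 */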
2402
2403
2404 LLVMValueRef
2405 lp_build_rcp(struct lp_build_context *bld,
2406 LLVMValueRef a)
2407 {
2408 LLVMBuilderRef builder = bld->gallivm->builder;
2409 const struct lp_type type = bld->type;
2410
2411 assert(lp_check_value(type, a));
2412
2413 if(a == bld->zero)
2414 return bld->undef;
2415 if(a == bld->one)
2416 return bld->one;
2417 if(a == bld->undef)
2418 return bld->undef;
2419
2420 assert(type.floating);
2421
2422 if(LLVMIsConstant(a))
2423 return LLVMConstFDiv(bld->one, a);
2424
2425 /*
2426 * We don't use RCPPS because:
2427 * - it only has 10 bits of precision
2428 * - it doesn't even get the reciprocal of 1.0 exactly
2429 * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2430 * - for recent processors the benefit over DIVPS is marginal, and case
2431 * dependent
2432 *
2433 * We could still use it on certain processors if benchmarks show that the
2434 * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
2435 * particular uses that require fewer workarounds.
2436 */
2437
2438 if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2439 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2440 const unsigned num_iterations = 0;
2441 LLVMValueRef res;
2442 unsigned i;
2443 const char *intrinsic = NULL;
2444
2445 if (type.length == 4) {
2446 intrinsic = "llvm.x86.sse.rcp.ps";
2447 }
2448 else {
2449 intrinsic = "llvm.x86.avx.rcp.ps.256";
2450 }
2451
2452 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2453
2454 for (i = 0; i < num_iterations; ++i) {
2455 res = lp_build_rcp_refine(bld, a, res);
2456 }
2457
2458 return res;
2459 }
2460
2461 return LLVMBuildFDiv(builder, bld->one, a, "");
2462 }
2463
2464
2465 /**
2466 * Do one Newton-Raphson step to improve rsqrt precision:
2467 *
2468 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2469 *
2470 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2471 */
2472 static INLINE LLVMValueRef
2473 lp_build_rsqrt_refine(struct lp_build_context *bld,
2474 LLVMValueRef a,
2475 LLVMValueRef rsqrt_a)
2476 {
2477 LLVMBuilderRef builder = bld->gallivm->builder;
2478 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2479 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2480 LLVMValueRef res;
2481
2482 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2483 res = LLVMBuildFMul(builder, a, res, "");
2484 res = LLVMBuildFSub(builder, three, res, "");
2485 res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2486 res = LLVMBuildFMul(builder, half, res, "");
2487
2488 return res;
2489 }
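
/*
 * Convergence sketch for the step above: with rsqrt_a = (1 + e)/sqrt(a),
 * a * rsqrt_a * rsqrt_a = (1 + e)^2 and
 *
 *    0.5 * rsqrt_a * (3 - (1 + e)^2) = (1 - 1.5*e^2 - 0.5*e^3) / sqrt(a)
 *
 * so the first-order error term cancels and the error is again roughly
 * squared per iteration.
 */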
2490
2491
2492 /**
2493 * Generate 1/sqrt(a).
2494 * Result is undefined for values < 0, infinity for +0.
2495 */
2496 LLVMValueRef
2497 lp_build_rsqrt(struct lp_build_context *bld,
2498 LLVMValueRef a)
2499 {
2500 LLVMBuilderRef builder = bld->gallivm->builder;
2501 const struct lp_type type = bld->type;
2502
2503 assert(lp_check_value(type, a));
2504
2505 assert(type.floating);
2506
2507 /*
2508 * This should be faster but all denormals will end up as infinity.
2509 */
2510 if (0 && lp_build_fast_rsqrt_available(type)) {
2511 const unsigned num_iterations = 1;
2512 LLVMValueRef res;
2513 unsigned i;
2514
2515 /* rsqrt(1.0) != 1.0 here */
2516 res = lp_build_fast_rsqrt(bld, a);
2517
2518 if (num_iterations) {
2519 /*
2520 * Newton-Raphson will result in NaN instead of infinity for zero,
2521 * and NaN instead of zero for infinity.
2522 * Also, need to ensure rsqrt(1.0) == 1.0.
2523 * All numbers smaller than FLT_MIN will result in +infinity
2524 * (rsqrtps treats all denormals as zero).
2525 */
2526 /*
2527 * Certain non-C99 compilers don't know INFINITY and might not support
2528 * hacks to evaluate it at compile time either.
2529 */
2530 const unsigned posinf_int = 0x7F800000;
2531 LLVMValueRef cmp;
2532 LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2533 LLVMValueRef inf = lp_build_const_int_vec(bld->gallivm, type, posinf_int);
2534
2535 inf = LLVMBuildBitCast(builder, inf, lp_build_vec_type(bld->gallivm, type), "");
2536
2537 for (i = 0; i < num_iterations; ++i) {
2538 res = lp_build_rsqrt_refine(bld, a, res);
2539 }
2540 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2541 res = lp_build_select(bld, cmp, inf, res);
2542 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2543 res = lp_build_select(bld, cmp, bld->zero, res);
2544 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2545 res = lp_build_select(bld, cmp, bld->one, res);
2546 }
2547
2548 return res;
2549 }
2550
2551 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2552 }
2553
2554 /**
2555 * Returns true if a fast (but inaccurate) rsqrt instruction is available.
2556 * Callers may want to avoid lp_build_fast_rsqrt() when it is not: e.g. when
2557 * calculating x^0.5 as rsqrt_fast(x) * x, a missing fast path would expand
2558 * to sqrt/div/mul, in which case it is better to call sqrt directly and
2559 * skip both the div and the mul.
2560 */
2561 boolean
2562 lp_build_fast_rsqrt_available(struct lp_type type)
2563 {
2564 assert(type.floating);
2565
2566 if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2567 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2568 return true;
2569 }
2570 return false;
2571 }
2572
2573
2574 /**
2575 * Generate 1/sqrt(a).
2576 * Result is undefined for values < 0, infinity for +0.
2577 * Precision is limited, only ~10 bits guaranteed
2578 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2579 */
2580 LLVMValueRef
2581 lp_build_fast_rsqrt(struct lp_build_context *bld,
2582 LLVMValueRef a)
2583 {
2584 LLVMBuilderRef builder = bld->gallivm->builder;
2585 const struct lp_type type = bld->type;
2586
2587 assert(lp_check_value(type, a));
2588
2589 if (lp_build_fast_rsqrt_available(type)) {
2590 const char *intrinsic = NULL;
2591
2592 if (type.length == 4) {
2593 intrinsic = "llvm.x86.sse.rsqrt.ps";
2594 }
2595 else {
2596 intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2597 }
2598 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2599 }
2600 else {
2601 debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2602 }
2603 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2604 }
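
/*
 * Hypothetical caller sketch (not part of this file) showing the trade-off
 * described above for computing x^0.5:
 *
 *    if (lp_build_fast_rsqrt_available(bld->type))
 *       res = lp_build_mul(bld, x, lp_build_fast_rsqrt(bld, x));
 *    else
 *       res = lp_build_sqrt(bld, x);
 *
 * i.e. only use the rsqrt * x trick when the fast instruction exists,
 * otherwise it degenerates into sqrt/div/mul.
 */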
2605
2606
2607 /**
2608 * Generate sin(a) or cos(a) using polynomial approximation.
2609 * TODO: it might be worth recognizing sin and cos of the same source
2610 * (i.e. the d3d10 sincos opcode). Computing both at once would be much
2611 * cheaper than calculating (nearly) everything twice, though it is not
2612 * clear the pattern is common enough to be worth the effort; the scs
2613 * opcode would also benefit from calculating both.
2614 */
2615 static LLVMValueRef
2616 lp_build_sin_or_cos(struct lp_build_context *bld,
2617 LLVMValueRef a,
2618 boolean cos)
2619 {
2620 struct gallivm_state *gallivm = bld->gallivm;
2621 LLVMBuilderRef b = gallivm->builder;
2622 struct lp_type int_type = lp_int_type(bld->type);
2623
2624 /*
2625 * take the absolute value,
2626 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2627 */
2628
2629 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2630 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2631
2632 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2633 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2634
2635 /*
2636 * scale by 4/Pi
2637 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2638 */
2639
2640 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2641 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2642
2643 /*
2644 * store the integer part of y in mm0
2645 * emm2 = _mm_cvttps_epi32(y);
2646 */
2647
2648 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2649
2650 /*
2651 * j=(j+1) & (~1) (see the cephes sources)
2652 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2653 */
2654
2655 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2656 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2657 /*
2658 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2659 */
2660 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2661 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2662
2663 /*
2664 * y = _mm_cvtepi32_ps(emm2);
2665 */
2666 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2667
2668 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2669 LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2670 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2671 LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2672
2673 /*
2674 * Argument used for poly selection and sign bit determination
2675 * is different for sin vs. cos.
2676 */
2677 LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
2678 emm2_and;
2679
2680 LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
2681 LLVMBuildNot(b, emm2_2, ""), ""),
2682 const_29, "sign_bit") :
2683 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
2684 LLVMBuildShl(b, emm2_add,
2685 const_29, ""), ""),
2686 sign_mask, "sign_bit");
2687
2688 /*
2689 * get the polynomial selection mask
2690 * there is one polynomial for 0 <= x <= Pi/4
2691 * and another one for Pi/4 < x <= Pi/2
2692 * Both branches will be computed.
2693 *
2694 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2695 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2696 */
2697
2698 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
2699 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2700 int_type, PIPE_FUNC_EQUAL,
2701 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2702
2703 /*
2704 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2705 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2706 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2707 */
2708 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2709 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2710 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
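
/*
 * Note: DP1 + DP2 + DP3 sums to approximately -Pi/4
 * (-0.7853981633974483), split into pieces of decreasing magnitude
 * (Cody-Waite style), so the subtractions below remove y * Pi/4 from |a|
 * with more precision than a single float constant would allow.
 */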
2711
2712 /*
2713 * The magic pass: "Extended precision modular arithmetic"
2714 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2715 * xmm1 = _mm_mul_ps(y, xmm1);
2716 * xmm2 = _mm_mul_ps(y, xmm2);
2717 * xmm3 = _mm_mul_ps(y, xmm3);
2718 */
2719 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
2720 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
2721 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
2722
2723 /*
2724 * x = _mm_add_ps(x, xmm1);
2725 * x = _mm_add_ps(x, xmm2);
2726 * x = _mm_add_ps(x, xmm3);
2727 */
2728
2729 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
2730 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
2731 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
2732
2733 /*
2734 * Evaluate the first polynomial (0 <= x <= Pi/4)
2735 *
2736 * z = _mm_mul_ps(x,x);
2737 */
2738 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2739
2740 /*
2741 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2742 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2743 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2744 */
2745 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2746 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2747 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2748
2749 /*
2750 * y = *(v4sf*)_ps_coscof_p0;
2751 * y = _mm_mul_ps(y, z);
2752 */
2753 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
2754 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
2755 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
2756 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
2757 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2758 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2759
2760
2761 /*
2762 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2763 * y = _mm_sub_ps(y, tmp);
2764 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2765 */
2766 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2767 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2768 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2769 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2770 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2771
2772 /*
2773 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2774 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2775 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2776 */
2777 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2778 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2779 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2780
2781 /*
2782 * Evaluate the second polynomial (Pi/4 <= x <= Pi/2)
2783 *
2784 * y2 = *(v4sf*)_ps_sincof_p0;
2785 * y2 = _mm_mul_ps(y2, z);
2786 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2787 * y2 = _mm_mul_ps(y2, z);
2788 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2789 * y2 = _mm_mul_ps(y2, z);
2790 * y2 = _mm_mul_ps(y2, x);
2791 * y2 = _mm_add_ps(y2, x);
2792 */
2793
2794 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
2795 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
2796 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
2797 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
2798 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2799 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
2800 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
2801
2802 /*
2803 * select the correct result from the two polynomials
2804 * xmm3 = poly_mask;
2805 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2806 * y = _mm_andnot_ps(xmm3, y);
2807 * y = _mm_or_ps(y,y2);
2808 */
2809 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2810 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2811 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2812 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
2813 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2814 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
2815
2816 /*
2817 * update the sign
2818 * y = _mm_xor_ps(y, sign_bit);
2819 */
2820 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
2821 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
2822
2823 LLVMValueRef isfinite = lp_build_isfinite(bld, a);
2824
2825 /* clamp output to be within [-1, 1] */
2826 y_result = lp_build_clamp(bld, y_result,
2827 lp_build_const_vec(bld->gallivm, bld->type, -1.f),
2828 lp_build_const_vec(bld->gallivm, bld->type, 1.f));
2829 /* If a is -inf, inf or NaN then return NaN */
2830 y_result = lp_build_select(bld, isfinite, y_result,
2831 lp_build_const_vec(bld->gallivm, bld->type, NAN));
2832 return y_result;
2833 }
2834
2835
2836 /**
2837 * Generate sin(a)
2838 */
2839 LLVMValueRef
2840 lp_build_sin(struct lp_build_context *bld,
2841 LLVMValueRef a)
2842 {
2843 return lp_build_sin_or_cos(bld, a, FALSE);
2844 }
2845
2846
2847 /**
2848 * Generate cos(a)
2849 */
2850 LLVMValueRef
2851 lp_build_cos(struct lp_build_context *bld,
2852 LLVMValueRef a)
2853 {
2854 return lp_build_sin_or_cos(bld, a, TRUE);
2855 }
2856
2857
2858 /**
2859 * Generate pow(x, y)
2860 */
2861 LLVMValueRef
2862 lp_build_pow(struct lp_build_context *bld,
2863 LLVMValueRef x,
2864 LLVMValueRef y)
2865 {
2866 /* TODO: optimize the constant case */
2867 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2868 LLVMIsConstant(x) && LLVMIsConstant(y)) {
2869 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2870 __FUNCTION__);
2871 }
2872
2873 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
2874 }
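
/*
 * Quick sanity check of the identity used above: pow(2.0, 10.0) becomes
 * exp2(log2(2.0) * 10.0) = exp2(10.0) = 1024.0.  Since the computation goes
 * through log2, the result is only meaningful for x > 0.
 */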
2875
2876
2877 /**
2878 * Generate exp(x)
2879 */
2880 LLVMValueRef
2881 lp_build_exp(struct lp_build_context *bld,
2882 LLVMValueRef x)
2883 {
2884 /* log2(e) = 1/log(2) */
2885 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
2886 1.4426950408889634);
2887
2888 assert(lp_check_value(bld->type, x));
2889
2890 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
2891 }
2892
2893
2894 /**
2895 * Generate log(x)
2896 * Behavior is undefined with infs, 0s and nans
2897 */
2898 LLVMValueRef
2899 lp_build_log(struct lp_build_context *bld,
2900 LLVMValueRef x)
2901 {
2902 /* log(2) */
2903 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2904 0.69314718055994529);
2905
2906 assert(lp_check_value(bld->type, x));
2907
2908 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
2909 }
2910
2911 /**
2912 * Generate log(x) that handles edge cases (infs, 0s and nans)
2913 */
2914 LLVMValueRef
2915 lp_build_log_safe(struct lp_build_context *bld,
2916 LLVMValueRef x)
2917 {
2918 /* log(2) */
2919 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2920 0.69314718055994529);
2921
2922 assert(lp_check_value(bld->type, x));
2923
2924 return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
2925 }
2926
2927
2928 /**
2929 * Generate polynomial.
2930 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
2931 */
2932 LLVMValueRef
2933 lp_build_polynomial(struct lp_build_context *bld,
2934 LLVMValueRef x,
2935 const double *coeffs,
2936 unsigned num_coeffs)
2937 {
2938 const struct lp_type type = bld->type;
2939 LLVMValueRef even = NULL, odd = NULL;
2940 LLVMValueRef x2;
2941 unsigned i;
2942
2943 assert(lp_check_value(bld->type, x));
2944
2945 /* TODO: optimize the constant case */
2946 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2947 LLVMIsConstant(x)) {
2948 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2949 __FUNCTION__);
2950 }
2951
2952 /*
2953 * Calculate odd and even terms separately to decrease data dependency
2954 * Ex:
2955 * c[0] + x^2 * c[2] + x^4 * c[4] ...
2956 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
2957 */
2958 x2 = lp_build_mul(bld, x, x);
2959
2960 for (i = num_coeffs; i--; ) {
2961 LLVMValueRef coeff;
2962
2963 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
2964
2965 if (i % 2 == 0) {
2966 if (even)
2967 even = lp_build_add(bld, coeff, lp_build_mul(bld, x2, even));
2968 else
2969 even = coeff;
2970 } else {
2971 if (odd)
2972 odd = lp_build_add(bld, coeff, lp_build_mul(bld, x2, odd));
2973 else
2974 odd = coeff;
2975 }
2976 }
2977
2978 if (odd)
2979 return lp_build_add(bld, lp_build_mul(bld, odd, x), even);
2980 else if (even)
2981 return even;
2982 else
2983 return bld->undef;
2984 }
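
/*
 * Illustration of the even/odd split above for four coefficients
 * {c0, c1, c2, c3}:
 *
 *    even = c0 + x2 * c2
 *    odd  = c1 + x2 * c3
 *    res  = odd * x + even
 *
 * which equals c0 + c1*x + c2*x^2 + c3*x^3, but the two Horner chains can
 * be evaluated in parallel, roughly halving the dependency depth.
 */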
2985
2986
2987 /**
2988 * Minimax polynomial fit of 2**x, in range [0, 1[
2989 */
2990 const double lp_build_exp2_polynomial[] = {
2991 #if EXP_POLY_DEGREE == 5
2992 1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
2993 0.693153073200168932794,
2994 0.240153617044375388211,
2995 0.0558263180532956664775,
2996 0.00898934009049466391101,
2997 0.00187757667519147912699
2998 #elif EXP_POLY_DEGREE == 4
2999 1.00000259337069434683,
3000 0.693003834469974940458,
3001 0.24144275689150793076,
3002 0.0520114606103070150235,
3003 0.0135341679161270268764
3004 #elif EXP_POLY_DEGREE == 3
3005 0.999925218562710312959,
3006 0.695833540494823811697,
3007 0.226067155427249155588,
3008 0.0780245226406372992967
3009 #elif EXP_POLY_DEGREE == 2
3010 1.00172476321474503578,
3011 0.657636275736077639316,
3012 0.33718943461968720704
3013 #else
3014 #error
3015 #endif
3016 };
3017
3018
3019 LLVMValueRef
3020 lp_build_exp2(struct lp_build_context *bld,
3021 LLVMValueRef x)
3022 {
3023 LLVMBuilderRef builder = bld->gallivm->builder;
3024 const struct lp_type type = bld->type;
3025 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3026 LLVMValueRef ipart = NULL;
3027 LLVMValueRef fpart = NULL;
3028 LLVMValueRef expipart = NULL;
3029 LLVMValueRef expfpart = NULL;
3030 LLVMValueRef res = NULL;
3031
3032 assert(lp_check_value(bld->type, x));
3033
3034
3035 /* TODO: optimize the constant case */
3036 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3037 LLVMIsConstant(x)) {
3038 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3039 __FUNCTION__);
3040 }
3041
3042 assert(type.floating && type.width == 32);
3043
3044 /* We want to preserve NaN and make sure that for exp2 if x > 128,
3045 * the result is INF and if it's smaller than -126.9 the result is 0. */
3046 x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
3047 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
3048 x = lp_build_max(bld, lp_build_const_vec(bld->gallivm, type, -126.99999), x);
3049
3050 /* ipart = floor(x) */
3051 /* fpart = x - ipart */
3052 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3053
3054
3055
3056 /* expipart = (float) (1 << ipart) */
3057 expipart = LLVMBuildAdd(builder, ipart,
3058 lp_build_const_int_vec(bld->gallivm, type, 127), "");
3059 expipart = LLVMBuildShl(builder, expipart,
3060 lp_build_const_int_vec(bld->gallivm, type, 23), "");
3061 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
3062
3063
3064 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3065 Elements(lp_build_exp2_polynomial));
3066
3067 res = LLVMBuildFMul(builder, expipart, expfpart, "");
3068
3069
3070 return res;
3071 }
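
/*
 * Example of the exponent-bit construction above: for ipart = 3 the code
 * computes (3 + 127) << 23 = 0x41000000, which reinterpreted as a float is
 * 2^3 = 8.0, so expipart * expfpart yields 2^ipart * 2^fpart.
 */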
3072
3073
3074
3075 /**
3076 * Extract the exponent of an IEEE-754 floating point value.
3077 *
3078 * Optionally apply an integer bias.
3079 *
3080 * Result is an integer value with
3081 *
3082 * ifloor(log2(x)) + bias
3083 */
3084 LLVMValueRef
3085 lp_build_extract_exponent(struct lp_build_context *bld,
3086 LLVMValueRef x,
3087 int bias)
3088 {
3089 LLVMBuilderRef builder = bld->gallivm->builder;
3090 const struct lp_type type = bld->type;
3091 unsigned mantissa = lp_mantissa(type);
3092 LLVMValueRef res;
3093
3094 assert(type.floating);
3095
3096 assert(lp_check_value(bld->type, x));
3097
3098 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3099
3100 res = LLVMBuildLShr(builder, x,
3101 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3102 res = LLVMBuildAnd(builder, res,
3103 lp_build_const_int_vec(bld->gallivm, type, 255), "");
3104 res = LLVMBuildSub(builder, res,
3105 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3106
3107 return res;
3108 }
3109
3110
3111 /**
3112 * Extract the mantissa of a floating point value.
3113 *
3114 * Result is a floating point value with
3115 *
3116 * x / 2**floor(log2(x))
3117 */
3118 LLVMValueRef
3119 lp_build_extract_mantissa(struct lp_build_context *bld,
3120 LLVMValueRef x)
3121 {
3122 LLVMBuilderRef builder = bld->gallivm->builder;
3123 const struct lp_type type = bld->type;
3124 unsigned mantissa = lp_mantissa(type);
3125 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3126 (1ULL << mantissa) - 1);
3127 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3128 LLVMValueRef res;
3129
3130 assert(lp_check_value(bld->type, x));
3131
3132 assert(type.floating);
3133
3134 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3135
3136 /* res = x / 2**ipart */
3137 res = LLVMBuildAnd(builder, x, mantmask, "");
3138 res = LLVMBuildOr(builder, res, one, "");
3139 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3140
3141 return res;
3142 }
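
/*
 * Example for the two helpers above: with x = 12.0f (0x41400000),
 * lp_build_extract_exponent(bld, x, 0) yields 3 and
 * lp_build_extract_mantissa(bld, x) yields 1.5f, consistent with
 * 12.0 == 1.5 * 2^3.
 */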
3143
3144
3145
3146 /**
3147 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in the range [0, 1/9[
3148 * These coefficients can be generated with
3149 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3150 */
3151 const double lp_build_log2_polynomial[] = {
3152 #if LOG_POLY_DEGREE == 5
3153 2.88539008148777786488L,
3154 0.961796878841293367824L,
3155 0.577058946784739859012L,
3156 0.412914355135828735411L,
3157 0.308591899232910175289L,
3158 0.352376952300281371868L,
3159 #elif LOG_POLY_DEGREE == 4
3160 2.88539009343309178325L,
3161 0.961791550404184197881L,
3162 0.577440339438736392009L,
3163 0.403343858251329912514L,
3164 0.406718052498846252698L,
3165 #elif LOG_POLY_DEGREE == 3
3166 2.88538959748872753838L,
3167 0.961932915889597772928L,
3168 0.571118517972136195241L,
3169 0.493997535084709500285L,
3170 #else
3171 #error
3172 #endif
3173 };
3174
3175 /**
3176 * See http://www.devmaster.net/forums/showthread.php?p=43580
3177 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3178 * http://www.nezumi.demon.co.uk/consult/logx.htm
3179 *
3180 * If handle_edge_cases is true the function will perform computations
3181 * to match the required D3D10+ behavior for each of the edge cases.
3182 * That means that if input is:
3183 * - less than zero (to and including -inf) then NaN will be returned
3184 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3185 * - +infinity, then +infinity will be returned
3186 * - NaN, then NaN will be returned
3187 *
3188 * Those checks are fairly expensive so if you don't need them make sure
3189 * handle_edge_cases is false.
3190 */
3191 void
3192 lp_build_log2_approx(struct lp_build_context *bld,
3193 LLVMValueRef x,
3194 LLVMValueRef *p_exp,
3195 LLVMValueRef *p_floor_log2,
3196 LLVMValueRef *p_log2,
3197 boolean handle_edge_cases)
3198 {
3199 LLVMBuilderRef builder = bld->gallivm->builder;
3200 const struct lp_type type = bld->type;
3201 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3202 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3203
3204 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3205 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3206 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3207
3208 LLVMValueRef i = NULL;
3209 LLVMValueRef y = NULL;
3210 LLVMValueRef z = NULL;
3211 LLVMValueRef exp = NULL;
3212 LLVMValueRef mant = NULL;
3213 LLVMValueRef logexp = NULL;
3214 LLVMValueRef logmant = NULL;
3215 LLVMValueRef res = NULL;
3216
3217 assert(lp_check_value(bld->type, x));
3218
3219 if(p_exp || p_floor_log2 || p_log2) {
3220 /* TODO: optimize the constant case */
3221 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3222 LLVMIsConstant(x)) {
3223 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3224 __FUNCTION__);
3225 }
3226
3227 assert(type.floating && type.width == 32);
3228
3229 /*
3230 * We don't explicitly handle denormalized numbers. They will yield a
3231 * result in the neighbourhood of -127, which appears to be
3232 * adequate.
3233 */
3234
3235 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3236
3237 /* exp = (float) exponent(x) */
3238 exp = LLVMBuildAnd(builder, i, expmask, "");
3239 }
3240
3241 if(p_floor_log2 || p_log2) {
3242 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3243 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3244 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3245 }
3246
3247 if(p_log2) {
3248 /* mant = 1 + (float) mantissa(x) */
3249 mant = LLVMBuildAnd(builder, i, mantmask, "");
3250 mant = LLVMBuildOr(builder, mant, one, "");
3251 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3252
3253 /* y = (mant - 1) / (mant + 1) */
3254 y = lp_build_div(bld,
3255 lp_build_sub(bld, mant, bld->one),
3256 lp_build_add(bld, mant, bld->one)
3257 );
3258
3259 /* z = y^2 */
3260 z = lp_build_mul(bld, y, y);
3261
3262 /* compute P(z) */
3263 logmant = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3264 Elements(lp_build_log2_polynomial));
3265
3266 /* logmant = y * P(z) */
3267 logmant = lp_build_mul(bld, y, logmant);
3268
3269 res = lp_build_add(bld, logmant, logexp);
3270
3271 if (type.floating && handle_edge_cases) {
3272 LLVMValueRef negmask, infmask, zmask;
3273 negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3274 lp_build_const_vec(bld->gallivm, type, 0.0f));
3275 zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3276 lp_build_const_vec(bld->gallivm, type, 0.0f));
3277 infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3278 lp_build_const_vec(bld->gallivm, type, INFINITY));
3279
3280 /* If x is equal to inf make sure we return inf */
3281 res = lp_build_select(bld, infmask,
3282 lp_build_const_vec(bld->gallivm, type, INFINITY),
3283 res);
3284 /* If x is equal to 0, return -inf */
3285 res = lp_build_select(bld, zmask,
3286 lp_build_const_vec(bld->gallivm, type, -INFINITY),
3287 res);
3288 /* If x is nan or less than 0, return nan */
3289 res = lp_build_select(bld, negmask,
3290 lp_build_const_vec(bld->gallivm, type, NAN),
3291 res);
3292 }
3293 }
3294
3295 if(p_exp) {
3296 exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3297 *p_exp = exp;
3298 }
3299
3300 if(p_floor_log2)
3301 *p_floor_log2 = logexp;
3302
3303 if(p_log2)
3304 *p_log2 = res;
3305 }
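
/*
 * The polynomial evaluated above follows from
 *
 *    ln(mant) = 2 * atanh(y) = 2 * (y + y^3/3 + y^5/5 + ...),
 *    y = (mant - 1) / (mant + 1)
 *
 * divided by ln(2), which is why the leading coefficient of
 * lp_build_log2_polynomial is approximately 2/ln(2) = 2.88539...; the
 * remaining coefficients are a minimax refinement over the reduced range
 * rather than the exact series terms.
 */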
3306
3307
3308 /*
3309 * log2 implementation which doesn't have special code to
3310 * handle edge cases (-inf, 0, inf, NaN). It's faster but
3311 * the results for those cases are undefined.
3312 */
3313 LLVMValueRef
3314 lp_build_log2(struct lp_build_context *bld,
3315 LLVMValueRef x)
3316 {
3317 LLVMValueRef res;
3318 lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
3319 return res;
3320 }
3321
3322 /*
3323 * Version of log2 which handles all edge cases.
3324 * Look at documentation of lp_build_log2_approx for
3325 * description of the behavior for each of the edge cases.
3326 */
3327 LLVMValueRef
3328 lp_build_log2_safe(struct lp_build_context *bld,
3329 LLVMValueRef x)
3330 {
3331 LLVMValueRef res;
3332 lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
3333 return res;
3334 }
3335
3336
3337 /**
3338 * Faster (and less accurate) log2.
3339 *
3340 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3341 *
3342 * Piece-wise linear approximation, with exact results when x is a
3343 * power of two.
3344 *
3345 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3346 */
3347 LLVMValueRef
3348 lp_build_fast_log2(struct lp_build_context *bld,
3349 LLVMValueRef x)
3350 {
3351 LLVMBuilderRef builder = bld->gallivm->builder;
3352 LLVMValueRef ipart;
3353 LLVMValueRef fpart;
3354
3355 assert(lp_check_value(bld->type, x));
3356
3357 assert(bld->type.floating);
3358
3359 /* ipart = floor(log2(x)) - 1 */
3360 ipart = lp_build_extract_exponent(bld, x, -1);
3361 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3362
3363 /* fpart = x / 2**ipart */
3364 fpart = lp_build_extract_mantissa(bld, x);
3365
3366 /* ipart + fpart */
3367 return LLVMBuildFAdd(builder, ipart, fpart, "");
3368 }
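
/*
 * Example for the piece-wise linear approximation above: for x = 8.0 the
 * exponent part is 3 - 1 = 2 and the mantissa part is 1.0, giving exactly
 * 3.0; for x = 12.0 it gives 2 + 1.5 = 3.5 versus the true
 * log2(12) ~= 3.585.  The error is zero at powers of two and largest
 * roughly midway between them.
 */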
3369
3370
3371 /**
3372 * Fast implementation of iround(log2(x)).
3373 *
3374 * Not an approximation -- it should give accurate results all the time.
3375 */
3376 LLVMValueRef
3377 lp_build_ilog2(struct lp_build_context *bld,
3378 LLVMValueRef x)
3379 {
3380 LLVMBuilderRef builder = bld->gallivm->builder;
3381 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3382 LLVMValueRef ipart;
3383
3384 assert(bld->type.floating);
3385
3386 assert(lp_check_value(bld->type, x));
3387
3388 /* x * 2^0.5, i.e. add 0.5 to log2(x) */
3389 x = LLVMBuildFMul(builder, x, sqrt2, "");
3390
3391 /* ipart = floor(log2(x) + 0.5) */
3392 ipart = lp_build_extract_exponent(bld, x, 0);
3393
3394 return ipart;
3395 }
3396
3397 LLVMValueRef
3398 lp_build_mod(struct lp_build_context *bld,
3399 LLVMValueRef x,
3400 LLVMValueRef y)
3401 {
3402 LLVMBuilderRef builder = bld->gallivm->builder;
3403 LLVMValueRef res;
3404 const struct lp_type type = bld->type;
3405
3406 assert(lp_check_value(type, x));
3407 assert(lp_check_value(type, y));
3408
3409 if (type.floating)
3410 res = LLVMBuildFRem(builder, x, y, "");
3411 else if (type.sign)
3412 res = LLVMBuildSRem(builder, x, y, "");
3413 else
3414 res = LLVMBuildURem(builder, x, y, "");
3415 return res;
3416 }
3417
3418
3419 /*
3420 * For floating inputs it creates and returns a mask
3421 * which is all 1's for channels which are NaN.
3422 * Channels inside x which are not NaN will be 0.
3423 */
3424 LLVMValueRef
3425 lp_build_isnan(struct lp_build_context *bld,
3426 LLVMValueRef x)
3427 {
3428 LLVMValueRef mask;
3429 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3430
3431 assert(bld->type.floating);
3432 assert(lp_check_value(bld->type, x));
3433
3434 mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3435 "isnotnan");
3436 mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3437 mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3438 return mask;
3439 }
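
/*
 * The trick above relies on NaN being the only value for which an ordered
 * x == x comparison is false; the SExt then widens the i1 result into the
 * usual all-ones / all-zeros integer mask.
 */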
3440
3441 /* Returns all 1's for floating point numbers that are
3442 * finite, and all zeros for -inf,
3443 * +inf and NaNs. */
3444 LLVMValueRef
3445 lp_build_isfinite(struct lp_build_context *bld,
3446 LLVMValueRef x)
3447 {
3448 LLVMBuilderRef builder = bld->gallivm->builder;
3449 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3450 struct lp_type int_type = lp_int_type(bld->type);
3451 LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3452 LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3453 0x7f800000);
3454
3455 if (!bld->type.floating) {
3456 return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3457 }
3458 assert(bld->type.floating);
3459 assert(lp_check_value(bld->type, x));
3460 assert(bld->type.width == 32);
3461
3462 intx = LLVMBuildAnd(builder, intx, infornan32, "");
3463 return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3464 intx, infornan32);
3465 }
3466
3467 /*
3468 * Returns true if the number is nan or inf and false otherwise.
3469 * The input has to be a floating point vector.
3470 */
3471 LLVMValueRef
3472 lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
3473 const struct lp_type type,
3474 LLVMValueRef x)
3475 {
3476 LLVMBuilderRef builder = gallivm->builder;
3477 struct lp_type int_type = lp_int_type(type);
3478 LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
3479 0x7f800000);
3480 LLVMValueRef ret;
3481
3482 assert(type.floating);
3483
3484 ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
3485 ret = LLVMBuildAnd(builder, ret, const0, "");
3486 ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
3487 ret, const0);
3488
3489 return ret;
3490 }
3491